Module:template parser: difference between revisions
Theknightwho (talk | contribs) Use new() function in Module:parser, which generates Parser and Node objects unique to this module (instead of the original objects). |
Theknightwho (talk | contribs) Add type safety check. |
||
Line 1,029: | Line 1,029: | ||
function export.parse(text, transcluded) |
function export.parse(text, transcluded) |
||
local text_type = type(text) |
|||
return (select(2, Parser:parse{ |
return (select(2, Parser:parse{ |
||
text = text |
text = text_type == "string" and text or |
||
text_type == "number" and tostring(text) or |
|||
error("bad argument #1 (string expected, got " .. text_type .. ")"), |
|||
node = {Wikitext, true}, |
node = {Wikitext, true}, |
||
route = {"do_parse", transcluded} |
route = {"do_parse", transcluded} |
Revision as of 22:23, 7 April 2024
- The following documentation is located at Module:template parser/documentation. [edit]
- Useful links: subpage list • links • transclusions • testcases • sandbox
This module provides functions for parsing and finding template invocations found in wikitext.
parseTemplate(text, not_transcluded)
- Parses text as a template invocation and returns a pair of values, the template name and the arguments (containing anonymous, numbered and named arguments). If the text could not be parsed as a template invocation, the function returns nil. The parser will correctly parse any wikitext given as template arguments (such as subtemplates, arguments, tables etc), but if the string does not form a valid template in markup, then it will return
nil
. findTemplates(text, not_transcluded)
- Finds all template invocations in the text. This is designed to be used as an iterator in for statements, and returns four values for each invocation:
- The template name.
- The template arguments.
- The full template invocation as it appears in the original text.
- The index the template appears at within the given text; as with Lua in general, the beginning of the text is index 1.
For convenience, template names will be normalized in two ways:
- They are preprocessed, which means that any templates (
{{ }}
) and arguments ({{{ }}}
) they contain will be resolved. - Any redirects will be converted to their canonical equivalents (e.g. Lua error in package.lua at line 80: module 'Module:string/decode entities' not found is treated as Lua error in package.lua at line 80: module 'Module:string/decode entities' not found).
Note that any templates with invalid names (after preprocessing) will be skipped over. For performance reasons, preprocessing is only applied to the keys in a template's table of arguments, so it should be applied (selectively) to the values by the calling module when needed.
Note that the parser will respect <noinclude>
, <includeonly>
and <onlyinclude>
tags. By default, text is treated as though it has been transcluded, which means that text between <noinclude>
tags will be ignored, and <onlyinclude>
tags will be respected if present. If the parameter not_transcluded is set to true
, then text will be treated as though it has not been transcluded, which means text between <includeonly>
tags will be ignored instead.
Although the parser is very accurate, some discrepancies may still exist between it and the native parser in certain cases.
--[[
NOTE: This module works by using recursive backtracking to build a node tree, which can then be traversed as necessary.
Because it is called by a number of high-use modules, it has been optimised for speed using a profiler, since it is used to scrape data from large numbers of pages very quickly. To that end, it rolls some of its own methods in cases where this is faster than using a function from one of the standard libraries. Please DO NOT "simplify" the code by removing these, since you are almost guaranteed to slow things down, which could seriously impact performance on pages which call this module hundreds or thousands of times.
It has also been designed to emulate the native parser's behaviour as much as possible, which in some cases means replicating bugs or unintuitive behaviours in that code; these should not be "fixed", since it is important that the outputs are the same. Most of these originate from deficient regular expressions, which can't be used here, so the bugs have to be manually reintroduced as special cases (e.g. onlyinclude tags being case-sensitive and whitespace intolerant, unlike all other tags). If any of these are fixed, this module should also be updated accordingly.
]]
local m_parser = require("Module:parser")
local concat = table.concat
local decode_entities = require("Module:string/decode entities")
local find = string.find
local gsub = string.gsub
local insert = table.insert
local is_node = m_parser.is_node
local lower = string.ulower
local match = string.match
local new_title = mw.title.new
local next = next
local pcall = pcall
local php_trim = require("Module:string/php trim")
local rawset = rawset
local rep = string.rep
local scribunto_param_key = require("Module:utilities/scribunto parameter key")
local select = select
local sub = string.sub
local tostring = m_parser.tostring
local type = type
local type_or_class = m_parser.type_or_class
local umatch = mw.ustring.match
local upper = string.uupper
local data = mw.loadData("Module:template parser/data")
local frame = mw.getCurrentFrame()
local invalid_tag_attribute_name_char = data.invalid_tag_attribute_name_char
local Parser, Node = m_parser.new()
-- Drops every leading NUL, tab, newline, vertical tab, carriage return and space from `text`.
-- Returns the remainder, or the empty string when nothing else is left.
local function php_trim_left(text)
	local remainder = match(text, "[^%z\t-\v\r ].*")
	if remainder then
		return remainder
	end
	return ""
end
-- Preprocesses `text` through its own preprocess method when it is a node.
-- Non-node values (and nodes whose preprocess method yields a falsy result) are returned unchanged.
local function preprocess(text, args)
	if is_node(text) then
		local result = text:preprocess(args)
		if result then
			return result
		end
	end
	return text
end
local export = {}
------------------------------------------------------------------------------------
--
-- Nodes
--
------------------------------------------------------------------------------------
-- Preprocesses each child in order and joins the results into one string.
function Node:preprocess(args)
	local pieces = {}
	local count = #self
	for index = 1, count do
		pieces[index] = preprocess(self[index], args)
	end
	return concat(pieces)
end
local Wikitext = Node:new_class("wikitext")

-- Reduces `this` to the simplest equivalent value:
--   * a non-table is returned as-is;
--   * a single-element table is replaced by that element;
--   * a table whose contents can all be concatenated becomes one string;
--   * anything else becomes a wikitext node.
-- When `force_node` is set, the result is always a node (non-node results are wrapped).
function Wikitext:new(this, force_node)
	if type(this) ~= "table" then
		if force_node then
			return (Node.new(self, {this}))
		end
		return this
	end
	if #this == 1 then
		local only = this[1]
		if force_node and not is_node(only) then
			return (Node.new(self, this))
		end
		return only
	end
	-- concat throws if any element is not a string or number, i.e. if the layer contains child nodes.
	local ok, joined = pcall(concat, this)
	if not ok then
		return Node.new(self, this)
	end
	if force_node then
		return (Node.new(self, {joined}))
	end
	return joined
end
-- First value is the argument name.
-- Second value is the argument's default value.
-- Any additional values are ignored: "{{{a|b|c}}}" is argument "a" with default value "b" (*not* "b|c").
local Argument = Node:new_class("argument")

-- Builds an argument node from a parsed layer, keeping only the name and the default value.
-- If the default was parsed as a parameter (i.e. it contained "="), the "=" is put back between its key and value and the whole thing is treated as plain wikitext.
function Argument:new(this)
	local default = this[2]
	if type_or_class(default) == "parameter" then
		insert(default, 2, "=")
		default = Wikitext:new(default)
	end
	local name = this[1]
	return Node.new(self, {name, default})
end
-- Serialises the argument back to wikitext: its parts joined by "|" inside triple braces.
function Argument:__tostring()
	local parts = {}
	for index = 1, #self do
		parts[index] = tostring(self[index])
	end
	local body = concat(parts, "|")
	return "{{{" .. body .. "}}}"
end
-- Iterator step over the argument's children: only indices 1 and 2 are ever visited.
-- Returns the child and its index, or nothing once the index passes 2.
function Argument:next(i)
	local following = i + 1
	if not (following <= 2) then
		return
	end
	return self[following], following
end
-- Returns the argument's name after preprocessing, converted with scribunto_param_key.
function Argument:get_name(args)
	local processed_name = preprocess(self[1], args)
	return scribunto_param_key(processed_name)
end
-- Returns the stringified default value, falling back to the raw "{{{name}}}" form.
-- `args` is accepted but not used.
-- NOTE(review): `tostring` here is Module:parser's version; the fallback is only reachable if it returns a falsy value for a missing default - confirm against that module.
function Argument:get_default(args)
	local default = tostring(self[2])
	if default then
		return default
	end
	return "{{{" .. tostring(self[1]) .. "}}}"
end
-- Resolves the argument to a value. In order of preference:
--   1. the value in `args` under the trimmed, preprocessed name (only when `args` is given);
--   2. the preprocessed default value;
--   3. the literal "{{{name}}}" text.
function Argument:preprocess(args)
	local name
	if args then
		name = preprocess(self[1], args)
		local supplied = args[php_trim(name)]
		if supplied then
			return supplied
		end
	end
	local default = preprocess(self[2], args)
	if default then
		return default
	end
	-- Without `args`, the name is only preprocessed once it is known to be needed.
	if not args then
		name = preprocess(self[1], args)
	end
	return "{{{" .. name .. "}}}"
end
local Parameter = Node:new_class("parameter")

-- Serialises a named parameter as "key=value".
function Parameter:__tostring()
	local key = tostring(self[1])
	local value = tostring(self[2])
	return key .. "=" .. value
end
local Template = Node:new_class("template")

-- Serialises the template back to wikitext: name and parameters joined by "|" inside double braces.
function Template:__tostring()
	local parts = {}
	for index = 1, #self do
		parts[index] = tostring(self[index])
	end
	local body = concat(parts, "|")
	return "{{" .. body .. "}}"
end
-- Builds the table of parameters for this template.
-- Named parameters are keyed by their preprocessed name (converted with scribunto_param_key) and have trimmed values; anonymous parameters are numbered from 1 in order of appearance and are not trimmed.
function Template:get_params(args)
	local params = {}
	local positional = 0
	for index = 2, #self do
		local param = self[index]
		if type_or_class(param) ~= "parameter" then
			positional = positional + 1
			params[positional] = tostring(param) -- Not trimmed.
		else
			local key = scribunto_param_key(preprocess(param[1], args))
			params[key] = php_trim(tostring(param[2]))
		end
	end
	return params
end
-- Normalize the template name, check it's a valid template, then memoize results (using false for invalid titles).
-- Parser functions (e.g. {{#IF:a|b|c}}) need to have the first argument extracted from the title, as it comes after the colon. Because of this, the parser function and first argument are memoized as a table.
-- FIXME: Some parser functions have special argument handling (e.g. {{#SWITCH:}}).
do
local parser_functions = data.parser_functions
local parser_variables = data.parser_variables
local transclusion_modifiers = data.transclusion_modifiers
local memo = {}
-- Replacement for Template:get_params on parser function nodes: every value after the name is stringified and returned as a plain array (no named parameters, no trimming).
local function get_array_params(self)
	local params = {}
	for i = 2, #self do
		params[i - 1] = tostring(self[i])
	end
	return params
end

-- Turns a template node into a parser function node and returns `name`.
-- `arg1` (the text after the colon in the name) is inserted as the first argument, and the node's get_params is swapped for the array version.
-- The conversion is applied at most once per node: get_name can be called repeatedly on the same node, and re-inserting `arg1` each time would duplicate the first argument.
local function convert_to_parser_function(self, name, arg1)
	if self.get_params ~= get_array_params then
		insert(self, 2, arg1)
		self.get_params = get_array_params
	end
	return name
end
-- Mainspace titles starting with "#" should be invalid, but a bug in mw.title.new means a title object is returned that has the empty string for prefixedText, so we need to filter them out. Interwiki links aren't valid as templates, either.
-- A title is usable as a template if it exists as an object, has non-empty prefixed text and has no interwiki prefix.
-- A falsy `title` is returned unchanged.
local function is_valid_title(title)
	if not title then
		return title
	end
	if #title.prefixedText == 0 then
		return false
	end
	return #title.interwiki == 0
end
-- Produces the canonical name for a title, preceded by any transclusion modifiers.
-- Namespace 10 titles use the bare text, namespace 0 titles get a leading ":", and all other namespaces use the full prefixed text.
local function get_normalized_name(title, modifiers)
	local namespace = title.namespace
	local prefix = modifiers and concat(modifiers) or ""
	local base = namespace == 10 and title.text or
		namespace == 0 and (":" .. title.text) or
		title.prefixedText
	return prefix .. base
end
-- Normalizes `name` as a template title (defaulting to namespace 10), follows redirects, and caches the outcome in `memo` under `input`.
-- Returns the normalized name, or nothing (caching false) if the title is invalid.
local function get_template_name(name, input, modifiers)
	local title = new_title(name, 10)
	if not is_valid_title(title) then
		memo[input] = false
		return
	end
	local resolved = get_normalized_name(title, modifiers)
	-- Follow at most two redirects: unlike links, double redirects seem to work (but not higher).
	-- If a redirect target is invalid (e.g. an interwiki link), the last valid name is kept, since the redirect page itself is what gets transcluded.
	local hops = 0
	while hops < 2 do
		hops = hops + 1
		title = title.redirectTarget
		if not is_valid_title(title) then
			break
		end
		resolved = get_normalized_name(title, modifiers)
	end
	memo[input] = resolved
	return resolved
end
-- FIXME: handle INT modifier properly.
-- Resolves this template node's name to its canonical form.
-- Returns the normalized name (a string), or nothing if the title is invalid or the SUBST modifier is used.
-- Results are cached in `memo`, keyed by the preprocessed name: a string for a template or parser variable, a {name, first_argument} table for a parser function, or false for an invalid title.
-- NOTE(review): the parser variable branch below only applies when the node has no arguments (#self == 1), but the memo key does not record the argument count, so a cached result can be reused for a same-named invocation that has a different number of arguments - confirm whether this matters in practice.
function Template:get_name(args)
local name = preprocess(self[1], args)
local norm = memo[name]
if norm then
if type(norm) == "table" then
-- Cached parser function: re-apply the conversion to this node.
return convert_to_parser_function(self, norm[1], norm[2])
end
return norm
elseif norm == false then
-- Cached as an invalid title.
return
end
-- `find` is not the last expression in the list, so only its first result (the position of the first colon) is kept. `start` begins at 1 and `i` (the priority of the last modifier accepted) at 0; `chunk`, `modifiers` and `int` start as nil.
local input, colon, start, i, chunk, modifiers, int = name, find(name, ":", 1, true), 1, 0
-- Consume leading "MODIFIER:" prefixes for as long as each one is a known transclusion modifier with a higher priority than the previous one.
while colon do
chunk = upper(php_trim_left(sub(name, start, colon - 1)))
local priority = transclusion_modifiers[chunk]
if not (priority and priority > i) then
break
elseif chunk == "SUBST" then -- SUBST always fails
return
end
i = priority
start = colon + 1
if chunk == "SAFESUBST" then -- ignore SAFESUBST
-- Drop the prefix from the name entirely, so later offsets are relative to the shortened string.
name = sub(name, colon + 1)
start = 1
else
modifiers = modifiers or {}
modifiers[#modifiers + 1] = chunk .. ":"
if chunk == "INT" then
-- `int` is recorded but not read anywhere in this function (see the FIXME about the INT modifier).
int = true
end
end
colon = find(name, ":", start, true)
end
-- Magic words can't take modifiers (except SUBST/SAFESUBST, which are already dealt with).
if modifiers then
return get_template_name(php_trim(sub(name, start)), input, modifiers)
elseif colon then
-- A colon remains, so the text before it may be a parser function name.
-- NOTE(review): `chunk` was already uppercased in the loop above, so both lookups use the uppercased form; if parser_functions[1] is meant to be case-sensitive (as the parallel parser_variables lookup below suggests), confirm this is intended.
local pf = parser_functions[1][chunk] or parser_functions[2][upper(chunk)]
if pf then
local arg1 = sub(name, colon + 1)
memo[input] = {pf, arg1}
return convert_to_parser_function(self, pf, arg1)
end
elseif #self == 1 then
-- No colon and no arguments: the name may be a parser variable.
name = php_trim(name)
local pv = parser_variables[1][name] or parser_variables[2][upper(name)]
if pv then
memo[input] = pv
return pv
end
return get_template_name(name, input)
end
-- Otherwise, treat it as an ordinary template name.
return get_template_name(php_trim(name), input)
end
end
-- Expands the template by serialising it back to wikitext and handing that to the frame's preprocessor.
function Template:preprocess()
	local wikitext = tostring(self)
	return frame:preprocess(wikitext)
end
local Tag = Node:new_class("tag")

-- Serialises the tag back to wikitext, including its attributes and (unless self-closing) its contents and closing tag.
function Tag:__tostring()
	local name = self.name
	local open_tag, count = {"<", name}, 2
	for attr, value in next, self:get_attributes() do
		-- Quote value using "" by default, '' if it contains ", and leave unquoted if it contains both.
		local quoter
		if not find(value, "\"", 1, true) then
			quoter = "\""
		elseif not find(value, "'", 1, true) then
			quoter = "'"
		elseif not find(value, "%s") then
			quoter = ""
		else
			-- This shouldn't happen, unless the node has been edited manually. Not possible to stringify in a way that can be interpreted by the native parser, since it doesn't recognise escapes.
			error("Tag attribute values cannot contain all three of \", ' and whitespace simultaneously.")
		end
		count = count + 1
		open_tag[count] = " " .. attr .. "=" .. quoter .. value .. quoter
	end
	local opening = concat(open_tag)
	if self.self_closing then
		return opening .. "/>"
	end
	return opening .. ">" .. concat(self) .. "</" .. name .. ">"
end
-- Returns the tag's attributes as a table of lowercase name -> value.
-- `self.attributes` starts out as the raw attribute string captured by the parser (or nil); the first call parses it and caches the resulting table on the node, and later calls return that table directly.
-- Values have runs of tab/newline/CR/space collapsed to a single space, are trimmed and have entities decoded; attributes with no value get the empty string.
function Tag:get_attributes()
	local raw = self.attributes
	if not raw then
		self.attributes = {}
		return self.attributes
	elseif type(raw) == "table" then
		return raw
	end
	-- Ignore a trailing "/" (from a self-closing tag).
	local raw_len = #raw
	if sub(raw, raw_len) == "/" then
		raw = sub(raw, 1, raw_len - 1)
	end
	local attributes, head = {}, 1
	-- Semi-manual implementation of the native regex.
	while true do
		-- Next attribute name, plus the position just after it.
		local name, loc = match(raw, "([^\t\n\f\r />][^\t\n\f\r /=>]*)()", head)
		if not name then
			break
		end
		head = loc
		local value
		-- Optional "=", with optional whitespace either side.
		loc = match(raw, "^[\t\n\f\r ]*=[\t\n\f\r ]*()", head)
		if loc then
			head = loc
			-- Quoted value: the quotes are balanced by %b and then stripped.
			value = match(raw, "^%b\"\"", head) or match(raw, "^%b''", head)
			if value then
				local value_len = #value
				head = head + value_len
				value = sub(value, 2, value_len - 1)
			else
				-- Unquoted value: runs up to the next whitespace. A lone opening quote (with no matching close) is dropped.
				value = match(raw, "^[^\t\n\f\r ]*", head)
				head = head + #value
				if match(value, "^[\"']") then
					value = sub(value, 2)
				end
			end
		end
		-- Skip attributes with invalid names.
		if not (match(name, "^[%-%.]") or umatch(name, invalid_tag_attribute_name_char)) then
			-- gsub returns the replacement count as a second value; the extra parentheses discard it so that only the string is passed on to php_trim.
			attributes[lower(name)] = value and decode_entities(php_trim((gsub(value, "[\t\n\r ]+", " ")))) or ""
		end
	end
	self.attributes = attributes
	return attributes
end
-- Expands the tag by serialising it back to wikitext and handing that to the frame's preprocessor.
function Tag:preprocess()
	local wikitext = tostring(self)
	return frame:preprocess(wikitext)
end
local Heading = Node:new_class("heading")

-- Builds a heading node from a parsed layer.
-- If the layer has several children which can all be concatenated, they are merged into one string, carrying over the layer's level, section and pos fields; otherwise the layer is used as-is.
function Heading:new(this)
	if #this <= 1 then
		return Node.new(self, this)
	end
	local ok, joined = pcall(concat, this)
	if not ok then
		return Node.new(self, this)
	end
	local merged = {
		joined,
		level = this.level,
		section = this.section,
		pos = this.pos
	}
	return Node.new(self, merged)
end
-- Serialises the heading: its contents wrapped in `level` equals signs on each side.
function Heading:__tostring()
	local marker = rep("=", self.level)
	local body = Node.__tostring(self)
	return marker .. body .. marker
end
-- Returns the heading's preprocessed contents, trimmed.
function Heading:get_name(args)
	local contents = Node.preprocess(self, args)
	return php_trim(contents)
end
-- Preprocesses the heading's contents and wraps them in `level` equals signs on each side.
function Heading:preprocess(args)
	local marker = rep("=", self.level)
	local body = Node.preprocess(self, args)
	return marker .. body .. marker
end
------------------------------------------------------------------------------------
--
-- Parser
--
------------------------------------------------------------------------------------
-- Reads from the text relative to the current head, without moving it.
-- `i` is the start offset (default 0) and `j` the end offset (default `i`), so read() gives the current character and read(1) the next one.
function Parser:read(i, j)
	local head = self.head
	local first = i or 0
	local last = j or first
	return sub(self.text, head + first, head + last)
end
-- Moves the head forward by `n`; when `n` is not given, by the current layer's `step` field, or by 1 if that is unset.
function Parser:advance(n)
	local head = self.head
	local distance = n or self[-1].step or 1
	self.head = head + distance
end
-- Passes the next chunk of text to the current layer's handler and returns whatever the handler returns.
-- When `this` is not supplied, the chunk is found using the layer's pattern: if the pattern matches at the head, the chunk is the match itself; if it matches later, the chunk is everything before the match; if it never matches, the chunk is the rest of the text.
-- The chunk's length is stored as the layer's `step`.
function Parser:consume(this)
	local layer = self[-1]
	if not this then
		local text, head = self.text, self.head
		local loc1, loc2 = find(text, layer.pattern, head)
		local stop
		if loc1 then
			if loc1 == head then
				stop = loc2
			else
				stop = loc1 - 1
			end
		end
		this = sub(text, head, stop)
	end
	layer.step = #this
	return layer.handler(self, this)
end
-- Template or argument.
-- Parsed by matching the opening braces innermost-to-outermost (ignoring lone closing braces). Arguments {{{ }}} take priority over templates {{ }} where possible, but a double closing brace will always result in a closure, even if there are 3+ opening braces.
-- For example, "{{{{foo}}}}" (4) is parsed as an argument enclosed by single braces, and "{{{{{foo}}}}}" (5) is an argument inside a template. However, "{{{{{foo }} }}}" is a template inside an argument, due to "}}" forcing the closure of the inner node.
do
-- Handlers.
local handle_name
local handle_parameter
-- Handler for the name part of a template/argument, i.e. everything before the first "|".
-- On its first call, this function replaces itself with whatever self:switch returns for the dispatch table below, then delegates to that replacement. self:switch is defined in Module:parser, so its exact semantics are not visible here (presumably the keys are matched against the consumed chunk, with [false] as the fallback - TODO confirm).
function handle_name(self, ...)
handle_name = self:switch(handle_name, {
["\n"] = Parser.heading_block,
["<"] = Parser.tag,
["["] = Parser.wikilink_block,
["{"] = Parser.braces,
-- End of the name: emit it, allow "=" in the layer's pattern from now on, and start the first parameter.
["|"] = function(self)
self:emit(Wikitext:new(self:pop_sublayer()))
self[-1].pattern = "[\n<=%[{|}]"
self:push_sublayer(handle_parameter)
end,
-- "}}" closes the node (with no parameters); a lone "}" is ordinary text.
["}"] = function(self)
if self:read(1) == "}" then
self:emit(Wikitext:new(self:pop_sublayer()))
return self:pop()
end
self:emit("}")
end,
-- The empty string fails the route, so the opening braces end up treated as plain text by the caller.
[""] = Parser.fail_route,
[false] = Parser.emit
})
return handle_name(self, ...)
end
-- Handler for template/argument parameters, i.e. everything after the first "|".
-- On its first call, this function replaces itself with whatever self:switch returns for the dispatch table below, then delegates to that replacement. self:switch is defined in Module:parser, so its exact semantics are not visible here (presumably the keys are matched against the consumed chunk, with [false] as the fallback - TODO confirm).
function handle_parameter(self, ...)
	-- Pops the current sublayer as the parameter's value and emits it, wrapped in a Parameter node if the "=" handler recorded a key for it.
	local function emit_parameter(self)
		local param = Wikitext:new(self:pop_sublayer())
		local layer = self[-1]
		local key = layer.key
		if key then
			param = Parameter:new{key, param}
			layer.key = nil
		end
		self:emit(param)
	end

	handle_parameter = self:switch(handle_parameter, {
		["\n"] = function(self)
			-- Once a key has been seen, "=" is no longer special, so any heading block can open.
			if self[-1].key then
				return self:heading_block()
			end
			-- Otherwise, a newline followed by a single "=" is a parameter equals sign, so only "\n==" opens a heading block.
			self:newline()
			while self:read(0, 2) == "\n==" do
				self:advance()
				-- Wrapped in Wikitext:new for consistency with Parser:heading_block, so that the parent layer never receives a raw layer table.
				self:emit(Wikitext:new(select(2, self:get("do_heading_block"))))
			end
		end,
		["<"] = Parser.tag,
		["="] = function(self)
			-- Everything so far is the key; the value is collected in a fresh sublayer whose own pattern no longer includes "=".
			local key = Wikitext:new(self:pop_sublayer())
			self[-1].key = key
			self:push_sublayer(handle_parameter)
			rawset(self[-1], "pattern", "[\n<%[{|}]")
		end,
		["["] = Parser.wikilink_block,
		["{"] = Parser.braces,
		["|"] = function(self)
			emit_parameter(self)
			self:push_sublayer(handle_parameter)
		end,
		-- "}}" closes the node; a lone "}" is ordinary text.
		["}"] = function(self)
			if self:read(1) == "}" then
				emit_parameter(self)
				return self:pop()
			end
			self:emit("}")
		end,
		[""] = Parser.fail_route,
		[false] = Parser.emit
	})
	return handle_parameter(self, ...)
end
-- Route entry point for a template/argument: sets the layer's pattern and starts collecting the name in a sublayer.
function Parser:do_template_or_argument()
	local layer = self[-1]
	layer.pattern = "[\n<%[{|}]"
	self:push_sublayer(handle_name)
end
-- Parses a run of two or more opening braces into nested templates/arguments, working from the innermost braces outwards.
-- `braces` tracks how many opening braces are still unmatched; each pass of the loop tries to parse one node and uses up three of them (argument) or two (template).
function Parser:template_or_argument()
local text, head, node_to_emit = self.text, self.head
-- Comments/tags interrupt the brace count.
local braces = #match(text, "^{+", head)
self:advance(braces)
repeat
local success, node = self:get("do_template_or_argument")
if not success then
-- No closure found: the remaining opening braces are plain text.
self:emit(rep("{", braces))
break
elseif node_to_emit then
-- Nest the already-parsed node at the start of the new node.
local node1 = node[1]
node[1] = (
node1 == "" and node_to_emit or
Wikitext:new{node_to_emit, node1}
)
end
-- "}}}" makes an argument, so long as at least three opening braces remain; otherwise "}}" makes a template.
if self:read(2) == "}" and braces > 2 then
self:advance(3)
braces = braces - 3
node = Argument:new(node)
else
self:advance(2)
braces = braces - 2
node = Template:new(node)
end
-- Record where the node starts in the text and its raw source.
local pos = head + braces
node.pos = pos
node.raw = sub(text, pos, self.head - 1)
node_to_emit = node
-- A single leftover opening brace can't open anything, so it's plain text.
if braces == 1 then
self:emit("{")
break
end
until braces == 0
-- Any leftover braces were emitted above, so they come out before the outermost parsed node.
if node_to_emit then
self:emit(node_to_emit)
end
return self:consume()
end
end
-- Tag.
do
local tags = data.tags
-- Handlers.
local handle_start
local handle_tag
-- Whether the tag named `this` is one whose tags are dropped (leaving the contents): includeonly when the text is transcluded, and noinclude/onlyinclude when it isn't.
local function is_ignored_tag(self, this)
	if not self.transcluded then
		return this == "noinclude" or this == "onlyinclude"
	end
	return this == "includeonly"
end
-- Skips to the ">" which closes an ignored tag and pops the layer with its `ignored` flag set; fails the route if there is no ">".
local function ignored_tag(self, text, head)
	local close = find(text, ">", head, true)
	if close then
		self.head = close
		self[-1].ignored = true
		return self:pop()
	end
	return self:fail_route()
end
-- Handler for the start of a tag. `this` is the first chunk after "<", as split by the pattern set in Parser:do_tag (whitespace, "/" or ">").
function handle_start(self, this)
if this == "/" then
-- Closing tag: only the closing tags of ignored tags are consumed here; anything else fails the route.
local text, head = self.text, self.head + 1
local this = match(text, "^[^%s/>]+", head)
if this and is_ignored_tag(self, lower(this)) then
head = head + #this
-- A "/" after the name must be followed directly by ">".
if not match(text, "^/[^>]", head) then
return ignored_tag(self, text, head)
end
end
return self:fail_route()
elseif this == "" then
return self:fail_route()
end
-- Opening tag: the name must be a key in the `tags` data table.
this = lower(this)
if not tags[this] then
return self:fail_route()
end
local layer = self[-1]
local text, head = self.text, self.head + layer.step
if match(text, "^/[^>]", head) then
return self:fail_route()
elseif is_ignored_tag(self, this) then
return ignored_tag(self, text, head)
elseif this == "noinclude" or this == "includeonly" then
-- This is the inclusion tag which is not an ignored tag in the current mode, so the whole block is dropped.
layer.ignored = true -- Ignored block.
end
-- Move on to the attributes and the closing ">".
layer.name, layer.handler, layer.pattern = this, handle_tag, ">"
end
-- Handler for the remainder of an opening tag, once the name has been read. `this` is either the text before ">" (the raw attributes), ">" itself, or the empty string (end of text).
function handle_tag(self, this)
if this == "" then
return self:fail_route()
elseif this ~= ">" then
-- Store the raw attribute string, minus any trailing "/"; it is parsed later by Tag:get_attributes.
local this_len = #this
self[-1].attributes = sub(this, this_len) == "/" and sub(this, 1, this_len - 1) or this
return
elseif self:read(-1) == "/" then
-- "/>" makes the tag self-closing.
self[-1].self_closing = true
return self:pop()
end
-- Look for the closing tag, using the pattern stored for this tag name in the `tags` data table.
local text, head, layer = self.text, self.head + 1, self[-1]
local loc1, loc2 = find(text, tags[layer.name], head)
if loc1 then
-- Everything between the opening and closing tags is emitted as one raw string.
if loc1 > head then
self:emit(sub(text, head, loc1 - 1))
end
self.head = loc2
return self:pop()
elseif layer.ignored then
-- An unclosed ignored block runs to the end of the text.
self.head = #self.text
return self:pop()
end
return self:fail_route()
end
-- Route entry point for a tag: installs the start handler and its pattern on the new layer, then steps forward.
function Parser:do_tag()
	local layer = self[-1]
	layer.pattern = "[%s/>]"
	layer.handler = handle_start
	self:advance()
end
-- Plain-text search for `pattern` in `text` from `head`: returns the index of the last character of the match, or the length of the text if there is no match.
local function find_next_chunk(text, pattern, head)
	local _, stop = find(text, pattern, head, true)
	return stop or #text
end
-- Handles "<": an HTML comment, a closing onlyinclude tag (when onlyinclude mode is active), or an ordinary tag.
function Parser:tag()
	-- HTML comment: skip to the end of "-->" (or the end of the text).
	if self:read(1, 3) == "!--" then
		self.head = find_next_chunk(self.text, "-->", self.head + 4)
		return
	end
	-- onlyinclude closing tag: skip to the end of the next "<onlyinclude>" (or the end of the text).
	if self.onlyinclude and self:read(1, 13) == "/onlyinclude>" then
		self.head = find_next_chunk(self.text, "<onlyinclude>", self.head + 14)
		return
	end
	local success, tag = self:get("do_tag")
	if not success then
		-- Not a tag after all, so "<" is plain text.
		self:emit("<")
		return
	end
	if not tag.ignored then
		self:emit(Tag:new(tag))
	end
end
end
-- Heading.
-- The preparser assigns each heading a number, which is used for things like section edit links. The preparser will only do this for heading blocks which aren't nested inside templates, arguments and parser tags. In some cases (e.g. when template blocks contain untrimmed newlines), a preparsed heading may not be treated as a heading in the final output. That does not affect the preparser, however, which will always count sections based on the preparser heading count, since it can't know what a template's final output will be.
do
-- Handlers.
local handle_start
local handle_body
local handle_possible_end
-- Handler for the first chunk after a heading's opening run of equals signs (whose length Parser:do_heading stores in layer.level).
-- On its first call, this function replaces itself with whatever self:switch returns for the dispatch table below, then delegates to that replacement (self:switch is defined in Module:parser and is not shown here).
function handle_start(self, ...)
-- Called when the line contains nothing but equals signs (plus trailing whitespace/comments): the outer signs become the heading markers and the inner ones the heading text.
-- ===== is "=" as an L2; ======== is "==" as an L3 etc.
local function newline(self)
local layer = self[-1]
local eq = layer.level
-- One or two equals signs can't form a heading on their own.
if eq <= 2 then
return self:fail_route()
end
-- Calculate which equals signs determine the heading level.
local level_eq = eq - (2 - eq % 2)
level_eq = level_eq > 12 and 12 or level_eq
-- Emit the excess.
self:emit(rep("=", eq - level_eq))
layer.level = level_eq / 2
return self:pop()
end
-- Called on a tab or space: looks ahead to see whether only whitespace/comments remain on the line. The lookahead route succeeds if other content is found, in which case parsing carries on with the heading body; otherwise the line is all equals signs.
local function whitespace(self)
local success, possible_end = self:get("do_heading_possible_end")
if success then
self:emit(Wikitext:new(possible_end))
local layer = self[-1]
layer.handler, layer.pattern = handle_body, "[\n<={]"
return self:consume()
end
return newline(self)
end
handle_start = self:switch(handle_start, {
["\t"] = whitespace,
["\n"] = newline,
[" "] = whitespace,
[""] = newline,
[false] = function(self)
-- Emit any excess = signs once we know it's a conventional heading. Up till now, we couldn't know if the heading is just a string of = signs (e.g. ========), so it wasn't guaranteed that the heading text starts after the 6th.
local layer = self[-1]
local eq = layer.level
if eq > 6 then
-- The excess is emitted with a leading 1, as is also done in handle_body; presumably this places it at the start of the layer (Parser:emit is not shown here - TODO confirm).
self:emit(1, rep("=", eq - 6))
layer.level = 6
end
-- Hand over to the body handler and re-consume the current chunk with it.
layer.handler, layer.pattern = handle_body, "[\n<={]"
return self:consume()
end
})
return handle_start(self, ...)
end
-- Handler for the body of a heading, i.e. the text after the opening equals signs.
-- On its first call, this function replaces itself with whatever self:switch returns for the dispatch table below, then delegates to that replacement (self:switch is defined in Module:parser and is not shown here).
function handle_body(self, ...)
handle_body = self:switch(handle_body, {
-- A newline before any closing equals signs means this isn't a heading.
["\n"] = Parser.fail_route,
["<"] = Parser.tag,
["="] = function(self)
-- Comments/tags interrupt the equals count.
local eq = match(self.text, "^=+", self.head)
local eq_len = #eq
self:advance(eq_len)
-- Look ahead: if other content follows on the same line, these equals signs are just part of the heading text.
local success, possible_end = self:get("do_heading_possible_end")
if success then
self:emit(eq)
self:emit(Wikitext:new(possible_end))
return self:consume()
end
-- Otherwise they close the heading. The level is the smaller of the opening and closing counts; any surplus on either side becomes heading text.
local layer = self[-1]
local level = layer.level
if eq_len > level then
self:emit(rep("=", eq_len - level))
elseif level > eq_len then
layer.level = eq_len
self:emit(1, rep("=", level - eq_len))
end
return self:pop()
end,
-- Only "{{" opens a template/argument; a lone "{" is plain text.
["{"] = function(self)
if self:read(1) == "{" then
return self:template_or_argument()
end
self:emit("{")
end,
[""] = Parser.fail_route,
[false] = Parser.emit
})
return handle_body(self, ...)
end
-- Lookahead handler used to decide whether the rest of the line holds only tabs, spaces and comments.
-- The route fails if a newline or the end of the text is reached (so the line really does end here), and succeeds (pops) as soon as any other content is found.
-- On its first call, this function replaces itself with whatever self:switch returns for the dispatch table below, then delegates to that replacement (self:switch is defined in Module:parser and is not shown here).
function handle_possible_end(self, ...)
handle_possible_end = self:switch(handle_possible_end, {
["\n"] = Parser.fail_route,
["<"] = function(self)
-- Skip over a closed HTML comment; anything else starting with "<" (including an unclosed comment) counts as content.
local head = (
self:read(1, 3) == "!--" and
select(2, find(self.text, "-->", self.head + 4, true))
)
if not head then
return self:pop()
end
self.head = head
end,
[""] = Parser.fail_route,
[false] = function(self, this)
-- Anything other than a run of tabs/spaces counts as content.
if not match(this, "^[\t ]+$") then
return self:pop()
end
self:emit(this)
end
})
return handle_possible_end(self, ...)
end
-- Route entry point for a heading: records the start position, counts the opening equals signs (stored provisionally as the level) and moves past them.
function Parser:do_heading()
	local layer, head = self[-1], self.head
	layer.handler, layer.pattern, layer.pos = handle_start, "[\t\n ]", head
	-- Comments/tags interrupt the equals count.
	local opening = match(self.text, "^=+", head)
	local count = #opening
	layer.level = count
	self:advance(count)
end
-- Route entry point for the end-of-heading lookahead: installs its handler and pattern on the new layer.
function Parser:do_heading_possible_end()
	local layer = self[-1]
	layer.pattern = "[\n<]"
	layer.handler = handle_possible_end
end
-- Tries to parse a heading at the head. On success, the heading is given the next section number and emitted; on failure, "=" is emitted as plain text.
function Parser:heading()
	local success, heading = self:get("do_heading")
	if not success then
		self:emit("=")
		return
	end
	local section = self.section + 1
	heading.section = section
	self.section = section
	self:emit(Heading:new(heading))
	return self:consume()
end
end
------------------------------------------------------------------------------------
--
-- Block handlers
--
------------------------------------------------------------------------------------
-- Block handlers.
-- These are blocks which can affect template/argument parsing, since they're also parsed by Parsoid at the same time (even though they aren't processed until later).
-- All blocks (including templates/arguments) can nest inside each other, but an inner block must be closed before the outer block which contains it. This is why, for example, the wikitext "{{template| [[ }}" will result in an unprocessed template, since the inner "[[" is treated as the opening of a wikilink block, which prevents "}}" from being treated as the closure of the template block. On the other hand, "{{template| [[ ]] }}" will process correctly, since the wikilink block is closed before the template closure. It makes no difference whether the block will be treated as valid or not when it's processed later on, so "{{template| [[ }} ]] }}" would also work, even though "[[ }} ]]" is not a valid wikilink.
-- Note that nesting also affects pipes and equals signs, in addition to block closures.
-- These blocks can be nested to any degree, so "{{template| [[ [[ [[ ]] }}" will not work, since only one of the three wikilink blocks has been closed. On the other hand, "{{template| [[ [[ [[ ]] ]] ]] }}" will work.
-- All blocks are implicitly closed by the end of the text, since their validity is irrelevant at this stage.
-- Language conversion block.
-- Opens with "-{" and closes with "}-". However, templates/arguments take priority, so "-{{" is parsed as "-" followed by the opening of a template/argument block (depending on what comes after).
-- Note: Language conversion blocks aren't actually enabled on the English Wiktionary, but Parsoid still parses them at this stage, so they can affect the closure of outer blocks: e.g. "[[ -{ ]]" is not a valid wikilink block, since the "]]" falls inside the new language conversion block.
do
-- Handler for the inside of a language conversion block ("-{ ... }-").
-- On its first call, this function replaces itself with whatever self:switch returns for the dispatch table below, then delegates to that replacement (self:switch is defined in Module:parser and is not shown here).
local function handle_language_conversion_block(self, ...)
handle_language_conversion_block = self:switch(handle_language_conversion_block, {
["\n"] = Parser.heading_block,
["<"] = Parser.tag,
["["] = Parser.wikilink_block,
["{"] = Parser.braces,
-- "}-" closes the block; a lone "}" is plain text.
["}"] = function(self)
if self:read(1) == "-" then
self:emit("}-")
self:advance()
return self:pop()
end
self:emit("}")
end,
-- The end of the text closes the block implicitly.
[""] = Parser.pop,
[false] = Parser.emit
})
return handle_language_conversion_block(self, ...)
end
-- Route entry point for a language conversion block: installs its handler and pattern on the new layer.
function Parser:do_language_conversion_block()
	local layer = self[-1]
	layer.pattern = "[\n<%[{}]"
	layer.handler = handle_language_conversion_block
end
-- Handles "{" inside a block: "{{" opens a template/argument, "-{" opens a language conversion block, and anything else is a plain "{".
function Parser:braces()
	if self:read(1) == "{" then
		return self:template_or_argument()
	end
	self:emit("{")
	if self:read(-1) ~= "-" then
		return
	end
	self:advance()
	self:emit(Wikitext:new(select(2, self:get("do_language_conversion_block"))))
end
end
-- Headings
-- Opens with "\n=" (or "=" at the start of the text), and closes with "\n" or the end of the text. Note that it doesn't matter whether the heading will fail to process due to a premature newline (e.g. if there are no closing signs), so at this stage the only thing that matters for closure is the newline or end of text.
-- Note: Heading blocks are only parsed like this if they occur inside a template, since they do not iterate the preparser's heading count (i.e. they aren't proper headings).
-- Note 2: if directly inside a template parameter with no previous equals signs, a newline followed by a single equals sign is parsed as a parameter equals sign, not the opening of a new L1 heading block. This does not apply to any other heading levels. As such, {{template|parameter\n=}}, {{template|key\n=value}} or even {{template|\n=}} will successfully close, but {{template|parameter\n==}}, {{template|key=value\n=more value}}, {{template\n=}} etc. will not, since in the latter cases the "}}" would fall inside the new heading block.
do
-- Handler for the inside of a heading block (a line starting with "=" inside another block).
-- On its first call, this function replaces itself with whatever self:switch returns for the dispatch table below, then delegates to that replacement (self:switch is defined in Module:parser and is not shown here).
local function handle_heading_block(self, ...)
handle_heading_block = self:switch(handle_heading_block, {
-- A newline closes the block.
["\n"] = function(self)
self:newline()
return self:pop()
end,
["<"] = Parser.tag,
["["] = Parser.wikilink_block,
["{"] = Parser.braces,
-- The end of the text closes the block implicitly.
[""] = Parser.pop,
[false] = Parser.emit
})
return handle_heading_block(self, ...)
end
-- Route entry point for a heading block: installs its handler and pattern on the new layer.
function Parser:do_heading_block()
	local layer = self[-1]
	layer.pattern = "[\n<%[{]"
	layer.handler = handle_heading_block
end
-- Handles a newline inside a block: emits it, then parses a heading block for each following line which starts with "=".
function Parser:heading_block()
	self:newline()
	while true do
		if self:read(0, 1) ~= "\n=" then
			break
		end
		self:advance()
		self:emit(Wikitext:new(select(2, self:get("do_heading_block"))))
	end
end
end
-- Wikilink block.
-- Opens with "[[" and closes with "]]".
do
-- Handler for the inside of a wikilink block ("[[ ... ]]").
-- On its first call, this function replaces itself with whatever self:switch returns for the dispatch table below, then delegates to that replacement (self:switch is defined in Module:parser and is not shown here).
local function handle_wikilink_block(self, ...)
handle_wikilink_block = self:switch(handle_wikilink_block, {
["\n"] = Parser.heading_block,
["<"] = Parser.tag,
["["] = Parser.wikilink_block,
-- "]]" closes the block; a lone "]" is plain text.
["]"] = function(self)
if self:read(1) == "]" then
self:emit("]]")
self:advance()
return self:pop()
end
self:emit("]")
end,
["{"] = Parser.braces,
-- The end of the text closes the block implicitly.
[""] = Parser.pop,
[false] = Parser.emit
})
return handle_wikilink_block(self, ...)
end
-- Route entry point for a wikilink block: installs its handler and pattern on the new layer.
function Parser:do_wikilink_block()
	local layer = self[-1]
	layer.pattern = "[\n<%[%]{]"
	layer.handler = handle_wikilink_block
end
-- Handles "[": "[[" opens a wikilink block, whose contents are parsed and emitted; a lone "[" is plain text.
function Parser:wikilink_block()
	if self:read(1) ~= "[" then
		self:emit("[")
		return
	end
	self:emit("[[")
	self:advance(2)
	self:emit(Wikitext:new(select(2, self:get("do_wikilink_block"))))
end
end
-- Lines which only contain comments, " " and "\t" are eaten, so long as they're bookended by "\n" (i.e. not the first or last line).
-- Emits a newline, first moving the head past any following lines which consist solely of comments, spaces and tabs and which are themselves ended by a newline.
function Parser:newline()
local text, head = self.text, self.head
while true do
-- Skip over each comment on the line (with optional leading tabs/spaces), leaving `head` on the final ">" of the last closed comment. An unclosed comment stops the scan.
repeat
local loc = match(text, "^[\t ]*<!%-%-()", head + 1)
if not loc then
break
end
loc = select(2, find(text, "-->", loc, true))
head = loc or head
until not loc
-- Fail if no comments found.
if head == self.head then
break
end
-- The rest of the line must be tabs/spaces followed by a newline; `head` becomes the position of that newline.
head = match(text, "^[\t ]*()\n", head + 1)
if not head then
break
end
-- The line is eaten: carry on from its newline and check the next line.
self.head = head
end
self:emit("\n")
end
do
-- Handlers.
local handle_start
local main_handler
-- If the first character is "=", try parsing it as a heading.
-- Handler for the very first character of the text: switches the layer over to the main handler, then tries a heading if the text starts with "="; otherwise the text is consumed again using the main handler.
function handle_start(self, this)
	local layer = self[-1]
	layer.handler, layer.pattern = main_handler, "[\n<{]"
	if this ~= "=" then
		return self:consume()
	end
	return self:heading()
end
-- Top-level handler for the text outside of any block.
-- On its first call, this function replaces itself with whatever self:switch returns for the dispatch table below, then delegates to that replacement (self:switch is defined in Module:parser and is not shown here).
function main_handler(self, ...)
main_handler = self:switch(main_handler, {
-- A line starting with "=" may be a heading.
["\n"] = function(self)
self:newline()
if self:read(1) == "=" then
self:advance()
return self:heading()
end
end,
["<"] = Parser.tag,
-- Only "{{" opens a template/argument; a lone "{" is plain text.
["{"] = function(self)
if self:read(1) == "{" then
return self:template_or_argument()
end
self:emit("{")
end,
-- End of the text.
[""] = Parser.pop,
[false] = Parser.emit
})
return main_handler(self, ...)
end
-- If `transcluded` is true, then the text is checked for a pair of onlyinclude tags. If these are found (even if they're in the wrong order), then the start of the page is treated as though it is preceded by a closing onlyinclude tag.
-- Note 1: unlike other parser extension tags, onlyinclude tags are case-sensitive and cannot contain whitespace.
-- Note 2: onlyinclude tags *can* be implicitly closed by the end of the text, but the hard requirement above means this can only happen if either the tags are in the wrong order or there are multiple onlyinclude blocks.
-- Route entry point for a whole page: installs the start handler and resets the section count.
-- When `transcluded` is set and the text contains both a closing and an opening onlyinclude tag (in any order), onlyinclude mode is enabled and parsing begins just after the first opening tag.
function Parser:do_parse(transcluded)
	local layer = self[-1]
	layer.handler, layer.pattern = handle_start, "."
	self.section = 0
	if transcluded then
		self.transcluded = true
		local text = self.text
		local opening = find(text, "</onlyinclude>", 1, true) and find(text, "<onlyinclude>", 1, true)
		if opening then
			self.onlyinclude = true
			-- 13 is the length of "<onlyinclude>".
			self.head = opening + 13
		end
	end
end
-- Parses `text` and returns the second value returned by Parser:parse (the root node, which is always a node as the Wikitext class is given with force_node set).
-- `text` must be a string or a number (numbers are converted to strings); anything else throws an error.
function export.parse(text, transcluded)
	local text_type = type(text)
	if text_type == "number" then
		text = tostring(text)
	elseif text_type ~= "string" then
		error("bad argument #1 (string expected, got " .. text_type .. ")")
	end
	local _, root = Parser:parse{
		text = text,
		node = {Wikitext, true},
		route = {"do_parse", transcluded}
	}
	return root
end
end
-- Parses `text` as a single template invocation and returns its name and parameter table.
-- Returns nothing if the text as a whole is not one template, or if the template's name is invalid.
function export.parseTemplate(text, not_transcluded)
	local node = export.parse(text, not not_transcluded)
	if type_or_class(node) ~= "template" then
		return
	end
	local name = node:get_name()
	if not name then
		return
	end
	return name, node:get_params()
end
do
-- Iterator step for findTemplates: pulls nodes from `iterate` until it finds a template with a valid name, then returns the name, parameters, raw source and position. Returns nothing once the nodes run out.
local function next_template(iterate)
	local node = iterate()
	while node do
		if type_or_class(node) == "template" then
			local name = node:get_name()
			if name then
				return name, node:get_params(), node.raw, node.pos
			end
		end
		node = iterate()
	end
end
-- Returns an iterator (for use in a `for` statement) over the templates in `text`, yielding the name, parameters, raw source and position of each.
function export.findTemplates(text, not_transcluded)
	local tree = export.parse(text, not not_transcluded)
	return next_template, tree:__pairs("next_node")
end
end
-- Returns an iterator over the arguments ("{{{ }}}") in `text`, yielding the name, default value, raw source and position of each. Arguments for which get_name returns a falsy value are skipped.
function export.findArguments(text, args, not_transcluded)
	local tree = export.parse(text, not not_transcluded)
	local iterate = tree:__pairs("next_node")
	return function()
		local node = iterate()
		while node do
			if type_or_class(node) == "argument" then
				local name = node:get_name(args)
				if name then
					return name, node:get_default(args), node.raw, node.pos
				end
			end
			node = iterate()
		end
	end
end
do
-- Validates a heading level: it must be a whole number from 1 to 6. Returns the level unchanged, or throws an error.
local function check_level(level)
	if type(level) ~= "number" then
		error("Heading levels must be numbers.")
	end
	local in_range = level >= 1 and level <= 6
	if not (in_range and level % 1 == 0) then
		error("Heading levels must be integers between 1 and 6.")
	end
	return level
end
-- Note: heading names can contain "\n" (e.g. inside nowiki tags), which causes any heading containing them to fail. When that happens, the heading is not returned by this function, but the heading count is still iterated, since Parsoid's preprocessor still counts it as a heading for the purpose of heading strip markers (i.e. the section number).
-- TODO: section numbers for edit links seem to also include headings nested inside templates and arguments (but apparently not those in parser extension tags - need to test this more). If we ever want to add section edit links manually, this will need to be accounted for.
-- Returns an iterator over the headings in `text` whose level is between `i` (default 1) and `j` (default 6) inclusive, yielding the name, level, section number and position of each.
-- Headings whose name contains a newline are skipped.
function export.findHeadings(text, i, j)
	local lowest = i and check_level(i) or 1
	local highest = j and check_level(j) or 6
	local iterate = export.parse(text):__pairs("next_node")
	return function()
		local node = iterate()
		while node do
			if type_or_class(node) == "heading" then
				local level = node.level
				if level >= lowest and level <= highest then
					local name = node:get_name()
					if not find(name, "\n", 1, true) then
						return name, level, node.section, node.pos
					end
				end
			end
			node = iterate()
		end
	end
end
end
return export