`` characters, you
may want to call :class:`ParserElement.parse_with_tabs`
Example::
wd = Word(alphas)
for match in locatedExpr(wd).search_string("ljsdf123lksdjjf123lkkjj1222"):
print(match)
prints::
[[0, 'ljsdf', 5]]
[[8, 'lksdjjf', 15]]
[[18, 'lkkjj', 23]]
"""
locator = Empty().set_parse_action(lambda ss, ll, tt: ll)
return Group(
locator("locn_start")
+ expr("value")
+ locator.copy().leaveWhitespace()("locn_end")
)
def nested_expr(
    opener: Union[str, ParserElement] = "(",
    closer: Union[str, ParserElement] = ")",
    content: typing.Optional[ParserElement] = None,
    ignore_expr: ParserElement = quoted_string(),
    *,
    ignoreExpr: ParserElement = quoted_string(),
) -> ParserElement:
    """Build an expression that parses arbitrarily nested, delimited lists.

    The returned expression matches ``opener``, any number of items (which
    may themselves be nested lists), and ``closer``; each nesting level is
    returned as its own group with the delimiters suppressed.

    Parameters:

    - ``opener`` - opening delimiter (default= ``"("``); a string or a
      pyparsing expression
    - ``closer`` - closing delimiter (default= ``")"``); a string or a
      pyparsing expression
    - ``content`` - expression for the items found inside the delimiters
      (default= ``None``)
    - ``ignore_expr`` - expression whose matches may contain delimiter
      characters that must not be treated as nesting delimiters
      (default= :class:`quoted_string`)
    - ``ignoreExpr`` - pre-PEP8 spelling of ``ignore_expr``, retained for
      compatibility but will be removed in a future release

    When ``content`` is not given, both delimiters must be strings, and the
    items are the whitespace-delimited runs of text found between them.

    ``ignore_expr`` is typically a quoted string or a comment expression.
    Combine several expressions with :class:`Or` or :class:`MatchFirst`.
    Pass ``None`` if nothing should be ignored.

    Raises ``ValueError`` if ``opener`` equals ``closer``, or if ``content``
    is omitted while either delimiter is not a string.

    Example::

        data_type = one_of("void int short long char float double")
        decl_data_type = Combine(data_type + Opt(Word('*')))
        ident = Word(alphas+'_', alphanums+'_')
        number = pyparsing_common.number
        arg = Group(decl_data_type + ident)
        LPAR, RPAR = map(Suppress, "()")

        code_body = nested_expr('{', '}', ignore_expr=(quoted_string | c_style_comment))

        c_function = (decl_data_type("type")
                      + ident("name")
                      + LPAR + Opt(DelimitedList(arg), [])("args") + RPAR
                      + code_body("body"))
        c_function.ignore(c_style_comment)

        source_code = '''
            int is_odd(int x) {
                return (x%2);
            }

            int dec_to_hex(char hchar) {
                if (hchar >= '0' && hchar <= '9') {
                    return (ord(hchar)-ord('0'));
                } else {
                    return (10+ord(hchar)-ord('A'));
                }
            }
        '''
        for func in c_function.search_string(source_code):
            print("%(name)s (%(type)s) args: %(args)s" % func)

    prints::

        is_odd (int) args: [['int', 'x']]
        dec_to_hex (int) args: [['char', 'hchar']]
    """
    # Reconcile the two spellings of the "ignore" argument: when the legacy
    # keyword still holds its default, the PEP8-named argument wins.
    if ignoreExpr != ignore_expr and ignoreExpr == quoted_string():
        ignoreExpr = ignore_expr

    if opener == closer:
        raise ValueError("opening and closing strings cannot be the same")

    skip_ignored = ignoreExpr is not None

    if content is None:
        # A default content expression can only be derived from string delimiters.
        if not (isinstance(opener, str_type) and isinstance(closer, str_type)):
            raise ValueError(
                "opening and closing arguments must be strings if no content expression is given"
            )
        opener = typing.cast(str, opener)
        closer = typing.cast(str, closer)
        white_chars = ParserElement.DEFAULT_WHITE_CHARS
        single_char_delims = len(opener) == 1 and len(closer) == 1

        if single_char_delims and skip_ignored:
            # one character at a time, so the ignore lookahead is applied at
            # every position
            content = Combine(
                OneOrMore(
                    ~ignoreExpr
                    + CharsNotIn(
                        opener + closer + white_chars,
                        exact=1,
                    )
                )
            ).set_parse_action(lambda t: t[0].strip())
        elif single_char_delims:
            content = empty.copy() + CharsNotIn(
                opener + closer + white_chars
            ).set_parse_action(lambda t: t[0].strip())
        elif skip_ignored:
            # multi-character delimiters need explicit negative lookaheads
            content = Combine(
                OneOrMore(
                    ~ignoreExpr
                    + ~Literal(opener)
                    + ~Literal(closer)
                    + CharsNotIn(white_chars, exact=1)
                )
            ).set_parse_action(lambda t: t[0].strip())
        else:
            content = Combine(
                OneOrMore(
                    ~Literal(opener)
                    + ~Literal(closer)
                    + CharsNotIn(white_chars, exact=1)
                )
            ).set_parse_action(lambda t: t[0].strip())

    # The expression refers to itself, so it is declared first and defined after.
    ret = Forward()
    nested_item = (ignoreExpr | ret | content) if skip_ignored else (ret | content)
    ret <<= Group(Suppress(opener) + ZeroOrMore(nested_item) + Suppress(closer))

    ret.set_name(f"nested {opener}{closer} expression")
    # don't override error message from content expressions
    ret.errmsg = None
    return ret
def _makeTags(tagStr, xml, suppress_LT=Suppress("<"), suppress_GT=Suppress(">")):
    """Internal helper to construct opening and closing tag expressions, given a tag name.

    Parameters:

    - ``tagStr`` - tag name as a string, or an expression matching the tag
      name (its ``name`` attribute is then used to build results names)
    - ``xml`` - if true, build XML-style tags: the tag keyword is matched
      case-sensitively and attribute values must be double-quoted strings;
      otherwise build HTML-style tags: caseless tag keyword, attribute names
      lowercased, attribute values optional and quoted or unquoted
    - ``suppress_LT`` / ``suppress_GT`` - expressions used for the opening
      ``<`` and closing ``>`` of the opening tag

    Returns a 2-tuple ``(openTag, closeTag)``. Both expressions get a ``tag``
    attribute holding the tag name, and ``openTag`` gets a ``tag_body``
    attribute that skips ahead to the matching closing tag.
    """
    if isinstance(tagStr, str_type):
        resname = tagStr
        tagStr = Keyword(tagStr, caseless=not xml)
    else:
        resname = tagStr.name

    # Suffix for the "start..."/"end..." results names: colons are treated as
    # word breaks, each word is title-cased, and the words are joined
    # (for example "a:b" gives "AB").
    results_suffix = "".join(resname.replace(":", " ").title().split())

    tagAttrName = Word(alphas, alphanums + "_-:")
    if xml:
        tagAttrValue = dbl_quoted_string.copy().set_parse_action(remove_quotes)
        openTag = (
            suppress_LT
            + tagStr("tag")
            + Dict(ZeroOrMore(Group(tagAttrName + Suppress("=") + tagAttrValue)))
            # "empty" is True for a self-closing tag such as <br/>
            + Opt("/", default=[False])("empty").set_parse_action(
                lambda s, l, t: t[0] == "/"
            )
            + suppress_GT
        )
    else:
        tagAttrValue = quoted_string.copy().set_parse_action(remove_quotes) | Word(
            printables, exclude_chars=">"
        )
        openTag = (
            suppress_LT
            + tagStr("tag")
            + Dict(
                ZeroOrMore(
                    Group(
                        tagAttrName.set_parse_action(lambda t: t[0].lower())
                        + Opt(Suppress("=") + tagAttrValue)
                    )
                )
            )
            # "empty" is True for a self-closing tag such as <br/>
            + Opt("/", default=[False])("empty").set_parse_action(
                lambda s, l, t: t[0] == "/"
            )
            + suppress_GT
        )

    # A closing tag is "</" + tag name + ">", with whitespace allowed between
    # the pieces (adjacent=False).
    closeTag = Combine(Literal("</") + tagStr + ">", adjacent=False)

    openTag.set_name(f"<{resname}>")
    # add start results name in parse action now that ungrouped names are not reported at two levels
    openTag.add_parse_action(
        lambda t: t.__setitem__("start" + results_suffix, t.copy())
    )
    closeTag = closeTag("end" + results_suffix).set_name(f"</{resname}>")
    openTag.tag = resname
    closeTag.tag = resname
    openTag.tag_body = SkipTo(closeTag())
    return openTag, closeTag
def make_html_tags(
    tag_str: Union[str, ParserElement]
) -> tuple[ParserElement, ParserElement]:
    """Build the opening-tag and closing-tag expressions for an HTML tag.

    Returns a 2-tuple ``(open_tag, close_tag)`` for the given tag name.
    The tag matches in either upper or lower case, and attributes may use
    namespaces and have quoted or unquoted values.

    Example::

        text = '<td>More info at the <a href="https://github.com/pyparsing/pyparsing/wiki">pyparsing</a> wiki page</td>'
        # make_html_tags returns pyparsing expressions for the opening and
        # closing tags as a 2-tuple
        a, a_end = make_html_tags("A")
        link_expr = a + SkipTo(a_end)("link_text") + a_end

        for link in link_expr.search_string(text):
            # attributes in the <A> tag (like "href" shown here) are
            # also accessible as named results
            print(link.link_text, '->', link.href)

    prints::

        pyparsing -> https://github.com/pyparsing/pyparsing/wiki
    """
    # HTML flavour: delegate to the shared builder with XML rules turned off
    return _makeTags(tag_str, xml=False)
def make_xml_tags(
    tag_str: Union[str, ParserElement]
) -> tuple[ParserElement, ParserElement]:
    """Build the opening-tag and closing-tag expressions for an XML tag.

    Returns a 2-tuple ``(open_tag, close_tag)`` for the given tag name.
    Unlike :class:`make_html_tags`, the tag is matched only in the given
    upper/lower case.

    Example: similar to :class:`make_html_tags`
    """
    # XML flavour: delegate to the shared builder with XML rules turned on
    return _makeTags(tag_str, xml=True)
# Generic opening/closing tag expressions, built with make_html_tags from a
# Word expression (named "any tag") instead of a fixed tag name.
any_open_tag: ParserElement
any_close_tag: ParserElement
any_open_tag, any_close_tag = make_html_tags(
    Word(alphas, alphanums + "_:").set_name("any tag")
)
# HTML5 entity names (with any trailing ";" removed) mapped to their
# replacement characters
_htmlEntityMap = {k.rstrip(";"): v for k, v in html.entities.html5.items()}
# alternation of the most frequently seen entity names, tried first in the regex
_most_common_entities = "nbsp lt gt amp quot apos cent pound euro copy".replace(
    " ", "|"
)
# Matches "&name;" and captures the name in the named group "entity", which is
# the results name that replace_html_entity reads back as ``t.entity``.
common_html_entity = Regex(
    lambda: f"&(?P<entity>{_most_common_entities}|{make_compressed_re(_htmlEntityMap)});"
).set_name("common HTML entity")
def replace_html_entity(s, l, t):
    """Parse action that swaps a matched HTML entity for the character it stands for.

    Looks up the ``entity`` results name of the tokens in the module's entity
    map and returns the mapped value, or ``None`` if the name is not in the map.
    """
    entity_name = t.entity
    return _htmlEntityMap.get(entity_name)
class OpAssoc(Enum):
    """Operator associativity choices.

    Used as the associativity member of each InfixNotationOperatorSpec
    passed to :class:`infix_notation`.
    """

    # operator groups to the left
    LEFT = 1
    # operator groups to the right
    RIGHT = 2
# An operator argument for infix_notation: a single expression or string, or a
# pair of them (infix_notation requires the pair form when the operator takes
# 3 operands).
InfixNotationOperatorArgType = Union[
    ParserElement, str, tuple[Union[ParserElement, str], Union[ParserElement, str]]
]
# One precedence level for infix_notation:
# (operator, number of operands, associativity[, parse action]).
# The parse action member is optional, hence the 4-tuple and 3-tuple forms.
InfixNotationOperatorSpec = Union[
    tuple[
        InfixNotationOperatorArgType,
        int,
        OpAssoc,
        typing.Optional[ParseAction],
    ],
    tuple[
        InfixNotationOperatorArgType,
        int,
        OpAssoc,
    ],
]
def infix_notation(
    base_expr: ParserElement,
    op_list: list[InfixNotationOperatorSpec],
    lpar: Union[str, ParserElement] = Suppress("("),
    rpar: Union[str, ParserElement] = Suppress(")"),
) -> ParserElement:
    """Helper method for constructing grammars of expressions made up of
    operators working in a precedence hierarchy.  Operators may be unary
    or binary, left- or right-associative.  Parse actions can also be
    attached to operator expressions. The generated parser will also
    recognize the use of parentheses to override operator precedences
    (see example below).

    Note: if you define a deep operator list, you may see performance
    issues when using infix_notation. See
    :class:`ParserElement.enable_packrat` for a mechanism to potentially
    improve your parser performance.

    Parameters:

    - ``base_expr`` - expression representing the most basic operand to
      be used in the expression
    - ``op_list`` - list of tuples, one for each operator precedence level
      in the expression grammar; each tuple is of the form ``(op_expr,
      num_operands, right_left_assoc, (optional)parse_action)``, where:

      - ``op_expr`` is the pyparsing expression for the operator; may also
        be a string, which will be converted to a Literal; if ``num_operands``
        is 3, ``op_expr`` is a tuple of two expressions, for the two
        operators separating the 3 terms
      - ``num_operands`` is the number of terms for this operator (must be 1,
        2, or 3)
      - ``right_left_assoc`` is the indicator whether the operator is right
        or left associative, using the pyparsing-defined constants
        ``OpAssoc.RIGHT`` and ``OpAssoc.LEFT``.
      - ``parse_action`` is the parse action to be associated with
        expressions matching this operator expression (the parse action
        tuple member may be omitted); if the parse action is passed
        a tuple or list of functions, this is equivalent to calling
        ``set_parse_action(*fn)``
        (:class:`ParserElement.set_parse_action`)

    - ``lpar`` - expression for matching left-parentheses; if passed as a
      str, then will be parsed as ``Suppress(lpar)``. If lpar is passed as
      an expression (such as ``Literal('(')``), then it will be kept in
      the parsed results, and grouped with them. (default= ``Suppress('(')``)
    - ``rpar`` - expression for matching right-parentheses; if passed as a
      str, then will be parsed as ``Suppress(rpar)``. If rpar is passed as
      an expression (such as ``Literal(')')``), then it will be kept in
      the parsed results, and grouped with them. (default= ``Suppress(')')``)

    Raises ``ValueError`` if an operator spec has a number of operands other
    than 1, 2 or 3, an associativity that is neither ``OpAssoc.LEFT`` nor
    ``OpAssoc.RIGHT``, or (for 3 operands) an operator that is not a tuple
    or list of two expressions.

    Example::

        # simple example of four-function arithmetic with ints and
        # variable names
        integer = pyparsing_common.signed_integer
        varname = pyparsing_common.identifier

        arith_expr = infix_notation(integer | varname,
            [
            ('-', 1, OpAssoc.RIGHT),
            (one_of('* /'), 2, OpAssoc.LEFT),
            (one_of('+ -'), 2, OpAssoc.LEFT),
            ])

        arith_expr.run_tests('''
            5+3*6
            (5+3)*6
            (5+x)*y
            -2--11
            ''', full_dump=False)

    prints::

        5+3*6
        [[5, '+', [3, '*', 6]]]

        (5+3)*6
        [[[5, '+', 3], '*', 6]]

        (5+x)*y
        [[[5, '+', 'x'], '*', 'y']]

        -2--11
        [[['-', 2], '-', ['-', 11]]]
    """

    # captive version of FollowedBy that does not do parse actions or capture results names
    class _FB(FollowedBy):
        def parseImpl(self, instring, loc, doActions=True):
            self.expr.try_parse(instring, loc)
            return loc, []

    _FB.__name__ = "FollowedBy>"

    ret = Forward()
    ret.set_name(f"{base_expr.name}_expression")
    if isinstance(lpar, str):
        lpar = Suppress(lpar)
    if isinstance(rpar, str):
        rpar = Suppress(rpar)

    nested_expr = (lpar + ret + rpar).set_name(f"nested_{base_expr.name}")

    # if lpar and rpar are not suppressed, wrap in group
    if not (isinstance(lpar, Suppress) and isinstance(rpar, Suppress)):
        lastExpr = base_expr | Group(nested_expr)
    else:
        lastExpr = base_expr | nested_expr

    arity: int
    rightLeftAssoc: OpAssoc
    pa: typing.Optional[ParseAction]
    opExpr1: ParserElement
    opExpr2: ParserElement
    matchExpr: ParserElement
    match_lookahead: ParserElement

    # Each pass wraps the expression built so far (lastExpr) in one more
    # precedence level; the level built last becomes the overall expression.
    for operDef in op_list:
        # pad with None so that a 3-tuple spec yields pa=None
        opExpr, arity, rightLeftAssoc, pa = (operDef + (None,))[:4]  # type: ignore[assignment]
        if isinstance(opExpr, str_type):
            opExpr = ParserElement._literalStringClass(opExpr)
        opExpr = typing.cast(ParserElement, opExpr)
        if arity == 3:
            if not isinstance(opExpr, (tuple, list)) or len(opExpr) != 2:
                raise ValueError(
                    "if numterms=3, opExpr must be a tuple or list of two expressions"
                )
            opExpr1, opExpr2 = opExpr
            term_name = f"{opExpr1}{opExpr2} operations"
        else:
            term_name = f"{opExpr} operations"

        if not 1 <= arity <= 3:
            raise ValueError("operator must be unary (1), binary (2), or ternary (3)")

        if rightLeftAssoc not in (OpAssoc.LEFT, OpAssoc.RIGHT):
            raise ValueError("operator must indicate right or left associativity")

        thisExpr: ParserElement = Forward().set_name(term_name)
        thisExpr = typing.cast(Forward, thisExpr)
        match_lookahead = And([])
        if rightLeftAssoc is OpAssoc.LEFT:
            if arity == 1:
                match_lookahead = _FB(lastExpr + opExpr)
                matchExpr = Group(lastExpr + opExpr[1, ...])
            elif arity == 2:
                if opExpr is not None:
                    match_lookahead = _FB(lastExpr + opExpr + lastExpr)
                    matchExpr = Group(lastExpr + (opExpr + lastExpr)[1, ...])
                else:
                    # no operator expression: operands are simply juxtaposed
                    match_lookahead = _FB(lastExpr + lastExpr)
                    matchExpr = Group(lastExpr[2, ...])
            elif arity == 3:
                match_lookahead = _FB(
                    lastExpr + opExpr1 + lastExpr + opExpr2 + lastExpr
                )
                matchExpr = Group(
                    lastExpr + (opExpr1 + lastExpr + opExpr2 + lastExpr)[1, ...]
                )
        elif rightLeftAssoc is OpAssoc.RIGHT:
            if arity == 1:
                # try to avoid LR with this extra test
                if not isinstance(opExpr, Opt):
                    opExpr = Opt(opExpr)
                match_lookahead = _FB(opExpr.expr + thisExpr)
                matchExpr = Group(opExpr + thisExpr)
            elif arity == 2:
                if opExpr is not None:
                    match_lookahead = _FB(lastExpr + opExpr + thisExpr)
                    matchExpr = Group(lastExpr + (opExpr + thisExpr)[1, ...])
                else:
                    # no operator expression: operands are simply juxtaposed
                    match_lookahead = _FB(lastExpr + thisExpr)
                    matchExpr = Group(lastExpr + thisExpr[1, ...])
            elif arity == 3:
                match_lookahead = _FB(
                    lastExpr + opExpr1 + thisExpr + opExpr2 + thisExpr
                )
                matchExpr = Group(lastExpr + opExpr1 + thisExpr + opExpr2 + thisExpr)

        # suppress lookahead expr from railroad diagrams
        match_lookahead.show_in_diagram = False

        # TODO - determine why this statement can't be included in the following
        #  if pa block
        matchExpr = match_lookahead + matchExpr

        if pa:
            if isinstance(pa, (tuple, list)):
                matchExpr.set_parse_action(*pa)
            else:
                matchExpr.set_parse_action(pa)

        # fall back to the previous level when no operator of this level is present
        thisExpr <<= (matchExpr | lastExpr).set_name(term_name)
        lastExpr = thisExpr

    ret <<= lastExpr
    return ret
def indentedBlock(blockStatementExpr, indentStack, indent=True, backup_stacks=[]):
    """
    (DEPRECATED - use :class:`IndentedBlock` class instead)
    Helper method for defining space-delimited indentation blocks,
    such as those used to define block statements in Python source code.

    Parameters:

    - ``blockStatementExpr`` - expression defining syntax of statement that
      is repeated within the indented block
    - ``indentStack`` - list created by caller to manage indentation stack
      (multiple ``statementWithIndentedBlock`` expressions within a single
      grammar should share a common ``indentStack``)
    - ``indent`` - boolean indicating whether block must be indented beyond
      the current level; set to ``False`` for block of left-most statements
      (default= ``True``)
    - ``backup_stacks`` - list that receives a snapshot of ``indentStack``
      on every call; the most recent snapshot is restored into
      ``indentStack`` when the returned expression fails to match

    A valid block must contain at least one ``blockStatement``.

    (Note that indentedBlock uses internal parse actions which make it
    incompatible with packrat parsing.)

    Example::

        data = '''
        def A(z):
          A1
          B = 100
          G = A2
          A2
          A3
        B
        def BB(a,b,c):
          BB1
          def BBA():
            bba1
            bba2
            bba3
        C
        D
        def spam(x,y):
             def eggs(z):
                 pass
        '''

        indentStack = [1]
        stmt = Forward()

        identifier = Word(alphas, alphanums)
        funcDecl = ("def" + identifier + Group("(" + Opt(delimitedList(identifier)) + ")") + ":")
        func_body = indentedBlock(stmt, indentStack)
        funcDef = Group(funcDecl + func_body)

        rvalue = Forward()
        funcCall = Group(identifier + "(" + Opt(delimitedList(rvalue)) + ")")
        rvalue << (funcCall | identifier | Word(nums))
        assignment = Group(identifier + "=" + rvalue)
        stmt << (funcDef | assignment | identifier)

        module_body = stmt[1, ...]

        parseTree = module_body.parseString(data)
        parseTree.pprint()

    prints::

        [['def',
          'A',
          ['(', 'z', ')'],
          ':',
          [['A1'], [['B', '=', '100']], [['G', '=', 'A2']], ['A2'], ['A3']]],
         'B',
         ['def',
          'BB',
          ['(', 'a', 'b', 'c', ')'],
          ':',
          [['BB1'], [['def', 'BBA', ['(', ')'], ':', [['bba1'], ['bba2'], ['bba3']]]]]],
         'C',
         'D',
         ['def',
          'spam',
          ['(', 'x', 'y', ')'],
          ':',
          [[['def', 'eggs', ['(', 'z', ')'], ':', [['pass']]]]]]]
    """
    # NOTE(review): the default for backup_stacks is a single list object shared
    # by every call that omits the argument - presumably intentional so that all
    # blocks of a grammar share one backup list; confirm before changing.
    backup_stacks.append(indentStack[:])

    def reset_stack():
        # restore the caller's stack in place from the most recent snapshot
        # NOTE(review): indexes [-1] without checking for an empty list, while the
        # success parse action below pops entries - confirm this cannot run
        # after the last snapshot has been popped.
        indentStack[:] = backup_stacks[-1]

    def checkPeerIndent(s, l, t):
        # a statement at the same level must start in the column on top of the stack
        if l >= len(s):
            return
        curCol = col(l, s)
        if curCol != indentStack[-1]:
            if curCol > indentStack[-1]:
                raise ParseException(s, l, "illegal nesting")
            raise ParseException(s, l, "not a peer entry")

    def checkSubIndent(s, l, t):
        # entering a block: the column must be deeper than the current level,
        # and it becomes the new top of the stack
        curCol = col(l, s)
        if curCol > indentStack[-1]:
            indentStack.append(curCol)
        else:
            raise ParseException(s, l, "not a subentry")

    def checkUnindent(s, l, t):
        # leaving a block: the column must be one already on the stack; pop one
        # level if it is shallower than the current top
        if l >= len(s):
            return
        curCol = col(l, s)
        if not (indentStack and curCol in indentStack):
            raise ParseException(s, l, "not an unindent")
        if curCol < indentStack[-1]:
            indentStack.pop()

    # zero-width marker expressions; the parse actions above do the column checks
    NL = OneOrMore(LineEnd().set_whitespace_chars("\t ").suppress())
    INDENT = (Empty() + Empty().set_parse_action(checkSubIndent)).set_name("INDENT")
    PEER = Empty().set_parse_action(checkPeerIndent).set_name("")
    UNDENT = Empty().set_parse_action(checkUnindent).set_name("UNINDENT")
    if indent:
        smExpr = Group(
            Opt(NL)
            + INDENT
            + OneOrMore(PEER + Group(blockStatementExpr) + Opt(NL))
            + UNDENT
        )
    else:
        # left-most block: no INDENT is required and the UNDENT is optional
        smExpr = Group(
            Opt(NL)
            + OneOrMore(PEER + Group(blockStatementExpr) + Opt(NL))
            + Opt(UNDENT)
        )

    # add a parse action to remove backup_stack from list of backups
    smExpr.add_parse_action(
        lambda: backup_stacks.pop(-1) and None if backup_stacks else None
    )
    # on failure, put indentStack back to the most recent snapshot
    smExpr.set_fail_action(lambda a, b, c, d: reset_stack())
    # skip backslash + line end (line continuation) inside block statements
    blockStatementExpr.ignore(_bslash + LineEnd())
    return smExpr.set_name("indented block")
# it's easy to get these comment structures wrong - they're very common,
# so may as well make them available
c_style_comment = Regex(r"/\*(?:[^*]|\*(?!/))*\*\/").set_name("C style comment")
"Comment of the form ``/* ... */``"

# non-greedy match so that each comment ends at its own "-->", and [\s\S] so
# that a comment may span multiple lines
html_comment = Regex(r"<!--[\s\S]*?-->").set_name("HTML comment")
"Comment of the form ``<!-- ... -->``"

# everything up to (but not including) the end of the current line, with
# leading whitespace kept
rest_of_line = Regex(r".*").leave_whitespace().set_name("rest of line")
dbl_slash_comment = Regex(r"//(?:\\\n|[^\n])*").set_name("// comment")
"Comment of the form ``// ... (to end of line)``"

cpp_style_comment = Regex(
    r"(?:/\*(?:[^*]|\*(?!/))*\*\/)|(?://(?:\\\n|[^\n])*)"
).set_name("C++ style comment")
"Comment of either form :class:`c_style_comment` or :class:`dbl_slash_comment`"

java_style_comment = cpp_style_comment
"Same as :class:`cpp_style_comment`"

python_style_comment = Regex(r"#.*").set_name("Python style comment")
"Comment of the form ``# ... (to end of line)``"
# build list of built-in expressions, for future reference if a global default value
# gets updated
# This is a snapshot taken at this point in the module: it holds the value of
# every module-level name bound so far that is a ParserElement instance.
_builtin_exprs: list[ParserElement] = [
    v for v in vars().values() if isinstance(v, ParserElement)
]
# compatibility function, superseded by DelimitedList class
def delimited_list(
    expr: Union[str, ParserElement],
    delim: Union[str, ParserElement] = ",",
    combine: bool = False,
    min: typing.Optional[int] = None,
    max: typing.Optional[int] = None,
    *,
    allow_trailing_delim: bool = False,
) -> ParserElement:
    """(DEPRECATED - use :class:`DelimitedList` class)

    Forwards every argument unchanged to :class:`DelimitedList` and returns
    the resulting expression.
    """
    delimited = DelimitedList(
        expr,
        delim,
        combine,
        min,
        max,
        allow_trailing_delim=allow_trailing_delim,
    )
    return delimited
# Compatibility synonyms
# Pre-PEP8 (camelCase) names kept so that existing code continues to work.
# fmt: off
# plain aliases: each legacy name is bound to the same object as the PEP8 name
opAssoc = OpAssoc
anyOpenTag = any_open_tag
anyCloseTag = any_close_tag
commonHTMLEntity = common_html_entity
cStyleComment = c_style_comment
htmlComment = html_comment
restOfLine = rest_of_line
dblSlashComment = dbl_slash_comment
cppStyleComment = cpp_style_comment
javaStyleComment = java_style_comment
pythonStyleComment = python_style_comment
# legacy callables produced by replaced_by_pep8(legacy_name, replacement)
# (presumably a compatibility wrapper around the replacement - see its definition)
delimitedList = replaced_by_pep8("delimitedList", DelimitedList)
# NOTE: this rebinds the module-level name delimited_list, replacing the
# function of the same name defined earlier in this module
delimited_list = replaced_by_pep8("delimited_list", DelimitedList)
countedArray = replaced_by_pep8("countedArray", counted_array)
matchPreviousLiteral = replaced_by_pep8("matchPreviousLiteral", match_previous_literal)
matchPreviousExpr = replaced_by_pep8("matchPreviousExpr", match_previous_expr)
oneOf = replaced_by_pep8("oneOf", one_of)
dictOf = replaced_by_pep8("dictOf", dict_of)
originalTextFor = replaced_by_pep8("originalTextFor", original_text_for)
nestedExpr = replaced_by_pep8("nestedExpr", nested_expr)
makeHTMLTags = replaced_by_pep8("makeHTMLTags", make_html_tags)
makeXMLTags = replaced_by_pep8("makeXMLTags", make_xml_tags)
replaceHTMLEntity = replaced_by_pep8("replaceHTMLEntity", replace_html_entity)
infixNotation = replaced_by_pep8("infixNotation", infix_notation)
# fmt: on