""" pygments.lexers.html ~~~~~~~~~~~~~~~~~~~~ Lexers for HTML, XML and related markup. :copyright: Copyright 2006-2022 by the Pygments team, see AUTHORS. :license: BSD, see LICENSE for details. """ import re from pygments.lexer import RegexLexer, ExtendedRegexLexer, include, bygroups, \ default, using from pygments.token import Text, Comment, Operator, Keyword, Name, String, \ Punctuation from pygments.util import looks_like_xml, html_doctype_matches from pygments.lexers.javascript import JavascriptLexer from pygments.lexers.jvm import ScalaLexer from pygments.lexers.css import CssLexer, _indentation, _starts_block from pygments.lexers.ruby import RubyLexer __all__ = ['HtmlLexer', 'DtdLexer', 'XmlLexer', 'XsltLexer', 'HamlLexer', 'ScamlLexer', 'PugLexer'] class HtmlLexer(RegexLexer): """ For HTML 4 and XHTML 1 markup. Nested JavaScript and CSS is highlighted by the appropriate lexer. """ name = 'HTML' url = 'https://html.spec.whatwg.org/' aliases = ['html'] filenames = ['*.html', '*.htm', '*.xhtml', '*.xslt'] mimetypes = ['text/html', 'application/xhtml+xml'] flags = re.IGNORECASE | re.DOTALL tokens = { 'root': [ ('[^<&]+', Text), (r'&\S*?;', Name.Entity), (r'\<\!\[CDATA\[.*?\]\]\>', Comment.Preproc), (r'', Comment.Multiline), (r'<\?.*?\?>', Comment.Preproc), (']*>', Comment.Preproc), (r'(<)(\s*)(script)(\s*)', bygroups(Punctuation, Text, Name.Tag, Text), ('script-content', 'tag')), (r'(<)(\s*)(style)(\s*)', bygroups(Punctuation, Text, Name.Tag, Text), ('style-content', 'tag')), # note: this allows tag names not used in HTML like , # this is to support yet-unknown template engines and the like (r'(<)(\s*)([\w:.-]+)', bygroups(Punctuation, Text, Name.Tag), 'tag'), (r'(<)(\s*)(/)(\s*)([\w:.-]+)(\s*)(>)', bygroups(Punctuation, Text, Punctuation, Text, Name.Tag, Text, Punctuation)), ], 'tag': [ (r'\s+', Text), (r'([\w:-]+\s*)(=)(\s*)', bygroups(Name.Attribute, Operator, Text), 'attr'), (r'[\w:-]+', Name.Attribute), (r'(/?)(\s*)(>)', bygroups(Punctuation, Text, Punctuation), '#pop'), ], 'script-content': [ (r'(<)(\s*)(/)(\s*)(script)(\s*)(>)', bygroups(Punctuation, Text, Punctuation, Text, Name.Tag, Text, Punctuation), '#pop'), (r'.+?(?=<\s*/\s*script\s*>)', using(JavascriptLexer)), # fallback cases for when there is no closing script tag # first look for newline and then go back into root state # if that fails just read the rest of the file # this is similar to the error handling logic in lexer.py (r'.+?\n', using(JavascriptLexer), '#pop'), (r'.+', using(JavascriptLexer), '#pop'), ], 'style-content': [ (r'(<)(\s*)(/)(\s*)(style)(\s*)(>)', bygroups(Punctuation, Text, Punctuation, Text, Name.Tag, Text, Punctuation),'#pop'), (r'.+?(?=<\s*/\s*style\s*>)', using(CssLexer)), # fallback cases for when there is no closing style tag # first look for newline and then go back into root state # if that fails just read the rest of the file # this is similar to the error handling logic in lexer.py (r'.+?\n', using(CssLexer), '#pop'), (r'.+', using(CssLexer), '#pop'), ], 'attr': [ ('".*?"', String, '#pop'), ("'.*?'", String, '#pop'), (r'[^\s>]+', String, '#pop'), ], } def analyse_text(text): if html_doctype_matches(text): return 0.5 class DtdLexer(RegexLexer): """ A lexer for DTDs (Document Type Definitions). .. versionadded:: 1.5 """ flags = re.MULTILINE | re.DOTALL name = 'DTD' aliases = ['dtd'] filenames = ['*.dtd'] mimetypes = ['application/xml-dtd'] tokens = { 'root': [ include('common'), (r'(\s]+)', bygroups(Keyword, Text, Name.Tag)), (r'PUBLIC|SYSTEM', Keyword.Constant), (r'[\[\]>]', Keyword), ], 'common': [ (r'\s+', Text), (r'(%|&)[^;]*;', Name.Entity), ('', Comment, '#pop'), ('-', Comment), ], 'element': [ include('common'), (r'EMPTY|ANY|#PCDATA', Keyword.Constant), (r'[^>\s|()?+*,]+', Name.Tag), (r'>', Keyword, '#pop'), ], 'attlist': [ include('common'), (r'CDATA|IDREFS|IDREF|ID|NMTOKENS|NMTOKEN|ENTITIES|ENTITY|NOTATION', Keyword.Constant), (r'#REQUIRED|#IMPLIED|#FIXED', Keyword.Constant), (r'xml:space|xml:lang', Keyword.Reserved), (r'[^>\s|()?+*,]+', Name.Attribute), (r'>', Keyword, '#pop'), ], 'entity': [ include('common'), (r'SYSTEM|PUBLIC|NDATA', Keyword.Constant), (r'[^>\s|()?+*,]+', Name.Entity), (r'>', Keyword, '#pop'), ], 'notation': [ include('common'), (r'SYSTEM|PUBLIC', Keyword.Constant), (r'[^>\s|()?+*,]+', Name.Attribute), (r'>', Keyword, '#pop'), ], } def analyse_text(text): if not looks_like_xml(text) and \ ('', Comment.Preproc), (r'', Comment.Multiline), (r'<\?.*?\?>', Comment.Preproc), (']*>', Comment.Preproc), (r'<\s*[\w:.-]+', Name.Tag, 'tag'), (r'<\s*/\s*[\w:.-]+\s*>', Name.Tag), ], 'tag': [ (r'\s+', Text), (r'[\w.:-]+\s*=', Name.Attribute, 'attr'), (r'/?\s*>', Name.Tag, '#pop'), ], 'attr': [ (r'\s+', Text), ('".*?"', String, '#pop'), ("'.*?'", String, '#pop'), (r'[^\s>]+', String, '#pop'), ], } def analyse_text(text): if looks_like_xml(text): return 0.45 # less than HTML class XsltLexer(XmlLexer): """ A lexer for XSLT. .. versionadded:: 0.10 """ name = 'XSLT' aliases = ['xslt'] filenames = ['*.xsl', '*.xslt', '*.xpl'] # xpl is XProc mimetypes = ['application/xsl+xml', 'application/xslt+xml'] EXTRA_KEYWORDS = { 'apply-imports', 'apply-templates', 'attribute', 'attribute-set', 'call-template', 'choose', 'comment', 'copy', 'copy-of', 'decimal-format', 'element', 'fallback', 'for-each', 'if', 'import', 'include', 'key', 'message', 'namespace-alias', 'number', 'otherwise', 'output', 'param', 'preserve-space', 'processing-instruction', 'sort', 'strip-space', 'stylesheet', 'template', 'text', 'transform', 'value-of', 'variable', 'when', 'with-param' } def get_tokens_unprocessed(self, text): for index, token, value in XmlLexer.get_tokens_unprocessed(self, text): m = re.match(']*)/?>?', value) if token is Name.Tag and m and m.group(1) in self.EXTRA_KEYWORDS: yield index, Keyword, value else: yield index, token, value def analyse_text(text): if looks_like_xml(text) and ']{1,2}(?=[ \t=])', Punctuation), include('eval-or-plain'), ], 'plain': [ (r'([^#\n]|#[^{\n]|(\\\\)*\\#\{)+', Text), (r'(#\{)(' + _dot + r'*?)(\})', bygroups(String.Interpol, using(RubyLexer), String.Interpol)), (r'\n', Text, 'root'), ], 'html-attributes': [ (r'\s+', Text), (r'[\w:-]+[ \t]*=', Name.Attribute, 'html-attribute-value'), (r'[\w:-]+', Name.Attribute), (r'\)', Text, '#pop'), ], 'html-attribute-value': [ (r'[ \t]+', Text), (r'\w+', Name.Variable, '#pop'), (r'@\w+', Name.Variable.Instance, '#pop'), (r'\$\w+', Name.Variable.Global, '#pop'), (r"'(\\\\|\\[^\\]|[^'\\\n])*'", String, '#pop'), (r'"(\\\\|\\[^\\]|[^"\\\n])*"', String, '#pop'), ], 'html-comment-block': [ (_dot + '+', Comment), (r'\n', Text, 'root'), ], 'haml-comment-block': [ (_dot + '+', Comment.Preproc), (r'\n', Text, 'root'), ], 'filter-block': [ (r'([^#\n]|#[^{\n]|(\\\\)*\\#\{)+', Name.Decorator), (r'(#\{)(' + _dot + r'*?)(\})', bygroups(String.Interpol, using(RubyLexer), String.Interpol)), (r'\n', Text, 'root'), ], } class ScamlLexer(ExtendedRegexLexer): """ For `Scaml markup `_. Scaml is Haml for Scala. .. versionadded:: 1.4 """ name = 'Scaml' aliases = ['scaml'] filenames = ['*.scaml'] mimetypes = ['text/x-scaml'] flags = re.IGNORECASE # Scaml does not yet support the " |\n" notation to # wrap long lines. Once it does, use the custom faux # dot instead. # _dot = r'(?: \|\n(?=.* \|)|.)' _dot = r'.' tokens = { 'root': [ (r'[ \t]*\n', Text), (r'[ \t]*', _indentation), ], 'css': [ (r'\.[\w:-]+', Name.Class, 'tag'), (r'\#[\w:-]+', Name.Function, 'tag'), ], 'eval-or-plain': [ (r'[&!]?==', Punctuation, 'plain'), (r'([&!]?[=~])(' + _dot + r'*\n)', bygroups(Punctuation, using(ScalaLexer)), 'root'), default('plain'), ], 'content': [ include('css'), (r'%[\w:-]+', Name.Tag, 'tag'), (r'!!!' + _dot + r'*\n', Name.Namespace, '#pop'), (r'(/)(\[' + _dot + r'*?\])(' + _dot + r'*\n)', bygroups(Comment, Comment.Special, Comment), '#pop'), (r'/' + _dot + r'*\n', _starts_block(Comment, 'html-comment-block'), '#pop'), (r'-#' + _dot + r'*\n', _starts_block(Comment.Preproc, 'scaml-comment-block'), '#pop'), (r'(-@\s*)(import)?(' + _dot + r'*\n)', bygroups(Punctuation, Keyword, using(ScalaLexer)), '#pop'), (r'(-)(' + _dot + r'*\n)', bygroups(Punctuation, using(ScalaLexer)), '#pop'), (r':' + _dot + r'*\n', _starts_block(Name.Decorator, 'filter-block'), '#pop'), include('eval-or-plain'), ], 'tag': [ include('css'), (r'\{(,\n|' + _dot + r')*?\}', using(ScalaLexer)), (r'\[' + _dot + r'*?\]', using(ScalaLexer)), (r'\(', Text, 'html-attributes'), (r'/[ \t]*\n', Punctuation, '#pop:2'), (r'[<>]{1,2}(?=[ \t=])', Punctuation), include('eval-or-plain'), ], 'plain': [ (r'([^#\n]|#[^{\n]|(\\\\)*\\#\{)+', Text), (r'(#\{)(' + _dot + r'*?)(\})', bygroups(String.Interpol, using(ScalaLexer), String.Interpol)), (r'\n', Text, 'root'), ], 'html-attributes': [ (r'\s+', Text), (r'[\w:-]+[ \t]*=', Name.Attribute, 'html-attribute-value'), (r'[\w:-]+', Name.Attribute), (r'\)', Text, '#pop'), ], 'html-attribute-value': [ (r'[ \t]+', Text), (r'\w+', Name.Variable, '#pop'), (r'@\w+', Name.Variable.Instance, '#pop'), (r'\$\w+', Name.Variable.Global, '#pop'), (r"'(\\\\|\\[^\\]|[^'\\\n])*'", String, '#pop'), (r'"(\\\\|\\[^\\]|[^"\\\n])*"', String, '#pop'), ], 'html-comment-block': [ (_dot + '+', Comment), (r'\n', Text, 'root'), ], 'scaml-comment-block': [ (_dot + '+', Comment.Preproc), (r'\n', Text, 'root'), ], 'filter-block': [ (r'([^#\n]|#[^{\n]|(\\\\)*\\#\{)+', Name.Decorator), (r'(#\{)(' + _dot + r'*?)(\})', bygroups(String.Interpol, using(ScalaLexer), String.Interpol)), (r'\n', Text, 'root'), ], } class PugLexer(ExtendedRegexLexer): """ For Pug markup. Pug is a variant of Scaml, see: http://scalate.fusesource.org/documentation/scaml-reference.html .. versionadded:: 1.4 """ name = 'Pug' aliases = ['pug', 'jade'] filenames = ['*.pug', '*.jade'] mimetypes = ['text/x-pug', 'text/x-jade'] flags = re.IGNORECASE _dot = r'.' tokens = { 'root': [ (r'[ \t]*\n', Text), (r'[ \t]*', _indentation), ], 'css': [ (r'\.[\w:-]+', Name.Class, 'tag'), (r'\#[\w:-]+', Name.Function, 'tag'), ], 'eval-or-plain': [ (r'[&!]?==', Punctuation, 'plain'), (r'([&!]?[=~])(' + _dot + r'*\n)', bygroups(Punctuation, using(ScalaLexer)), 'root'), default('plain'), ], 'content': [ include('css'), (r'!!!' + _dot + r'*\n', Name.Namespace, '#pop'), (r'(/)(\[' + _dot + r'*?\])(' + _dot + r'*\n)', bygroups(Comment, Comment.Special, Comment), '#pop'), (r'/' + _dot + r'*\n', _starts_block(Comment, 'html-comment-block'), '#pop'), (r'-#' + _dot + r'*\n', _starts_block(Comment.Preproc, 'scaml-comment-block'), '#pop'), (r'(-@\s*)(import)?(' + _dot + r'*\n)', bygroups(Punctuation, Keyword, using(ScalaLexer)), '#pop'), (r'(-)(' + _dot + r'*\n)', bygroups(Punctuation, using(ScalaLexer)), '#pop'), (r':' + _dot + r'*\n', _starts_block(Name.Decorator, 'filter-block'), '#pop'), (r'[\w:-]+', Name.Tag, 'tag'), (r'\|', Text, 'eval-or-plain'), ], 'tag': [ include('css'), (r'\{(,\n|' + _dot + r')*?\}', using(ScalaLexer)), (r'\[' + _dot + r'*?\]', using(ScalaLexer)), (r'\(', Text, 'html-attributes'), (r'/[ \t]*\n', Punctuation, '#pop:2'), (r'[<>]{1,2}(?=[ \t=])', Punctuation), include('eval-or-plain'), ], 'plain': [ (r'([^#\n]|#[^{\n]|(\\\\)*\\#\{)+', Text), (r'(#\{)(' + _dot + r'*?)(\})', bygroups(String.Interpol, using(ScalaLexer), String.Interpol)), (r'\n', Text, 'root'), ], 'html-attributes': [ (r'\s+', Text), (r'[\w:-]+[ \t]*=', Name.Attribute, 'html-attribute-value'), (r'[\w:-]+', Name.Attribute), (r'\)', Text, '#pop'), ], 'html-attribute-value': [ (r'[ \t]+', Text), (r'\w+', Name.Variable, '#pop'), (r'@\w+', Name.Variable.Instance, '#pop'), (r'\$\w+', Name.Variable.Global, '#pop'), (r"'(\\\\|\\[^\\]|[^'\\\n])*'", String, '#pop'), (r'"(\\\\|\\[^\\]|[^"\\\n])*"', String, '#pop'), ], 'html-comment-block': [ (_dot + '+', Comment), (r'\n', Text, 'root'), ], 'scaml-comment-block': [ (_dot + '+', Comment.Preproc), (r'\n', Text, 'root'), ], 'filter-block': [ (r'([^#\n]|#[^{\n]|(\\\\)*\\#\{)+', Name.Decorator), (r'(#\{)(' + _dot + r'*?)(\})', bygroups(String.Interpol, using(ScalaLexer), String.Interpol)), (r'\n', Text, 'root'), ], } JadeLexer = PugLexer # compat