HTML Tidy  5.9.15
The HTACG Tidy HTML Project
lexer.h File Reference

Detailed Description

Lexer for HTML and XML Parsers.

Given an input source, it returns a sequence of tokens.

GetToken(source) gets the next token; UngetToken(source) provides one level of undo.

The tags include an attribute list:

  • linked list of attribute/value nodes
  • each node has 2 NULL-terminated strings.
  • entities are replaced in attribute values

White space is compacted if not in preformatted mode. If not in preformatted mode, then leading white space is discarded and subsequent white space sequences are compacted to single space characters.

If XmlTags is no then Tag names are folded to upper case and attribute names to lower case.

Not yet done:

  • Doctype subset and marked sections
Author
HTACG, et al (consult git log)
All Rights Reserved.
See tidy.h for the complete license.
Date
Additional updates: consult git log

Go to the source code of this file.

Data Structures

struct  AttVal
 Attribute/Value linked list node. More...
 
struct  IStack
 Mosaic handles inlines via a separate stack from other elements. We duplicate this to recover from inline markup errors such as: More...
 
struct  Lexer
 The following are private to the lexer. More...
 
struct  Node
 HTML/XHTML/XML Element, Comment, PI, DOCTYPE, XML Decl, etc., etc. More...
 
struct  TagStyle
 
struct  StyleProp
 
struct  Stack
 This typedef represents a stack of addresses to nodes. More...
 

Macros

#define CM_BLOCK   (1 << 3)
 HTML "block" elements. More...
 
#define CM_DEFLIST   (1 << 6)
 Elements that mark definition list item ("DL", "DT"). More...
 
#define CM_EMPTY   (1 << 0)
 Elements with no content. More...
 
#define CM_FIELD   (1 << 10)
 Elements whose content must be protected against white space movement. More...
 
#define CM_FRAMES   (1 << 13)
 "FRAME", "FRAMESET", "NOFRAMES". More...
 
#define CM_HEAD   (1 << 2)
 Elements that can appear within HEAD. More...
 
#define CM_HEADING   (1 << 14)
 Heading elements (h1, h2, ...). More...
 
#define CM_HTML   (1 << 1)
 Elements that appear outside of "BODY". More...
 
#define CM_IMG   (1 << 16)
 Elements that use "align" attribute for vertical position. More...
 
#define CM_INLINE   (1 << 4)
 HTML "inline" elements. More...
 
#define CM_LIST   (1 << 5)
 Elements that mark list item ("LI"). More...
 
#define CM_MIXED   (1 << 17)
 Elements with inline and block model. More...
 
#define CM_NEW   (1 << 20)
 User defined elements. More...
 
#define CM_NO_INDENT   (1 << 18)
 Elements whose content needs to be indented only if containing one CM_BLOCK element. More...
 
#define CM_OBJECT   (1 << 11)
 Used to avoid propagating inline emphasis inside some elements such as OBJECT or APPLET. More...
 
#define CM_OBSOLETE   (1 << 19)
 Elements that are obsolete (such as "dir", "menu"). More...
 
#define CM_OMITST   (1 << 21)
 Elements that cannot be omitted. More...
 
#define CM_OPT   (1 << 15)
 Elements with an optional end tag. More...
 
#define CM_PARAM   (1 << 12)
 Elements that allow "PARAM". More...
 
#define CM_ROW   (1 << 9)
 Used for "TD", "TH". More...
 
#define CM_ROWGRP   (1 << 8)
 Used for "THEAD", "TFOOT" or "TBODY". More...
 
#define CM_TABLE   (1 << 7)
 Elements that can appear inside TABLE. More...
 
#define CM_UNKNOWN   0
 Content model shortcut encoding. More...
 
#define CM_VOID   (1 << 22)
 Elements that are void per https://www.w3.org/TR/2011/WD-html-markup-20110113/syntax.html#syntax-elements. More...
 
#define digit   1u
 Lexer character types. More...
 
#define digithex   128u
 
#define H40F   16u
 
#define H40S   4u
 
#define H40T   8u
 
#define H41F   128u
 
#define H41S   32u
 
#define H41T   64u
 
#define HT20   1u
 
#define HT32   2u
 
#define HT50   131072u
 
#define letter   2u
 
#define lowercase   32u
 
#define namechar   4u
 
#define newline   16u
 
#define uppercase   64u
 
#define VERS_ALL   (VERS_HTML20|VERS_HTML32|VERS_FROM40|XH50|HT50)
 
#define VERS_BASIC   (XB10)
 
#define VERS_EVENTS   (VERS_HTML40|VERS_XHTML11)
 
#define VERS_FRAMESET   (H40F|H41F|X10F)
 
#define VERS_FROM32   (VERS_HTML32|VERS_HTML40|HT50)
 
#define VERS_FROM40   (VERS_HTML40|VERS_XHTML11|VERS_BASIC|VERS_HTML5)
 
#define VERS_HTML20   (HT20)
 
#define VERS_HTML32   (HT32)
 
#define VERS_HTML40   (VERS_HTML40_STRICT|VERS_HTML40_LOOSE|VERS_FRAMESET)
 
#define VERS_HTML40_LOOSE   (H40T|H41T|X10T)
 
#define VERS_HTML40_STRICT   (H40S|H41S|X10S)
 
#define VERS_HTML5   (HT50|XH50)
 
#define VERS_IFRAME   (VERS_HTML40_LOOSE|VERS_FRAMESET)
 
#define VERS_LOOSE   (VERS_HTML20|VERS_HTML32|VERS_IFRAME)
 
#define VERS_MICROSOFT   32768u
 
#define VERS_NETSCAPE   16384u
 
#define VERS_PROPRIETARY   (VERS_NETSCAPE|VERS_MICROSOFT|VERS_SUN)
 
#define VERS_STRICT   (VERS_HTML5|VERS_HTML40_STRICT)
 
#define VERS_SUN   8192u
 
#define VERS_UNKNOWN   (xxxx)
 
#define VERS_XHTML   (X10S|X10T|X10F|XH11|XB10|XH50)
 
#define VERS_XHTML11   (XH11)
 
#define VERS_XML   65536u
 
#define white   8u
 
#define X10F   1024u
 
#define X10S   256u
 
#define X10T   512u
 
#define XB10   4096u
 
#define XH11   2048u
 
#define XH50   262144u
 
#define xxxx   0u
 If the document uses just HTML 2.0 tags and attributes, describe it as HTML 2.0. More...
 

Enumerations

enum  GetTokenMode {
  IgnoreWhitespace ,
  MixedContent ,
  Preformatted ,
  IgnoreMarkup ,
  OtherNamespace ,
  CdataContent
}
 modes for GetToken() More...
 
enum  LexerState {
  LEX_CONTENT ,
  LEX_GT ,
  LEX_ENDTAG ,
  LEX_STARTTAG ,
  LEX_COMMENT ,
  LEX_DOCTYPE ,
  LEX_PROCINSTR ,
  LEX_CDATA ,
  LEX_SECTION ,
  LEX_ASP ,
  LEX_JSTE ,
  LEX_PHP ,
  LEX_XMLDECL
}
 Lexer GetToken() states. More...
 
enum  NodeType {
  RootNode ,
  DocTypeTag ,
  CommentTag ,
  ProcInsTag ,
  TextNode ,
  StartTag ,
  EndTag ,
  StartEndTag ,
  CDATATag ,
  SectionTag ,
  AspTag ,
  JsteTag ,
  PhpTag ,
  XmlDecl
}
 node->type is one of these values More...
 
enum  ParseDocTypeDeclState {
  DT_INTERMEDIATE ,
  DT_DOCTYPENAME ,
  DT_PUBLICSYSTEM ,
  DT_QUOTEDSTRING ,
  DT_INTSUBSET
}
 ParseDocTypeDecl state constants. More...
 

Functions

Lexer Functions
TY_PRIVATE void TY_❪AddCharToLexer❫ (Lexer *lexer, uint c)
 Store character c as UTF-8 encoded byte stream. More...
 
TY_PRIVATE Bool TY_❪AddGenerator❫ (TidyDocImpl *doc)
 Add meta element for Tidy. More...
 
TY_PRIVATE void TY_❪AddStringLiteral❫ (Lexer *lexer, ctmbstr str)
 
TY_PRIVATE uint TY_❪ApparentVersion❫ (TidyDocImpl *doc)
 
TY_PRIVATE Node * TY_❪CloneNode❫ (TidyDocImpl *doc, Node *element)
 Used to clone heading nodes when split by an <HR> More...
 
TY_PRIVATE void TY_❪ConstrainVersion❫ (TidyDocImpl *doc, uint vers)
 Everything is allowed in proprietary version of HTML. More...
 
TY_PRIVATE void TY_❪DetachAttribute❫ (Node *node, AttVal *attr)
 Detach attribute from node. More...
 
TY_PRIVATE Node * TY_❪FindBody❫ (TidyDocImpl *doc)
 
TY_PRIVATE Node * TY_❪FindContainer❫ (Node *node)
 Returns containing block element, if any. More...
 
TY_PRIVATE Node * TY_❪FindDocType❫ (TidyDocImpl *doc)
 
TY_PRIVATE Node * TY_❪FindHEAD❫ (TidyDocImpl *doc)
 
TY_PRIVATE Node * TY_❪FindHTML❫ (TidyDocImpl *doc)
 
TY_PRIVATE Node * TY_❪FindTITLE❫ (TidyDocImpl *doc)
 
TY_PRIVATE Node * TY_❪FindXmlDecl❫ (TidyDocImpl *doc)
 
TY_PRIVATE Bool TY_❪FixDocType❫ (TidyDocImpl *doc)
 Fixup doctype if missing. More...
 
TY_PRIVATE Bool TY_❪FixXmlDecl❫ (TidyDocImpl *doc)
 Ensure XML document starts with <?xml version="1.0"?>, and add encoding attribute if not using ASCII or UTF-8 output. More...
 
TY_PRIVATE void TY_❪FreeAttribute❫ (TidyDocImpl *doc, AttVal *av)
 Doesn't repair attribute list linkage. More...
 
TY_PRIVATE void TY_❪FreeAttrs❫ (TidyDocImpl *doc, Node *node)
 Free node's attributes. More...
 
TY_PRIVATE void TY_❪FreeLexer❫ (TidyDocImpl *doc)
 
TY_PRIVATE void TY_❪FreeNode❫ (TidyDocImpl *doc, Node *node)
 Free document nodes by iterating through peers and recursing through children. More...
 
TY_PRIVATE Node * TY_❪GetToken❫ (TidyDocImpl *doc, GetTokenMode mode)
 
TY_PRIVATE ctmbstr TY_❪HTMLVersionNameFromCode❫ (uint vers, Bool isXhtml)
 
TY_PRIVATE uint TY_❪HTMLVersionNumberFromCode❫ (uint vers)
 
TY_PRIVATE int TY_❪HTMLVersion❫ (TidyDocImpl *doc)
 Choose what version to use for new doctype. More...
 
TY_PRIVATE Node * TY_❪InferredTag❫ (TidyDocImpl *doc, TidyTagId id)
 
TY_PRIVATE void TY_❪InitMap❫ (void)
 
TY_PRIVATE void TY_❪InsertAttributeAtEnd❫ (Node *node, AttVal *av)
 Insert attribute at the end of attribute list of a node. More...
 
TY_PRIVATE void TY_❪InsertAttributeAtStart❫ (Node *node, AttVal *av)
 Insert attribute at the start of attribute list of a node. More...
 
TY_PRIVATE Bool TY_❪IsDigit❫ (uint c)
 
TY_PRIVATE Bool TY_❪IsHTMLSpace❫ (uint c)
 
TY_PRIVATE Bool TY_❪IsLetter❫ (uint c)
 
TY_PRIVATE Bool TY_❪IsNamechar❫ (uint c)
 
TY_PRIVATE Bool TY_❪IsNewline❫ (uint c)
 
TY_PRIVATE Bool TY_❪IsUpper❫ (uint c)
 
TY_PRIVATE Bool TY_❪IsWhite❫ (uint c)
 
TY_PRIVATE Bool TY_❪IsXMLLetter❫ (uint c)
 
TY_PRIVATE Bool TY_❪IsXMLNamechar❫ (uint c)
 
TY_PRIVATE AttVal * TY_❪NewAttributeEx❫ (TidyDocImpl *doc, ctmbstr name, ctmbstr value, int delim)
 Create a new attribute with given name and value. More...
 
TY_PRIVATE AttVal * TY_❪NewAttribute❫ (TidyDocImpl *doc)
 Create a new attribute. More...
 
TY_PRIVATE Lexer * TY_❪NewLexer❫ (TidyDocImpl *doc)
 
TY_PRIVATE Node * TY_❪NewLineNode❫ (Lexer *lexer)
 Used for creating preformatted text from Word2000. More...
 
TY_PRIVATE Node * TY_❪NewLiteralTextNode❫ (Lexer *lexer, ctmbstr txt)
 Used for adding a &nbsp; for Word2000. More...
 
TY_PRIVATE Node * TY_❪NewNode❫ (TidyAllocator *allocator, Lexer *lexer)
 Used for elements and text nodes. More...
 
TY_PRIVATE void TY_❪RemoveAttribute❫ (TidyDocImpl *doc, Node *node, AttVal *attr)
 Detach attribute from node then free it. More...
 
TY_PRIVATE Bool TY_❪SetXHTMLDocType❫ (TidyDocImpl *doc)
 
TY_PRIVATE Node * TY_❪TextToken❫ (Lexer *lexer)
 
TY_PRIVATE uint TY_❪ToLower❫ (uint c)
 
TY_PRIVATE uint TY_❪ToUpper❫ (uint c)
 
TY_PRIVATE void TY_❪UngetToken❫ (TidyDocImpl *doc)
 
TY_PRIVATE Bool TY_❪WarnMissingSIInEmittedDocType❫ (TidyDocImpl *doc)
 
Inline Stack Functions
TY_PRIVATE void TY_❪DeferDup❫ (TidyDocImpl *doc)
 Defer duplicates when entering a table or other element where the inlines shouldn't be duplicated. More...
 
TY_PRIVATE AttVal * TY_❪DupAttrs❫ (TidyDocImpl *doc, AttVal *attrs)
 Duplicate attributes. More...
 
TY_PRIVATE Bool TY_❪InlineDup1❫ (TidyDocImpl *doc, Node *node, Node *element)
 
TY_PRIVATE int TY_❪InlineDup❫ (TidyDocImpl *doc, Node *node)
 This has the effect of inserting "missing" inline elements around the contents of block-level elements such as P, TD, TH, DIV, PRE etc. More...
 
TY_PRIVATE Node * TY_❪InsertedToken❫ (TidyDocImpl *doc)
 
TY_PRIVATE Bool TY_❪IsPushedLast❫ (TidyDocImpl *doc, Node *element, Node *node)
 
TY_PRIVATE Bool TY_❪IsPushed❫ (TidyDocImpl *doc, Node *node)
 
TY_PRIVATE void TY_❪PopInline❫ (TidyDocImpl *doc, Node *node)
 Pop inline stack. More...
 
TY_PRIVATE void TY_❪PushInline❫ (TidyDocImpl *doc, Node *node)
 Push a copy of an inline node onto stack, but don't push if implicit or OBJECT or APPLET (implicit tags are ones generated from the istack). More...
 
TY_PRIVATE Bool TY_❪SwitchInline❫ (TidyDocImpl *doc, Node *element, Node *node)
 Stack manipulation for inline elements. More...
 
Generic stack of nodes.
TY_PRIVATE void TY_❪freeStack❫ (Stack *stack)
 Frees the stack when done. More...
 
TY_PRIVATE void TY_❪growStack❫ (Stack *stack)
 Increase the stack size. More...
 
TY_PRIVATE Stack TY_❪newStack❫ (TidyDocImpl *doc, uint capacity)
 Create a new stack with a given starting capacity. More...
 
TY_PRIVATE Node * TY_❪peek❫ (Stack *stack)
 Peek at the stack. More...
 
TY_PRIVATE Node * TY_❪pop❫ (Stack *stack)
 Pop an item from the stack. More...
 
TY_PRIVATE void TY_❪push❫ (Stack *stack, Node *node)
 Push an item to the stack. More...
 
TY_PRIVATE Bool TY_❪stackEmpty❫ (Stack *stack)
 Stack is empty when top is equal to -1. More...
 
TY_PRIVATE Bool TY_❪stackFull❫ (Stack *stack)
 Stack is full when top is equal to the last index. More...