HTML Tidy  5.9.15
The HTACG Tidy HTML Project
HTML and XML Lexing

Detailed Description

These functions and structures form the internal API for document lexing.

Data Structures

struct  AttVal
 Attribute/Value linked list node. More...
 
struct  IStack
 Mosaic handles inlines via a separate stack from other elements. We duplicate this to recover from inline markup errors such as: More...
 
struct  Lexer
 The following are private to the lexer. More...
 
struct  Node
 HTML/XHTML/XML Element, Comment, PI, DOCTYPE, XML Decl, etc., etc. More...
 
struct  TagStyle
 
struct  StyleProp
 
struct  Stack
 This typedef represents a stack of addresses to nodes. More...
 

Macros

#define CM_BLOCK   (1 << 3)
 HTML "block" elements. More...
 
#define CM_DEFLIST   (1 << 6)
 Elements that mark definition list item ("DL", "DT"). More...
 
#define CM_EMPTY   (1 << 0)
 Elements with no content. More...
 
#define CM_FIELD   (1 << 10)
 Elements whose content must be protected against white space movement. More...
 
#define CM_FRAMES   (1 << 13)
 "FRAME", "FRAMESET", "NOFRAMES". More...
 
#define CM_HEAD   (1 << 2)
 Elements that can appear within HEAD. More...
 
#define CM_HEADING   (1 << 14)
 Heading elements (h1, h2, ...). More...
 
#define CM_HTML   (1 << 1)
 Elements that appear outside of "BODY". More...
 
#define CM_IMG   (1 << 16)
 Elements that use "align" attribute for vertical position. More...
 
#define CM_INLINE   (1 << 4)
 HTML "inline" elements. More...
 
#define CM_LIST   (1 << 5)
 Elements that mark list item ("LI"). More...
 
#define CM_MIXED   (1 << 17)
 Elements with inline and block model. More...
 
#define CM_NEW   (1 << 20)
 User defined elements. More...
 
#define CM_NO_INDENT   (1 << 18)
 Elements whose content needs to be indented only if containing one CM_BLOCK element. More...
 
#define CM_OBJECT   (1 << 11)
 Used to avoid propagating inline emphasis inside some elements such as OBJECT or APPLET. More...
 
#define CM_OBSOLETE   (1 << 19)
 Elements that are obsolete (such as "dir", "menu"). More...
 
#define CM_OMITST   (1 << 21)
 Elements that cannot be omitted. More...
 
#define CM_OPT   (1 << 15)
 Elements with an optional end tag. More...
 
#define CM_PARAM   (1 << 12)
 Elements that allow "PARAM". More...
 
#define CM_ROW   (1 << 9)
 Used for "TD", "TH". More...
 
#define CM_ROWGRP   (1 << 8)
 Used for "THEAD", "TFOOT" or "TBODY". More...
 
#define CM_TABLE   (1 << 7)
 Elements that can appear inside TABLE. More...
 
#define CM_UNKNOWN   0
 Content model shortcut encoding. More...
 
#define CM_VOID   (1 << 22)
 Elements that are void per https://www.w3.org/TR/2011/WD-html-markup-20110113/syntax.html#syntax-elements. More...
 
#define digit   1u
 Lexer character types. More...
 
#define digithex   128u
 
#define H40F   16u
 
#define H40S   4u
 
#define H40T   8u
 
#define H41F   128u
 
#define H41S   32u
 
#define H41T   64u
 
#define HT20   1u
 
#define HT32   2u
 
#define HT50   131072u
 
#define letter   2u
 
#define lowercase   32u
 
#define namechar   4u
 
#define newline   16u
 
#define uppercase   64u
 
#define VERS_ALL   (VERS_HTML20|VERS_HTML32|VERS_FROM40|XH50|HT50)
 
#define VERS_BASIC   (XB10)
 
#define VERS_EVENTS   (VERS_HTML40|VERS_XHTML11)
 
#define VERS_FRAMESET   (H40F|H41F|X10F)
 
#define VERS_FROM32   (VERS_HTML32|VERS_HTML40|HT50)
 
#define VERS_FROM40   (VERS_HTML40|VERS_XHTML11|VERS_BASIC|VERS_HTML5)
 
#define VERS_HTML20   (HT20)
 
#define VERS_HTML32   (HT32)
 
#define VERS_HTML40   (VERS_HTML40_STRICT|VERS_HTML40_LOOSE|VERS_FRAMESET)
 
#define VERS_HTML40_LOOSE   (H40T|H41T|X10T)
 
#define VERS_HTML40_STRICT   (H40S|H41S|X10S)
 
#define VERS_HTML5   (HT50|XH50)
 
#define VERS_IFRAME   (VERS_HTML40_LOOSE|VERS_FRAMESET)
 
#define VERS_LOOSE   (VERS_HTML20|VERS_HTML32|VERS_IFRAME)
 
#define VERS_MICROSOFT   32768u
 
#define VERS_NETSCAPE   16384u
 
#define VERS_PROPRIETARY   (VERS_NETSCAPE|VERS_MICROSOFT|VERS_SUN)
 
#define VERS_STRICT   (VERS_HTML5|VERS_HTML40_STRICT)
 
#define VERS_SUN   8192u
 
#define VERS_UNKNOWN   (xxxx)
 
#define VERS_XHTML   (X10S|X10T|X10F|XH11|XB10|XH50)
 
#define VERS_XHTML11   (XH11)
 
#define VERS_XML   65536u
 
#define white   8u
 
#define X10F   1024u
 
#define X10S   256u
 
#define X10T   512u
 
#define XB10   4096u
 
#define XH11   2048u
 
#define XH50   262144u
 
#define xxxx   0u
 If the document uses just HTML 2.0 tags and attributes, describe it as HTML 2.0. More...
 

Enumerations

enum  GetTokenMode {
  IgnoreWhitespace ,
  MixedContent ,
  Preformatted ,
  IgnoreMarkup ,
  OtherNamespace ,
  CdataContent
}
 modes for GetToken() More...
 
enum  LexerState {
  LEX_CONTENT ,
  LEX_GT ,
  LEX_ENDTAG ,
  LEX_STARTTAG ,
  LEX_COMMENT ,
  LEX_DOCTYPE ,
  LEX_PROCINSTR ,
  LEX_CDATA ,
  LEX_SECTION ,
  LEX_ASP ,
  LEX_JSTE ,
  LEX_PHP ,
  LEX_XMLDECL
}
 Lexer GetToken() states. More...
 
enum  NodeType {
  RootNode ,
  DocTypeTag ,
  CommentTag ,
  ProcInsTag ,
  TextNode ,
  StartTag ,
  EndTag ,
  StartEndTag ,
  CDATATag ,
  SectionTag ,
  AspTag ,
  JsteTag ,
  PhpTag ,
  XmlDecl
}
 node->type is one of these values More...
 
enum  ParseDocTypeDeclState {
  DT_INTERMEDIATE ,
  DT_DOCTYPENAME ,
  DT_PUBLICSYSTEM ,
  DT_QUOTEDSTRING ,
  DT_INTSUBSET
}
 ParseDocTypeDecl state constants. More...
 

Lexer Functions

TY_PRIVATE int TY_❪HTMLVersion❫ (TidyDocImpl *doc)
 Choose what version to use for new doctype. More...
 
TY_PRIVATE void TY_❪ConstrainVersion❫ (TidyDocImpl *doc, uint vers)
 Everything is allowed in proprietary version of HTML. More...
 
TY_PRIVATE Bool TY_❪IsWhite❫ (uint c)
 
TY_PRIVATE Bool TY_❪IsDigit❫ (uint c)
 
TY_PRIVATE Bool TY_❪IsLetter❫ (uint c)
 
TY_PRIVATE Bool TY_❪IsHTMLSpace❫ (uint c)
 
TY_PRIVATE Bool TY_❪IsNewline❫ (uint c)
 
TY_PRIVATE Bool TY_❪IsNamechar❫ (uint c)
 
TY_PRIVATE Bool TY_❪IsXMLLetter❫ (uint c)
 
TY_PRIVATE Bool TY_❪IsXMLNamechar❫ (uint c)
 
TY_PRIVATE Bool TY_❪IsUpper❫ (uint c)
 
TY_PRIVATE uint TY_❪ToLower❫ (uint c)
 
TY_PRIVATE uint TY_❪ToUpper❫ (uint c)
 
TY_PRIVATE Lexer * TY_❪NewLexer❫ (TidyDocImpl *doc)
 
TY_PRIVATE void TY_❪FreeLexer❫ (TidyDocImpl *doc)
 
TY_PRIVATE void TY_❪AddCharToLexer❫ (Lexer *lexer, uint c)
 Store character c as UTF-8 encoded byte stream. More...
 
TY_PRIVATE Node * TY_❪NewNode❫ (TidyAllocator *allocator, Lexer *lexer)
 Used for elements and text nodes. More...
 
TY_PRIVATE Node * TY_❪CloneNode❫ (TidyDocImpl *doc, Node *element)
 Used to clone heading nodes when split by an <HR> More...
 
TY_PRIVATE void TY_❪FreeAttrs❫ (TidyDocImpl *doc, Node *node)
 Free node's attributes. More...
 
TY_PRIVATE void TY_❪FreeAttribute❫ (TidyDocImpl *doc, AttVal *av)
 Doesn't repair attribute list linkage. More...
 
TY_PRIVATE void TY_❪DetachAttribute❫ (Node *node, AttVal *attr)
 Detach attribute from node. More...
 
TY_PRIVATE void TY_❪RemoveAttribute❫ (TidyDocImpl *doc, Node *node, AttVal *attr)
 Detach attribute from node then free it. More...
 
TY_PRIVATE void TY_❪FreeNode❫ (TidyDocImpl *doc, Node *node)
 Free document nodes by iterating through peers and recursing through children. More...
 
TY_PRIVATE Node * TY_❪TextToken❫ (Lexer *lexer)
 
TY_PRIVATE Node * TY_❪NewLineNode❫ (Lexer *lexer)
 Used for creating preformatted text from Word2000. More...
 
TY_PRIVATE Node * TY_❪NewLiteralTextNode❫ (Lexer *lexer, ctmbstr txt)
 Used for adding a &nbsp; for Word2000. More...
 
TY_PRIVATE void TY_❪AddStringLiteral❫ (Lexer *lexer, ctmbstr str)
 
TY_PRIVATE Node * TY_❪FindDocType❫ (TidyDocImpl *doc)
 
TY_PRIVATE Node * TY_❪FindHTML❫ (TidyDocImpl *doc)
 
TY_PRIVATE Node * TY_❪FindHEAD❫ (TidyDocImpl *doc)
 
TY_PRIVATE Node * TY_❪FindTITLE❫ (TidyDocImpl *doc)
 
TY_PRIVATE Node * TY_❪FindBody❫ (TidyDocImpl *doc)
 
TY_PRIVATE Node * TY_❪FindXmlDecl❫ (TidyDocImpl *doc)
 
TY_PRIVATE Node * TY_❪FindContainer❫ (Node *node)
 Returns containing block element, if any. More...
 
TY_PRIVATE Bool TY_❪AddGenerator❫ (TidyDocImpl *doc)
 Add meta element for Tidy. More...
 
TY_PRIVATE uint TY_❪ApparentVersion❫ (TidyDocImpl *doc)
 
TY_PRIVATE ctmbstr TY_❪HTMLVersionNameFromCode❫ (uint vers, Bool isXhtml)
 
TY_PRIVATE uint TY_❪HTMLVersionNumberFromCode❫ (uint vers)
 
TY_PRIVATE Bool TY_❪WarnMissingSIInEmittedDocType❫ (TidyDocImpl *doc)
 
TY_PRIVATE Bool TY_❪SetXHTMLDocType❫ (TidyDocImpl *doc)
 
TY_PRIVATE Bool TY_❪FixDocType❫ (TidyDocImpl *doc)
 Fixup doctype if missing. More...
 
TY_PRIVATE Bool TY_❪FixXmlDecl❫ (TidyDocImpl *doc)
 Ensure XML document starts with <?xml version="1.0"?>, and add encoding attribute if not using ASCII or UTF-8 output. More...
 
TY_PRIVATE Node * TY_❪InferredTag❫ (TidyDocImpl *doc, TidyTagId id)
 
TY_PRIVATE void TY_❪UngetToken❫ (TidyDocImpl *doc)
 
TY_PRIVATE Node * TY_❪GetToken❫ (TidyDocImpl *doc, GetTokenMode mode)
 
TY_PRIVATE void TY_❪InitMap❫ (void)
 
TY_PRIVATE AttVal * TY_❪NewAttribute❫ (TidyDocImpl *doc)
 Create a new attribute. More...
 
TY_PRIVATE AttVal * TY_❪NewAttributeEx❫ (TidyDocImpl *doc, ctmbstr name, ctmbstr value, int delim)
 Create a new attribute with given name and value. More...
 
TY_PRIVATE void TY_❪InsertAttributeAtEnd❫ (Node *node, AttVal *av)
 Insert attribute at the end of attribute list of a node. More...
 
TY_PRIVATE void TY_❪InsertAttributeAtStart❫ (Node *node, AttVal *av)
 Insert attribute at the start of attribute list of a node. More...
 

Inline Stack Functions

TY_PRIVATE AttVal * TY_❪DupAttrs❫ (TidyDocImpl *doc, AttVal *attrs)
 Duplicate attributes. More...
 
TY_PRIVATE void TY_❪PushInline❫ (TidyDocImpl *doc, Node *node)
 Push a copy of an inline node onto stack, but don't push if implicit or OBJECT or APPLET (implicit tags are ones generated from the istack). More...
 
TY_PRIVATE void TY_❪PopInline❫ (TidyDocImpl *doc, Node *node)
 Pop inline stack. More...
 
TY_PRIVATE Bool TY_❪IsPushed❫ (TidyDocImpl *doc, Node *node)
 
TY_PRIVATE Bool TY_❪IsPushedLast❫ (TidyDocImpl *doc, Node *element, Node *node)
 
TY_PRIVATE int TY_❪InlineDup❫ (TidyDocImpl *doc, Node *node)
 This has the effect of inserting "missing" inline elements around the contents of blocklevel elements such as P, TD, TH, DIV, PRE etc. More...
 
TY_PRIVATE void TY_❪DeferDup❫ (TidyDocImpl *doc)
 Defer duplicates when entering a table or other element where the inlines shouldn't be duplicated. More...
 
TY_PRIVATE Node * TY_❪InsertedToken❫ (TidyDocImpl *doc)
 
TY_PRIVATE Bool TY_❪SwitchInline❫ (TidyDocImpl *doc, Node *element, Node *node)
 Stack manipulation for inline elements. More...
 
TY_PRIVATE Bool TY_❪InlineDup1❫ (TidyDocImpl *doc, Node *node, Node *element)
 

Generic stack of nodes.

TY_PRIVATE Stack * TY_❪newStack❫ (TidyDocImpl *doc, uint capacity)
 Create a new stack with a given starting capacity. More...
 
TY_PRIVATE void TY_❪growStack❫ (Stack *stack)
 Increase the stack size. More...
 
TY_PRIVATE Bool TY_❪stackFull❫ (Stack *stack)
 Stack is full when top is equal to the last index. More...
 
TY_PRIVATE Bool TY_❪stackEmpty❫ (Stack *stack)
 Stack is empty when top is equal to -1. More...
 
TY_PRIVATE void TY_❪push❫ (Stack *stack, Node *node)
 Push an item to the stack. More...
 
TY_PRIVATE Node * TY_❪pop❫ (Stack *stack)
 Pop an item from the stack. More...
 
TY_PRIVATE Node * TY_❪peek❫ (Stack *stack)
 Peek at the stack. More...
 
TY_PRIVATE void TY_❪freeStack❫ (Stack *stack)
 Frees the stack when done. More...
 

Data Structure Documentation

◆ _AttVal

struct _AttVal

Attribute/Value linked list node.

Data Fields
Node * asp
tmbstr attribute
int delim
const Attribute * dict
AttVal * next
Node * php
tmbstr value

◆ _IStack

struct _IStack

Mosaic handles inlines via a separate stack from other elements. We duplicate this to recover from inline markup errors such as:

<i>italic text
<p>more italic text</b> normal text

which for compatibility with Mosaic is mapped to:

<i>italic text</i>
<p><i>more italic text</i> normal text

Note that any inline end tag pops the effect of the current inline start tag, so that </b> pops <i> in the above example.

Data Fields
AttVal * attributes
tmbstr element name (NULL for text nodes)
IStack * next
const Dict * tag tag's dictionary definition

◆ _Lexer

struct _Lexer

The following are private to the lexer.

Use NewLexer() to create a lexer, and FreeLexer() to free it.

Data Fields
TidyAllocator * allocator allocator
Bool bad_doctype e.g. if html or PUBLIC is missing

uint columns at start of current token
uint doctype version as given by doctype (if any)
Bool excludeBlocks Netscape compatibility.
Bool exiled true if moved out of table
Node * inode for deferring text node
IStack * insert for inferring inline tags
Bool insertspace when space is moved after end tag
IStack * istack
uint istackbase start of frame
uint istacklength allocated
uint istacksize used
Bool isvoyager true if xmlns attribute on html element (i.e., "Voyager" was the W3C codename for XHTML).
Node * itoken last duplicate inline returned by GetToken()
tmbstr lexbuf MB character buffer.
uint lexlength allocated
uint lexsize used
uint lines lines seen
Node * parent remember parent node for CDATA elements
Bool pushed true after token has been pushed back
Node * root remember root node of the document
Bool seenEndBody true if a </body> tag has been encountered
Bool seenEndHtml true if a </html> tag has been encountered
LexerState state state of lexer's finite state machine
TagStyle * styles used for cleaning up presentation markup
Node * token last token returned by GetToken()
uint txtend end of current node
uint txtstart start of current node
uint versionEmitted version of doctype emitted
uint versions bit vector of HTML versions
Bool waswhite used to collapse contiguous white space

◆ _Node

struct _Node

HTML/XHTML/XML Element, Comment, PI, DOCTYPE, XML Decl, etc., etc.

Data Fields
AttVal * attributes
Bool closed true if closed by explicit end tag
uint column current column of document
Node * content
tmbstr element name (NULL for text nodes)
uint end end of span onto text array
Bool implicit true if inferred
Node * last
uint line current line of document
Bool linebreak true if followed by a line break
Node * next
Node * parent tree structure
Node * prev
uint start start of span onto text array
const Dict * tag tag's dictionary definition
NodeType type TextNode, StartTag, EndTag etc.
const Dict * was old tag when it was changed

◆ _Style

struct _Style
Data Fields
TagStyle * next
tmbstr properties
tmbstr tag
tmbstr tag_class

◆ _StyleProp

struct _StyleProp
Data Fields
tmbstr name
StyleProp * next
tmbstr value

◆ Stack

struct Stack

This typedef represents a stack of addresses to nodes.

Tidy uses these to try to limit recursion by pushing nodes to a stack when possible instead of recursing.

Data Fields
TidyAllocator * allocator Tidy's allocator, used at instantiation and expanding.
unsigned capacity Current capacity. Can be expanded.
Node ** firstNode A pointer to the first pointer to a Node in an array of node addresses.
int top Current top position.

Macro Definition Documentation

◆ CM_BLOCK

#define CM_BLOCK   (1 << 3)

HTML "block" elements.

◆ CM_DEFLIST

#define CM_DEFLIST   (1 << 6)

Elements that mark definition list item ("DL", "DT").

◆ CM_EMPTY

#define CM_EMPTY   (1 << 0)

Elements with no content.

Map to HTML specification.

◆ CM_FIELD

#define CM_FIELD   (1 << 10)

Elements whose content must be protected against white space movement.

Includes some elements that can be found in forms.

◆ CM_FRAMES

#define CM_FRAMES   (1 << 13)

"FRAME", "FRAMESET", "NOFRAMES".

Used in ParseFrameSet.

◆ CM_HEAD

#define CM_HEAD   (1 << 2)

Elements that can appear within HEAD.

◆ CM_HEADING

#define CM_HEADING   (1 << 14)

Heading elements (h1, h2, ...).

◆ CM_HTML

#define CM_HTML   (1 << 1)

Elements that appear outside of "BODY".

◆ CM_IMG

#define CM_IMG   (1 << 16)

Elements that use "align" attribute for vertical position.

◆ CM_INLINE

#define CM_INLINE   (1 << 4)

HTML "inline" elements.

◆ CM_LIST

#define CM_LIST   (1 << 5)

Elements that mark list item ("LI").

◆ CM_MIXED

#define CM_MIXED   (1 << 17)

Elements with inline and block model.

Used to avoid calling InlineDup.

◆ CM_NEW

#define CM_NEW   (1 << 20)

User defined elements.

Used to determine how attributes without value should be printed.

◆ CM_NO_INDENT

#define CM_NO_INDENT   (1 << 18)

Elements whose content needs to be indented only if containing one CM_BLOCK element.

◆ CM_OBJECT

#define CM_OBJECT   (1 << 11)

Used to avoid propagating inline emphasis inside some elements such as OBJECT or APPLET.

◆ CM_OBSOLETE

#define CM_OBSOLETE   (1 << 19)

Elements that are obsolete (such as "dir", "menu").

◆ CM_OMITST

#define CM_OMITST   (1 << 21)

Elements that cannot be omitted.

◆ CM_OPT

#define CM_OPT   (1 << 15)

Elements with an optional end tag.

◆ CM_PARAM

#define CM_PARAM   (1 << 12)

Elements that allow "PARAM".

◆ CM_ROW

#define CM_ROW   (1 << 9)

Used for "TD", "TH".

◆ CM_ROWGRP

#define CM_ROWGRP   (1 << 8)

Used for "THEAD", "TFOOT" or "TBODY".

◆ CM_TABLE

#define CM_TABLE   (1 << 7)

Elements that can appear inside TABLE.

◆ CM_UNKNOWN

#define CM_UNKNOWN   0

Content model shortcut encoding.

Descriptions are tentative.

◆ CM_VOID

#define CM_VOID   (1 << 22)

◆ digit

#define digit   1u

Lexer character types.

◆ digithex

#define digithex   128u

◆ H40F

#define H40F   16u

◆ H40S

#define H40S   4u

◆ H40T

#define H40T   8u

◆ H41F

#define H41F   128u

◆ H41S

#define H41S   32u

◆ H41T

#define H41T   64u

◆ HT20

#define HT20   1u

◆ HT32

#define HT32   2u

◆ HT50

#define HT50   131072u

◆ letter

#define letter   2u

◆ lowercase

#define lowercase   32u

◆ namechar

#define namechar   4u

◆ newline

#define newline   16u

◆ uppercase

#define uppercase   64u

◆ VERS_ALL

#define VERS_ALL   (VERS_HTML20|VERS_HTML32|VERS_FROM40|XH50|HT50)

◆ VERS_BASIC

#define VERS_BASIC   (XB10)

◆ VERS_EVENTS

#define VERS_EVENTS   (VERS_HTML40|VERS_XHTML11)

◆ VERS_FRAMESET

#define VERS_FRAMESET   (H40F|H41F|X10F)

◆ VERS_FROM32

#define VERS_FROM32   (VERS_HTML32|VERS_HTML40|HT50)

◆ VERS_FROM40

#define VERS_FROM40   (VERS_HTML40|VERS_XHTML11|VERS_BASIC|VERS_HTML5)

◆ VERS_HTML20

#define VERS_HTML20   (HT20)

◆ VERS_HTML32

#define VERS_HTML32   (HT32)

◆ VERS_HTML40

#define VERS_HTML40   (VERS_HTML40_STRICT|VERS_HTML40_LOOSE|VERS_FRAMESET)

◆ VERS_HTML40_LOOSE

#define VERS_HTML40_LOOSE   (H40T|H41T|X10T)

◆ VERS_HTML40_STRICT

#define VERS_HTML40_STRICT   (H40S|H41S|X10S)

◆ VERS_HTML5

#define VERS_HTML5   (HT50|XH50)

◆ VERS_IFRAME

#define VERS_IFRAME   (VERS_HTML40_LOOSE|VERS_FRAMESET)

◆ VERS_LOOSE

#define VERS_LOOSE   (VERS_HTML20|VERS_HTML32|VERS_IFRAME)

◆ VERS_MICROSOFT

#define VERS_MICROSOFT   32768u

◆ VERS_NETSCAPE

#define VERS_NETSCAPE   16384u

◆ VERS_PROPRIETARY

#define VERS_PROPRIETARY   (VERS_NETSCAPE|VERS_MICROSOFT|VERS_SUN)

◆ VERS_STRICT

#define VERS_STRICT   (VERS_HTML5|VERS_HTML40_STRICT)

◆ VERS_SUN

#define VERS_SUN   8192u

◆ VERS_UNKNOWN

#define VERS_UNKNOWN   (xxxx)

◆ VERS_XHTML

#define VERS_XHTML   (X10S|X10T|X10F|XH11|XB10|XH50)

◆ VERS_XHTML11

#define VERS_XHTML11   (XH11)

◆ VERS_XML

#define VERS_XML   65536u

◆ white

#define white   8u

◆ X10F

#define X10F   1024u

◆ X10S

#define X10S   256u

◆ X10T

#define X10T   512u

◆ XB10

#define XB10   4096u

◆ XH11

#define XH11   2048u

◆ XH50

#define XH50   262144u

◆ xxxx

#define xxxx   0u

If the document uses just HTML 2.0 tags and attributes, describe it as HTML 2.0.

Similarly for HTML 3.2 and the 3 flavors of HTML 4.0. If there are proprietary tags and attributes then describe it as HTML Proprietary. If it includes the xml-lang or xmlns attributes but is otherwise HTML 2.0, 3.2 or 4.0 then describe it as one of the flavors of Voyager (strict, loose or frameset).

Enumeration Type Documentation

◆ GetTokenMode

modes for GetToken()

MixedContent – for elements which don't accept PCDATA; Preformatted – white space preserved as is; IgnoreMarkup – for CDATA elements such as script, style.

Enumerator
IgnoreWhitespace 
MixedContent 
Preformatted 
IgnoreMarkup 
OtherNamespace 
CdataContent 

◆ LexerState

enum LexerState

Lexer GetToken() states.

Enumerator
LEX_CONTENT 
LEX_GT 
LEX_ENDTAG 
LEX_STARTTAG 
LEX_COMMENT 
LEX_DOCTYPE 
LEX_PROCINSTR 
LEX_CDATA 
LEX_SECTION 
LEX_ASP 
LEX_JSTE 
LEX_PHP 
LEX_XMLDECL 

◆ NodeType

enum NodeType

node->type is one of these values

Enumerator
RootNode 
DocTypeTag 
CommentTag 
ProcInsTag 
TextNode 
StartTag 
EndTag 
StartEndTag 
CDATATag 
SectionTag 
AspTag 
JsteTag 
PhpTag 
XmlDecl 

◆ ParseDocTypeDeclState

ParseDocTypeDecl state constants.

Enumerator
DT_INTERMEDIATE 
DT_DOCTYPENAME 
DT_PUBLICSYSTEM 
DT_QUOTEDSTRING 
DT_INTSUBSET 

Function Documentation

◆ TY_❪AddCharToLexer❫()

TY_PRIVATE void TY_❪AddCharToLexer❫ ( Lexer *  lexer,
uint  c 
)

Store character c as UTF-8 encoded byte stream.

◆ TY_❪AddGenerator❫()

TY_PRIVATE Bool TY_❪AddGenerator❫ ( TidyDocImpl *  doc)

Add meta element for Tidy.

◆ TY_❪AddStringLiteral❫()

TY_PRIVATE void TY_❪AddStringLiteral❫ ( Lexer *  lexer,
ctmbstr  str 
)

◆ TY_❪ApparentVersion❫()

TY_PRIVATE uint TY_❪ApparentVersion❫ ( TidyDocImpl *  doc)

◆ TY_❪CloneNode❫()

TY_PRIVATE Node* TY_❪CloneNode❫ ( TidyDocImpl *  doc,
Node *  element 
)

Used to clone heading nodes when split by an <HR>

◆ TY_❪ConstrainVersion❫()

TY_PRIVATE void TY_❪ConstrainVersion❫ ( TidyDocImpl *  doc,
uint  vers 
)

Everything is allowed in proprietary version of HTML.

This is handled here rather than in the tag/attr dicts

◆ TY_❪DeferDup❫()

TY_PRIVATE void TY_❪DeferDup❫ ( TidyDocImpl *  doc)

Defer duplicates when entering a table or other element where the inlines shouldn't be duplicated.

◆ TY_❪DetachAttribute❫()

TY_PRIVATE void TY_❪DetachAttribute❫ ( Node *  node,
AttVal *  attr 
)

Detach attribute from node.

◆ TY_❪DupAttrs❫()

TY_PRIVATE AttVal* TY_❪DupAttrs❫ ( TidyDocImpl *  doc,
AttVal *  attrs 
)

Duplicate attributes.

◆ TY_❪FindBody❫()

TY_PRIVATE Node* TY_❪FindBody❫ ( TidyDocImpl *  doc)

◆ TY_❪FindContainer❫()

TY_PRIVATE Node* TY_❪FindContainer❫ ( Node *  node)

Returns containing block element, if any.

◆ TY_❪FindDocType❫()

TY_PRIVATE Node* TY_❪FindDocType❫ ( TidyDocImpl *  doc)

◆ TY_❪FindHEAD❫()

TY_PRIVATE Node* TY_❪FindHEAD❫ ( TidyDocImpl *  doc)

◆ TY_❪FindHTML❫()

TY_PRIVATE Node* TY_❪FindHTML❫ ( TidyDocImpl *  doc)

◆ TY_❪FindTITLE❫()

TY_PRIVATE Node* TY_❪FindTITLE❫ ( TidyDocImpl *  doc)

◆ TY_❪FindXmlDecl❫()

TY_PRIVATE Node* TY_❪FindXmlDecl❫ ( TidyDocImpl *  doc)

◆ TY_❪FixDocType❫()

TY_PRIVATE Bool TY_❪FixDocType❫ ( TidyDocImpl *  doc)

Fixup doctype if missing.

◆ TY_❪FixXmlDecl❫()

TY_PRIVATE Bool TY_❪FixXmlDecl❫ ( TidyDocImpl *  doc)

Ensure XML document starts with <?xml version="1.0"?>, and add encoding attribute if not using ASCII or UTF-8 output.

◆ TY_❪FreeAttribute❫()

TY_PRIVATE void TY_❪FreeAttribute❫ ( TidyDocImpl *  doc,
AttVal *  av 
)

Doesn't repair attribute list linkage.

◆ TY_❪FreeAttrs❫()

TY_PRIVATE void TY_❪FreeAttrs❫ ( TidyDocImpl *  doc,
Node *  node 
)

Free node's attributes.

◆ TY_❪FreeLexer❫()

TY_PRIVATE void TY_❪FreeLexer❫ ( TidyDocImpl *  doc)

◆ TY_❪FreeNode❫()

TY_PRIVATE void TY_❪FreeNode❫ ( TidyDocImpl *  doc,
Node *  node 
)

Free document nodes by iterating through peers and recursing through children.

Set next to NULL before calling FreeNode() to avoid freeing peer nodes. Doesn't patch up prev/next links.

◆ TY_❪freeStack❫()

TY_PRIVATE void TY_❪freeStack❫ ( Stack *  stack)

Frees the stack when done.

◆ TY_❪GetToken❫()

TY_PRIVATE Node* TY_❪GetToken❫ ( TidyDocImpl *  doc,
GetTokenMode  mode 
)

◆ TY_❪growStack❫()

TY_PRIVATE void TY_❪growStack❫ ( Stack *  stack)

Increase the stack size.

This will be called automatically when the current stack is full. If memory allocation fails, then the allocator will panic the program automatically.

◆ TY_❪HTMLVersionNameFromCode❫()

TY_PRIVATE ctmbstr TY_❪HTMLVersionNameFromCode❫ ( uint  vers,
Bool  isXhtml 
)

◆ TY_❪HTMLVersionNumberFromCode❫()

TY_PRIVATE uint TY_❪HTMLVersionNumberFromCode❫ ( uint  vers)

◆ TY_❪HTMLVersion❫()

TY_PRIVATE int TY_❪HTMLVersion❫ ( TidyDocImpl *  doc)

Choose what version to use for new doctype.

◆ TY_❪InferredTag❫()

TY_PRIVATE Node* TY_❪InferredTag❫ ( TidyDocImpl *  doc,
TidyTagId  id 
)

◆ TY_❪InitMap❫()

TY_PRIVATE void TY_❪InitMap❫ ( void  )

◆ TY_❪InlineDup1❫()

TY_PRIVATE Bool TY_❪InlineDup1❫ ( TidyDocImpl *  doc,
Node *  node,
Node *  element 
)

◆ TY_❪InlineDup❫()

TY_PRIVATE int TY_❪InlineDup❫ ( TidyDocImpl *  doc,
Node *  node 
)

This has the effect of inserting "missing" inline elements around the contents of blocklevel elements such as P, TD, TH, DIV, PRE etc.

This procedure is called at the start of ParseBlock, when the inline stack is not empty, as will be the case in:

<i><h1>italic heading</h1></i>

which is then treated as equivalent to

<h1><i>italic heading</i></h1>

This is implemented by setting the lexer into a mode where it gets tokens from the inline stack rather than from the input stream.

◆ TY_❪InsertAttributeAtEnd❫()

TY_PRIVATE void TY_❪InsertAttributeAtEnd❫ ( Node *  node,
AttVal *  av 
)

Insert attribute at the end of attribute list of a node.

◆ TY_❪InsertAttributeAtStart❫()

TY_PRIVATE void TY_❪InsertAttributeAtStart❫ ( Node *  node,
AttVal *  av 
)

Insert attribute at the start of attribute list of a node.

◆ TY_❪InsertedToken❫()

TY_PRIVATE Node* TY_❪InsertedToken❫ ( TidyDocImpl *  doc)

◆ TY_❪IsDigit❫()

TY_PRIVATE Bool TY_❪IsDigit❫ ( uint  c)

◆ TY_❪IsHTMLSpace❫()

TY_PRIVATE Bool TY_❪IsHTMLSpace❫ ( uint  c)

◆ TY_❪IsLetter❫()

TY_PRIVATE Bool TY_❪IsLetter❫ ( uint  c)

◆ TY_❪IsNamechar❫()

TY_PRIVATE Bool TY_❪IsNamechar❫ ( uint  c)

◆ TY_❪IsNewline❫()

TY_PRIVATE Bool TY_❪IsNewline❫ ( uint  c)

◆ TY_❪IsPushedLast❫()

TY_PRIVATE Bool TY_❪IsPushedLast❫ ( TidyDocImpl *  doc,
Node *  element,
Node *  node 
)

◆ TY_❪IsPushed❫()

TY_PRIVATE Bool TY_❪IsPushed❫ ( TidyDocImpl *  doc,
Node *  node 
)

◆ TY_❪IsUpper❫()

TY_PRIVATE Bool TY_❪IsUpper❫ ( uint  c)

◆ TY_❪IsWhite❫()

TY_PRIVATE Bool TY_❪IsWhite❫ ( uint  c)

◆ TY_❪IsXMLLetter❫()

TY_PRIVATE Bool TY_❪IsXMLLetter❫ ( uint  c)

◆ TY_❪IsXMLNamechar❫()

TY_PRIVATE Bool TY_❪IsXMLNamechar❫ ( uint  c)

◆ TY_❪NewAttributeEx❫()

TY_PRIVATE AttVal* TY_❪NewAttributeEx❫ ( TidyDocImpl *  doc,
ctmbstr  name,
ctmbstr  value,
int  delim 
)

Create a new attribute with given name and value.

◆ TY_❪NewAttribute❫()

TY_PRIVATE AttVal* TY_❪NewAttribute❫ ( TidyDocImpl *  doc)

Create a new attribute.

◆ TY_❪NewLexer❫()

TY_PRIVATE Lexer* TY_❪NewLexer❫ ( TidyDocImpl *  doc)

◆ TY_❪NewLineNode❫()

TY_PRIVATE Node* TY_❪NewLineNode❫ ( Lexer *  lexer)

Used for creating preformatted text from Word2000.

◆ TY_❪NewLiteralTextNode❫()

TY_PRIVATE Node* TY_❪NewLiteralTextNode❫ ( Lexer *  lexer,
ctmbstr  txt 
)

Used for adding a &nbsp; for Word2000.

◆ TY_❪NewNode❫()

TY_PRIVATE Node* TY_❪NewNode❫ ( TidyAllocator *  allocator,
Lexer *  lexer 
)

Used for elements and text nodes.

  • Element name is NULL for text nodes.
  • start and end are offsets into lexbuf, which contains the textual content of all elements in the parse tree.
  • parent and content allow traversal of the parse tree in any direction.
  • attributes are represented as a linked list of AttVal nodes which hold the strings for attribute/value pairs.

◆ TY_❪newStack❫()

TY_PRIVATE Stack* TY_❪newStack❫ ( TidyDocImpl *  doc,
uint  capacity 
)

Create a new stack with a given starting capacity.

If memory allocation fails, then the allocator will panic the program automatically.

◆ TY_❪peek❫()

TY_PRIVATE Node* TY_❪peek❫ ( Stack *  stack)

Peek at the stack.

◆ TY_❪PopInline❫()

TY_PRIVATE void TY_❪PopInline❫ ( TidyDocImpl *  doc,
Node *  node 
)

Pop inline stack.

◆ TY_❪pop❫()

TY_PRIVATE Node* TY_❪pop❫ ( Stack *  stack)

Pop an item from the stack.

◆ TY_❪PushInline❫()

TY_PRIVATE void TY_❪PushInline❫ ( TidyDocImpl *  doc,
Node *  node 
)

Push a copy of an inline node onto stack, but don't push if implicit or OBJECT or APPLET (implicit tags are ones generated from the istack).

One issue arises with pushing inlines when the tag is already pushed. For instance:

<p><em>text
<p><em>more text

Shouldn't be mapped to

<p><em>text</em></p>
<p><em><em>more text</em></em>

◆ TY_❪push❫()

TY_PRIVATE void TY_❪push❫ ( Stack *  stack,
Node *  node 
)

Push an item to the stack.

◆ TY_❪RemoveAttribute❫()

TY_PRIVATE void TY_❪RemoveAttribute❫ ( TidyDocImpl *  doc,
Node *  node,
AttVal *  attr 
)

Detach attribute from node then free it.

◆ TY_❪SetXHTMLDocType❫()

TY_PRIVATE Bool TY_❪SetXHTMLDocType❫ ( TidyDocImpl *  doc)

◆ TY_❪stackEmpty❫()

TY_PRIVATE Bool TY_❪stackEmpty❫ ( Stack *  stack)

Stack is empty when top is equal to -1.

◆ TY_❪stackFull❫()

TY_PRIVATE Bool TY_❪stackFull❫ ( Stack *  stack)

Stack is full when top is equal to the last index.

◆ TY_❪SwitchInline❫()

TY_PRIVATE Bool TY_❪SwitchInline❫ ( TidyDocImpl *  doc,
Node *  element,
Node *  node 
)

Stack manipulation for inline elements.

◆ TY_❪TextToken❫()

TY_PRIVATE Node* TY_❪TextToken❫ ( Lexer *  lexer)

◆ TY_❪ToLower❫()

TY_PRIVATE uint TY_❪ToLower❫ ( uint  c)

◆ TY_❪ToUpper❫()

TY_PRIVATE uint TY_❪ToUpper❫ ( uint  c)

◆ TY_❪UngetToken❫()

TY_PRIVATE void TY_❪UngetToken❫ ( TidyDocImpl *  doc)

◆ TY_❪WarnMissingSIInEmittedDocType❫()

TY_PRIVATE Bool TY_❪WarnMissingSIInEmittedDocType❫ ( TidyDocImpl *  doc)