Detailed Description

These functions and structures form the internal API for document lexing.

Data Structures
struct	AttVal
	Attribute/Value linked list node. More...

struct	IStack
	Mosaic handles inlines via a separate stack from other elements. We duplicate this to recover from inline markup errors such as: More...

struct	Lexer
	The following are private to the lexer. More...

struct	Node
	HTML/XHTML/XML Element, Comment, PI, DOCTYPE, XML Decl, etc., etc. More...

struct	TagStyle

struct	StyleProp

struct	Stack
	This typedef represents a stack of addresses to nodes. More...

Macros
#define	CM_BLOCK (1 << 3)
	HTML "block" elements. More...

#define	CM_DEFLIST (1 << 6)
	Elements that mark definition list item ("DD", "DT"). More...

#define	CM_EMPTY (1 << 0)
	Elements with no content. More...

#define	CM_FIELD (1 << 10)
	Elements whose content must be protected against white space movement. More...

#define	CM_FRAMES (1 << 13)
	"FRAME", "FRAMESET", "NOFRAMES". More...

#define	CM_HEAD (1 << 2)
	Elements that can appear within HEAD. More...

#define	CM_HEADING (1 << 14)
	Heading elements (h1, h2, ...). More...

#define	CM_HTML (1 << 1)
	Elements that appear outside of "BODY". More...

#define	CM_IMG (1 << 16)
	Elements that use "align" attribute for vertical position. More...

#define	CM_INLINE (1 << 4)
	HTML "inline" elements. More...

#define	CM_LIST (1 << 5)
	Elements that mark list item ("LI"). More...

#define	CM_MIXED (1 << 17)
	Elements with inline and block model. More...

#define	CM_NEW (1 << 20)
	User defined elements. More...

#define	CM_NO_INDENT (1 << 18)
	Elements whose content needs to be indented only if containing one CM_BLOCK element. More...

#define	CM_OBJECT (1 << 11)
	Used to avoid propagating inline emphasis inside some elements such as OBJECT or APPLET. More...

#define	CM_OBSOLETE (1 << 19)
	Elements that are obsolete (such as "dir", "menu"). More...

#define	CM_OMITST (1 << 21)
	Elements that cannot be omitted. More...

#define	CM_OPT (1 << 15)
	Elements with an optional end tag. More...

#define	CM_PARAM (1 << 12)
	Elements that allow "PARAM". More...

#define	CM_ROW (1 << 9)
	Used for "TD", "TH". More...

#define	CM_ROWGRP (1 << 8)
	Used for "THEAD", "TFOOT" or "TBODY". More...

#define	CM_TABLE (1 << 7)
	Elements that can appear inside TABLE. More...

#define	CM_UNKNOWN 0
	Content model shortcut encoding. More...

#define	CM_VOID (1 << 22)
	Elements that are void per https://www.w3.org/TR/2011/WD-html-markup-20110113/syntax.html#syntax-elements. More...

#define	digit 1u
	Lexer character types. More...

#define	digithex 128u

#define	H40F 16u

#define	H40S 4u

#define	H40T 8u

#define	H41F 128u

#define	H41S 32u

#define	H41T 64u

#define	HT20 1u

#define	HT32 2u

#define	HT50 131072u

#define	letter 2u

#define	lowercase 32u

#define	namechar 4u

#define	newline 16u

#define	uppercase 64u

#define	VERS_ALL (VERS_HTML20|VERS_HTML32|VERS_FROM40|XH50|HT50)

#define	VERS_BASIC (XB10)

#define	VERS_EVENTS (VERS_HTML40|VERS_XHTML11)

#define	VERS_FRAMESET (H40F|H41F|X10F)

#define	VERS_FROM32 (VERS_HTML32|VERS_HTML40|HT50)

#define	VERS_FROM40 (VERS_HTML40|VERS_XHTML11|VERS_BASIC|VERS_HTML5)

#define	VERS_HTML20 (HT20)

#define	VERS_HTML32 (HT32)

#define	VERS_HTML40 (VERS_HTML40_STRICT|VERS_HTML40_LOOSE|VERS_FRAMESET)

#define	VERS_HTML40_LOOSE (H40T|H41T|X10T)

#define	VERS_HTML40_STRICT (H40S|H41S|X10S)

#define	VERS_HTML5 (HT50|XH50)

#define	VERS_IFRAME (VERS_HTML40_LOOSE|VERS_FRAMESET)

#define	VERS_LOOSE (VERS_HTML20|VERS_HTML32|VERS_IFRAME)

#define	VERS_MICROSOFT 32768u

#define	VERS_NETSCAPE 16384u

#define	VERS_PROPRIETARY (VERS_NETSCAPE|VERS_MICROSOFT|VERS_SUN)

#define	VERS_STRICT (VERS_HTML5|VERS_HTML40_STRICT)

#define	VERS_SUN 8192u

#define	VERS_UNKNOWN (xxxx)

#define	VERS_XHTML (X10S|X10T|X10F|XH11|XB10|XH50)

#define	VERS_XHTML11 (XH11)

#define	VERS_XML 65536u

#define	white 8u

#define	X10F 1024u

#define	X10S 256u

#define	X10T 512u

#define	XB10 4096u

#define	XH11 2048u

#define	XH50 262144u

#define	xxxx 0u
	If the document uses just HTML 2.0 tags and attributes described it is HTML 2.0. More...

Enumerations
enum	GetTokenMode { IgnoreWhitespace , MixedContent , Preformatted , IgnoreMarkup , OtherNamespace , CdataContent }
	modes for GetToken() More...

enum	LexerState { LEX_CONTENT , LEX_GT , LEX_ENDTAG , LEX_STARTTAG , LEX_COMMENT , LEX_DOCTYPE , LEX_PROCINSTR , LEX_CDATA , LEX_SECTION , LEX_ASP , LEX_JSTE , LEX_PHP , LEX_XMLDECL }
	Lexer GetToken() states. More...

enum	NodeType { RootNode , DocTypeTag , CommentTag , ProcInsTag , TextNode , StartTag , EndTag , StartEndTag , CDATATag , SectionTag , AspTag , JsteTag , PhpTag , XmlDecl }
	node->type is one of these values More...

enum	ParseDocTypeDeclState { DT_INTERMEDIATE , DT_DOCTYPENAME , DT_PUBLICSYSTEM , DT_QUOTEDSTRING , DT_INTSUBSET }
	ParseDocTypeDecl state constants. More...

Lexer Functions
TY_PRIVATE int	TY_❪HTMLVersion❫ (TidyDocImpl *doc)
	Choose what version to use for new doctype. More...

TY_PRIVATE void	TY_❪ConstrainVersion❫ (TidyDocImpl *doc, uint vers)
	Everything is allowed in proprietary version of HTML. More...

TY_PRIVATE Bool	TY_❪IsWhite❫ (uint c)

TY_PRIVATE Bool	TY_❪IsDigit❫ (uint c)

TY_PRIVATE Bool	TY_❪IsLetter❫ (uint c)

TY_PRIVATE Bool	TY_❪IsHTMLSpace❫ (uint c)

TY_PRIVATE Bool	TY_❪IsNewline❫ (uint c)

TY_PRIVATE Bool	TY_❪IsNamechar❫ (uint c)

TY_PRIVATE Bool	TY_❪IsXMLLetter❫ (uint c)

TY_PRIVATE Bool	TY_❪IsXMLNamechar❫ (uint c)

TY_PRIVATE Bool	TY_❪IsUpper❫ (uint c)

TY_PRIVATE uint	TY_❪ToLower❫ (uint c)

TY_PRIVATE uint	TY_❪ToUpper❫ (uint c)

TY_PRIVATE Lexer *	TY_❪NewLexer❫ (TidyDocImpl *doc)

TY_PRIVATE void	TY_❪FreeLexer❫ (TidyDocImpl *doc)

TY_PRIVATE void	TY_❪AddCharToLexer❫ (Lexer *lexer, uint c)
	Store character c as UTF-8 encoded byte stream. More...

TY_PRIVATE Node *	TY_❪NewNode❫ (TidyAllocator *allocator, Lexer *lexer)
	Used for elements and text nodes. More...

TY_PRIVATE Node *	TY_❪CloneNode❫ (TidyDocImpl *doc, Node *element)
	Used to clone heading nodes when split by an `<HR>` More...

TY_PRIVATE void	TY_❪FreeAttrs❫ (TidyDocImpl *doc, Node *node)
	Free node's attributes. More...

TY_PRIVATE void	TY_❪FreeAttribute❫ (TidyDocImpl *doc, AttVal *av)
	Doesn't repair attribute list linkage. More...

TY_PRIVATE void	TY_❪DetachAttribute❫ (Node *node, AttVal *attr)
	Detach attribute from node. More...

TY_PRIVATE void	TY_❪RemoveAttribute❫ (TidyDocImpl *doc, Node *node, AttVal *attr)
	Detach attribute from node then free it. More...

TY_PRIVATE void	TY_❪FreeNode❫ (TidyDocImpl *doc, Node *node)
	Free document nodes by iterating through peers and recursing through children. More...

TY_PRIVATE Node *	TY_❪TextToken❫ (Lexer *lexer)

TY_PRIVATE Node *	TY_❪NewLineNode❫ (Lexer *lexer)
	Used for creating preformatted text from Word2000. More...

TY_PRIVATE Node *	TY_❪NewLiteralTextNode❫ (Lexer *lexer, ctmbstr txt)
	Used for adding a &nbsp; for Word2000. More...

TY_PRIVATE void	TY_❪AddStringLiteral❫ (Lexer *lexer, ctmbstr str)

TY_PRIVATE Node *	TY_❪FindDocType❫ (TidyDocImpl *doc)

TY_PRIVATE Node *	TY_❪FindHTML❫ (TidyDocImpl *doc)

TY_PRIVATE Node *	TY_❪FindHEAD❫ (TidyDocImpl *doc)

TY_PRIVATE Node *	TY_❪FindTITLE❫ (TidyDocImpl *doc)

TY_PRIVATE Node *	TY_❪FindBody❫ (TidyDocImpl *doc)

TY_PRIVATE Node *	TY_❪FindXmlDecl❫ (TidyDocImpl *doc)

TY_PRIVATE Node *	TY_❪FindContainer❫ (Node *node)
	Returns containing block element, if any. More...

TY_PRIVATE Bool	TY_❪AddGenerator❫ (TidyDocImpl *doc)
	Add meta element for Tidy. More...

TY_PRIVATE uint	TY_❪ApparentVersion❫ (TidyDocImpl *doc)

TY_PRIVATE ctmbstr	TY_❪HTMLVersionNameFromCode❫ (uint vers, Bool isXhtml)

TY_PRIVATE uint	TY_❪HTMLVersionNumberFromCode❫ (uint vers)

TY_PRIVATE Bool	TY_❪WarnMissingSIInEmittedDocType❫ (TidyDocImpl *doc)

TY_PRIVATE Bool	TY_❪SetXHTMLDocType❫ (TidyDocImpl *doc)

TY_PRIVATE Bool	TY_❪FixDocType❫ (TidyDocImpl *doc)
	Fixup doctype if missing. More...

TY_PRIVATE Bool	TY_❪FixXmlDecl❫ (TidyDocImpl *doc)
	Ensure XML document starts with <?xml version="1.0"?>, and add encoding attribute if not using ASCII or UTF-8 output. More...

TY_PRIVATE Node *	TY_❪InferredTag❫ (TidyDocImpl *doc, TidyTagId id)

TY_PRIVATE void	TY_❪UngetToken❫ (TidyDocImpl *doc)

TY_PRIVATE Node *	TY_❪GetToken❫ (TidyDocImpl *doc, GetTokenMode mode)

TY_PRIVATE void	TY_❪InitMap❫ (void)

TY_PRIVATE AttVal *	TY_❪NewAttribute❫ (TidyDocImpl *doc)
	Create a new attribute. More...

TY_PRIVATE AttVal *	TY_❪NewAttributeEx❫ (TidyDocImpl *doc, ctmbstr name, ctmbstr value, int delim)
	Create a new attribute with given name and value. More...

TY_PRIVATE void	TY_❪InsertAttributeAtEnd❫ (Node *node, AttVal *av)
	Insert attribute at the end of attribute list of a node. More...

TY_PRIVATE void	TY_❪InsertAttributeAtStart❫ (Node *node, AttVal *av)
	Insert attribute at the start of attribute list of a node. More...

Inline Stack Functions
TY_PRIVATE AttVal *	TY_❪DupAttrs❫ (TidyDocImpl *doc, AttVal *attrs)
	Duplicate attributes. More...

TY_PRIVATE void	TY_❪PushInline❫ (TidyDocImpl *doc, Node *node)
	Push a copy of an inline node onto stack, but don't push if implicit or OBJECT or APPLET (implicit tags are ones generated from the istack). More...

TY_PRIVATE void	TY_❪PopInline❫ (TidyDocImpl *doc, Node *node)
	Pop inline stack. More...

TY_PRIVATE Bool	TY_❪IsPushed❫ (TidyDocImpl *doc, Node *node)

TY_PRIVATE Bool	TY_❪IsPushedLast❫ (TidyDocImpl *doc, Node *element, Node *node)

TY_PRIVATE int	TY_❪InlineDup❫ (TidyDocImpl *doc, Node *node)
	This has the effect of inserting "missing" inline elements around the contents of blocklevel elements such as P, TD, TH, DIV, PRE etc. More...

TY_PRIVATE void	TY_❪DeferDup❫ (TidyDocImpl *doc)
	Defer duplicates when entering a table or other element where the inlines shouldn't be duplicated. More...

TY_PRIVATE Node *	TY_❪InsertedToken❫ (TidyDocImpl *doc)

TY_PRIVATE Bool	TY_❪SwitchInline❫ (TidyDocImpl *doc, Node *element, Node *node)
	Stack manipulation for inline elements. More...

TY_PRIVATE Bool	TY_❪InlineDup1❫ (TidyDocImpl *doc, Node *node, Node *element)

Generic stack of nodes.
TY_PRIVATE Stack *	TY_❪newStack❫ (TidyDocImpl *doc, uint capacity)
	Create a new stack with a given starting capacity. More...

TY_PRIVATE void	TY_❪growStack❫ (Stack *stack)
	Increase the stack size. More...

TY_PRIVATE Bool	TY_❪stackFull❫ (Stack *stack)
	Stack is full when top is equal to the last index. More...

TY_PRIVATE Bool	TY_❪stackEmpty❫ (Stack *stack)
	Stack is empty when top is equal to -1. More...

TY_PRIVATE void	TY_❪push❫ (Stack *stack, Node *node)
	Push an item to the stack. More...

TY_PRIVATE Node *	TY_❪pop❫ (Stack *stack)
	Pop an item from the stack. More...

TY_PRIVATE Node *	TY_❪peek❫ (Stack *stack)
	Peek at the stack. More...

TY_PRIVATE void	TY_❪freeStack❫ (Stack *stack)
	Frees the stack when done. More...

Data Structure Documentation

◆ _AttVal

struct _AttVal

Attribute/Value linked list node.

Data Fields
Node *	asp
tmbstr	attribute
int	delim
const Attribute *	dict
AttVal *	next
Node *	php
tmbstr	value

◆ _IStack

struct _IStack

Mosaic handles inlines via a separate stack from other elements. We duplicate this to recover from inline markup errors such as:

<i>italic text

<p>more italic text</b> normal text

which for compatibility with Mosaic is mapped to:

<i>italic text</i>

<p><i>more italic text</i> normal text

Note that any inline end tag pops the effect of the current inline start tag, so that </b> pops <i> in the above example.

Data Fields
AttVal *	attributes
tmbstr	element	name (NULL for text nodes)
IStack *	next
const Dict *	tag	tag's dictionary definition

◆ _Lexer

struct _Lexer

The following are private to the lexer.

Use NewLexer() to create a lexer, and FreeLexer() to free it.

Data Fields
TidyAllocator *	allocator	allocator
Bool	bad_doctype	e.g. if html or PUBLIC is missing
uint	columns	at start of current token
uint	doctype	version as given by doctype (if any)
Bool	excludeBlocks	Netscape compatibility.
Bool	exiled	true if moved out of table
Node *	inode	for deferring text node
IStack *	insert	for inferring inline tags
Bool	insertspace	when space is moved after end tag
IStack *	istack
uint	istackbase	start of frame
uint	istacklength	allocated
uint	istacksize	used
Bool	isvoyager	true if xmlns attribute on html element (i.e., "Voyager" was the W3C codename for XHTML).
Node *	itoken	last duplicate inline returned by GetToken()
tmbstr	lexbuf	MB character buffer.
uint	lexlength	allocated
uint	lexsize	used
uint	lines	lines seen
Node *	parent	remember parent node for CDATA elements
Bool	pushed	true after token has been pushed back
Node *	root	remember root node of the document
Bool	seenEndBody	true if a `</body>` tag has been encountered
Bool	seenEndHtml	true if a `</html>` tag has been encountered
LexerState	state	state of lexer's finite state machine
TagStyle *	styles	used for cleaning up presentation markup
Node *	token	last token returned by GetToken()
uint	txtend	end of current node
uint	txtstart	start of current node
uint	versionEmitted	version of doctype emitted
uint	versions	bit vector of HTML versions
Bool	waswhite	used to collapse contiguous white space

◆ _Node

struct _Node

HTML/XHTML/XML Element, Comment, PI, DOCTYPE, XML Decl, etc., etc.

Data Fields
AttVal *	attributes
Bool	closed	true if closed by explicit end tag
uint	column	current column of document
Node *	content
tmbstr	element	name (NULL for text nodes)
uint	end	end of span onto text array
Bool	implicit	true if inferred
Node *	last
uint	line	current line of document
Bool	linebreak	true if followed by a line break
Node *	next
Node *	parent	tree structure
Node *	prev
uint	start	start of span onto text array
const Dict *	tag	tag's dictionary definition
NodeType	type	TextNode, StartTag, EndTag etc.
const Dict *	was	old tag when it was changed

◆ _Style

struct _Style

Data Fields
TagStyle *	next
tmbstr	properties
tmbstr	tag
tmbstr	tag_class

◆ _StyleProp

struct _StyleProp

Data Fields
tmbstr	name
StyleProp *	next
tmbstr	value

◆ Stack

struct Stack

This typedef represents a stack of addresses to nodes.

Tidy uses these to try to limit recursion by pushing nodes to a stack when possible instead of recursing.

Data Fields
TidyAllocator *	allocator	Tidy's allocator, used at instantiation and expanding.
unsigned	capacity	Current capacity. Can be expanded.
Node **	firstNode	A pointer to the first pointer to a Node in an array of node addresses.
int	top	Current top position.

Macro Definition Documentation

◆ CM_BLOCK

#define CM_BLOCK (1 << 3)

HTML "block" elements.

◆ CM_DEFLIST

#define CM_DEFLIST (1 << 6)

Elements that mark definition list item ("DD", "DT").

◆ CM_EMPTY

#define CM_EMPTY (1 << 0)

Elements with no content.

Map to HTML specification.

◆ CM_FIELD

#define CM_FIELD (1 << 10)

Elements whose content must be protected against white space movement.

Includes some elements that can be found in forms.

◆ CM_FRAMES

#define CM_FRAMES (1 << 13)

"FRAME", "FRAMESET", "NOFRAMES".

Used in ParseFrameSet.

◆ CM_HEAD

#define CM_HEAD (1 << 2)

Elements that can appear within HEAD.

◆ CM_HEADING

#define CM_HEADING (1 << 14)

Heading elements (h1, h2, ...).

◆ CM_HTML

#define CM_HTML (1 << 1)

Elements that appear outside of "BODY".

◆ CM_IMG

#define CM_IMG (1 << 16)

Elements that use "align" attribute for vertical position.

◆ CM_INLINE

#define CM_INLINE (1 << 4)

HTML "inline" elements.

◆ CM_LIST

#define CM_LIST (1 << 5)

Elements that mark list item ("LI").

◆ CM_MIXED

#define CM_MIXED (1 << 17)

Elements with inline and block model.

Used to avoid calling InlineDup.

◆ CM_NEW

#define CM_NEW (1 << 20)

User defined elements.

Used to determine how attributes without value should be printed.

◆ CM_NO_INDENT

#define CM_NO_INDENT (1 << 18)

Elements whose content needs to be indented only if containing one CM_BLOCK element.

◆ CM_OBJECT

#define CM_OBJECT (1 << 11)

Used to avoid propagating inline emphasis inside some elements such as OBJECT or APPLET.

◆ CM_OBSOLETE

#define CM_OBSOLETE (1 << 19)

Elements that are obsolete (such as "dir", "menu").

◆ CM_OMITST

#define CM_OMITST (1 << 21)

Elements that cannot be omitted.

◆ CM_OPT

#define CM_OPT (1 << 15)

Elements with an optional end tag.

◆ CM_PARAM

#define CM_PARAM (1 << 12)

Elements that allow "PARAM".

◆ CM_ROW

#define CM_ROW (1 << 9)

Used for "TD", "TH".

◆ CM_ROWGRP

#define CM_ROWGRP (1 << 8)

Used for "THEAD", "TFOOT" or "TBODY".

◆ CM_TABLE

#define CM_TABLE (1 << 7)

Elements that can appear inside TABLE.

◆ CM_UNKNOWN

#define CM_UNKNOWN 0

Content model shortcut encoding.

Descriptions are tentative.

◆ CM_VOID

#define CM_VOID (1 << 22)

Elements that are void per https://www.w3.org/TR/2011/WD-html-markup-20110113/syntax.html#syntax-elements.

◆ digit

#define digit 1u

Lexer character types.

◆ digithex

#define digithex 128u

◆ H40F

#define H40F 16u

◆ H40S

#define H40S 4u

◆ H40T

#define H40T 8u

◆ H41F

#define H41F 128u

◆ H41S

#define H41S 32u

◆ H41T

#define H41T 64u

◆ HT20

#define HT20 1u

◆ HT32

#define HT32 2u

◆ HT50

#define HT50 131072u

◆ letter

#define letter 2u

◆ lowercase

#define lowercase 32u

◆ namechar

#define namechar 4u

◆ newline

#define newline 16u

◆ uppercase

#define uppercase 64u

◆ VERS_ALL

#define VERS_ALL (VERS_HTML20|VERS_HTML32|VERS_FROM40|XH50|HT50)

◆ VERS_BASIC

#define VERS_BASIC (XB10)

◆ VERS_EVENTS

#define VERS_EVENTS (VERS_HTML40|VERS_XHTML11)

◆ VERS_FRAMESET

#define VERS_FRAMESET (H40F|H41F|X10F)

◆ VERS_FROM32

#define VERS_FROM32 (VERS_HTML32|VERS_HTML40|HT50)

◆ VERS_FROM40

#define VERS_FROM40 (VERS_HTML40|VERS_XHTML11|VERS_BASIC|VERS_HTML5)

◆ VERS_HTML20

#define VERS_HTML20 (HT20)

◆ VERS_HTML32

#define VERS_HTML32 (HT32)

◆ VERS_HTML40

#define VERS_HTML40 (VERS_HTML40_STRICT|VERS_HTML40_LOOSE|VERS_FRAMESET)

◆ VERS_HTML40_LOOSE

#define VERS_HTML40_LOOSE (H40T|H41T|X10T)

◆ VERS_HTML40_STRICT

#define VERS_HTML40_STRICT (H40S|H41S|X10S)

◆ VERS_HTML5

#define VERS_HTML5 (HT50|XH50)

◆ VERS_IFRAME

#define VERS_IFRAME (VERS_HTML40_LOOSE|VERS_FRAMESET)

◆ VERS_LOOSE

#define VERS_LOOSE (VERS_HTML20|VERS_HTML32|VERS_IFRAME)

◆ VERS_MICROSOFT

#define VERS_MICROSOFT 32768u

◆ VERS_NETSCAPE

#define VERS_NETSCAPE 16384u

◆ VERS_PROPRIETARY

#define VERS_PROPRIETARY (VERS_NETSCAPE|VERS_MICROSOFT|VERS_SUN)

◆ VERS_STRICT

#define VERS_STRICT (VERS_HTML5|VERS_HTML40_STRICT)

◆ VERS_SUN

#define VERS_SUN 8192u

◆ VERS_UNKNOWN

#define VERS_UNKNOWN (xxxx)

◆ VERS_XHTML

#define VERS_XHTML (X10S|X10T|X10F|XH11|XB10|XH50)

◆ VERS_XHTML11

#define VERS_XHTML11 (XH11)

◆ VERS_XML

#define VERS_XML 65536u

◆ white

#define white 8u

◆ X10F

#define X10F 1024u

◆ X10S

#define X10S 256u

◆ X10T

#define X10T 512u

◆ XB10

#define XB10 4096u

◆ XH11

#define XH11 2048u

◆ XH50

#define XH50 262144u

◆ xxxx

#define xxxx 0u

If the document uses just HTML 2.0 tags and attributes described it is HTML 2.0.

Similarly for HTML 3.2 and the 3 flavors of HTML 4.0. If there are proprietary tags and attributes then describe it as HTML Proprietary. If it includes the xml-lang or xmlns attributes but is otherwise HTML 2.0, 3.2 or 4.0 then describe it as one of the flavors of Voyager (strict, loose or frameset).

Enumeration Type Documentation

◆ GetTokenMode

enum GetTokenMode

modes for GetToken()

MixedContent – for elements which don't accept PCDATA; Preformatted – white space preserved as is; IgnoreMarkup – for CDATA elements such as script, style.

Enumerator
IgnoreWhitespace
MixedContent
Preformatted
IgnoreMarkup
OtherNamespace
CdataContent

◆ LexerState

enum LexerState

Lexer GetToken() states.

Enumerator
LEX_CONTENT
LEX_GT
LEX_ENDTAG
LEX_STARTTAG
LEX_COMMENT
LEX_DOCTYPE
LEX_PROCINSTR
LEX_CDATA
LEX_SECTION
LEX_ASP
LEX_JSTE
LEX_PHP
LEX_XMLDECL

◆ NodeType

enum NodeType

node->type is one of these values

Enumerator
RootNode
DocTypeTag
CommentTag
ProcInsTag
TextNode
StartTag
EndTag
StartEndTag
CDATATag
SectionTag
AspTag
JsteTag
PhpTag
XmlDecl

◆ ParseDocTypeDeclState

enum ParseDocTypeDeclState

ParseDocTypeDecl state constants.

Enumerator
DT_INTERMEDIATE
DT_DOCTYPENAME
DT_PUBLICSYSTEM
DT_QUOTEDSTRING
DT_INTSUBSET

Function Documentation

◆ TY_❪AddCharToLexer❫()

TY_PRIVATE void TY_❪AddCharToLexer❫	(	Lexer *	lexer,
		uint	c
	)

Store character c as UTF-8 encoded byte stream.

◆ TY_❪AddGenerator❫()

TY_PRIVATE Bool TY_❪AddGenerator❫ ( TidyDocImpl * doc )

Add meta element for Tidy.

◆ TY_❪AddStringLiteral❫()

TY_PRIVATE void TY_❪AddStringLiteral❫	(	Lexer *	lexer,
		ctmbstr	str
	)

◆ TY_❪ApparentVersion❫()

TY_PRIVATE uint TY_❪ApparentVersion❫ ( TidyDocImpl * doc )

◆ TY_❪CloneNode❫()

TY_PRIVATE Node* TY_❪CloneNode❫	(	TidyDocImpl *	doc,
		Node *	element
	)

Used to clone heading nodes when split by an <HR>

◆ TY_❪ConstrainVersion❫()

TY_PRIVATE void TY_❪ConstrainVersion❫	(	TidyDocImpl *	doc,
		uint	vers
	)

Everything is allowed in proprietary version of HTML.

This is handled here rather than in the tag/attr dicts

◆ TY_❪DeferDup❫()

TY_PRIVATE void TY_❪DeferDup❫ ( TidyDocImpl * doc )

Defer duplicates when entering a table or other element where the inlines shouldn't be duplicated.

◆ TY_❪DetachAttribute❫()

TY_PRIVATE void TY_❪DetachAttribute❫	(	Node *	node,
		AttVal *	attr
	)

Detach attribute from node.

◆ TY_❪DupAttrs❫()

TY_PRIVATE AttVal* TY_❪DupAttrs❫	(	TidyDocImpl *	doc,
		AttVal *	attrs
	)

Duplicate attributes.

◆ TY_❪FindBody❫()

TY_PRIVATE Node* TY_❪FindBody❫ ( TidyDocImpl * doc )

◆ TY_❪FindContainer❫()

TY_PRIVATE Node* TY_❪FindContainer❫ ( Node * node )

Returns containing block element, if any.

◆ TY_❪FindDocType❫()

TY_PRIVATE Node* TY_❪FindDocType❫ ( TidyDocImpl * doc )

◆ TY_❪FindHEAD❫()

TY_PRIVATE Node* TY_❪FindHEAD❫ ( TidyDocImpl * doc )

◆ TY_❪FindHTML❫()

TY_PRIVATE Node* TY_❪FindHTML❫ ( TidyDocImpl * doc )

◆ TY_❪FindTITLE❫()

TY_PRIVATE Node* TY_❪FindTITLE❫ ( TidyDocImpl * doc )

◆ TY_❪FindXmlDecl❫()

TY_PRIVATE Node* TY_❪FindXmlDecl❫ ( TidyDocImpl * doc )

◆ TY_❪FixDocType❫()

TY_PRIVATE Bool TY_❪FixDocType❫ ( TidyDocImpl * doc )

Fixup doctype if missing.

◆ TY_❪FixXmlDecl❫()

TY_PRIVATE Bool TY_❪FixXmlDecl❫ ( TidyDocImpl * doc )

Ensure XML document starts with <?xml version="1.0"?>, and add encoding attribute if not using ASCII or UTF-8 output.

◆ TY_❪FreeAttribute❫()

TY_PRIVATE void TY_❪FreeAttribute❫	(	TidyDocImpl *	doc,
		AttVal *	av
	)

Doesn't repair attribute list linkage.

◆ TY_❪FreeAttrs❫()

TY_PRIVATE void TY_❪FreeAttrs❫	(	TidyDocImpl *	doc,
		Node *	node
	)

Free node's attributes.

◆ TY_❪FreeLexer❫()

TY_PRIVATE void TY_❪FreeLexer❫ ( TidyDocImpl * doc )

◆ TY_❪FreeNode❫()

TY_PRIVATE void TY_❪FreeNode❫	(	TidyDocImpl *	doc,
		Node *	node
	)

Free document nodes by iterating through peers and recursing through children.

Set next to NULL before calling FreeNode() to avoid freeing peer nodes. Doesn't patch up prev/next links.

◆ TY_❪freeStack❫()

TY_PRIVATE void TY_❪freeStack❫ ( Stack * stack )

Frees the stack when done.

◆ TY_❪GetToken❫()

TY_PRIVATE Node* TY_❪GetToken❫	(	TidyDocImpl *	doc,
		GetTokenMode	mode
	)

◆ TY_❪growStack❫()

TY_PRIVATE void TY_❪growStack❫ ( Stack * stack )

Increase the stack size.

This will be called automatically when the current stack is full. If memory allocation fails, then the allocator will panic the program automatically.

◆ TY_❪HTMLVersionNameFromCode❫()

TY_PRIVATE ctmbstr TY_❪HTMLVersionNameFromCode❫	(	uint	vers,
		Bool	isXhtml
	)

◆ TY_❪HTMLVersionNumberFromCode❫()

TY_PRIVATE uint TY_❪HTMLVersionNumberFromCode❫ ( uint vers )

◆ TY_❪HTMLVersion❫()

TY_PRIVATE int TY_❪HTMLVersion❫ ( TidyDocImpl * doc )

Choose what version to use for new doctype.

◆ TY_❪InferredTag❫()

TY_PRIVATE Node* TY_❪InferredTag❫	(	TidyDocImpl *	doc,
		TidyTagId	id
	)

◆ TY_❪InitMap❫()

TY_PRIVATE void TY_❪InitMap❫ ( void )

◆ TY_❪InlineDup1❫()

TY_PRIVATE Bool TY_❪InlineDup1❫	(	TidyDocImpl *	doc,
		Node *	node,
		Node *	element
	)

◆ TY_❪InlineDup❫()

TY_PRIVATE int TY_❪InlineDup❫	(	TidyDocImpl *	doc,
		Node *	node
	)

This has the effect of inserting "missing" inline elements around the contents of blocklevel elements such as P, TD, TH, DIV, PRE etc.

This procedure is called at the start of ParseBlock, when the inline stack is not empty, as will be the case in:

<i><h1>italic heading</h1></i>

which is then treated as equivalent to

<h1><i>italic heading</i></h1>

This is implemented by setting the lexer into a mode where it gets tokens from the inline stack rather than from the input stream.

◆ TY_❪InsertAttributeAtEnd❫()

TY_PRIVATE void TY_❪InsertAttributeAtEnd❫	(	Node *	node,
		AttVal *	av
	)

Insert attribute at the end of attribute list of a node.

◆ TY_❪InsertAttributeAtStart❫()

TY_PRIVATE void TY_❪InsertAttributeAtStart❫	(	Node *	node,
		AttVal *	av
	)

Insert attribute at the start of attribute list of a node.

◆ TY_❪InsertedToken❫()

TY_PRIVATE Node* TY_❪InsertedToken❫ ( TidyDocImpl * doc )

◆ TY_❪IsDigit❫()

TY_PRIVATE Bool TY_❪IsDigit❫ ( uint c )

◆ TY_❪IsHTMLSpace❫()

TY_PRIVATE Bool TY_❪IsHTMLSpace❫ ( uint c )

◆ TY_❪IsLetter❫()

TY_PRIVATE Bool TY_❪IsLetter❫ ( uint c )

◆ TY_❪IsNamechar❫()

TY_PRIVATE Bool TY_❪IsNamechar❫ ( uint c )

◆ TY_❪IsNewline❫()

TY_PRIVATE Bool TY_❪IsNewline❫ ( uint c )

◆ TY_❪IsPushedLast❫()

TY_PRIVATE Bool TY_❪IsPushedLast❫	(	TidyDocImpl *	doc,
		Node *	element,
		Node *	node
	)

◆ TY_❪IsPushed❫()

TY_PRIVATE Bool TY_❪IsPushed❫	(	TidyDocImpl *	doc,
		Node *	node
	)

◆ TY_❪IsUpper❫()

TY_PRIVATE Bool TY_❪IsUpper❫ ( uint c )

◆ TY_❪IsWhite❫()

TY_PRIVATE Bool TY_❪IsWhite❫ ( uint c )

◆ TY_❪IsXMLLetter❫()

TY_PRIVATE Bool TY_❪IsXMLLetter❫ ( uint c )

◆ TY_❪IsXMLNamechar❫()

TY_PRIVATE Bool TY_❪IsXMLNamechar❫ ( uint c )

◆ TY_❪NewAttributeEx❫()

TY_PRIVATE AttVal* TY_❪NewAttributeEx❫	(	TidyDocImpl *	doc,
		ctmbstr	name,
		ctmbstr	value,
		int	delim
	)

Create a new attribute with given name and value.

◆ TY_❪NewAttribute❫()

TY_PRIVATE AttVal* TY_❪NewAttribute❫ ( TidyDocImpl * doc )

Create a new attribute.

◆ TY_❪NewLexer❫()

TY_PRIVATE Lexer* TY_❪NewLexer❫ ( TidyDocImpl * doc )

◆ TY_❪NewLineNode❫()

TY_PRIVATE Node* TY_❪NewLineNode❫ ( Lexer * lexer )

Used for creating preformatted text from Word2000.

◆ TY_❪NewLiteralTextNode❫()

TY_PRIVATE Node* TY_❪NewLiteralTextNode❫	(	Lexer *	lexer,
		ctmbstr	txt
	)

Used for adding a &nbsp; for Word2000.

◆ TY_❪NewNode❫()

TY_PRIVATE Node* TY_❪NewNode❫	(	TidyAllocator *	allocator,
		Lexer *	lexer
	)

Used for elements and text nodes.

Element name is NULL for text nodes.
start and end are offsets into lexbuf, which contains the textual content of all elements in the parse tree.
parent and content allow traversal of the parse tree in any direction.
attributes are represented as a linked list of AttVal nodes which hold the strings for attribute/value pairs.

◆ TY_❪newStack❫()

TY_PRIVATE Stack* TY_❪newStack❫	(	TidyDocImpl *	doc,
		uint	capacity
	)

Create a new stack with a given starting capacity.

If memory allocation fails, then the allocator will panic the program automatically.

◆ TY_❪peek❫()

TY_PRIVATE Node* TY_❪peek❫ ( Stack * stack )

Peek at the stack.

◆ TY_❪PopInline❫()

TY_PRIVATE void TY_❪PopInline❫	(	TidyDocImpl *	doc,
		Node *	node
	)

Pop inline stack.

◆ TY_❪pop❫()

TY_PRIVATE Node* TY_❪pop❫ ( Stack * stack )

Pop an item from the stack.

◆ TY_❪PushInline❫()

TY_PRIVATE void TY_❪PushInline❫	(	TidyDocImpl *	doc,
		Node *	node
	)

Push a copy of an inline node onto stack, but don't push if implicit or OBJECT or APPLET (implicit tags are ones generated from the istack).

One issue arises with pushing inlines when the tag is already pushed. For instance:

<p><em>text

<p><em>more text

Shouldn't be mapped to

<p><em>text</em>

<p><em><em>more text</em></em>

◆ TY_❪push❫()

TY_PRIVATE void TY_❪push❫	(	Stack *	stack,
		Node *	node
	)

Push an item to the stack.

◆ TY_❪RemoveAttribute❫()

TY_PRIVATE void TY_❪RemoveAttribute❫	(	TidyDocImpl *	doc,
		Node *	node,
		AttVal *	attr
	)

Detach attribute from node then free it.

◆ TY_❪SetXHTMLDocType❫()

TY_PRIVATE Bool TY_❪SetXHTMLDocType❫ ( TidyDocImpl * doc )

◆ TY_❪stackEmpty❫()

TY_PRIVATE Bool TY_❪stackEmpty❫ ( Stack * stack )

Stack is empty when top is equal to -1.

◆ TY_❪stackFull❫()

TY_PRIVATE Bool TY_❪stackFull❫ ( Stack * stack )

Stack is full when top is equal to the last index.

◆ TY_❪SwitchInline❫()

TY_PRIVATE Bool TY_❪SwitchInline❫	(	TidyDocImpl *	doc,
		Node *	element,
		Node *	node
	)

Stack manipulation for inline elements.

◆ TY_❪TextToken❫()

TY_PRIVATE Node* TY_❪TextToken❫ ( Lexer * lexer )

◆ TY_❪ToLower❫()

TY_PRIVATE uint TY_❪ToLower❫ ( uint c )

◆ TY_❪ToUpper❫()

TY_PRIVATE uint TY_❪ToUpper❫ ( uint c )

◆ TY_❪UngetToken❫()

TY_PRIVATE void TY_❪UngetToken❫ ( TidyDocImpl * doc )

◆ TY_❪WarnMissingSIInEmittedDocType❫()

TY_PRIVATE Bool TY_❪WarnMissingSIInEmittedDocType❫ ( TidyDocImpl * doc )

Detailed Description

Data Structures

Macros

Enumerations

Lexer Functions

Inline Stack Functions

Generic stack of nodes.

Data Structure Documentation

◆ _AttVal

◆ _IStack

◆ _Lexer

◆ _Node

◆ _Style

◆ _StyleProp

◆ Stack

Macro Definition Documentation

◆ CM_BLOCK

◆ CM_DEFLIST

◆ CM_EMPTY

◆ CM_FIELD

◆ CM_FRAMES

◆ CM_HEAD

◆ CM_HEADING

◆ CM_HTML

◆ CM_IMG

◆ CM_INLINE

◆ CM_LIST

◆ CM_MIXED

◆ CM_NEW

◆ CM_NO_INDENT

◆ CM_OBJECT

◆ CM_OBSOLETE

◆ CM_OMITST

◆ CM_OPT

◆ CM_PARAM

◆ CM_ROW

◆ CM_ROWGRP

◆ CM_TABLE

◆ CM_UNKNOWN

◆ CM_VOID

◆ digit

◆ digithex

◆ H40F

◆ H40S

◆ H40T

◆ H41F

◆ H41S

◆ H41T

◆ HT20

◆ HT32

◆ HT50

◆ letter

◆ lowercase

◆ namechar

◆ newline

◆ uppercase

◆ VERS_ALL

◆ VERS_BASIC

◆ VERS_EVENTS

◆ VERS_FRAMESET

◆ VERS_FROM32

◆ VERS_FROM40

◆ VERS_HTML20

◆ VERS_HTML32

◆ VERS_HTML40

◆ VERS_HTML40_LOOSE

◆ VERS_HTML40_STRICT

◆ VERS_HTML5

◆ VERS_IFRAME

◆ VERS_LOOSE

◆ VERS_MICROSOFT

◆ VERS_NETSCAPE

◆ VERS_PROPRIETARY

◆ VERS_STRICT

◆ VERS_SUN

◆ VERS_UNKNOWN

◆ VERS_XHTML

◆ VERS_XHTML11

◆ VERS_XML

◆ white