HTML Tidy  5.9.15
The HTACG Tidy HTML Project
lexer.h
Go to the documentation of this file.
1 #ifndef __LEXER_H__
2 #define __LEXER_H__
3 
4 
5 /**************************************************************************//**
6  * @file
7  * Lexer for HTML and XML Parsers.
8  *
9  * Given an input source, it returns a sequence of tokens.
10  *
11  * GetToken(source) gets the next token
12  * UngetToken(source) provides one level undo
13  *
14  * The tags include an attribute list:
15  *
16  * - linked list of attribute/value nodes
17  * - each node has 2 NULL-terminated strings.
18  * - entities are replaced in attribute values
19  *
20  * white space is compacted if not in preformatted mode
21  * If not in preformatted mode then leading white space
22  * is discarded and subsequent white space sequences
23  * compacted to single space characters.
24  *
25  * If XmlTags is no then Tag names are folded to upper
26  * case and attribute names to lower case.
27  *
28  * Not yet done:
29  * - Doctype subset and marked sections
30  *
31  * @author HTACG, et al (consult git log)
32  *
33  * @copyright
34  * (c) 1998-2021 (W3C) MIT, ERCIM, Keio University, and HTACG.
35  * See tidy.h for the copyright notice.
36  * @par
37  * All Rights Reserved.
38  * @par
39  * See `tidy.h` for the complete license.
40  *
41  * @date Additional updates: consult git log
42  *
43  ******************************************************************************/
44 
45 #ifdef __cplusplus
46 extern "C" {
47 #endif
48 
49 #include "forward.h"
50 
51 /** @addtogroup internal_api */
52 /** @{ */
53 
54 
55 /***************************************************************************//**
56  ** @defgroup lexer_h HTML and XML Lexing
57  **
58  ** These functions and structures form the internal API for document
59  ** lexing.
60  **
61  ** @{
62  ******************************************************************************/
63 
64 
65 /**
66  * Lexer character types.
67  */
68 #define digit 1u
69 #define letter 2u
70 #define namechar 4u
71 #define white 8u
72 #define newline 16u
73 #define lowercase 32u
74 #define uppercase 64u
75 #define digithex 128u
76 
77 
78 /**
79  * node->type is one of these values
80  */
81 typedef enum
82 {
96  XmlDecl
98 
99 
100 /**
101  * Lexer GetToken() states.
102  */
103 typedef enum
104 {
119 
120 
121 /**
122  * ParseDocTypeDecl state constants.
123  */
124 typedef enum
125 {
132 
133 
134 /**
135  * Content model shortcut encoding.
136  * Descriptions are tentative.
137  */
138 #define CM_UNKNOWN 0
139 #define CM_EMPTY (1 << 0) /**< Elements with no content. Map to HTML specification. */
140 #define CM_HTML (1 << 1) /**< Elements that appear outside of "BODY". */
141 #define CM_HEAD (1 << 2) /**< Elements that can appear within HEAD. */
142 #define CM_BLOCK (1 << 3) /**< HTML "block" elements. */
143 #define CM_INLINE (1 << 4) /**< HTML "inline" elements. */
144 #define CM_LIST (1 << 5) /**< Elements that mark list item ("LI"). */
145 #define CM_DEFLIST (1 << 6) /**< Elements that mark definition list item ("DL", "DT"). */
146 #define CM_TABLE (1 << 7) /**< Elements that can appear inside TABLE. */
147 #define CM_ROWGRP (1 << 8) /**< Used for "THEAD", "TFOOT" or "TBODY". */
148 #define CM_ROW (1 << 9) /**< Used for "TD", "TH" */
149 #define CM_FIELD (1 << 10) /**< Elements whose content must be protected against white space movement. Includes some elements that can be found in forms. */
150 #define CM_OBJECT (1 << 11) /**< Used to avoid propagating inline emphasis inside some elements such as OBJECT or APPLET. */
151 #define CM_PARAM (1 << 12) /**< Elements that allow "PARAM". */
152 #define CM_FRAMES (1 << 13) /**< "FRAME", "FRAMESET", "NOFRAMES". Used in ParseFrameSet. */
153 #define CM_HEADING (1 << 14) /**< Heading elements (h1, h2, ...). */
154 #define CM_OPT (1 << 15) /**< Elements with an optional end tag. */
155 #define CM_IMG (1 << 16) /**< Elements that use "align" attribute for vertical position. */
156 #define CM_MIXED (1 << 17) /**< Elements with inline and block model. Used to avoid calling InlineDup. */
157 #define CM_NO_INDENT (1 << 18) /**< Elements whose content needs to be indented only if containing one CM_BLOCK element. */
158 #define CM_OBSOLETE (1 << 19) /**< Elements that are obsolete (such as "dir", "menu"). */
159 #define CM_NEW (1 << 20) /**< User defined elements. Used to determine how attributes without value should be printed. */
160 #define CM_OMITST (1 << 21) /**< Elements that cannot be omitted. */
161 #define CM_VOID (1 << 22) /**< Elements that are void per https://www.w3.org/TR/2011/WD-html-markup-20110113/syntax.html#syntax-elements. */
162 
163 
164 /**
165  * If the document uses just HTML 2.0 tags and attributes described
166  * it is HTML 2.0. Similarly for HTML 3.2 and the 3 flavors of HTML 4.0.
167  * If there are proprietary tags and attributes then describe it as
168  * HTML Proprietary. If it includes the xml-lang or xmlns attributes
169  * but is otherwise HTML 2.0, 3.2 or 4.0 then describe it as one of the
170  * flavors of Voyager (strict, loose or frameset).
171  */
172 
173 /* unknown */
174 #define xxxx 0u
175 
176 /* W3C defined HTML/XHTML family document types */
177 #define HT20 1u
178 #define HT32 2u
179 #define H40S 4u
180 #define H40T 8u
181 #define H40F 16u
182 #define H41S 32u
183 #define H41T 64u
184 #define H41F 128u
185 #define X10S 256u
186 #define X10T 512u
187 #define X10F 1024u
188 #define XH11 2048u
189 #define XB10 4096u
190 
191 /* proprietary stuff */
192 #define VERS_SUN 8192u
193 #define VERS_NETSCAPE 16384u
194 #define VERS_MICROSOFT 32768u
195 
196 /* special flag */
197 #define VERS_XML 65536u
198 
199 /* HTML5 */
200 #define HT50 131072u
201 #define XH50 262144u
202 
203 /* compatibility symbols */
204 #define VERS_UNKNOWN (xxxx)
205 #define VERS_HTML20 (HT20)
206 #define VERS_HTML32 (HT32)
207 #define VERS_HTML40_STRICT (H40S|H41S|X10S)
208 #define VERS_HTML40_LOOSE (H40T|H41T|X10T)
209 #define VERS_FRAMESET (H40F|H41F|X10F)
210 #define VERS_XHTML11 (XH11)
211 #define VERS_BASIC (XB10)
212 /* HTML5 */
213 #define VERS_HTML5 (HT50|XH50)
214 
215 /* meta symbols */
216 #define VERS_HTML40 (VERS_HTML40_STRICT|VERS_HTML40_LOOSE|VERS_FRAMESET)
217 #define VERS_IFRAME (VERS_HTML40_LOOSE|VERS_FRAMESET)
218 #define VERS_LOOSE (VERS_HTML20|VERS_HTML32|VERS_IFRAME)
219 #define VERS_EVENTS (VERS_HTML40|VERS_XHTML11)
220 #define VERS_FROM32 (VERS_HTML32|VERS_HTML40|HT50)
221 #define VERS_FROM40 (VERS_HTML40|VERS_XHTML11|VERS_BASIC|VERS_HTML5)
222 #define VERS_XHTML (X10S|X10T|X10F|XH11|XB10|XH50)
223 
224 /* strict */
225 #define VERS_STRICT (VERS_HTML5|VERS_HTML40_STRICT)
226 
227 /* all W3C defined document types */
228 #define VERS_ALL (VERS_HTML20|VERS_HTML32|VERS_FROM40|XH50|HT50)
229 
230 /* all proprietary types */
231 #define VERS_PROPRIETARY (VERS_NETSCAPE|VERS_MICROSOFT|VERS_SUN)
232 
233 
234 /**
235  * Linked list of class names and styles
236  */
237 struct _Style;
238 typedef struct _Style TagStyle;
239 
240 struct _Style
241 {
245  TagStyle *next;
246 };
247 
248 
249 /**
250  * Linked list of style properties
251  */
252 struct _StyleProp;
253 typedef struct _StyleProp StyleProp;
254 
256 {
259  StyleProp *next;
260 };
261 
262 
263 /**
264  * Attribute/Value linked list node
265  */
266 struct _AttVal
267 {
268  AttVal* next;
269  const Attribute* dict;
270  Node* asp;
271  Node* php;
272  int delim;
275 };
276 
277 
278 /**
279  * Mosaic handles inlines via a separate stack from other elements
280  * We duplicate this to recover from inline markup errors such as:
281  * ~~~
282  * <i>italic text
283  * <p>more italic text</b> normal text
284  * ~~~
285  * which for compatibility with Mosaic is mapped to:
286  * ~~~
287  * <i>italic text</i>
288  * <p><i>more italic text</i> normal text
289  * ~~~
290  * Note that any inline end tag pops the effect of the current
291  * inline start tag, so that `</b>` pops `<i>` in the above example.
292 */
293 struct _IStack
294 {
295  IStack* next;
296  const Dict* tag; /**< tag's dictionary definition */
297  tmbstr element; /**< name (NULL for text nodes) */
298  AttVal* attributes;
299 };
300 
301 
302 /**
303  * HTML/XHTML/XML Element, Comment, PI, DOCTYPE, XML Decl, etc., etc.
304  */
305 struct _Node
306 {
307  Node* parent; /**< tree structure */
308  Node* prev; /**< NOTE(review): presumably the previous peer node -- confirm */
309  Node* next; /**< next peer node */
310  Node* content; /**< child content; with `parent`, allows traversal of the parse tree */
311  Node* last; /**< NOTE(review): presumably the last node in `content` -- confirm */
312 
313  AttVal* attributes; /**< linked list of attribute/value nodes */
314  const Dict* was; /**< old tag when it was changed */
315  const Dict* tag; /**< tag's dictionary definition */
316 
317  tmbstr element; /**< name (NULL for text nodes) */
318 
319  uint start; /**< start of span onto text array */
320  uint end; /**< end of span onto text array */
321  NodeType type; /**< TextNode, StartTag, EndTag etc. */
322 
323  uint line; /**< current line of document */
324  uint column; /**< current column of document */
325 
326  Bool closed; /**< true if closed by explicit end tag */
327  Bool implicit; /**< true if inferred */
328  Bool linebreak; /**< true if followed by a line break */
329 };
330 
331 
332 /**
333  * The following are private to the lexer.
334  * Use `NewLexer()` to create a lexer, and `FreeLexer()` to free it.
335  */
336 struct _Lexer
337 {
338  uint lines; /**< lines seen */
339  uint columns; /**< at start of current token */
340  Bool waswhite; /**< used to collapse contiguous white space */
341  Bool pushed; /**< true after token has been pushed back */
342  Bool insertspace; /**< when space is moved after end tag */
343  Bool excludeBlocks; /**< Netscape compatibility */
344  Bool exiled; /**< true if moved out of table */
345  Bool isvoyager; /**< true if xmlns attribute on html element (i.e., "Voyager" was the W3C codename for XHTML). */
346  uint versions; /**< bit vector of HTML versions */
347  uint doctype; /**< version as given by doctype (if any) */
348  uint versionEmitted; /**< version of doctype emitted */
349  Bool bad_doctype; /**< e.g. if html or PUBLIC is missing */
350  uint txtstart; /**< start of current node */
351  uint txtend; /**< end of current node */
352  LexerState state; /**< state of lexer's finite state machine */
353 
354  Node* token; /**< last token returned by GetToken() */
355  Node* itoken; /**< last duplicate inline returned by GetToken() */
356  Node* root; /**< remember root node of the document */
357  Node* parent; /**< remember parent node for CDATA elements */
358 
359  Bool seenEndBody; /**< true if a `</body>` tag has been encountered */
360  Bool seenEndHtml; /**< true if a `</html>` tag has been encountered */
361 
362  /*
363  Lexer character buffer
364 
365  Parse tree nodes span onto this buffer
366  which contains the concatenated text
367  contents of all of the elements.
368 
369  lexsize must be reset for each file.
370  */
371  tmbstr lexbuf; /**< MB character buffer */
372  uint lexlength; /**< allocated */
373  uint lexsize; /**< used */
374 
375  /* Inline stack for compatibility with Mosaic */
376  Node* inode; /**< for deferring text node */
377  IStack* insert; /**< for inferring inline tags */
378  IStack* istack;
379  uint istacklength; /**< allocated */
380  uint istacksize; /**< used */
381  uint istackbase; /**< start of frame */
382 
383  TagStyle *styles; /**< used for cleaning up presentation markup */
384 
385  TidyAllocator* allocator; /**< allocator */
386 };
387 
388 
389 /**
390  * modes for GetToken()
391  *
392  * MixedContent -- for elements which don't accept PCDATA
393  * Preformatted -- white space preserved as is
394  * IgnoreMarkup -- for CDATA elements such as script, style
395  */
396 typedef enum
397 {
405 
406 
407 /** @name Lexer Functions
408  * @{
409  */
410 
411 
412 /**
413  * Choose what version to use for new doctype
414  */
415 TY_PRIVATE int TY_(HTMLVersion)( TidyDocImpl* doc );
416 
417 
418 /**
419  * Everything is allowed in proprietary version of HTML.
420  * This is handled here rather than in the tag/attr dicts
421  */
422 TY_PRIVATE void TY_(ConstrainVersion)( TidyDocImpl* doc, uint vers );
423 
424 TY_PRIVATE Bool TY_(IsWhite)(uint c);
425 TY_PRIVATE Bool TY_(IsDigit)(uint c);
426 TY_PRIVATE Bool TY_(IsLetter)(uint c);
427 TY_PRIVATE Bool TY_(IsHTMLSpace)(uint c);
428 TY_PRIVATE Bool TY_(IsNewline)(uint c);
429 TY_PRIVATE Bool TY_(IsNamechar)(uint c);
430 TY_PRIVATE Bool TY_(IsXMLLetter)(uint c);
431 TY_PRIVATE Bool TY_(IsXMLNamechar)(uint c);
432 
433 TY_PRIVATE Bool TY_(IsUpper)(uint c);
434 TY_PRIVATE uint TY_(ToLower)(uint c);
435 TY_PRIVATE uint TY_(ToUpper)(uint c);
436 
437 TY_PRIVATE Lexer* TY_(NewLexer)( TidyDocImpl* doc );
438 TY_PRIVATE void TY_(FreeLexer)( TidyDocImpl* doc );
439 
440 
441 /**
442  * Store character c as UTF-8 encoded byte stream
443  */
444 TY_PRIVATE void TY_(AddCharToLexer)( Lexer *lexer, uint c );
445 
446 
447 /**
448  * Used for elements and text nodes.
449  * - Element name is NULL for text nodes.
450  * - start and end are offsets into lexbuf,
451  * which contains the textual content of
452  * all elements in the parse tree.
453  * - parent and content allow traversal
454  * of the parse tree in any direction.
455  * - attributes are represented as a linked
456  * list of AttVal nodes which hold the
457  * strings for attribute/value pairs.
458 */
459 TY_PRIVATE Node* TY_(NewNode)( TidyAllocator* allocator, Lexer* lexer );
460 
461 
462 /**
463  * Used to clone heading nodes when split by an `<HR>`
464  */
465 TY_PRIVATE Node* TY_(CloneNode)( TidyDocImpl* doc, Node *element );
466 
467 
468 /**
469  * Free node's attributes
470  */
471 TY_PRIVATE void TY_(FreeAttrs)( TidyDocImpl* doc, Node *node );
472 
473 
474 /**
475  * Doesn't repair attribute list linkage
476  */
477 TY_PRIVATE void TY_(FreeAttribute)( TidyDocImpl* doc, AttVal *av );
478 
479 
480 /**
481  * Detach attribute from node
482  */
483 TY_PRIVATE void TY_(DetachAttribute)( Node *node, AttVal *attr );
484 
485 
486 /**
487  * Detach attribute from node then free it.
488  */
489 TY_PRIVATE void TY_(RemoveAttribute)( TidyDocImpl* doc, Node *node, AttVal *attr );
490 
491 
492 /**
493  * Free document nodes by iterating through peers and recursing
494  * through children. Set `next` to `NULL` before calling `FreeNode()`
495  * to avoid freeing peer nodes. Doesn't patch up prev/next links.
496  */
497 TY_PRIVATE void TY_(FreeNode)( TidyDocImpl* doc, Node *node );
498 
499 
500 TY_PRIVATE Node* TY_(TextToken)( Lexer *lexer );
501 
502 
503 /**
504  * Used for creating preformatted text from Word2000.
505  */
506 TY_PRIVATE Node* TY_(NewLineNode)( Lexer *lexer );
507 
508 
509 /**
510  * Used for adding a &nbsp; for Word2000.
511  */
512 TY_PRIVATE Node* TY_(NewLiteralTextNode)(Lexer *lexer, ctmbstr txt );
513 
514 
515 TY_PRIVATE void TY_(AddStringLiteral)( Lexer* lexer, ctmbstr str );
516 TY_PRIVATE Node* TY_(FindDocType)( TidyDocImpl* doc );
517 TY_PRIVATE Node* TY_(FindHTML)( TidyDocImpl* doc );
518 TY_PRIVATE Node* TY_(FindHEAD)( TidyDocImpl* doc );
519 TY_PRIVATE Node* TY_(FindTITLE)(TidyDocImpl* doc);
520 TY_PRIVATE Node* TY_(FindBody)( TidyDocImpl* doc );
521 TY_PRIVATE Node* TY_(FindXmlDecl)(TidyDocImpl* doc);
522 
523 
524 /**
525  * Returns containing block element, if any
526  */
527 TY_PRIVATE Node* TY_(FindContainer)( Node* node );
528 
529 
530 /**
531  * Add meta element for Tidy.
532  */
533 TY_PRIVATE Bool TY_(AddGenerator)( TidyDocImpl* doc );
534 
535 TY_PRIVATE uint TY_(ApparentVersion)( TidyDocImpl* doc );
536 
537 TY_PRIVATE ctmbstr TY_(HTMLVersionNameFromCode)( uint vers, Bool isXhtml );
538 
539 TY_PRIVATE uint TY_(HTMLVersionNumberFromCode)( uint vers );
540 
541 TY_PRIVATE Bool TY_(WarnMissingSIInEmittedDocType)( TidyDocImpl* doc );
542 
543 TY_PRIVATE Bool TY_(SetXHTMLDocType)( TidyDocImpl* doc );
544 
545 
546 /**
547  * Fixup doctype if missing.
548  */
549 TY_PRIVATE Bool TY_(FixDocType)( TidyDocImpl* doc );
550 
551 
552 /**
553  * Ensure XML document starts with <?xml version="1.0"?>, and
554  * add encoding attribute if not using ASCII or UTF-8 output.
555  */
556 TY_PRIVATE Bool TY_(FixXmlDecl)( TidyDocImpl* doc );
557 
558 
559 TY_PRIVATE Node* TY_(InferredTag)(TidyDocImpl* doc, TidyTagId id);
560 
561 TY_PRIVATE void TY_(UngetToken)( TidyDocImpl* doc );
562 
563 TY_PRIVATE Node* TY_(GetToken)( TidyDocImpl* doc, GetTokenMode mode );
564 
565 TY_PRIVATE void TY_(InitMap)(void);
566 
567 
568 /**
569  * Create a new attribute.
570  */
571 TY_PRIVATE AttVal* TY_(NewAttribute)( TidyDocImpl* doc );
572 
573 
574 /**
575  * Create a new attribute with given name and value.
576  */
577 TY_PRIVATE AttVal* TY_(NewAttributeEx)( TidyDocImpl* doc, ctmbstr name, ctmbstr value,
578  int delim );
579 
580 
581 /**
582  * Insert attribute at the end of attribute list of a node.
583  */
584 TY_PRIVATE void TY_(InsertAttributeAtEnd)( Node *node, AttVal *av );
585 
586 /**
587  * Insert attribute at the start of attribute list of a node.
588  */
589 TY_PRIVATE void TY_(InsertAttributeAtStart)( Node *node, AttVal *av );
590 
591 
592 /** @}
593  * @name Inline Stack Functions
594  * @{
595  */
596 
597 
598 /**
599  * Duplicate attributes.
600  */
601 TY_PRIVATE AttVal* TY_(DupAttrs)( TidyDocImpl* doc, AttVal* attrs );
602 
603 
604 /**
605  * Push a copy of an inline node onto stack, but don't push if
606  * implicit or OBJECT or APPLET (implicit tags are ones generated
607  * from the istack).
608  *
609  * One issue arises with pushing inlines when the tag is already pushed.
610  * For instance:
611  * ~~~
612  * <p><em>text
613  * <p><em>more text
614  * ~~~
615  * Shouldn't be mapped to
616  * ~~~
617  * <p><em>text</em></p>
618  * <p><em><em>more text</em></em>
619  * ~~~
620  */
621 TY_PRIVATE void TY_(PushInline)( TidyDocImpl* doc, Node* node );
622 
623 
624 /**
625  * Pop inline stack.
626  */
627 TY_PRIVATE void TY_(PopInline)( TidyDocImpl* doc, Node* node );
628 
629 
630 TY_PRIVATE Bool TY_(IsPushed)( TidyDocImpl* doc, Node* node );
631 TY_PRIVATE Bool TY_(IsPushedLast)( TidyDocImpl* doc, Node *element, Node *node );
632 
633 
634 /**
635  * This has the effect of inserting "missing" inline elements around the
636  * contents of blocklevel elements such as P, TD, TH, DIV, PRE etc. This
637  * procedure is called at the start of `ParseBlock`, when the inline
638  * stack is not empty, as will be the case in:
639  * ~~~
640  * <i><h1>italic heading</h1></i>
641  * ~~~
642  * which is then treated as equivalent to
643  * ~~~
644  * <h1><i>italic heading</i></h1>
645  * ~~~
646  * This is implemented by setting the lexer into a mode where it gets
647  * tokens from the inline stack rather than from the input stream.
648  */
649 TY_PRIVATE int TY_(InlineDup)( TidyDocImpl* doc, Node *node );
650 
651 
652 /**
653  * Defer duplicates when entering a table or other
654  * element where the inlines shouldn't be duplicated.
655  */
656 TY_PRIVATE void TY_(DeferDup)( TidyDocImpl* doc );
657 
658 
659 TY_PRIVATE Node* TY_(InsertedToken)( TidyDocImpl* doc );
660 
661 /**
662  * Stack manipulation for inline elements
663  */
664 TY_PRIVATE Bool TY_(SwitchInline)( TidyDocImpl* doc, Node* element, Node* node );
665 
666 
667 TY_PRIVATE Bool TY_(InlineDup1)( TidyDocImpl* doc, Node* node, Node* element );
668 
669 
670 /** @}
671  * @name Generic stack of nodes.
672  * @{
673  */
674 
675 
676 /**
677  * This typedef represents a stack of addresses to nodes. Tidy uses these to
678  * try to limit recursion by pushing nodes to a stack when possible instead
679  * of recursing.
680  */
681 typedef struct _Stack {
682  int top; /**< Current top position; -1 when the stack is empty. */
683  unsigned capacity; /**< Current capacity. Can be expanded. */
684  Node **firstNode; /**< A pointer to the first pointer to a Node in an array of node addresses. */
685  TidyAllocator* allocator; /**< Tidy's allocator, used at instantiation and expanding. */
686 } Stack;
687 
688 
689 /**
690  * Create a new stack with a given starting capacity. If memory allocation
691  * fails, then the allocator will panic the program automatically.
692  */
693 TY_PRIVATE Stack* TY_(newStack)(TidyDocImpl *doc, uint capacity);
694 
695 
696 /**
697  * Increase the stack size. This will be called automatically when the
698  * current stack is full. If memory allocation fails, then the allocator
699  * will panic the program automatically.
700  */
701 TY_PRIVATE void TY_(growStack)(Stack *stack);
702 
703 
704 /**
705  * Stack is full when top is equal to the last index.
706  */
707 TY_PRIVATE Bool TY_(stackFull)(Stack *stack);
708 
709 
710 /**
711  * Stack is empty when top is equal to -1
712  */
713 TY_PRIVATE Bool TY_(stackEmpty)(Stack *stack);
714 
715 
716 /**
717  * Push an item to the stack.
718  */
719 TY_PRIVATE void TY_(push)(Stack *stack, Node *node);
720 
721 
722 /**
723  * Pop an item from the stack.
724  */
725 TY_PRIVATE Node* TY_(pop)(Stack *stack);
726 
727 
728 /**
729  * Peek at the stack.
730  */
731 TY_PRIVATE Node* TY_(peek)(Stack *stack);
732 
733 /**
734  * Frees the stack when done.
735  */
736 TY_PRIVATE void TY_(freeStack)(Stack *stack);
737 
738 
739 /** @}
740  */
741 
742 
743 #ifdef __cplusplus
744 }
745 #endif
746 
747 
748 /** @} end lexer_h group */
749 /** @} end internal_api group */
750 
751 #endif /* __LEXER_H__ */
#define TY_PRIVATE
Definition: forward.h:29
#define TY_(str)
Definition: forward.h:23
Node * next
Definition: lexer.h:309
tmbstr element
name (NULL for text nodes)
Definition: lexer.h:297
Node * parent
tree structure
Definition: lexer.h:307
uint istackbase
start of frame
Definition: lexer.h:381
Node * last
Definition: lexer.h:311
Bool seenEndBody
true if a </body> tag has been encountered
Definition: lexer.h:359
uint txtstart
start of current node
Definition: lexer.h:350
const Attribute * dict
Definition: lexer.h:269
LexerState state
state of lexer's finite state machine
Definition: lexer.h:352
IStack * next
Definition: lexer.h:295
AttVal * attributes
Definition: lexer.h:298
Node * root
remember root node of the document
Definition: lexer.h:356
tmbstr tag
Definition: lexer.h:242
uint istacksize
used
Definition: lexer.h:380
NodeType type
TextNode, StartTag, EndTag etc.
Definition: lexer.h:321
uint columns
at start of current token
Definition: lexer.h:339
TagStyle * styles
used for cleaning up presentation markup
Definition: lexer.h:383
AttVal * next
Definition: lexer.h:268
uint lexlength
allocated
Definition: lexer.h:372
Bool pushed
true after token has been pushed back
Definition: lexer.h:341
tmbstr tag_class
Definition: lexer.h:243
Bool insertspace
when space is moved after end tag
Definition: lexer.h:342
uint lines
lines seen
Definition: lexer.h:338
int delim
Definition: lexer.h:272
uint versions
bit vector of HTML versions
Definition: lexer.h:346
tmbstr attribute
Definition: lexer.h:273
Bool bad_doctype
e.g.
Definition: lexer.h:349
uint versionEmitted
version of doctype emitted
Definition: lexer.h:348
Bool seenEndHtml
true if a </html> tag has been encountered
Definition: lexer.h:360
Node * inode
for deferring text node
Definition: lexer.h:376
tmbstr value
Definition: lexer.h:258
IStack * insert
for inferring inline tags
Definition: lexer.h:377
uint txtend
end of current node
Definition: lexer.h:351
TagStyle * next
Definition: lexer.h:245
uint end
end of span onto text array
Definition: lexer.h:320
uint column
current column of document
Definition: lexer.h:324
TidyAllocator * allocator
allocator
Definition: lexer.h:385
const Dict * tag
tag's dictionary definition
Definition: lexer.h:296
Bool exiled
true if moved out of table
Definition: lexer.h:344
uint lexsize
used
Definition: lexer.h:373
Bool closed
true if closed by explicit end tag
Definition: lexer.h:326
uint istacklength
allocated
Definition: lexer.h:379
unsigned capacity
Current capacity.
Definition: lexer.h:683
tmbstr properties
Definition: lexer.h:244
Bool excludeBlocks
Netscape compatibility.
Definition: lexer.h:343
Node * prev
Definition: lexer.h:308
Node ** firstNode
Definition: lexer.h:684
Node * token
last token returned by GetToken()
Definition: lexer.h:354
Bool waswhite
used to collapse contiguous white space
Definition: lexer.h:340
StyleProp * next
Definition: lexer.h:259
Node * asp
Definition: lexer.h:270
uint doctype
version as given by doctype (if any)
Definition: lexer.h:347
Bool implicit
true if inferred
Definition: lexer.h:327
tmbstr lexbuf
MB character buffer.
Definition: lexer.h:371
Node * php
Definition: lexer.h:271
Bool isvoyager
true if xmlns attribute on html element (i.e., "Voyager" was the W3C codename for XHTML).
Definition: lexer.h:345
IStack * istack
Definition: lexer.h:378
const Dict * was
old tag when it was changed
Definition: lexer.h:314
Node * itoken
last duplicate inline returned by GetToken()
Definition: lexer.h:355
uint start
start of span onto text array
Definition: lexer.h:319
uint line
current line of document
Definition: lexer.h:323
Bool linebreak
true if followed by a line break
Definition: lexer.h:328
int top
Current top position.
Definition: lexer.h:682
tmbstr name
Definition: lexer.h:257
Node * content
Definition: lexer.h:310
LexerState
Lexer GetToken() states.
Definition: lexer.h:104
GetTokenMode
modes for GetToken()
Definition: lexer.h:397
ParseDocTypeDeclState
ParseDocTypeDecl state constants.
Definition: lexer.h:125
NodeType
node->type is one of these values
Definition: lexer.h:82
@ LEX_PROCINSTR
Definition: lexer.h:111
@ LEX_CDATA
Definition: lexer.h:112
@ LEX_DOCTYPE
Definition: lexer.h:110
@ LEX_CONTENT
Definition: lexer.h:105
@ LEX_STARTTAG
Definition: lexer.h:108
@ LEX_JSTE
Definition: lexer.h:115
@ LEX_SECTION
Definition: lexer.h:113
@ LEX_COMMENT
Definition: lexer.h:109
@ LEX_ASP
Definition: lexer.h:114
@ LEX_ENDTAG
Definition: lexer.h:107
@ LEX_XMLDECL
Definition: lexer.h:117
@ LEX_GT
Definition: lexer.h:106
@ LEX_PHP
Definition: lexer.h:116
@ OtherNamespace
Definition: lexer.h:402
@ Preformatted
Definition: lexer.h:400
@ IgnoreWhitespace
Definition: lexer.h:398
@ CdataContent
Definition: lexer.h:403
@ MixedContent
Definition: lexer.h:399
@ IgnoreMarkup
Definition: lexer.h:401
@ DT_QUOTEDSTRING
Definition: lexer.h:129
@ DT_DOCTYPENAME
Definition: lexer.h:127
@ DT_INTERMEDIATE
Definition: lexer.h:126
@ DT_INTSUBSET
Definition: lexer.h:130
@ DT_PUBLICSYSTEM
Definition: lexer.h:128
@ CommentTag
Definition: lexer.h:85
@ StartEndTag
Definition: lexer.h:90
@ XmlDecl
Definition: lexer.h:96
@ ProcInsTag
Definition: lexer.h:86
@ RootNode
Definition: lexer.h:83
@ SectionTag
Definition: lexer.h:92
@ AspTag
Definition: lexer.h:93
@ StartTag
Definition: lexer.h:88
@ PhpTag
Definition: lexer.h:95
@ CDATATag
Definition: lexer.h:91
@ TextNode
Definition: lexer.h:87
@ JsteTag
Definition: lexer.h:94
@ EndTag
Definition: lexer.h:89
@ DocTypeTag
Definition: lexer.h:84
This typedef represents a stack of addresses to nodes.
Definition: lexer.h:681
Attribute/Value linked list node.
Definition: lexer.h:267
Mosaic handles inlines via a separate stack from other elements We duplicate this to recover from inl...
Definition: lexer.h:294
The following are private to the lexer.
Definition: lexer.h:337
HTML/XHTML/XML Element, Comment, PI, DOCTYPE, XML Decl, etc., etc.
Definition: lexer.h:306
Definition: lexer.h:241
Definition: lexer.h:256
TidyTagId
Known HTML element types.
Definition: tidyenum.h:857
Bool
Definition: tidyplatform.h:662
unsigned int uint
Definition: tidyplatform.h:576
const tmbchar * ctmbstr
Definition: tidyplatform.h:624
tmbchar * tmbstr
Definition: tidyplatform.h:623