HTML Tidy  5.9.15
The HTACG Tidy HTML Project
parser.h
Go to the documentation of this file.
1 #ifndef __PARSER_H__
2 #define __PARSER_H__
3 
4 /**************************************************************************//**
5  * @file
6  * HTML and XML Parsers.
7  *
8  * Tidy's HTML parser corrects many conditions and enforces certain user
9  * preferences during the parsing process. The XML parser produces a tree
10  * of nodes useful to Tidy but also suitable for use in other XML processing
11  * applications.
12  *
13  * @author HTACG, et al (consult git log)
14  *
15  * @copyright
16  * Copyright (c) 1998-2017 World Wide Web Consortium (Massachusetts
17  * Institute of Technology, European Research Consortium for Informatics
18  * and Mathematics, Keio University) and HTACG.
19  * @par
20  * All Rights Reserved.
21  * @par
22  * See `tidy.h` for the complete license.
23  *
24  * @date Additional updates: consult git log
25  *
26  ******************************************************************************/
27 
28 #include "forward.h"
29 
30 /** @addtogroup internal_api */
31 /** @{ */
32 
33 
34 /***************************************************************************//**
35  ** @defgroup parser_h HTML and XML Parsing
36  **
37  ** These functions and structures form the internal API for document
38  ** parsing.
39  **
40  ** @{
41  ******************************************************************************/
42 
43 
44 /**
45  * This typedef represents the state of a parser when it enters and exits.
46  * When the parser needs to finish work on the way back up the stack, it will
47  * push one of these records to the stack, and it will pop a record from the
48  * stack upon re-entry.
49  */
50 typedef struct _TidyParserMemory
51 {
52  Parser *identity; /**< Which parser pushed this record? */
53  Node *original_node; /**< Originally provided node at entry. */
54  Node *reentry_node; /**< The node with which to re-enter. */
55  GetTokenMode reentry_mode; /**< The token mode to use when re-entering. */
56  int reentry_state; /**< State to set during re-entry. Defined locally in each parser. */
57  GetTokenMode mode; /**< The caller will peek at this value to get the correct mode. */
58  int register_1; /**< Local variable storage. */
59  int register_2; /**< Local variable storage. */
60 } TidyParserMemory;
61 
62 
63 /**
64  * This typedef represents a stack of TidyParserMemory records. The Tidy
65  * document has its own instance of this.
66  */
67 typedef struct _TidyParserStack
68 {
69  TidyParserMemory* content; /**< Pointer to the state record(s) held by the stack. */
70  uint size; /**< Current size of the stack. */
71  int top; /**< Top of the stack. */
72 } TidyParserStack;
73 
74 
75 /**
76  * Allocates and initializes the parser's stack. TidyCreate will perform
77  * this automatically.
    * @param doc The Tidy document whose parser stack is to be set up.
    * @note (review) Unlike the later declarations in this header, this
    *       prototype is not marked TY_PRIVATE; confirm whether that is
    *       intentional.
78  */
79 void TY_(InitParserStack)( TidyDocImpl* doc );
80 
81 
82 /**
83  * Frees the parser's stack when done. TidyRelease will perform this
84  * automatically.
    * @param doc The Tidy document whose parser stack is to be freed.
85  */
86 void TY_(FreeParserStack)( TidyDocImpl* doc );
87 
88 
89 /**
90  * Indicates whether or not the stack is empty.
    * @param doc The Tidy document whose parser stack is examined.
    * @returns Returns a Bool indicating whether the stack is empty.
91  */
92 Bool TY_(isEmptyParserStack)( TidyDocImpl* doc );
93 
94 
95 /**
96  * Peek at the parser memory.
    * @param doc The Tidy document whose parser stack is examined.
    * @returns Returns a TidyParserMemory record by value.
    * @note (review) Behaviour when the stack is empty is not visible from
    *       this header; confirm against the implementation.
97  */
98 TidyParserMemory TY_(peekMemory)( TidyDocImpl* doc );
99 
100 
101 /**
102  * Peek at the parser memory "identity" field. This is just a convenience
103  * to avoid having to create a new struct instance in the caller.
     * @param doc The Tidy document whose parser stack is examined.
     * @returns Returns the Parser pointer held in the "identity" field.
104  */
105 Parser* TY_(peekMemoryIdentity)( TidyDocImpl* doc );
106 
107 
108 /**
109  * Peek at the parser memory "mode" field. This is just a convenience
110  * to avoid having to create a new struct instance in the caller.
     * @param doc The Tidy document whose parser stack is examined.
     * @returns Returns the GetTokenMode held in the "mode" field.
111  */
112 GetTokenMode TY_(peekMemoryMode)( TidyDocImpl* doc );
113 
114 
115 /**
116  * Pop out a parser memory.
     * @param doc The Tidy document whose parser stack is popped.
     * @returns Returns a TidyParserMemory record by value.
     * @note (review) Behaviour when the stack is empty is not visible from
     *       this header; confirm against the implementation.
117  */
118 TidyParserMemory TY_(popMemory)( TidyDocImpl* doc );
119 
120 
121 /**
122  * Push the parser memory to the stack.
     * @param doc The Tidy document whose parser stack receives the record.
     * @param data The parser memory record to push; it is passed by value.
123  */
124 void TY_(pushMemory)( TidyDocImpl* doc, TidyParserMemory data );
125 
126 
127 /**
128  * Is used to perform a node integrity check recursively after parsing
129  * an HTML or XML document.
130  * @note Actual performance of this check can be disabled by defining the
131  * macro NO_NODE_INTEGRITY_CHECK.
132  * @param node The root node for the integrity check.
133  * @returns Returns yes or no indicating integrity of the node structure.
134  */
135 TY_PRIVATE Bool TY_(CheckNodeIntegrity)(Node *node);
136 
137 
138 /**
139  * Indicates whether or not a text node ends with a space or newline.
140  * @note Implementation of this method is found in `pprint.c` for
141  * some reason.
142  * @param lexer A reference to the lexer used to lex the document.
143  * @param node The node to check.
144  * @returns The result of the check.
145  */
146 TY_PRIVATE Bool TY_(TextNodeEndWithSpace)( Lexer *lexer, Node *node );
147 
148 
149 /**
150  * Used to check if a node uses CM_NEW, which determines how attributes
151  * without values should be printed. This was introduced to deal with
152  * user-defined tags e.g. ColdFusion.
153  * @param node The node to check.
154  * @returns The result of the check.
155  */
156 TY_PRIVATE Bool TY_(IsNewNode)(Node *node);
157 
158 
159 /**
160  * Transforms a given node to another element, for example, from a `p`
161  * to a `br`.
162  * @param doc The document which the node belongs to.
163  * @param node The node to coerce.
164  * @param tid The tag type to coerce the node into.
165  * @param obsolete If the old node was obsolete, a report will be generated.
166  * @param expected If the old node was not expected to be found in this
167  * particular location, a report will be generated.
168  */
169 TY_PRIVATE void TY_(CoerceNode)(TidyDocImpl* doc, Node *node, TidyTagId tid, Bool obsolete, Bool expected);
170 
171 
172 /**
173  * Extract a node and its children from a markup tree.
174  * @param node The node to remove.
175  * @returns Returns the removed node.
176  */
177 TY_PRIVATE Node *TY_(RemoveNode)(Node *node);
178 
179 
180 /**
181  * Remove node from markup tree and discard it.
182  * @param doc The Tidy document from which to discard the node.
183  * @param element The node to discard.
184  * @returns Returns the next node.
185  */
186 TY_PRIVATE Node *TY_(DiscardElement)( TidyDocImpl* doc, Node *element);
187 
188 
189 /**
190  * Insert node into markup tree as the first element of content of element.
191  * @param element The new destination node.
192  * @param node The node to insert.
193  */
194 TY_PRIVATE void TY_(InsertNodeAtStart)(Node *element, Node *node);
195 
196 
197 /**
198  * Insert node into markup tree as the last element of content of element.
199  * @param element The new destination node.
200  * @param node The node to insert.
201  */
202 TY_PRIVATE void TY_(InsertNodeAtEnd)(Node *element, Node *node);
203 
204 
205 /**
206  * Insert node into markup tree before element.
207  * @param element The node before which the node is inserted.
208  * @param node The node to insert.
209  */
210 TY_PRIVATE void TY_(InsertNodeBeforeElement)(Node *element, Node *node);
211 
212 
213 /**
214  * Insert node into markup tree after element.
215  * @param element The node after which the node is inserted.
216  * @param node The node to insert.
217  */
218 TY_PRIVATE void TY_(InsertNodeAfterElement)(Node *element, Node *node);
219 
220 
221 /**
222  * Trims a single, empty element, returning the next node.
223  * @param doc The Tidy document.
224  * @param element The element to trim.
225  * @returns Returns the next node.
226  */
227 TY_PRIVATE Node *TY_(TrimEmptyElement)( TidyDocImpl* doc, Node *element );
228 
229 
230 /**
231  * Trims a tree of empty elements recursively, returning the next node.
232  * @param doc The Tidy document.
233  * @param node The element to trim.
234  * @returns Returns the next node.
235  */
236 TY_PRIVATE Node* TY_(DropEmptyElements)(TidyDocImpl* doc, Node* node);
237 
238 
239 /**
240  * Indicates whether or not a text node is blank, meaning that it consists
241  * of nothing, or a single space.
242  * @param lexer The lexer used to lex the document.
243  * @param node The node to test.
244  * @returns Returns the result of the test.
245  */
246 TY_PRIVATE Bool TY_(IsBlank)(Lexer *lexer, Node *node);
247 
248 
249 /**
250  * Indicates whether or not a node is declared as containing javascript
251  * code.
252  * @param node The node to test.
253  * @returns Returns the result of the test.
254  */
255 TY_PRIVATE Bool TY_(IsJavaScript)(Node *node);
256 
257 
258 /**
259  * Parses a document after lexing using the HTML parser. It begins by properly
260  * configuring the overall HTML structure, and subsequently processes all
261  * remaining nodes. HTML is the root node.
262  * @param doc The Tidy document.
263  */
264 TY_PRIVATE void TY_(ParseDocument)( TidyDocImpl* doc );
265 
266 
267 /**
268  * Indicates whether or not whitespace is to be preserved in XHTML/XML
269  * documents.
270  * @param doc The Tidy document.
271  * @param element The node to test.
272  * @returns Returns the result of the test.
273  */
274 TY_PRIVATE Bool TY_(XMLPreserveWhiteSpace)( TidyDocImpl* doc, Node *element );
275 
276 
277 /**
278  * Parses a document after lexing using the XML parser.
279  * @param doc The Tidy document.
280  */
281 TY_PRIVATE void TY_(ParseXMLDocument)( TidyDocImpl* doc );
282 
283 
284 /** @} end parser_h group */
285 /** @} end internal_api group */
286 
287 #endif /* __PARSER_H__ */
#define TY_PRIVATE
Definition: forward.h:29
#define TY_(str)
Definition: forward.h:23
GetTokenMode
modes for GetToken()
Definition: lexer.h:397
Node * reentry_node
The node with which to re-enter.
Definition: parser.h:54
GetTokenMode reentry_mode
The token mode to use when re-entering.
Definition: parser.h:55
int reentry_state
State to set during re-entry.
Definition: parser.h:56
uint size
Current size of the stack.
Definition: parser.h:70
int register_1
Local variable storage.
Definition: parser.h:58
int register_2
Local variable storage.
Definition: parser.h:59
GetTokenMode mode
The caller will peek at this value to get the correct mode.
Definition: parser.h:57
Parser * identity
Which parser pushed this record?
Definition: parser.h:52
TidyParserMemory * content
A state record.
Definition: parser.h:69
Node * original_node
Originally provided node at entry.
Definition: parser.h:53
int top
Top of the stack.
Definition: parser.h:71
This typedef represents the state of a parser when it enters and exits.
Definition: parser.h:51
This typedef represents a stack of parserState.
Definition: parser.h:68
TidyTagId
Known HTML element types.
Definition: tidyenum.h:857
Node *() Parser(TidyDocImpl *doc, Node *node, GetTokenMode mode)
This typedef describes a function to be used to parse HTML of a Tidy tag.
Definition: tags.h:70
Bool
Definition: tidyplatform.h:662
unsigned int uint
Definition: tidyplatform.h:576