HTML Tidy  5.6.0
The HTACG Tidy HTML Project
tags.h
Go to the documentation of this file.
1 #ifndef __TAGS_H__
2 #define __TAGS_H__
3 
4 /**************************************************************************//**
5  * @file
6  * Recognize HTML Tags.
7  *
8  * The HTML tags are stored as 8 bit ASCII strings.
9  * Use lookupw() to find a tag given a wide char string.
10  *
11  * @author HTACG, et al (consult git log)
12  *
13  * @copyright
14  * Copyright (c) 1998-2017 World Wide Web Consortium (Massachusetts
15  * Institute of Technology, European Research Consortium for Informatics
16  * and Mathematics, Keio University) and HTACG.
17  * @par
18  * All Rights Reserved.
19  * @par
20  * See `tidy.h` for the complete license.
21  *
22  * @date Additional updates: consult git log
23  *
24  ******************************************************************************/
25 
26 #include "forward.h"
27 #include "attrdict.h"
28 
29 /** @addtogroup internal_api */
30 /** @{ */
31 
32 
33 /***************************************************************************//**
34  ** @defgroup tags_h HTML Tags
35  **
36  ** This module organizes all of Tidy's HTML tag operations, such as parsing
37  ** tags, defining tags, and user-defined tags.
38  **
39  ** @{
40  ******************************************************************************/
41 
42 
43 /** @name Basic Structures and Tag Operations.
44  ** These structures form the backbone of Tidy tag processing, and the
45  ** functions in this group provide basic operations with tags and nodes.
46  */
47 /** @{ */
48 
49 
50 /** This enumeration defines the types of user-defined tags that can be
51  ** created.
52  */
53 typedef enum
54 {
55  tagtype_null = 0, /**< First item marker. */
56  tagtype_empty = 1, /**< Tag is an empty element. */
57  tagtype_inline = 2, /**< Tag is an inline element. */
58  tagtype_block = 4, /**< Tag is a block level element. */
59  tagtype_pre = 8 /**< Tag is a preformatted tag. */
60 } UserTagType;
61 
62 
63 /** This typedef describes a function to be used to parse HTML of a Tidy tag.
64  */
65 typedef void (Parser)( TidyDocImpl* doc, Node *node, GetTokenMode mode );
66 
67 
68 /** This typedef describes a function to be used to check the attributes
69  ** of a Tidy tag.
70  */
71 typedef void (CheckAttribs)( TidyDocImpl* doc, Node *node );
72 
73 
74 /** Defines a dictionary entry for a single Tidy tag, including all of the
75  ** relevant information that it requires.
76  */
77 struct _Dict
78 {
79  TidyTagId id; /**< Identifier for this tag. */
80  tmbstr name; /**< The tag name. */
81  uint versions; /**< Accumulates potential HTML versions. See TY_(ConstrainVersion). */
82  AttrVersion const * attrvers; /**< Accumulates potential HTML versions for attributes. */
83  uint model; /**< Indicates the relevant content models for the tag. See lexer.h; there is no enum. */
84  Parser* parser; /**< Specifies the parser to use for this tag. */
85  CheckAttribs* chkattrs; /**< Specifies the function to check this tag's attributes. */
86  Dict* next; /**< Link to next tag. */
87 };
88 
89 
90 /** This enum indicates the maximum size of the hash table for tag hash lookup.
91  */
92 enum
93 {
94  ELEMENT_HASH_SIZE=178u /**< Maximum number of tags in the hash table. */
95 };
96 
97 
98 /** This structure provides hash lookup for Tidy tags.
99  */
100 typedef struct _DictHash
101 {
102  Dict const* tag; /**< The current tag. */
103  struct _DictHash* next; /**< The next tag. */
104 } DictHash;
105 
106 
107 /** This structure consists of the lists of all tags known to Tidy.
108  */
109 typedef struct _TidyTagImpl
110 {
111  Dict* xml_tags; /**< Placeholder for all xml tags. */
112  Dict* declared_tag_list; /**< User-declared tags. */
113  DictHash* hashtab[ELEMENT_HASH_SIZE]; /**< All of Tidy's built-in tags. */
114 } TidyTagImpl;
115 
116 
117 /** Coordinates Config update and Tags data.
118  ** @param doc The Tidy document.
119  ** @param opt The option the tag is intended for.
120  ** @param name The name of the new tag.
121  */
122 void TY_(DeclareUserTag)( TidyDocImpl* doc, const TidyOptionImpl* opt, ctmbstr name );
123 
124 
125 /** Interface for finding a tag by TidyTagId.
126  ** @param tid The TidyTagId to search for.
127  ** @returns An instance of a Tidy tag.
128  */
129 const Dict* TY_(LookupTagDef)( TidyTagId tid );
130 
131 /** Assigns the node's tag.
132  ** @param doc The Tidy document.
133  ** @param node The node to assign the tag to.
134  ** @returns Returns a bool indicating whether or not the tag was assigned.
135  */
136 Bool TY_(FindTag)( TidyDocImpl* doc, Node *node );
137 
138 
139 /** Finds the parser function for a given node.
140  ** @param doc The Tidy document.
141  ** @param node The node to lookup.
142  ** @returns The parser for the given node.
143  */
144 Parser* TY_(FindParser)( TidyDocImpl* doc, Node *node );
145 
146 
147 /** Defines a new user-defined tag.
148  ** @param doc The Tidy document.
149  ** @param tagType The type of user-defined tag to define.
150  ** @param name The name of the new tag.
151  */
152 void TY_(DefineTag)( TidyDocImpl* doc, UserTagType tagType, ctmbstr name );
153 
154 
155 /** Frees user-defined tags of the given type, or all user tags if given
156  ** `tagtype_null`.
157  ** @param doc The Tidy document.
158  ** @param tagType The type of tag to free, or `tagtype_null` to free all
159  ** user-defined tags.
160  */
161 void TY_(FreeDeclaredTags)( TidyDocImpl* doc, UserTagType tagType );
162 
163 
164 /** Initiates an iterator for a list of user-declared tags, including autonomous
165  ** custom tags detected in the document if @ref TidyUseCustomTags is not set to
166  ** **no**.
167  ** @param doc An instance of a TidyDocImpl to query.
168  ** @result Returns a TidyIterator, which is a token used to represent the
169  ** current position in a list within LibTidy.
170  */
171 TidyIterator TY_(GetDeclaredTagList)( TidyDocImpl* doc );
172 
173 
174 /** Given a valid TidyIterator initiated with TY_(GetDeclaredTagList)(),
175  ** returns a string representing a user-declared or autonomous custom tag.
176  ** @remark Specifying tagType limits the scope of the tags to one of
177  ** @ref UserTagType types. Note that autonomous custom tags (if used)
178  ** are added to one of these option types, depending on the value of
179  ** @ref TidyUseCustomTags.
180  ** @param doc The Tidy document.
181  ** @param tagType The type of tag to iterate through.
182  ** @param iter The iterator token provided initially by
183  ** TY_(GetDeclaredTagList)().
184  ** @result A string containing the next tag.
185  */
186 ctmbstr TY_(GetNextDeclaredTag)( TidyDocImpl* doc, UserTagType tagType,
187  TidyIterator* iter );
188 
189 
190 /** Initializes tags and tag structures for the given Tidy document.
191  ** @param doc The Tidy document.
192  */
193 void TY_(InitTags)( TidyDocImpl* doc );
194 
195 
196 /** Frees the tags and structures used by Tidy for tags.
197  ** @param doc The Tidy document.
198  */
199 void TY_(FreeTags)( TidyDocImpl* doc );
200 
201 
202 /** Tidy defaults to HTML5 mode. If the <!DOCTYPE ...> is found to NOT be
203  ** HTML5, then adjust the tags table to HTML4 mode.
204  ** @param doc The Tidy document.
205  */
206 void TY_(AdjustTags)( TidyDocImpl *doc );
207 
208 
209 /** Reset the tags table back to default HTML5 mode.
210  ** @param doc The Tidy document.
211  */
212 void TY_(ResetTags)( TidyDocImpl *doc );
213 
214 
215 /** Indicates whether or not Tidy is processing in HTML5 mode.
216  ** @param doc The Tidy document.
217  ** @returns Returns `yes` if processing in HTML5 mode.
218  */
219 Bool TY_(IsHTML5Mode)( TidyDocImpl *doc );
220 
221 
222 /** @} */
223 /** @name Parser Methods And Attribute Checker Functions for Tags
224  ** These functions define the parsers and attribute checking functions for
225  ** each of Tidy's tags.
226  */
227 /** @{ */
228 
229 
230 Parser TY_(ParseHTML);
231 Parser TY_(ParseHead);
232 Parser TY_(ParseTitle);
233 Parser TY_(ParseScript);
234 Parser TY_(ParseFrameSet);
235 Parser TY_(ParseNoFrames);
236 Parser TY_(ParseBody);
237 Parser TY_(ParsePre);
238 Parser TY_(ParseList);
239 Parser TY_(ParseDefList);
240 Parser TY_(ParseBlock);
241 Parser TY_(ParseInline);
242 Parser TY_(ParseEmpty);
243 Parser TY_(ParseTableTag);
244 Parser TY_(ParseColGroup);
245 Parser TY_(ParseRowGroup);
246 Parser TY_(ParseRow);
247 Parser TY_(ParseSelect);
248 Parser TY_(ParseOptGroup);
249 Parser TY_(ParseText);
250 Parser TY_(ParseDatalist);
251 Parser TY_(ParseNamespace);
252 
253 CheckAttribs TY_(CheckAttributes);
254 
255 
256 /** @} */
257 /** @name Other Tag and Node Lookup Functions
258  ** These functions perform additional lookup on tags and nodes.
259  */
260 /** @{ */
261 
262 
263 /** Gets the TidyTagId of the given node. 0 == TidyTag_UNKNOWN.
264  */
265 #define TagId(node) ((node) && (node)->tag ? (node)->tag->id : TidyTag_UNKNOWN)
266 
267 
268 /** Determines if the given node is of the given tag id type.
269  */
270 #define TagIsId(node, tid) ((node) && (node)->tag && (node)->tag->id == tid)
271 
272 
273 /** Inquires whether or not the given node is a text node.
274  ** @param node The node being interrogated.
275  ** @returns The status of the inquiry.
276  */
277 Bool TY_(nodeIsText)( Node* node );
278 
279 
280 /** Inquires whether or not the given node is an element node.
281  ** @param node The node being interrogated.
282  ** @returns The status of the inquiry.
283  */
284 Bool TY_(nodeIsElement)( Node* node );
285 
286 
287 /** Inquires whether or not the given node has any text.
288  ** @param doc The Tidy document.
289  ** @param node The node being interrogated.
290  ** @returns The status of the inquiry.
291  */
292 Bool TY_(nodeHasText)( TidyDocImpl* doc, Node* node );
293 
294 
295 /** Inquires whether the given element looks like it's an autonomous custom
296  ** element tag.
297  ** @param element A string to be checked.
298  ** @returns The status of the inquiry.
299  */
300 Bool TY_(elementIsAutonomousCustomFormat)( ctmbstr element );
301 
302 
303 /** Inquires whether the given node looks like it's an autonomous custom
304  ** element tag.
305  ** @param node The node being interrogated.
306  ** @returns The status of the inquiry.
307  */
308 Bool TY_(nodeIsAutonomousCustomFormat)( Node* node );
309 
310 
311 /** True if the node looks like it's an autonomous custom element tag, and
312  ** TidyCustomTags is not disabled, and we're in HTML5 mode, which are all
313  ** requirements for valid autonomous custom tags.
314  ** @param doc The Tidy document.
315  ** @param node The node being interrogated.
316  ** @returns The status of the inquiry.
317  */
318 Bool TY_(nodeIsAutonomousCustomTag)( TidyDocImpl* doc, Node* node );
319 
320 
321 /** Does the node have the indicated content model? True if any of the bits
322  ** requested are set.
323  ** @param node The node being interrogated.
324  ** @param contentModel The content model to check against.
325  ** @returns The status of the inquiry.
326  */
327 Bool TY_(nodeHasCM)( Node* node, uint contentModel );
328 
329 
330 /** Does the content model of the node include block?
331  ** @param node The node being interrogated.
332  ** @returns The status of the inquiry.
333  */
334 Bool TY_(nodeCMIsBlock)( Node* node );
335 
336 
337 /** Does the content model of the node include inline?
338  ** @param node The node being interrogated.
339  ** @returns The status of the inquiry.
340  */
341 Bool TY_(nodeCMIsInline)( Node* node );
342 
343 
344 /** Does the content model of the node include empty?
345  ** @param node The node being interrogated.
346  ** @returns The status of the inquiry.
347  */
348 Bool TY_(nodeCMIsEmpty)( Node* node );
349 
350 
351 /** Is the node a header, such as H1, H2, ..., H6?
352  ** @param node The node being interrogated.
353  ** @returns The status of the inquiry.
354  */
355 Bool TY_(nodeIsHeader)( Node* node );
356 
357 
358 /** Inquires as to the header level of the given node: 1, 2, ..., 6.
359  ** @param node The node being interrogated.
360  ** @returns The header level.
361  */
362 uint TY_(nodeHeaderLevel)( Node* node );
363 
364 
365 #define nodeIsHTML( node ) TagIsId( node, TidyTag_HTML )
366 #define nodeIsHEAD( node ) TagIsId( node, TidyTag_HEAD )
367 #define nodeIsTITLE( node ) TagIsId( node, TidyTag_TITLE )
368 #define nodeIsBASE( node ) TagIsId( node, TidyTag_BASE )
369 #define nodeIsMETA( node ) TagIsId( node, TidyTag_META )
370 #define nodeIsBODY( node ) TagIsId( node, TidyTag_BODY )
371 #define nodeIsFRAMESET( node ) TagIsId( node, TidyTag_FRAMESET )
372 #define nodeIsFRAME( node ) TagIsId( node, TidyTag_FRAME )
373 #define nodeIsIFRAME( node ) TagIsId( node, TidyTag_IFRAME )
374 #define nodeIsNOFRAMES( node ) TagIsId( node, TidyTag_NOFRAMES )
375 #define nodeIsHR( node ) TagIsId( node, TidyTag_HR )
376 #define nodeIsH1( node ) TagIsId( node, TidyTag_H1 )
377 #define nodeIsH2( node ) TagIsId( node, TidyTag_H2 )
378 #define nodeIsPRE( node ) TagIsId( node, TidyTag_PRE )
379 #define nodeIsLISTING( node ) TagIsId( node, TidyTag_LISTING )
380 #define nodeIsP( node ) TagIsId( node, TidyTag_P )
381 #define nodeIsUL( node ) TagIsId( node, TidyTag_UL )
382 #define nodeIsOL( node ) TagIsId( node, TidyTag_OL )
383 #define nodeIsDL( node ) TagIsId( node, TidyTag_DL )
384 #define nodeIsDIR( node ) TagIsId( node, TidyTag_DIR )
385 #define nodeIsLI( node ) TagIsId( node, TidyTag_LI )
386 #define nodeIsDT( node ) TagIsId( node, TidyTag_DT )
387 #define nodeIsDD( node ) TagIsId( node, TidyTag_DD )
388 #define nodeIsTABLE( node ) TagIsId( node, TidyTag_TABLE )
389 #define nodeIsCAPTION( node ) TagIsId( node, TidyTag_CAPTION )
390 #define nodeIsTD( node ) TagIsId( node, TidyTag_TD )
391 #define nodeIsTH( node ) TagIsId( node, TidyTag_TH )
392 #define nodeIsTR( node ) TagIsId( node, TidyTag_TR )
393 #define nodeIsCOL( node ) TagIsId( node, TidyTag_COL )
394 #define nodeIsCOLGROUP( node ) TagIsId( node, TidyTag_COLGROUP )
395 #define nodeIsBR( node ) TagIsId( node, TidyTag_BR )
396 #define nodeIsA( node ) TagIsId( node, TidyTag_A )
397 #define nodeIsLINK( node ) TagIsId( node, TidyTag_LINK )
398 #define nodeIsB( node ) TagIsId( node, TidyTag_B )
399 #define nodeIsI( node ) TagIsId( node, TidyTag_I )
400 #define nodeIsSTRONG( node ) TagIsId( node, TidyTag_STRONG )
401 #define nodeIsEM( node ) TagIsId( node, TidyTag_EM )
402 #define nodeIsBIG( node ) TagIsId( node, TidyTag_BIG )
403 #define nodeIsSMALL( node ) TagIsId( node, TidyTag_SMALL )
404 #define nodeIsPARAM( node ) TagIsId( node, TidyTag_PARAM )
405 #define nodeIsOPTION( node ) TagIsId( node, TidyTag_OPTION )
406 #define nodeIsOPTGROUP( node ) TagIsId( node, TidyTag_OPTGROUP )
407 #define nodeIsIMG( node ) TagIsId( node, TidyTag_IMG )
408 #define nodeIsMAP( node ) TagIsId( node, TidyTag_MAP )
409 #define nodeIsAREA( node ) TagIsId( node, TidyTag_AREA )
410 #define nodeIsNOBR( node ) TagIsId( node, TidyTag_NOBR )
411 #define nodeIsWBR( node ) TagIsId( node, TidyTag_WBR )
412 #define nodeIsFONT( node ) TagIsId( node, TidyTag_FONT )
413 #define nodeIsLAYER( node ) TagIsId( node, TidyTag_LAYER )
414 #define nodeIsSPACER( node ) TagIsId( node, TidyTag_SPACER )
415 #define nodeIsCENTER( node ) TagIsId( node, TidyTag_CENTER )
416 #define nodeIsSTYLE( node ) TagIsId( node, TidyTag_STYLE )
417 #define nodeIsSCRIPT( node ) TagIsId( node, TidyTag_SCRIPT )
418 #define nodeIsNOSCRIPT( node ) TagIsId( node, TidyTag_NOSCRIPT )
419 #define nodeIsFORM( node ) TagIsId( node, TidyTag_FORM )
420 #define nodeIsTEXTAREA( node ) TagIsId( node, TidyTag_TEXTAREA )
421 #define nodeIsBLOCKQUOTE( node ) TagIsId( node, TidyTag_BLOCKQUOTE )
422 #define nodeIsAPPLET( node ) TagIsId( node, TidyTag_APPLET )
423 #define nodeIsOBJECT( node ) TagIsId( node, TidyTag_OBJECT )
424 #define nodeIsDIV( node ) TagIsId( node, TidyTag_DIV )
425 #define nodeIsSPAN( node ) TagIsId( node, TidyTag_SPAN )
426 #define nodeIsINPUT( node ) TagIsId( node, TidyTag_INPUT )
427 #define nodeIsQ( node ) TagIsId( node, TidyTag_Q )
428 #define nodeIsLABEL( node ) TagIsId( node, TidyTag_LABEL )
429 #define nodeIsH3( node ) TagIsId( node, TidyTag_H3 )
430 #define nodeIsH4( node ) TagIsId( node, TidyTag_H4 )
431 #define nodeIsH5( node ) TagIsId( node, TidyTag_H5 )
432 #define nodeIsH6( node ) TagIsId( node, TidyTag_H6 )
433 #define nodeIsADDRESS( node ) TagIsId( node, TidyTag_ADDRESS )
434 #define nodeIsXMP( node ) TagIsId( node, TidyTag_XMP )
435 #define nodeIsSELECT( node ) TagIsId( node, TidyTag_SELECT )
436 #define nodeIsBLINK( node ) TagIsId( node, TidyTag_BLINK )
437 #define nodeIsMARQUEE( node ) TagIsId( node, TidyTag_MARQUEE )
438 #define nodeIsEMBED( node ) TagIsId( node, TidyTag_EMBED )
439 #define nodeIsBASEFONT( node ) TagIsId( node, TidyTag_BASEFONT )
440 #define nodeIsISINDEX( node ) TagIsId( node, TidyTag_ISINDEX )
441 #define nodeIsS( node ) TagIsId( node, TidyTag_S )
442 #define nodeIsSTRIKE( node ) TagIsId( node, TidyTag_STRIKE )
443 #define nodeIsSUB( node ) TagIsId( node, TidyTag_SUB )
444 #define nodeIsSUP( node ) TagIsId( node, TidyTag_SUP )
445 #define nodeIsU( node ) TagIsId( node, TidyTag_U )
446 #define nodeIsMENU( node ) TagIsId( node, TidyTag_MENU )
447 #define nodeIsMAIN( node ) TagIsId( node, TidyTag_MAIN )
448 #define nodeIsBUTTON( node ) TagIsId( node, TidyTag_BUTTON )
449 #define nodeIsCANVAS( node ) TagIsId( node, TidyTag_CANVAS )
450 #define nodeIsPROGRESS( node ) TagIsId( node, TidyTag_PROGRESS )
451 
452 #define nodeIsINS( node ) TagIsId( node, TidyTag_INS )
453 #define nodeIsDEL( node ) TagIsId( node, TidyTag_DEL )
454 
455 /* HTML5 */
456 #define nodeIsDATALIST( node ) TagIsId( node, TidyTag_DATALIST )
457 #define nodeIsMATHML( node ) TagIsId( node, TidyTag_MATHML ) /* #130 MathML attr and entity fix! */
458 
459 /* NOT in HTML 5 */
460 #define nodeIsACRONYM( node ) TagIsId( node, TidyTag_ACRONYM )
461 #define nodesIsFRAME( node ) TagIsId( node, TidyTag_FRAME )
462 #define nodeIsTT( node ) TagIsId( node, TidyTag_TT )
463 
464 
465 /** @} name */
466 /** @} tags_h group */
467 /** @} internal_api addtogroup */
468 
469 
470 #endif /* __TAGS_H__ */
This structure provides hash lookup for Tidy tags.
Definition: tags.h:100
Dict * declared_tag_list
User-declared tags.
Definition: tags.h:112
Dict * next
Link to next tag.
Definition: tags.h:86
tmbstr name
The tag name.
Definition: tags.h:80
Defines a dictionary entry for a single Tidy tag, including all of the relevant information that it r...
Definition: tags.h:77
Definition: attrdict.h:13
const tmbchar * ctmbstr
Definition: tidyplatform.h:594
uint versions
Accumulates potential HTML versions.
Definition: tags.h:81
Dict * xml_tags
Placeholder for all xml tags.
Definition: tags.h:111
struct _DictHash * next
The next tag.
Definition: tags.h:103
Tag is an empty element.
Definition: tags.h:56
AttrVersion const * attrvers
Accumulates potential HTML versions for attributes.
Definition: tags.h:82
TidyTagId
Known HTML element types.
Definition: tidyenum.h:852
First item marker.
Definition: tags.h:55
Bool
Definition: tidyplatform.h:631
void( CheckAttribs)(TidyDocImpl *doc, Node *node)
This typedef describes a function to be used to check the attributes of a Tidy tag.
Definition: tags.h:71
Maximum number of tags in the hash table.
Definition: tags.h:94
Dict const * tag
The current tag.
Definition: tags.h:102
Parser * parser
Specifies the parser to use for this tag.
Definition: tags.h:84
Tag is a preformatted tag.
Definition: tags.h:59
tmbchar * tmbstr
Definition: tidyplatform.h:593
unsigned int uint
Definition: tidyplatform.h:554
uint model
Indicates the relevant content models for the tag.
Definition: tags.h:83
This structure consists of the lists of all tags known to Tidy.
Definition: tags.h:109
UserTagType
This enumeration defines the types of user-defined tags that can be created.
Definition: tags.h:53
Tag is an inline element.
Definition: tags.h:57
void( Parser)(TidyDocImpl *doc, Node *node, GetTokenMode mode)
This typedef describes a function to be used to parse HTML of a Tidy tag.
Definition: tags.h:65
#define TY_(str)
Definition: forward.h:23
Tag is a block level element.
Definition: tags.h:58
CheckAttribs * chkattrs
Specifies the function to check this tag's attributes.
Definition: tags.h:85
TidyTagId id
Identifier for this tag.
Definition: tags.h:79
GetTokenMode
Definition: lexer.h:507