HTMLparser.h

Go to the documentation of this file.
00001 /*
00002  * Summary: interface for an HTML 4.0 non-verifying parser
00003  * Description: this module implements an HTML 4.0 non-verifying parser
00004  *              with API compatible with the XML parser ones. It should
00005  *              be able to parse "real world" HTML, even if severely
00006  *              broken from a specification point of view.
00007  *
00008  * Copy: See Copyright for the status of this software.
00009  *
00010  * Author: Daniel Veillard
00011  */
00012 
00013 #ifndef __HTML_PARSER_H__
00014 #define __HTML_PARSER_H__
00015 #include <libxml/xmlversion.h>
00016 #include <libxml/parser.h>
00017 
00018 #ifdef LIBXML_HTML_ENABLED
00019 
00020 #ifdef __cplusplus
00021 extern "C" {
00022 #endif
00023 
00024 /*
00025  * Most of the back-end structures from XML and HTML are shared.
00026  */
00027 typedef xmlParserCtxt htmlParserCtxt;
00028 typedef xmlParserCtxtPtr htmlParserCtxtPtr;
00029 typedef xmlParserNodeInfo htmlParserNodeInfo;
00030 typedef xmlSAXHandler htmlSAXHandler;
00031 typedef xmlSAXHandlerPtr htmlSAXHandlerPtr;
00032 typedef xmlParserInput htmlParserInput;
00033 typedef xmlParserInputPtr htmlParserInputPtr;
00034 typedef xmlDocPtr htmlDocPtr;
00035 typedef xmlNodePtr htmlNodePtr;
00036 
00037 /*
00038  * Internal description of an HTML element, representing HTML 4.01
00039  * and XHTML 1.0 (which share the same structure).
00040  */
00041 typedef struct _htmlElemDesc htmlElemDesc;      /* descriptor of one HTML element */
00042 typedef htmlElemDesc *htmlElemDescPtr;          /* pointer alias for htmlElemDesc */
00043 struct _htmlElemDesc {
00044     const char *name;   /* the tag name */
00045     char startTag;      /* whether the start tag can be implied */
00046     char endTag;        /* whether the end tag can be implied */
00047     char saveEndTag;    /* whether the end tag should be saved */
00048     char empty;         /* whether this is an empty element */
00049     char depr;          /* whether this is a deprecated element */
00050     char dtd;           /* 1: only in the Loose DTD, 2: only in the Frameset DTD */
00051     char isinline;      /* 0: block element, 1: inline element */
00052     const char *desc;   /* the description of the element */
00053 
00054 /* NRK Jan.2003
00055  * New fields encapsulating HTML structure
00056  *
00057  * Bugs:
00058  *  This is a very limited representation.  It fails to tell us when
00059  *  an element *requires* subelements (we only have whether they're
00060  *  allowed or not), and it doesn't tell us where CDATA and PCDATA
00061  *  are allowed.  Some element relationships are not fully represented:
00062  *  these are flagged with the word MODIFIER
00063  */
00064     const char** subelts;       /* allowed sub-elements of this element; NOTE(review): presumably a NULL-terminated list of names -- confirm */
00065     const char* defaultsubelt;  /* sub-element to suggest for auto-repair
00066                        if necessary, or NULL */
00067     const char** attrs_opt;     /* optional attributes */
00068     const char** attrs_depr;        /* additional deprecated attributes */
00069     const char** attrs_req;     /* required attributes */
00070 };
00071 
00072 /*
00073  * Internal description of an HTML entity.
00074  */
00075 typedef struct _htmlEntityDesc htmlEntityDesc;  /* descriptor of one HTML entity */
00076 typedef htmlEntityDesc *htmlEntityDescPtr;      /* pointer alias for htmlEntityDesc */
00077 struct _htmlEntityDesc {
00078     unsigned int value; /* the Unicode value of the character */
00079     const char *name;   /* the entity name */
00080     const char *desc;   /* the description of the entity */
00081 };
00082 
00083 /*
00084  * There are only a few public functions.
00085  */
/* Descriptor lookups by tag name, entity name, or numeric entity value.
 * Each returns a pointer to a const descriptor.
 * NOTE(review): presumably NULL when nothing matches -- confirm. */
00086 XMLPUBFUN const htmlElemDesc * XMLCALL
00087             htmlTagLookup   (const xmlChar *tag);
00088 XMLPUBFUN const htmlEntityDesc * XMLCALL
00089             htmlEntityLookup(const xmlChar *name);
00090 XMLPUBFUN const htmlEntityDesc * XMLCALL
00091             htmlEntityValueLookup(unsigned int value);
00092 
/* Auto-close helpers; both take a document and an element node. */
00093 XMLPUBFUN int XMLCALL
00094             htmlIsAutoClosed(htmlDocPtr doc,
00095                      htmlNodePtr elem);
00096 XMLPUBFUN int XMLCALL
00097             htmlAutoCloseTag(htmlDocPtr doc,
00098                      const xmlChar *name,
00099                      htmlNodePtr elem);
/* Parsing entry points that operate on a caller-supplied parser context. */
00100 XMLPUBFUN const htmlEntityDesc * XMLCALL
00101             htmlParseEntityRef(htmlParserCtxtPtr ctxt,
00102                      const xmlChar **str);
00103 XMLPUBFUN int XMLCALL
00104             htmlParseCharRef(htmlParserCtxtPtr ctxt);
00105 XMLPUBFUN void XMLCALL
00106             htmlParseElement(htmlParserCtxtPtr ctxt);
00107 
/* Parser-context constructors: one without arguments, one taking a
 * buffer and its size. */
00108 XMLPUBFUN htmlParserCtxtPtr XMLCALL
00109             htmlNewParserCtxt(void);
00110 
00111 XMLPUBFUN htmlParserCtxtPtr XMLCALL
00112             htmlCreateMemoryParserCtxt(const char *buffer,
00113                            int size);
00114 
00115 XMLPUBFUN int XMLCALL
00116             htmlParseDocument(htmlParserCtxtPtr ctxt);
/* Whole-document parsers returning an htmlDocPtr.  The htmlSAXParse
 * variants take the same source and encoding arguments as their
 * htmlParse counterparts, plus a SAX handler and a userData pointer. */
00117 XMLPUBFUN htmlDocPtr XMLCALL
00118             htmlSAXParseDoc (xmlChar *cur,
00119                      const char *encoding,
00120                      htmlSAXHandlerPtr sax,
00121                      void *userData);
00122 XMLPUBFUN htmlDocPtr XMLCALL
00123             htmlParseDoc    (xmlChar *cur,
00124                      const char *encoding);
00125 XMLPUBFUN htmlDocPtr XMLCALL
00126             htmlSAXParseFile(const char *filename,
00127                      const char *encoding,
00128                      htmlSAXHandlerPtr sax,
00129                      void *userData);
00130 XMLPUBFUN htmlDocPtr XMLCALL
00131             htmlParseFile   (const char *filename,
00132                      const char *encoding);
/* Byte-buffer converters.  outlen and inlen are passed by pointer;
 * NOTE(review): presumably updated with the number of bytes written and
 * consumed -- confirm. */
00133 XMLPUBFUN int XMLCALL
00134             UTF8ToHtml  (unsigned char *out,
00135                      int *outlen,
00136                      const unsigned char *in,
00137                      int *inlen);
00138 XMLPUBFUN int XMLCALL
00139             htmlEncodeEntities(unsigned char *out,
00140                      int *outlen,
00141                      const unsigned char *in,
00142                      int *inlen, int quoteChar);
00143 XMLPUBFUN int XMLCALL
00144             htmlIsScriptAttribute(const xmlChar *name);
00145 XMLPUBFUN int XMLCALL
00146             htmlHandleOmittedElem(int val);
00147 
00148 #ifdef LIBXML_PUSH_ENABLED
00149 
/* Push-parser interface, declared only when LIBXML_PUSH_ENABLED is defined.
 * The context constructor takes an initial chunk and its size; htmlParseChunk
 * takes a further chunk, its size and a `terminate` flag.
 * NOTE(review): presumably a non-zero `terminate` marks the final chunk --
 * confirm. */
00152 XMLPUBFUN htmlParserCtxtPtr XMLCALL
00153             htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax,
00154                          void *user_data,
00155                          const char *chunk,
00156                          int size,
00157                          const char *filename,
00158                          xmlCharEncoding enc);
00159 XMLPUBFUN int XMLCALL
00160             htmlParseChunk      (htmlParserCtxtPtr ctxt,
00161                          const char *chunk,
00162                          int size,
00163                          int terminate);
00164 #endif /* LIBXML_PUSH_ENABLED */
00165 
/* Takes a parser context and returns nothing.
 * NOTE(review): presumably releases a context obtained from one of the
 * context constructors declared in this header -- confirm. */
00166 XMLPUBFUN void XMLCALL
00167             htmlFreeParserCtxt  (htmlParserCtxtPtr ctxt);
00168 
00169 /*
00170  * New set of simpler/more flexible APIs
00171  */
/* Parser option flags.  Every enumerator is a distinct single bit (1<<k).
 * NOTE(review): presumably meant to be OR-ed together into the `int options`
 * arguments declared in this header; the bit positions are sparse,
 * presumably to line up with the XML parser's option values -- confirm. */
00178 typedef enum {
00179     HTML_PARSE_RECOVER  = 1<<0, /* relaxed parsing */
00180     HTML_PARSE_NOERROR  = 1<<5, /* suppress error reports */
00181     HTML_PARSE_NOWARNING= 1<<6, /* suppress warning reports */
00182     HTML_PARSE_PEDANTIC = 1<<7, /* pedantic error reporting */
00183     HTML_PARSE_NOBLANKS = 1<<8, /* remove blank nodes */
00184     HTML_PARSE_NONET    = 1<<11,/* forbid network access */
00185     HTML_PARSE_COMPACT  = 1<<16 /* compact small text nodes */
00186 } htmlParserOption;
00187 
/* Context maintenance: both functions take an existing parser context. */
00188 XMLPUBFUN void XMLCALL
00189         htmlCtxtReset       (htmlParserCtxtPtr ctxt);
00190 XMLPUBFUN int XMLCALL
00191         htmlCtxtUseOptions  (htmlParserCtxtPtr ctxt,
00192                      int options);
/* One-shot readers.  Each takes an input source (string, URL, memory buffer,
 * file descriptor, or read/close callbacks), an encoding name and an
 * `int options` value, and returns an htmlDocPtr.
 * NOTE(review): `options` is presumably a bitwise OR of htmlParserOption
 * values -- confirm. */
00193 XMLPUBFUN htmlDocPtr XMLCALL
00194         htmlReadDoc     (const xmlChar *cur,
00195                      const char *URL,
00196                      const char *encoding,
00197                      int options);
00198 XMLPUBFUN htmlDocPtr XMLCALL
00199         htmlReadFile        (const char *URL,
00200                      const char *encoding,
00201                      int options);
00202 XMLPUBFUN htmlDocPtr XMLCALL
00203         htmlReadMemory      (const char *buffer,
00204                      int size,
00205                      const char *URL,
00206                      const char *encoding,
00207                      int options);
00208 XMLPUBFUN htmlDocPtr XMLCALL
00209         htmlReadFd      (int fd,
00210                      const char *URL,
00211                      const char *encoding,
00212                      int options);
00213 XMLPUBFUN htmlDocPtr XMLCALL
00214         htmlReadIO      (xmlInputReadCallback ioread,
00215                      xmlInputCloseCallback ioclose,
00216                      void *ioctx,
00217                      const char *URL,
00218                      const char *encoding,
00219                      int options);
/* Context-based readers: the same argument lists as the one-shot readers
 * above with a parser context prepended (htmlCtxtReadFile names its path
 * argument `filename` where htmlReadFile uses `URL`).  The context is spelled
 * xmlParserCtxtPtr here; htmlParserCtxtPtr is a typedef of that same type. */
00220 XMLPUBFUN htmlDocPtr XMLCALL
00221         htmlCtxtReadDoc     (xmlParserCtxtPtr ctxt,
00222                      const xmlChar *cur,
00223                      const char *URL,
00224                      const char *encoding,
00225                      int options);
00226 XMLPUBFUN htmlDocPtr XMLCALL
00227         htmlCtxtReadFile        (xmlParserCtxtPtr ctxt,
00228                      const char *filename,
00229                      const char *encoding,
00230                      int options);
00231 XMLPUBFUN htmlDocPtr XMLCALL
00232         htmlCtxtReadMemory      (xmlParserCtxtPtr ctxt,
00233                      const char *buffer,
00234                      int size,
00235                      const char *URL,
00236                      const char *encoding,
00237                      int options);
00238 XMLPUBFUN htmlDocPtr XMLCALL
00239         htmlCtxtReadFd      (xmlParserCtxtPtr ctxt,
00240                      int fd,
00241                      const char *URL,
00242                      const char *encoding,
00243                      int options);
00244 XMLPUBFUN htmlDocPtr XMLCALL
00245         htmlCtxtReadIO      (xmlParserCtxtPtr ctxt,
00246                      xmlInputReadCallback ioread,
00247                      xmlInputCloseCallback ioclose,
00248                      void *ioctx,
00249                      const char *URL,
00250                      const char *encoding,
00251                      int options);
00252 
00253 /* NRK/Jan2003: further knowledge of HTML structure
00254  */
00255 typedef enum {
00256   HTML_NA = 0 ,     /* something we don't check at all */
00257   HTML_INVALID = 0x1 ,      /* bit 0 */
00258   HTML_DEPRECATED = 0x2 ,   /* bit 1 */
00259   HTML_VALID = 0x4 ,        /* bit 2 */
00260   HTML_REQUIRED = 0xc /* 0x8 | HTML_VALID: the VALID bit is set, so ( & HTML_VALID ) is TRUE */
00261 } htmlStatus ;
00262 
00263 /* Using htmlElemDesc rather than name here, to emphasise the fact
00264    that otherwise there's a lookup overhead
00265 */
00266 XMLPUBFUN htmlStatus XMLCALL htmlAttrAllowed(const htmlElemDesc*, const xmlChar*, int) ;
00267 XMLPUBFUN int XMLCALL htmlElementAllowedHere(const htmlElemDesc*, const xmlChar*) ;
00268 XMLPUBFUN htmlStatus XMLCALL htmlElementStatusHere(const htmlElemDesc*, const htmlElemDesc*) ;
00269 XMLPUBFUN htmlStatus XMLCALL htmlNodeStatus(const htmlNodePtr, int) ;
/*
 * htmlDefaultSubelement:
 * @elt: pointer to an element descriptor (any expression)
 *
 * Expands to the defaultsubelt member of the descriptor @elt points to.
 * Both the argument and the whole expansion are parenthesized so that
 * arguments such as `p + 1` or `*pp` bind as the caller intends.
 */
#define htmlDefaultSubelement(elt) ((elt)->defaultsubelt)
00277 
/*
 * htmlElementAllowedHereDesc:
 * @parent: descriptor of the enclosing element
 * @elt:    descriptor of the candidate child element
 *
 * Forwards to htmlElementAllowedHere() using the name stored in @elt.
 * The name member is declared `const char *` while the callee takes
 * `const xmlChar *`, so the pointer is converted explicitly.
 */
#define htmlElementAllowedHereDesc(parent,elt) \
    htmlElementAllowedHere((parent), (const xmlChar *) (elt)->name)
00289 
/*
 * htmlRequiredAttrs:
 * @elt: pointer to an element descriptor (any expression)
 *
 * Expands to the attrs_req member of the descriptor @elt points to.
 * The expansion is wrapped in parentheses so it behaves as a single
 * operand wherever it is used.
 */
#define htmlRequiredAttrs(elt) ((elt)->attrs_req)
00296 
00297 
00298 #ifdef __cplusplus
00299 }
00300 #endif
00301 
00302 #endif /* LIBXML_HTML_ENABLED */
00303 #endif /* __HTML_PARSER_H__ */
footer
 SourceForge.net Logo