include/xmlutils/htmlparser.h

Go to the documentation of this file.
00001 
00002 // Name:        htmlparser.h
00003 // Purpose:     Simple HTML parser
00004 // Author:      Julian Smart
00005 // Modified by:
00006 // Created:     2002-09-25
00007 // RCS-ID:      $Id: htmlparser.h,v 1.2 2006/12/14 04:19:24 anthemion Exp $
00008 // Copyright:   (c) Julian Smart
00009 // Licence:     New BSD License
00011 
00012 #ifndef _HTMLPARSER_H_
00013 #define _HTMLPARSER_H_
00014 
00015 #include "wx/module.h"
00016 #include "wx/stream.h"
00017 
00018 /*
00019 
00020  So how are we going to represent it? Compare with my LaTeX parser.
00021  That parser generates a hierarchy because it respects the hierarchical nature of the LaTeX
00022  commands. However, we don't _have_ to do that; we can make it linear, e.g.
00023 
00024  tag-with-attributes text-chunk end-tag-with-attributes tag-with-attributes text-chunk
00025 
00026  Otherwise, we need knowledge about HTML tags to parse hierarchically. This wouldn't be hard.
00027  Would need to specify which tags have open/close parts, which don't, and for which it's optional
00028  (such as <P>).
00029 
00030 
00031  */
00032 
00033 /*
00034  * wxSimpleHtmlAttribute
00035  * Representation of an attribute
00036  */
00037 
00038 class wxSimpleHtmlAttribute
00039 {
00040     friend class wxSimpleHtmlTag;
00041 public:
00042     wxSimpleHtmlAttribute(const wxString& name, const wxString& value)
00043     {
00044         m_name = name; m_value = value; m_next = NULL;
00045     }
00047 
00048     // Write this attribute
00049     void Write(wxOutputStream& stream);
00050 
00052     const wxString& GetName() const { return m_name; }
00053     const wxString& GetValue() const { return m_value; }
00054 
00055     wxSimpleHtmlAttribute* GetNextAttribute() { return m_next; }
00056     void SetNextAttribute(wxSimpleHtmlAttribute* attr) { m_next = attr; }
00057 
00058     bool HasName(const wxString& name) const { return (0 == m_name.CmpNoCase(name)); }
00059     bool HasValue(const wxString& val) const { return (0 == m_value.CmpNoCase(val)); }
00060 
00061 private:
00062     wxString                m_name;
00063     wxString                m_value;
00064     wxSimpleHtmlAttribute*  m_next;
00065 };
00066 
00067 
00068 /*
00069  * wxSimpleHtmlTag
00070  * Representation of a tag or chunk of text
00071  */
00072 
// Node kinds stored in wxSimpleHtmlTag's integer type field.
// The numeric values are the same as the implicit ones (0..4).
enum
{
    wxSimpleHtmlTag_Text      = 0,  // Chunk of plain text
    wxSimpleHtmlTag_TopLevel  = 1,  // Presumably the root node of a parsed document
    wxSimpleHtmlTag_Open      = 2,  // Presumably an opening tag
    wxSimpleHtmlTag_Close     = 3,  // Presumably a closing tag
    wxSimpleHtmlTag_Directive = 4   // Presumably a directive such as <!DOCTYPE ....>
};
00074 
00075 class wxSimpleHtmlTag
00076 {
00077 public:
00078     wxSimpleHtmlTag(const wxString& tagName, int tagType);
00079     ~wxSimpleHtmlTag();
00080 
00082     void ClearAttributes();
00083     wxSimpleHtmlAttribute* FindAttribute(const wxString& name) const ;
00084     void AppendAttribute(const wxString& name, const wxString& value);
00085     void ClearChildren();
00086     void AppendTag(wxSimpleHtmlTag* tag);
00087     // Write this tag
00088     void Write(wxOutputStream& stream);
00089 
00090     // Gets the text from this tag and its descendants
00091     wxString GetTagText();
00092 
00094     const wxString& GetName() const { return m_name; }
00095     void SetName(const wxString& name) { m_name = name; }
00096 
00097     int GetType() const { return m_type; }
00098     void SetType(int t) { m_type = t; }
00099 
00100     // If type is wxSimpleHtmlTag_Text, m_text will contain some text.
00101     const wxString& GetText() const { return m_text; }
00102     void SetText(const wxString& text) { m_text = text; }
00103 
00104     wxSimpleHtmlAttribute* GetFirstAttribute() { return m_attributes; }
00105     void SetFirstAttribute(wxSimpleHtmlAttribute* attr) { m_attributes = attr; }
00106 
00107     int GetAttributeCount() const ;
00108     wxSimpleHtmlAttribute* GetAttribute(int i) const ;
00109 
00110     wxSimpleHtmlTag* GetChildren() const { return m_children; }
00111     void SetChildren(wxSimpleHtmlTag* children) { m_children = children; }
00112 
00113     wxSimpleHtmlTag* GetParent() const { return m_parent; }
00114     void SetParent(wxSimpleHtmlTag* parent) { m_parent = parent; }
00115     int GetChildCount() const;
00116     wxSimpleHtmlTag*    GetChild(int i) const;
00117     wxSimpleHtmlTag*    GetNext() const { return m_next; }
00118 
00120     bool NameIs(const wxString& name) { return (m_name.CmpNoCase(name) == 0); }
00121     bool HasAttribute(const wxString& name, const wxString& value) const;
00122     bool HasAttribute(const wxString& name) const;
00123     bool GetAttributeValue(wxString& value, const wxString& attrName);
00124 
00125     // Search forward from this tag until we find a tag with this name & optionally attribute 
00126     wxSimpleHtmlTag* FindTag(const wxString& tagName, const wxString& attrName = wxEmptyString);
00127 
00128     // Gather the text until we hit the given close tag
00129     bool FindTextUntilTagClose(wxString& text, const wxString& tagName);
00130 
00131 private:
00132     wxString                m_name;
00133     int                     m_type;
00134     wxString                m_text;
00135     wxSimpleHtmlAttribute*  m_attributes;
00136 
00137     // List of children
00138     wxSimpleHtmlTag*        m_children;
00139     wxSimpleHtmlTag*        m_next; // Next sibling
00140     wxSimpleHtmlTag*        m_parent;
00141 };
00142 
00143 /*
00144  * wxSimpleHtmlParser
00145  * Simple HTML parser, for such tasks as scanning HTML for keywords, contents, etc.
00146  */
00147 
00148 class wxSimpleHtmlParser : public wxObject
00149 {
00150     
00151 public:
00152     wxSimpleHtmlParser();
00153     ~wxSimpleHtmlParser();
00154 
00156     bool ParseFile(const wxString& filename, const wxString& encoding = wxEmptyString);
00157     bool ParseString(const wxString& str);
00158     void Clear();
00159     // Write this file
00160     void Write(wxOutputStream& stream);
00161     bool WriteFile(wxString& filename);
00162 
00164 
00165     // Main recursive parsing function
00166     bool ParseHtml(wxSimpleHtmlTag* parent);
00167 
00168     wxSimpleHtmlTag* ParseTagHeader();
00169     wxSimpleHtmlTag* ParseTagClose();
00170     bool ParseAttributes(wxSimpleHtmlTag* tag);
00171     wxSimpleHtmlTag* ParseDirective(); // e.g. <!DOCTYPE ....>
00172     bool ParseComment(); // Throw away comments
00173     // Plain text, up until an angled bracket
00174     bool ParseText(wxString& text);
00175 
00176     bool EatWhitespace(); // Throw away whitespace
00177     bool EatWhitespace(int& pos); // Throw away whitespace: using 'pos'
00178     bool ReadString(wxString& str, bool eatIt = FALSE);
00179     bool ReadWord(wxString& str, bool eatIt = FALSE);
00180     bool ReadNumber(wxString& str, bool eatIt = FALSE);
00181     // Could be number, string, whatever, but read up until whitespace.
00182     bool ReadLiteral(wxString& str, bool eatIt = FALSE);
00183 
00184     bool IsDirective();
00185     bool IsComment();
00186     bool IsString();
00187     bool IsWord();
00188     bool IsTagClose();
00189     bool IsTagStartBracket(int ch);
00190     bool IsTagEndBracket(int ch);
00191     bool IsWhitespace(int ch);
00192     bool IsAlpha(int ch);
00193     bool IsWordChar(int ch);
00194     bool IsNumeric(int ch);
00195 
00196     // Matches this string (case insensitive)
00197     bool Matches(const wxString& tok, bool eatIt = FALSE) ;
00198     bool Eof() const { return (m_pos >= m_length); }
00199     bool Eof(int pos) const { return (pos >= m_length); }
00200 
00201     void SetPosition(int pos) { m_pos = pos; }
00202 
00203 
00205     wxSimpleHtmlTag* GetTopLevelTag() const { return m_topLevel; }
00206 
00207     // Safe way of getting a character
00208     int GetChar(size_t i) const;
00209     
00210 private:
00211 
00212     wxSimpleHtmlTag*    m_topLevel;
00213     int                 m_pos;    // Position in string
00214     int                 m_length; // Length of string
00215     wxString            m_text;   // The actual text
00216 
00217 };
00218 
00219 #endif

Generated on Wed May 6 19:20:19 2009 for AxTk by  doxygen 1.5.1