include/tts/tts_htmlparser.h

Go to the documentation of this file.
00001 
00002 // Name:        tts_htmlparser.h
00003 // Purpose:     Simple HTML parser
00004 // Author:      Julian Smart
00005 // Modified by:
00006 // Created:     2002-09-25
00007 // RCS-ID:      $Id: htmlparser.h,v 1.2 2006/12/14 04:19:24 anthemion Exp $
00008 // Copyright:   (c) Julian Smart
00009 // Licence:     New BSD License
00011 
00012 #ifndef _TTS_HTMLPARSER_H_
00013 #define _TTS_HTMLPARSER_H_
00014 
00015 #include "wx/module.h"
00016 #include "wx/stream.h"
00017 
00018 /*
00019  * wxTTSSimpleHtmlAttribute
00020  * Representation of an attribute
00021  */
00022 
00023 class wxTTSSimpleHtmlAttribute
00024 {
00025     friend class wxTTSSimpleHtmlTag;
00026 public:
00027     wxTTSSimpleHtmlAttribute(const wxString& name, const wxString& value)
00028     {
00029         m_name = name; m_value = value; m_next = NULL;
00030     }
00032 
00033     // Write this attribute
00034     void Write(wxOutputStream& stream);
00035 
00037     const wxString& GetName() const { return m_name; }
00038     const wxString& GetValue() const { return m_value; }
00039 
00040     wxTTSSimpleHtmlAttribute* GetNextAttribute() { return m_next; }
00041     void SetNextAttribute(wxTTSSimpleHtmlAttribute* attr) { m_next = attr; }
00042 
00043     bool HasName(const wxString& name) const { return (0 == m_name.CmpNoCase(name)); }
00044     bool HasValue(const wxString& val) const { return (0 == m_value.CmpNoCase(val)); }
00045 
00046 private:
00047     wxString                m_name;
00048     wxString                m_value;
00049     wxTTSSimpleHtmlAttribute*  m_next;
00050 };
00051 
00052 
00053 /*
00054  * wxTTSSimpleHtmlTag
00055  * Representation of a tag or chunk of text
00056  */
00057 
// Tag type constants. These look like the values carried by
// wxTTSSimpleHtmlTag's integer type (constructor argument / GetType()).
enum
{
    wxTTSSimpleHtmlTag_Text,       // A chunk of text
    wxTTSSimpleHtmlTag_TopLevel,
    wxTTSSimpleHtmlTag_Open,
    wxTTSSimpleHtmlTag_Close,
    wxTTSSimpleHtmlTag_Directive,  // presumably e.g. <!DOCTYPE ....>
    wxTTSSimpleHtmlTag_Entity
};
00060 
00061 class wxTTSSimpleHtmlTag
00062 {
00063 public:
00064     wxTTSSimpleHtmlTag(const wxString& tagName, int tagType);
00065     ~wxTTSSimpleHtmlTag();
00066 
00068     void ClearAttributes();
00069     wxTTSSimpleHtmlAttribute* FindAttribute(const wxString& name) const ;
00070     void AppendAttribute(const wxString& name, const wxString& value);
00071     void ClearChildren();
00072     void AppendTag(wxTTSSimpleHtmlTag* tag);
00073     // Write this tag
00074     void Write(wxOutputStream& stream);
00075 
00076     // Gets the text from this tag and its descendants
00077     wxString GetTagText();
00078 
00080     const wxString& GetName() const { return m_name; }
00081     void SetName(const wxString& name) { m_name = name; }
00082 
00083     int GetType() const { return m_type; }
00084     void SetType(int t) { m_type = t; }
00085 
00086     // If type is wxTTSSimpleHtmlTag_Text, m_text will contain some text.
00087     const wxString& GetText() const { return m_text; }
00088     void SetText(const wxString& text) { m_text = text; }
00089 
00090     wxTTSSimpleHtmlAttribute* GetFirstAttribute() { return m_attributes; }
00091     void SetFirstAttribute(wxTTSSimpleHtmlAttribute* attr) { m_attributes = attr; }
00092 
00093     int GetAttributeCount() const ;
00094     wxTTSSimpleHtmlAttribute* GetAttribute(int i) const ;
00095 
00096     wxTTSSimpleHtmlTag* GetChildren() const { return m_children; }
00097     void SetChildren(wxTTSSimpleHtmlTag* children) { m_children = children; }
00098 
00099     wxTTSSimpleHtmlTag* GetParent() const { return m_parent; }
00100     void SetParent(wxTTSSimpleHtmlTag* parent) { m_parent = parent; }
00101     int GetChildCount() const;
00102     wxTTSSimpleHtmlTag*    GetChild(int i) const;
00103     wxTTSSimpleHtmlTag*    GetNext() const { return m_next; }
00104 
00106     bool NameIs(const wxString& name) { return (m_name.CmpNoCase(name) == 0); }
00107     bool HasAttribute(const wxString& name, const wxString& value) const;
00108     bool HasAttribute(const wxString& name) const;
00109     bool GetAttributeValue(wxString& value, const wxString& attrName);
00110 
00111     // Search forward from this tag until we find a tag with this name & optionally attribute 
00112     wxTTSSimpleHtmlTag* FindTag(const wxString& tagName, const wxString& attrName = wxEmptyString);
00113 
00114     // Gather the text until we hit the given close tag
00115     bool FindTextUntilTagClose(wxString& text, const wxString& tagName);
00116 
00117 private:
00118     wxString                m_name;
00119     int                     m_type;
00120     wxString                m_text;
00121     wxTTSSimpleHtmlAttribute*  m_attributes;
00122 
00123     // List of children
00124     wxTTSSimpleHtmlTag*        m_children;
00125     wxTTSSimpleHtmlTag*        m_next; // Next sibling
00126     wxTTSSimpleHtmlTag*        m_parent;
00127 };
00128 
00129 /*
00130  * wxTTSSimpleHtmlParser
00131  * Simple HTML parser, for such tasks as scanning HTML for keywords, contents, etc.
00132  */
00133 
00134 class wxTTSSimpleHtmlParser : public wxObject
00135 {
00136     
00137 public:
00138     wxTTSSimpleHtmlParser();
00139     ~wxTTSSimpleHtmlParser();
00140 
00142     bool ParseFile(const wxString& filename, const wxString& encoding = wxEmptyString);
00143     bool ParseString(const wxString& str);
00144     void Clear();
00145     // Write this file
00146     void Write(wxOutputStream& stream);
00147     bool WriteFile(wxString& filename);
00148 
00150 
00151     // Main recursive parsing function
00152     bool ParseHtml(wxTTSSimpleHtmlTag* parent);
00153 
00154     wxTTSSimpleHtmlTag* ParseTagHeader();
00155     wxTTSSimpleHtmlTag* ParseTagClose();
00156     bool ParseAttributes(wxTTSSimpleHtmlTag* tag);
00157     wxTTSSimpleHtmlTag* ParseDirective(); // e.g. <!DOCTYPE ....>
00158     bool ParseComment(); // Throw away comments
00159     // Plain text, up until an angled bracket
00160     bool ParseText(wxString& text);
00161 
00162     bool EatWhitespace(); // Throw away whitespace
00163     bool EatWhitespace(int& pos); // Throw away whitespace: using 'pos'
00164     bool ReadString(wxString& str, bool eatIt = FALSE);
00165     bool ReadWord(wxString& str, bool eatIt = FALSE);
00166     bool ReadNumber(wxString& str, bool eatIt = FALSE);
00167     // Could be number, string, whatever, but read up until whitespace.
00168     bool ReadLiteral(wxString& str, bool eatIt = FALSE);
00169 
00170     bool IsDirective();
00171     bool IsComment();
00172     bool IsString();
00173     bool IsWord();
00174     bool IsTagClose();
00175     bool IsTagStartBracket(int ch);
00176     bool IsTagEndBracket(int ch);
00177     bool IsWhitespace(int ch);
00178     bool IsAlpha(int ch);
00179     bool IsWordChar(int ch);
00180     bool IsNumeric(int ch);
00181 
00182     // Matches this string (case insensitive)
00183     bool Matches(const wxString& tok, bool eatIt = FALSE) ;
00184     bool Eof() const { return (m_pos >= m_length); }
00185     bool Eof(int pos) const { return (pos >= m_length); }
00186 
00187     void SetPosition(int pos) { m_pos = pos; }
00188 
00189 
00191     wxTTSSimpleHtmlTag* GetTopLevelTag() const { return m_topLevel; }
00192 
00193     // Safe way of getting a character
00194     int GetChar(size_t i) const;
00195     
00196 private:
00197 
00198     wxTTSSimpleHtmlTag*    m_topLevel;
00199     int                 m_pos;    // Position in string
00200     int                 m_length; // Length of string
00201     wxString            m_text;   // The actual text
00202 
00203 };
00204 
00205 #endif
00206     // _TTS_HTMLPARSER_H_
00207 

Generated on Wed May 6 19:20:19 2009 for AxTk by  doxygen 1.5.1