Skip to content
Snippets Groups Projects
liberalxmlparser.py 5.46 KiB
Newer Older
  • Learn to ignore specific revisions
  • """ 
    Warning: this module is experimental and subject to change and even removal
    at any time. 
    
    For background/rationale, see:
     * http://www.intertwingly.net/blog/2007/01/08/Xhtml5lib
     * http://tinyurl.com/ylfj8k (and follow-ups)
    
    References:
     * http://googlereader.blogspot.com/2005/12/xml-errors-in-feeds.html
     * http://wiki.whatwg.org/wiki/HtmlVsXhtml
    
    @@TODO:
     * Selectively lowercase only XHTML, but not foreign markup
    """
    
    import html5parser
    from constants import voidElements, contentModelFlags
    
    from xml.dom import XHTML_NAMESPACE
    from xml.sax.saxutils import unescape
    
    class XMLParser(html5parser.HTMLParser):
        """ liberal XML parser """

        def __init__(self, *args, **kwargs):
            """ Initialise the HTML parser, then make XmlRootPhase the "initial" phase """
            html5parser.HTMLParser.__init__(self, *args, **kwargs)
            self.phases["initial"] = XmlRootPhase(self, self.tree)

        def normalizeToken(self, token):
            """ Adjust a token for XML processing and return it.

            The token dict is modified in place:
             * StartTag/EmptyTag: the attribute pairs become a dict
             * EmptyTag: the start tag is processed here and the token is
               turned into the matching EndTag
             * Characters: entities are un-escaped while the tokenizer is in
               the CDATA content model
             * Comment: a "[CDATA[...]]" comment becomes Characters
            """
            kind = token["type"]

            if kind in ("StartTag", "EmptyTag"):
                # dict() keeps the last pair seen for a repeated key, so
                # reversing the pairs lets the first occurrence win
                pairs = token["data"]
                token["data"] = dict(pairs[::-1])

            if kind == "EmptyTag":
                # For EmptyTags, process both a Start and an End tag; the
                # content model flag is restored after the start tag has
                # been processed
                savedFlag = self.tokenizer.contentModelFlag
                self.phase.processStartTag(token["name"], token["data"])
                self.tokenizer.contentModelFlag = savedFlag
                token["data"] = {}
                token["type"] = "EndTag"
                return token

            if kind == "Characters":
                # un-escape text seen while in the CDATA content model
                inCdata = (self.tokenizer.contentModelFlag ==
                           contentModelFlags["CDATA"])
                if inCdata:
                    token["data"] = unescape(token["data"])
                return token

            if kind == "Comment":
                # Rescue CDATA from the comments
                text = token["data"]
                if text.startswith("[CDATA[") and text.endswith("]]"):
                    token["type"] = "Characters"
                    token["data"] = text[7:-2]

            return token

        def _parse(self, stream, innerHTML=False, container="div", encoding=None,
                   **kwargs):
            """ Run the HTML parser with name lowercasing switched off.

            Any extra keyword arguments are accepted but not forwarded.
            """
            html5parser.HTMLParser._parse(
                self, stream, innerHTML, container, encoding,
                lowercaseElementName=False,
                lowercaseAttrName=False)
    
    class XHTMLParser(XMLParser):
        """ liberal XHTML parser """

        def __init__(self, *args, **kwargs):
            """ Initialise the parser and install the XHTML-specific phases.

            HTMLParser.__init__ is called directly (not XMLParser.__init__);
            the "initial" and "rootElement" phases are then replaced.
            """
            html5parser.HTMLParser.__init__(self, *args, **kwargs)
            self.phases["initial"] = XmlInitialPhase(self, self.tree)
            self.phases["rootElement"] = XhmlRootPhase(self, self.tree)

        def normalizeToken(self, token):
            """ Apply the XMLParser normalisation, then XHTML end tag fix-ups.

            The token dict is modified in place and returned.
            """
            token = XMLParser.normalizeToken(self, token)

            if token["type"] != "EndTag":
                return token

            name = token["name"]
            openElements = self.tree.openElements

            if name in voidElements:
                # an end tag for a void element that is not the current node
                # is re-typed as an EmptyTag (with an empty attribute dict if
                # the token carries none)
                if not openElements or openElements[-1].name != name:
                    token["type"] = "EmptyTag"
                    token.setdefault("data", {})
            elif openElements:
                # ensure that non-void XHTML elements have content so that
                # separate open and close tags are emitted
                #
                # The openElements guard above matters: a stray end tag seen
                # while nothing is open must not raise IndexError.
                current = openElements[-1]
                if name == current.name and not current.hasContent():
                    for e in openElements:
                        if 'xmlns' in e.attributes.keys():
                            if e.attributes['xmlns'] != XHTML_NAMESPACE:
                                # inside foreign (non-XHTML) markup: leave
                                # the element empty
                                break
                    else:
                        self.tree.insertText('')

            return token
    
    class XhmlRootPhase(html5parser.RootElementPhase):
        """ Root element phase whose implied <html> carries the XHTML namespace """

        def insertHtmlElement(self):
            """ Create the html element, open it, attach it to the document,
            and move the parser on to the "beforeHead" phase """
            attributes = {'xmlns': 'http://www.w3.org/1999/xhtml'}
            htmlElement = self.tree.createElement("html", attributes)

            self.tree.openElements.append(htmlElement)
            self.tree.document.appendChild(htmlElement)

            self.parser.phase = self.parser.phases["beforeHead"]
    
    class XmlInitialPhase(html5parser.InitialPhase):
        """ Consume XML Prologs """

        def processComment(self, data):
            """ Swallow comments of the form "?xml...?"; pass all others on
            to InitialPhase.processComment """
            isProlog = data.startswith('?xml') and data.endswith('?')
            if isProlog:
                return
            html5parser.InitialPhase.processComment(self, data)
    
    class XmlRootPhase(html5parser.Phase):
        """ Consume XML Prologs, then prime the Xml parser """

        def processComment(self, data):
            """ Drop XML prologs; hand every other comment on.

            A comment whose text starts with '?xml' and ends with '?' is
            treated as a prolog and discarded.
            """
            # NOTE(review): this delegates to InitialPhase.processComment even
            # though this class derives from Phase, not InitialPhase -- confirm
            # that the call is valid for instances of this class.
            if not data.startswith('?xml') or not data.endswith('?'):
                html5parser.InitialPhase.processComment(self, data)

        def __getattr__(self, name):
            """ Prime the Xml parser

            Python calls this only for attributes that normal lookup does not
            find.  Each such lookup pushes the document onto the stack of
            open elements, switches the parser to a new XmlElementPhase, and
            returns the requested attribute from that new phase.
            """
            self.tree.openElements.append(self.tree.document)
            self.parser.phase = XmlElementPhase(self.parser, self.tree)
            return getattr(self.parser.phase, name)
    
    class XmlElementPhase(html5parser.Phase):
        """ Generic handling for all XML elements """

        def __init__(self, *args, **kwargs):
            """ Set up dispatchers with no named entries, so every start and
            end tag falls through to the *Other handlers """
            html5parser.Phase.__init__(self, *args, **kwargs)

            startDispatcher = html5parser.utils.MethodDispatcher([])
            startDispatcher.default = self.startTagOther
            self.startTagHandler = startDispatcher

            endDispatcher = html5parser.utils.MethodDispatcher([])
            endDispatcher.default = self.endTagOther
            self.endTagHandler = endDispatcher

        def startTagOther(self, name, attributes):
            """ Create the element, append it to the current node, and open it """
            newElement = self.tree.createElement(name, attributes)
            parent = self.tree.openElements[-1]
            parent.appendChild(newElement)
            self.tree.openElements.append(newElement)

        def endTagOther(self, name):
            """ Close the innermost open element called `name`.

            The open elements are scanned from the innermost outwards.  Every
            element passed over before a match is reported via parseError();
            on a match, the stack is popped up to and including that element.
            When nothing matches, nothing is popped.
            """
            openElements = self.tree.openElements
            # iterate over a reversed copy; the stack itself is popped below
            for candidate in openElements[::-1]:
                if candidate.name != name:
                    self.parser.parseError()
                    continue
                while openElements.pop() != candidate:
                    pass
                break

        def processCharacters(self, data):
            """ Insert character data into the tree """
            self.tree.insertText(data)