Skip to content
Snippets Groups Projects
soup.py 5.26 KiB
Newer Older
  • Learn to ignore specific revisions
  • from BeautifulSoup import BeautifulSoup, Tag, NavigableString, Comment, Declaration
    
    import _base
    
    class AttrList(object):
        def __init__(self, element):
            self.element = element
            self.attrs = dict(self.element.attrs)
        def __iter__(self):
            return self.attrs.items().__iter__()
        def __setitem__(self, name, value):
            "set attr", name, value
            self.element[name] = value
        def items(self):
            return self.attrs.items()
        def keys(self):
            return self.attrs.keys()
        def __getitem__(self, name):
            return self.attrs[name]
        def __contains__(self, name):
            return name in self.attrs.keys()
    
    
    class Element(_base.Node):
        def __init__(self, element, soup):
            _base.Node.__init__(self, element.name)
            self.element = element
            self.soup=soup
    
        def appendChild(self, node):
            if (node.element.__class__ == NavigableString and self.element.contents
                and self.element.contents[-1].__class__ == NavigableString):
                newNode = TextNode(NavigableString(
                    self.element.contents[-1]+node.element), self.soup)
                self.element.contents[-1].extract()
                self.appendChild(newNode)
            else:
                self.element.insert(len(self.element.contents), node.element)
                node.parent = self
        
        def getAttributes(self):
            return AttrList(self.element)
    
        def setAttributes(self, attributes):
            if attributes:
                for name, value in attributes.items():
                    self.element[name] =  value
    
        attributes = property(getAttributes, setAttributes)
        
        def insertText(self, data, insertBefore=None):
            text = TextNode(NavigableString(data), self.soup)
            if insertBefore:
                self.insertBefore(text, insertBefore)
            else:
                self.appendChild(text)
    
        def insertBefore(self, node, refNode):
            index = self.element.contents.index(refNode.element)
            if (node.element.__class__ == NavigableString and self.element.contents
                and self.element.contents[index-1].__class__ == NavigableString):
                newNode = TextNode(NavigableString(
                    self.element.contents[index-1]+node.element), self.soup)
                self.element.contents[index-1].extract()
                self.insertBefore(newNode, refNode)
            else:
                self.element.insert(index, node.element)
                node.parent = self
    
        def removeChild(self, node):
            node.element.extract()
            node.parent = None
    
        def reparentChildren(self, newParent):
            while self.element.contents:
                child = self.element.contents[0]
                child.extract()
                if isinstance(child, Tag):
                    newParent.appendChild(Element(child, self.soup))
                else:
                    newParent.appendChild(TextNode(child, self.soup))
    
        def cloneNode(self):
            node = Element(Tag(self.soup, self.element.name), self.soup)
            for key,value in self.attributes:
                node.attributes[key] = value
            return node
    
        def hasContent(self):
            return self.element.contents
    
    class TextNode(Element):
        def __init__(self, element, soup):
            _base.Node.__init__(self, None)
            self.element = element
            self.soup=soup
        
        def cloneNode(self):
            raise NotImplementedError
    
    class TreeBuilder(_base.TreeBuilder):
        def documentClass(self):
            self.soup = BeautifulSoup("")
            return Element(self.soup, self.soup)
        
        def insertDoctype(self, name, publicId, systemId):
            self.soup.insert(0, Declaration(name))
        
        def elementClass(self, name):
            return Element(Tag(self.soup, name), self.soup)
            
        def commentClass(self, data):
            return TextNode(Comment(data), self.soup)
        
        def fragmentClass(self):
            self.soup = BeautifulSoup("")
            self.soup.name = "[document_fragment]"
            return Element(self.soup, self.soup) 
    
        def appendChild(self, node):
            self.soup.insert(len(self.soup.contents), node.element)
    
        def testSerializer(self, element):
            return testSerializer(element)
    
        def getDocument(self):
            return self.soup
        
        def getFragment(self):
            return _base.TreeBuilder.getFragment(self).element
        
    def testSerializer(element):
        rv = []
        def serializeElement(element, indent=0):
            if isinstance(element, Declaration):
                rv.append("|%s<!DOCTYPE %s>"%(' '*indent, element.string))
            elif isinstance(element, BeautifulSoup):
                if element.name == "[document_fragment]":
                    rv.append("#document-fragment")                
                else:
                    rv.append("#document")
    
            elif isinstance(element, Comment):
                rv.append("|%s<!-- %s -->"%(' '*indent, element.string))
            elif isinstance(element, unicode):
                rv.append("|%s\"%s\"" %(' '*indent, element))
            else:
                rv.append("|%s<%s>"%(' '*indent, element.name))
                if element.attrs:
                    for name, value in element.attrs:
                        rv.append('|%s%s="%s"' % (' '*(indent+2), name, value))
            indent += 2
            if hasattr(element, "contents"):
                for child in element.contents:
                    serializeElement(child, indent)
        serializeElement(element, 0)
    
        return "\n".join(rv)