Skip to content
Snippets Groups Projects
opml.py 5.95 KiB
Newer Older
  • Learn to ignore specific revisions
  • from xml.sax import ContentHandler, make_parser, SAXParseException
    from xml.sax.xmlreader import InputSource
    from sgmllib import SGMLParser
    from cStringIO import StringIO
    from ConfigParser import ConfigParser
    from htmlentitydefs import entitydefs
    import re
    
    # input = opml, output = ConfigParser
    def opml2config(opml, config=None):
        """Record every feed found in an OPML document in a ConfigParser.

        opml   -- the OPML document, either as a string or as any object
                  with a read() method (its whole content is read).
        config -- ConfigParser to add sections to; a new one is created
                  only when None is passed.

        Returns the config that was populated.  Parsing is first attempted
        with SAX; when SAX reports a SAXParseException the same text is fed
        to the parser's SGML interface instead.
        """
        # imported here so this function does not depend on names beyond the
        # ones the module already imports at the top
        from xml.sax import SAXNotRecognizedException, SAXNotSupportedException
        from xml.sax.handler import feature_external_ges

        if hasattr(opml, 'read'):
            opml = opml.read()

        # test identity, not truthiness: a caller-supplied config must be
        # filled in place even if it happens to evaluate as false
        if config is None:
            config = ConfigParser()

        opmlParser = OpmlParser(config)

        try:
            # try SAX
            source = InputSource()
            source.setByteStream(StringIO(opml))
            parser = make_parser()
            try:
                # subscription lists come from untrusted sources: never let
                # the XML parser fetch external entities on their behalf
                parser.setFeature(feature_external_ges, False)
            except (SAXNotRecognizedException, SAXNotSupportedException):
                # parser has no such switch; continue with its defaults
                pass
            parser.setContentHandler(opmlParser)
            parser.parse(source)
        except SAXParseException:
            # try as SGML
            opmlParser.feed(opml)

        return config
    
    # Parse OPML via either SAX or SGML
    class OpmlParser(ContentHandler,SGMLParser):
        """Collect feed subscriptions from OPML 'outline' elements.

        The class is both a SAX ContentHandler and an SGMLParser, so the same
        instance can be driven by a SAX parser (startElement) or fed raw text
        through the SGML interface (unknown_starttag forwards to
        startElement).  Each accepted outline becomes a section of
        self.config named after the feed URL, with a 'name' option holding
        the feed title.
        """

        # matches '&name;', '&#123;' and '&#x7b;'; group 1 is the text between
        # the ampersand and the semicolon
        entities = re.compile(r'&(#?\w+);')

        def __init__(self, config):
            """Remember the ConfigParser-like object that receives the feeds."""
            ContentHandler.__init__(self)
            SGMLParser.__init__(self)
            self.config = config

        def startElement(self, name, attrs):
            """Add the feed described by an 'outline' element to self.config.

            name  -- element name; anything but 'outline' is ignored.
            attrs -- mapping of attribute names to values.

            Elements without a usable feed URL or title are skipped silently.
            """

            # we are only looking for data in 'outline' nodes.
            if name != 'outline':
                return

            # work on a private, mutable copy of the attributes
            attrs = dict(attrs.items())

            # A type of 'rss' is meant to be used generically to indicate that
            # this is an entry in a subscription list, but some leave this
            # attribute off, and others have placed 'atom' in here
            if 'type' in attrs:
                if attrs['type'] == 'link' and 'url' not in attrs:
                    # Auto-correct WordPress link manager OPML files
                    attrs['type'] = 'rss'
                if attrs['type'].lower() not in ['rss', 'atom']:
                    return

            # The feed itself is supposed to be in an attribute named 'xmlUrl'
            # (note the camel casing), but this has proven to be problematic,
            # with the most common misspelling being in all lower-case
            if not attrs.get('xmlUrl', '').strip():
                for attribute in list(attrs):
                    if attribute.lower() == 'xmlurl' and attrs[attribute].strip():
                        attrs['xmlUrl'] = attrs[attribute]
                        break
                else:
                    return

            # the text attribute is nominally required in OPML, but this
            # data is often found in a title attribute instead
            if not attrs.get('text', '').strip():
                if not attrs.get('title', '').strip():
                    return
                attrs['text'] = attrs['title']

            # if we get this far, we either have a valid subscription list entry,
            # or one with a correctable error.  Add it to the configuration, if
            # it is not already there.
            xmlUrl = attrs['xmlUrl']
            if not self.config.has_section(xmlUrl):
                self.config.add_section(xmlUrl)
                self.config.set(xmlUrl, 'name', self.unescape(attrs['text']))

        def unescape(self, text):
            """Expand entity and character references; return UTF-8 bytes.

            Named entities listed in entitydefs and decimal or hexadecimal
            character references are replaced by the character they denote.
            A reference that cannot be resolved (unknown name, malformed or
            out-of-range number) is left in the text exactly as written.
            """
            # splitting on a pattern with one group puts the reference names
            # at the odd indexes and the text between them at the even ones
            parsed = self.entities.split(text)

            for i in range(1, len(parsed), 2):
                reference = parsed[i]
                replacement = None

                if reference in entitydefs:
                    # named entities
                    value = entitydefs[reference]
                    match = self.entities.match(value)
                    if match:
                        # defined as a character reference such as '&#8364;':
                        # resolve it through the numeric branch below
                        reference = match.group(1)
                    else:
                        replacement = unichr(ord(value))

                # numeric entities, whether written in the text or taken
                # from the entity table
                if replacement is None and reference.startswith('#'):
                    try:
                        if reference[1:2] in ('x', 'X'):
                            replacement = unichr(int(reference[2:], 16))
                        else:
                            replacement = unichr(int(reference[1:]))
                    except (ValueError, OverflowError):
                        # not a number, or not a valid code point
                        replacement = None

                if replacement is None:
                    # unresolved: put back what the split removed
                    replacement = u'&' + parsed[i] + u';'

                parsed[i] = replacement

            return u''.join(parsed).encode('utf-8')

        # SGML => SAX
        def unknown_starttag(self, name, attrs):
            """Decode SGML attribute values and hand the tag to startElement.

            attrs is a sequence of (name, value) pairs.  Values are decoded
            as UTF-8; a value that fails to decode is read as iso-8859-1 and
            its characters in the 128-159 range are mapped through the
            module-level cp1252 table.
            """
            attrs = dict(attrs)
            for attribute in attrs:
                try:
                    attrs[attribute] = attrs[attribute].decode('utf-8')
                except UnicodeError:
                    work = attrs[attribute].decode('iso-8859-1')
                    work = u''.join([cp1252.get(c, c) for c in work])
                    attrs[attribute] = work
            self.startElement(name, attrs)
    
    # http://www.intertwingly.net/stories/2004/04/14/i18n.html#CleaningWindows
    # Characters 128-159 as Windows-1252 defines them, keyed by the character
    # obtained when the same byte is read as iso-8859-1.
    cp1252 = dict(
        (unichr(latin1), unichr(windows))
        for latin1, windows in (
            (128, 8364),  # euro sign
            (130, 8218),  # single low-9 quotation mark
            (131, 402),   # latin small letter f with hook
            (132, 8222),  # double low-9 quotation mark
            (133, 8230),  # horizontal ellipsis
            (134, 8224),  # dagger
            (135, 8225),  # double dagger
            (136, 710),   # modifier letter circumflex accent
            (137, 8240),  # per mille sign
            (138, 352),   # latin capital letter s with caron
            (139, 8249),  # single left-pointing angle quotation mark
            (140, 338),   # latin capital ligature oe
            (142, 381),   # latin capital letter z with caron
            (145, 8216),  # left single quotation mark
            (146, 8217),  # right single quotation mark
            (147, 8220),  # left double quotation mark
            (148, 8221),  # right double quotation mark
            (149, 8226),  # bullet
            (150, 8211),  # en dash
            (151, 8212),  # em dash
            (152, 732),   # small tilde
            (153, 8482),  # trade mark sign
            (154, 353),   # latin small letter s with caron
            (155, 8250),  # single right-pointing angle quotation mark
            (156, 339),   # latin small ligature oe
            (158, 382),   # latin small letter z with caron
            (159, 376),   # latin capital letter y with diaeresis
        )
    )
    
    if __name__ == "__main__":
        # small main program which converts OPML into config.ini format:
        # every command line argument is opened with urllib, parsed into one
        # shared configuration, and the result is written to standard output
        import sys
        import urllib

        config = ConfigParser()
        for opml in sys.argv[1:]:
            stream = urllib.urlopen(opml)
            try:
                opml2config(stream, config)
            finally:
                # release the connection/file even when parsing fails
                stream.close()
        config.write(sys.stdout)