try: frozenset except NameError: # Import from the sets module for python 2.3 from sets import ImmutableSet as frozenset import gettext _ = gettext.gettext from html5lib.constants import voidElements, booleanAttributes, spaceCharacters from html5lib.constants import rcdataElements from xml.sax.saxutils import escape spaceCharacters = u"".join(spaceCharacters) try: from codecs import register_error, xmlcharrefreplace_errors except ImportError: unicode_encode_errors = "strict" else: unicode_encode_errors = "htmlentityreplace" from html5lib.constants import entities encode_entity_map = {} for k, v in entities.items(): if v != "&" and encode_entity_map.get(v) != k.lower(): # prefer < over < and similarly for &, >, etc. encode_entity_map[v] = k def htmlentityreplace_errors(exc): if isinstance(exc, (UnicodeEncodeError, UnicodeTranslateError)): res = [] for c in exc.object[exc.start:exc.end]: e = encode_entity_map.get(c) if e: res.append("&") res.append(e) if not e.endswith(";"): res.append(";") else: res.append(c.encode(exc.encoding, "xmlcharrefreplace")) return (u"".join(res), exc.end) else: return xmlcharrefreplace_errors(exc) register_error(unicode_encode_errors, htmlentityreplace_errors) del register_error def encode(text, encoding): return text.encode(encoding, unicode_encode_errors) class HTMLSerializer(object): quote_attr_values = False quote_char = '"' use_best_quote_char = True minimize_boolean_attributes = True use_trailing_solidus = False space_before_trailing_solidus = True escape_lt_in_attrs = False escape_rcdata = False inject_meta_charset = True strip_whitespace = False sanitize = False omit_optional_tags = True options = ("quote_attr_values", "quote_char", "use_best_quote_char", "minimize_boolean_attributes", "use_trailing_solidus", "space_before_trailing_solidus", "omit_optional_tags", "strip_whitespace", "inject_meta_charset", "escape_lt_in_attrs", "escape_rcdata", 'use_trailing_solidus', "sanitize") def __init__(self, **kwargs): if kwargs.has_key('quote_char'): self.use_best_quote_char = False for attr in self.options: setattr(self, attr, kwargs.get(attr, getattr(self, attr))) self.errors = [] self.strict = False def serialize(self, treewalker, encoding=None): in_cdata = False self.errors = [] if encoding and self.inject_meta_charset: from html5lib.filters.inject_meta_charset import Filter treewalker = Filter(treewalker, encoding) # XXX: WhitespaceFilter should be used before OptionalTagFilter # for maximum efficiently of this latter filter if self.strip_whitespace: from html5lib.filters.whitespace import Filter treewalker = Filter(treewalker) if self.sanitize: from html5lib.filters.sanitizer import Filter treewalker = Filter(treewalker) if self.omit_optional_tags: from html5lib.filters.optionaltags import Filter treewalker = Filter(treewalker) for token in treewalker: type = token["type"] if type == "Doctype": doctype = u"<!DOCTYPE %s>" % token["name"] if encoding: yield doctype.encode(encoding) else: yield doctype elif type in ("Characters", "SpaceCharacters"): if type == "SpaceCharacters" or in_cdata: if in_cdata and token["data"].find("</") >= 0: self.serializeError(_("Unexpected </ in CDATA")) if encoding: yield token["data"].encode(encoding, "strict") else: yield token["data"] elif encoding: yield encode(escape(token["data"]), encoding) else: yield escape(token["data"]) elif type in ("StartTag", "EmptyTag"): name = token["name"] if name in rcdataElements and not self.escape_rcdata: in_cdata = True elif in_cdata: self.serializeError(_("Unexpected child element of a CDATA element")) attrs = token["data"] if hasattr(attrs, "items"): attrs = attrs.items() attrs.sort() attributes = [] for k,v in attrs: if encoding: k = k.encode(encoding, "strict") attributes.append(' ') attributes.append(k) if not self.minimize_boolean_attributes or \ (k not in booleanAttributes.get(name, tuple()) \ and k not in booleanAttributes.get("", tuple())): attributes.append("=") if self.quote_attr_values or not v: quote_attr = True else: quote_attr = reduce(lambda x,y: x or (y in v), spaceCharacters + "<>\"'", False) v = v.replace("&", "&") if self.escape_lt_in_attrs: v = v.replace("<", "<") if encoding: v = encode(v, encoding) if quote_attr: quote_char = self.quote_char if self.use_best_quote_char: if "'" in v and '"' not in v: quote_char = '"' elif '"' in v and "'" not in v: quote_char = "'" if quote_char == "'": v = v.replace("'", "'") else: v = v.replace('"', """) attributes.append(quote_char) attributes.append(v) attributes.append(quote_char) else: attributes.append(v) if name in voidElements and self.use_trailing_solidus: if self.space_before_trailing_solidus: attributes.append(" /") else: attributes.append("/") if encoding: yield "<%s%s>" % (name.encode(encoding, "strict"), "".join(attributes)) else: yield u"<%s%s>" % (name, u"".join(attributes)) elif type == "EndTag": name = token["name"] if name in rcdataElements: in_cdata = False elif in_cdata: self.serializeError(_("Unexpected child element of a CDATA element")) end_tag = u"</%s>" % name if encoding: end_tag = end_tag.encode(encoding, "strict") yield end_tag elif type == "Comment": data = token["data"] if data.find("--") >= 0: self.serializeError(_("Comment contains --")) comment = u"<!--%s-->" % token["data"] if encoding: comment = comment.encode(encoding, unicode_encode_errors) yield comment else: self.serializeError(token["data"]) def render(self, treewalker, encoding=None): if encoding: return "".join(list(self.serialize(treewalker, encoding))) else: return u"".join(list(self.serialize(treewalker))) def serializeError(self, data="XXX ERROR MESSAGE NEEDED"): # XXX The idea is to make data mandatory. self.errors.append(data) if self.strict: raise SerializeError def SerializeError(Exception): """Error in serialized tree""" pass