# Lookup tables of HTML element names, character classes and attribute
# names, exposed as module-level constants.
import string

try:
    frozenset
except NameError:
    # Python 2.3 has no built-in set types; borrow them from the sets module.
    from sets import ImmutableSet as frozenset
    from sets import Set as set

# Marker value named EOF (presumably "end of input"; its consumers are not
# part of this section).
EOF = None

# Symbolic names for the content model states, mapped to small integers.
contentModelFlags = {
    "PCDATA": 0,
    "RCDATA": 1,
    "CDATA": 2,
    "PLAINTEXT": 3,
}

scopingElements = frozenset([
    "button", "caption", "html", "marquee", "object", "table", "td", "th",
])

formattingElements = frozenset([
    "a", "b", "big", "em", "font", "i", "nobr", "s", "small", "strike",
    "strong", "tt", "u",
])

specialElements = frozenset([
    "address", "area", "base", "basefont", "bgsound", "blockquote", "body",
    "br", "center", "col", "colgroup", "dd", "dir", "div", "dl", "dt",
    "embed", "fieldset", "form", "frame", "frameset", "h1", "h2", "h3",
    "h4", "h5", "h6", "head", "hr", "iframe", "image", "img", "input",
    "isindex", "li", "link", "listing", "menu", "meta", "noembed",
    "noframes", "noscript", "ol", "optgroup", "option", "p", "param",
    "plaintext", "pre", "script", "select", "spacer", "style", "tbody",
    "textarea", "tfoot", "thead", "title", "tr", "ul", "wbr",
])

# Tab, line feed, vertical tab, form feed, space and carriage return.
spaceCharacters = frozenset([
    u"\t", u"\n", u"\u000B", u"\u000C", u" ", u"\r",
])

tableInsertModeElements = frozenset([
    "table", "tbody", "tfoot", "thead", "tr",
])

# Character classes as sets of single characters, for fast membership tests.
asciiLowercase = frozenset(string.ascii_lowercase)
asciiUppercase = frozenset(string.ascii_uppercase)
asciiLetters = frozenset(string.ascii_letters)
digits = frozenset(string.digits)
hexDigits = frozenset(string.hexdigits)

# Maps the ordinal of each ASCII upper-case letter to the ordinal of its
# lower-case counterpart (the shape expected by unicode.translate).
asciiUpper2Lower = dict(zip(map(ord, string.ascii_uppercase),
                            map(ord, string.ascii_lowercase)))

# Heading elements need to be ordered, hence a tuple rather than a set.
headingElements = ("h1", "h2", "h3", "h4", "h5", "h6")

# XXX What about event-source and command?
voidElements = frozenset([
    "base", "link", "meta", "hr", "br", "img", "embed", "param", "area",
    "col", "input",
])

# NOTE(review): in HTML5 terminology title/textarea are RCDATA and
# style/script/... are CDATA (raw text), so these two names look swapped.
# They are kept as-is because other modules import them -- confirm before
# renaming.
cdataElements = frozenset(['title', 'textarea'])

rcdataElements = frozenset([
    'style', 'script', 'xmp', 'iframe', 'noembed', 'noframes', 'noscript',
])

# Boolean attribute names keyed by element name.  The "" key presumably
# holds attributes that apply to every element -- confirm against callers.
booleanAttributes = {
    "": frozenset(["irrelevant"]),
    "style": frozenset(["scoped"]),
    "img": frozenset(["ismap"]),
    "audio": frozenset(["autoplay", "controls"]),
    "video": frozenset(["autoplay", "controls"]),
    "script": frozenset(["defer", "async"]),
    "details": frozenset(["open"]),
    "datagrid": frozenset(["multiple", "disabled"]),
    "command": frozenset(["hidden", "disabled", "checked", "default"]),
    "menu": frozenset(["autosubmit"]),
    "fieldset": frozenset(["disabled", "readonly"]),
    "option": frozenset(["disabled", "readonly", "selected"]),
    "optgroup": frozenset(["disabled", "readonly"]),
    "button": frozenset(["disabled", "autofocus"]),
    "input": frozenset([
        "disabled", "readonly", "required", "autofocus", "checked", "ismap",
    ]),
    "select": frozenset(["disabled", "readonly", "autofocus", "multiple"]),
    "output": frozenset(["disabled", "readonly"]),
}

# entitiesWindows1252 has to be _ordered_ and needs to have an index. It
# therefore can't be a frozenset.
# Replacement code points for the 0x80-0x9F range: entry i holds the code
# point listed for 0x80 + i, and slots marked UNDEFINED hold U+FFFD.
entitiesWindows1252 = (
    0x20AC,  # 0x80  EURO SIGN
    0xFFFD,  # 0x81  UNDEFINED
    0x201A,  # 0x82  SINGLE LOW-9 QUOTATION MARK
    0x0192,  # 0x83  LATIN SMALL LETTER F WITH HOOK
    0x201E,  # 0x84  DOUBLE LOW-9 QUOTATION MARK
    0x2026,  # 0x85  HORIZONTAL ELLIPSIS
    0x2020,  # 0x86  DAGGER
    0x2021,  # 0x87  DOUBLE DAGGER
    0x02C6,  # 0x88  MODIFIER LETTER CIRCUMFLEX ACCENT
    0x2030,  # 0x89  PER MILLE SIGN
    0x0160,  # 0x8A  LATIN CAPITAL LETTER S WITH CARON
    0x2039,  # 0x8B  SINGLE LEFT-POINTING ANGLE QUOTATION MARK
    0x0152,  # 0x8C  LATIN CAPITAL LIGATURE OE
    0xFFFD,  # 0x8D  UNDEFINED
    0x017D,  # 0x8E  LATIN CAPITAL LETTER Z WITH CARON
    0xFFFD,  # 0x8F  UNDEFINED
    0xFFFD,  # 0x90  UNDEFINED
    0x2018,  # 0x91  LEFT SINGLE QUOTATION MARK
    0x2019,  # 0x92  RIGHT SINGLE QUOTATION MARK
    0x201C,  # 0x93  LEFT DOUBLE QUOTATION MARK
    0x201D,  # 0x94  RIGHT DOUBLE QUOTATION MARK
    0x2022,  # 0x95  BULLET
    0x2013,  # 0x96  EN DASH
    0x2014,  # 0x97  EM DASH
    0x02DC,  # 0x98  SMALL TILDE
    0x2122,  # 0x99  TRADE MARK SIGN
    0x0161,  # 0x9A  LATIN SMALL LETTER S WITH CARON
    0x203A,  # 0x9B  SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
    0x0153,  # 0x9C  LATIN SMALL LIGATURE OE
    0xFFFD,  # 0x9D  UNDEFINED
    0x017E,  # 0x9E  LATIN SMALL LETTER Z WITH CARON
    0x0178,  # 0x9F  LATIN CAPITAL LETTER Y WITH DIAERESIS
)

# Named character references mapped to the character they stand for.  Where
# a name is listed both with and without its trailing ";", the two spellings
# share a line.
# NOTE(review): "lang;"/"rang;" map to U+3008/U+3009 here, whereas the
# current HTML specification uses U+27E8/U+27E9 -- confirm which revision
# of the spec this table is meant to follow before changing it.
entities = {
    "AElig;": u"\u00C6", "AElig": u"\u00C6",
    "AMP;": u"\u0026", "AMP": u"\u0026",
    "Aacute;": u"\u00C1", "Aacute": u"\u00C1",
    "Acirc;": u"\u00C2", "Acirc": u"\u00C2",
    "Agrave;": u"\u00C0", "Agrave": u"\u00C0",
    "Alpha;": u"\u0391",
    "Aring;": u"\u00C5", "Aring": u"\u00C5",
    "Atilde;": u"\u00C3", "Atilde": u"\u00C3",
    "Auml;": u"\u00C4", "Auml": u"\u00C4",
    "Beta;": u"\u0392",
    "COPY;": u"\u00A9", "COPY": u"\u00A9",
    "Ccedil;": u"\u00C7", "Ccedil": u"\u00C7",
    "Chi;": u"\u03A7",
    "Dagger;": u"\u2021",
    "Delta;": u"\u0394",
    "ETH;": u"\u00D0", "ETH": u"\u00D0",
    "Eacute;": u"\u00C9", "Eacute": u"\u00C9",
    "Ecirc;": u"\u00CA", "Ecirc": u"\u00CA",
    "Egrave;": u"\u00C8", "Egrave": u"\u00C8",
    "Epsilon;": u"\u0395",
    "Eta;": u"\u0397",
    "Euml;": u"\u00CB", "Euml": u"\u00CB",
    "GT;": u"\u003E", "GT": u"\u003E",
    "Gamma;": u"\u0393",
    "Iacute;": u"\u00CD", "Iacute": u"\u00CD",
    "Icirc;": u"\u00CE", "Icirc": u"\u00CE",
    "Igrave;": u"\u00CC", "Igrave": u"\u00CC",
    "Iota;": u"\u0399",
    "Iuml;": u"\u00CF", "Iuml": u"\u00CF",
    "Kappa;": u"\u039A",
    "LT;": u"\u003C", "LT": u"\u003C",
    "Lambda;": u"\u039B",
    "Mu;": u"\u039C",
    "Ntilde;": u"\u00D1", "Ntilde": u"\u00D1",
    "Nu;": u"\u039D",
    "OElig;": u"\u0152",
    "Oacute;": u"\u00D3", "Oacute": u"\u00D3",
    "Ocirc;": u"\u00D4", "Ocirc": u"\u00D4",
    "Ograve;": u"\u00D2", "Ograve": u"\u00D2",
    "Omega;": u"\u03A9",
    "Omicron;": u"\u039F",
    "Oslash;": u"\u00D8", "Oslash": u"\u00D8",
    "Otilde;": u"\u00D5", "Otilde": u"\u00D5",
    "Ouml;": u"\u00D6", "Ouml": u"\u00D6",
    "Phi;": u"\u03A6",
    "Pi;": u"\u03A0",
    "Prime;": u"\u2033",
    "Psi;": u"\u03A8",
    "QUOT;": u"\u0022", "QUOT": u"\u0022",
    "REG;": u"\u00AE", "REG": u"\u00AE",
    "Rho;": u"\u03A1",
    "Scaron;": u"\u0160",
    "Sigma;": u"\u03A3",
    "THORN;": u"\u00DE", "THORN": u"\u00DE",
    "TRADE;": u"\u2122",
    "Tau;": u"\u03A4",
    "Theta;": u"\u0398",
    "Uacute;": u"\u00DA", "Uacute": u"\u00DA",
    "Ucirc;": u"\u00DB", "Ucirc": u"\u00DB",
    "Ugrave;": u"\u00D9", "Ugrave": u"\u00D9",
    "Upsilon;": u"\u03A5",
    "Uuml;": u"\u00DC", "Uuml": u"\u00DC",
    "Xi;": u"\u039E",
    "Yacute;": u"\u00DD", "Yacute": u"\u00DD",
    "Yuml;": u"\u0178",
    "Zeta;": u"\u0396",
    "aacute;": u"\u00E1", "aacute": u"\u00E1",
    "acirc;": u"\u00E2", "acirc": u"\u00E2",
    "acute;": u"\u00B4", "acute": u"\u00B4",
    "aelig;": u"\u00E6", "aelig": u"\u00E6",
    "agrave;": u"\u00E0", "agrave": u"\u00E0",
    "alefsym;": u"\u2135",
    "alpha;": u"\u03B1",
    "amp;": u"\u0026", "amp": u"\u0026",
    "and;": u"\u2227",
    "ang;": u"\u2220",
    "apos;": u"\u0027",
    "aring;": u"\u00E5", "aring": u"\u00E5",
    "asymp;": u"\u2248",
    "atilde;": u"\u00E3", "atilde": u"\u00E3",
    "auml;": u"\u00E4", "auml": u"\u00E4",
    "bdquo;": u"\u201E",
    "beta;": u"\u03B2",
    "brvbar;": u"\u00A6", "brvbar": u"\u00A6",
    "bull;": u"\u2022",
    "cap;": u"\u2229",
    "ccedil;": u"\u00E7", "ccedil": u"\u00E7",
    "cedil;": u"\u00B8", "cedil": u"\u00B8",
    "cent;": u"\u00A2", "cent": u"\u00A2",
    "chi;": u"\u03C7",
    "circ;": u"\u02C6",
    "clubs;": u"\u2663",
    "cong;": u"\u2245",
    "copy;": u"\u00A9", "copy": u"\u00A9",
    "crarr;": u"\u21B5",
    "cup;": u"\u222A",
    "curren;": u"\u00A4", "curren": u"\u00A4",
    "dArr;": u"\u21D3",
    "dagger;": u"\u2020",
    "darr;": u"\u2193",
    "deg;": u"\u00B0", "deg": u"\u00B0",
    "delta;": u"\u03B4",
    "diams;": u"\u2666",
    "divide;": u"\u00F7", "divide": u"\u00F7",
    "eacute;": u"\u00E9", "eacute": u"\u00E9",
    "ecirc;": u"\u00EA", "ecirc": u"\u00EA",
    "egrave;": u"\u00E8", "egrave": u"\u00E8",
    "empty;": u"\u2205",
    "emsp;": u"\u2003",
    "ensp;": u"\u2002",
    "epsilon;": u"\u03B5",
    "equiv;": u"\u2261",
    "eta;": u"\u03B7",
    "eth;": u"\u00F0", "eth": u"\u00F0",
    "euml;": u"\u00EB", "euml": u"\u00EB",
    "euro;": u"\u20AC",
    "exist;": u"\u2203",
    "fnof;": u"\u0192",
    "forall;": u"\u2200",
    "frac12;": u"\u00BD", "frac12": u"\u00BD",
    "frac14;": u"\u00BC", "frac14": u"\u00BC",
    "frac34;": u"\u00BE", "frac34": u"\u00BE",
    "frasl;": u"\u2044",
    "gamma;": u"\u03B3",
    "ge;": u"\u2265",
    "gt;": u"\u003E", "gt": u"\u003E",
    "hArr;": u"\u21D4",
    "harr;": u"\u2194",
    "hearts;": u"\u2665",
    "hellip;": u"\u2026",
    "iacute;": u"\u00ED", "iacute": u"\u00ED",
    "icirc;": u"\u00EE", "icirc": u"\u00EE",
    "iexcl;": u"\u00A1", "iexcl": u"\u00A1",
    "igrave;": u"\u00EC", "igrave": u"\u00EC",
    "image;": u"\u2111",
    "infin;": u"\u221E",
    "int;": u"\u222B",
    "iota;": u"\u03B9",
    "iquest;": u"\u00BF", "iquest": u"\u00BF",
    "isin;": u"\u2208",
    "iuml;": u"\u00EF", "iuml": u"\u00EF",
    "kappa;": u"\u03BA",
    "lArr;": u"\u21D0",
    "lambda;": u"\u03BB",
    "lang;": u"\u3008",
    "laquo;": u"\u00AB", "laquo": u"\u00AB",
    "larr;": u"\u2190",
    "lceil;": u"\u2308",
    "ldquo;": u"\u201C",
    "le;": u"\u2264",
    "lfloor;": u"\u230A",
    "lowast;": u"\u2217",
    "loz;": u"\u25CA",
    "lrm;": u"\u200E",
    "lsaquo;": u"\u2039",
    "lsquo;": u"\u2018",
    "lt;": u"\u003C", "lt": u"\u003C",
    "macr;": u"\u00AF", "macr": u"\u00AF",
    "mdash;": u"\u2014",
    "micro;": u"\u00B5", "micro": u"\u00B5",
    "middot;": u"\u00B7", "middot": u"\u00B7",
    "minus;": u"\u2212",
    "mu;": u"\u03BC",
    "nabla;": u"\u2207",
    "nbsp;": u"\u00A0", "nbsp": u"\u00A0",
    "ndash;": u"\u2013",
    "ne;": u"\u2260",
    "ni;": u"\u220B",
    "not;": u"\u00AC", "not": u"\u00AC",
    "notin;": u"\u2209",
    "nsub;": u"\u2284",
    "ntilde;": u"\u00F1", "ntilde": u"\u00F1",
    "nu;": u"\u03BD",
    "oacute;": u"\u00F3", "oacute": u"\u00F3",
    "ocirc;": u"\u00F4", "ocirc": u"\u00F4",
    "oelig;": u"\u0153",
    "ograve;": u"\u00F2", "ograve": u"\u00F2",
    "oline;": u"\u203E",
    "omega;": u"\u03C9",
    "omicron;": u"\u03BF",
    "oplus;": u"\u2295",
    "or;": u"\u2228",
    "ordf;": u"\u00AA", "ordf": u"\u00AA",
    "ordm;": u"\u00BA", "ordm": u"\u00BA",
    "oslash;": u"\u00F8", "oslash": u"\u00F8",
    "otilde;": u"\u00F5", "otilde": u"\u00F5",
    "otimes;": u"\u2297",
    "ouml;": u"\u00F6", "ouml": u"\u00F6",
    "para;": u"\u00B6", "para": u"\u00B6",
    "part;": u"\u2202",
    "permil;": u"\u2030",
    "perp;": u"\u22A5",
    "phi;": u"\u03C6",
    "pi;": u"\u03C0",
    "piv;": u"\u03D6",
    "plusmn;": u"\u00B1", "plusmn": u"\u00B1",
    "pound;": u"\u00A3", "pound": u"\u00A3",
    "prime;": u"\u2032",
    "prod;": u"\u220F",
    "prop;": u"\u221D",
    "psi;": u"\u03C8",
    "quot;": u"\u0022", "quot": u"\u0022",
    "rArr;": u"\u21D2",
    "radic;": u"\u221A",
    "rang;": u"\u3009",
    "raquo;": u"\u00BB", "raquo": u"\u00BB",
    "rarr;": u"\u2192",
    "rceil;": u"\u2309",
    "rdquo;": u"\u201D",
    "real;": u"\u211C",
    "reg;": u"\u00AE", "reg": u"\u00AE",
    "rfloor;": u"\u230B",
    "rho;": u"\u03C1",
    "rlm;": u"\u200F",
    "rsaquo;": u"\u203A",
    "rsquo;": u"\u2019",
    "sbquo;": u"\u201A",
    "scaron;": u"\u0161",
    "sdot;": u"\u22C5",
    "sect;": u"\u00A7", "sect": u"\u00A7",
    "shy;": u"\u00AD", "shy": u"\u00AD",
    "sigma;": u"\u03C3",
    "sigmaf;": u"\u03C2",
    "sim;": u"\u223C",
    "spades;": u"\u2660",
    "sub;": u"\u2282",
    "sube;": u"\u2286",
    "sum;": u"\u2211",
    "sup1;": u"\u00B9", "sup1": u"\u00B9",
    "sup2;": u"\u00B2", "sup2": u"\u00B2",
    "sup3;": u"\u00B3", "sup3": u"\u00B3",
    "sup;": u"\u2283",
    "supe;": u"\u2287",
    "szlig;": u"\u00DF", "szlig": u"\u00DF",
    "tau;": u"\u03C4",
    "there4;": u"\u2234",
    "theta;": u"\u03B8",
    "thetasym;": u"\u03D1",
    "thinsp;": u"\u2009",
    "thorn;": u"\u00FE", "thorn": u"\u00FE",
    "tilde;": u"\u02DC",
    "times;": u"\u00D7", "times": u"\u00D7",
    "trade;": u"\u2122",
    "uArr;": u"\u21D1",
    "uacute;": u"\u00FA", "uacute": u"\u00FA",
    "uarr;": u"\u2191",
    "ucirc;": u"\u00FB", "ucirc": u"\u00FB",
    "ugrave;": u"\u00F9", "ugrave": u"\u00F9",
    "uml;": u"\u00A8", "uml": u"\u00A8",
    "upsih;": u"\u03D2",
    "upsilon;": u"\u03C5",
    "uuml;": u"\u00FC", "uuml": u"\u00FC",
    "weierp;": u"\u2118",
    "xi;": u"\u03BE",
    "yacute;": u"\u00FD", "yacute": u"\u00FD",
    "yen;": u"\u00A5", "yen": u"\u00A5",
    "yuml;": u"\u00FF", "yuml": u"\u00FF",
    "zeta;": u"\u03B6",
    "zwj;": u"\u200D",
    "zwnj;": u"\u200C",
}

# Lower-case character encoding labels (these appear to be IANA charset
# names and aliases).  Grouped by family below purely for readability; the
# result is an unordered set.
encodings = frozenset([
    # US-ASCII
    "ansi_x3.4-1968", "iso-ir-6", "ansi_x3.4-1986", "iso_646.irv:1991",
    "ascii", "iso646-us", "us-ascii", "us", "ibm367", "cp367", "csascii",
    # Korean / ISO-2022
    "ks_c_5601-1987", "korean", "iso-2022-kr", "csiso2022kr", "euc-kr",
    "iso-2022-jp", "csiso2022jp", "iso-2022-jp-2",
    "iso-ir-58", "chinese", "csiso58gb231280",
    # ISO-8859-1 .. ISO-8859-10
    "iso_8859-1:1987", "iso-ir-100", "iso_8859-1", "iso-8859-1", "latin1",
    "l1", "ibm819", "cp819", "csisolatin1",
    "iso_8859-2:1987", "iso-ir-101", "iso_8859-2", "iso-8859-2", "latin2",
    "l2", "csisolatin2",
    "iso_8859-3:1988", "iso-ir-109", "iso_8859-3", "iso-8859-3", "latin3",
    "l3", "csisolatin3",
    "iso_8859-4:1988", "iso-ir-110", "iso_8859-4", "iso-8859-4", "latin4",
    "l4", "csisolatin4",
    "iso_8859-6:1987", "iso-ir-127", "iso_8859-6", "iso-8859-6", "ecma-114",
    "asmo-708", "arabic", "csisolatinarabic",
    "iso_8859-7:1987", "iso-ir-126", "iso_8859-7", "iso-8859-7", "elot_928",
    "ecma-118", "greek", "greek8", "csisolatingreek",
    "iso_8859-8:1988", "iso-ir-138", "iso_8859-8", "iso-8859-8", "hebrew",
    "csisolatinhebrew",
    "iso_8859-5:1988", "iso-ir-144", "iso_8859-5", "iso-8859-5", "cyrillic",
    "csisolatincyrillic",
    "iso_8859-9:1989", "iso-ir-148", "iso_8859-9", "iso-8859-9", "latin5",
    "l5", "csisolatin5",
    "iso-8859-10", "iso-ir-157", "l6", "iso_8859-10:1992", "csisolatin6",
    "latin6",
    # HP Roman-8
    "hp-roman8", "roman8", "r8",
    # IBM / DOS code pages
    "ibm037", "cp037", "csibm037",
    "ibm424", "cp424", "csibm424",
    "ibm437", "cp437", "437", "cspc8codepage437",
    "ibm500", "cp500", "csibm500",
    "ibm775", "cp775", "cspc775baltic",
    "ibm850", "cp850", "850", "cspc850multilingual",
    "ibm852", "cp852", "852", "cspcp852",
    "ibm855", "cp855", "855", "csibm855",
    "ibm857", "cp857", "857", "csibm857",
    "ibm860", "cp860", "860", "csibm860",
    "ibm861", "cp861", "861", "cp-is", "csibm861",
    "ibm862", "cp862", "862", "cspc862latinhebrew",
    "ibm863", "cp863", "863", "csibm863",
    "ibm864", "cp864", "csibm864",
    "ibm865", "cp865", "865", "csibm865",
    "ibm866", "cp866", "866", "csibm866",
    "ibm869", "cp869", "869", "cp-gr", "csibm869",
    "ibm1026", "cp1026", "csibm1026",
    # KOI8, Big5-HKSCS, PTCP154
    "koi8-r", "cskoi8r", "koi8-u",
    "big5-hkscs",
    "ptcp154", "csptcp154", "pt154", "cp154",
    # Unicode transformation formats
    "utf-7", "utf-16be", "utf-16le", "utf-16", "utf-8",
    # ISO-8859-13 .. ISO-8859-16
    "iso-8859-13",
    "iso-8859-14", "iso-ir-199", "iso_8859-14:1998", "iso_8859-14",
    "latin8", "iso-celtic", "l8",
    "iso-8859-15", "iso_8859-15",
    "iso-8859-16", "iso-ir-226", "iso_8859-16:2001", "iso_8859-16",
    "latin10", "l10",
    # Chinese / Japanese multi-byte encodings
    "gbk", "cp936", "ms936", "gb18030",
    "shift_jis", "ms_kanji", "csshiftjis", "euc-jp",
    "gb2312", "big5", "csbig5",
    # Windows code pages and others
    "windows-1250", "windows-1251", "windows-1252", "windows-1253",
    "windows-1254", "windows-1255", "windows-1256", "windows-1257",
    "windows-1258",
    "tis-620", "hz-gb-2312",
])