149 lines
5.1 KiB
Python
149 lines
5.1 KiB
Python
"""
|
|
This module contains the list of possible encodings, taken from
|
|
the standard library documentation:
|
|
https://docs.python.org/3.4/library/codecs.html#standard-encodings
|
|
|
|
"""
|
|
|
|
#: Encodings map: maps a normalized codec name (lower case, underscores
#: instead of dashes) to a ``(display alias, language)`` pair.
ENCODINGS_MAP = {
    "ascii": ("US-ASCII", "English"),
    "big5": ("Big5", "Chinese traditional"),
    "big5hkscs": ("Big5-HKSCS", "Chinese traditional"),
    "cp037": ("IBM037", "English"),
    # The alias was previously misspelled "IBM242" (transposed digits).
    "cp424": ("IBM424", "Hebrew"),
    "cp437": ("IBM437", "English"),
    "cp500": ("IBM500", "Western"),
    "cp737": ("IBM737", "Greek"),
    "cp775": ("IBM775", "Baltic"),
    "cp850": ("IBM850", "Western"),
    "cp852": ("IBM852", "Central European"),
    "cp855": ("IBM855", "Cyrillic"),
    "cp856": ("IBM856", "Hebrew"),
    "cp857": ("IBM857", "Turkish"),
    "cp860": ("IBM860", "Portuguese"),
    "cp861": ("IBM861", "Icelandic"),
    "cp862": ("IBM862", "Hebrew"),
    "cp863": ("IBM863", "Canadian"),
    "cp864": ("IBM864", "Arabic"),
    "cp865": ("IBM865", "Nordic"),
    "cp866": ("IBM866", "Russian"),
    "cp869": ("IBM869", "Greek"),
    "cp874": ("IBM874", "Thai"),
    "cp875": ("IBM875", "Greek"),
    "cp932": ("IBM932", "Japanese"),
    "cp949": ("IBM949", "Korean"),
    "cp950": ("IBM950", "Chinese traditional"),
    "cp1006": ("IBM1006", "Urdu"),
    "cp1026": ("IBM1026", "Turkish"),
    "cp1140": ("IBM1140", "Western"),
    "cp1250": ("windows-1250", "Central European"),
    "cp1251": ("windows-1251", "Cyrillic"),
    "cp1252": ("windows-1252", "Western"),
    "cp1253": ("windows-1253", "Greek"),
    # Language names must not carry stray whitespace, otherwise grouping or
    # sorting by language splits "Turkish"/"Arabic" into separate buckets.
    "cp1254": ("windows-1254", "Turkish"),
    "cp1255": ("windows-1255", "Hebrew"),
    "cp1256": ("windows-1256", "Arabic"),
    "cp1257": ("windows-1257", "Baltic"),
    "cp1258": ("windows-1258", "Vietnamese"),
    "euc_jp": ("EUC-JP", "Japanese"),
    "euc_jis_2004": ("EUC-JIS-2004", "Japanese"),
    "euc_jisx0213": ("EUC-JISX0213", "Japanese"),
    "euc_kr": ("EUC-KR", "Korean"),
    "gb2312": ("GB2312", "Chinese simplified"),
    "gbk": ("GBK", "Chinese unified"),
    "gb18030": ("GB18030", "Chinese unified"),
    "hz": ("HZ", "Chinese simplified"),
    "iso2022_jp": ("ISO-2022-JP", "Japanese"),
    "iso2022_jp_1": ("ISO-2022-JP-1", "Japanese"),
    "iso2022_jp_2": ("ISO-2022-JP-2", "Japanese"),
    "iso2022_jp_2004": ("ISO-2022-JP-2004", "Japanese"),
    "iso2022_jp_3": ("ISO-2022-JP-3", "Japanese"),
    "iso2022_jp_ext": ("ISO-2022-JP-EXT", "Japanese"),
    "iso2022_kr": ("ISO-2022-KR", "Korean"),
    "latin_1": ("ISO-8859-1", "Western"),
    "iso8859_2": ("ISO-8859-2", "Central European"),
    "iso8859_3": ("ISO-8859-3", "South European"),
    "iso8859_4": ("ISO-8859-4", "Baltic"),
    "iso8859_5": ("ISO-8859-5", "Cyrillic"),
    "iso8859_6": ("ISO-8859-6", "Arabic"),
    "iso8859_7": ("ISO-8859-7", "Greek"),
    "iso8859_8": ("ISO-8859-8", "Hebrew"),
    "iso8859_9": ("ISO-8859-9", "Turkish"),
    "iso8859_10": ("ISO-8859-10", "Nordic"),
    "iso8859_13": ("ISO-8859-13", "Baltic"),
    "iso8859_14": ("ISO-8859-14", "Celtic"),
    "iso8859_15": ("ISO-8859-15", "Western"),
    "johab": ("Johab", "Korean"),
    "koi8_r": ("KOI8-R", "Russian"),
    "koi8_u": ("KOI8-U", "Ukrainian"),
    "mac_cyrillic": ("MacCyrillic", "Cyrillic"),
    "mac_greek": ("MacGreek", "Greek"),
    "mac_iceland": ("MacIceland", "Icelandic"),
    "mac_latin2": ("MacCentralEurope", "Central European"),
    "mac_roman": ("MacRoman", "Western"),
    "mac_turkish": ("MacTurkish", "Turkish"),
    "ptcp154": ("PTCP154", "Cyrillic Asian"),
    "shift_jis": ("Shift_JIS", "Japanese"),
    "shift_jis_2004": ("Shift_JIS-2004", "Japanese"),
    "shift_jisx0213": ("Shift_JISX0213", "Japanese"),
    "utf_16": ("UTF-16", "Unicode"),
    "utf_16_be": ("UTF-16BE", "Unicode"),
    "utf_16_le": ("UTF-16LE", "Unicode"),
    "utf_7": ("UTF-7", "Unicode"),
    "utf_8": ("UTF-8", "Unicode"),
}
|
|
|
|
|
|
# Canonical codec key -> alternative spellings that must resolve to it.
# Aliases are written in their already-normalized form (lower case, dashes
# replaced by underscores).
# See https://github.com/pyQode/pyQode/issues/11
_CODEC_ALIAS_GROUPS = {
    'ascii': [
        'us_ascii',
        'us',
        'ansi_x3.4_1968',
        'cp367',
        'csascii',
        'ibm367',
        'iso_ir_6',
        'iso646_us',
        'iso_646.irv:1991',
    ],
    # The canonical key is 'utf_7' (underscore): a dashed 'utf-7' would
    # break the "no dashes" contract of convert_to_codec_key.
    'utf_7': [
        'csunicode11utf7',
        'unicode_1_1_utf_7',
        'unicode_2_0_utf_7',
        'x_unicode_1_1_utf_7',
        'x_unicode_2_0_utf_7',
    ],
    'utf_8': [
        'unicode_1_1_utf_8',
        'unicode_2_0_utf_8',
        'x_unicode_1_1_utf_8',
        'x_unicode_2_0_utf_8',
    ],
    # NOTE(review): 'utf_16le' resolves to 'utf_16' rather than 'utf_16_le';
    # kept as in the original table -- confirm this is intended.
    'utf_16': [
        'utf_16le',
        'ucs_2',
        'unicode',
        'iso_10646_ucs2',
    ],
    'latin_1': ['iso_8859_1'],
}

# Flat alias -> canonical key lookup, built once at import time so that each
# call is a single dict lookup instead of a scan over every alias list.
_CODEC_ALIASES = {
    alias: key
    for key, aliases in _CODEC_ALIAS_GROUPS.items()
    for alias in aliases
}


def convert_to_codec_key(value):
    """
    Normalize code key value (encoding codecs must be lower case and must
    not contain any dashes).

    An empty or ``None`` value falls back to UTF-8. Known aliases (e.g.
    ``US-ASCII``, ``ISO-8859-1``) are resolved to their canonical codec key;
    any other name is returned in its normalized form.

    :param value: value to convert.
    :return: the normalized codec key (e.g. ``'UTF-8'`` -> ``'utf_8'``).
    """
    if not value:
        # fallback to utf-8
        value = 'UTF-8'
    # UTF-8 -> utf_8
    converted = value.replace('-', '_').lower()
    # Resolve corner-case aliases; unknown names pass through unchanged.
    return _CODEC_ALIASES.get(converted, converted)
|