encoding.py 1.1 KB

123456789101112131415161718192021222324252627282930313233343536
  1. import codecs
  2. import locale
  3. import re
  4. import sys
  5. from typing import List, Tuple
  6. BOMS: List[Tuple[bytes, str]] = [
  7. (codecs.BOM_UTF8, "utf-8"),
  8. (codecs.BOM_UTF16, "utf-16"),
  9. (codecs.BOM_UTF16_BE, "utf-16-be"),
  10. (codecs.BOM_UTF16_LE, "utf-16-le"),
  11. (codecs.BOM_UTF32, "utf-32"),
  12. (codecs.BOM_UTF32_BE, "utf-32-be"),
  13. (codecs.BOM_UTF32_LE, "utf-32-le"),
  14. ]
  15. ENCODING_RE = re.compile(rb"coding[:=]\s*([-\w.]+)")
  16. def auto_decode(data: bytes) -> str:
  17. """Check a bytes string for a BOM to correctly detect the encoding
  18. Fallback to locale.getpreferredencoding(False) like open() on Python3"""
  19. for bom, encoding in BOMS:
  20. if data.startswith(bom):
  21. return data[len(bom) :].decode(encoding)
  22. # Lets check the first two lines as in PEP263
  23. for line in data.split(b"\n")[:2]:
  24. if line[0:1] == b"#" and ENCODING_RE.search(line):
  25. result = ENCODING_RE.search(line)
  26. assert result is not None
  27. encoding = result.groups()[0].decode("ascii")
  28. return data.decode(encoding)
  29. return data.decode(
  30. locale.getpreferredencoding(False) or sys.getdefaultencoding(),
  31. )