This is my implementation for converting a file from any encoding to UTF-8 without a BOM, replacing Windows line endings (CRLF) with the universal format (LF):
def utf8_converter(file_path, universal_endline=True):
    """
    Convert a text file in place to UTF-8 without BOM
    and using universal endline by default.

    The file is read as raw bytes, its encoding is guessed with
    ``chardet``, and the re-encoded content is written back to the
    same path.

    Parameters
    ----------
    file_path : string, file path ("~" is expanded and the path is
        resolved to its real location).
    universal_endline : boolean (True),
        by default convert Windows endlines (CRLF) to universal
        format (LF).

    Returns
    -------
    int
        Always 0.

    Raises
    ------
    ValueError
        If the encoding of a non-empty file cannot be detected.
    """
    # Fix file path
    file_path = os.path.realpath(os.path.expanduser(file_path))

    # Read raw bytes: binary mode keeps the content untouched (no implicit
    # decoding, no newline translation) on every platform and Python version.
    with open(file_path, 'rb') as source:
        data = source.read()

    # An empty file has no encoding to detect and nothing to convert.
    if not data:
        return 0

    # Decode
    encoding = chardet.detect(data)['encoding']
    if encoding is None:
        # chardet reports None when it cannot make a guess; fail with a
        # clear message instead of an obscure error from decode(None).
        raise ValueError(
            'could not detect the encoding of %r' % (file_path,)
        )
    text = data.decode(encoding)

    # Remove windows end line
    if universal_endline:
        text = text.replace('\r\n', '\n')

    # Encode to UTF-8
    data = text.encode('utf8')

    # Remove BOM: a leading U+FEFF that survived decoding is re-encoded as
    # the UTF-8 BOM, so strip it at the bytes level.
    if data.startswith(codecs.BOM_UTF8):
        data = data[len(codecs.BOM_UTF8):]

    # Write to file (binary mode, so '\n' is not turned back into '\r\n')
    with open(file_path, 'wb') as target:
        target.write(data)
    return 0