As much as I hate to rely on Windows Excel proprietary software, which is not cross-platform, my testing of csvkit
for .xls, which uses xlrd
under the hood, failed to correctly parse dates (even when using the commandline parameters to specify strptime format).
For example, this xls file, when parsed with csvkit
, will convert cell G1
of 12/31/2002
to 37621
, whereas when converted to csv via excel -> save_as (using below) cell G1
will be "December 31, 2002"
.
import re
import os
from win32com.client import Dispatch
xlCSVMSDOS = 24
class CsvConverter(object):
def __init__(self, *, input_dir, output_dir):
self._excel = None
self.input_dir = input_dir
self.output_dir = output_dir
if not os.path.isdir(self.output_dir):
os.makedirs(self.output_dir)
def isSheetEmpty(self, sheet):
# https://archive.is/RuxR7
# WorksheetFunction.CountA(ActiveSheet.UsedRange) = 0 And ActiveSheet.Shapes.Count = 0
return \
(not self._excel.WorksheetFunction.CountA(sheet.UsedRange)) \
and \
(not sheet.Shapes.Count)
def getNonEmptySheets(self, wb, as_name=False):
return [ \
(sheet.Name if as_name else sheet) \
for sheet in wb.Sheets \
if not self.isSheetEmpty(sheet) \
]
def saveWorkbookAsCsv(self, wb, csv_path):
non_empty_sheet_names = self.getNonEmptySheets(wb, as_name=True)
assert (len(non_empty_sheet_names) == 1), \
"Expected exactly 1 sheet but found %i non-empty sheets: '%s'" \
%(
len(non_empty_sheet_names),
"', '".join(name.replace("'", r"\'") for name in non_empty_sheet_names)
)
wb.Worksheets(non_empty_sheet_names[0]).SaveAs(csv_path, xlCSVMSDOS)
wb.Saved = 1
def isXlsFilename(self, filename):
return bool(re.search(r'(?i)\.xls$', filename))
def batchConvertXlsToCsv(self):
xls_names = tuple( filename for filename in next(os.walk(self.input_dir))[2] if self.isXlsFilename(filename) )
self._excel = Dispatch('Excel.Application')
try:
for xls_name in xls_names:
csv_path = os.path.join(self.output_dir, '%s.csv' %os.path.splitext(xls_name)[0])
if not os.path.isfile(csv_path):
workbook = self._excel.Workbooks.Open(os.path.join(self.input_dir, xls_name))
try:
self.saveWorkbookAsCsv(workbook, csv_path)
finally:
workbook.Close()
finally:
if not len(self._excel.Workbooks):
self._excel.Quit()
self._excel = None
if __name__ == '__main__':
self = CsvConverter(
input_dir='C:\\data\\xls\\',
output_dir='C:\\data\\csv\\'
)
self.batchConvertXlsToCsv()
The above will take an input_dir
containing .xls and output them to output_dir
as .csv -- it will assert
that there is exactly 1 non-empty sheet in the .xls; if you need to handle multiple sheets into multiple csv then you'll need to edit saveWorkbookAsCsv
.