[python] Importing a CSV file into a sqlite3 database table using Python

I have a CSV file and I want to bulk-import this file into my sqlite3 database using Python. the command is ".import .....". but it seems that it cannot work like this. Can anyone give me an example of how to do it in sqlite3? I am using windows just in case. Thanks

This question is related to python database csv sqlite

The answer is


With this you can do joins on CSVs as well:

import sqlite3
import os
import pandas as pd
from typing import List

class CSVDriver:
    def __init__(self, table_dir_path: str):
        self.table_dir_path = table_dir_path  # where tables (ie. csv files) are located
        self._con = None

    @property
    def con(self) -> sqlite3.Connection:
        """Make a singleton connection to an in-memory SQLite database"""
        if not self._con:
            self._con = sqlite3.connect(":memory:")
        return self._con
    
    def _exists(self, table: str) -> bool:
        query = """
        SELECT name
        FROM sqlite_master 
        WHERE type ='table'
        AND name NOT LIKE 'sqlite_%';
        """
        tables = self.con.execute(query).fetchall()
        return table in tables

    def _load_table_to_mem(self, table: str, sep: str = None) -> None:
        """
        Load a CSV into an in-memory SQLite database
        sep is set to None in order to force pandas to auto-detect the delimiter
        """
        if self._exists(table):
            return
        file_name = table + ".csv"
        path = os.path.join(self.table_dir_path, file_name)
        if not os.path.exists(path):
            raise ValueError(f"CSV table {table} does not exist in {self.table_dir_path}")
        df = pd.read_csv(path, sep=sep, engine="python")  # set engine to python to skip pandas' warning
        df.to_sql(table, self.con, if_exists='replace', index=False, chunksize=10000)

    def query(self, query: str) -> List[tuple]:
        """
        Run an SQL query on CSV file(s). 
        Tables are loaded from table_dir_path
        """
        tables = extract_tables(query)
        for table in tables:
            self._load_table_to_mem(table)
        cursor = self.con.cursor()
        cursor.execute(query)
        records = cursor.fetchall()
        return records

extract_tables():

import sqlparse
from sqlparse.sql import IdentifierList, Identifier,  Function
from sqlparse.tokens import Keyword, DML
from collections import namedtuple
import itertools

class Reference(namedtuple('Reference', ['schema', 'name', 'alias', 'is_function'])):
    __slots__ = ()

    def has_alias(self):
        return self.alias is not None

    @property
    def is_query_alias(self):
        return self.name is None and self.alias is not None

    @property
    def is_table_alias(self):
        return self.name is not None and self.alias is not None and not self.is_function

    @property
    def full_name(self):
        if self.schema is None:
            return self.name
        else:
            return self.schema + '.' + self.name

def _is_subselect(parsed):
    if not parsed.is_group:
        return False
    for item in parsed.tokens:
        if item.ttype is DML and item.value.upper() in ('SELECT', 'INSERT',
                                                        'UPDATE', 'CREATE', 'DELETE'):
            return True
    return False


def _identifier_is_function(identifier):
    return any(isinstance(t, Function) for t in identifier.tokens)


def _extract_from_part(parsed):
    tbl_prefix_seen = False
    for item in parsed.tokens:
        if item.is_group:
            for x in _extract_from_part(item):
                yield x
        if tbl_prefix_seen:
            if _is_subselect(item):
                for x in _extract_from_part(item):
                    yield x
            # An incomplete nested select won't be recognized correctly as a
            # sub-select. eg: 'SELECT * FROM (SELECT id FROM user'. This causes
            # the second FROM to trigger this elif condition resulting in a
            # StopIteration. So we need to ignore the keyword if the keyword
            # FROM.
            # Also 'SELECT * FROM abc JOIN def' will trigger this elif
            # condition. So we need to ignore the keyword JOIN and its variants
            # INNER JOIN, FULL OUTER JOIN, etc.
            elif item.ttype is Keyword and (
                    not item.value.upper() == 'FROM') and (
                    not item.value.upper().endswith('JOIN')):
                tbl_prefix_seen = False
            else:
                yield item
        elif item.ttype is Keyword or item.ttype is Keyword.DML:
            item_val = item.value.upper()
            if (item_val in ('COPY', 'FROM', 'INTO', 'UPDATE', 'TABLE') or
                    item_val.endswith('JOIN')):
                tbl_prefix_seen = True
        # 'SELECT a, FROM abc' will detect FROM as part of the column list.
        # So this check here is necessary.
        elif isinstance(item, IdentifierList):
            for identifier in item.get_identifiers():
                if (identifier.ttype is Keyword and
                        identifier.value.upper() == 'FROM'):
                    tbl_prefix_seen = True
                    break


def _extract_table_identifiers(token_stream):
    for item in token_stream:
        if isinstance(item, IdentifierList):
            for ident in item.get_identifiers():
                try:
                    alias = ident.get_alias()
                    schema_name = ident.get_parent_name()
                    real_name = ident.get_real_name()
                except AttributeError:
                    continue
                if real_name:
                    yield Reference(schema_name, real_name,
                                    alias, _identifier_is_function(ident))
        elif isinstance(item, Identifier):
            yield Reference(item.get_parent_name(), item.get_real_name(),
                            item.get_alias(), _identifier_is_function(item))
        elif isinstance(item, Function):
            yield Reference(item.get_parent_name(), item.get_real_name(),
                            item.get_alias(), _identifier_is_function(item))


def extract_tables(sql):
    # let's handle multiple statements in one sql string
    extracted_tables = []
    statements = list(sqlparse.parse(sql))
    for statement in statements:
        stream = _extract_from_part(statement)
        extracted_tables.append([ref.name for ref in _extract_table_identifiers(stream)])
    return list(itertools.chain(*extracted_tables))

Example (assuming account.csv and tojoin.csv exist in /path/to/files):

db_path = r"/path/to/files"
driver = CSVDriver(db_path)
query = """
SELECT tojoin.col_to_join 
FROM account
LEFT JOIN tojoin
ON account.a = tojoin.a
"""
driver.query(query)

The following can also add fields' name based on the CSV header:

import sqlite3

def csv_sql(file_dir,table_name,database_name):
    con = sqlite3.connect(database_name)
    cur = con.cursor()
    # Drop the current table by: 
    # cur.execute("DROP TABLE IF EXISTS %s;" % table_name)

    with open(file_dir, 'r') as fl:
        hd = fl.readline()[:-1].split(',')
        ro = fl.readlines()
        db = [tuple(ro[i][:-1].split(',')) for i in range(len(ro))]

    header = ','.join(hd)
    cur.execute("CREATE TABLE IF NOT EXISTS %s (%s);" % (table_name,header))
    cur.executemany("INSERT INTO %s (%s) VALUES (%s);" % (table_name,header,('?,'*len(hd))[:-1]), db)
    con.commit()
    con.close()

# Example:
csv_sql('./surveys.csv','survey','eco.db')

I've found that it can be necessary to break up the transfer of data from the csv to the database in chunks as to not run out of memory. This can be done like this:

import csv
import sqlite3
from operator import itemgetter

# Establish connection
conn = sqlite3.connect("mydb.db")

# Create the table 
conn.execute(
    """
    CREATE TABLE persons(
        person_id INTEGER,
        last_name TEXT, 
        first_name TEXT, 
        address TEXT
    )
    """
)

# These are the columns from the csv that we want
cols = ["person_id", "last_name", "first_name", "address"]

# If the csv file is huge, we instead add the data in chunks
chunksize = 10000

# Parse csv file and populate db in chunks
with conn, open("persons.csv") as f:
    reader = csv.DictReader(f)

    chunk = []
    for i, row in reader: 

        if i % chunksize == 0 and i > 0:
            conn.executemany(
                """
                INSERT INTO persons
                    VALUES(?, ?, ?, ?)
                """, chunk
            )
            chunk = []

        items = itemgetter(*cols)(row)
        chunk.append(items)


Many thanks for bernie's answer! Had to tweak it a bit - here's what worked for me:

import csv, sqlite3
conn = sqlite3.connect("pcfc.sl3")
curs = conn.cursor()
curs.execute("CREATE TABLE PCFC (id INTEGER PRIMARY KEY, type INTEGER, term TEXT, definition TEXT);")
reader = csv.reader(open('PC.txt', 'r'), delimiter='|')
for row in reader:
    to_db = [unicode(row[0], "utf8"), unicode(row[1], "utf8"), unicode(row[2], "utf8")]
    curs.execute("INSERT INTO PCFC (type, term, definition) VALUES (?, ?, ?);", to_db)
conn.commit()

My text file (PC.txt) looks like this:

1 | Term 1 | Definition 1
2 | Term 2 | Definition 2
3 | Term 3 | Definition 3

#!/usr/bin/python
# -*- coding: utf-8 -*-

import sys, csv, sqlite3

def main():
    con = sqlite3.connect(sys.argv[1]) # database file input
    cur = con.cursor()
    cur.executescript("""
        DROP TABLE IF EXISTS t;
        CREATE TABLE t (COL1 TEXT, COL2 TEXT);
        """) # checks to see if table exists and makes a fresh table.

    with open(sys.argv[2], "rb") as f: # CSV file input
        reader = csv.reader(f, delimiter=',') # no header information with delimiter
        for row in reader:
            to_db = [unicode(row[0], "utf8"), unicode(row[1], "utf8")] # Appends data from CSV file representing and handling of text
            cur.execute("INSERT INTO neto (COL1, COL2) VALUES(?, ?);", to_db)
            con.commit()
    con.close() # closes connection to database

if __name__=='__main__':
    main()

import csv, sqlite3

def _get_col_datatypes(fin):
    dr = csv.DictReader(fin) # comma is default delimiter
    fieldTypes = {}
    for entry in dr:
        feildslLeft = [f for f in dr.fieldnames if f not in fieldTypes.keys()]        
        if not feildslLeft: break # We're done
        for field in feildslLeft:
            data = entry[field]

        # Need data to decide
        if len(data) == 0:
            continue

        if data.isdigit():
            fieldTypes[field] = "INTEGER"
        else:
            fieldTypes[field] = "TEXT"
    # TODO: Currently there's no support for DATE in sqllite

if len(feildslLeft) > 0:
    raise Exception("Failed to find all the columns data types - Maybe some are empty?")

return fieldTypes


def escapingGenerator(f):
    for line in f:
        yield line.encode("ascii", "xmlcharrefreplace").decode("ascii")


def csvToDb(csvFile,dbFile,tablename, outputToFile = False):

    # TODO: implement output to file

    with open(csvFile,mode='r', encoding="ISO-8859-1") as fin:
        dt = _get_col_datatypes(fin)

        fin.seek(0)

        reader = csv.DictReader(fin)

        # Keep the order of the columns name just as in the CSV
        fields = reader.fieldnames
        cols = []

        # Set field and type
        for f in fields:
            cols.append("\"%s\" %s" % (f, dt[f]))

        # Generate create table statement:
        stmt = "create table if not exists \"" + tablename + "\" (%s)" % ",".join(cols)
        print(stmt)
        con = sqlite3.connect(dbFile)
        cur = con.cursor()
        cur.execute(stmt)

        fin.seek(0)


        reader = csv.reader(escapingGenerator(fin))

        # Generate insert statement:
        stmt = "INSERT INTO \"" + tablename + "\" VALUES(%s);" % ','.join('?' * len(cols))

        cur.executemany(stmt, reader)
        con.commit()
        con.close()

You're right that .import is the way to go, but that's a command from the SQLite3.exe shell. A lot of the top answers to this question involve native python loops, but if your files are large (mine are 10^6 to 10^7 records), you want to avoid reading everything into pandas or using a native python list comprehension/loop (though I did not time them for comparison).

For large files, I believe the best option is to create the empty table in advance using sqlite3.execute("CREATE TABLE..."), strip the headers from your CSV files, and then use subprocess.run() to execute sqlite's import statement. Since the last part is I believe the most pertinent, I will start with that.

subprocess.run()

from pathlib import Path
db_name = Path('my.db').resolve()
csv_file = Path('file.csv').resolve()
result = subprocess.run(['sqlite3',
                         str(db_name),
                         '-cmd',
                         '.mode csv',
                         '.import '+str(csv_file).replace('\\','\\\\')
                                 +' <table_name>'],
                        capture_output=True)

Explanation
From the command line, the command you're looking for is sqlite3 my.db -cmd ".mode csv" ".import file.csv table". subprocess.run() runs a command line process. The argument to subprocess.run() is a sequence of strings which are interpreted as a command followed by all of it's arguments.

  • sqlite3 my.db opens the database
  • -cmd flag after the database allows you to pass multiple follow on commands to the sqlite program. In the shell, each command has to be in quotes, but here, they just need to be their own element of the sequence
  • '.mode csv' does what you'd expect
  • '.import '+str(csv_file).replace('\\','\\\\')+' <table_name>' is the import command.
    Unfortunately, since subprocess passes all follow-ons to -cmd as quoted strings, you need to double up your backslashes if you have a windows directory path.

Stripping Headers

Not really the main point of the question, but here's what I used. Again, I didn't want to read the whole files into memory at any point:

with open(csv, "r") as source:
    source.readline()
    with open(str(csv)+"_nohead", "w") as target:
        shutil.copyfileobj(source, target)


in the interest of simplicity, you could use the sqlite3 command line tool from the Makefile of your project.

%.sql3: %.csv
    rm -f $@
    sqlite3 $@ -echo -cmd ".mode csv" ".import $< $*"
%.dump: %.sql3
    sqlite3 $< "select * from $*"

make test.sql3 then creates the sqlite database from an existing test.csv file, with a single table "test". you can then make test.dump to verify the contents.


Creating an sqlite connection to a file on disk is left as an exercise for the reader ... but there is now a two-liner made possible by the pandas library

df = pandas.read_csv(csvfile)
df.to_sql(table_name, conn, if_exists='append', index=False)

Here are solutions that'll work if your CSV file is really big. Use to_sql as suggested by another answer, but set chunksize so it doesn't try to process the whole file at once.

import sqlite3
import pandas as pd

conn = sqlite3.connect('my_data.db')
c = conn.cursor()
users = pd.read_csv('users.csv')
users.to_sql('users', conn, if_exists='append', index = False, chunksize = 10000)

You can also use Dask, as described here to write a lot of Pandas DataFrames in parallel:

dto_sql = dask.delayed(pd.DataFrame.to_sql)
out = [dto_sql(d, 'table_name', db_url, if_exists='append', index=True)
       for d in ddf.to_delayed()]
dask.compute(*out)

See here for more details.


The .import command is a feature of the sqlite3 command-line tool. To do it in Python, you should simply load the data using whatever facilities Python has, such as the csv module, and inserting the data as per usual.

This way, you also have control over what types are inserted, rather than relying on sqlite3's seemingly undocumented behaviour.


My 2 cents (more generic):

import csv, sqlite3
import logging

def _get_col_datatypes(fin):
    dr = csv.DictReader(fin) # comma is default delimiter
    fieldTypes = {}
    for entry in dr:
        feildslLeft = [f for f in dr.fieldnames if f not in fieldTypes.keys()]
        if not feildslLeft: break # We're done
        for field in feildslLeft:
            data = entry[field]

            # Need data to decide
            if len(data) == 0:
                continue

            if data.isdigit():
                fieldTypes[field] = "INTEGER"
            else:
                fieldTypes[field] = "TEXT"
        # TODO: Currently there's no support for DATE in sqllite

    if len(feildslLeft) > 0:
        raise Exception("Failed to find all the columns data types - Maybe some are empty?")

    return fieldTypes


def escapingGenerator(f):
    for line in f:
        yield line.encode("ascii", "xmlcharrefreplace").decode("ascii")


def csvToDb(csvFile, outputToFile = False):
    # TODO: implement output to file

    with open(csvFile,mode='r', encoding="ISO-8859-1") as fin:
        dt = _get_col_datatypes(fin)

        fin.seek(0)

        reader = csv.DictReader(fin)

        # Keep the order of the columns name just as in the CSV
        fields = reader.fieldnames
        cols = []

        # Set field and type
        for f in fields:
            cols.append("%s %s" % (f, dt[f]))

        # Generate create table statement:
        stmt = "CREATE TABLE ads (%s)" % ",".join(cols)

        con = sqlite3.connect(":memory:")
        cur = con.cursor()
        cur.execute(stmt)

        fin.seek(0)


        reader = csv.reader(escapingGenerator(fin))

        # Generate insert statement:
        stmt = "INSERT INTO ads VALUES(%s);" % ','.join('?' * len(cols))

        cur.executemany(stmt, reader)
        con.commit()

    return con

You can do this using blaze & odo efficiently

import blaze as bz
csv_path = 'data.csv'
bz.odo(csv_path, 'sqlite:///data.db::data')

Odo will store the csv file to data.db (sqlite database) under the schema data

Or you use odo directly, without blaze. Either ways is fine. Read this documentation


If the CSV file must be imported as part of a python program, then for simplicity and efficiency, you could use os.system along the lines suggested by the following:

import os

cmd = """sqlite3 database.db <<< ".import input.csv mytable" """

rc = os.system(cmd)

print(rc)

The point is that by specifying the filename of the database, the data will automatically be saved, assuming there are no errors reading it.


Based on Guy L solution (Love it) but can handle escaped fields.

import csv, sqlite3

def _get_col_datatypes(fin):
    dr = csv.DictReader(fin) # comma is default delimiter
    fieldTypes = {}
    for entry in dr:
        feildslLeft = [f for f in dr.fieldnames if f not in fieldTypes.keys()]        
        if not feildslLeft: break # We're done
        for field in feildslLeft:
            data = entry[field]

            # Need data to decide
            if len(data) == 0:
                continue

            if data.isdigit():
                fieldTypes[field] = "INTEGER"
            else:
                fieldTypes[field] = "TEXT"
        # TODO: Currently there's no support for DATE in sqllite

    if len(feildslLeft) > 0:
        raise Exception("Failed to find all the columns data types - Maybe some are empty?")

    return fieldTypes


def escapingGenerator(f):
    for line in f:
        yield line.encode("ascii", "xmlcharrefreplace").decode("ascii")


def csvToDb(csvFile,dbFile,tablename, outputToFile = False):

    # TODO: implement output to file

    with open(csvFile,mode='r', encoding="ISO-8859-1") as fin:
        dt = _get_col_datatypes(fin)

        fin.seek(0)

        reader = csv.DictReader(fin)

        # Keep the order of the columns name just as in the CSV
        fields = reader.fieldnames
        cols = []

        # Set field and type
        for f in fields:
            cols.append("\"%s\" %s" % (f, dt[f]))

        # Generate create table statement:
        stmt = "create table if not exists \"" + tablename + "\" (%s)" % ",".join(cols)
        print(stmt)
        con = sqlite3.connect(dbFile)
        cur = con.cursor()
        cur.execute(stmt)

        fin.seek(0)


        reader = csv.reader(escapingGenerator(fin))

        # Generate insert statement:
        stmt = "INSERT INTO \"" + tablename + "\" VALUES(%s);" % ','.join('?' * len(cols))

        cur.executemany(stmt, reader)
        con.commit()
        con.close()

Examples related to python

programming a servo thru a barometer Is there a way to view two blocks of code from the same file simultaneously in Sublime Text? python variable NameError Why my regexp for hyphenated words doesn't work? Comparing a variable with a string python not working when redirecting from bash script is it possible to add colors to python output? Get Public URL for File - Google Cloud Storage - App Engine (Python) Real time face detection OpenCV, Python xlrd.biffh.XLRDError: Excel xlsx file; not supported Could not load dynamic library 'cudart64_101.dll' on tensorflow CPU-only installation

Examples related to database

Implement specialization in ER diagram phpMyAdmin - Error > Incorrect format parameter? Authentication plugin 'caching_sha2_password' cannot be loaded Room - Schema export directory is not provided to the annotation processor so we cannot export the schema SQL Query Where Date = Today Minus 7 Days MySQL Error: : 'Access denied for user 'root'@'localhost' SQL Server date format yyyymmdd How to create a foreign key in phpmyadmin WooCommerce: Finding the products in database TypeError: tuple indices must be integers, not str

Examples related to csv

Pandas: ValueError: cannot convert float NaN to integer Export result set on Dbeaver to CSV Convert txt to csv python script How to import an Excel file into SQL Server? "CSV file does not exist" for a filename with embedded quotes Save Dataframe to csv directly to s3 Python Data-frame Object has no Attribute (unicode error) 'unicodeescape' codec can't decode bytes in position 2-3: truncated \UXXXXXXXX escape How to write to a CSV line by line? How to check encoding of a CSV file

Examples related to sqlite

getting " (1) no such column: _id10 " error Laravel: PDOException: could not find driver auto create database in Entity Framework Core How to open .SQLite files Accessing an SQLite Database in Swift When does SQLiteOpenHelper onCreate() / onUpgrade() run? Attempt to write a readonly database - Django w/ SELinux error Android sqlite how to check if a record exists How can I add the sqlite3 module to Python? "Insert if not exists" statement in SQLite