We used the Python script below to load 12 million rows into a MySQL database on a laptop in 48 minutes. I'm sure it can be improved, but we were initially happy with that result.
import gzip, os, sys, mysql.connector

fd = gzip.open(os.path.join("c:\\users\\showard\\downloads", "orders.txt.gz"))
conn = mysql.connector.connect(host='127.0.0.1', user='root', passwd='******', db='test')
curInsert = conn.cursor()
# 22 placeholders, one per column of dcspp_order.
stmt = "insert into dcspp_order values(" + ",".join(["%s"] * 22) + ")"

rows = []
k = 0
for l in fd:
    # Strip the trailing newline so the last column is not polluted.
    t = l.decode().rstrip('\n').split(',')
    # Skip the header line and any malformed rows.
    if len(t) == 22 and t[0].find("orders.txt") == -1:
        k = k + 1
        # Treat empty fields as NULL.
        for i in range(22):
            if len(t[i]) == 0:
                t[i] = None
        try:
            rows.append(tuple(t))
            # Insert and commit in batches of 1,000 rows.
            if k % 1000 == 0:
                curInsert.executemany(stmt, rows)
                conn.commit()
                rows = []
                k = 0
        except Exception:
            print(rows[0])
            print(str(sys.exc_info()[1]))

# Flush the final partial batch (the original version silently dropped these rows).
if rows:
    curInsert.executemany(stmt, rows)
    conn.commit()
fd.close()
conn.close()
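As a quick sanity check after a run like this, the row count in the table can be compared with the source file. This is a minimal sketch, not part of the original script; it assumes the same connection settings and the dcspp_order table used above.

import mysql.connector

# Count what actually landed in the table after the batched load.
conn = mysql.connector.connect(host='127.0.0.1', user='root', passwd='******', db='test')
cur = conn.cursor()
cur.execute("select count(*) from dcspp_order")
print("rows loaded:", cur.fetchone()[0])
conn.close()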
However, given that we could load the uncompressed file in seven minutes with MySQL's own LOAD DATA statement, the choice was obvious.
mysql> load data local infile 'express_orders.txt' into table express_order fields terminated by ",";
Query OK, 12428870 rows affected, 966 warnings (7 min 15.24 sec)
Records: 12428870 Deleted: 0 Skipped: 0 Warnings: 966
mysql>
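For completeness, the same end-to-end path can be driven from Python: decompress the gzipped download, then hand the flat file to LOAD DATA LOCAL INFILE through mysql.connector. This is only a sketch under a few assumptions: that express_orders.txt is the uncompressed orders.txt.gz, that the credentials match the script above, and that local_infile is enabled on the server (the client side is switched on with allow_local_infile=True).

import gzip, shutil
import mysql.connector

src = r"c:\users\showard\downloads\orders.txt.gz"
dst = r"c:\users\showard\downloads\express_orders.txt"

# Decompress the gzipped download so LOAD DATA can read it directly.
with gzip.open(src, "rb") as f_in, open(dst, "wb") as f_out:
    shutil.copyfileobj(f_in, f_out)

# Issue the same LOAD DATA LOCAL INFILE statement shown above through mysql.connector.
# The MySQL server must also allow local_infile for this to be accepted.
conn = mysql.connector.connect(host='127.0.0.1', user='root', passwd='******',
                               db='test', allow_local_infile=True)
cur = conn.cursor()
cur.execute(
    "load data local infile 'c:/users/showard/downloads/express_orders.txt' "
    "into table express_order fields terminated by ','"
)
conn.commit()
conn.close()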