· last year · Mar 03, 2024, 04:25 AM
1import sqlite3
2import time
3from uuid import uuid4
4
5
def dbConn() -> sqlite3.Connection:
    """Open and return a connection to the local test database file."""
    return sqlite3.connect('fileBlockData_mike_test.db')
10
11
def recreate_tables(conn, truncate_prod_hash=False, truncate_prod_hash_auto=False):
    """Ensure the prodHash and prodHashAuto tables exist, optionally emptying them.

    Args:
        conn: Open sqlite3 connection; committed once at the end.
        truncate_prod_hash: When True, delete every prodHash row and reset
            its AUTOINCREMENT counter.
        truncate_prod_hash_auto: Same, for prodHashAuto.
    """
    cursor = conn.cursor()
    print("Creating prodHash table")
    cursor.execute("""
        CREATE TABLE IF NOT EXISTS prodHash (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            fullData TEXT
        );
    """)
    if truncate_prod_hash:
        print("truncating prodhash table")
        _truncate_table(cursor, "prodHash")

    print("creating prodHashAuto table")
    cursor.execute("""
        CREATE TABLE IF NOT EXISTS prodHashAuto (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            md5hash TEXT,
            fileInfo TEXT,
            offsetLocation TEXT,
            decay INTEGER
        );
    """)
    if truncate_prod_hash_auto:
        print("truncating prodhashAuto table")
        _truncate_table(cursor, "prodHashAuto")
    conn.commit()


def _truncate_table(cursor, table):
    """Delete all rows from *table* and reset its AUTOINCREMENT counter.

    The counter lives in the internal sqlite_sequence table; guard the
    reset with an existence check so this never raises
    "no such table: sqlite_sequence" on a database where it is absent.
    *table* is an internal constant (never user input), so interpolating
    it into the DELETE statement is safe here.
    """
    cursor.execute(f"DELETE FROM {table};")
    has_seq = cursor.execute(
        "SELECT name FROM sqlite_master WHERE type='table' AND name='sqlite_sequence'"
    ).fetchone()
    if has_seq:
        cursor.execute("DELETE FROM sqlite_sequence WHERE name=?;", (table,))
49
50
def record_sqlite_latency(title, frequency=1_000_000):
    """Return (start, stop) callables for timing repeated operations.

    Call start() before each operation and stop() after it. Every
    *frequency* completed operations a running average is printed;
    stop(final=True) prints the overall totals. stop(count=n) attributes
    one timed span to n operations (used for batched work).

    Args:
        title: Label used in the printed metrics.
        frequency: How many completed operations between progress prints.

    Returns:
        A (start, stop) tuple of closures sharing the counters below.
    """
    time_func = time.perf_counter_ns
    called = 0
    start_time = time_func()
    total_time = 0
    last_check = 0

    def start():
        # Mark the beginning of a timed operation.
        nonlocal start_time
        start_time = time_func()

    def stop(final=False, count=1):
        nonlocal called, start_time, total_time, last_check
        if start_time is not None:
            called += count
            total_time += (time_func() - start_time)
            # Ignore a repeated stop() with no intervening start().
            start_time = None
            if called - last_check >= frequency:
                last_check = called
                avg = (total_time / called)
                units = "nanosecs"
                if avg > 10_000:
                    units = "ms"
                    avg = avg / 1_000_000
                print(f"sqlite {title} taking {avg:.3f} {units} on avg. ({called:,} total {title})")

        if final:
            if called == 0:
                # Nothing was ever timed; avoid dividing by zero.
                print(f"sqlite {title} recorded no operations")
                return
            total_time_secs = total_time / 1_000_000_000
            avg = (total_time / called)
            print(
                f"sqlite {title} took {total_time_secs:.3f} secs total and {avg:.3f} nanosecs on avg. ({title} {called:,} total)")
            return

    return start, stop
85
86
def create_prod_hash(conn: sqlite3.Connection, count: int = 513_057_999):
    """Insert *count* synthetic rows into prodHash.

    Each row's fullData is "<random md5-style hex> filename_<i>.txt 0 0",
    matching the format scan_all_rows() later parses. Commits once after
    all inserts and prints latency metrics throughout.

    Args:
        conn: Open sqlite3 connection with the prodHash table present.
        count: Number of rows to insert (defaults to the original
            hard-coded 513,057,999 for backward compatibility).
    """
    cur: sqlite3.Cursor = conn.cursor()
    operation_start, operation_end = record_sqlite_latency("operation")
    print(f"Starting process to insert {count:,} rows into prodHash")
    for i in range(count):
        operation_start()
        # uuid4().hex gives 32 random hex chars, standing in for an md5 hash.
        md5 = uuid4().hex
        cur.execute("INSERT INTO prodHash (fullData) VALUES(?)", (f"{md5} filename_{i}.txt 0 0",))
        operation_end()
    conn.commit()
    operation_end(final=True)
99
100
def scan_all_rows(conn: sqlite3.Connection):
    """Copy every prodHash row into prodHashAuto, parsing fullData into columns.

    Streams prodHash in chunks of 2000 rows, splits each fullData value
    ("<md5> <filename> <a> <b>") into (md5hash, fileInfo, offsetLocation,
    decay=0) and bulk-inserts the chunk. Commits once at the end and
    CLOSES the connection before printing the final metrics.
    """
    source_cursor: sqlite3.Cursor = conn.cursor()
    # Obtain the metric/progress logging closures.
    operation_start, operation_end = record_sqlite_latency("processchunk")

    print("starting to iterate everything in prodHash")
    result = source_cursor.execute("select * from prodHash")
    chunk_size = 2000
    while True:
        # Time one fetch-parse-insert cycle.
        operation_start()
        chunk = result.fetchmany(chunk_size)
        if not chunk:
            break
        converted_rows = []
        for row in chunk:
            fields = row[1].split()
            converted_rows.append((fields[0], fields[1], f"{fields[2]} {fields[3]}", 0))
        if converted_rows:
            conn.executemany("INSERT INTO prodHashAuto (md5hash, fileInfo, offsetLocation, decay) VALUES(?, ?, ?, ?)",
                             converted_rows)
        # Attribute the timed span to every row in the chunk.
        operation_end(count=len(chunk))
    conn.commit()
    conn.close()

    # Final metrics (does not touch the now-closed connection).
    operation_end(final=True)
133
134
if __name__ == "__main__":
    # Open the database, ensure both tables exist (emptying the
    # destination table), then migrate prodHash rows into prodHashAuto.
    database = dbConn()
    recreate_tables(database, truncate_prod_hash_auto=True)
    # create_prod_hash(database)
    scan_all_rows(database)
141