· 6 months ago · Mar 24, 2025, 03:40 AM
1-- Desc: Analysis of backblaze SMART drive data using DuckDB
2-- URL: https://www.backblaze.com/cloud-storage/resources/hard-drive-test-data
3-- Auth: Brian Dill
4-- Date: 2025-03-23
5
6-- create/attach to DB (My D:\ drive is a 2.5" SSD)
7Attach "D:/DuckDB/backblaze.duckdb";
8
9-- ================================================================================
10--DROP TABLE drive_days;
11CREATE TABLE drive_days (
12 date DATE
13 , serial_number VARCHAR
14 , model VARCHAR
15 , capacity_bytes BIGINT
16 , failure BIGINT
17-- , smart_5_normalized VARCHAR
18-- , smart_5_raw BIGINT
19-- , smart_187_normalized VARCHAR
20-- , smart_187_raw BIGINT
21-- , smart_188_normalized VARCHAR
22-- , smart_188_raw BIGINT
23-- , smart_197_normalized VARCHAR
24-- , smart_197_raw BIGINT
25-- , smart_198_normalized VARCHAR
26-- , smart_198_raw BIGINT
27);
28
29-- ================================================================================
30-- Load CSV files
31-- My choice of "ORDER BY model, serial_number, date" is to attain maximum columnar compression by ordering by the lowest cardinality first
32
33-- My K:\ drive IS a WD RED 4GB spindle HDD AND IS WHERE I unzipped ALL OF the *.csv files.
34SET VARIABLE path = "K:/R/Backblaze/";
35/*
36--CREATE OR REPLACE TABLE drive_days AS SELECT * FROM read_csv_auto(CONCAT( getvariable(PATH), "2014/*.csv"), header=True, filename=False)
37-- 2.8 GB in 1:08 (the csv files total 2.8 GB and the import took 1 min and 8 seconds)
38INSERT INTO main.drive_days SELECT date, serial_number, model, capacity_bytes, failure FROM read_csv(CONCAT( getvariable(PATH), "2014/*.csv"), header=True) ORDER BY model, serial_number, date;
39-- 4.2 GB in 1:22
40INSERT INTO main.drive_days SELECT date, serial_number, model, capacity_bytes, failure FROM read_csv(CONCAT( getvariable(PATH), "2015/*.csv"), header=True) ORDER BY model, serial_number, date;
41-- 6.0 GB in 1:55
42INSERT INTO main.drive_days SELECT date, serial_number, model, capacity_bytes, failure FROM read_csv(CONCAT( getvariable(PATH), "2016/*.csv"), header=True) ORDER BY model, serial_number, date;
43-- 7.6 GB in 2:34
44INSERT INTO main.drive_days SELECT date, serial_number, model, capacity_bytes, failure FROM read_csv(CONCAT( getvariable(PATH), "2017/*.csv"), header=True) ORDER BY model, serial_number, date;
45-- 10.0 GB in 3:10
46INSERT INTO main.drive_days SELECT date, serial_number, model, capacity_bytes, failure FROM read_csv(CONCAT( getvariable(PATH), "2018/*.csv"), header=True) ORDER BY model, serial_number, date;
47-- 11.8 GB in 3:28
48INSERT INTO main.drive_days SELECT date, serial_number, model, capacity_bytes, failure FROM read_csv(CONCAT( getvariable(PATH), "2019/*.csv"), header=True) ORDER BY model, serial_number, date;
49-- 15.5 GB in 5:30
50INSERT INTO main.drive_days SELECT date, serial_number, model, capacity_bytes, failure FROM read_csv(CONCAT( getvariable(PATH), "2020/*.csv"), header=True) ORDER BY model, serial_number, date;
51-- 21.5 GB in 13:36
52INSERT INTO main.drive_days SELECT date, serial_number, model, capacity_bytes, failure FROM read_csv(CONCAT( getvariable(PATH), "2021/*.csv"), header=True) ORDER BY model, serial_number, date;
53-- 26.9 GB in 25:35
54INSERT INTO main.drive_days SELECT date, serial_number, model, capacity_bytes, failure FROM read_csv(CONCAT( getvariable(PATH), "2022/*.csv"), header=True) ORDER BY model, serial_number, date;
55-- the import started taking too long (exceeding my 32GB RAM), so I sub-divided the CSV files into quarters rather than year
56-- 7.1 GB in 1:21
57INSERT INTO main.drive_days SELECT date, serial_number, model, capacity_bytes, failure FROM read_csv(CONCAT( getvariable(PATH), "2023/2023Q1/*.csv"), header=True) ORDER BY model, serial_number, date;
58-- 7.6 GB in 1:26
59INSERT INTO main.drive_days SELECT date, serial_number, model, capacity_bytes, failure FROM read_csv(CONCAT( getvariable(PATH), "2023/2023Q2/*.csv"), header=True) ORDER BY model, serial_number, date;
60-- 8.7 GB in 1:40
61INSERT INTO main.drive_days SELECT date, serial_number, model, capacity_bytes, failure FROM read_csv(CONCAT( getvariable(PATH), "2023/2023Q3/*.csv"), header=True) ORDER BY model, serial_number, date;
62-- 9.1 GB in 1:49
63INSERT INTO main.drive_days SELECT date, serial_number, model, capacity_bytes, failure FROM read_csv(CONCAT( getvariable(PATH), "2023/2023Q4/*.csv"), header=True) ORDER BY model, serial_number, date;
64-- 9.3 GB in 1:49
65INSERT INTO main.drive_days SELECT date, serial_number, model, capacity_bytes, failure FROM read_csv(CONCAT( getvariable(PATH), "2024/2024Q1/*.csv"), header=True) ORDER BY model, serial_number, date;
66-- 9.6 GB in 1:53
67INSERT INTO main.drive_days SELECT date, serial_number, model, capacity_bytes, failure FROM read_csv(CONCAT( getvariable(PATH), "2024/2024Q2/*.csv"), header=True) ORDER BY model, serial_number, date;
68-- 9.9 GB in 1:59
69INSERT INTO main.drive_days SELECT date, serial_number, model, capacity_bytes, failure FROM read_csv(CONCAT( getvariable(PATH), "2024/2024Q3/*.csv"), header=True) ORDER BY model, serial_number, date;
70-- 10.2 GB in 2:04
71INSERT INTO main.drive_days SELECT date, serial_number, model, capacity_bytes, failure FROM read_csv(CONCAT( getvariable(PATH), "2024/2024Q4/*.csv"), header=True) ORDER BY model, serial_number, date;
72*/
73-- ================================================================================
74
75SELECT * FROM main.drive_days LIMIT 10;
76
77DESCRIBE drive_days;
78SUMMARIZE drive_days; -- 0:34
79
80-- Total count - almost 600 Million rows
81SELECT count(*) AS N FROM main.drive_days; -- 559,474,362
82
83-- Count by year
84SELECT year(date) AS "year", count(*) AS N FROM main.drive_days GROUP BY year(date); -- < 1 sec
85
86-- count by day
87SELECT date, count(*) AS N FROM main.drive_days GROUP BY date; -- < 1 sec
88SELECT date, model, count(*) AS N FROM main.drive_days GROUP BY date, model; -- < 1 sec
89SELECT date, S.make, count(*) AS N FROM main.drive_days AS DD JOIN main.serials AS S ON S.model = DD.model GROUP BY date, S.make; -- < 1 sec
90
91-- What are the list of models?
92SELECT model, count(*) AS model_drive_days FROM main.drive_days GROUP BY ALL ORDER BY 2 DESC; -- 171 models
93
94-- how many models were recorded for each day?
95SELECT model, date, count(*) AS N FROM main.drive_days GROUP BY ALL HAVING count(*) > 100 ORDER BY model, date;
96
97-- ================================================================================
98-- Export each year's data to a parquet file.
99
100COPY (SELECT * FROM drive_days WHERE date >= '2014-01-01' AND date < '2015-01-01' ORDER BY model, serial_number, date) TO "D:/DuckDB/drive_days_2014.parquet" (FORMAT parquet); -- 0:06
101COPY (SELECT * FROM drive_days WHERE date >= '2015-01-01' AND date < '2016-01-01' ORDER BY model, serial_number, date) TO "D:/DuckDB/drive_days_2015.parquet" (FORMAT parquet); -- 0:08
102COPY (SELECT * FROM drive_days WHERE date >= '2016-01-01' AND date < '2017-01-01' ORDER BY model, serial_number, date) TO "D:/DuckDB/drive_days_2016.parquet" (FORMAT parquet); -- 0:12
103COPY (SELECT * FROM drive_days WHERE date >= '2017-01-01' AND date < '2018-01-01' ORDER BY model, serial_number, date) TO "D:/DuckDB/drive_days_2017.parquet" (FORMAT parquet); -- 0:16
104COPY (SELECT * FROM drive_days WHERE date >= '2018-01-01' AND date < '2019-01-01' ORDER BY model, serial_number, date) TO "D:/DuckDB/drive_days_2018.parquet" (FORMAT parquet); -- 0:22
105COPY (SELECT * FROM drive_days WHERE date >= '2019-01-01' AND date < '2020-01-01' ORDER BY model, serial_number, date) TO "D:/DuckDB/drive_days_2019.parquet" (FORMAT parquet); -- 0:29
106COPY (SELECT * FROM drive_days WHERE date >= '2020-01-01' AND date < '2021-01-01' ORDER BY model, serial_number, date) TO "D:/DuckDB/drive_days_2020.parquet" (FORMAT parquet); -- 0:40
107COPY (SELECT * FROM drive_days WHERE date >= '2021-01-01' AND date < '2022-01-01' ORDER BY model, serial_number, date) TO "D:/DuckDB/drive_days_2021.parquet" (FORMAT parquet); -- 1:15
108COPY (SELECT * FROM drive_days WHERE date >= '2022-01-01' AND date < '2023-01-01' ORDER BY model, serial_number, date) TO "D:/DuckDB/drive_days_2022.parquet" (FORMAT parquet); -- 1:23
109COPY (SELECT * FROM drive_days WHERE date >= '2023-01-01' AND date < '2024-01-01' ORDER BY model, serial_number, date) TO "D:/DuckDB/drive_days_2023.parquet" (FORMAT parquet); -- 1:46
110COPY (SELECT * FROM drive_days WHERE date >= '2024-01-01' AND date < '2025-01-01' ORDER BY model, serial_number, date) TO "D:/DuckDB/drive_days_2024.parquet" (FORMAT parquet); -- 2:15
111
112-- ================================================================================
113-- Create the "serials" table to store one row per serial number.
114-- DROP TABLE main.serials;
115CREATE TABLE main.serials (
116 make varchar -- derive FROM the model
117 , model varchar
118 , serial_number varchar
119 , capacity_bytes bigint
120 , capacity_gb DECIMAL(12,3) -- calc TO have an easier number TO grasp
121 , failure bigint -- Did this SPECIFIC drive fail?
122 , min_date date -- what IS the FIRST recorded date OF service?
123 , max_date date -- what IS the LAST recorded date OF service?
124 , total_days INT -- how many days was it IN service?
125);
126
127INSERT INTO main.serials -- time TO load: 0:18 (56 sec when using median)
128SELECT DISTINCT
129 max(CASE WHEN LEFT(model, 3) = 'HGS' THEN 'Hitachi'
130 WHEN LEFT(model, 3) = 'Hit' THEN 'Hitachi'
131 WHEN LEFT(model, 3) = 'SAM' THEN 'Samsung'
132 WHEN LEFT(model, 3) = 'Sam' THEN 'Samsung'
133 WHEN LEFT(model, 2) = 'ST' THEN 'Seagate'
134 WHEN LEFT(model, 3) = 'TOS' THEN 'Toshiba'
135 WHEN LEFT(model, 3) = 'WDC' THEN 'Western Digital'
136 WHEN LEFT(model, 2) = 'WD' THEN 'Western Digital'
137 WHEN LEFT(model, 3) = 'DEL' THEN 'DELL'
138 WHEN LEFT(model, 3) = 'Sea' THEN 'Seagate'
139 WHEN LEFT(model, 3) = 'MTF' THEN 'Micron'
140 WHEN LEFT(model, 3) = 'Mic' THEN 'Micron'
141 WHEN LEFT(model, 2) = 'CT' THEN 'Crucial' END
142 ) AS make
143 , max(model) AS model
144 , serial_number
145 , median(capacity_bytes) AS capacity_bytes
146 , median(capacity_bytes) / (1024 * 1024 * 1024) AS capacity_GB
147 , max(failure) AS failure
148 , min(date) AS min_date
149 , max(date) AS max_date
150 , datediff('d', min(date), max(date)) AS total_days
151FROM main.drive_days
152GROUP BY serial_number
153ORDER BY make, model, serial_number;
154
155-- COPY (SELECT * FROM serials ORDER BY make, model, serial_number) TO "D:/DuckDB/serials.parquet" (FORMAT parquet); -- < 1 sec
156-- COPY (SELECT * FROM serials ORDER BY make, model, serial_number) TO "D:/DuckDB/serials.csv" (FORMAT csv); -- < 1sec
157
158-- ================================================================================
159
160SELECT * FROM main.serials LIMIT 100;
161
162SELECT COUNT(*) AS N FROM main.serials; -- Gross count: 444,500
163SELECT COUNT(*) AS N FROM main.serials WHERE failure = 0 AND max_date = '2024-12-31'; -- active: 305,029 (not failed and running on last day)
164SELECT COUNT(*) AS N FROM main.serials WHERE failure = 0 AND max_date < '2024-12-31'; -- retired: 113,444
165SELECT COUNT(*) AS N FROM main.serials WHERE failure = 1; -- Failed: 26,027
166
167
168SELECT * FROM main.serials WHERE failure = 1 ORDER BY make, model, total_days DESC ;
169
170-- How many drives of each make
171SELECT make, count(serial_number) AS NumOfDrives, min(min_date) AS first_used, max(max_date) AS last_used--, sum(failure) AS failures
172FROM main.serials GROUP BY make ORDER BY NumOfDrives DESC;
173
174-- How many drives of each model (167 models)
175SELECT make, model
176 , count(serial_number) AS NumOfDrives
177 , MEDIAN(capacity_gb::INT) AS gb
178 , min(min_date) AS first_used
179 , CASE max(max_date) WHEN '2024-12-31' THEN NULL ELSE max(max_date) END AS retired_on
180 , sum(total_days) AS total_drive_days
181 , (sum(total_days) * 1.0 / count(serial_number))::INT AS avg_drive_days
182FROM main.serials
183GROUP BY make, model
184HAVING NumOfDrives > 100
185ORDER BY make, NumOfDrives DESC;
186
187-- 3 outliers in capacity
188SELECT * FROM main.serials ORDER BY capacity_GB DESC ;
189
190-- ================================================================================
191-- DROP VIEW IF EXISTS vw_serials_status;
192CREATE VIEW main.vw_serials_status
193AS
194SELECT COALESCE(make, 'other') AS make
195 , model
196 , serial_number
197 , capacity_gb::INT AS capacity_gb
198 , total_days
199 , min_date AS first_used
200 , max_date AS last_used
201 , failure
202 , CASE WHEN max_date = '2024-12-31' AND failure = 0 THEN 1 ELSE 0 END AS is_active
203 , CASE WHEN max_date < '2024-12-31' AND failure = 0 THEN 'retired'
204 WHEN max_date = '2024-12-31' AND failure = 0 THEN 'live'
205 WHEN failure = 1 THEN 'failed'
206 ELSE '' END AS status
207FROM main.serials
208WHERE capacity_gb > 0
209;
210
211SELECT * FROM vw_serials_status LIMIT 10;
212
213-- What makes have the most drive days?
214SELECT make
215 , sum(total_days) AS total_days
216 , count(*) AS num_drives
217 , (sum(total_days) / count(*))::INT AS avg_days_per_drive
218FROM main.vw_serials_status
219GROUP BY make
220ORDER BY total_days DESC;
221
222-- What models have the most drive days?
223SELECT make, model, capacity_gb
224 , sum(total_days) AS total_days
225 , count(*) AS num_drives
226 , (sum(total_days) / count(*))::INT AS avg_days_per_drive
227 , min(first_used) AS first_used
228 , max(last_used) AS last_used
229 , CASE WHEN max(last_used) < '2024-12-31' THEN (max(last_used))::VARCHAR(10) ELSE 'active' END AS retired_on
230FROM main.vw_serials_status
231GROUP BY ALL
232ORDER BY total_days DESC ;
233
234-- ================================================================================
235-- create make and model views
236-- create view to calculate status and pivot on status by make
237-- ================================================================================
238DROP VIEW IF EXISTS main.vw_serials_make_status_piv;
239CREATE VIEW main.vw_serials_make_status_piv
240AS
241SELECT *
242 , (live+failed+retired) AS total
243 , (failed * 100.0 / (live+failed+retired))::DECIMAL(5,2) AS pct_failed_drives
244FROM (
245 SELECT make, status, count(*) AS N
246 FROM main.vw_serials_status
247 GROUP BY make, status
248 ORDER BY make, status
249) AS X
250PIVOT (
251 COALESCE(MAX(N), 0) FOR status IN ('live', 'failed', 'retired')
252) AS PIV
253;
254
255-- SELECT * FROM main.vw_serials_make_status_piv ORDER BY live DESC ;
256
257-- ================================================================================
258-- create view to calculate status and pivot on status by model
259DROP VIEW IF EXISTS main.vw_serials_model_status_piv;
260CREATE VIEW main.vw_serials_model_status_piv
261AS
262SELECT *
263 , (live+failed+retired) AS total
264 , (failed * 100.0 / (live+failed+retired))::DECIMAL(5,2) AS pct_failed_drives
265FROM (
266 SELECT make, model, capacity_gb, status, count(*) AS N
267 FROM main.vw_serials_status
268 GROUP BY make, model, capacity_gb, status
269 ORDER BY make, model, capacity_gb, status
270) AS X
271PIVOT (
272 COALESCE(MAX(N), 0) FOR status IN ('live', 'failed', 'retired')
273) AS PIV
274;
275
276-- SELECT * FROM main.vw_serials_model_status_piv ORDER BY live DESC ;
277-- SELECT * FROM main.vw_serials_model_status_piv WHERE model = 'ST12000NM0007' ORDER BY model DESC ; -- one reported as 0GB
278
279-- ================================================================================
280-- make level analysis
281-- Use pivot make status view to factor in total drive days
282-- ================================================================================
283SELECT V.*
284 , S.total_drive_days
285 , (failed / (S.total_drive_days / 1000000))::DECIMAL(9,2) AS failures_per_million_drive_days
286FROM vw_serials_make_status_piv AS V
287JOIN (SELECT make, SUM(total_days) AS total_drive_days FROM main.vw_serials_status GROUP BY make) AS S ON S.make = V.make
288ORDER BY V.live DESC
289;
290
291
292-- ================================================================================
293-- model level analysis
294-- Use pivot model status view to factor in total drive days
295-- ================================================================================
296SELECT V.*
297 , S.total_drive_days
298 , (failed / (S.total_drive_days / 1000000))::DECIMAL(9,2) AS failures_per_million_drive_days
299 , S.first_used::VARCHAR(10) AS first_used
300 --, S.last_used::varchar(10) AS last_used
301 , CASE WHEN S.last_used = '2024-12-31' THEN 'active' ELSE S.last_used::varchar(10) END AS retired_on
302FROM vw_serials_model_status_piv AS V
303JOIN (SELECT make, model
304 , SUM(total_days) AS total_drive_days
305 , min(first_used) AS first_used
306 , max(last_used) AS last_used
307 FROM main.vw_serials_status
308 GROUP BY make, model) AS S ON S.model = V.model
309--WHERE V.model = 'ST12000NM0007'
310WHERE total > 100
311ORDER BY failures_per_million_drive_days
312--ORDER BY V.live DESC
313;
314
315
316/*
317ID SMART Attribute Description
3181 Raw Read Error Rate Rate of hardware read errors when accessing data.
3192 Throughput Performance Overall performance of the HDD (lower values indicate issues).
3203 Spin-Up Time Time it takes for the drive to reach full speed.
3214 Start/Stop Count Number of times the HDD has been started or stopped.
3225 Reallocated Sectors Count Number of bad sectors replaced with spare sectors (higher = failing drive).
3236 Read Channel Margin Internal measurement of read performance.
3247 Seek Error Rate Frequency of seek errors when positioning the drive head.
3258 Seek Time Performance Average time taken for seek operations.
3269 Power-On Hours (POH) Total number of hours the HDD has been powered on.
32710 Spin Retry Count Number of times the drive had to retry spinning up.
32811 Calibration Retry Count Number of failed calibration attempts.
32912 Power Cycle Count Number of times the HDD has been power-cycled (turned off/on).
33013 Soft Read Error Rate Number of corrected read errors (not always used).
331183 SATA Downshift Error Count Number of times the SATA link had to be downgraded.
332184 End-to-End Error Data integrity errors in the HDD cache/RAM.
333187 Reported Uncorrectable Errors Number of errors that could not be corrected.
334188 Command Timeout Number of commands that timed out due to hardware issues.
335189 High Fly Writes Occurs when the drive head is positioned incorrectly.
336190 Airflow Temperature (or Drive Temperature) Current temperature of the HDD (critical if too high).
337191 G-Sense Error Rate Number of errors due to shocks or vibrations.
338192 Power-Off Retract Count Number of times the drive head was parked due to power loss.
339193 Load Cycle Count Number of times the drive head has been loaded/unloaded.
340194 Temperature Temperature of the HDD in degrees Celsius.
341195 Hardware ECC Recovered Number of errors corrected by hardware Error Correction Code (ECC).
342196 Reallocation Event Count Number of times a bad sector was replaced.
343197 Current Pending Sector Count Number of unstable sectors awaiting reallocation (higher = bad).
344198 Uncorrectable Sector Count Number of sectors that could not be recovered (bad sign).
345199 UltraDMA CRC Error Count Data transfer errors due to faulty cables or connectors.
346200 Write Error Rate Rate of errors during write operations.
347201 Soft Read Error Rate Corrected errors during reads.
348202 Data Address Mark Errors Errors in address marking of the disk.
349220 Disk Shift Measures disk platter movement due to impact/shock.
350222 Loaded Hours Number of hours the HDD head has been loaded.
351223 Load Retry Count Number of failed attempts to load the drive head.
352224 Load Friction Measures resistance while loading the HDD head.
353226 Load-in Time Time taken for the HDD head to load.
354240 Head Flying Hours Total hours the read/write head has been operational.
355241 Total LBAs Written Total data written to the HDD in Logical Block Addresses (LBAs).
356242 Total LBAs Read Total data read from the HDD in LBAs.
357
358
359Key Indicators for Drive Failure
360If any of these values are high or increasing, your drive may be failing:
361
362Reallocated Sectors Count (ID 5) → Bad sectors that were replaced.
363Current Pending Sector Count (ID 197) → Unstable sectors waiting to be remapped.
364Uncorrectable Sector Count (ID 198) → Sectors that cannot be recovered.
365Reported Uncorrectable Errors (ID 187) → High values indicate serious errors.
366Command Timeout (ID 188) → Indicates drive delays or failures.
367*/