· 7 years ago · Nov 04, 2018, 01:16 PM
"""Duplicate file finder.

Roughly records each file's sha256 and file path in sqlite3 and finds the
files that collide.
"""
5
6import argparse
7import functools
8import hashlib
9import itertools
10import logging
11import operator
12import pathlib
13import sqlite3
14
# Module-level logger named after this module. A NullHandler is attached so
# the logger always has at least one handler of its own; records still
# propagate to whatever handlers the application configures on the root.
logger = logging.getLogger(__name__)
logger.addHandler(logging.NullHandler())
17
18
def filter_file(p):
    """Predicate for ``filter()``: report whether *p* is a file.

    Args:
        p: A path object exposing ``is_file()`` (e.g. ``pathlib.Path``).

    Returns:
        The result of ``p.is_file()``.
    """
    is_regular_file = p.is_file()
    return is_regular_file
21
22
def filter_suf(p, suf):
    """Predicate: report whether the suffix of *p* is one of *suf*.

    Args:
        p: A path object exposing ``suffix`` (e.g. ``pathlib.PurePath``).
        suf: Container of accepted suffixes. ``pathlib`` suffixes include the
            leading dot, so entries must be spelled like ``'.jpg'``.

    Returns:
        True when ``p.suffix`` is contained in *suf*, otherwise False.
    """
    extension = p.suffix
    return extension in suf
25
26
def iter_files(directory):
    """Lazily yield the resolved path of every entry below *directory*.

    Args:
        directory: A ``pathlib.Path`` to walk recursively with ``rglob('*')``.

    Returns:
        A generator of ``Path.resolve()`` results. Directories are included;
        callers filter them out separately.
    """
    entries = directory.rglob('*')
    return (entry.resolve() for entry in entries)
29
30
def init_db(p):
    """Open the SQLite database at *p* and make sure the schema exists.

    Creates the ``media_files`` table (``path`` primary key, ``hash``,
    ``size``) and an index on ``hash`` when they are missing.

    Args:
        p: Database location; it is passed to ``sqlite3.connect`` as ``str(p)``.

    Returns:
        The open ``sqlite3`` connection.
    """
    schema_statements = (
        """
        CREATE TABLE IF NOT EXISTS media_files (path TEXT PRIMARY KEY, hash TEXT, size INT);
        """,
        """
        CREATE INDEX IF NOT EXISTS hash_index on media_files(hash);
        """,
    )
    connection = sqlite3.connect(
        str(p),
        isolation_level='IMMEDIATE',
        detect_types=sqlite3.PARSE_DECLTYPES,
    )
    # The connection's context manager commits on success and rolls back if
    # one of the statements raises.
    with connection as conn:
        for statement in schema_statements:
            conn.execute(statement)
    return connection
43
44
# Number of bytes read per call while hashing, so large files are never
# loaded into memory at once.
_READ_CHUNK_SIZE = 1024 * 1024


def _sha256_hexdigest(path):
    """Return the SHA-256 hex digest of the file at *path*, read in chunks."""
    hasher = hashlib.sha256()
    with path.open('rb') as stream:
        # iter(callable, sentinel) keeps reading until read() returns b'' (EOF).
        for chunk in iter(functools.partial(stream.read, _READ_CHUNK_SIZE), b''):
            hasher.update(chunk)
    return hasher.hexdigest()


def load_data(db, directory, suf):
    """Build the file information table for everything under *directory*.

    Every file found recursively below *directory* is hashed with SHA-256 and
    stored in ``media_files`` as ``(path, hash, size)``. An existing row for
    the same path is replaced.

    Args:
        db: Database connection used both as a context manager and to execute
            the inserts. With a ``sqlite3`` connection, the ``with`` block
            commits all inserts together on success and rolls them back if an
            exception escapes.
        directory: ``pathlib.Path`` of the directory to scan.
        suf: Optional container of suffixes to keep (matched against
            ``Path.suffix``). A falsy value disables suffix filtering.

    Raises:
        ValueError: If *directory* is not a directory.
    """
    if not directory.is_dir():
        raise ValueError('{} is no directory'.format(directory))

    with db as conn:
        paths = filter(filter_file, iter_files(directory))
        if suf:
            paths = filter(functools.partial(filter_suf, suf=suf), paths)
        for path in paths:
            digest = _sha256_hexdigest(path)
            size = path.stat().st_size
            # Use the module logger (not the root logger) so records carry
            # this module's name, consistent with the rest of the file.
            logger.info('read %s', path)
            logger.debug('%s %d', digest, size)
            conn.execute(
                'INSERT OR REPLACE INTO media_files VALUES (?, ?, ?)',
                (str(path), digest, size),
            )
66
67
def output_data(db):
    """Print all recorded file information.

    Each row of ``media_files`` is printed on its own line as a
    ``(path, hash, size)`` tuple.

    Args:
        db: Database connection; used as a context manager and to run the
            query.
    """
    query = 'SELECT path, hash, size FROM media_files'
    with db as conn:
        records = conn.execute(query)
        for record in records:
            print(record)
73
74
def output_duplicated_data(db):
    """Print information about duplicated files.

    Rows of ``media_files`` are read ordered by hash and grouped by that same
    hash column. Every group containing two or more rows is printed, one
    ``(path, hash, size)`` tuple per line.

    Args:
        db: Database connection; used as a context manager and to run the
            query.
    """
    # Secondary sort on path makes the output order deterministic.
    query = 'SELECT path, hash, size FROM media_files ORDER BY hash, path'
    # groupby only merges adjacent rows, so the grouping key must be the
    # column the query sorts by: index 1 (hash), not index 2 (size).
    by_hash = operator.itemgetter(1)
    with db as conn:
        for _hash, rows in itertools.groupby(conn.execute(query), key=by_hash):
            group = tuple(rows)
            if len(group) >= 2:
                for row in group:
                    print(row)
84
85
def main():
    """Command-line entry point.

    Parses ``--db`` (database file, defaulting to ``db.sqlite3`` next to this
    script) and one required command:

    * ``load TARGET [TARGET ...] [--suf A,B,...]`` runs ``load_data`` for each
      target directory; ``--suf`` is split on commas into a frozenset.
    * ``output`` runs ``output_data``.
    * ``output_dup`` runs ``output_duplicated_data``.

    Exits with an argparse usage error when no command is given.
    """
    logging.basicConfig(level=logging.INFO)

    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--db',
        type=pathlib.Path,
        default=pathlib.Path(__file__).resolve().with_name('db.sqlite3'),
        help='DB ファイルパス',
    )
    subparsers = parser.add_subparsers(dest='command', title='commands')
    # Without this, running with no command silently did nothing after
    # creating the database file. Set as an attribute so it also works on
    # Python versions that lack the ``required=`` keyword.
    subparsers.required = True

    parser_load = subparsers.add_parser('load', help='ファイル情報構築')
    parser_load.add_argument('--suf', type=lambda s: frozenset(s.split(',')))
    parser_load.add_argument(
        'targets', nargs='+', type=pathlib.Path, help='読み出し先ディレクトリ')

    subparsers.add_parser('output', help='情報全出力')
    subparsers.add_parser('output_dup', help='重複ファイル情報出力')

    args = parser.parse_args()
    logger.debug(args)

    db = init_db(args.db)
    try:
        if args.command == 'output':
            output_data(db)
        elif args.command == 'load':
            for target in args.targets:
                load_data(db, target, args.suf)
        elif args.command == 'output_dup':
            output_duplicated_data(db)
    finally:
        # Release the database file handle even when a command fails.
        db.close()
113
114
115if __name__ == '__main__':
116 main()