#!/usr/bin/env python3
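"""Scrape gallery metadata out of saved exhentai gallery-list pages.

The parser expects pages saved in the "Extended" display mode (the
table.itg.glte listing) and extracts, per gallery: title, thumbnail, path,
category, upload date, uploader, page count, and namespaced tags. Output goes
to a JSON Lines file, or to an SQLite database if the output path ends in .db.

Example invocations (script and file names are illustrative):

    ./scraperscraper.py -o metadata.json 'saved/*.html'
    ./scraperscraper.py -v --add-underscores -o metadata.db 'saved/**/*.html'
"""
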
from glob import glob
from pyquery import PyQuery as pq
from os import path

import argparse
import json
import sqlite3
import sys
import time

def optional_starts_with(text, query):
    if not text:
        return False

    return text.startswith(query)

def get_metadata(targetfile, add_underscores=False):
    # Skip the first line of the saved page (the comment browsers prepend when saving).
    with open(targetfile, "r") as f:
        html = "".join(f.readlines()[1:])

    doc = pq(html)

    # The gallery list table in "Extended" display mode.
    table = doc("table.itg.glte")

    for row in table.children():
        row = pq(row)
        meta = row("div.gl3e")
        meat = row("div.gl4e")
        obj = {}

        ex_link = row("a").eq(0).attr("href")
        ex_id = ex_link.split('/')[4]

        obj["title"] = meat("div.glink").text()
        obj["thumbnail"] = row("img").eq(0).attr("src").replace("https://exhentai.org/", "")
        obj["path"] = ex_link.replace("https://exhentai.org/", "")
        obj["category"] = meta("div.cn").eq(0).text().lower()
        obj["date"] = int(time.mktime(time.strptime(meta("[id^='posted_']").text(), "%Y-%m-%d %H:%M")))
        obj["uploader"] = meta("a").text()
        # pyquery passes the current element to two-argument filter callbacks as `this`.
        obj["pages"] = meta.children().filter(lambda i, this: " page" in pq(this).eq(0).text()).text().split(' ')[0]

        tags = {}
        obj["tags"] = tags

        for namespace in meat("td.tc"):
            namespace = pq(namespace)
            namespace_name = namespace.text().split(':')[0]
            namespace_tags = {}
            tags[namespace_name] = namespace_tags

            for tag in namespace.next_all().find("div"):
                tag = pq(tag)
                if tag.is_(".gtl"):
                    score = 0
                elif tag.is_(".gtw"):
                    score = -1
                else:  # tag.is_(".gt")
                    score = 1
                tag = tag.text()
                if add_underscores:
                    tag = tag.replace(' ', '_')
                namespace_tags[tag] = score

        yield {ex_id: obj}

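# For reference, each yielded record maps the numeric gallery id to its metadata,
# roughly like this (values below are illustrative, not from a real gallery):
#
#   {"1500000": {"title": "...", "thumbnail": "t/ab/...", "path": "g/1500000/abcdef1234/",
#                "category": "doujinshi", "date": 1564500000, "uploader": "someone",
#                "pages": "24", "tags": {"artist": {"some artist": 1}}}}
#
# Note that "pages" is still a string at this point; run_sql() converts it to an int.
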
def get_all_metadata(input, verbose, use_underscores):
    num_files = len(input)
    for n, file in enumerate(input, start=1):
        if verbose:
            print('scraperscraper: parsing {1} / {2}: "{0}"'.format(file, n, num_files))
        for obj in get_metadata(file, use_underscores):
            yield obj

def run_json(input, output, merge, verbose, use_underscores):
    num_files = len(input)
    with open(output, "a" if merge else 'w') as f:
        for n, file in enumerate(input, start=1):
            if verbose:
                print('scraperscraper: parsing {1} / {2}: "{0}"'.format(file, n, num_files))
            for obj in get_metadata(file, use_underscores):
                json.dump(obj, f)
                f.write('\n')

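# run_json writes one JSON object per line (JSON Lines). A minimal sketch of
# reading the dump back, assuming it was written to "metadata.json":
#
#   with open("metadata.json") as f:
#       galleries = [json.loads(line) for line in f]
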
def item_or_insert(table, nspace, nlist, c):
    # Return the id of `nspace` in `table`, inserting a new row (and caching its id
    # in `nlist`) if it does not exist yet.
    nid = nlist.get(nspace)
    if not nid:
        c.execute("INSERT INTO " + table + " (name) VALUES (?)", (nspace,))
        nid = c.lastrowid
        nlist[nspace] = nid
    return nid

def tag_or_insert(tag, taglist, nspace, nlist, c):
    nid = item_or_insert("namespaces", nspace, nlist, c)
    taglist_x = taglist.get(nspace)
    if not taglist_x:
        taglist_x = {}
        taglist[nspace] = taglist_x
    tid = taglist_x.get(tag)
    if not tid:
        c.execute("INSERT INTO tags (namespace, name) VALUES (?, ?)", (nid, tag))
        tid = c.lastrowid
        taglist_x[tag] = tid

    return tid

def run_sql(input, conn, verbose, use_underscores):
    c = conn.cursor()
    c.executescript('''
    CREATE TABLE IF NOT EXISTS galleries (
        id INTEGER PRIMARY KEY,
        title TEXT,
        thumbnail TEXT,
        path TEXT,
        category INTEGER,
        date INTEGER,
        uploader TEXT,
        pages INTEGER
    );

    CREATE TABLE IF NOT EXISTS categories (
        id INTEGER PRIMARY KEY,
        name VARCHAR(255) UNIQUE
    );

    CREATE TABLE IF NOT EXISTS namespaces (
        id INTEGER PRIMARY KEY,
        name VARCHAR(255) UNIQUE
    );

    CREATE TABLE IF NOT EXISTS tags (
        id INTEGER PRIMARY KEY,
        namespace INTEGER REFERENCES namespaces(id) ON UPDATE CASCADE ON DELETE CASCADE,
        name TEXT,
        CONSTRAINT unq_tag UNIQUE (namespace, name)
    );

    CREATE TABLE IF NOT EXISTS taggings (
        gallery INTEGER REFERENCES galleries(id) ON UPDATE CASCADE ON DELETE CASCADE,
        tag INTEGER REFERENCES tags(id) ON UPDATE CASCADE ON DELETE CASCADE,
        strength INTEGER,
        CONSTRAINT unq_tag UNIQUE (gallery, tag)
    );
    ''')

    # Preload existing categories, namespaces, and tags so re-runs reuse their ids.
    catlist = {}
    c.execute("SELECT id, name FROM categories")
    for id, name in c.fetchall():
        catlist[name] = id

    nlist = {}
    rev_nlist = {}
    c.execute("SELECT id, name FROM namespaces")
    for id, name in c.fetchall():
        nlist[name] = id
        rev_nlist[id] = name

    taglist = {}
    c.execute("SELECT id, namespace, name FROM tags")
    for id, nspaceid, name in c.fetchall():
        nspace = rev_nlist[nspaceid]
        taglist_x = taglist.get(nspace)
        if not taglist_x:
            taglist_x = {}
            taglist[nspace] = taglist_x
        taglist_x[name] = id

    for obj in get_all_metadata(input, verbose, use_underscores):
        id, gallery = next(iter(obj.items()))
        try:
            gallery['id'] = int(id)
            gallery['category'] = item_or_insert("categories", gallery['category'], catlist, c)
            gallery['pages'] = int(gallery['pages'])
            c.execute("INSERT INTO galleries (id, title, thumbnail, path, category, date, uploader, pages) VALUES (:id, :title, :thumbnail, :path, :category, :date, :uploader, :pages)", gallery)
            for nspace, nspacetags in gallery['tags'].items():
                for tag, score in nspacetags.items():
                    c.execute("INSERT INTO taggings (gallery, tag, strength) VALUES (?, ?, ?)", (id, tag_or_insert(tag, taglist, nspace, nlist, c), score))
        except sqlite3.IntegrityError as e:
            if str(e).startswith("UNIQUE constraint failed"):
                sys.stderr.write("warning: {} (gallery: {})\n".format(e, id))
            else:
                raise

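# For reference, a query against the schema above that lists galleries carrying a
# given tag could look like this (the namespace and tag names are only examples):
#
#   SELECT g.title, g.path
#   FROM galleries g
#   JOIN taggings ti ON ti.gallery = g.id
#   JOIN tags t ON t.id = ti.tag
#   JOIN namespaces n ON n.id = t.namespace
#   WHERE n.name = 'artist' AND t.name = 'some artist';
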
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--add-underscores", help="replace spaces in tag names with underscores", dest="use_underscores", action="store_true")
    parser.add_argument("-f", "--force", help="overwrite the destination file if it exists (no effect on SQL)", dest="overwrite", action="store_true")
    parser.add_argument("-a", "--append", help="append output to the destination file if it exists (no effect on SQL)", dest="merge", action="store_true")
    parser.add_argument("-o", "--output", help="the output file to dump metadata into (if this ends with .db, SQLite will be used)", dest="output", required=True)
    parser.add_argument("-v", "--verbose", help="print progress", dest="verbose", action="store_true")
    parser.add_argument("files", help="the files to parse (glob)")
    args = parser.parse_args()

    use_sql = False
    if args.output.endswith(".db"):
        use_sql = True
    elif path.exists(args.output):
        if path.isdir(args.output):
            sys.stderr.write("error: output is a directory\n")
            sys.exit(1)
        elif not args.merge and not args.overwrite:
            sys.stderr.write("error: output file exists; use --append or --force\n")
            sys.exit(1)

    files = glob(args.files, recursive=True)

    if use_sql:
        with sqlite3.connect(args.output) as conn:
            run_sql(files, conn, args.verbose, args.use_underscores)
    else:
        run_json(files, args.output, args.merge, args.verbose, args.use_underscores)

if __name__ == "__main__":
    main()