· 4 years ago · Mar 31, 2021, 05:04 PM
1<?php
2namespace import;
3
4use Generator;
5use InvalidArgumentException;
6use mysqli;
7
8class GameDevImportHtmlMessage
9{
10 public function __construct(
11 public string $msg_id,
12 public string $created,
13 public string $rank,
14 public string $user,
15 public string $nick,
16 public string $name,
17 public string $has_avatar,
18 public string $html,
19 ){}
20}
21
22class GameDevImportHtmlTopic
23{
24 public function __construct(
25 public string $name,
26 public string $path,
27 public string $href,
28 ){}
29}
30
31class GameDevImportViaRegexp
32{
33 private array $users, $paths, $ranks;
34 private array $sql_buffer;
35 private array $months_map;
36
37 private const MONTHS = [
38 'янв.', 'фев.', 'мар.', 'апр.', 'мая', 'июня',
39 'июля', 'авг.', 'сен.', 'окт.', 'ноя.', 'дек.',
40 ];
41
42 public function __construct(
43 private mysqli $db,
44 private string $root_dir,
45 private string $table_paths,
46 private string $table_topics,
47 private string $table_ranks,
48 private string $table_users,
49 private string $table_messages,
50 private string $table_messages_body,
51 private string $table_messages_quotes,
52 ){
53 mysqli_report(MYSQLI_REPORT_ALL);
54 $this->db->set_charset('utf8mb4');
55 $this->dbQuery('SET @@character_set_connection="utf8mb4"');
56 $this->months_map = array_flip(self::MONTHS);
57 }
58
59 private function recreateMessagesTables(){
60 $this->dbQuery('DROP TABLE IF EXISTS '.$this->table_messages);
61 $this->dbQuery('DROP TABLE IF EXISTS '.$this->table_users);
62 $this->dbQuery('DROP TABLE IF EXISTS '.$this->table_topics);
63 $this->dbQuery('DROP TABLE IF EXISTS '.$this->table_paths);
64 $this->dbQuery('DROP TABLE IF EXISTS '.$this->table_ranks);
65
66 $this->dbQuery(
67 "CREATE TABLE {$this->table_paths}(
68 id INT UNSIGNED NOT NULL PRIMARY KEY AUTO_INCREMENT,
69 name VARCHAR(255),
70 url VARCHAR(255)
71 ) Engine=InnoDB CHARSET=utf8mb4 COLLATE=utf8mb4_general_ci"
72 );
73
74 $this->dbQuery(
75 "CREATE TABLE {$this->table_topics}(
76 id INT UNSIGNED NOT NULL PRIMARY KEY,
77 path INT UNSIGNED,
78 user INT UNSIGNED,
79 date DATETIME,
80 name VARCHAR(255),
81 KEY(path),
82 KEY(user),
83 KEY(date)
84 -- CONSTRAINT fk_{$this->table_topics}_path FOREIGN KEY (path) REFERENCES {$this->table_paths} (id)
85 ) Engine=InnoDB CHARSET=utf8mb4 COLLATE=utf8mb4_general_ci"
86 );
87
88 $this->dbQuery(
89 "CREATE TABLE {$this->table_ranks}(
90 id INT UNSIGNED NOT NULL PRIMARY KEY AUTO_INCREMENT,
91 name VARCHAR(255)
92 ) Engine=InnoDB CHARSET=utf8mb4 COLLATE=utf8mb4_general_ci"
93 );
94
95 $this->dbQuery(
96 "CREATE TABLE {$this->table_users}(
97 id INT UNSIGNED NOT NULL PRIMARY KEY,
98 `rank` INT UNSIGNED,
99 has_avatar TINYINT NOT NULL DEFAULT 0,
100 nick VARCHAR(255),
101 name VARCHAR(255),
102 KEY(`rank`)
103 ) Engine=InnoDB CHARSET=utf8mb4 COLLATE=utf8mb4_general_ci"
104 );
105
106 $this->dbQuery(
107 "CREATE TABLE {$this->table_messages}(
108 id INT UNSIGNED NOT NULL PRIMARY KEY AUTO_INCREMENT,
109 date DATETIME,
110 topic INT UNSIGNED,
111 user INT UNSIGNED,
112 msg_id INT UNSIGNED,
113 KEY(date),
114 KEY(topic),
115 KEY(user),
116 UNIQUE(msg_id)
117 -- CONSTRAINT fk_{$this->table_messages}_topic FOREIGN KEY (topic) REFERENCES {$this->table_topics} (id),
118 -- CONSTRAINT fk_{$this->table_messages}_user FOREIGN KEY (user) REFERENCES {$this->table_users} (id)
119 ) Engine=InnoDB CHARSET=utf8mb4 COLLATE=utf8mb4_general_ci"
120 );
121 }
122
123 private function recreateMessagesBodyTable(){
124 $this->dbQuery('DROP TABLE IF EXISTS '.$this->table_messages_body);
125 $this->dbQuery(
126 "CREATE TABLE {$this->table_messages_body}(
127 msg_id INT UNSIGNED NOT NULL PRIMARY KEY,
128 html VARCHAR(12000)
129 ) Engine=InnoDB CHARSET=utf8mb4 COLLATE=utf8mb4_general_ci -- ROW_FORMAT=COMPRESSED"
130 );
131 }
132
133 private function recreateMessagesQuotesTable(){
134 $this->dbQuery('DROP TABLE IF EXISTS '.$this->table_messages_quotes);
135 $this->dbQuery(
136 "CREATE TABLE {$this->table_messages_quotes}(
137 id INT UNSIGNED NOT NULL PRIMARY KEY AUTO_INCREMENT,
138 msg_id INT UNSIGNED NOT NULL,
139 user INT UNSIGNED,
140 KEY(msg_id),
141 KEY(user)
142 ) Engine=InnoDB CHARSET=utf8mb4 COLLATE=utf8mb4_general_ci"
143 );
144 }
145
146 private function dbQuery(string $sql): bool{
147 return $this->db->query($sql);
148 }
149
150 private function dbInsert(string $sql): int{
151 //file_put_contents('d:/sql.sql', $sql.";\n\n", FILE_APPEND);
152 $this->dbQuery($sql);
153 return $this->db->insert_id;
154 }
155
156 private function dbBuildInsertRecords(string $table_name, array $records): string{
157 $first_record = $records[array_key_first($records)];
158 $fields = array_keys($first_record);
159 $insert_records = array_map(function($record)use($fields){
160 $record_escaped = array_map(function($field_name)use($record){
161 $val = $record[$field_name];
162 return is_null($val) ? null : $this->db->real_escape_string($val);
163 }, $fields);
164 return '('.implode(',', array_map(function($val) {
165 return $val === null ? 'NULL' : "'$val'";
166 }, $record_escaped)).')';
167 }, $records);
168 $values_str = implode(",\n", $insert_records);
169 $fields_str = implode(',', $fields);
170
171 return "INSERT IGNORE INTO {$table_name} ({$fields_str}) VALUES \n".$values_str;
172 }
173
174 private function sqlBufferReset(){
175 $this->sql_buffer = [];
176 }
177
178 private function sqlBufferProcess(){
179 foreach($this->sql_buffer as $table_name=>$records){
180 $sql = $this->dbBuildInsertRecords($table_name, $records);
181 $this->dbInsert($sql);
182 }
183 $this->sqlBufferReset();
184 }
185
186 private function sqlBufferAddTableRecord(string $table_name, array $rec){
187 $this->sql_buffer[$table_name][] = $rec;
188 }
189
190 private function sqlBufferGetSize(string $table_name): int{
191 return count($this->sql_buffer[$table_name] ?? []);
192 }
193
194 private function addRank(string $rank){
195 $rank_id = $this->ranks[$rank] ?? null;
196 if($rank_id === null){
197 $rank_id = count($this->ranks) + 1;
198 $this->ranks[$rank] = $rank_id;
199 $this->sqlBufferAddTableRecord($this->table_ranks, [
200 'id'=>$rank_id,
201 'name'=>$rank,
202 ]);
203 }
204 return $rank_id;
205 }
206
207 private function addPath(string $topic_path, string $topic_url){
208 $path_id = $this->paths[$topic_path] ?? null;
209 if($path_id === null){
210 $path_id = count($this->paths) + 1;
211 $this->paths[$topic_path] = $path_id;
212 $this->sqlBufferAddTableRecord($this->table_paths, [
213 'id'=>$path_id,
214 'name'=>$topic_path,
215 'url'=>$topic_url,
216 ]);
217 }
218 return $path_id;
219 }
220
221 private function addTopic(int $topic_id, int $path_id, ?int $user_id, ?string $created, string $name){
222 $this->sqlBufferAddTableRecord($this->table_topics, [
223 'id'=>$topic_id,
224 'path'=>$path_id,
225 'user'=>$user_id,
226 'date'=>$created,
227 'name'=>$name,
228 ]);
229 }
230
231 private function addUser(int $id, int $rank_id, int $has_avatar, string $nick, string $name){
232 if(!isset($this->users[$id])){
233 $this->users[$id] = 1;
234 $this->sqlBufferAddTableRecord($this->table_users, [
235 'id'=>$id,
236 'rank'=>$rank_id,
237 'has_avatar'=>$has_avatar,
238 'nick'=>$nick,
239 'name'=>$name,
240 ]);
241 }
242 }
243
244 private function addMessage(string $created, int $topic_id, int $user_id, int $msg_id){
245 $this->sqlBufferAddTableRecord($this->table_messages, [
246 'date'=>$created,
247 'topic'=>$topic_id,
248 'user'=>$user_id,
249 'msg_id'=>$msg_id,
250 ]);
251 }
252
253 private function addMessagesBody(int $msg_id, string $html){
254 $this->sqlBufferAddTableRecord($this->table_messages_body, [
255 'msg_id'=>$msg_id,
256 'html'=>$html,
257 ]);
258 }
259
260 private function addMessagesQuote(int $msg_id, int $user_id){
261 $this->sqlBufferAddTableRecord($this->table_messages_quotes, [
262 'msg_id'=>$msg_id,
263 'user'=>$user_id,
264 ]);
265 }
266
267 private function getTopics(): Generator{
268 $root_mask = $this->root_dir.'/*';
269 $directories = glob($root_mask, GLOB_ONLYDIR);
270 $directories = array_unique($directories);
271 foreach($directories as $dir){
272 $topic_id = basename($dir);
273 $topic_id = (int)ltrim($topic_id, '0');
274 yield $topic_id=>$this->getPages($dir);
275 }
276 }
277
278 private function getPages(string $dir): array{
279 $files = array_diff(scandir($dir), ['..', '.']);
280 sort($files);
281 return array_map(function($file_name)use($dir){
282 return $dir.'/'.$file_name;
283 }, $files);
284 }
285
286 private function getPageHtml(string $file_path): string{
287 $html = file_get_contents($file_path);
288 if($html === false){
289 throw new InvalidArgumentException("file '$file_path' not found");
290 }
291 return $html;
292 }
293
294 private function getDbUsersNickIdMap(): array{
295 $users = [];
296 $result = $this->db->query(
297 /** @lang GenericSQL */
298 "SELECT id, nick FROM {$this->table_users} ORDER BY id"
299 );
300 if($result){
301 $records = $result->fetch_all(MYSQLI_ASSOC);
302 foreach($records as $rec){
303 $users[$rec['nick']] = $rec['id'] ;
304 }
305 }
306 return $users;
307 }
308
309 private function getMessageQuotedUsers(string $html, array $existing_users): array{
310 $quotes = [];
311 if(preg_match_all('~<b>(.+?)</b>~m', $html, $m)){
312 foreach($m[1] as $user){
313 $user_id = $existing_users[$user] ?? null;
314 if($user_id !== null){
315 $quotes[] = $user_id;
316 }
317 }
318 }
319 return $quotes;
320 }
321
322 private function htmlGetTopicInfo(string $html): GameDevImportHtmlTopic{
323 $name = $path = $href = '';
324 if(preg_match('~<h1 itemprop="name headline">(.*)(?:\s+\(\d+\s+стр\))?</h1>~m', $html, $m)){
325 $name = html_entity_decode($m[1]);
326 }
327 if(preg_match('~<div class="path"><div class="bound">(.*)</span></div></div>\r\n~m', $html, $m)){
328 $line = trim($m[1]);
329 if(preg_match_all('~<a href="([^"]*)" title="([^"]*)"~', $line, $m2)){
330 $path = html_entity_decode(implode(' / ', array_reverse($m2[2])));
331 $href = end($m2[1]);
332 }
333 }
334 return new GameDevImportHtmlTopic($name, $path, $href);
335 }
336
337 private function createDate(int $year, string $month_str, int $day, int $hour, int $minute): string|null{
338 $created = null;
339 $month = $this->months_map[$month_str] ?? null;
340 if($month !== null){
341 $time = mktime($hour, $minute, 0, $month + 1, $day, $year);
342 $created = date('Y-m-d H:i:s', $time);
343 }
344 return $created;
345 }
346
347 private function htmlGetMessages(string $html): array{
348 $html_no_ads = preg_replace('~^(.*)<hr class="forum">\s*<div id="advert">.*?</div>\s*(.*)$~ms', '$1$2', $html);
349 preg_match_all('~'.
350 '(<div class="bound head" id="m\d+">)(.*?)'.
351 '(<!-- Message #\d+ -->)(?<html>.*?)'.
352 '(?=<div class="bound head" id="m\d+">|<a id="l"></a>)'.
353 '~ms',
354 $html_no_ads, $html_matches, PREG_SET_ORDER
355 );
356
357 $messages = [];
358 foreach($html_matches as $html_match){
359 $header = $html_match[2];
360 preg_match('~'.
361 '<li class="bold"><a href="https://gamedev.ru/users/\?id=(?<user>\d+)"( title="(?<name>.*?)")?>(?<nick>.*?)</a></li>.*'.
362 '<li class="small">(?<rank>.*?)</li>.*'.
363 '<a class="gray" rel="nofollow" href="[^?]+\?id=(?<topic>\d+).*?&m=(?<msg_id>\d+)#m(?<page_msg_num>\d+)">.*'.
364 '<span class="date">(?<date>(?<date_edit>.*?)(?<hour>\d+):(?<minute>\d+),\s+(?<day>\d+)\s+(?<month>.+?)\s+(?<year>\d+))</span>.*'.
365 '~ms',
366 $header, $m
367 );
368 $messages[] = new GameDevImportHtmlMessage(
369 $m['msg_id'],
370 $this->createDate($m['year'], $m['month'], $m['day'], $m['hour'], $m['minute']),
371 $m['rank'],
372 html_entity_decode($m['user']),
373 html_entity_decode($m['nick']),
374 html_entity_decode($m['name']),
375 (string)(int)str_contains($header, '<div class="pic">'),
376 trim($html_match['html']),
377 );
378 }
379 return $messages;
380 }
381
382 private function processMessagesQuotes(int $chunk_size = 5000){
383 $this->recreateMessagesQuotesTable();
384 $this->sqlBufferReset();
385
386 $offset = 0;
387 $row_count = $chunk_size;
388 $existing_users = $this->getDbUsersNickIdMap();
389 while(true){
390 $sql = /** @lang GenericSQL */
391 "SELECT msg_id, html FROM {$this->table_messages_body} ORDER BY msg_id LIMIT $offset, $row_count";
392 $result = $this->db->query($sql);
393 if(!$result || !$result->num_rows){
394 break;
395 }
396 while($rec = $result->fetch_object()){
397 $users = $this->getMessageQuotedUsers($rec->html, $existing_users);
398 foreach($users as $user_id){
399 $this->addMessagesQuote($rec->msg_id, $user_id);
400 }
401 }
402 if($this->sqlBufferGetSize($this->table_messages_quotes) > $chunk_size){
403 $this->sqlBufferProcess();
404 }
405 $offset += $row_count;
406 }
407 $this->sqlBufferProcess();
408 }
409
410 private function processMessages(bool $import_body, int $chunk_size = 1000){
411 $this->recreateMessagesTables();
412 if($import_body){
413 $this->recreateMessagesBodyTable();
414 }
415
416 $this->users = [];
417 $this->ranks = [];
418 $this->paths = [];
419 $this->sqlBufferReset();
420
421 foreach($this->getTopics() as $topic_id=>$pages){
422 /** @var GameDevImportHtmlTopic $topic */
423 $topic = null;
424 /** @var GameDevImportHtmlMessage[] $messages */
425 $messages = [];
426 foreach($pages as $page_num=>$file_name){
427 $html = $this->getPageHtml($file_name);
428 if($page_num == 0){
429 $topic = $this->htmlGetTopicInfo($html);
430 }
431 $messages = array_merge($messages, $this->htmlGetMessages($html));
432 }
433 if($topic){
434 $path_id = $this->addPath($topic->path, $topic->href);
435 $this->addTopic(
436 $topic_id,
437 $path_id,
438 $messages[0]->user ?? null,
439 $messages[0]->created ?? null,
440 $topic->name
441 );
442
443 foreach($messages as $msg){
444 $rank_id = $this->addRank($msg->rank);
445 $this->addUser($msg->user, $rank_id, $msg->has_avatar, $msg->nick, $msg->name);
446 $this->addMessage($msg->created, $topic_id, $msg->user, $msg->msg_id);
447 if($import_body){
448 $this->addMessagesBody($msg->msg_id, $msg->html);
449 }
450 }
451 }
452
453 if($this->sqlBufferGetSize($this->table_messages) > $chunk_size){
454 $this->sqlBufferProcess();
455 }
456 }
457 $this->sqlBufferProcess();
458 }
459
460 public function importMessages(){
461 $this->processMessages(false);
462 }
463
464 public function importMessagesWithBody(){
465 $this->processMessages(true);
466 }
467
468 public function createMessagesQuotes(){
469 $this->processMessagesQuotes();
470 }
471}
472