· 7 years ago · Feb 05, 2019, 11:46 AM
1<?php
2/**
3 * Created by PhpStorm.
4 * User: turgutsaricam
5 * Date: 24/08/16
6 * Time: 22:09
7 */
8
9namespace WPCCrawler\Objects\Crawling\Savers;
10
11
12use WP_User_Query;
13use WPCCrawler\Exceptions\DuplicatePostException;
14use WPCCrawler\Exceptions\StopSavingException;
15use WPCCrawler\Factory;
16use WPCCrawler\Objects\Crawling\Bot\PostBot;
17use WPCCrawler\Objects\Crawling\Data\PostData;
18use WPCCrawler\PostDetail\PostDetailsService;
19use WPCCrawler\PostDetail\PostSaverData;
20use WPCCrawler\Objects\Enums\ErrorType;
21use WPCCrawler\Objects\Enums\InformationMessage;
22use WPCCrawler\Objects\Enums\InformationType;
23use WPCCrawler\Objects\File\MediaFile;
24use WPCCrawler\Objects\File\MediaService;
25use WPCCrawler\Objects\Informing\Information;
26use WPCCrawler\Objects\Informing\Informer;
27use WPCCrawler\Objects\Settings\SettingsImpl;
28use WPCCrawler\Objects\Traits\ErrorTrait;
29use WPCCrawler\Objects\Traits\SettingsTrait;
30use WPCCrawler\Utils;
31
32class PostSaver extends AbstractSaver {
33
34 use SettingsTrait;
35 use ErrorTrait;
36
37 private static $DEBUG = false;
38
39 /** @var string Stores ID of the site for which the last post crawl was performed. */
40 public $optionLastCrawledSiteId = '_wpcc_last_crawled_site_id';
41
42 /** @var string Stores ID of the site for which the last post recrawl was performed */
43 public $optionLastRecrawledSiteId = '_wpcc_last_recrawled_site_id';
44
45 /** @var string Stores source URLs as an array. Each inserted post will have this meta. */
46 private $postMetaSourceUrls = '_wpcc_source_urls';
47
48 /** @var string Stores first page URL of the target post. Each inserted post will have this meta. */
49 private $postMetaPostFirstPageUrl = '_wpcc_post_url';
50
51 /*
52 *
53 */
54
55 /** @var string Prefix that will be added to the meta keys used in regular crawling task */
56 public $cronCrawlPostMetaPrefix = '_cron';
57
58 /** @var string Prefix that will be added to the meta keys used in recrawl task */
59 public $cronRecrawlPostMetaPrefix = '_cron_recrawl';
60
61 /*
62 * DUPLICATE CHECK TYPES
63 */
64
65 const DUPLICATE_CHECK_URL = 'url';
66 const DUPLICATE_CHECK_TITLE = 'title';
67 const DUPLICATE_CHECK_CONTENT = 'content';
68
69 /*
70 *
71 */
72
73 /** @var PostData */
74 private $data;
75
76 /** @var bool Stores whether the current task is a recrawl task or not. */
77 private $isRecrawl = false;
78
79 /*
80 *
81 */
82
83 /** @var string|null */
84 private $nextPageUrl = null;
85
86 /** @var array|null */
87 private $nextPageUrls = null;
88
89 /** @var bool */
90 private $isFirstPage = false;
91
92 /** @var null|object */
93 private $urlTuple = null;
94
95 /** @var string|null */
96 private $urlToCrawl = null;
97
98 /** @var int|null */
99 private $postId = null;
100
101 /** @var int|null */
102 private $draftPostId = null;
103
104 /** @var int|null */
105 private $siteIdToCheck = null;
106
107 /** @var bool */
108 private $updateLastCrawled = false;
109
110 /** @var string|null */
111 private $postUrl = null;
112
113 /** @var PostBot|null */
114 private $bot = null;
115
116 /** @var bool */
117 private $contentExists = true;
118
/**
 * Update (recrawl) a post of a URL tuple.
 *
 * If the recrawl CRON meta of the site contain both a last-crawled URL ID and a draft post ID, a previously started
 * recrawl is not finished yet. In that case the unfinished post is continued instead of the given URL tuple.
 *
 * Missing posts are handled as follows:
 * <ul>
 *  <li> If the unfinished draft post no longer exists, saved_post_id of the URL it belongs to is cleared and the
 *       recrawl continues with the given URL tuple.</li>
 *  <li> If the post of the given URL tuple no longer exists, saved_post_id of the URL tuple is cleared and nothing
 *       is recrawled.</li>
 * </ul>
 *
 * @param object $urlTuple A row in wpcc_urls table
 * @return null
 */
public function executePostRecrawl($urlTuple) {
    $this->setRequestMade(false);
    $this->clearErrors();

    // Do not proceed if the URL tuple is not found or it does not have a saved post ID.
    if(!$urlTuple || !$urlTuple->saved_post_id) return null;

    $this->isRecrawl = true;

    $siteIdToCheck = $urlTuple->post_id;

    // Get settings for the site ID
    $settings = get_post_meta($siteIdToCheck);

    $this->setSettings($settings, Factory::postService()->getSingleMetaKeys());

    $prefix             = $this->getCronPostMetaPrefix();
    $lastRecrawledUrlId = $this->getSetting($prefix . '_last_crawled_url_id');
    $nextPageUrl        = $this->getSetting($prefix . '_post_next_page_url');
    $nextPageUrls       = $this->getSetting($prefix . '_post_next_page_urls');
    $draftPostId        = $this->getSetting($prefix . '_post_draft_id');

    // A draft post ID together with a last-crawled URL ID means a multi-page post is still being saved.
    $isContinuingDraft = $lastRecrawledUrlId && $draftPostId;

    // If the unfinished draft post does not exist anymore, detach it from the URL it belongs to and fall back to
    // the given URL tuple.
    if($isContinuingDraft && !get_post($draftPostId)) {
        Factory::databaseService()->updateUrlSavedPostId($lastRecrawledUrlId, null);

        $lastRecrawledUrlId = null;
        $nextPageUrl        = null;
        $nextPageUrls       = null;
        $draftPostId        = null;
        $isContinuingDraft  = false;
    }

    // If the post of the given URL tuple does not exist, make the URL tuple's saved_post_id null, and stop.
    // The ID of the given tuple must be used here, since it is the tuple whose post was looked up.
    if(!$isContinuingDraft && !get_post($urlTuple->saved_post_id)) {
        Factory::databaseService()->updateUrlSavedPostId($urlTuple->id, null);
        return null;
    }

    $this->savePost(
        $siteIdToCheck,
        $settings,
        // If there is a draft post ID, it means that post is not finished to be saved. So, use URL ID of the draft
        // post instead of the ID of the current URL tuple.
        $isContinuingDraft ? $lastRecrawledUrlId : $urlTuple->id,
        true,
        $nextPageUrl,
        $nextPageUrls,
        $isContinuingDraft ? $draftPostId : $urlTuple->saved_post_id
    );

    return null;
}
171
/**
 * Save a post for a site, as a regular (non-recrawl) crawl. The CRON meta stored for the site decide what is saved:
 * <li>the next page of a multi-page post that is not completely saved yet, or</li>
 * <li>a post that has not been saved before.</li>
 *
 * @param int $siteIdToCheck Site ID for which a post will be saved
 */
public function executePostSave($siteIdToCheck) {
    $this->setRequestMade(false);
    $this->clearErrors();

    // Nothing can be done without a site.
    if(!$siteIdToCheck) return;

    // This must be set before the CRON meta prefix is requested below.
    $this->isRecrawl = false;

    // Load the settings of the site and make them available via getSetting()
    $siteSettings = get_post_meta($siteIdToCheck);
    $this->setSettings($siteSettings, Factory::postService()->getSingleMetaKeys());

    $metaPrefix = $this->getCronPostMetaPrefix();

    // Hand the stored CRON state over to savePost and make it update that state afterwards.
    $this->savePost(
        $siteIdToCheck,
        $siteSettings,
        $this->getSetting($metaPrefix . '_last_crawled_url_id'),
        true,
        $this->getSetting($metaPrefix . '_post_next_page_url'),
        $this->getSetting($metaPrefix . '_post_next_page_urls'),
        $this->getSetting($metaPrefix . '_post_draft_id')
    );
}
200
201 /*
202 *
203 */
204
/**
 * Save a post to the database. This method does 3 things:
 * <ul>
 *  <li> If a urlId is supplied, saves its post URL to the database. This is used to save a post manually. Just pick
 *       an ID from the database.</li>
 *  <li> If there are only siteIdToCheck and its settings, then a URL will be found by using CRON settings and saved
 *       to the database.</li>
 *  <li> If there are urlId, nextPageUrl(s) and draftPostId, then a next page will be saved for the specified urlId.</li>
 * </ul>
 *
 * The URL tuple is locked while its post is being saved. It is unlocked again when the page is saved, either as
 * "saved"/"recrawled" (last page) or as "not saved" (more pages to come).
 *
 * @param int         $siteIdToCheck     Site ID which the post belongs to, to get the settings for crawling
 * @param array       $settings          Settings for siteIdToCheck
 * @param null|int    $urlId             ID of a URL tuple from wpcc_urls table
 * @param bool        $updateLastCrawled True if you want to update CRON options about last crawled site, false
 *                                       otherwise
 * @param null|string $nextPageUrl       Next page URL for the post, if exists
 * @param null|array  $nextPageUrls      All next page URLs for the post, if exists
 * @param null|int    $draftPostId       ID of a post which is used to save content for this post, for previous
 *                                       pages
 * @return int|null Post ID, or null if the post is not saved (saving was stopped or the post is a duplicate)
 */
public function savePost($siteIdToCheck, $settings, $urlId = null, $updateLastCrawled = false,
                         $nextPageUrl = null, $nextPageUrls = null, $draftPostId = null) {

    if(!$this->getSettings()) $this->setSettings($settings, Factory::postService()->getSingleMetaKeys());

    // Initialize instance variables
    $this->urlToCrawl           = null;
    $this->isFirstPage          = true;
    $this->nextPageUrls         = $nextPageUrls;
    $this->nextPageUrl          = $nextPageUrl;
    $this->draftPostId          = $draftPostId;
    $this->siteIdToCheck        = $siteIdToCheck;
    $this->updateLastCrawled    = $updateLastCrawled;

    // Forget the post ID of a previous call. Otherwise, when this instance is reused and a duplicate is detected
    // before a post is inserted, the duplicate handler would delete the post saved by the previous call.
    $this->postId               = null;

    if(static::$DEBUG) {
        var_dump('Last Crawled Url ID: ' . $urlId);
        var_dump('Next Page URL: ' . $this->nextPageUrl);
        var_dump('Next Page URLs:');
        var_dump($this->nextPageUrls);
        var_dump('Draft Post ID: ' . $this->draftPostId);
    }

    // Stays null until the registered post detail savers are about to be called.
    $saverData = null;

    try {
        // Prepare $this->isFirstPage, $this->urlTuple, and $this->urlToCrawl
        $this->prepareUrlTupleToCrawl($urlId);

        // Lock the URL tuple so that it won't be selected as the URL to crawl again during saving process
        Factory::databaseService()->updateUrlSavedStatus(
            $this->urlTuple->id,
            $this->urlTuple->is_saved,
            $this->urlTuple->saved_post_id,
            $this->urlTuple->update_count,
            true
        );

        $mainSiteUrl   = $this->getSetting('_main_page_url');
        $this->postUrl = Utils::prepareUrl($mainSiteUrl, $this->urlToCrawl);

        // Create a new bot
        $this->bot = new PostBot($settings, $this->siteIdToCheck);

        // Prepare the post data
        $this->preparePostData();

        // Prepare next page URL
        $this->prepareNextPageUrl();

        // Check content existence
        $this->checkAndReactToContentExistence();

        // Prepare the post data and store it in the PostData instance
        $this->data->setWpPostData($this->createWPPostData());

        // Check if the post is duplicate and, if so, handle the situation.
        $this->handleIfDuplicate();

        // Insert the prepared post data into the database.
        $this->insertPostData();

        // Set post's category if it belongs to a custom taxonomy
        $this->saveCategories();

        // Delete already-existing attachments when updating a post.
        $this->maybeDeleteAttachments();

        // Save featured image
        $this->saveFeaturedImage();

        // Save meta keywords
        $this->saveMetaKeywords();

        // Save meta description
        $this->saveMetaDescription();

        // Save attachments
        $galleryAttachmentIds = $this->saveAttachments();

        /*
         * SAVE REGISTERED POST DETAILS
         */

        // Create the data that will be used by the savers
        $saverData = new PostSaverData(
            $this,
            $this->postId,
            $this->data,
            $this->isRecrawl,
            $this->isFirstPage,
            $this->urlTuple,
            $galleryAttachmentIds
        );

        // Save registered post details
        PostDetailsService::getInstance()->save($this->bot, $saverData);

        // Save custom meta. This should be done at last to allow the user to override some previously-set post meta values.
        $this->saveCustomMeta();

        // Save custom taxonomies. This should be done at last to allow the user to override some previously-set taxonomy values.
        $this->saveCustomTaxonomies();

    } catch (StopSavingException $e) {
        // If the saving operation must be stopped, return null.
        return null;

    } catch(DuplicatePostException $e) {
        // $saverData is null if the exception was thrown before the post detail savers were reached.
        $this->onDuplicatePostException($e, $saverData);

        return null;
    }

    // The ID stored for the URL tuple: null when no post ID is available.
    $savedPostId = $this->postId ? $this->postId : null;

    // Save related meta. When there is a next page, remember the URL tuple and the draft post so that the next
    // call can continue with them.
    if($this->updateLastCrawled) {
        $this->updateLastCrawled(
            $this->siteIdToCheck,
            $this->nextPageUrl ? $this->urlTuple->id : null,
            $this->nextPageUrl,
            $this->nextPageUrls,
            $this->nextPageUrl ? $this->postId : ''
        );
    }

    // Save post URL as post meta
    if($this->isFirstPage && $this->postId && isset($this->urlTuple->url)) {
        update_post_meta($this->postId, $this->postMetaPostFirstPageUrl, $this->urlTuple->url);
    }

    // Update saved_at if this is the first page and the URL tuple does not have a saved_post_id
    if($this->isFirstPage && $this->postId && !$this->urlTuple->saved_post_id) {
        Factory::databaseService()->updateUrlPostSavedAt($this->urlTuple->id, $this->postId, $this->data->getDateCreated());
    }

    if(!$this->nextPageUrl) {
        // This is the last page. Tidy up things.

        if(!$this->isRecrawl) {
            // Set this URL as saved and unlock it
            Factory::databaseService()->updateUrlSavedStatus(
                $this->urlTuple->id,
                true,
                $savedPostId,
                $this->urlTuple->update_count,
                false
            );

        } else {
            // Set this URL as recrawled and unlock it
            Factory::databaseService()->updateUrlRecrawledStatus($this->urlTuple->id, $this->urlTuple->update_count + 1, false);
        }

    } else {
        // There is a next page. Remove the lock so that the next page can be saved. Also, make this URL not saved
        // so that it won't be selected as a URL that needs to be crawled for post crawling event.
        Factory::databaseService()->updateUrlSavedStatus(
            $this->urlTuple->id,
            false,
            $savedPostId,
            $this->urlTuple->update_count,
            false
        );
    }

    if(static::$DEBUG) {
        var_dump('Last Crawled Url ID: ' . $this->urlTuple->id);
        var_dump('Category ID: ' . $this->urlTuple->category_id);
        var_dump('Next Page URL: ' . $this->nextPageUrl);
        var_dump('Next Page URLs:');
        var_dump($this->nextPageUrls);
        var_dump('Draft Post ID: ' . ($this->nextPageUrl ? $this->postId : ''));
    }

    return $this->postId;
}
388
/**
 * Cleans up after a post is found to be a duplicate and informs the user about it. Things saved for the current
 * post are deleted, the last-crawled CRON meta of the site are reset, and the URL tuple is marked as saved and
 * unlocked.
 *
 * @param DuplicatePostException $e         The exception whose code is the ID of the found duplicate post
 * @param null|PostSaverData     $saverData Data given to the post detail savers, or null if they were not called
 * @since 1.8.0
 */
private function onDuplicatePostException(DuplicatePostException $e, $saverData) {
    // The exception carries the ID of the already-existing post as its code.
    $duplicatePostId = $e->getCode();

    /**
     * Fires just after a post is decided to be duplicate. At this point, no new post is inserted to the database
     * and the saved files are not deleted yet.
     *
     * @param int       $siteIdToCheck   ID of the site
     * @param int       $duplicatePostId Found duplicate post ID
     * @param PostData  $data            Data retrieved from the target post URL
     * @param string    $postUrl         URL of the post
     * @param PostSaver $this            PostSaver itself
     * @since 1.6.3
     */
    do_action(
        'wpcc/post/after_decided_duplicate',
        $this->siteIdToCheck,
        $duplicatePostId,
        $this->data,
        $this->postUrl,
        $this
    );

    // Without saver data the registered post detail savers were never called, so they have nothing to delete.
    if ($saverData) {
        PostDetailsService::getInstance()->delete($this->bot->getSettingsImpl(), $saverData);
    }

    // Delete the attachments known by the post data, if there is a post data.
    if ($this->data) {
        $this->data->deleteAttachments();
    }

    // Delete the saved post and, if it is a different one, the draft post. Empty IDs are ignored by deletePost.
    $idsToDelete = array_unique([$this->postId, $this->draftPostId]);
    foreach ($idsToDelete as $idToDelete) {
        $this->deletePost($idToDelete);
    }

    // Delete the gallery attachments as well, if there are any.
    if ($saverData) {
        $galleryAttachmentIds = $saverData->getGalleryAttachmentIds();

        if ($galleryAttachmentIds) {
            foreach ($galleryAttachmentIds as $attachmentId) {
                wp_delete_post($attachmentId, true);
            }
        }
    }

    $this->resetLastCrawled($this->siteIdToCheck);

    // Set this URL as saved so that this won't be tried to be saved again and unlock it.
    Factory::databaseService()->updateUrlSavedStatus(
        $this->urlTuple->id,
        true,
        null,
        $this->urlTuple->update_count,
        false
    );

    /*
     * Notify the user
     */

    $messageParts = [
        _wpcc('A duplicate post has been found.'),
        sprintf(
            _wpcc('Current URL: %1$s, Duplicate post ID: %2$s, Duplicate post title: %3$s, Site ID: %4$s.'),
            $this->postUrl,
            $duplicatePostId,
            get_the_title($duplicatePostId),
            $this->siteIdToCheck
        ),
        _wpcc('The URL is not saved and it is marked as saved so that it will not be tried again.'),
    ];

    $information = Information::fromInformationMessage(
        InformationMessage::DUPLICATE_POST,
        implode(' ', $messageParts),
        InformationType::INFO
    );

    Informer::add($information->setException($e)->addAsLog());
}
461
/**
 * Delete the thumbnail, the attached images and the post itself with ID. Nothing is sent to trash: the attached
 * images and the post are force-deleted.
 *
 * @param int $postId ID of the post to be deleted. If this is empty, nothing is done.
 * @since 1.8.0
 */
private function deletePost($postId) {
    if (!$postId) return;

    // Delete the thumbnail
    Utils::deletePostThumbnail($postId);

    // Delete the attached images. Force the deletion so that they do not end up in the trash as orphans of a post
    // that is permanently deleted below.
    foreach(get_attached_media('image', $postId) as $mediaPost) {
        wp_delete_post($mediaPost->ID, true);
    }

    // Delete the post without sending it to trash.
    wp_delete_post($postId, true);
}
480
/**
 * Decides which URL will be crawled and assigns {@link urlToCrawl}, {@link isFirstPage} and {@link urlTuple}
 * accordingly, considering whether this is a recrawl or not.
 *
 * @param int|null $lastCrawledUrlId ID of the URL tuple to continue with or to save, if there is any
 * @throws StopSavingException If a URL tuple cannot be found or the found URL tuple is locked
 */
private function prepareUrlTupleToCrawl($lastCrawledUrlId) {
    global $wpdb;

    // What to do when there is no URL tuple to work on: log the given message, reset the CRON meta of the site
    // if that is allowed, inform the user, and stop saving.
    $stopSinceTupleIsMissing = function($logMessage) {
        error_log($logMessage);

        if($this->updateLastCrawled) {
            $this->resetLastCrawled($this->siteIdToCheck);

        } else {
            error_log("WPCC - CRON settings for last-crawled are not reset. This may cause a loop where no post will be saved.");
        }

        $this->addError(ErrorType::URL_TUPLE_NOT_EXIST);

        $information = Information::fromInformationMessage(
            InformationMessage::URL_TUPLE_NOT_EXIST,
            null,
            InformationType::ERROR
        );
        Informer::add($information->addAsLog());

        // Stop crawling
        throw new StopSavingException();
    };

    // Are we continuing a post with its next page, or starting with a post?
    $isNextPageOfPost = $this->nextPageUrl && $lastCrawledUrlId;

    if(!$isNextPageOfPost) {
        // A specified post or a random-ish one will be saved.
        $this->urlTuple = $lastCrawledUrlId ? Factory::databaseService()->getUrlById($lastCrawledUrlId) : null;

        // The specified tuple cannot be used if it does not exist or, when not recrawling, if it is already saved.
        $isUsable = $this->urlTuple && ($this->isRecrawl || !$this->urlTuple->is_saved);

        if(!$isUsable) {
            // Find a URL tuple to save.
            $this->urlTuple = $this->getUrlTupleToCrawl($this->siteIdToCheck, $lastCrawledUrlId);

            if($this->urlTuple === null) {
                $stopSinceTupleIsMissing(
                    "WPCC - No URL is found in the database."
                    . " Site ID to check: " . ($this->siteIdToCheck ? $this->siteIdToCheck : 'does not exist')
                    . ", Last Crawled URL ID: " . ($lastCrawledUrlId ? $lastCrawledUrlId : 'does not exist')
                );
            }
        }

        // The URL of the tuple is the page to crawl.
        $this->urlToCrawl = $this->urlTuple->url;

    } else {
        // A next page of an already-started post will be saved.
        $this->isFirstPage = false;

        $sql  = "SELECT * FROM " . Factory::databaseService()->getDbTableUrlsName() . " WHERE id = %d";
        $rows = $wpdb->get_results($wpdb->prepare($sql, $lastCrawledUrlId));

        if (empty($rows)) {
            $stopSinceTupleIsMissing(
                "WPCC - There are a next page URL and a last crawled URL ID, but the URL does not exist in database."
                . " URL ID: " . $lastCrawledUrlId
                . ", Next Page URL: " . $this->nextPageUrl
            );
        }

        // Work on the found tuple and crawl the next page URL.
        $this->urlTuple   = $rows[0];
        $this->urlToCrawl = $this->nextPageUrl;
    }

    if(static::$DEBUG) var_dump($this->urlTuple);

    // A locked URL tuple must not be processed.
    if(!$this->urlTuple->is_locked) return;

    $this->addError(ErrorType::URL_LOCKED);

    $lockedInformation = Information::fromInformationMessage(
        InformationMessage::URL_LOCKED,
        null,
        InformationType::ERROR
    );
    Informer::add($lockedInformation->addAsLog());

    // Stop crawling
    throw new StopSavingException();
}
585
/**
 * Makes the bot crawl {@link postUrl}, assigns the retrieved PostData to {@link data}, and marks that a request
 * was made.
 *
 * If the bot returns null, the last-crawled CRON meta of the site are reset, an error is registered, and:
 * <ul>
 *  <li> the URL tuple is deleted if it does not have a saved post,</li>
 *  <li> otherwise the URL tuple is marked as saved (and as recrawled when recrawling) and unlocked, so that it is
 *       not tried again and again.</li>
 * </ul>
 *
 * @throws StopSavingException If no data could be retrieved from the URL
 */
private function preparePostData() {
    $this->data = $this->bot->crawlPost($this->postUrl);
    $this->setRequestMade(true);

    // Everything is fine if there is data.
    if($this->data !== null) return;

    // The URL could not be fetched. Reset last crawled so that CRON does not continue with this URL.
    $this->resetLastCrawled($this->siteIdToCheck);

    $this->addError(ErrorType::URL_COULD_NOT_BE_FETCHED);
    Informer::add(Information::fromInformationMessage(
        InformationMessage::URL_COULD_NOT_BE_FETCHED,
        $this->postUrl,
        InformationType::ERROR
    )->addAsLog());

    // If the URL tuple does not have a post, delete it.
    if(!$this->urlTuple->saved_post_id) {
        Factory::databaseService()->deleteUrl($this->urlTuple->id);

        // Write an error. The message is kept on a single line so that no line break ends up in the log entry.
        error_log("WPCC - The URL cannot be fetched (" . $this->postUrl . "). There was a connection error. The URL is deleted.");

        // Stop saving
        throw new StopSavingException();
    }

    // Set this URL as saved and unlock it
    Factory::databaseService()->updateUrlSavedStatus(
        $this->urlTuple->id,
        true,
        $this->urlTuple->saved_post_id,
        $this->urlTuple->update_count,
        false
    );

    // If this is a recrawl, mark this URL as recrawled so that it won't be tried again and again.
    if($this->isRecrawl) {
        Factory::databaseService()->updateUrlRecrawledStatus($this->urlTuple->id, $this->urlTuple->update_count + 1, false);
    }

    // Write an error. The message is kept on a single line so that no line break ends up in the log entry.
    error_log("WPCC - The URL cannot be fetched (" . $this->postUrl . "). There was a connection error. The URL is marked as saved now. Last crawled settings are reset.");

    // Stop saving
    throw new StopSavingException();
}
636
/**
 * Prepares {@link nextPageUrl} and {@link nextPageUrls} using the crawled {@link data}.
 *
 * {@link nextPageUrl} is always reset to an empty string first. It is assigned only if the post is paginated:
 * <ul>
 *  <li> If the page provides its own next page URL, that URL is used.</li>
 *  <li> Otherwise, if the page provides the URLs of all pages, the next page is found in {@link nextPageUrls}. That
 *       list is created from the page data (without the current page) if it is empty.</li>
 * </ul>
 */
private function prepareNextPageUrl() {
    // Reset the next page URL. It is assigned below according to the data.
    $this->nextPageUrl = '';

    // Nothing to do if the post should not be paginated.
    if(!$this->data->isPaginate()) return;

    // The post has a next page URL on each page.
    $nextPageUrlOfPage = $this->data->getNextPageUrl();
    if($nextPageUrlOfPage) {
        $this->nextPageUrl = $nextPageUrlOfPage;
        return;
    }

    // The post may have URLs of all of its pages in a page.
    $allPageUrls = $this->data->getAllPageUrls();
    if(!$allPageUrls) return;

    if(static::$DEBUG) var_dump("All page URLs are found.");

    // If there are no next page URLs, then this is the first time we crawl this post. Store the URLs of all pages
    // as a plain list of URL strings, excluding the current page.
    if(empty($this->nextPageUrls)) {
        if(static::$DEBUG) var_dump('Next Page URLs is false or empty. Get them from the data.');

        $remainingUrls = [];
        foreach($allPageUrls as $pageUrl) {
            if($pageUrl["data"] == $this->postUrl) {
                if(static::$DEBUG) var_dump("Unset " . $pageUrl["data"]);
                continue;
            }

            $remainingUrls[] = $pageUrl["data"];
        }

        $this->nextPageUrls = $remainingUrls;
    }

    if(static::$DEBUG) var_dump("Next Page URLs: ");
    if(static::$DEBUG) var_dump($this->nextPageUrls);

    if(empty($this->nextPageUrls)) return;

    if(static::$DEBUG) var_dump("Next page URLs is not empty. Find next page URL.");
    if(static::$DEBUG) var_dump("Current URL is: " . $this->urlToCrawl);

    // Find the position of the current URL among the next page URLs.
    $currentUrlPos = false;
    foreach($this->nextPageUrls as $key => $url) {
        if(static::$DEBUG) var_dump("Possible Current URL: " . $url);

        if($url == $this->urlToCrawl) {
            $currentUrlPos = $key;

            if(static::$DEBUG) var_dump("Current URL pos is found as " . $currentUrlPos . ", which is " . $url);

            break;
        }
    }

    if($currentUrlPos === false) {
        // The current URL is not among the next page URLs. So, the first URL is the next page.
        if(static::$DEBUG) var_dump("Current URL Position is false. Get the first URL in the list.");
        $this->nextPageUrl = $this->nextPageUrls[0];

    } else if($currentUrlPos < count($this->nextPageUrls) - 1) {
        // The current URL is found and it is not the last one. So, the URL after it is the next page.
        if(static::$DEBUG) var_dump("Current URL position is valid: " . $currentUrlPos . ". Get the next item in the list.");
        $this->nextPageUrl = $this->nextPageUrls[$currentUrlPos + 1];
    }

    // Otherwise, the current URL is the last one and the next page URL stays empty.

    // When there is no next page, the list of next pages is not needed anymore.
    if(!$this->nextPageUrl) $this->nextPageUrls = [];
}
718
/**
 * Decides whether the crawled page has any content and stores the decision in {@link contentExists}. If there is no
 * content, {@link nextPageUrl} and {@link nextPageUrls} are made null so that next pages are not crawled.
 *
 * A next page may be empty although the site links to it. The page is considered empty when the prepared template
 * is not longer than the main post template with all of its short codes removed, i.e. when the short codes were
 * replaced with nothing.
 */
private function checkAndReactToContentExistence() {
    // Assume there is content until proven otherwise.
    $this->contentExists = true;

    // The raw main post template
    $templateMain = $this->getSetting('_post_template_main');

    // Collect every short code that may exist in the template: the predefined ones first...
    $allShortCodes = Factory::postService()->getPredefinedShortCodes();

    // ... and then the ones defined by the user.
    $shortCodeSelectors = $this->getSetting('_post_custom_content_shortcode_selectors');
    if($shortCodeSelectors) {
        foreach ($shortCodeSelectors as $selector) {
            if (!isset($selector["short_code"]) || !$selector["short_code"]) continue;

            $allShortCodes[] = "[" . $selector["short_code"] . "]";
        }
    }

    // Strip the short codes from the raw template, one after another.
    $clearedTemplateMain = array_reduce($allShortCodes, function($template, $shortCode) {
        return str_replace($shortCode, "", $template);
    }, $templateMain);

    $preparedTemplate = $this->data->getTemplate();

    if(static::$DEBUG) var_dump("Cleared Template Main:" . $clearedTemplateMain);
    if(static::$DEBUG) var_dump("Original Template Main: " . $templateMain);
    if(static::$DEBUG) var_dump($allShortCodes);
    if(static::$DEBUG) var_dump(mb_strlen($preparedTemplate) <= mb_strlen($clearedTemplateMain));

    // There is content only if the prepared template is longer than the template without its short codes.
    $hasContent = $preparedTemplate && mb_strlen($preparedTemplate) > mb_strlen($clearedTemplateMain);
    if ($hasContent) return;

    // The page is empty. Do not continue to next pages.
    $this->nextPageUrl   = null;
    $this->nextPageUrls  = null;
    $this->contentExists = false;
}
766
/**
 * Prepares post data array that contains the required WordPress post variables and their values, using {@link data}.
 *
 * Post-related options are read from the site settings if the site does not use the general settings, and from the
 * general options otherwise. If there is an existing draft post and this is not the first page, the new content is
 * appended to the content of the draft post as a new page, and the source URLs of the draft post are kept.
 *
 * As a side effect, the prepared tags of {@link data} are cut down to the tag limit, if a limit is set.
 *
 * @return array Prepared post data array
 */
private function createWPPostData() {
    // Get general settings
    // If this site has different settings, then use them.
    if($this->getSetting('_do_not_use_general_settings')) {
        $allowComments  = $this->getSetting('_wpcc_allow_comments');
        $postStatus     = $this->getSetting('_wpcc_post_status');
        $postType       = $this->getSetting('_wpcc_post_type');
        $postAuthor     = $this->getSetting('_wpcc_post_author');
        $tagLimit       = $this->getSetting('_wpcc_post_tag_limit');
        $postPassword   = $this->getSetting('_wpcc_post_password');

    // Otherwise, go on with general settings.
    } else {
        $allowComments  = get_option('_wpcc_allow_comments');
        $postStatus     = get_option('_wpcc_post_status');
        $postType       = get_option('_wpcc_post_type');
        $postAuthor     = get_option('_wpcc_post_author');
        $tagLimit       = get_option('_wpcc_post_tag_limit', 0);
        $postPassword   = get_option('_wpcc_post_password');
    }

    // Settings are stored as strings. Use the limit as an integer, and ignore it if it is not a number.
    $tagLimit = is_numeric($tagLimit) ? (int) $tagLimit : 0;

    // Apply the tag limit
    $preparedTags = $this->data->getPreparedTags();
    if($preparedTags && $tagLimit > 0 && count($preparedTags) > $tagLimit) {
        $this->data->setPreparedTags(array_slice($preparedTags, 0, $tagLimit));
    }

    // Check if we have a draft post ID to edit
    $content    = '';
    $sourceUrls = [];
    $post       = $this->draftPostId ? get_post($this->draftPostId) : null;

    if($post && !$this->isFirstPage) {
        // This is a next page. Put a page break after the existing content.
        $content = $post->post_content;
        if(!empty($content)) {
            $content = $content . "<!--nextpage-->";
        }

        // Get source URLs. Make sure it is an array, since a URL is appended to it below.
        $sourceUrls = get_post_meta($this->draftPostId, $this->postMetaSourceUrls, true);
        $sourceUrls = $sourceUrls ? (array) $sourceUrls : [];
    }

    // Append current source URL
    $sourceUrls[] = $this->postUrl;

    /*
     * PREPARE POST DATA
     */

    // If post author is not set, then set the first administrator as post author.
    if(!$postAuthor) {
        $userQuery = new WP_User_Query([
            'role'   => 'Administrator',
            'fields' => 'ID',
            'number' => 1
        ]);

        // There may be no administrator. In that case, no author is set, and choosing one is left to WordPress.
        $administratorIds = $userQuery->get_results();
        $postAuthor       = !empty($administratorIds) ? $administratorIds[0] : 0;
    }

    $postData = [
        'ID'            => $this->draftPostId ? $this->draftPostId : 0,
        // If there is a next page to append to this post, then make this post's status draft no matter what.
        // Otherwise, go on with the settings.
        'post_status'   => $this->nextPageUrl ? 'draft' : ($postStatus ? $postStatus : 'draft'),
        'post_type'     => post_type_exists($postType) ? $postType : 'post',
        'post_password' => $postPassword ? $postPassword : '',
        'post_category' => [$this->urlTuple->category_id],
        'meta_input'    => [
            // Store the source URLs just in case
            $this->postMetaSourceUrls => $sourceUrls
        ],
    ];

    // If this is the first page of the newly created post.
    if(!$this->isRecrawl && $this->isFirstPage) {
        // Set the date
        $postData["post_date"] = $this->data->getDateCreated();

        // Set the slug if there exists one
        if ($this->data->getSlug()) $postData['post_name'] = $this->data->getSlug();
    }

    // If content exists, append it to the content of the original post
    if($this->contentExists) {
        $postData['post_content'] = $content . $this->data->getTemplate();

    // Otherwise, do not change the content.
    } else if($post) {
        $postData['post_content'] = $post->post_content;
    }

    // If this is the first page, set other required data
    if($this->isFirstPage || !$post) {
        $postData = array_merge($postData, [
            'post_author'    => $postAuthor,
            'post_title'     => $this->data->getTitle() ? $this->data->getTitle() : '',
            'post_excerpt'   => $this->data->getExcerpt() ? $this->data->getExcerpt()["data"] : '',
            'comment_status' => $allowComments ? 'open' : 'closed',
            'tags_input'     => $this->data->getPreparedTags() ? $this->data->getPreparedTags() : ''
        ]);

        // An existing post keeps its date, slug and GUID.
        if($post) {
            $postData = array_merge($postData, [
                'post_date'     => $post->post_date,
                'post_date_gmt' => $post->post_date_gmt,
                'post_name'     => $post->post_name,
                'guid'          => $post->guid,
            ]);
        }

    // Set everything from the current found post. Even if this is an update, WP requires some variables again.
    } else {
        $postData = array_merge($postData, [
            'post_author'    => $post->post_author,
            'post_title'     => $post->post_title,
            'post_excerpt'   => $post->post_excerpt,
            'comment_status' => $post->comment_status,
            'post_date'      => $post->post_date,
            'post_date_gmt'  => $post->post_date_gmt,
            'post_name'      => $post->post_name,
            'guid'           => $post->guid,
        ]);
    }

    return $postData;
}
903
/**
 * Runs the duplicate check for the current URL and stops the saving process if a duplicate post is found.
 *
 * Nothing is checked when recrawling. This method only detects the duplicate and throws; it performs no
 * clean-up itself.
 *
 * @throws DuplicatePostException If {@link isDuplicate()} reports a previously saved post. The ID of that post
 *                                is passed to the exception.
 */
private function handleIfDuplicate() {
    // A recrawl is never checked for duplicates.
    if ($this->isRecrawl) {
        return;
    }

    // The current page counts as the last page when no next page URL is known.
    $isLastPage = !$this->nextPageUrl;

    $foundPostId = $this->isDuplicate(
        $this->urlTuple->url,
        $this->data->getWpPostData(),
        $this->isFirstPage,
        $isLastPage
    );

    // Abort the saving process only when a duplicate was actually found.
    if ($foundPostId) {
        throw new DuplicatePostException(_wpcc("A duplicate post is found."), $foundPostId);
    }
}
923
/**
 * Passes the prepared post data to wp_insert_post() and stores the returned value in {@link postId}.
 *
 * The data is first run through the 'wpcc/post/wp-post' filter. The 'wpcc/post/before_save' and
 * 'wpcc/post/after_save' actions are fired right before and right after the insertion. Finally, the (possibly
 * filtered) post data is written back to {@link data}.
 *
 * @throws StopSavingException If the 'wpcc/post/wp-post' filter returns null
 */
private function insertPostData() {
    // Start from the post data that was prepared earlier.
    $preparedPostData = $this->data->getWpPostData();

    /**
     * Modify post data before it is saved to the database.
     *
     * @param array     $postData       The data that will be passed to wp_insert_post function.
     * @param PostData  $data           Data retrieved from the target post page's source code
     * @param PostBot   $bot            PostBot object used to retrieve the data from the target page
     * @param PostSaver $this           PostSaver itself
     * @param int       $siteIdToCheck  ID of the site that stores the settings
     * @param string    $postUrl        URL of the post
     * @param array     $urlTuple       An array containing info about the URL. This array is retrieved from the
     *                                  URL table. Hence, it has all the columns and their values in that table.
     * @param bool      $isRecrawl      True if this is fired for a recrawl.
     *
     * @return array|null $postData Modified post data. Return null if you do not want to save the post.
     * @since 1.6.3
     */
    $postData = apply_filters(
        'wpcc/post/wp-post',
        $preparedPostData,
        $this->data,
        $this->bot,
        $this,
        $this->siteIdToCheck,
        $this->postUrl,
        $this->urlTuple,
        $this->isRecrawl
    );

    // A null result from the filter means "do not save this post".
    if (null === $postData) {
        throw new StopSavingException();
    }

    /**
     * Fires just before a post is inserted/updated.
     *
     * @param array     $postData       Data that will be used to create/update a post in the database. If 'ID'
     *                                  key has a valid integer value, this means this is fired for an update.
     * @param PostData  $data           Data retrieved from the target site according to the site settings
     * @param PostSaver $this           PostSaver itself
     * @param int       $siteIdToCheck  ID of the site for which the post is retrieved
     * @param string    $postUrl        URL of the post
     * @param array     $urlTuple       An array containing the URL data. The keys are columns of the DB table
     *                                  storing the URLs.
     * @param bool      $isRecrawl      True if this is a recrawl.
     * @param bool      $isFirstPage    True if this is the first page of the post
     * @since 1.6.3
     */
    do_action(
        'wpcc/post/before_save',
        $postData,
        $this->data,
        $this,
        $this->siteIdToCheck,
        $this->postUrl,
        $this->urlTuple,
        $this->isRecrawl,
        $this->isFirstPage
    );

    // Insert/update the post and remember the returned ID.
    $this->postId = wp_insert_post($postData);

    /**
     * Fires just after a post is inserted/updated.
     *
     * @param array     $postData       Data that was used to create/update a post in the database. If 'ID' key
     *                                  has a valid integer value, this means this is fired for an update.
     * @param PostData  $data           Data retrieved from the target site according to the site settings
     * @param PostSaver $this           PostSaver itself
     * @param int       $siteIdToCheck  ID of the site for which the post is retrieved
     * @param string    $postUrl        URL of the post
     * @param array     $urlTuple       An array containing the URL data. The keys are columns of the DB table
     *                                  storing the URLs.
     * @param bool      $isRecrawl      True if this is a recrawl.
     * @param int       $postId         ID of the saved post
     * @param bool      $isFirstPage    True if this is the first page of the post
     * @since 1.6.3
     */
    do_action(
        'wpcc/post/after_save',
        $postData,
        $this->data,
        $this,
        $this->siteIdToCheck,
        $this->postUrl,
        $this->urlTuple,
        $this->isRecrawl,
        $this->postId,
        $this->isFirstPage
    );

    // A draft post ID that differs from the saved post ID is unexpected. Leave a trace in the error log.
    $hasDraftMismatch = $this->draftPostId && $this->postId != $this->draftPostId;
    if ($hasDraftMismatch) {
        error_log("Draft post ID ({$this->draftPostId}) and inserted post ID ({$this->postId}) are different.");
    }

    if (static::$DEBUG) {
        var_dump("Inserted Post ID: " . $this->postId);
    }

    // The filter might have changed the post data. Keep PostData in sync with what was actually saved.
    $this->data->setWpPostData($postData);
}
1002
/**
 * Finds the taxonomy of the category assigned to the current URL tuple and sets the post's categories under
 * that taxonomy. Falls back to the 'category' taxonomy when no taxonomy can be found. Runs only for the first
 * page of the post.
 *
 * @since 1.8.0
 */
private function saveCategories() {
    // Categories are handled once, on the first page.
    if (!$this->isFirstPage) {
        return;
    }

    // Look for the category item whose ID matches the category ID of the URL tuple.
    $resolvedTaxonomy = null;
    foreach (Utils::getCategories($this->getSettingsImpl()) as $item) {
        $itemId = Utils::array_get($item, 'id');

        // Items without an ID cannot match.
        if ($itemId && $itemId == $this->urlTuple->category_id) {
            $resolvedTaxonomy = Utils::array_get($item, 'taxonomy');
            break;
        }
    }

    // Use the found taxonomy, or 'category' when there is none.
    $this->insertAndSetPostCategories($resolvedTaxonomy ? $resolvedTaxonomy : 'category');
}
1033
/**
 * Assigns categories to the post under the given taxonomy.
 *
 * When the post data has no category names, the category ID of the URL tuple is assigned, provided that a term
 * with that ID exists in the taxonomy. Otherwise, the category names are inserted/retrieved via
 * {@link insertPostCategories()} and the resulting term IDs are assigned. A failure of that last assignment is
 * reported through {@link Informer}.
 *
 * @param string $catTaxonomy Category taxonomy
 * @since 1.8.0
 */
private function insertAndSetPostCategories($catTaxonomy = 'category') {
    // When recrawling, clear the terms that are currently assigned under this taxonomy.
    if ($this->isRecrawl) {
        wp_set_post_terms($this->postId, [], $catTaxonomy, false);
    }

    // Category names found for the post
    $names = $this->data->getCategoryNames();

    // Resolve the term that corresponds to the category ID of the URL tuple.
    $mapTerm   = get_term_by('id', $this->urlTuple->category_id, $catTaxonomy);
    $mapTermId = null;
    if ($mapTerm && isset($mapTerm->term_id)) {
        $mapTermId = $mapTerm->term_id;
    }

    // Without category names, only the category of the URL tuple can be assigned.
    if (!$names) {
        if ($mapTermId !== null) {
            wp_set_post_terms($this->postId, $this->urlTuple->category_id, $catTaxonomy, false);
        }

        return;
    }

    // The user may choose not to use the category of the URL tuple as the parent of the new categories.
    $skipMapCategory = $this->getSettingForCheckbox('_post_category_do_not_add_category_in_map');
    $parentTermId    = $skipMapCategory ? null : $mapTermId;

    // Insert/retrieve the terms for the category names.
    $termIds = $this->insertPostCategories($names, $catTaxonomy, $parentTermId);
    if (!$termIds) {
        return;
    }

    // Assign the terms and stop here if everything went fine.
    $assignResult = wp_set_post_terms($this->postId, $termIds, $catTaxonomy, false);
    if (!is_wp_error($assignResult)) {
        return;
    }

    // Report the failure, adding the error message as details when it is available.
    $info = Informer::addError(_wpcc('Category IDs could not be assigned to the post.'));
    if (is_a($assignResult, \WP_Error::class)) {
        /** @var \WP_Error $assignResult */
        $info->setDetails($info->getDetails() . ' ' . $assignResult->get_error_message());
    }

    $info->addAsLog();
}
1084
/**
 * Inserts/retrieves the terms for the given category names and returns their IDs.
 *
 * Each item of $categoryNames is either a single name or an array of names. An array is treated as a chain:
 * every name is inserted with the previously inserted term of the same chain as its parent. The first name of
 * each chain gets $mainCatTermId as its parent, unless it is null. If any name of a chain cannot be inserted,
 * none of the IDs of that chain are returned.
 *
 * @param array    $categoryNames Category names. Each item is a string or an array of strings.
 * @param string   $catTaxonomy   Name of the taxonomy into which the categories are inserted
 * @param int|null $mainCatTermId Term ID used as the parent of the first category of each chain. Null if no
 *                                parent should be set.
 * @return array Term IDs collected from all chains that were inserted completely
 * @since 1.8.0
 */
private function insertPostCategories($categoryNames, $catTaxonomy, $mainCatTermId) {
    $collectedIds = [];

    foreach ($categoryNames as $nameChain) {
        // Treat a single name as a chain with one item so that both cases share the same code path.
        $nameChain = is_array($nameChain) ? $nameChain : [$nameChain];

        $chainIds      = [];
        $parentId      = $mainCatTermId;
        $chainComplete = true;

        foreach ($nameChain as $name) {
            // Set a parent only if there is one.
            $termArgs = [];
            if ($parentId !== null) {
                $termArgs['parent'] = $parentId;
            }

            $insertedId = Utils::insertTerm($name, $catTaxonomy, $termArgs);

            // A null term ID means the term is not available. The rest of the chain is not processed.
            if ($insertedId === null) {
                $chainComplete = false;
                break;
            }

            $chainIds[] = $insertedId;

            // The next name of the chain becomes a child of this term.
            $parentId = $insertedId;
        }

        // Partially inserted chains are discarded, since all categories of a chain are wanted.
        if ($chainComplete && $chainIds) {
            $collectedIds = array_merge($collectedIds, $chainIds);
        }
    }

    return $collectedIds;
}
1141
1142 /*
1143 *
1144 */
1145
/**
 * Deletes the images attached to the post and the post's thumbnail. This is done only when recrawling the
 * first page of a post that has an ID.
 */
private function maybeDeleteAttachments() {
    // All three conditions must hold: first page, recrawl, and a known post ID.
    $shouldDelete = $this->isFirstPage && $this->isRecrawl && $this->postId;
    if (!$shouldDelete) {
        return;
    }

    // Remove every image attached to the post.
    foreach (get_attached_media('image', $this->postId) as $attachment) {
        wp_delete_post($attachment->ID);
    }

    // Remove the current thumbnail of the post as well.
    Utils::deletePostThumbnail($this->postId);
}
1162
/**
 * Saves the featured image of the post and sets it as the post thumbnail.
 *
 * Runs only for the first page of a post that has an ID. The thumbnail URL of the URL tuple takes precedence.
 * If the URL tuple has no thumbnail URL, the thumbnail data available in the post data is used. If neither is
 * available, nothing is done.
 */
private function saveFeaturedImage() {
    // If this is not the first page or the post ID does not exist, stop.
    if(!$this->isFirstPage || !$this->postId) return;

    $mediaFile = null;
    $thumbnailUrl = $this->urlTuple->thumbnail_url;

    if($thumbnailUrl) {
        // Prepare the thumbnail URL. If it cannot be resolved, the problem is logged and the unresolved URL is
        // still tried.
        try {
            $thumbnailUrl = $this->bot->resolveUrl($thumbnailUrl);
        } catch (\Exception $e) {
            Informer::addError(_wpcc('URL could not be resolved') . ' - ' . $thumbnailUrl)->addAsLog();
        }

        // Save the featured image
        $file = MediaService::getInstance()->saveMedia($thumbnailUrl, $this->getSetting("_wpcc_http_user_agent", null));
        if (!$file) return;

        $mediaFile = new MediaFile($thumbnailUrl, $file['file']);

    } else if($this->data->getThumbnailData()) {
        $mediaFile = $this->data->getThumbnailData();
    }

    // If there is no file, stop.
    if (!$mediaFile) return;

    // Save as attachment and get the attachment id.
    try {
        $thumbnailAttachmentId = MediaService::getInstance()->insertMedia($this->postId, $mediaFile);
    } catch (\Exception $e) {
        // NOTE(review): every exception is reported with this message; confirm that insertMedia() throws only
        // when the media file has no local path.
        Informer::addError(_wpcc('Media file does not have a local path.'))->addAsLog();
        return;
    }

    // Set the media ID
    $mediaFile->setMediaId($thumbnailAttachmentId);

    // Set this attachment as post thumbnail
    set_post_thumbnail($this->postId, $thumbnailAttachmentId);
}
1212
/**
 * Saves the meta keywords of the post as a post meta. The meta key is read from the
 * '_wpcc_meta_keywords_meta_key' option. Nothing is saved unless this is the first page, the post has an ID,
 * there are meta keywords, and the option has a value.
 */
private function saveMetaKeywords() {
    // Only for the first page of a post that has an ID
    if (!$this->isFirstPage || !$this->postId) {
        return;
    }

    // No keywords, nothing to save
    $keywords = $this->data->getMetaKeywords();
    if (!$keywords) {
        return;
    }

    // Save only if a meta key is defined in the options
    $metaKey = get_option('_wpcc_meta_keywords_meta_key');
    if ($metaKey) {
        Utils::savePostMeta($this->postId, $metaKey, $keywords, true);
    }
}
1227
/**
 * Saves the meta description of the post as a post meta. The meta key is read from the
 * '_wpcc_meta_description_meta_key' option. Nothing is saved unless this is the first page, the post has an ID,
 * there is a meta description, and the option has a value.
 */
private function saveMetaDescription() {
    // Only for the first page of a post that has an ID
    if (!$this->isFirstPage || !$this->postId) {
        return;
    }

    // No description, nothing to save
    $description = $this->data->getMetaDescription();
    if (!$description) {
        return;
    }

    // Save only if a meta key is defined in the options
    $metaKey = get_option('_wpcc_meta_description_meta_key');
    if ($metaKey) {
        Utils::savePostMeta($this->postId, $metaKey, $description, true);
    }
}
1242
/**
 * Inserts the attachment media of the post, assigns the resulting media IDs to the media files, and then adds
 * srcset attributes to the media elements in the post content.
 *
 * @return array IDs of the inserted attachments that are gallery images. Empty if there is no post ID or no
 *               attachment data.
 */
private function saveAttachments() {
    $hasAttachments = $this->postId && $this->data->getAttachmentData();
    if (!$hasAttachments) {
        return [];
    }

    $galleryIds = [];

    foreach ($this->data->getAttachmentData() as $attachmentFile) {
        // Insert the media. A file that cannot be inserted is logged and skipped.
        try {
            $insertedId = MediaService::getInstance()->insertMedia($this->postId, $attachmentFile);
        } catch (\Exception $e) {
            Informer::addError(_wpcc('Media file does not have a local path.'))->addAsLog();
            continue;
        }

        // Let the media file know its ID.
        $attachmentFile->setMediaId($insertedId);

        // Collect the IDs of gallery images.
        if ($attachmentFile->isGalleryImage()) {
            $galleryIds[] = $insertedId;
        }
    }

    // The media files now have IDs, so srcset attributes can be added to the content.
    $this->setMediaSrcSetsInContent();

    return $galleryIds;
}
1276
/**
 * Adds srcset attributes to the media elements of the current template and, if that changed the template,
 * updates the post content accordingly.
 *
 * @since 1.8.0
 */
private function setMediaSrcSetsInContent() {
    // The old template is returned only if the template was changed. Otherwise, false is returned.
    $previousTemplate = $this->addMediaSrcSetsToTemplate();

    // Update the post content only when there is a change to reflect.
    if ($previousTemplate !== false) {
        $this->updatePostContentForCurrentTemplate($previousTemplate);
    }
}
1292
/**
 * Modifies the current template of {@link $data} by adding srcset attributes to media elements.
 *
 * Only attachment media that has a media ID and for which WordPress returns a srcset is handled. The template
 * of {@link $data} is replaced only if the resulting template differs from the current one.
 *
 * @return string|false The old template if there is a change in the template. Otherwise, false.
 * @since 1.8.0
 */
private function addMediaSrcSetsToTemplate() {
    // If the function that creates srcset does not exist, stop.
    if (!function_exists('wp_get_attachment_image_srcset')) return false;

    // If there is no attachment data, stop.
    $attachmentData = $this->data->getAttachmentData();
    if (!$attachmentData) return false;

    // Get the template
    $template = $this->data->getTemplate();
    if (!$template) return false;

    // Create a dummy crawler for the post template
    $dummyTemplateCrawler = $this->bot->createDummyCrawler($template);

    foreach($attachmentData as $mediaFile) {
        // If the media does not have an ID, continue with the next one.
        if (!$mediaFile->getMediaId()) continue;

        // Get the srcset
        $srcSet = wp_get_attachment_image_srcset($mediaFile->getMediaId());
        if (!$srcSet) continue;

        // Add the srcset to the corresponding media element. The srcset is captured by value so that each
        // callback keeps the srcset of its own media file, no matter when the callback is invoked.
        $this->bot->modifyMediaElement($dummyTemplateCrawler, $mediaFile, function(MediaFile $mediaFile, \DOMElement $element) use ($srcSet) {
            $element->setAttribute('srcset', $srcSet);
        });
    }

    // Get the modified content
    $newTemplate = $this->bot->getContentFromDummyCrawler($dummyTemplateCrawler);

    // If there is no change, stop.
    if ($newTemplate === $template) return false;

    // Store the modified template. The post content itself is not updated here.
    $this->data->setTemplate($newTemplate);

    return $template;
}
1338
/**
 * Writes the current template of {@link $data} ({@link PostData::getTemplate()}) into the content of the saved
 * post and into the WP post data stored in {@link $data}.
 *
 * For the first page, the content becomes the current template. For other pages, occurrences of the old
 * template in the existing content are replaced with the current template. If there is no existing content,
 * the current template is used as the content.
 *
 * @param string $oldTemplate The template as it was before the modification
 * @since 1.8.0
 */
private function updatePostContentForCurrentTemplate($oldTemplate) {
    // The content cannot be updated without a post ID.
    if (!$this->postId) {
        return;
    }

    $storedPostData = $this->data->getWpPostData();
    $updatedContent = $this->data->getTemplate();

    // On later pages the template is a part of a larger content. Swap only that part so that the content of
    // the previous pages stays as it is.
    if (!$this->isFirstPage) {
        $currentContent = Utils::array_get($storedPostData, 'post_content', null);

        if ($currentContent) {
            $updatedContent = str_replace($oldTemplate, $this->data->getTemplate(), $currentContent);
        }
    }

    // Save the new content to the database.
    wp_update_post([
        'ID'           => $this->postId,
        'post_content' => $updatedContent
    ]);

    // Keep the WP post data stored in PostData in sync.
    $storedPostData['post_content'] = $updatedContent;
    $this->data->setWpPostData($storedPostData);
}
1378
/**
 * Saves custom post meta.
 *
 * Each custom meta item has a "meta_key" and "data". If its "multiple" value is truthy, the data is added as
 * separate meta values (one per array item when the data is an array). Otherwise, it is saved as a single meta
 * value.
 *
 * When the first page is recrawled, the old values of all involved meta keys are deleted once, before anything
 * is saved. Deleting inside the loop would remove the values that were just added by a previous item having the
 * same meta key.
 */
private function saveCustomMeta() {
    if(!$this->postId) return;

    $customMeta = $this->data->getCustomMeta();
    if(!$customMeta) return;

    // Delete old meta values first when updating. Do this only when the first page is being crawled.
    if($this->isFirstPage && $this->isRecrawl) {
        $metaKeys = array_unique(array_map(function($metaData) {
            return $metaData["meta_key"];
        }, $customMeta));

        foreach($metaKeys as $metaKey) {
            delete_post_meta($this->postId, $metaKey);
        }
    }

    foreach($customMeta as $metaData) {
        $metaValue = $metaData["data"];
        $metaKey = $metaData["meta_key"];

        // If it must be saved as multiple
        if(isset($metaData["multiple"]) && $metaData["multiple"]) {

            // If the value is array
            if(is_array($metaValue)) {
                if(empty($metaValue)) continue;

                // Add each value
                foreach($metaValue as $value) {
                    add_post_meta($this->postId, $metaKey, $value, false);
                }

            } else {
                // Otherwise, add it directly
                add_post_meta($this->postId, $metaKey, $metaValue, false);
            }

        } else {
            // Otherwise, save it as a single post meta.
            update_post_meta($this->postId, $metaKey, $metaValue);
        }
    }
}
1417
/**
 * Saves custom post taxonomies.
 *
 * Each custom taxonomy item has a "taxonomy" name and "data" (a single value or an array of values). The values
 * are inserted/retrieved as terms of the taxonomy and assigned to the post. If the item's "append" value is
 * truthy, the terms are passed to wp_set_post_terms() with its append flag set. When the first page is
 * recrawled, the post's relationships to all involved taxonomies are deleted first.
 *
 * @since 1.8.0
 */
private function saveCustomTaxonomies() {
    $hasWork = $this->postId && $this->data->getCustomTaxonomies();
    if (!$hasWork) {
        return;
    }

    // Remove the old terms of the involved taxonomies when the first page is being recrawled.
    if ($this->data->getCustomTaxonomies() && $this->isFirstPage && $this->isRecrawl) {
        $involvedTaxonomies = array_unique(array_map(function($item) {
            return $item['taxonomy'];
        }, $this->data->getCustomTaxonomies()));

        wp_delete_object_term_relationships($this->postId, $involvedTaxonomies);
    }

    foreach ($this->data->getCustomTaxonomies() as $item) {
        $values       = $item['data'];
        $taxonomy     = $item['taxonomy'];
        $shouldAppend = !empty($item['append']);

        // Handle a single value the same way as a list of values.
        if (!is_array($values)) {
            $values = [$values];
        }

        // Insert/retrieve a term for each value. Values for which no term ID is available are skipped.
        $collectedTermIds = [];
        foreach ($values as $value) {
            $insertedId = Utils::insertTerm($value, $taxonomy);
            if ($insertedId) {
                $collectedTermIds[] = $insertedId;
            }
        }

        // Assign the terms, if there are any.
        if ($collectedTermIds) {
            wp_set_post_terms($this->postId, $collectedTermIds, $taxonomy, $shouldAppend);
        }
    }
}
1457
/**
 * Stores the state of the last crawl as post metas of the site. The meta keys are prefixed with the prefix of
 * the current task (see {@link getCronPostMetaPrefix()}). If there is no draft post ID, the site ID is also
 * stored as the last crawled (or last recrawled, when recrawling) site ID option.
 *
 * @param int         $siteId           Last updated site ID
 * @param int         $lastCrawledUrlId ID of the URL from the urls table which is crawled
 * @param string|null $nextPageUrl      Next page URL
 * @param array|null  $nextPageUrls     Next page URLs
 * @param int|null    $draftPostId      Draft post ID
 */
private function updateLastCrawled($siteId, $lastCrawledUrlId, $nextPageUrl, $nextPageUrls, $draftPostId) {
    $metaPrefix = $this->getCronPostMetaPrefix();

    // Meta key suffixes and the values to be saved under them
    $metaValues = [
        '_last_crawled_url_id' => $lastCrawledUrlId,
        '_post_next_page_url'  => $nextPageUrl,
        '_post_next_page_urls' => $nextPageUrls,
        '_post_draft_id'       => $draftPostId ? $draftPostId : '',
    ];

    foreach ($metaValues as $suffix => $value) {
        Utils::savePostMeta($siteId, $metaPrefix . $suffix, $value, true);
    }

    // Save the time as the last meta
    Utils::savePostMeta($siteId, $metaPrefix . '_last_crawled_at', current_time('mysql'), true);

    // A draft post ID means a paged post is still being crawled. In that case, the last crawled site ID is
    // left untouched so that all pages of the current post are finished before a post from another site is
    // retrieved.
    if ($draftPostId) {
        return;
    }

    $optionName = $this->isRecrawl ? $this->optionLastRecrawledSiteId : $this->optionLastCrawledSiteId;
    update_option($optionName, $siteId, false);
}
1481
/**
 * Stores the given site ID in the option named by {@link optionLastRecrawledSiteId}. The option is saved with
 * autoload disabled.
 *
 * @param int $siteId ID of the site to be stored as the last recrawled site
 */
public function updateLastRecrawledSiteId($siteId) {
    $optionName = $this->optionLastRecrawledSiteId;
    update_option($optionName, $siteId, false);
}
1490
/**
 * Resets the CRON metas about the last-crawled URL by calling {@link updateLastCrawled()} with null for the
 * URL ID, the next page URL(s) and the draft post ID.
 *
 * @param int $siteId ID of the site
 */
public function resetLastCrawled($siteId) {
    $lastCrawledUrlId = null;
    $nextPageUrl      = null;
    $nextPageUrls     = null;
    $draftPostId      = null;

    $this->updateLastCrawled($siteId, $lastCrawledUrlId, $nextPageUrl, $nextPageUrls, $draftPostId);
}
1499
/**
 * Get a URL tuple to crawl. This method is good for crawling URLs uniformly, by getting a URL from a different
 * category.
 *
 * The category of the last crawled URL is used as a reference. Among the categories that have an unsaved and
 * unlocked URL for the site, the category following the reference category is picked (wrapping around to the
 * first one), and a URL from that category is returned.
 *
 * @param int      $siteId           Site ID for which a URL tuple will be retrieved
 * @param int|null $lastCrawledUrlId Last crawled URL id from urls table. When null, the most recently saved URL
 *                                   of the site is used as the last crawled URL, if there is one.
 * @return null|object Null or found URL tuple as object
 */
public function getUrlTupleToCrawl($siteId, $lastCrawledUrlId) {
    global $wpdb;
    $tableName = Factory::databaseService()->getDbTableUrlsName();

    // If last crawled URL id is null, then get the first URL that needs to be saved.
    if($lastCrawledUrlId === null) {
        // Get the last crawled URL ID instead of getting the first found URL ID that needs saving.
        $query = "SELECT * FROM $tableName WHERE is_saved = TRUE AND is_locked = FALSE AND saved_post_id IS NOT NULL AND post_id = %d ORDER BY saved_at DESC LIMIT 1";
        $results = $wpdb->get_results($wpdb->prepare($query, $siteId));

        // Then, if a URL is found, call this method with that URL ID so that another URL ID from a different
        // category can be found.
        if(!empty($results)) return $this->getUrlTupleToCrawl($siteId, $results[0]->id);

        // Otherwise, if there is no last crawled URL, get the first URL that needs to be saved.
        $query = "SELECT * FROM $tableName WHERE is_saved = FALSE AND is_locked = FALSE AND saved_post_id IS NULL AND post_id = %d LIMIT 1";
        $results = $wpdb->get_results($wpdb->prepare($query, $siteId));

        return empty($results) ? null : $results[0];
    }

    // Get the last crawled URL as object from the table
    $query = "SELECT * FROM $tableName WHERE id = %d";
    $results = $wpdb->get_results($wpdb->prepare($query, $lastCrawledUrlId));

    // If the URL is not found in the table, then get the first URL that needs to be saved or return null.
    // Recalling this method with a null last crawled URL ID will do the job.
    if(empty($results)) {
        return $this->getUrlTupleToCrawl($siteId, null);
    }

    // Get the tuple as object
    $lastCrawledUrlTuple = $results[0];

    // Get reference category ID and try to get a URL for the next category.
    $referenceCategoryId = $lastCrawledUrlTuple->category_id;

    // Find all categories with an unsaved URL for the target site ID.
    $query = "SELECT DISTINCT category_id FROM $tableName WHERE is_saved = FALSE AND is_locked = FALSE AND saved_post_id IS NULL AND post_id = %d";
    $categoryIds = $wpdb->get_results($wpdb->prepare($query, $siteId));

    // If there is no category, it means there is no URL to be saved. Return null.
    if(empty($categoryIds)) return null;

    // Try to find a URL with a category different than the reference category. If there is no other category, then
    // find a URL with the reference category ID.
    $referenceCategoryPos = null;
    foreach($categoryIds as $key => $categoryIdObject) {
        if($categoryIdObject->category_id == $referenceCategoryId) {
            $referenceCategoryPos = $key;
            break;
        }
    }

    // If the reference category is not found, get the first category in the list.
    // If the reference category is the last item in the list, get the first category in the list.
    // Otherwise, get the category next to the reference category.
    if($referenceCategoryPos === null || $referenceCategoryPos == count($categoryIds) - 1) {
        $targetCategoryId = $categoryIds[0]->category_id;
    } else {
        $targetCategoryId = $categoryIds[$referenceCategoryPos + 1]->category_id;
    }

    // Now, get a URL that needs to be saved and belongs to the target site ID and target category ID.
    $query = "SELECT * FROM $tableName WHERE post_id = %d AND category_id = %d AND is_saved = FALSE AND is_locked = FALSE AND saved_post_id IS NULL LIMIT 1";
    $results = $wpdb->get_results($wpdb->prepare($query, [$siteId, $targetCategoryId]));

    // A URL is expected here, since the category was just listed as having one. Still, the table might have
    // changed between the two queries (or the query might have failed). Return null in that case instead of
    // reading a non-existent array item.
    return empty($results) ? null : $results[0];
}
1579
/**
 * Check if a post is duplicate considering the current settings set by {@link SettingsTrait::setSettings}.
 *
 * Which checks are made is defined by the '_duplicate_check_types' setting. URL and title are checked only for
 * the first page. Content is checked only for the last page. The checks are made in the order URL, title,
 * content, and later checks are skipped once a duplicate is found. When a duplicate is found, an error is added
 * and the duplicate is logged via {@link Informer}.
 *
 * @param string     $url         URL of the post
 * @param array|null $postData    An array having keys named as columns in wp_posts table. And their values, of
 *                                course.
 * @param bool       $isFirstPage True if this check is done for the first page of the post.
 * @param bool       $isLastPage  True if this check is done for the last page of the post.
 * @return false|int Previously saved post ID if this is a duplicate. Otherwise, false.
 */
public function isDuplicate($url, $postData, $isFirstPage, $isLastPage) {
    // If this is not the first and the last page, no need to check for duplicate.
    if(!$isFirstPage && !$isLastPage) return false;

    // Get the current post ID
    $currentPostId = Utils::array_get($postData, "ID");
    if(!$currentPostId) $currentPostId = 0;

    // Get the settings for duplicate checking
    $duplicateCheckSettingValues = $this->getSetting('_duplicate_check_types');

    // The values are stored under 0 key. So, make sure 0 key exists.
    if(!$duplicateCheckSettingValues || !isset($duplicateCheckSettingValues[0])) return false;

    $values       = $duplicateCheckSettingValues[0];
    $checkUrl     = isset($values[PostSaver::DUPLICATE_CHECK_URL]);
    $checkTitle   = isset($values[PostSaver::DUPLICATE_CHECK_TITLE]);
    $checkContent = isset($values[PostSaver::DUPLICATE_CHECK_CONTENT]);

    global $wpdb;

    $id = null;

    // If this is the first page, check URL and title
    if($isFirstPage) {
        // Check the URL
        if($checkUrl && $url) {
            // Check the URL with and without a trailing slash. The meta key is passed as a query parameter
            // like the other values, instead of being interpolated into the query.
            $query = "SELECT post_id
                FROM {$wpdb->postmeta}
                WHERE meta_key = %s
                    AND (meta_value = %s OR meta_value = %s)
                    AND post_id <> %d;
            ";
            $id = $wpdb->get_var($wpdb->prepare(
                $query,
                $this->postMetaPostFirstPageUrl,
                trailingslashit($url),
                untrailingslashit($url),
                $currentPostId
            ));
        }

        // Check the title
        if(!$id && $checkTitle && $postData) {
            $postTitle = Utils::array_get($postData, "post_title");
            $postType = Utils::array_get($postData, "post_type");

            $query = "SELECT ID FROM {$wpdb->posts} WHERE post_title = %s AND post_type = %s AND ID <> %d";
            $id = $wpdb->get_var($wpdb->prepare($query, $postTitle, $postType, $currentPostId));
        }
    }

    // If this is the last page, check the content
    if(!$id && $isLastPage && $checkContent && $postData) {
        $postContent = Utils::array_get($postData, "post_content");
        $postType = Utils::array_get($postData, "post_type");

        $query = "SELECT ID FROM {$wpdb->posts} WHERE post_content = %s AND post_type = %s AND ID <> %d";
        $id = $wpdb->get_var($wpdb->prepare($query, $postContent, $postType, $currentPostId));
    }

    // No duplicate is found.
    if(!$id) return false;

    // The database returns the ID as a string. Cast it so that an integer is returned, as documented.
    $id = (int) $id;

    // A duplicate post is found. Add an error.
    $this->addError(ErrorType::DUPLICATE_POST, $id);
    Informer::add(Information::fromInformationMessage(
        InformationMessage::DUPLICATE_POST,
        _wpcc("Post ID") . ": {$id}",
        InformationType::ERROR
    )->addAsLog());

    return $id;
}
1658
/**
 * Get the prefix of the post meta keys that store the data of the current task. The prefix depends on whether
 * the current task is a recrawl or a regular crawl.
 *
 * @see $cronRecrawlPostMetaPrefix
 * @see $cronCrawlPostMetaPrefix
 * @return string
 */
private function getCronPostMetaPrefix() {
    if ($this->isRecrawl) {
        return $this->cronRecrawlPostMetaPrefix;
    }

    return $this->cronCrawlPostMetaPrefix;
}
1668
/**
 * Sets whether this saver operates as a recrawl.
 *
 * @param bool $isRecrawl See {@link isRecrawl}
 */
public function setIsRecrawl($isRecrawl) {
    $this->isRecrawl = $isRecrawl;
}
1675
1676 /*
1677 * STATIC METHODS
1678 */
1679
/**
 * Get duplicate check types prepared to be shown in a select element. The built-in types (URL, title, content)
 * are merged with the duplicate options retrieved from {@link PostDetailsService}, if there are any.
 *
 * @param array $settings Post settings
 * @return array An array with "values" and "defaults" keys, both of which have an array value. "values" stores
 *               key-description pairs. "defaults" stores key-defaultValue pairs.
 */
public static function getDuplicateCheckOptionsForSelect($settings) {
    // Descriptions of the built-in duplicate check types
    $values = [
        PostSaver::DUPLICATE_CHECK_URL     => _wpcc("URL"),
        PostSaver::DUPLICATE_CHECK_TITLE   => _wpcc("Title"),
        PostSaver::DUPLICATE_CHECK_CONTENT => _wpcc("Content"),
    ];

    // Default values of the built-in duplicate check types
    $defaults = [
        PostSaver::DUPLICATE_CHECK_URL     => 1,
        PostSaver::DUPLICATE_CHECK_TITLE   => 1,
        PostSaver::DUPLICATE_CHECK_CONTENT => 0,
    ];

    // Add the duplicate check options coming from the post details.
    $postSettings  = new SettingsImpl($settings, Factory::postService()->getSingleMetaKeys());
    $detailOptions = PostDetailsService::getInstance()->getDuplicateOptions($postSettings);
    if ($detailOptions) {
        $values   = array_merge($values, $detailOptions["values"]);
        $defaults = array_merge($defaults, $detailOptions["defaults"]);
    }

    return [
        "values"   => $values,
        "defaults" => $defaults,
    ];
}
1711
1712 /*
1713 * GETTERS
1714 */
1715
/**
 * Returns the value currently stored in {@link nextPageUrl}.
 *
 * @return null|string Next page URL, or null if there is none
 */
public function getNextPageUrl() {
    return $this->nextPageUrl;
}
1724
/**
 * Returns the value currently stored in {@link nextPageUrls}. According to the original documentation, this is
 * non-null only if the post has all of its page URLs in a single page.
 *
 * @return array|null Next page URLs, or null if there are none
 */
public function getNextPageUrls() {
    return $this->nextPageUrls;
}
1734
1735}