1import hashlib
2import shutil
3from apis.onlyfans import onlyfans as OnlyFans
4from helpers import db_helper
5from typing import Union
6from apis.onlyfans.onlyfans import auth_details, content_types, create_auth, create_subscription, media_types, start
7from classes.prepare_metadata import create_metadata, format_content, prepare_reformat
8import os
9from datetime import datetime, timedelta
10from itertools import chain, product
11from urllib.parse import urlparse
12import copy
13import json
14import html
15import extras.OFRenamer.start as ofrenamer
16
17import requests
18
19import helpers.main_helper as main_helper
20from types import SimpleNamespace
21from mergedeep import merge, Strategy
22from sqlalchemy.orm import session, sessionmaker, declarative_base
23from helpers.main_helper import choose_option, download_session, export_data, import_archive
24import extras.OFLogin.start_ofl as oflogin
25
26site_name = "OnlyFans"
27json_config = None
28json_global_settings = None
29max_threads = -1
30json_settings = None
31auto_media_choice = ""
32profile_directory = ""
33download_directory = ""
34metadata_directory = ""
35file_directory_format = None
36filename_format = None
37metadata_directory_format = ""
38delete_legacy_metadata = False
39overwrite_files = None
40date_format = None
41ignored_keywords = None
42ignore_type = None
43blacklist_name = None
44webhook = None
45text_length = None
46
47
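# Copies the loaded config and site settings into this module's globals (directories, formats, limits, etc.).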
48def assign_vars(json_auth: auth_details, config, site_settings, site_name):
49 global json_config, json_global_settings, max_threads, json_settings, auto_media_choice, profile_directory, download_directory, metadata_directory, metadata_directory_format, delete_legacy_metadata, overwrite_files, date_format, file_directory_format, filename_format, ignored_keywords, ignore_type, blacklist_name, webhook, text_length
50
51 json_config = config
52 json_global_settings = json_config["settings"]
53 max_threads = json_global_settings["max_threads"]
54 json_settings = site_settings
55 auto_media_choice = json_settings["auto_media_choice"]
56 profile_directory = main_helper.get_directory(
57 json_global_settings['profile_directories'], ".profiles")
58 download_directory = main_helper.get_directory(
59 json_settings['download_directories'], ".sites")
60 metadata_directory = main_helper.get_directory(
61 json_settings['metadata_directories'], ".metadatas")
62 file_directory_format = json_settings["file_directory_format"]
63 filename_format = json_settings["filename_format"]
64 metadata_directory_format = json_settings["metadata_directory_format"]
65 delete_legacy_metadata = json_settings["delete_legacy_metadata"]
66 overwrite_files = json_settings["overwrite_files"]
67 date_format = json_settings["date_format"]
68 ignored_keywords = json_settings["ignored_keywords"]
69 ignore_type = json_settings["ignore_type"]
70 blacklist_name = json_settings["blacklist_name"]
71 webhook = json_settings["webhook"]
72 text_length = json_settings["text_length"]
73
74
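# Logs the auth in, exports mass messages for performer accounts and collects the subscriptions to scrape.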
75def account_setup(auth: create_auth, identifiers: list = [], jobs: dict = {}, auth_count=0):
76 status = False
77 subscriptions = []
78 authed = auth.login()
79 if authed.active:
80 profile_directory = json_global_settings["profile_directories"][0]
81 profile_directory = os.path.abspath(profile_directory)
82 profile_directory = os.path.join(profile_directory, authed.username)
83 profile_metadata_directory = os.path.join(
84 profile_directory, "Metadata")
85 metadata_filepath = os.path.join(
86 profile_metadata_directory, "Mass Messages.json")
87 print
88 if authed.isPerformer:
89 imported = import_archive(metadata_filepath)
90 if "auth" in imported:
91 imported = imported["auth"]
92 mass_messages = authed.get_mass_messages(resume=imported)
93 if mass_messages:
94 main_helper.export_data(mass_messages, metadata_filepath)
95 # chats = api.get_chats()
96 if identifiers or jobs["scrape_names"]:
97 subscriptions += manage_subscriptions(
98 authed, auth_count, identifiers=identifiers)
99 status = True
100 elif auth.auth_details.email and auth.auth_details.password and json_settings["browser"]["auth"]:
101 proxy = None
102 session = auth.session_manager.sessions[0]
103 if session.proxies:
104 proxy = session.proxies["https"]
105 domain = "https://onlyfans.com"
106 cookies = oflogin.login(auth, domain, proxy)
107 return status, subscriptions
108
109# The start lol (scrapes every selected api type for a single subscription)
110
111
112def start_datascraper(authed: create_auth, identifier, site_name, choice_type=None):
113 print("Scrape Processing")
114 subscription = authed.get_subscription(identifier=identifier)
115 if not subscription:
116 return [False, subscription]
117 post_count = subscription.postsCount
118 user_id = str(subscription.id)
119 avatar = subscription.avatar
120 username = subscription.username
121 link = subscription.link
122 print("Name: "+username)
123 api_array = scrape_choice(authed, subscription)
124 api_array = format_options(api_array, "apis")
125 apis = api_array[0]
126 api_string = api_array[1]
127 if not json_settings["auto_api_choice"]:
128 print(f"Apis: {api_string}")
129 value = int(input().strip())
130 else:
131 value = 0
132 if value:
133 apis = [apis[value]]
134 else:
135 apis.pop(0)
136 metadata_locations = {}
137 for item in apis:
138 print("Type: "+item["api_type"])
139 only_links = item["api_array"]["only_links"]
140 post_count = str(item["api_array"]["post_count"])
141 item["api_array"]["username"] = username
142 item["api_array"]["subscription"] = subscription
143 api_type = item["api_type"]
144 results = prepare_scraper(
145 authed, site_name, item)
146 print
147 print("Scrape Completed"+"\n")
148 return [True, subscription]
149
150
151# Checks if the model is valid and grabs content count
152def link_check(authed: create_auth, identifier):
153 y = authed.get_user(identifier)
154 return y
155
156
157# Allows the user to choose which api they want to scrape
158def scrape_choice(authed: create_auth, subscription):
159 user_id = subscription.id
160 post_count = subscription.postsCount
161 archived_count = subscription.archivedPostsCount
162 message = "Scrape: 0 = All | 1 = Images | 2 = Videos | 3 = Audios | 4 = Texts"
163 media_types = [[["", "All"], ["", "Images"], [
164 "", "Videos"], ["", "Audios"], ["", "Texts"]], message]
165 choice_list = choose_option(
166 media_types, auto_media_choice)
167 user_api = OnlyFans.endpoint_links(user_id).users
168 message_api = OnlyFans.endpoint_links(user_id).message_api
169 mass_messages_api = OnlyFans.endpoint_links().mass_messages_api
170 stories_api = OnlyFans.endpoint_links(user_id).stories_api
171 list_highlights = OnlyFans.endpoint_links(user_id).list_highlights
172 post_api = OnlyFans.endpoint_links(user_id).post_api
173 archived_api = OnlyFans.endpoint_links(user_id).archived_posts
174 # ARGUMENTS
175 only_links = False
176 mandatory = [download_directory, only_links]
177 y = ["photo", "video", "stream", "gif", "audio", "text"]
178 u_array = ["You have chosen to scrape {}", [
179 user_api, media_types, *mandatory, post_count], "Profile"]
180 s_array = ["You have chosen to scrape {}", [
181 stories_api, media_types, *mandatory, post_count], "Stories"]
182 h_array = ["You have chosen to scrape {}", [
183 list_highlights, media_types, *mandatory, post_count], "Highlights"]
184 p_array = ["You have chosen to scrape {}", [
185 post_api, media_types, *mandatory, post_count], "Posts"]
186 m_array = ["You have chosen to scrape {}", [
187 message_api, media_types, *mandatory, post_count], "Messages"]
188 a_array = ["You have chosen to scrape {}", [
189 archived_api, media_types, *mandatory, archived_count], "Archived"]
190 array = [u_array, s_array, p_array, a_array, m_array]
191 # array = [u_array, s_array, p_array, a_array, m_array]
192 # array = [s_array, h_array, p_array, a_array, m_array]
193 # array = [s_array]
194 # array = [u_array]
195 # array = [p_array]
196 # array = [a_array]
197 # array = [m_array]
198 new_array = []
199 valid_input = True
200 for xxx in array:
201 if xxx[2] == "Mass Messages":
202 if not subscription.is_me:
203 continue
204 new_item = dict()
205 new_item["api_message"] = xxx[0]
206 new_item["api_array"] = {}
207 new_item["api_array"]["api_link"] = xxx[1][0]
208 new_item["api_array"]["media_types"] = xxx[1][1]
209 new_item["api_array"]["directory"] = xxx[1][2]
210 new_item["api_array"]["only_links"] = xxx[1][3]
211 new_item["api_array"]["post_count"] = xxx[1][4]
212 formatted = format_media_types()
213 final_format = []
214 for choice in choice_list:
215 choice = choice[1]
216 final_format.extend(
217 [result for result in formatted if result[0] == choice])
218 new_item["api_array"]["media_types"] = final_format
219 new_item["api_type"] = xxx[2]
220 if valid_input:
221 new_array.append(new_item)
222 return new_array
223
224
225# Downloads the model's avatar and header
226def profile_scraper(authed: create_auth, site_name, api_type, username, base_directory):
227 reformats = {}
228 reformats["metadata_directory_format"] = json_settings["metadata_directory_format"]
229 reformats["file_directory_format"] = json_settings["file_directory_format"]
230 reformats["file_directory_format"] = reformats["file_directory_format"].replace(
231 "{value}", "")
232 reformats["filename_format"] = json_settings["filename_format"]
233 option = {}
234 option["site_name"] = site_name
235 option["api_type"] = api_type
236 option["username"] = username
237 option["date_format"] = date_format
238 option["maximum_length"] = text_length
239 option["directory"] = base_directory
240 a, b, c = prepare_reformat(option, keep_vars=True).reformat(reformats)
241 print
242 y = authed.get_subscription(identifier=username)
243 override_media_types = []
244 avatar = y.avatar
245 header = y.header
246 if avatar:
247 override_media_types.append(["Avatars", avatar])
248 if header:
249 override_media_types.append(["Headers", header])
250 d_session = download_session()
251 d_session.start(unit='B', unit_scale=True,
252 miniters=1)
253 for override_media_type in override_media_types:
254 new_dict = dict()
255 media_type = override_media_type[0]
256 media_link = override_media_type[1]
257 new_dict["links"] = [media_link]
258 directory2 = os.path.join(b, media_type)
259 os.makedirs(directory2, exist_ok=True)
260 download_path = os.path.join(
261 directory2, media_link.split("/")[-2]+".jpg")
262 if not overwrite_files:
263 if os.path.isfile(download_path):
264 continue
265 r = authed.session_manager.json_request(media_link, stream=True,
266 json_format=False, sleep=False)
267 if not isinstance(r, requests.Response):
268 continue
269 tsize = r.headers.get("content-length")
270 d_session.update_total_size(tsize)
271 downloaded = main_helper.downloader(r, download_path, d_session)
272 if not downloaded:
273 continue
274 d_session.close()
275
276
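# Fetches purchased content, attaches each item to its model's subscription and processes it into the metadata database.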
277def paid_content_scraper(api: start, identifiers=[]):
278 for authed in api.auths:
279 paid_contents = []
280 paid_contents = authed.get_paid_content()
281 if not authed.active:
282 return
283 authed.subscriptions = authed.subscriptions
284 for paid_content in paid_contents:
285 author = paid_content.get("author")
286 author = paid_content.get("fromUser", author)
287 subscription = authed.get_subscription(
288 check=True, identifier=author["id"])
289 if not subscription:
290 subscription = create_subscription(author)
291 authed.subscriptions.append(subscription)
292 if paid_content["responseType"] == "post":
293 if paid_content["isArchived"]:
294 print(f"Model: {author['username']}")
295 # print(
296 # "ERROR, PLEASE REPORT THIS AS AN ISSUE AND TELL ME WHICH MODEL YOU'RE SCRAPING, THANKS")
297 # input()
298 # exit()
299 api_type = paid_content["responseType"].capitalize()+"s"
300 api_media = getattr(subscription.temp_scraped, api_type)
301 api_media.append(paid_content)
302 count = 0
303 my_final_list = []
304 last_name = ""
305 for subscription in authed.subscriptions:
306 if (last_name != subscription.username):
307 my_final_list.append(subscription)
308 last_name = subscription.username
309 authed.subscriptions = my_final_list
310 max_count = len(authed.subscriptions)
311 for subscription in authed.subscriptions:
312 if identifiers and subscription.username not in identifiers:
313 continue
314 string = f"Scraping - {subscription.username} | {count+1} / {max_count}"
315 print(string)
316 subscription.session_manager = authed.session_manager
317 username = subscription.username
318 site_name = "OnlyFans"
319 media_type = format_media_types()
320 count += 1
321 for api_type, paid_content in subscription.temp_scraped:
322 if api_type == "Archived":
323 if any(x for k, x in paid_content if not x):
324 input(
325 "OPEN A ISSUE GITHUB ON GITHUB WITH THE MODEL'S USERNAME AND THIS ERROR, THANKS")
326 exit(0)
327 continue
328 mandatory_directories = {}
329 mandatory_directories["profile_directory"] = profile_directory
330 mandatory_directories["download_directory"] = download_directory
331 mandatory_directories["metadata_directory"] = metadata_directory
332 formatted_directories = format_directories(
333 mandatory_directories, site_name, username, metadata_directory_format, media_type, api_type)
334 formatted_metadata_directory = formatted_directories["metadata_directory"]
335 metadata_path = os.path.join(
336 formatted_metadata_directory, api_type+".db")
337 unrefined_set = media_scraper(paid_content, authed, subscription,
338 formatted_directories, username, api_type)
339 unrefined_set = [unrefined_set]
340 new_metadata = main_helper.format_media_set(unrefined_set)
341 new_metadata = new_metadata["content"]
342 if new_metadata:
343 api_path = os.path.join(api_type, "")
344 old_metadata, delete_metadatas = process_legacy_metadata(
345 authed, new_metadata, formatted_directories, subscription, api_type, api_path, metadata_path, site_name)
346 parent_type = ""
347 new_metadata = new_metadata + old_metadata
348 subscription.set_scraped(api_type, new_metadata)
349 w = process_metadata(api, metadata_path, formatted_directories, new_metadata,
350 site_name, parent_type, api_path, subscription, delete_metadatas)
351 print
352
353
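# Maps the display categories (Images/Videos/Audios/Texts) to the raw API media type strings.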
354def format_media_types():
355 media_types = ["Images", "Videos", "Audios", "Texts"]
356 media_types2 = ["photo", "video", "stream", "gif", "audio", "text"]
357 new_list = []
358 for z in media_types:
359 if z == "Images":
360 new_list.append([z, [media_types2[0]]])
361 if z == "Videos":
362 new_list.append([z, media_types2[1:4]])
363 if z == "Audios":
364 new_list.append([z, [media_types2[4]]])
365 if z == "Texts":
366 new_list.append([z, [media_types2[5]]])
367 return new_list
368
369
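# Normalises a messages API response into a flat list.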
370def process_messages(authed: create_auth, subscription, messages) -> list:
371 if "list" in messages:
372 unrefined_set = messages["list"]
373 elif not messages:
374 unrefined_set = []
375 else:
376 unrefined_set = [messages]
377 return unrefined_set
378
379
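# Matches each queued mass message against chat history so it can be linked to a real sent message.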
380def process_mass_messages(authed: create_auth, subscription, metadata_directory, mass_messages) -> list:
381 def compare_message(queue_id, remote_messages):
382 for message in remote_messages:
383 if "isFromQueue" in message and message["isFromQueue"]:
384 if queue_id == message["queueId"]:
385 return message
386 print
387 print
388 global_found = []
389 chats = []
390 session = authed.session_manager.sessions[0]
391 salt = json_global_settings["random_string"]
392 encoded = f"{session.ip}{salt}"
393 encoded = encoded.encode('utf-8')
394 hash = hashlib.md5(encoded).hexdigest()
395 profile_directory = json_global_settings["profile_directories"][0]
396 profile_directory = os.path.abspath(profile_directory)
397 profile_directory = os.path.join(profile_directory, subscription.username)
398 profile_metadata_directory = os.path.join(profile_directory, "Metadata")
399 mass_message_path = os.path.join(
400 profile_metadata_directory, "Mass Messages.json")
401 chats_path = os.path.join(profile_metadata_directory, "Chats.json")
402 if os.path.exists(chats_path):
403 chats = import_archive(chats_path)
404 date_object = datetime.today()
405 date_string = date_object.strftime("%d-%m-%Y %H:%M:%S")
406 for mass_message in mass_messages:
407 if "status" not in mass_message:
408 mass_message["status"] = ""
409 if "found" not in mass_message:
410 mass_message["found"] = {}
411 if "hashed_ip" not in mass_message:
412 mass_message["hashed_ip"] = ""
413 mass_message["hashed_ip"] = mass_message.get("hashed_ip", hash)
414 mass_message["date_hashed"] = mass_message.get(
415 "date_hashed", date_string)
416 if mass_message["isCanceled"]:
417 continue
418 queue_id = mass_message["id"]
419 text = mass_message["textCropped"]
420 text = html.unescape(text)
421 mass_found = mass_message["found"]
422 media_type = mass_message.get("mediaType")
423 media_types = mass_message.get("mediaTypes")
424 if mass_found or (not media_type and not media_types):
425 continue
426 identifier = None
427 if chats:
428 list_chats = chats
429 for chat in list_chats:
430 identifier = chat["identifier"]
431 messages = chat["messages"]["list"]
432 mass_found = compare_message(queue_id, messages)
433 if mass_found:
434 mass_message["found"] = mass_found
435 mass_message["status"] = True
436 break
437 if not mass_found:
438 list_chats = subscription.search_messages(text=text, limit=2)
439 if not list_chats:
440 continue
441 for item in list_chats["list"]:
442 user = item["withUser"]
443 identifier = user["id"]
444 messages = []
445 print("Getting Messages")
446 keep = ["id", "username"]
447 list_chats2 = [
448 x for x in chats if x["identifier"] == identifier]
449 if list_chats2:
450 chat2 = list_chats2[0]
451 messages = chat2["messages"]["list"]
452 messages = subscription.get_messages(
453 identifier=identifier, resume=messages)
454 for message in messages:
455 message["withUser"] = {
456 k: item["withUser"][k] for k in keep}
457 message["fromUser"] = {
458 k: message["fromUser"][k] for k in keep}
459 mass_found = compare_message(queue_id, messages)
460 if mass_found:
461 mass_message["found"] = mass_found
462 mass_message["status"] = True
463 break
464 else:
465 item2 = {}
466 item2["identifier"] = identifier
467 item2["messages"] = subscription.get_messages(
468 identifier=identifier)
469 chats.append(item2)
470 messages = item2["messages"]["list"]
471 for message in messages:
472 message["withUser"] = {
473 k: item["withUser"][k] for k in keep}
474 message["fromUser"] = {
475 k: message["fromUser"][k] for k in keep}
476 mass_found = compare_message(queue_id, messages)
477 if mass_found:
478 mass_message["found"] = mass_found
479 mass_message["status"] = True
480 break
481 print
482 print
483 print
484 if not mass_found:
485 mass_message["status"] = False
486 main_helper.export_data(chats, chats_path)
487 for mass_message in mass_messages:
488 found = mass_message["found"]
489 if found and found["media"]:
490 user = found["withUser"]
491 identifier = user["id"]
492 print
493 date_hashed_object = datetime.strptime(
494 mass_message["date_hashed"], "%d-%m-%Y %H:%M:%S")
495 next_date_object = date_hashed_object+timedelta(days=1)
496 print
497 if mass_message["hashed_ip"] != hash or date_object > next_date_object:
498 print("Getting Message By ID")
499 x = subscription.get_message_by_id(
500 identifier=identifier, identifier2=found["id"], limit=1)
501 new_found = x["result"]["list"][0]
502 new_found["withUser"] = found["withUser"]
503 mass_message["found"] = new_found
504 mass_message["hashed_ip"] = hash
505 mass_message["date_hashed"] = date_string
506 global_found.append(found)
507 print
508 print
509 main_helper.export_data(
510 mass_messages, mass_message_path)
511 return global_found
512
513
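# Moves/merges legacy metadata files into the new metadata set and returns the paths that can be deleted.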
514def process_legacy_metadata(authed: create_auth, new_metadata_set, formatted_directories, subscription, api_type, api_path, archive_path, site_name):
515 print("Processing metadata.")
516 delete_metadatas = []
517 legacy_metadata2 = formatted_directories["legacy_metadatas"]["legacy_metadata2"]
518 legacy_metadata_path2 = os.path.join(
519 legacy_metadata2, os.path.basename(archive_path))
520 exists = os.path.exists(legacy_metadata_path2)
521 exists2 = os.path.exists(archive_path)
522 if legacy_metadata_path2 != archive_path:
523 if exists and not exists2:
524 os.makedirs(os.path.dirname(archive_path), exist_ok=True)
525 shutil.move(legacy_metadata_path2, archive_path)
526 archive_path = archive_path.replace("db", "json")
527 legacy_metadata_object, delete_legacy_metadatas = legacy_metadata_fixer(
528 formatted_directories, authed)
529 if delete_legacy_metadatas:
530 print("Merging new metadata with legacy metadata.")
531 delete_metadatas.extend(delete_legacy_metadatas)
532 old_metadata_set = import_archive(archive_path)
533 old_metadata_object = create_metadata(
534 authed, old_metadata_set, api_type=api_type)
535 if old_metadata_set:
536 print("Merging new metadata with old metadata.")
537 old_metadata_object = compare_metadata(
538 old_metadata_object, legacy_metadata_object)
539 old_metadata_set = []
540 for media_type, value in old_metadata_object.content:
541 for status, value2 in value:
542 for value3 in value2:
543 x = value3.medias
544 item = value3.convert(keep_empty_items=True)
545 old_metadata_set.append(item)
546 print
547 print
548 print
549 if old_metadata_set:
550 delete_metadatas.append(archive_path)
551 final_set = []
552 for item in old_metadata_set:
553 item["api_type"] = api_type
554 x = [x for x in new_metadata_set if x["post_id"] == item["post_id"]]
555 if not x:
556 final_set.append(item)
557 print
558 print
559 print("Finished processing metadata.")
560 return final_set, delete_metadatas
561
562
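# Exports the merged metadata to SQLite, records its location, optionally runs OFRenamer and removes stale legacy files.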
563def process_metadata(api, archive_path: str, formatted_directories: dict, new_metadata_object, site_name, parent_type, api_path, subscription, delete_metadatas):
564 print
565 Session, api_type, folder = main_helper.export_sqlite(
566 archive_path, new_metadata_object, parent_type)
567 if not subscription.download_info:
568 subscription.download_info["metadata_locations"] = {}
569 subscription.download_info["directory"] = download_directory
570 subscription.download_info["webhook"] = webhook
571 database_name = parent_type if parent_type else api_type
572 subscription.download_info["metadata_locations"][api_type] = {}
573 subscription.download_info["metadata_locations"][api_type][database_name] = archive_path
574 if json_global_settings["helpers"]["renamer"]:
575 print("Renaming files.")
576 new_metadata_object = ofrenamer.start(
577 api, Session, parent_type, api_type, api_path, site_name, subscription, folder, json_settings)
578 if delete_legacy_metadata:
579 for old_metadata in delete_metadatas:
580 if os.path.exists(old_metadata):
581 os.remove(old_metadata)
582
583
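# Builds the formatted profile/download/metadata paths (plus legacy locations) for a user and api type.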
584def format_directories(directories, site_name, username, unformatted, locations: list = [], api_type="") -> dict:
585 x = {}
586 x["profile_directory"] = ""
587 x["legacy_metadatas"] = {}
588 for key, directory in directories.items():
589 option = {}
590 option["site_name"] = site_name
591 option["username"] = username
592 option["directory"] = directory
593 option["postedAt"] = datetime.today()
594 option["date_format"] = date_format
595 option["text_length"] = text_length
596 prepared_format = prepare_reformat(option)
597 if key == "profile_directory":
598 x["profile_directory"] = prepared_format.directory
599 if key == "download_directory":
600 x["download_directory"] = prepared_format.directory
601 legacy_model_directory = x["legacy_model_directory"] = os.path.join(
602 directory, site_name, username)
603 x["legacy_metadatas"]["legacy_metadata"] = os.path.join(
604 legacy_model_directory, api_type, "Metadata")
605 x["legacy_metadatas"]["legacy_metadata2"] = os.path.join(
606 legacy_model_directory, "Metadata")
607 if key == "metadata_directory":
608 x["metadata_directory"] = main_helper.reformat(
609 prepared_format, unformatted)
610 x["locations"] = []
611 for location in locations:
612 directories = {}
613 cats = ["Unsorted", "Free", "Paid"]
614 for cat in cats:
615 cat2 = cat
616 if "Unsorted" in cat2:
617 cat2 = ""
618 path = os.path.join(api_type, cat2, location[0])
619 directories[cat.lower()] = path
620 y = {}
621 y["sorted_directories"] = directories
622 y["media_type"] = location[0]
623 y["alt_media_type"] = location[1]
624 x["locations"].append(y)
625 return x
626# Prepares the API links to be scraped
627
628
629def prepare_scraper(authed: create_auth, site_name, item):
630 api_type = item["api_type"]
631 api_array = item["api_array"]
632 subscription: create_subscription = api_array["subscription"]
633 media_type = api_array["media_types"]
634 username = api_array["username"]
635 master_set = []
636 pool = authed.pool
637 mandatory_directories = {}
638 mandatory_directories["profile_directory"] = profile_directory
639 mandatory_directories["download_directory"] = download_directory
640 mandatory_directories["metadata_directory"] = metadata_directory
641 formatted_directories = format_directories(
642 mandatory_directories, site_name, username, metadata_directory_format, media_type, api_type)
643 legacy_model_directory = formatted_directories["legacy_model_directory"]
644 formatted_download_directory = formatted_directories["download_directory"]
645 formatted_metadata_directory = formatted_directories["metadata_directory"]
646 if api_type == "Profile":
647 profile_scraper(authed, site_name, api_type, username,
648 formatted_download_directory)
649 return True
650 if api_type == "Stories":
651 master_set = subscription.get_stories()
652 highlights = subscription.get_highlights()
653 valid_highlights = []
654 for highlight in highlights:
655 if "error" == highlight:
656 continue
657 highlight = subscription.get_highlights(
658 hightlight_id=highlight["id"])
659 valid_highlights.append(highlight)
660 master_set.extend(valid_highlights)
661 print
662 if api_type == "Posts":
663 master_set = subscription.get_posts()
664 if api_type == "Archived":
665 master_set = subscription.get_archived(authed)
666 if api_type == "Messages":
667 unrefined_set = subscription.get_messages()
668 unrefined_set = process_messages(authed, subscription, unrefined_set)
669 mass_messages = getattr(authed, "mass_messages")
670 if subscription.is_me and mass_messages:
671 mass_messages = getattr(authed, "mass_messages")
672 unrefined_set2 = process_mass_messages(authed,
673 subscription, formatted_metadata_directory, mass_messages)
674 unrefined_set += unrefined_set2
675 master_set = [unrefined_set]
676 master_set2 = master_set
677 parent_type = ""
678 if "Archived" == api_type:
679 unrefined_set = []
680 for master_set3 in master_set2:
681 if not isinstance(master_set3, dict):
682 continue
683 parent_type = master_set3["type"]
684 results = master_set3["results"]
685 unrefined_result = pool.starmap(media_scraper, product(
686 results, [authed], [subscription], [formatted_directories], [username], [api_type], [parent_type]))
687 unrefined_set.append(unrefined_result)
688 unrefined_set = list(chain(*unrefined_set))
689 else:
690 unrefined_set = pool.starmap(media_scraper, product(
691 master_set2, [authed], [subscription], [formatted_directories], [username], [api_type], [parent_type]))
692 unrefined_set = [x for x in unrefined_set]
693 new_metadata = main_helper.format_media_set(unrefined_set)
694 if new_metadata:
695 new_metadata = new_metadata["content"]
696 metadata_path = os.path.join(
697 formatted_metadata_directory, api_type+".db")
698 api_path = os.path.join(api_type, parent_type)
699 old_metadata, delete_metadatas = process_legacy_metadata(
700 authed, new_metadata, formatted_directories, subscription, api_type, api_path, metadata_path, site_name)
701 new_metadata = new_metadata + old_metadata
702 subscription.set_scraped(api_type, new_metadata)
703 w = process_metadata(authed, metadata_path, formatted_directories, new_metadata,
704 site_name, parent_type, api_path, subscription, delete_metadatas)
705 print
706 else:
707 print("No "+api_type+" Found.")
708 return True
709
710
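# Collects every legacy metadata file, converts it to the new format and merges everything into one object.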
711def legacy_metadata_fixer(formatted_directories: dict, authed: create_auth) -> tuple[create_metadata, list]:
712 delete_legacy_metadatas = []
713 legacy_metadatas = formatted_directories["legacy_metadatas"]
714 new_metadata_directory = formatted_directories["metadata_directory"]
715 old_metadata_directory = os.path.dirname(
716 legacy_metadatas["legacy_metadata"])
717 metadata_name = os.path.basename(f"{old_metadata_directory}.json")
718 q = []
719 for key, legacy_directory in legacy_metadatas.items():
720 if legacy_directory == formatted_directories["metadata_directory"]:
721 continue
722 if os.path.exists(legacy_directory):
723 folders = os.listdir(legacy_directory)
724 api_names = [metadata_name]
725 metadata_names = media_types()
726 metadata_names = [f"{k}.json" for k, v in metadata_names]
727 api_names += metadata_names
728 print
729 type_one_files = main_helper.remove_mandatory_files(
730 folders, keep=api_names)
731 new_format = []
732 for type_one_file in type_one_files:
733 api_type = type_one_file.removesuffix(".json")
734 legacy_metadata_path = os.path.join(
735 legacy_directory, type_one_file)
736 legacy_metadata = import_archive(legacy_metadata_path)
737 if legacy_metadata:
738 delete_legacy_metadatas.append(legacy_metadata_path)
739 legacy_metadata = create_metadata(
740 authed, legacy_metadata, api_type=api_type).convert()
741 new_format.append(legacy_metadata)
742 new_format = dict(
743 merge({}, *new_format, strategy=Strategy.ADDITIVE))
744 old_metadata_object = create_metadata(authed, new_format)
745 if legacy_directory != new_metadata_directory:
746 import_path = os.path.join(legacy_directory, metadata_name)
747 new_metadata_set = import_archive(
748 import_path)
749 if new_metadata_set:
750 new_metadata_object2 = create_metadata(
751 authed, new_metadata_set)
752 old_metadata_object = compare_metadata(
753 new_metadata_object2, old_metadata_object)
754 q.append(old_metadata_object)
755 print
756 print
757 results = create_metadata()
758 for merge_into in q:
759 print
760 results = compare_metadata(
761 results, merge_into)
762 print
763 print
764 return results, delete_legacy_metadatas
765
766
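# Matches an old media item to a new one by media_id, or by comparing link filenames when the id is missing.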
767def test(new_item, old_item):
768 new_found = None
769 if old_item.media_id is None:
770 for link in old_item.links:
771 # Handle Links
772 a = urlparse(link)
773 link2 = os.path.basename(a.path)
774 if any(link2 in new_link for new_link in new_item.links):
775 new_found = new_item
776 break
777 print
778 elif old_item.media_id == new_item.media_id:
779 new_found = new_item
780 return new_found
781
782
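# Merges new metadata into the old metadata, de-duplicates posts by post_id and carries over fresh fields.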
783def compare_metadata(new_metadata: create_metadata, old_metadata: create_metadata) -> create_metadata:
784 for key, value in old_metadata.content:
785 new_value = getattr(new_metadata.content, key, None)
786 if not new_value:
787 continue
788 if not value:
789 setattr(old_metadata, key, new_value)
790 for key2, value2 in value:
791 new_value2 = getattr(new_value, key2)
792 seen = set()
793 old_status = []
794 for d in value2:
795 if d.post_id not in seen:
796 seen.add(d.post_id)
797 old_status.append(d)
798 else:
799 print
800 setattr(value, key2, old_status)
801 value2 = old_status
802 new_status = new_value2
803 for post in old_status:
804 if key != "Texts":
805 for old_media in post.medias:
806 # if old_item.post_id == 1646808:
807 # l = True
808 new_found = None
809 new_items = [
810 x for x in new_status if post.post_id == x.post_id]
811 if new_items:
812 for new_item in (x for x in new_items if not new_found):
813 for new_media in (x for x in new_item.medias if not new_found):
814 new_found = test(new_media, old_media)
815 print
816 if new_found:
817 for key3, v in new_found:
818 if key3 in ["directory", "downloaded", "size", "filename"]:
819 continue
820 setattr(old_media, key3, v)
821 setattr(new_found, "found", True)
822 else:
823 new_items = [
824 x for x in new_status if post.post_id == x.post_id]
825 if new_items:
826 new_found = new_items[0]
827 for key3, v in new_found:
828 if key3 in ["directory", "downloaded", "size", "filename"]:
829 continue
830 setattr(post, key3, v)
831 setattr(new_found, "found", True)
832 print
833 for new_post in new_status:
834 not_found = []
835 if key != "Texts":
836 not_found = [
837 new_post for media in new_post.medias if not getattr(media, "found", None)][:1]
838 else:
839 found = getattr(new_post, "found", None)
840 if not found:
841 not_found.append(new_post)
842
843 if not_found:
844 old_status += not_found
845 old_status.sort(key=lambda x: x.post_id, reverse=True)
846 new_metadata = old_metadata
847 return new_metadata
848
849# Scrapes the API for content
850
851
852def media_scraper(results, authed: create_auth, subscription: create_subscription, formatted_directories, username, api_type, parent_type="", print_output=True):
853 new_set = {}
854 new_set["content"] = []
855 directories = []
856 session = authed.session_manager.sessions[0]
857 if api_type == "Stories":
858 if "stories" in results:
859 items = results["stories"]
860 for item in items:
861 item["text"] = results["title"]
862 results = results["stories"]
863 if api_type == "Archived":
864 print
865 pass
866 if api_type == "Posts":
867 print
868 if api_type == "Messages":
869 pass
870 if not results or "error" in results:
871 return new_set
872 if "result" in results:
873 session = results["session"]
874 results = results["result"]
875 if "error" in results:
876 return new_set
877 download_path = formatted_directories["download_directory"]
878 for location in formatted_directories["locations"]:
879 sorted_directories = copy.copy(location["sorted_directories"])
880 master_date = "01-01-0001 00:00:00"
881 media_type = location["media_type"]
882 alt_media_type = location["alt_media_type"]
883 file_directory_format = json_settings["file_directory_format"]
884 if api_type == "Archived":
885 x = file_directory_format.split(os.sep)
886 for y in x:
887 substr = "{api_type}"
888 if substr == y:
889 new_path = os.path.join(substr, parent_type)
890 file_directory_format = file_directory_format.replace(
891 substr, new_path)
892 break
893 print
894 print
895 seperator = " | "
896 if print_output:
897 print(
898 f"Scraping [{seperator.join(alt_media_type)}]. Should take less than a minute.")
899 for media_api in results:
900 isReportedByMe = media_api.get("isReportedByMe")
901 if isReportedByMe:
902 continue
903 post_id = media_api["id"]
904 new_post = {}
905 new_post["medias"] = []
906 rawText = media_api.get("rawText", "")
907 text = media_api.get("text", "")
908 final_text = rawText if rawText else text
909 previews = media_api.get("preview", [])
910 if api_type == "Stories":
911 previews = []
912 if api_type == "Messages":
913 media_api["rawText"] = media_api["text"]
914 previews = media_api.get("previews", None)
915 if api_type == "Mass Messages":
916 media_user = media_api["fromUser"]
917 media_username = media_user["username"]
918 if media_username != username:
919 continue
920 elif api_type == "Archived":
921 test_previews = media_api.get("previews", [])
922 if test_previews:
923 input("REPORT THIS ERROR TO GITHUB, I'LL KNOW WHAT IT IS :)")
924 # if previews == None:
925 # # REMOVE BEFORE PUSHING COMMIT
926 # input("PREVIEW NOT FOUND")
927 date = media_api["postedAt"] if "postedAt" in media_api else media_api["createdAt"]
928 if date == "-001-11-30T00:00:00+00:00":
929 date_string = master_date
930 date_object = datetime.strptime(
931 master_date, "%d-%m-%Y %H:%M:%S")
932 else:
933 date_object = datetime.fromisoformat(date)
934 date_string = date_object.replace(tzinfo=None).strftime(
935 "%d-%m-%Y %H:%M:%S")
936 master_date = date_string
937 new_post["post_id"] = media_api["id"]
938 new_post["text"] = final_text
939 new_post["postedAt"] = date_string
940 new_post["paid"] = False
941 new_post["preview_media_ids"] = previews
942 new_post["api_type"] = api_type
943 price = new_post["price"] = media_api["price"] if "price" in media_api else None
944 if price is None:
945 price = 0
946 canPurchase = media_api.get("canPurchase", None)
947 if price:
948 if all(media["canView"] for media in media_api["media"]):
949 new_post["paid"] = True
950 else:
951 print
952 for media in media_api["media"]:
953 media_id = media["id"]
954 date = "-001-11-30T00:00:00+00:00"
955 size = 0
956 link = ""
957 preview_link = ""
958 if "source" in media:
959 quality_key = "source"
960 source = media[quality_key]
961 link = source[quality_key]
962 if link:
963 if media["type"] == "video":
964 qualities = media["videoSources"]
965 qualities = dict(
966 sorted(qualities.items(), reverse=False))
967 qualities[quality_key] = source[quality_key]
968 for quality, quality_link in qualities.items():
969 video_quality_json = json_settings["video_quality"]
970 video_quality_json = video_quality_json.removesuffix(
971 "p")
972 if quality == video_quality_json:
973 if quality_link:
974 link = quality_link
975 break
976 print
977 print
978 print
979
980 size = media["info"]["preview"]["size"] if "info" in media else 1
981 if "src" in media:
982 link = media["src"]
983 size = media["info"]["preview"]["size"] if "info" in media else 1
984 date = media_api["createdAt"]
985 matches = ["us", "uk", "ca", "ca2", "de"]
986
987 if not link:
988 continue
989 url = urlparse(link)
990 subdomain = url.hostname.split('.')[0]
991 preview_link = media["preview"]
992 if any(subdomain in nm for nm in matches):
993 subdomain = url.hostname.split('.')[1]
994 if "upload" in subdomain:
995 continue
996 if "convert" in subdomain:
997 link = preview_link
998 rules = [link == "",
999 preview_link == ""]
1000 if all(rules):
1001 continue
1002 new_media = dict()
1003 new_media["media_id"] = media_id
1004 new_media["links"] = []
1005 new_media["media_type"] = media_type
1006 new_media["preview"] = False
1007 if int(media_id) in new_post["preview_media_ids"]:
1008 new_media["preview"] = True
1009 for xlink in link, preview_link:
1010 if xlink:
1011 new_media["links"].append(xlink)
1012 break
1013
1014 if media["type"] not in alt_media_type:
1015 continue
1016 session.links.extend(new_media["links"])
1017 matches = [s for s in ignored_keywords if s in final_text]
1018 if matches:
1019 print("Matches: ", matches)
1020 continue
1021 filename = link.rsplit('/', 1)[-1]
1022 filename, ext = os.path.splitext(filename)
1023 ext = ext.replace(".", "").split('?')[0]
1024
1025 option = {}
1026 option = option | new_post
1027 option["site_name"] = "OnlyFans"
1028 option["media_id"] = media_id
1029 option["filename"] = filename
1030 option["api_type"] = api_type
1031 option["media_type"] = media_type
1032 option["ext"] = ext
1033 option["username"] = username
1034 option["date_format"] = date_format
1035 option["text_length"] = text_length
1036 option["directory"] = download_path
1037 option["preview"] = new_media["preview"]
1038
1039 prepared_format = prepare_reformat(option)
1040 file_directory = main_helper.reformat(
1041 prepared_format, file_directory_format)
1042 prepared_format.directory = file_directory
1043 file_path = main_helper.reformat(
1044 prepared_format, filename_format)
1045 new_media["directory"] = os.path.join(file_directory)
1046 new_media["filename"] = os.path.basename(file_path)
1047 if file_directory not in directories:
1048 directories.append(file_directory)
1049 new_media["linked"] = None
1050 for k, v in subscription.temp_scraped:
1051 if k == api_type:
1052 continue
1053 if k == "Archived":
1054 v = getattr(v, api_type, [])
1055 if v:
1056 for post in v:
1057 medias = post.get("medias", [])
1058 if not medias:
1059 medias = post.get("media", [])
1060 found_medias = []
1061 for temp_media in medias:
1062 temp_filename = temp_media.get("filename")
1063 if temp_filename:
1064 if temp_filename == new_media["filename"]:
1065 found_medias.append(temp_media)
1066 else:
1067 continue
1068 # found_medias = [x for x in medias
1069 # if x["filename"] == new_media["filename"]]
1070 if found_medias:
1071 for found_media in found_medias:
1072 found_media["linked"] = api_type
1073 new_media["linked"] = post["api_type"]
1074 new_media["filename"] = f"linked_{new_media['filename']}"
1075 print
1076 print
1077 print
1078 print
1079 new_post["medias"].append(new_media)
1080 found_post = [x for x in new_set["content"]
1081 if x["post_id"] == post_id]
1082 if found_post:
1083 found_post = found_post[0]
1084 found_post["medias"] += new_post["medias"]
1085 else:
1086 new_set["content"].append(new_post)
1087 new_set["directories"] = directories
1088 return new_set
1089
1090
1091# Downloads scraped content
1092class download_media():
1093 def __init__(self, authed: create_auth = None, subscription=None) -> None:
1094 username = subscription.username
1095 download_info = subscription.download_info
1096 if download_info:
1097 self.downloaded = True
1098 metadata_locations = download_info["metadata_locations"]
1099 directory = download_info["directory"]
1100 for parent_type, value in metadata_locations.items():
1101 for api_type, metadata_path in value.items():
1102 Session, engine = db_helper.create_database_session(
1103 metadata_path)
1104 database_session = Session()
1105 database_name = api_type.lower()
1106 db_collection = db_helper.database_collection()
1107 database = db_collection.chooser(database_name)
1108 api_table = database.api_table
1109 media_table = database.media_table
1110 result = database_session.query(media_table).all()
1111 media_type_list = media_types()
1112 for r in result:
1113 item = getattr(media_type_list, r.media_type)
1114 item.append(r)
1115 media_type_list = media_type_list.__dict__
1116 for location, v in media_type_list.items():
1117 if location == "Texts":
1118 continue
1119 media_set = v
1120 media_set_count = len(media_set)
1121 if not media_set:
1122 continue
1123 string = "Download Processing\n"
1124 string += f"Name: {username} | Type: {api_type} | Count: {media_set_count} {location} | Directory: {directory}\n"
1125 print(string)
1126 d_session = download_session()
1127 d_session.start(unit='B', unit_scale=True,
1128 miniters=1)
1129 pool = authed.session_manager.pool
1130 pool.starmap(self.prepare_download, product(
1131 media_set, [authed], [api_type], [subscription], [d_session]))
1132 d_session.close()
1133 database_session.commit()
1134 else:
1135 self.downloaded = False
1136
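# Resolves a live link for the media item (re-scraping the post if the cached link is dead) and downloads the file.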
1137 def prepare_download(self, media, authed: create_auth, api_type, subscription: create_subscription, d_session):
1138 return_bool = True
1139 if not overwrite_files and media.downloaded:
1140 return
1141 count = 0
1142 sessions = [
1143 x for x in authed.session_manager.sessions if media.link in x.links]
1144 if not sessions:
1145 return
1146 session = sessions[0]
1147 while count < 11:
1148 links = [media.link]
1149
1150 def choose_link(session, links):
1151 for link in links:
1152 r = authed.session_manager.json_request(link, session, "HEAD",
1153 stream=False, json_format=False)
1154 if not isinstance(r, requests.Response):
1155 continue
1156
1157 header = r.headers
1158 content_length = header.get('content-length')
1159 if not content_length:
1160 continue
1161 content_length = int(content_length)
1162 return [link, content_length]
1163 result = choose_link(session, links)
1164 if not result:
1165 result_list = []
1166 if api_type == "Messages":
1167 result = subscription.get_message_by_id(
1168 identifier2=media.post_id, limit=1)["result"]
1169 result_list = result.get("list")
1170 if not result_list:
1171 print
1172 elif api_type == "Posts":
1173 result = subscription.get_post(media.post_id)
1174 result_list = [result.get("result")]
1175 else:
1176 print
1177 mandatory_directories = {}
1178 mandatory_directories["profile_directory"] = profile_directory
1179 mandatory_directories["download_directory"] = download_directory
1180 mandatory_directories["metadata_directory"] = metadata_directory
1181 media_type = format_media_types()
1182 formatted_directories = format_directories(
1183 mandatory_directories, site_name, subscription.username, metadata_directory_format, media_type, api_type)
1184 unrefined_set = media_scraper(result_list, authed, subscription,
1185 formatted_directories, subscription.username, api_type, print_output=False)
1186 unrefined_set = [unrefined_set]
1187 new_metadata = main_helper.format_media_set(unrefined_set)
1188 new_metadata = new_metadata["content"]
1189 found_post = main_helper.format_media_set(new_metadata)
1190 if found_post:
1191 found_media = [x for x in found_post["medias"]
1192 if x["media_id"] == media.media_id]
1193 if found_media:
1194 new_link = found_media[0]["links"][0]
1195 media.link = new_link
1196 count += 1
1197 continue
1198 link = result[0]
1199 content_length = result[1]
1200 media.size = content_length
1201 date_object = media.created_at
1202 download_path = os.path.join(
1203 media.directory, media.filename)
1204 timestamp = date_object.timestamp()
1205 if not overwrite_files:
1206 if main_helper.check_for_dupe_file(download_path, content_length):
1207 main_helper.format_image(download_path, timestamp)
1208 return_bool = False
1209 media.downloaded = True
1210 break
1211 r = authed.session_manager.json_request(
1212 link, session, stream=True, json_format=False)
1213 if not isinstance(r, requests.Response):
1214 return_bool = False
1215 count += 1
1216 continue
1217 d_session.update_total_size(content_length)
1218 downloader = main_helper.downloader(
1219 r, download_path, d_session, count)
1220 if not downloader:
1221 count += 1
1222 continue
1223 main_helper.format_image(download_path, timestamp)
1224 media.downloaded = True
1225 break
1226 if not media.downloaded:
1227 print(f"Download Failed: {media.link}")
1228 d_session.colour = "Red"
1229
1230 return return_bool
1231
1232
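# Fetches subscriptions, drops blacklisted accounts and those excluded by ignore_type, then sorts the rest.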
1233def manage_subscriptions(authed: create_auth, auth_count=0, identifiers: list = [], refresh: bool = True):
1234 results = authed.get_subscriptions(
1235 identifiers=identifiers, refresh=refresh)
1236 if blacklist_name:
1237 r = authed.get_lists()
1238 if not r:
1239 return [False, []]
1240 new_results = [c for c in r if blacklist_name == c["name"]]
1241 if new_results:
1242 item = new_results[0]
1243 list_users = item["users"]
1244 if int(item["usersCount"]) > 2:
1245 list_id = str(item["id"])
1246 list_users = authed.get_lists_users(list_id)
1247 users = list_users
1248 bl_ids = [x["username"] for x in users]
1249 results2 = results.copy()
1250 for result in results2:
1251 identifier = result.username
1252 if identifier in bl_ids:
1253 print("Blacklisted: "+identifier)
1254 results.remove(result)
1255 results.sort(key=lambda x: x.subscribedByData.expiredAt)
1256 results.sort(key=lambda x: x.is_me, reverse=True)
1257 results2 = []
1258 hard_blacklist = ["onlyfanscreators"]
1259 for result in results:
1260 result.auth_count = auth_count
1261 username = result.username
1262 bl = [x for x in hard_blacklist if x == username]
1263 if bl:
1264 continue
1265 now = datetime.utcnow().date()
1266 # subscribedBy = result["subscribedBy"]
1267 subscribedByData = result.subscribedByData
1268 result_date = subscribedByData.expiredAt if subscribedByData else datetime.utcnow(
1269 ).isoformat()
1270 price = subscribedByData.price
1271 subscribePrice = subscribedByData.subscribePrice
1272 result_date = datetime.fromisoformat(
1273 result_date).replace(tzinfo=None).date()
1274 if ignore_type in ["paid"]:
1275 if price > 0:
1276 continue
1277 if ignore_type in ["free"]:
1278 if subscribePrice == 0:
1279 continue
1280 results2.append(result)
1281 authed.subscriptions = results2
1282 return results2
1283
1284
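# Builds the numbered selection menu (names list + display string) for auths, usernames or api types.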
1285def format_options(f_list: Union[list[create_auth], list[create_subscription], list[dict], list[str]], choice_type: str) -> list:
1286 new_item = {}
1287 new_item["auth_count"] = -1
1288 new_item["username"] = "All"
1289 new_item = json.loads(json.dumps(
1290 new_item), object_hook=lambda d: SimpleNamespace(**d))
1291 f_list = [new_item]+f_list
1292 name_count = len(f_list)
1293
1294 count = 0
1295 names = []
1296 string = ""
1297 seperator = " | "
1298 last_name = ""
1299 if name_count > 1:
1300 if "users" == choice_type:
1301 for auth in f_list:
1302 if not isinstance(auth, create_auth):
1303 name = getattr(auth, "username", None)
1304 else:
1305 name = auth.auth_details.username
1306 names.append([auth, name])
1307 string += str(count)+" = "+name
1308 if count+1 != name_count:
1309 string += seperator
1310 count += 1
1311 if "usernames" == choice_type:
1312 for x in f_list:
1313 if isinstance(x, create_auth) or isinstance(x, dict):
1314 continue
1315 if (last_name != x.username):
1316 name = x.username
1317 last_name = name
1318 string += str(count)+" = "+name
1319 names.append([x.auth_count, name])
1320 if count+1 != name_count:
1321 string += seperator
1322 count += 1
1323 if "apis" == choice_type:
1324 names = f_list
1325 for api in f_list:
1326 if isinstance(api, SimpleNamespace):
1327 name = getattr(api, "username", None)
1328 else:
1329 if isinstance(api, create_auth) or isinstance(api, create_subscription):
1330 continue
1331 name = api.get("api_type")
1332 string += f"{count} = {name}"
1333 if count+1 != name_count:
1334 string += seperator
1335 count += 1
1336 return [names, string]
1337