· 5 years ago · Sep 28, 2020, 12:28 AM
# -*- coding: utf-8 -*-

# Copyright 2014-2020 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.

"""Extract images from https://gelbooru.com/"""
10
11from . import booru
12from .common import Message
13from .. import text
14
15
class GelbooruExtractor(booru.XmlParserMixin,
                        booru.GelbooruPageMixin,
                        booru.BooruExtractor):
    """Base class for gelbooru extractors

    With the "api" option enabled (the default), the request parameters
    are set up for the "dapi" endpoint. With it disabled, items are
    produced by items_noapi(), which reads post IDs from the HTML listing
    pages and fetches every post page individually.
    """
    category = "gelbooru"
    api_url = "https://gelbooru.com/index.php"
    post_url = "https://gelbooru.com/index.php?page=post&s=view&id={}"
    pool_url = "https://gelbooru.com/index.php?page=pool&s=show&id={}"
    # [begin, end] markers enclosing a post ID on a listing page
    id_pattern = ['<span id="s', '"']

    def __init__(self, match, config_override=None):
        """Initialize the extractor

        match -- forwarded unchanged to the parent constructor
        config_override -- optional mapping of option names to values
            that take precedence over self.config(). It defaults to None
            so that the extractor (and every subclass inheriting this
            constructor) can still be created from 'match' alone.
        """
        super().__init__(match)

        self.use_api = self.get_config(config_override, "api", True)

        if self.use_api:
            self.params.update({"page": "dapi", "s": "post", "q": "index"})
        else:
            # route item generation through the non-API implementation
            self.items = self.items_noapi
            # NOTE(review): presumably makes the site show posts that are
            # hidden by default -- confirm
            self.session.cookies["fringeBenefits"] = "yup"
            # NOTE(review): presumably the number of posts on one HTML
            # listing page -- confirm; get_posts() relies on it to detect
            # the last page
            self.per_page = 42

    def get_config(self, config_override, key, default):
        """Return the value of option 'key'

        A value stored in 'config_override' wins, even if it is falsy.
        Without an override for 'key' (or without any overrides at all)
        the result of self.config(key, default) is returned.
        """
        if config_override and key in config_override:
            return config_override[key]
        return self.config(key, default)

    def items_noapi(self):
        """Yield extractor messages for all posts without using the API"""
        yield Message.Version, 1
        data = self.get_metadata()

        for post_id in self.get_posts():
            post = self.get_post_data(post_id)
            # remember the URL before merging, as the extractor-wide
            # metadata is merged on top of the post's own values
            url = post["file_url"]
            post.update(data)
            text.nameext_from_url(url, post)
            yield Message.Directory, post
            yield Message.Url, url, post

    def get_posts(self):
        """Yield the IDs of all posts found on the HTML listing pages

        Pages are requested one after another, starting at offset
        'page_start * per_page', until a page contains fewer than
        'per_page' valid IDs.
        """
        url = self.api_url + "?page=post&s=list"
        params = {
            "tags": self.params["tags"],
            "pid" : self.page_start * self.per_page,
        }
        begin = self.id_pattern[0]
        end = self.id_pattern[1]

        while True:
            page = self.request(url, params=params).text
            # Only keep valid ids (positive integers), in case the
            # pattern is too wide
            ids = [
                post_id
                for post_id in text.extract_iter(page, begin, end)
                if self.is_id_valid(post_id)
            ]
            yield from ids
            # NOTE(review): the page-size check uses the number of IDs
            # left after filtering; a full page with rejected matches
            # would therefore end pagination early
            if len(ids) < self.per_page:
                return
            params["pid"] += self.per_page

    def is_id_valid(self, x):
        """Return True if 'x' is a non-empty string consisting of digits"""
        return bool(x) and x.isdigit()

    def get_post_data(self, post_id):
        """Extract metadata of a single post

        Requests the post's HTML page and returns a dict with the keys
        "tags", "id", "created_at", "width", "height", "source",
        "rating", "score", "file_url", "change" and "md5".
        """
        page = self.request(self.post_url.format(post_id)).text
        # rules are (key, begin, end); results of rules whose key is
        # None are not used below
        data = text.extract_all(page, (
            (None        , '<meta name="keywords"', ''),
            ("tags"      , ' imageboard- ', '"'),
            ("id"        , '<li>Id: ', '<'),
            ("created_at", '<li>Posted: ', '<'),
            ("width"     , '<li>Size: ', 'x'),
            ("height"    , '', '<'),
            ("source"    , '<li>Source: <a href="', '"'),
            ("rating"    , '<li>Rating: ', '<'),
            (None        , '<li>Score: ', ''),
            ("score"     , '>', '<'),
            ("file_url"  , '<li><a href="http', '"'),
            ("change"    , ' id="lupdated" value="', '"'),
        ))[0]
        # the "file_url" rule consumes the leading "http", so put it back;
        # the first "m//" is also collapsed to "m/"
        # NOTE(review): presumably a doubled slash after ".com" -- confirm
        data["file_url"] = "http" + data["file_url"].replace("m//", "m/", 1)
        # md5: last path component of the URL up to its first "."
        data["md5"] = data["file_url"].rpartition("/")[2].partition(".")[0]
        # rating: lowercased first character, "?" if none was extracted
        data["rating"] = (data["rating"] or "?")[0].lower()
        # tags: ", "-separated on the page; stored space-separated with
        # underscores in place of spaces inside a tag
        data["tags"] = " ".join(
            [tag.replace(" ", "_") for tag in data["tags"].split(", ")])
        if self.extags:
            self.extended_tags(data, page)
        return data
100
101
class GelbooruTagExtractor(booru.TagMixin, GelbooruExtractor):
    """Extractor for images from gelbooru.com based on search-tags"""
    # matches listing URLs and captures the search tags as group 'tags'
    pattern = (
        r"(?:https?://)?(?:www\.)?gelbooru\.com/(?:index\.php)?"
        r"\?page=post&s=list&tags=(?P<tags>[^&#]+)"
    )
    # same URL twice: once with default options, once with "api" disabled
    test = (
        (
            "https://gelbooru.com/index.php?page=post&s=list&tags=bonocho",
            {"count": 5},
        ),
        (
            "https://gelbooru.com/index.php?page=post&s=list&tags=bonocho",
            {"options": (("api", False),), "count": 5},
        ),
    )
115
116
class GelbooruPoolExtractor(booru.PoolMixin, GelbooruExtractor):
    """Extractor for image-pools from gelbooru.com"""
    # matches pool URLs and captures the numeric pool ID as group 'pool'
    pattern = (
        r"(?:https?://)?(?:www\.)?gelbooru\.com/(?:index\.php)?"
        r"\?page=pool&s=show&id=(?P<pool>\d+)"
    )
    # same URL twice: once with default options, once with "api" disabled
    test = (
        (
            "https://gelbooru.com/index.php?page=pool&s=show&id=761",
            {"count": 6},
        ),
        (
            "https://gelbooru.com/index.php?page=pool&s=show&id=761",
            {"options": (("api", False),), "count": 6},
        ),
    )
130
131
class GelbooruPostExtractor(booru.PostMixin, GelbooruExtractor):
    """Extractor for single images from gelbooru.com"""
    # matches post URLs and captures the numeric post ID as group 'post'
    pattern = (
        r"(?:https?://)?(?:www\.)?gelbooru\.com/(?:index\.php)?"
        r"\?page=post&s=view&id=(?P<post>\d+)"
    )
    test = (
        "https://gelbooru.com/index.php?page=post&s=view&id=313638",
        {
            "content": "5e255713cbf0a8e0801dc423563c34d896bb9229",
            "count": 1,
        },
    )

    def get_posts(self):
        """Return a one-element tuple holding this extractor's post"""
        single_post = self.post
        return (single_post,)
143