· 5 years ago · May 09, 2020, 03:12 PM
1# encoding: utf-8
2require 'fileutils'
3require 'json'
4require 'logger'
5require 'net/http'
6require 'nokogiri'
7require 'ostruct'
8require 'time'
9require 'uri'
10require 'zlib'
11
12module Fourcat
13
14class Catalog
15
16 VERSION = '3.7.0'
17
18 BB_TAGS = {
19 '[spoiler]' => '<s>',
20 '[/spoiler]' => '</s>'
21 }
22
23 ENTITIES = { '<' => '<', '>' => '>' }
24
25 @defaults = {
26 :approot => File.expand_path(File.dirname(__FILE__) + '/../'),
27 :loglevel => Logger::WARN,
28 :title => nil,
29 :public_dir => nil,
30 :default_name => 'Anonymous',
31 :precompress => false,
32 :stats => false,
33 :archive => nil,
34 :archive_ssl => false,
35 :debug => false,
36 :text_only => false,
37 :nsfw => false,
38 :tagged => false,
39 :es_host => 'localhost:9200',
40 :local_host => nil,
41 :filename_tag => false,
42 :skipwords => nil,
43 :deep_search => false,
44 :activity_range => nil,
45 :write_html => true,
46 :html_template => 'catalog.html',
47 :write_replies => true,
48 :image_replies => false,
49 :write_rss => false,
50 :write_json => false,
51 :rss_desc => nil,
52 :web_uri => nil,
53 :content_uri => nil,
54 :user_agent => "4cat/#{VERSION}",
55 :spoiler_text => false,
56 :spoiler_size => [100, 100],
57 :thumb404_size => [125, 125],
58 :remove_exif => false,
59 :remove_oekaki => false,
60 :remove_fortune => false,
61 :country_flags => false,
62 :req_delay => 1.0,
63 :req_delay_media => 0.5,
64 :req_timeout => 30,
65 :req_timeout_media => 10,
66 :retries => 3,
67 :refresh_delay => 60,
68 :refresh_range => [30, 120],
69 :refresh_step => 10,
70 :refresh_thres => 5,
71 :use_ssl => false,
72 :proxy_addr => nil,
73 :proxy_port => nil,
74 :workers_limit => 3,
75 }
76
77 def self.defaults=(opts)
78 @defaults = opts
79 end
80
81 def self.defaults
82 @defaults
83 end
84
85 attr_accessor :opts
86
87 begin
88 require_relative 'fourcat-es.rb'
89 include Fourcat::ES
90 rescue LoadError
91 end
92
93 # Constructor
94 #
95 # @param [String, Symbol] board
96 # Remote board slug, ex: :jp or 'jp'
97 #
98 # @param [Hash] opts
99 # Options, will be merged with the defaults
100 #
101 # @option opts [String] approot
102 # Application root
103 #
104 # @option opts [Fixnum] loglevel
105 # Logging severity, defaults to Logger::WARN
106 #
107 # @option opts [true, false] use_json_api
108 # Use the JSON API. Defaults to false (parse the HTML)
109 #
110 # @option opts [String] slug
111 # Local board slug, defaults to the remote slug
112 #
113 # @option opts [String] title
114 # Board's title, defaults to '/:slug/ - Catalog'
115 #
116 # @option opts [String] public_dir
117 # Content directory.
118 # Defaults to 'approot/public/'
119 #
120 # @option opts [String] default_name
121 # Default author for threads
122 #
123 # @option opts [true, false] precompress
124 # Compress output, defaults to false
125 #
126 # @option opts [true, false] stats
127 # Compile statistics, defaults to false
128 #
  # @option opts [String, nil] archive
  #   Fuuka style archive URL. ex: 'http://archive.example.com/jp/thread/'
131 #
132 # @option opts [true, false] debug
133 # Log everything to STDERR
134 #
135 # @option opts [true, false] text_only
136 # Don't fetch thumbnails. Defaults to false
137 #
138 # @option opts [true, false] write_html
139 # Generate HTML, requires 'erubis' gem
140 #
141 # @option opts [String] html_template
142 # Erubis template for HTML generation
143 #
144 # @option opts [true, false] write_rss
145 # Generate RSS Feed, requires 'nokogiri' gem, 'web_uri' needs to be set
146 #
147 # @option opts [true, false] write_json
148 # Generate JSON
149 #
150 # @option opts [String] rss_desc
151 # RSS feed description, defaults to 'Meanwhile on /:slug/'
152 #
153 # @option opts [String] web_uri
154 # Catalog root URL, required for RSS feeds, ex: 'http://catalog.neet.tv/'
155 #
156 # @option opts [String] content_uri
157 # Thumbnails server URL, defaults to nil, ex: 'http://static.neet.tv/'
158 #
159 # @option opts [String] user_agent
160 # User Agent string, defaults to '4cat/{VERSION}'
161 #
162 # @option opts [String, false] spoiler_text
163 # Handle spoiler tags. Defaults to false.
164 #
165 # @option opts [true, false] remove_exif
166 # Remove EXIF meta. Defaults to false.
167 #
168 # @option opts [true, false] remove_oekaki
169 # Remove Oekaki meta. Defaults to false.
170 #
171 # @option opts [true, false] remove_fortune
172 # Remove Fortune meta. Defaults to false.
173 #
174 # @option opts [true, false] country_flags
175 # Display country flags.
176 #
  # @option opts [true, false] filename_tag
  #   Generate filterable tags from filenames.
179 #
180 # @option opts [Numeric] req_delay
181 # Delay in seconds between requests, defaults to 1.0
182 #
183 # @option opts [Numeric] req_delay_media
184 # Delay in seconds between requests for files, defaults to 0.5
185 #
186 # @option opts [Numeric] req_timeout Socket open/read timeouts in seconds,
187 # defaults to 30
188 #
189 # @option opts [Numeric] req_timeout_media Same as req_timeout but for files,
190 # defaults to 10
191 #
192 # @option opts [String] local_host Bind to a specific IP
193 #
194 # @option opts [String] proxy_addr Proxy IP
195 #
196 # @option opts [Numeric] proxy_port Proxy port
197 #
198 # @option opts [Fixnum] workers_limit
199 # Maximum number of HTTP worker threads,
200 # prevents hammering on laggy days. Defaults to 3
201 #
  # @option opts [Integer] retries
  #   Number of retries before dropping the page or thumbnail, defaults to 3
204 #
  # @option opts [true,false] no_partial
  #   Aborts the refresh cycle if a page can't be retrieved and if the refresh
  #   delay is less than the maximum allowed. Defaults to true
208 #
209 # @option opts [Array<Integer>] page_count
210 # Maximum number of pages to fetch for each run.
211 # The default value of [16, 2] means that during a refresh cycle,
212 # the crawler will first fetch 16 pages, then will get the first 2 pages.
213 # This is fine for moderately fast boards, faster boards may need
214 # additional runs in order to avoid missing threads.
215 #
  # @option opts [Integer] page_size
  #   Number of threads per page. Defaults to 15
218 #
219 # @option opts [Numeric] refresh_delay
220 # Base refresh delay in seconds, defaults to 60
221 #
  # @option opts [Array(Numeric, Numeric)] refresh_range
  #   Min and max refresh delays in seconds, defaults to [30, 120]
224 #
225 # @option opts [Numeric] refresh_step
226 # Refresh delay modifier in seconds. For every new thread crawled,
227 # the refresh delay is reduced by refresh_step. If no new threads were
228 # found the refresh delay is increased by refresh_step. Defaults to 10
229 #
  # @option opts [Integer, nil] refresh_thres
  #   Reduces the refresh delay by 'refresh_step' times
  #   (new replies / 'refresh_thres') if the number of new replies
  #   is greater than 'refresh_thres'. Defaults to 5
233 #
234 # @option opts [String, false] use_ssl
235 # Use HTTP/SSL. Defaults to false.
236 #
237 # @example Initialize a catalog
238 # catalog = Catalog.new('jp', {
239 # title: '/jp/ - Neet Pride Worldwide',
240 # refresh_delay: 120,
241 # refresh_range: [ 120, 300 ]
242 # })
243 #
244 def initialize(board, opts = {})
245 # Too lazy to validate every option properly
246 raise "Invalid board #{board}" if (@board = board.to_s).empty?
247
248 @opts = OpenStruct.new(
249 opts ? self.class.defaults.merge(opts) : self.class.defaults
250 )
251
252 @opts.slug ||= @board # Local short name for the board
253
254 @opts.title ||= "/#{@opts.slug}/ - Catalog"
255
256 if @opts.public_dir
257 @opts.public_dir << '/' if @opts.public_dir[-1, 1] != '/'
258 else
259 @opts.public_dir = File.join(@opts.approot, '/public/')
260 end
261
262 if @opts.web_uri
263 @opts.web_uri << '/' if @opts.web_uri[-1, 1] != '/'
264 end
265
266 if @opts.content_uri
267 @opts.content_uri << '/' if @opts.content_uri[-1, 1] != '/'
268 end
269
270 @rss_content_uri = @opts.content_uri || @opts.web_uri
271
272 # Board directory
273 @board_dir = @opts.public_dir + @opts.slug + '/'
274
275 # Thumbnails directory
276 @thumbs_dir = @board_dir + 'src/'
277
278 # Templates directory
279 @templates_dir = File.join(@opts.approot, '/views/')
280
281 # Stats directory
282 @stats_dir = File.join(@opts.approot, '/stats/')
283
284 if @opts.write_replies
285 # Replies directory
286 @replies_dir = @board_dir + 'replies/'
287 @replies_stamps = {}
288 end
289
290 if @opts.write_html
291 require 'erubis'
292 @tpl_file = @templates_dir << @opts.html_template
293 unless File.exists?(@tpl_file)
294 raise "Can't find template #{@tpl_file}"
295 end
296 @tpl_mtime = File.mtime(@tpl_file)
297 @template = load_template(@tpl_file)
298 end
299
300 if @opts.write_rss
301 raise "RSS writer: web_uri can't be empty" if !@opts.web_uri
302 @opts.rss_desc ||= "Meanwhile, on /#{@opts.slug}/"
303 end
304
305 @headers = {
306 'User-Agent' => @opts.user_agent,
307 'Accept-Encoding' => 'gzip'
308 }
309
310 # Local HTML file
311 @board_file = @board_dir + 'index.html'
312
313 # Local RSS file
314 @rss_file = @board_dir + 'feed.rss'
315
316 # Local JSON file
317 @json_file = @board_dir + 'threads.json'
318
319 @last_refresh_time = 0
320
321 # Previous refresh cycle highest thread and reply ids
322 @last_high_thread = 0
323 @last_high_reply = 0
324 @new_threads_count = 0
325
326 # For stats tracking
327 @skip_stats = true
328 @last_hour = false
329
330 # Thumbnail server url for spoiler revealing
331 @thumbs_url = "http://thumbs.4chan.org/#{@opts.slug}/thumb/"
332
333 # Checking for the spoiler file (spoiler-SLUG.png)
334 @spoiler_pic =
335 if File.exist?(@opts.public_dir + "images/spoiler-#{@opts.slug}.png")
336 "spoiler-#{@opts.slug}.png"
337 else
338 'spoiler-default.png'
339 end
340
341 # Checking for the placeholder file (thumb-404-SLUG.png)
342 @thumb_404 =
343 if File.exist?(@opts.public_dir + "images/thumb-404-#{@opts.slug}.png")
344 "thumb-404-#{@opts.slug}.png"
345 else
346 'thumb-404.png'
347 end
348
349 if @opts.use_ssl
350 require 'net/https'
351 end
352
353 if @opts.tagged || @opts.activity_range
354 @elasticsearch = Elasticsearch::Client.new(host: @opts.es_host)
355
356 @tag_status = {}
357 @global_terms = {}
358 @global_terms_timestamp = 0
359 end
360
361 @api_server = 'a.4cdn.org'
362
363 # jsonified threadlist cache for when write_json and write_html are true
364 @json_cache = nil
365
366 # Last successful write time (Time UTC)
367 @mtime = 0
368
369 @thread_cache = {}
370
371 @halt = false
372 end
373
374 # Runs the main loop
375 def run
376 init_logger
377 init_dirs
378
379 @log.unknown "Running 4cat #{VERSION}"
380
381 init_stats if @opts.stats
382
383 loop do
384 delta = Time.now.to_i - @last_refresh_time
385 if delta < @opts.refresh_delay
386 sleep(@opts.refresh_delay - delta)
387 end
388 @last_refresh_time = Time.now.to_i
389 begin
390 refresh
391 raise CatalogHalt if @halt
392 rescue Exception => e
393 if e.kind_of?(CatalogHalt)
394 @log.unknown 'Halting'
395 raise e
396 else
397 @log.error get_error(e)
398 end
399 end
400 end
401
402 cleanup
403 end
404
405 # Runs the crawler once
406 def run_once
407 init_logger
408 init_dirs
409 refresh
410 cleanup
411 end
412
413 # Tells the crawler to stop after the current refresh cycle
414 def halt
415 @halt = true
416 end
417
418 # Adjusts the refresh speed
419 # @param [Fixnum] new_replies_count New replies since the last refresh cycle
420 def adjust_speed(new_replies_count)
421 if new_replies_count > @opts.refresh_thres
422 multiplier = new_replies_count / @opts.refresh_thres
423 @opts.refresh_delay -= @opts.refresh_step * multiplier
424 else
425 @opts.refresh_delay += @opts.refresh_step
426 end
427
428 if @opts.refresh_delay < @opts.refresh_range[0]
429 @opts.refresh_delay = @opts.refresh_range[0]
430 elsif @opts.refresh_delay > @opts.refresh_range[1]
431 @opts.refresh_delay = @opts.refresh_range[1]
432 end
433 end
434
435 # Cleans up stuff
436 def cleanup
437 @log.close unless @opts.debug
438 @stats_io.close if @stats_io
439 end
440
441 # HTTP requests
442 # @param [Net::HTTP] http HTTP object to use for the connection
443 # @param [String] path
444 # @return [String] Response body
445 def fetch(http, path)
446
447
448 if @opts.use_ssl
449 http.use_ssl = true
450 http.verify_mode = OpenSSL::SSL::VERIFY_NONE
451 end
452
453 if @opts.local_host
454 http.local_host = @opts.local_host
455 end
456
457 try = 1
458 begin
459
460 resp = http.request_get(path, @headers)
461
462 if resp.code == "301"
463 resp = Net::HTTP.get_response(URI.parse(resp.header['location']))
464 end
465
466
467 if resp.code != '200'
468 if resp.code == '404'
469 raise HTTPNotFound, "Not Found #{http.address}#{path}"
470 elsif resp.kind_of?(Net::HTTPServerError)
471 raise HTTPServerError, "HTTP #{resp.code}: #{http.address}#{path}"
472 end
473 end
474 rescue Timeout::Error, Errno::ECONNRESET, EOFError, HTTPServerError => e
475 if try > @opts.retries
476 raise "Skipping after #{e.message}: #{http.address}#{path}"
477 end
478 @log.debug "Retrying after #{e.message} (#{try}): #{http.address}#{path}"
479 try += 1
480 sleep(@opts.req_delay)
481 retry
482 end
483 resp
484 end
485
486 def get_catalog
487 http = Net::HTTP.new(@api_server, nil, @opts.proxy_addr, @opts.proxy_port)
488 http.open_timeout = http.read_timeout = @opts.req_timeout
489 resp = fetch(http, "https://a.4cdn.org/#{@opts.slug}/catalog.json")
490
491 data =
492 if resp['content-encoding'] == 'gzip'
493 Zlib::GzipReader.new(StringIO.new(resp.body)).read
494 else
495 resp.body
496 end
497
498 data.force_encoding(Encoding::UTF_8)
499
500 data
501 end
502
503 # Generates an error message from an exception
504 # @param [Exception] e Exception
505 def get_error(e)
506 "#{e.message} (#{e.backtrace.first})"
507 end
508
509 # Fetches a thumbnail
510 # @param [String] url
511 # @see #fetch
512 def get_image(url)
513 uri = URI.parse(url)
514 http = Net::HTTP.new(uri.host, nil, @opts.proxy_addr, @opts.proxy_port)
515 http.open_timeout = http.read_timeout = @opts.req_timeout_media
516 fetch(http, uri.path).body
517 end
518
519 # Creates board specific directories
520 def init_dirs
521 if !File.directory?(@thumbs_dir)
522 @log.debug 'Creating thumbs dir'
523 FileUtils.mkdir_p(@thumbs_dir)
524 end
525 if @opts.write_replies && !File.directory?(@replies_dir)
526 @log.debug 'Creating replies dir'
527 FileUtils.mkdir_p(@replies_dir)
528 end
529 end
530
531 # Sets up the logger object
532 def init_logger
533 if @opts.debug == true
534 @log = Logger.new(STDERR)
535 @log.level = Logger::DEBUG
536 else
537 log_dir = File.join(@opts.approot, 'logs')
538 FileUtils.mkdir(log_dir) unless File.directory?(log_dir)
539 log_file = File.join(log_dir, "fourcat.#{@opts.slug}.log")
540 @log = Logger.new(log_file, 2, 262144)
541 @log.level = @opts.loglevel
542 end
543 end
544
545 # Creates or reopens the stats file
546 def init_stats
547 FileUtils.mkdir(@stats_dir) unless File.directory?(@stats_dir)
548
549 time_now = Time.now.utc
550 this_hour = time_now.hour
551 last_hour_limit = time_now.to_i - 3599
552
553 filename = "#{@stats_dir}#{@opts.slug}-current"
554
555 mtime = 0
556
557 if File.exist?(filename)
558 entry = File.open(filename, 'r:UTF-8') do |f|
559 f.read.split("\n")[-1].to_s.split(':')
560 end
561 if entry.length > 0
562 mtime = entry[0].to_i
563 @last_high_reply = @last_high_thread = entry[1].to_i
564 @skip_stats = false
565 end
566 end
567
568 if mtime > last_hour_limit && Time.at(mtime).utc.hour == this_hour
569 @log.debug 'init_stats: reopening file'
570 @last_hour = this_hour
571 @stats_io = File.open(filename, 'a+:UTF-8')
572 else
573 @log.debug 'init_stats: creating new file'
574 @stats_io = File.open(filename, 'w+:UTF-8')
575 end
576 end
577
578 # Returns the thread list as JSON
579 # @see #refresh
580 def jsonify_threads(threadlist, order)
581 return @json_cache if @json_cache
582
583 page_size = false
584
585 threads = {}
586
587 threadlist.each do |id, thread|
588 page_size = thread[:page_size] unless page_size
589
590 th = {
591 :date => thread[:date].to_i
592 }
593
594 th[:teaser] = thread[:teaser] || ''
595 th[:author] = thread[:author] if thread[:author]
596 th[:sticky] = thread[:sticky] if thread[:sticky]
597 th[:w] = thread[:w]
598 th[:h] = thread[:h]
599
600 th[:tags] = thread[:tags] if thread[:tags]
601 th[:file] = thread[:file] if thread[:file]
602
603 if thread[:s]
604 th[:s] = thread[:s]
605 if thread[:splr]
606 th[:splr] = true
607 th[:sw] = thread[:sw]
608 th[:sh] = thread[:sh]
609 end
610 end
611
612 if thread[:r] != 0
613 th[:r] = thread[:r]
614 th[:i] = thread[:i] if thread[:i]
615 end
616
617 if @opts.country_flags
618 th[:loc] = thread[:loc]
619 th[:locname] = thread[:locname]
620 end
621
622 th[:lr] = thread[:lr] if thread[:lr]
623
624 th[:act] = thread[:act] if thread[:act]
625
626 threads[id] = th
627 end
628
629 json = {
630 :threads => threads,
631 :order => order,
632 :count => threads.size,
633 :slug => @opts.slug,
634 :delay => @opts.refresh_delay,
635 :anon => @opts.default_name,
636 :mtime => @mtime.to_i,
637 :pagesize => page_size,
638 :deep_search => @opts.deep_search
639 }
640
641 if @opts.activity_range
642 json[:activity_range] = @opts.activity_range
643 end
644
645 if @opts.archive
646 json[:proxy] = @opts.archive
647 json[:proxy_ssl] = @opts.archive_ssl
648 end
649
650 json[:flags] = true if @opts.country_flags
651
652 json[:nsfw] = true if @opts.nsfw
653
654 if @opts.write_json && @opts.write_html
655 @json_cache = json.to_json
656 @json_cache.gsub!(/[\u2028\u2029]/, '')
657 @json_cache
658 else
659 json.to_json.gsub(/[\u2028\u2029]/, '')
660 end
661 end
662
663 # Generates links to threads
664 # @param [Integer] id Thread id
665 # @return [String] the thread's URL
666 def link_to_thread(id)
667 "https://boards.4chan.org/#{@board}/res/#{id}#{@opts.extension}"
668 end
669
670 # Creates a new erubis template from a file
671 # @param [String] filename Path to the html template
672 # @return [Erubis::FastEruby]
673 def load_template(filename)
674 Erubis::FastEruby.new(
675 File.open(filename, 'r:UTF-8') { |f| f.read },
676 :bufvar => '@_out_buf'
677 )
678 end
679
680 # Removes dead thumbnails and updates the deletion queue
  # @param [Array<String>] local an Array of absolute paths (files on disk)
  # @param [Array<String>] remote an Array of absolute paths (files still wanted)
683 def purge_thumbnails(local, remote)
684 dead = local - remote
685 if dead.length > 0
686 begin
687 FileUtils.rm(dead, force: true)
688 #@log.debug "Purged #{dead.length} dead thumbnails."
689 rescue StandardError => e
690 @log.error 'purge_thumbnails: ' << get_error(e)
691 end
692 end
693 end
694
695 # Updates the catalog
696 def refresh
697 @log.debug "Refreshing /#{@opts.slug}/"
698
699 @this_high_reply = @last_high_reply
700 @this_high_thread = @last_high_thread
701 @new_threads_count = 0
702
703 catalog = get_catalog
704 threads = parse_catalog(catalog)
705
706 @thread_cache = threads
707
708 # Bailing out on empty threadlist
709 if threads.empty?
710 adjust_speed(0)
711 update_stats(0, 0) if @opts.stats
712 return @log.error 'Breaking on empty threadlist'
713 end
714
715 if @opts.tagged
716 tag_threads(threads)
717 end
718
719 # Sorting orders
720 order = {}
721
722 # Bump date (natural)
723 order[:alt] = threads.keys
724
725 # Creation date
726 order[:date] = order[:alt].sort do |x, y|
727 y <=> x
728 end
729
730 # Reply count
731 order[:r] = order[:alt].sort do |x, y|
732 threads[y][:r] <=> threads[x][:r]
733 end
734
735 # Last reply date
736 order[:lr] = order[:alt].sort do |x, y|
737 threads[y][:lrdate] <=> threads[x][:lrdate]
738 end
739
740 # Activity
741 if @opts.activity_range
742 order[:act] = get_activity_order(threads, order[:alt])
743 end
744
745 # Fetching thumbnails
746 unless @opts.text_only
747 thumbnails = {}
748
749 threads.each do |id, thread|
750 if thread[:src]
751 thumbnails["#{@thumbs_dir}#{id}.jpg".freeze] = [ thread[:src], id ]
752 end
753
754 if @opts.image_replies && thread[:imgs]
755 i = 0
756 thread[:imgs].each do |img|
757 key = "#{@thumbs_dir}#{img[1]}.jpg".freeze
758 # url, post id, thread id, reply index
759 thumbnails[key] = [ img[0], img[1], id, img[2] ]
760 i += 1
761 end
762 end
763 end
764
765 remote_thumbs = thumbnails.keys
766 local_thumbs = Dir.glob("#{@thumbs_dir}*.jpg")
767 new_thumbs = remote_thumbs - local_thumbs
768
769 workers = {}
770 active_workers = 0
771
772 #@log.debug "Fetching thumbnails"
773
774 new_thumbs.each do |file|
775 while active_workers >= @opts.workers_limit
776 sleep(@opts.req_delay)
777 end
778
779 th = thumbnails[file]
780 # image url
781 src = th[0]
782 # post id
783 id = th[1]
784 # thread id (replies only)
785 tid = th[2]
786 # reply index
787 rindex = th[3]
788
789 #@log.debug "Thumbnail (#{id}) #{src}"
790 workers[id] = Thread.new(src, id, tid, rindex) do |src, id, tid, rindex|
791 begin
792 active_workers += 1
793
794 data = get_image(src)
795 write_image(id, data)
796 rescue StandardError => e
797 if e.kind_of?(HTTPNotFound)
798 @log.debug e.message
799 else
800 @log.error get_error(e)
801 end
802 if tid
803 threads[tid][:replies][rindex].delete(:img)
804 else
805 threads[id][:s] = @thumb_404
806 threads[id][:w], threads[id][:h] = @opts.thumb404_size
807 threads[id].delete(:src)
808 end
809 ensure
810 active_workers -= 1
811 end
812 end
813 sleep @opts.req_delay_media
814 end
815
816 while active_workers > 0
817 sleep 0.5
818 end
819
820 purge_thumbnails(local_thumbs, remote_thumbs)
821 end
822
823 if @this_high_reply > @last_high_reply
824 new_replies_count = @this_high_reply - @last_high_reply
825 @last_high_reply = @this_high_reply
826 else
827 new_replies_count = 0
828 end
829
830 if @this_high_thread > @last_high_thread
831 @last_high_thread = @this_high_thread
832 end
833
834 if !@skip_stats
835 adjust_speed(new_replies_count)
836 update_stats(@new_threads_count, new_replies_count) if @opts.stats
837 #@log.debug "Stats: #{@new_threads_count}/#{new_replies_count}"
838 else
839 @log.debug 'First run, skipping stats'
840 @skip_stats = false
841 end
842
843 @mtime = Time.now.utc
844
845 begin
846 write_json(threads, order)
847 rescue StandardError => e
848 @log.error 'write_json: ' << get_error(e)
849 end if @opts.write_json
850
851 begin
852 write_html(threads, order)
853 rescue StandardError => e
854 @log.error 'write_html: ' << get_error(e)
855 end if @opts.write_html
856
857 begin
858 write_replies(threads)
859 rescue StandardError => e
860 @log.error 'write_replies: ' << get_error(e)
861 end if @opts.write_replies
862
863 begin
864 write_rss(threads, order[:date])
865 rescue StandardError => e
866 @log.error 'write_rss: ' << get_error(e)
867 end if @opts.write_rss
868
869 @json_cache = nil if @json_cache
870 end
871
872 def format_author(post)
873 author = post[:name] ? post[:name].to_s : ''
874 author << " #{post[:trip]}" if post[:trip]
875 author << " ## #{post[:capcode].capitalize}" if post[:capcode]
876 if !author.empty? && author != @opts.default_name
877 author.gsub!(/'/, "'")
878 author
879 else
880 nil
881 end
882 rescue
883 @log.warn 'format_author: ' << get_error($!)
884 nil
885 end
886
887 def format_body(post, is_reply = false)
888 return nil if !post[:com]
889
890 body = post[:com].to_s
891
892 # Remove EXIF meta
893 if @opts.remove_exif
894 body.gsub!(/(<br>)+<span class="abbr">.+$/, '')
895 end
896
897 # Remove Oekaki meta
898 if @opts.remove_oekaki
899 body.gsub!(/<br><br><small><b>Oekaki Post<\/b>.+?<\/small>/, '')
900 end
901
902 # Remove fortune
903 if @opts.remove_fortune
904 body.gsub!(/<span class="fortune".+?<\/span>/, '')
905 end
906
907 has_spoilers = body.include?('<s>')
908
909 if has_spoilers
910 body.gsub!(/\[\/spoiler\]/, '')
911
912 frag = Nokogiri::HTML.fragment(body, 'utf-8')
913
914 nodes = frag.xpath('./s')
915 nodes.each do |node|
916 node.replace("[spoiler]#{node.inner_html}[/spoiler]")
917 end
918
919 body = frag.to_s
920 end
921
922 if !is_reply
923 body.gsub!(/<br>/i, "\n")
924 else
925 body.gsub!(/(?:<br>)+/i, "\n")
926 end
927 body.gsub!(/<[^>]+>/i, '')
928 body.gsub!(/[<>]/, ENTITIES)
929
930 if has_spoilers
931 body.gsub!(/\[\/?spoiler\]/, BB_TAGS)
932 end
933
934 body
935 rescue
936 @log.warn 'format_body: ' << get_error($!)
937 nil
938 end
939
940 def format_teaser(title, body)
941 teaser =
942 if title
943 if body
944 "#{title}:\n#{body}"
945 else
946 title
947 end
948 else
949 body
950 end
951
952 teaser.gsub!(/'/, "'") if teaser
953
954 teaser
955 end
956
957 def format_op(post)
958 # Pulling from cache
959 if th = @thread_cache[post[:no]]
960 th[:act] = nil if th[:act]
961 if post[:filedeleted]
962 th[:s] = @thumb_404
963 th[:w], th[:h] = @opts.thumb404_size
964 end
965 else
966 th = {}
967
968 th[:id] = post[:no]
969 th[:date] = post[:time].to_i
970
971 th[:author] = format_author(post)
972
973 th[:title] = post[:sub]
974 th[:body] = format_body(post)
975 th[:teaser] = format_teaser(th[:title], th[:body])
976
977 # Thumbnail
978 if !post[:tim] || post[:filedeleted] || @opts.text_only
979 th[:s] = @thumb_404
980 th[:w], th[:h] = @opts.thumb404_size
981 else
982 if post[:spoiler]
983 th[:s] = @spoiler_pic
984 th[:splr] = true
985 th[:w], th[:h] = @opts.spoiler_size
986 th[:sw] = post[:tn_w]
987 th[:sh] = post[:tn_h]
988 else
989 th[:w] = post[:tn_w]
990 th[:h] = post[:tn_h]
991 end
992 th[:src] = "#{@thumbs_url}#{post[:tim]}s.jpg"
993 end
994
995 # Flags
996 if (@opts.country_flags && post[:country])
997 th[:loc] = post[:country].downcase
998 th[:locname] = post[:country_name]
999 end
1000
1001 # Filename tag
1002 if @opts.filename_tag && post[:filename] && !(post[:filename] =~ /^[0-9]+$/)
1003 file_tag = post[:filename].gsub(/\[[^\]]+\]/, '')
1004 file_tag.gsub!(/'/, "'")
1005 file_tag.gsub!(/\([^\)]+\)/, '')
1006 file_tag.gsub!(/\.(?:mkv|mp4|avi).*$/, '')
1007 file_tag.gsub!(/[._\s]+/, ' ')
1008
1009 if !file_tag.empty? && !(file_tag =~ /[a-f0-9]{32,}/)
1010 th[:file] = file_tag
1011 .split(/([-',a-zA-Z]+)/)
1012 .reject { |m| m.strip!; m.empty? }
1013 .join(' ')
1014 end
1015 end
1016 end
1017
1018 th[:r] = post[:replies]
1019 th[:i] = post[:images]
1020
1021 if post[:sticky]
1022 th[:sticky] = true
1023 elsif th[:sticky]
1024 th.delete(:sticky)
1025 end
1026
1027 th
1028 end
1029
1030 # Parses the decoded JSON API response
1031 # @param [Hash] json decoded JSON API response
1032 # @return [Hash{thread_id(Integer) => Hash}] a Hash of threads
1033 def parse_catalog(json)
1034
1035 catalog = JSON.parse(json, symbolize_names: true)
1036
1037 threadlist = {}
1038
1039 catalog.each do |page|
1040 threads = page[:threads]
1041
1042 page_size = threads.size
1043
1044 threads.each do |t|
1045 th = format_op(t)
1046
1047 tid = th[:id]
1048
1049 if tid > @last_high_thread
1050 @new_threads_count += 1
1051 @this_high_thread = tid if tid > @this_high_thread
1052 end
1053
1054 th[:page_size] = page_size
1055
1056 # Last reply
1057 if th[:r] > 0
1058 th[:lrdate] = t[:last_replies].last[:time].to_i
1059
1060 if @opts.write_replies
1061 repstamp = 0
1062 last_replies = []
1063 last_replies_map = {}
1064 img_replies = []
1065
1066 cached_replies = th[:replies_map] || {}
1067
1068 author = nil
1069 i = 0
1070 while r = t[:last_replies][i]
1071 repstamp += r[:no]
1072
1073 @this_high_reply = r[:no] if r[:no] > @this_high_reply
1074
1075 unless rep = cached_replies[r[:no]]
1076 rep = {}
1077 rep[:date] = r[:time].to_i
1078 author = format_author(r)
1079 rep[:author] = author if author
1080 teaser = format_teaser(r[:sub], format_body(r, true))
1081 rep[:teaser] = teaser if teaser
1082 end
1083
1084 if r[:tim] && !r[:filedeleted]
1085 if @opts.image_replies && !@opts.text_only
1086 if r[:spoiler]
1087 rep[:s] = @spoiler_pic
1088 rep[:splr] = true
1089 rep[:w], rep[:h] = @opts.spoiler_size
1090 rep[:sw] = r[:tn_w]
1091 rep[:sh] = r[:tn_h]
1092 else
1093 rep[:w] = r[:tn_w]
1094 rep[:h] = r[:tn_h]
1095 end
1096
1097 rep[:img] = r[:no]
1098 img_replies << [ "#{@thumbs_url}#{r[:tim]}s.jpg", r[:no], i ]
1099 end
1100 end
1101
1102 last_replies << rep
1103 last_replies_map[r[:no]] = rep
1104
1105 i += 1
1106 end
1107
1108 th[:repstamp] = repstamp
1109 th[:replies] = last_replies
1110 th[:replies_map] = last_replies_map
1111 th[:imgs] = img_replies
1112
1113 th[:lr] = { date: rep[:date] }
1114 th[:lr][:author] = author if author
1115 else
1116 unless rep = th[:lr]
1117 rep = {}
1118 rep[:date] = r[:time].to_i
1119 author = format_author(r)
1120 rep[:author] = author if author
1121 end
1122
1123 th[:lr] = rep
1124 end
1125 else
1126 th[:lrdate] = t[:time].to_i
1127 end
1128
1129 threadlist[tid] = th
1130 end
1131 end
1132
1133 threadlist
1134 end
1135
1136 # Updates stats
1137 # @param [Fixnum] new_threads New threads since the last refresh cycle
1138 # @param [Fixnum] new_replies New replies since the last refresh cycle
1139 def update_stats(new_threads, new_replies)
1140 now = Time.now.utc
1141 if @last_hour
1142 if @last_hour != now.hour
1143 begin
1144 file = @stats_dir + @opts.slug + '-daily'
1145 if File.exists?(file)
1146 stats = JSON.parse(File.open(file, 'r:UTF-8') { |f| f.read })
1147 else
1148 stats = Hash.new { |h, k| h[k] = Array.new(24, 0) }
1149 end
1150 stats['threads'][@last_hour] = 0
1151 stats['replies'][@last_hour] = 0
1152 @stats_io.rewind
1153 lines = @stats_io.read.split("\n")
1154 lines.map do |line|
1155 vals = line.split(':')
1156 stats['threads'][@last_hour] += vals[2].to_i
1157 stats['replies'][@last_hour] += vals[3].to_i
1158 end
1159 File.open(file, 'w:UTF-8') { |f| f.write(stats.to_json) }
1160 rescue StandardError => e
1161 @log.error 'update_stats: daily: ' << get_error(e)
1162 ensure
1163 @stats_io.reopen(@stats_io.path, 'w+')
1164 end
1165 end
1166 begin
1167 line = "#{now.to_i}:#{@last_high_reply}:#{new_threads}:#{new_replies}\n"
1168 @stats_io.write(line)
1169 @stats_io.flush
1170 rescue StandardError => e
1171 @log.error 'update_stats: current: ' << get_error(e)
1172 end
1173 else
1174 @log.debug 'update_stats: skipping first run'
1175 end
1176 @last_hour = now.hour
1177 end
1178
1179 # File writer
1180 # @param [String] data
1181 # @param [String] path
1182 # @param [true, false] gzip
1183 def write_content(data, path, gzip = false)
1184 tmp = path + '.tmp'
1185 if gzip
1186 Zlib::GzipWriter.open(tmp) { |f| f.write(data) }
1187 else
1188 File.open(tmp, 'w:UTF-8') { |f| f.write(data) }
1189 end
1190 File.rename(tmp, path)
1191 end
1192
1193 # Renders the HTML page
1194 # @see #refresh
1195 def write_html(threads, order)
1196 # Template changed?
1197 if (mtime = File.mtime(@tpl_file)) > @tpl_mtime
1198 @log.unknown 'Reloading template'
1199 @tpl_mtime = mtime
1200 @template = load_template(@tpl_file)
1201 end
1202
1203 html = @template.result(binding())
1204
1205 write_content(html, @board_file)
1206 write_content(html, @board_file + '.gz', true) if @opts.precompress
1207 end
1208
1209 # Writes thumbnail files
1210 # @param [Integer] id Image id
1211 def write_image(id, data)
1212 File.open("#{@thumbs_dir}#{id}.jpg", 'wb') { |f| f.write(data) }
1213 end
1214
1215 # Outputs the thread list as JSON
1216 # @see #refresh
1217 def write_json(threads, order)
1218 data = jsonify_threads(threads, order)
1219 write_content(data, @json_file)
1220 write_content(data, @json_file + '.gz', true) if @opts.precompress
1221 end
1222
  # Outputs the last replies of each thread as JSON
1224 # @see #refresh
1225 def write_replies(threads)
1226 stamps = {}
1227 current = []
1228 old = Dir.glob("#{@replies_dir}*.json")
1229
1230 threads.each do |id, thread|
1231 next unless thread[:replies]
1232 file = "#{@replies_dir}#{id}.json"
1233 current << file
1234 if @replies_stamps[id] != thread[:repstamp]
1235 stamps[id] = thread[:repstamp]
1236 data = thread[:replies].to_json
1237 write_content(data, file)
1238 end
1239 end
1240
1241 @replies_stamps = stamps
1242
1243 purgelist = old - current
1244
1245 return if purgelist.empty?
1246
1247 begin
1248 FileUtils.rm(purgelist, force: true)
1249 #@log.debug "Purged #{purgelist.length} dead reply file(s)."
1250 rescue StandardError => e
1251 @log.error 'write_replies: ' << get_error(e)
1252 end
1253 end
1254
1255 # Generates the RSS feed
1256 # @see #refresh
1257 def write_rss(threads, order)
1258 now = Time.now.gmtime.rfc2822
1259 builder = Nokogiri::XML::Builder.new do |xml|
1260 xml.rss('version' => '2.0') {
1261 xml.channel {
1262 xml.title @opts.title
1263 xml.description @opts.rss_desc
1264 xml.link @opts.web_uri + @opts.slug + '/'
1265 xml.lastBuildDate now
1266 order.each do |id|
1267 th = threads[id]
1268 xml.item {
1269 title = "No.#{id}"
1270 title << ": " << th[:title] if th[:title]
1271 xml.title title
1272 src = @rss_content_uri +
1273 if th[:s]
1274 "images/#{th[:s]}"
1275 else
1276 @opts.slug + "/src/#{id}.jpg"
1277 end
1278 xml.description(
1279 '<img src="' << src << '" alt="' << "#{id}" << '" />' <<
1280 '<p>' <<
1281 (th[:body] ? th[:body].gsub("\n", '<br>') : '') <<
1282 '</p>'
1283 )
1284 xml.link link_to_thread(th[:id])
1285 xml.guid "#{th[:id]}"
1286 xml.pubDate Time.at(th[:date]).rfc2822.to_s
1287 }
1288 end
1289 }
1290 }
1291 end
1292
1293 output = builder.to_xml(:indent => 0, :encoding => 'UTF-8')
1294
1295 write_content(output, @rss_file)
1296 write_content(output, @rss_file + '.gz', true) if @opts.precompress
1297 end
1298
1299end
1300
# Base class of the HTTP errors raised by the crawler
class HTTPError < StandardError
end

# Raised on HTTP 404
class HTTPNotFound < HTTPError
end

# Raised on HTTP 5xx
class HTTPServerError < HTTPError
end

# Raised when asked to halt
class CatalogHalt < StandardError
end
1312
1313end