#!/usr/bin/perl -wl use strict; use HTML::TreeBuilder; #use LWP::Simple; use LWP::UserAgent; use File::Slurp; use File::Basename; use Encode; use Cwd; use Fcntl qw(F_SETFL O_NONBLOCK :DEFAULT); use List::Util qw(min); use List::MoreUtils qw(uniq); use DB_File; use constant SLEEP_SECONDS => 30; use constant VERBOSE => 1; use constant USE_WGET => 0; use constant CONF_DIR => "$ENV{HOME}/.4chan-save"; -d CONF_DIR or mkdir CONF_DIR or die "mkdir: $!"; my %URLS; sub remove_url { delete $URLS{$_} for @_; } sub add_url { $URLS{$_} = 0 for @_; } sub schedule_url($$) { my ($url, $time) = @_; exists $URLS{$url} and $URLS{$url} = $time; } $SIG{INT} = sub { exit }; END { warn "exiting\n"; untie %URLS; } tie(%URLS, 'DB_File', CONF_DIR . '/argv.db', O_RDWR|O_CREAT, 0600, $DB_HASH) or die $!; add_url(@ARGV); #my @OLD_URLS; #tie(@OLD_URLS, 'DB_File', CONF_DIR . '/argv.old.db', O_RDWR|O_CREAT, 0600, $DB_RECNO) or die $!; #add_url(@OLD_URLS); fcntl(STDIN, F_SETFL, O_NONBLOCK) or die $!; use IO::Select; my $select = IO::Select->new(); $select->add(\*STDIN); sub maybe_read_line($&) { my ($timeleft, $callback) = @_; my $done = time + $timeleft; do { if ($select->can_read($timeleft)) { my $line; if (sysread STDIN, $line, 1024*8) { if ($line =~ m/\n/) { chomp $line; $callback->($line); } else { die qq{maybe_read_line: unexpected behavior (raw tty mode?): \$line="$line"}; } } } } while (($timeleft = $done - time) > 0); } sub sleep_read { my $seconds = shift; maybe_read_line($seconds, sub { add_url(grep {m{^https?://.}} split ' ', shift); }); } sub wait_url($) { my $url = shift or return; my $when = $URLS{$url} or return; my $now = time; return unless $when > $now; my $seconds = min(SLEEP_SECONDS, $when - $now); warn "waiting $seconds seconds to fetch $url\n" if VERBOSE; sleep_read($seconds); } my $ua = LWP::UserAgent->new; $ua->agent('Mozilla/5.0'); # WTF, 4chan?? sub getstore($$) { my ($url, $outfile) = @_; my $response = $ua->mirror(@_); unless ($response->is_success or $response->code == 404) { warn sprintf "error fetching '%s': %s\n", $url, $response->status_line; } elsif (VERBOSE) { printf "get(%s) => %d\n", $url, $response->code; } return $response->code; } my $olddir = getcwd or die $!; my $dest; while (keys %URLS) { for my $url (sort {$URLS{$a} <=> $URLS{$b}} keys %URLS) { $url =~ s/#.*//; $dest = ($url =~ m{/([^/]+)/res/(\d+)/*$}) ? "$1-$2" : int rand(2147483647); # 4chan thread urls end in numbers -d $dest or mkdir $dest or warn $!; chdir $dest or warn($!), next; wait_url($url); if (USE_WGET) { print "wget $url" if VERBOSE; if (system(qw(wget -c -q -N -nd --page-requisites), "$url") != 0) { remove_url($url) if $? >> 8 == 8; warn(($? >> 8 == 8) ? "404\n" : ($? == -1) ? "failed to exec wget: $!" : ($? & 127) ? sprintf("wget died with signal %d", $? & 127) : sprintf("wget returned %d ($?)", $? >> 8)); next; } } else { my $fn = basename($url); my $tmpname = "$fn~"; my $res = getstore($url, $tmpname); unless ($res == 200) { remove_url($url), warn "404\n" if $res == 404; unlink $tmpname; next; } rename $tmpname, $fn; } my $html = Encode::decode_utf8(read_file basename $url) or next; # print "parse $url" if VERBOSE; my $parser = HTML::TreeBuilder->new; $parser->ignore_unknown(0); my $tree = $parser->parse($html); my %links; for ($tree->look_down(_tag => 'a')) { my $link = $_->attr('href') or next; next unless $link =~ m{/src/([^/]+)$}; # 4chan image links match this my $fn = $1; next if -e $fn; $links{$link}++; next; } my @links = sort {;no warnings 'numeric'; $a<=>$b} keys %links; if (USE_WGET) { system(qw(wget -c -N -nv), @links) if keys %links; } else { for my $link (@links) { next unless $link =~ m{/src/([^/]+)$}; # 4chan image links match this my $fn = $1; next if -e $fn; my $res = getstore($link, $fn); warn "404\n", last if $res == 404; } } schedule_url($url, time + SLEEP_SECONDS); } continue { chdir $olddir or die $!; rmdir $dest; } sleep_read(SLEEP_SECONDS); }