#!/usr/bin/perl # torrentreactor.pl v0.1 # Josh Jackson use LWP::UserAgent; use integer; use HTML::Tidy; $categoriesfile = 'categories.txt'; if ($ARGV[0] == "" ) { open(INFO, $categoriesfile); while ($_ = ) { print "$_"; } close(INFO); print "\n\n"; print "usage: torrentreactor.pl \n"; exit; } $section = $ARGV[0]; $url = "http://www.torrentreactor.net/torrents/section_${section}"; #start with some atrocious variable names so reading the script is more interesting $string1 = "\"; $string2 = "\<\/TABLE\>"; $splitrows = "\<\/TR\>\n\agent("Mozilla/4.0 (compatible; IAmTheWalrus 5.5; OpenBDSM)"); $res = $ua->request(HTTP::Request->new(GET => $url)); $_ = $res->content; s/\(([0-9]*)\ total\)/\1/; #my trusty regex will find it! $total = $1; $pages = ($total / 50) + 1; #the plot thickens #go through each page of results in this section and get the goods for ( $page = 0; $page <= $pages; ++$page ) { $offset = ($page * 50); $url = "http://www.torrentreactor.net/torrents/section_${section}&off=${offset}"; $ua = new LWP::UserAgent; $res = $ua->request(HTTP::Request->new(GET => $url)); $text = $res->content; my $tidy = new HTML::Tidy; #Tidy is a delightful tool $tidy->clean($text); #It gives me consistent, easy to work with output #so I don't have to parse messy real-world html @text = split(/\>\\n\<", @text); #follows another tag with nothing between @string3 = split(/$string1/,$text); #find the table we want $string4 = $string3[1]; #grab everything after that table opens @string5 = split(/$string2/, $string4); #find table endings $string6 = $string5[0]; #grab inside the table @theserows = split(/$splitrows/,$string6); #split the table rows ($header, @theserows) = @theserows; #first row is the header push(@rows, @theserows); } foreach $row (@rows) { @cols = split(/$splitcols/,$row); #split into TDs #One Regular Expression to rule them all, One Regular Expression to find them, #One Regular Expression to bring them all and in the array bind them #In the Land of Perl Code where the $_'s lie. $_ = $cols[0]; if ( s/\>(.*\:.*)/\1/ ) { $time = $1; } #these would be great places for 'else' statements and error #flagging code, in case one of the regexes didn't always $_ = $cols[1]; #work with the real data. if I cared enough. if ( s/view_.*title\=\"(.*)\"/\1/ ) { $title = $1; } $_ = $cols[1]; if ( s/HREF\=\"(http\:\/\/www\.torrentreactor\.net\/torrents\/download_[0-9]*)\"/\1/ ) { $dl = $1; } $_ = $cols[1]; if ( s/HREF\=\"(http\:\/\/www\.torrentreactor\.net\/torrents\/view_[0-9]*)\"/\1/ ) { $view = $1; } $_ = $cols[2]; if ( s/\>([0-9][0-9].*)/\1/ ) { $size = $1; } $_ = $cols[3]; if ( s/\>([0-9].*)/\1/ ) { $seed = $1; } $_ = $cols[4]; if ( s/\>([0-9].*)/\1/ ) { $leech = $1; } $_ = $cols[5]; if ( s/section_.*\"\>(.*)\(.*)\