=head1 NAME

WWW::Crawler::Mojo - A web crawling framework for Perl

=head1 SYNOPSIS

    use strict;
    use warnings;
    use utf8;
    use WWW::Crawler::Mojo;
    use 5.10.0;

    my $bot = WWW::Crawler::Mojo->new;
    my %count;

    $bot->on(res => sub {
        my ($bot, $discover, $queue, $res) = @_;

        $count{$res->code}++;

        if ($res->code == 404) {
            say sprintf('404 occurred! : %s referred by %s',
                $queue->resolved_uri, $queue->referrer->resolved_uri);
        }

        my @disp_seed;
        push(@disp_seed, sprintf('%s:%s', $_, $count{$_})) for (keys %count);

        $| = 1;
        print(join(' / ', @disp_seed), ' ' x 30);
        print("\r");

        $discover->();
    });

    $bot->on(refer => sub {
        my ($bot, $enqueue, $queue, $context) = @_;
        $enqueue->();
    });

    $bot->enqueue('http://example.com/');
    $bot->peeping_port(3001);
    $bot->crawl;

=head1 DESCRIPTION

L<WWW::Crawler::Mojo> is a web crawling framework for those who are familiar
with L<Mojo>::* APIs.

Note that the module is aimed at casual crawling of a moderate number of web
pages, so DO NOT use it for persistent crawler jobs.

=head1 ATTRIBUTES

L<WWW::Crawler::Mojo> inherits all attributes from L<Mojo::EventEmitter> and
implements the following new ones.

=head2 ua

A L<Mojo::UserAgent> instance.

    my $ua = $bot->ua;
    $bot->ua(Mojo::UserAgent->new);

=head2 ua_name

The name of the crawler, used for the User-Agent header.

    $bot->ua_name('my-bot/0.01 (+https://example.com/)');
    say $bot->ua_name; # 'my-bot/0.01 (+https://example.com/)'

=head2 active_conn

The number of currently active connections.

    $bot->active_conn($bot->active_conn + 1);
    say $bot->active_conn;

=head2 active_conns_per_host

The number of currently active connections per host.

    $bot->active_conns_per_host($bot->active_conns_per_host + 1);
    say $bot->active_conns_per_host;

=head2 depth

The maximum depth to crawl. Note that the depth is the number of HTTP requests
needed to reach a URI, counted from the first queue; it is not the depth of
the URI path as counted by slashes.

    $bot->depth(5);
    say $bot->depth; # 5

=head2 max_conn

The maximum number of connections.

    $bot->max_conn(5);
    say $bot->max_conn; # 5

=head2 max_conn_per_host

The maximum number of connections per host.

    $bot->max_conn_per_host(5);
    say $bot->max_conn_per_host; # 5

=head2 peeping_max_length

The maximum length of peeping API content.

    $bot->peeping_max_length(100000);
    say $bot->peeping_max_length; # 100000

=head2 queues

A FIFO array containing L<WWW::Crawler::Mojo::Queue> objects.

    push(@{$bot->queues}, WWW::Crawler::Mojo::Queue->new(...));
    my $queue = shift @{$bot->queues};

=head1 EVENTS

L<WWW::Crawler::Mojo> inherits all events from L<Mojo::EventEmitter> and
implements the following new ones.

=head2 res

Emitted when the crawler receives a response from a server.

    $bot->on(res => sub {
        my ($bot, $discover, $queue, $res) = @_;
        if (...) {
            $discover->();
        } else {
            # DO NOTHING
        }
    });

=head2 refer

Emitted when a new URI is found. You can enqueue the URI conditionally with
the callback.

    $bot->on(refer => sub {
        my ($bot, $enqueue, $queue, $context) = @_;
        if (...) {
            $enqueue->();
        } elsif (...) {
            $enqueue->(...); # maybe a different URL
        } else {
            # DO NOTHING
        }
    });

=head2 empty

Emitted when the queue length reaches zero. The length is checked every 5
seconds.

    $bot->on(empty => sub {
        my ($bot) = @_;
        say "Queue is drained out.";
    });

=head2 error

Emitted when the user agent returns no status code for a request, typically
because of a network error or an unresponsive server.

    $bot->on(error => sub {
        my ($bot, $error, $queue) = @_;
        say "error: $error";
        if (...) { # e.g. until the failure occurs 3 times
            $bot->requeue($queue);
        }
    });

Note that server errors such as 404 or 500 cannot be caught with this event.
Consider the res event for that use case instead.

=head2 start

Emitted right before crawling starts.

    $bot->on(start => sub {
        my $self = shift;
        ...
    });
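The error handler above leaves its retry condition as a placeholder. As a
rough sketch, one way to bound retries is to count failures per URI yourself;
the C<%fails> hash below is an illustration, not part of the module.

    # Sketch: retry each failed queue at most twice more (3 attempts total).
    # %fails is a hypothetical counter keyed on the queue's resolved URI.
    my %fails;
    $bot->on(error => sub {
        my ($bot, $error, $queue) = @_;
        my $uri = $queue->resolved_uri;
        warn "error: $error ($uri)\n";
        $bot->requeue($queue) if ++$fails{$uri} < 3;
    });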
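The refer event can likewise be used to keep a crawl on a single host. The
sketch below assumes that C<$queue> in the refer callback represents the newly
discovered URI; the seed host and comparison are only an example.

    use Mojo::URL;

    # Sketch: only enqueue URIs whose host matches the seed host.
    my $seed_host = Mojo::URL->new('http://example.com/')->host;
    $bot->on(refer => sub {
        my ($bot, $enqueue, $queue, $context) = @_;
        my $host = Mojo::URL->new($queue->resolved_uri . '')->host;
        $enqueue->() if $host && $host eq $seed_host;
    });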
=head1 METHODS

L<WWW::Crawler::Mojo> inherits all methods from L<Mojo::EventEmitter> and
implements the following new ones.

=head2 crawl

Starts the crawling loop.

    $bot->crawl;

=head2 init

Initializes the crawler settings.

    $bot->init;

=head2 process_queue

Processes a queue.

    $bot->process_queue;

=head2 say_start

Displays a startup message on STDOUT.

    $bot->say_start;

=head2 peeping_handler

The peeping API dispatcher.

    $bot->peeping_handler($loop, $stream);

=head2 discover

Parses a web page and discovers the links in it. Each link is appended to the
FIFO queue array.

    $bot->discover($res, $queue);

=head2 enqueue

Appends a queue entry for a URI or a L<WWW::Crawler::Mojo::Queue> object.

    $bot->enqueue($queue);

=head2 requeue

Re-appends a queue entry for retry.

    $bot->on(error => sub {
        my ($bot, $error, $queue) = @_;
        if (...) { # e.g. until the failure occurs 3 times
            $bot->requeue($queue);
        }
    });

=head2 collect_urls_html

Collects URLs out of HTML.

    WWW::Crawler::Mojo::collect_urls_html($dom, sub {
        my ($uri, $dom) = @_;
    });

=head2 collect_urls_css

Collects URLs out of CSS.

    WWW::Crawler::Mojo::collect_urls_css($dom, sub {
        my $uri = shift;
    });

=head2 guess_encoding

Guesses the encoding of HTML or CSS content from a given
L<Mojo::Message::Response> instance.

    my $encode = WWW::Crawler::Mojo::guess_encoding($res) || 'utf-8';

=head2 resolve_href

Resolves a URL against a base URL.

    WWW::Crawler::Mojo::resolve_href($base, $uri);

=head1 EXAMPLE

L

=head1 AUTHOR

Sugama Keita, E<lt>sugama@jamadam.comE<gt>

=head1 COPYRIGHT AND LICENSE

Copyright (C) jamadam

This program is free software; you can redistribute it and/or modify it
under the same terms as Perl itself.

=cut