SlideShare a Scribd company logo
All YOUR PAGE ARE BELONG TO US
    すべてのウェブページをこの手に

            2012/11/16

           株式会社はてな
          大西康裕 id:onishi
  id:onishi 大西康裕
  ONISHI
  @yasuhiro_onishi
  株式会社はてな
  はてなブログ
ウェブページを
保存したい
ウェブページを保存したい
 •ウェブページは日々変化する
 •手元に置いておきたい
 •競合調査
 • 魚拓

 •画像などまとめて保存したい
Google
Chrome
wget.pl
HTML::Parser
my $result;
my $parser    = HTML::Parser->new(
    start_h      => [ sub {}, 'self,tagname,attr,text' ],
    default_h    => [ sub {}, 'self,text' ],
);
$parser->parse($content);
print $result;


   •   text
   •   start
   •   end
   •   process
   •   declaration
   •   comment
   •   default
HTML::Parser
start_h => [
    sub {
        my($self, $tagname, $attr, $text) = @_;
        $result .= "<$tagname";
        for my $key (sort keys %$attr) {
             my $value = $attr->{$key};
             if ($key =~ /^(?:src)$/i) {
                 # HTTP GET して保存してローカルパスにする
                 $value = get_src($value);
             }
             $result .= qq{ $key="$value"};
         }
         $result .= ">";
     },
     'self,tagname,attr,text',
],
HTML::Parser
default_h => [
    sub {
        my($self, $text) = @_;
        $result .= $text;
    },
    'self,text',
],
完
wget.pl
CSSから参照
$content =~ s{url(([^)]+))}{
    my $link = $1;

       # relative link (from HTML::ResolveLink)
       my $u = URI->new($link);
       unless (defined $u->scheme) {
           my $old = $u;
           $u = $u->abs($url);
       }
       $link = get_src($u); # HTTP GET して保存してローカルパスに
       "url($link)";
}eg;
script 殺す
my $context = { disallow => 0 };
my $disallow_tag = qr{script};
start_h => [sub {
    if ($tagname =~ /^(?:$disallow_tag)$/i) {
        $context->{disallow}++; return;
    }
}],
end_h => [sub {
    if ($tagname =~ /^(?:$disallow_tag)$/i) {
        $context->{disallow}--; return;
    }
}],
default_h => [sub {
    if ($context->{disallow} > 0) {
        return;
    }
}],
noscript 内を生かす
my $nodisplay_tag = qr{noscript};

start_h => [sub {
    if ($tagname =~ /^(?:$nodisplay_tag)$/i) {
        return;
    }
}],
end_h => [sub {
    if ($tagname =~ /^(?:$nodisplay_tag)$/i) {
        return;
    }
}],
base


start_h => [sub {
    if ($tagname =~ /^(?:base)$/i and $key =~ /^(?:href)$/i)   {
        $value = "./";
    }
}],
できました!

gist.github.com/

   4071196
#!/usr/bin/env perl
use strict;
use warnings;
use utf8;

use    DateTime;
use    Digest::SHA1 qw(sha1_hex);
use    Encode;
use    File::Path qw/make_path/;
use    HTML::Parser;
use    HTML::ResolveLink;
use    HTTP::Request::Common qw/GET/;
use    IO::All;
use    LWP::UserAgent;
use    URI;

my $path = './';

my    $uri        =   URI->new(shift) or die;
my    $now        =   DateTime->now;
my    $ymd        =   $now->ymd;
my    $ua         =   LWP::UserAgent->new(agent => 'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)');
my    $resolver   =   HTML::ResolveLink->new(base => $uri);
my    $res        =   $ua->request(GET $uri);
my    $content    =   $resolver->resolve($res->decoded_content);

my $dir           = $uri;
   $dir           =~ s{[^A-Za-z0-9.]+}{-}g;
   $dir           =~ s{-+$}{};
   $dir           = "$path/$dir/$ymd/";
   $dir           =~ s{/+}{/}g;

make_path($dir);

my $disallow_tag = qr{script};
my $nodisplay_tag = qr{noscript};

my $result;
my $context = { disallow => 0 };
my $parser    = HTML::Parser->new(
    api_version => 3,
    start_h      => [
        sub {
            my($self, $tagname, $attr, $text) = @_;
            if ($tagname =~ /^(?:$nodisplay_tag)$/i) {
                 return;
            } elsif ($tagname =~ /^(?:$disallow_tag)$/i) {
                 $context->{disallow}++;
                 return;
            }
            $result .= "<$tagname";
            for my $key (sort keys %$attr) {
                 $key eq '/' and next;
                 my $value = $attr->{$key};
                 if ($key =~ /^(?:src)$/i) {
                      $value = get_src($value);
                 } elsif ($tagname =~ /^(?:link)$/i and $key =~ /^(?:href)$/i)        {
                      $value = get_link($value);
                 } elsif ($tagname =~ /^(?:base)$/i and $key =~ /^(?:href)$/i)        {
                      $value = $path;
                 }
                 $result .= qq{ $key="$value"};
            }
            $result .= ">";
        },
        'self,tagname,attr,text',
    ],
    end_h      => [
        sub {
            my($self, $tagname, $text) = @_;
            if ($tagname =~ /^(?:$nodisplay_tag)$/i) {
                 return;
            } elsif ($tagname =~ /^(?:$disallow_tag)$/i) {
                 $context->{disallow}--;
                 return;
            }
            $result .= $text;
        },
        'self,tagname,text',
    ],
    default_h    => [
        sub {
            my($self, $text) = @_;
            if ($context->{disallow} > 0) {
                 return;
            }
            $result .= $text;
        },
        'self,text',
    ],
);

$parser->parse($content);

$result =~ s{(<head[^>]*>)}{$1<meta http-equiv="Content-Type" content="text/html; charset=utf-8">}i; # XXX

$result = Encode::encode('utf-8', $result);

$result > io("${dir}index.html");

print "${dir}index.htmln";

sub get_src {
    my $src = shift or return;
    unless (-e "${dir}file") {
        make_path("${dir}file");
    }
    my $file = $src;
    $file =~ s{[^A-Za-z0-9.]+}{-}g;
    if (length($file) > 255) {
        $file = sha1_hex($file);
    }
    $file = "file/$file";
    $file =~ s{/+}{/}g;
    unless (-e "$dir$file") {
        $ua->request(GET $src)->content >> io("$dir$file");
        sleep(1); # DOS対策対策
       }
       $file;
}

sub get_link {
    my $url = shift or return;
    my $file = get_src($url);
    my $io = io("$dir$file");
    my $content = $io->slurp;
    $content =~ s{url(([^)]+))}{
        my $link = $1;
        $link =~ s{^[s"']+}{};
        $link =~ s{[s"']+$}{};

           # relative link (from HTML::ResolveLink)
           my $u = URI->new($link);
           unless (defined $u->scheme) {
               my $old = $u;
               $u = $u->abs($url);
           }
           $link = get_src($u);
           $link =~ s{^file/}{};
           "url($link)";
       }eg;
       $content > $io;
       return $file;
}
Google
Chrome
wget.pl
wget.pl
wget.pl
どうぞご利用ください!

gist.github.com/

   4071196
ご清聴ありがとうございました

More Related Content

PDF
C A S Sample Php
PPTX
London XQuery Meetup: Querying the World (Web Scraping)
PPTX
Topological indices (t is) of the graphs to seek qsar models of proteins com...
PDF
20 modules i haven't yet talked about
PDF
Pemrograman Web 9 - Input Form DB dan Session
PDF
Pemrograman Web 8 - MySQL
PDF
DEV Čtvrtkon #76 - Fluent Interface
PDF
R57shell
C A S Sample Php
London XQuery Meetup: Querying the World (Web Scraping)
Topological indices (t is) of the graphs to seek qsar models of proteins com...
20 modules i haven't yet talked about
Pemrograman Web 9 - Input Form DB dan Session
Pemrograman Web 8 - MySQL
DEV Čtvrtkon #76 - Fluent Interface
R57shell

What's hot (20)

PPT
Database api
TXT
PDF
introduction to Django in five slides
PPT
An Elephant of a Different Colour: Hack
PDF
The History of PHPersistence
PPTX
Database Management - Lecture 4 - PHP and Mysql
ODP
PHP pod mikroskopom
PDF
Database Design Patterns
DOC
PDF
Php (1)
PDF
Uncovering Iterators
PDF
Agile database access with CakePHP 3
ODT
linieaire regressie
PPTX
PHP Lecture 4 - Working with form, GET and Post Methods
PDF
PHP for Adults: Clean Code and Object Calisthenics
PDF
Doctrine fixtures
PDF
Perl Fitxers i Directoris
PDF
CGI.pm - 3ло?!
DOCX
Table through php
KEY
Object Calisthenics Applied to PHP
Database api
introduction to Django in five slides
An Elephant of a Different Colour: Hack
The History of PHPersistence
Database Management - Lecture 4 - PHP and Mysql
PHP pod mikroskopom
Database Design Patterns
Php (1)
Uncovering Iterators
Agile database access with CakePHP 3
linieaire regressie
PHP Lecture 4 - Working with form, GET and Post Methods
PHP for Adults: Clean Code and Object Calisthenics
Doctrine fixtures
Perl Fitxers i Directoris
CGI.pm - 3ло?!
Table through php
Object Calisthenics Applied to PHP
Ad

Viewers also liked (6)

PDF
開発合宿!!!!
KEY
Redmine::ChanでIRCからプロジェクト管理
PDF
oEmbed と Text::Hatena
PDF
Webサーバ勉強会#5
PDF
ウェブアプリケーションのパフォーマンスチューニング
PDF
Hatena blogdevelopmentflow
開発合宿!!!!
Redmine::ChanでIRCからプロジェクト管理
oEmbed と Text::Hatena
Webサーバ勉強会#5
ウェブアプリケーションのパフォーマンスチューニング
Hatena blogdevelopmentflow
Ad

Recently uploaded (20)

PDF
Advanced methodologies resolving dimensionality complications for autism neur...
PDF
Encapsulation theory and applications.pdf
PPTX
Cloud computing and distributed systems.
PDF
Per capita expenditure prediction using model stacking based on satellite ima...
PDF
NewMind AI Monthly Chronicles - July 2025
PDF
Mobile App Security Testing_ A Comprehensive Guide.pdf
PDF
CIFDAQ's Market Insight: SEC Turns Pro Crypto
PDF
TokAI - TikTok AI Agent : The First AI Application That Analyzes 10,000+ Vira...
PDF
Chapter 3 Spatial Domain Image Processing.pdf
PDF
Empathic Computing: Creating Shared Understanding
PDF
Review of recent advances in non-invasive hemoglobin estimation
PDF
Modernizing your data center with Dell and AMD
PDF
How UI/UX Design Impacts User Retention in Mobile Apps.pdf
DOCX
The AUB Centre for AI in Media Proposal.docx
PPTX
Digital-Transformation-Roadmap-for-Companies.pptx
PDF
Unlocking AI with Model Context Protocol (MCP)
PPTX
VMware vSphere Foundation How to Sell Presentation-Ver1.4-2-14-2024.pptx
PDF
7 ChatGPT Prompts to Help You Define Your Ideal Customer Profile.pdf
PDF
NewMind AI Weekly Chronicles - August'25 Week I
PDF
Spectral efficient network and resource selection model in 5G networks
Advanced methodologies resolving dimensionality complications for autism neur...
Encapsulation theory and applications.pdf
Cloud computing and distributed systems.
Per capita expenditure prediction using model stacking based on satellite ima...
NewMind AI Monthly Chronicles - July 2025
Mobile App Security Testing_ A Comprehensive Guide.pdf
CIFDAQ's Market Insight: SEC Turns Pro Crypto
TokAI - TikTok AI Agent : The First AI Application That Analyzes 10,000+ Vira...
Chapter 3 Spatial Domain Image Processing.pdf
Empathic Computing: Creating Shared Understanding
Review of recent advances in non-invasive hemoglobin estimation
Modernizing your data center with Dell and AMD
How UI/UX Design Impacts User Retention in Mobile Apps.pdf
The AUB Centre for AI in Media Proposal.docx
Digital-Transformation-Roadmap-for-Companies.pptx
Unlocking AI with Model Context Protocol (MCP)
VMware vSphere Foundation How to Sell Presentation-Ver1.4-2-14-2024.pptx
7 ChatGPT Prompts to Help You Define Your Ideal Customer Profile.pdf
NewMind AI Weekly Chronicles - August'25 Week I
Spectral efficient network and resource selection model in 5G networks

wget.pl

  • 1. All YOUR PAGE ARE BELONG TO US すべてのウェブページをこの手に 2012/11/16 株式会社はてな 大西康裕 id:onishi
  • 2.   id:onishi 大西康裕   ONISHI   @yasuhiro_onishi   株式会社はてな   はてなブログ
  • 7. HTML::Parser my $result; my $parser = HTML::Parser->new( start_h => [ sub {}, 'self,tagname,attr,text' ], default_h => [ sub {}, 'self,text' ], ); $parser->parse($content); print $result; • text • start • end • process • declaration • comment • default
  • 8. HTML::Parser start_h => [ sub { my($self, $tagname, $attr, $text) = @_; $result .= "<$tagname"; for my $key (sort keys %$attr) { my $value = $attr->{$key}; if ($key =~ /^(?:src)$/i) { # HTTP GET して保存してローカルパスにする $value = get_src($value); } $result .= qq{ $key="$value"}; } $result .= ">"; }, 'self,tagname,attr,text', ],
  • 9. HTML::Parser default_h => [ sub { my($self, $text) = @_; $result .= $text; }, 'self,text', ],
  • 10.
  • 12. CSSから参照 $content =~ s{url(([^)]+))}{ my $link = $1; # relative link (from HTML::ResolveLink) my $u = URI->new($link); unless (defined $u->scheme) { my $old = $u; $u = $u->abs($url); } $link = get_src($u); # HTTP GET して保存してローカルパスに "url($link)"; }eg;
  • 13. script 殺す my $context = { disallow => 0 }; my $disallow_tag = qr{script}; start_h => [sub { if ($tagname =~ /^(?:$disallow_tag)$/i) { $context->{disallow}++; return; } }], end_h => [sub { if ($tagname =~ /^(?:$disallow_tag)$/i) { $context->{disallow}--; return; } }], default_h => [sub { if ($context->{disallow} > 0) { return; } }],
  • 14. noscript 内を生かす my $nodisplay_tag = qr{noscript}; start_h => [sub { if ($tagname =~ /^(?:$nodisplay_tag)$/i) { return; } }], end_h => [sub { if ($tagname =~ /^(?:$nodisplay_tag)$/i) { return; } }],
  • 15. base start_h => [sub { if ($tagname =~ /^(?:base)$/i and $key =~ /^(?:href)$/i) { $value = "./"; } }],
  • 17. #!/usr/bin/env perl use strict; use warnings; use utf8; use DateTime; use Digest::SHA1 qw(sha1_hex); use Encode; use File::Path qw/make_path/; use HTML::Parser; use HTML::ResolveLink; use HTTP::Request::Common qw/GET/; use IO::All; use LWP::UserAgent; use URI; my $path = './'; my $uri = URI->new(shift) or die; my $now = DateTime->now; my $ymd = $now->ymd; my $ua = LWP::UserAgent->new(agent => 'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)'); my $resolver = HTML::ResolveLink->new(base => $uri); my $res = $ua->request(GET $uri); my $content = $resolver->resolve($res->decoded_content); my $dir = $uri; $dir =~ s{[^A-Za-z0-9.]+}{-}g; $dir =~ s{-+$}{}; $dir = "$path/$dir/$ymd/"; $dir =~ s{/+}{/}g; make_path($dir); my $disallow_tag = qr{script}; my $nodisplay_tag = qr{noscript}; my $result; my $context = { disallow => 0 }; my $parser = HTML::Parser->new( api_version => 3, start_h => [ sub { my($self, $tagname, $attr, $text) = @_; if ($tagname =~ /^(?:$nodisplay_tag)$/i) { return; } elsif ($tagname =~ /^(?:$disallow_tag)$/i) { $context->{disallow}++; return; } $result .= "<$tagname"; for my $key (sort keys %$attr) { $key eq '/' and next; my $value = $attr->{$key}; if ($key =~ /^(?:src)$/i) { $value = get_src($value); } elsif ($tagname =~ /^(?:link)$/i and $key =~ /^(?:href)$/i) { $value = get_link($value); } elsif ($tagname =~ /^(?:base)$/i and $key =~ /^(?:href)$/i) { $value = $path; } $result .= qq{ $key="$value"}; } $result .= ">"; }, 'self,tagname,attr,text', ], end_h => [ sub { my($self, $tagname, $text) = @_; if ($tagname =~ /^(?:$nodisplay_tag)$/i) { return; } elsif ($tagname =~ /^(?:$disallow_tag)$/i) { $context->{disallow}--; return; } $result .= $text; }, 'self,tagname,text', ], default_h => [ sub { my($self, $text) = @_; if ($context->{disallow} > 0) { return; } $result .= $text; }, 'self,text', ], ); $parser->parse($content); $result =~ s{(<head[^>]*>)}{$1<meta http-equiv="Content-Type" content="text/html; charset=utf-8">}i; # XXX $result = Encode::encode('utf-8', $result); $result > io("${dir}index.html"); print "${dir}index.htmln"; sub get_src { my $src = shift or return; unless (-e "${dir}file") { make_path("${dir}file"); } my $file = $src; $file =~ s{[^A-Za-z0-9.]+}{-}g; if (length($file) > 255) { $file = sha1_hex($file); } $file = "file/$file"; $file =~ s{/+}{/}g; unless (-e "$dir$file") { $ua->request(GET $src)->content >> io("$dir$file"); sleep(1); # DOS対策対策 } $file; } sub get_link { my $url = shift or return; my $file = get_src($url); my $io = io("$dir$file"); my $content = $io->slurp; $content =~ s{url(([^)]+))}{ my $link = $1; $link =~ s{^[s"']+}{}; $link =~ s{[s"']+$}{}; # relative link (from HTML::ResolveLink) my $u = URI->new($link); unless (defined $u->scheme) { my $old = $u; $u = $u->abs($url); } $link = get_src($u); $link =~ s{^file/}{}; "url($link)"; }eg; $content > $io; return $file; }