#!/usr/bin/perl # # usage: dmozlists.pl content.rdf.u8[.gz] # # This script filters a RDF dump file of Open Directory Project # and extracts some domains/urls useful for filtering from it. # # The copyright of RDF files is held by AOL/Netscape # You must agree with the Open Directory License # (at http://dmoz.org/license.html) in order to use this script. # # The lastest RDF files are available at http://dmoz.org/rdf.html # # By Masanori Harada 2002 (harada@ingrid.org) # # # This program is free software; you can redistribute it and/or modify it # under the terms of the GNU General Public License (version 2) as # published by the Free Software Foundation. It is distributed in the # hope that it will be useful, but WITHOUT ANY WARRANTY; without even the # implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR # PURPOSE. See the GNU General Public License (GPL) for more details. # # You should have received a copy of the GNU General Public License # (GPL) along with this program. # use 5.000; # perl4 is not supported. use strict; use File::Path; use FileHandle; my $VERSION = '1.0.0'; my $GUNZIP = 'gzip -qdc'; my $content_path = shift; unless (defined($content_path)) { print STDOUT "usage: dmozlists.pl content.rdf.u8[.gz]\r\n"; exit 1; } if ($content_path =~ /\.gz$/i) { open(CONTENT, "${GUNZIP} ${content_path}|") || die "cannot open and decompress ${content_path} : $!"; } else { open(CONTENT, "<${content_path}") || die "cannot open ${content_path} : $!"; } my %CATS = ( # category regex => [rule-dir, domains filehandle, urls filehandle] '^Top/Adult' => ['dmozlists/adult', undef, undef], '^Top/Arts' => ['dmozlists/arts', undef, undef], '^Top/Business' => ['dmozlists/business', undef, undef], '^Top/Computers' => ['dmozlists/computers', undef, undef], '^Top/Games' => ['dmozlists/games', undef, undef], '^Top/Health' => ['dmozlists/health', undef, undef], '^Top/Home' => ['dmozlists/home', undef, undef], '^Top/News' => ['dmozlists/news', undef, undef], '^Top/Recreation' => ['dmozlists/recreation', undef, undef], '^Top/Reference' => ['dmozlists/reference', undef, undef], '^Top/Regional' => ['dmozlists/regional', undef, undef], '^Top/Science' => ['dmozlists/science', undef, undef], '^Top/Shopping' => ['dmozlists/shopping', undef, undef], '^Top/Society' => ['dmozlists/society', undef, undef], '^Top/Sports' => ['dmozlists/sports', undef, undef], '^Top/World' => ['dmozlists/world', undef, undef], '^Top/Kids_and_Teens' => ['dmozlists/kids_and_teens', undef, undef] ); my %AGES = ( # type => [rule-dir, domains filehandle, urls filehandle] 'kids' => ['dmozlists/ages/kids', undef, undef], 'teen' => ['dmozlists/ages/teen', undef, undef], 'mteen' => ['dmozlists/ages/mteen', undef, undef] ); foreach (values %CATS, values %AGES) { my $dir = ${$_}[0]; $dir =~ s@/$@@; my $doms_tmp = "${dir}/domains.tmp"; my $urls_tmp = "${dir}/urls.tmp"; mkpath($dir, 0, 0777) unless (-e $dir); my $domfile = new FileHandle; my $urlfile = new FileHandle; open($domfile, ">${doms_tmp}") || die $!; open($urlfile, ">${urls_tmp}") || die $!; ${$_}[1] = $domfile; ${$_}[2] = $urlfile; } my @matches = (); my $catname; my $url; my $lineno = 0; my $line; my $pageurl; while ($line = ) { if ($line =~ m@^= 0 && $line =~ m@@) { $pageurl = $1; } elsif ($line =~ m@(.*)@) { my @ages = split('/', $1); $pageurl = unescape($pageurl); next unless ($pageurl =~ m@^(http|https|ftp):@); $pageurl =~ s@#.*@@; # remove ref $pageurl =~ s@^(.*?:/*)@@; # remove protocol unless ($pageurl =~ /\?/) { $pageurl =~ s@/[^/]*$@@; # remove filename } next if ($pageurl =~ /\s+/); # skip strange urls $pageurl =~ tr/A-Z/a-z/; $pageurl =~ s@^www\.@@; foreach (@ages) { my @entry = @{$AGES{$_}}; my $DOMFILE = $entry[1]; my $URLFILE = $entry[2]; if ($pageurl =~ m@^([^/:]+)$@) { $pageurl =~ s/\.+$//; # remove extra periods print $DOMFILE "${pageurl}\n"; } else { print $URLFILE "${pageurl}\n"; } } } } foreach (values %CATS, values %AGES) { my $dir = ${$_}[0]; close(${$_}[1]); close(${$_}[2]); my $doms_tmp = "${dir}/domains.tmp"; my $urls_tmp = "${dir}/urls.tmp"; my $domspath = "${dir}/domains"; my $urlspath = "${dir}/urls"; open(DOMSTMP, "<${doms_tmp}") || warn $!; open(DOMSFILE, ">${domspath}") || warn $!; $_ = printdoms(\*DOMSFILE, grep(chomp, )); print "${domspath}\t$_\r\n"; close(DOMSFILE); close(DOMSTMP); unlink $doms_tmp; open(URLSTMP, "<${urls_tmp}") || warn $!; open(URLSFILE, ">${urlspath}") || warn $!; $_ = printurls(\*URLSFILE, grep(chomp, )); print "${urlspath}\t$_\r\n"; close(URLSFILE); close(URLSTMP); unlink $urls_tmp; } exit 0; # ---------------------------------------- sub unescape($) { my ($url) = shift; $url =~ s/"/\"/g; $url =~ s/</\/g; $url =~ s/&/\&/g; $url =~ s/%([0-9A-Fa-f][0-9A-Fa-f])/pack("C",hex($1))/eg; return $url; } sub printurls($,@) { my ($URLFILE, @urls) = @_; my $count = 0; my $lasturl = ''; foreach (sort @urls) { if ($_ ne $lasturl) { print $URLFILE "$_\r\n"; ++$count; } $lasturl = $_; } return $count; } sub printdoms($,@) { my ($DOMFILE, @doms) = @_; my $count = 0; my $lastdom = ''; foreach (sort {revdomain($a) cmp revdomain($b)} @doms) { if ($_ ne $lastdom) { print $DOMFILE "$_\r\n"; ++$count; } $lastdom = $_; } return $count; } sub revdomain($) { my $str = shift; return join('.', reverse split('\.', $str)); } sub revstr($) { my $str = shift; return join('', reverse split(//, $str)); }