Index: Makefile =================================================================== RCS file: D:/home/kazuhiro/CVSROOT/wikipedia-fpw/Makefile,v retrieving revision 1.11 diff -u -r1.11 Makefile --- Makefile 30 Jul 2008 11:55:26 -0000 1.11 +++ Makefile 22 Oct 2008 11:22:37 -0000 @@ -25,7 +25,7 @@ FPWPARSERFLAGS = ${SOURCES} ${COPYRIGHT_FILE} PREPARSE_DEP = ${SRCDIR}/preparse.dep ALLDEPS = ${PREPARSE_DEP} -CLEANEXTRA = ${PREPARSE_DEP} ${SRCDIR}/entries +CLEANEXTRA = ${SRCDIR}/${PREPARSE_DEP} ${SRCDIR}/entries ${SRCDIR}/redirects .SUFFIXES: @@ -34,5 +34,5 @@ ${PREPARSE_DEP}: ${COMMON_SOURCES} ${PREPARSER} @rm -f ${PARSE_DEP} ${PERL} ${PERLINCFLAGS} ${PERLFLAGS} ${PREPARSER} \ - ${SRCDIR}/wikipedia.xml ${SRCDIR}/entries + ${SRCDIR}/wikipedia.xml ${SRCDIR}/entries ${SRCDIR}/redirects @echo timestamp > ${SRCDIR}/preparse.dep Index: fpwwikipedia =================================================================== RCS file: D:/home/kazuhiro/CVSROOT/wikipedia-fpw/fpwwikipedia,v retrieving revision 1.11 diff -u -r1.11 fpwwikipedia --- fpwwikipedia 30 Jul 2008 11:55:26 -0000 1.11 +++ fpwwikipedia 23 Oct 2008 00:34:58 -0000 @@ -20,6 +20,10 @@ use Encode qw/ from_to /; use FileHandle; +use DB_File; +use Fcntl; + + use vars qw(%fpwwikipedia_conf); require "wikipedia-fpw.conf"; @@ -28,7 +32,7 @@ use vars qw ($utf2euc_regexp); require "tables"; -use vars qw(%entry_headings); +use vars qw(%entry_headings %redirects); MAIN: { my $time = time; @@ -41,7 +45,7 @@ 'word2' => \$fpwword2, 'copyright' => \$fpwcopyright); - get_entry_headings('entries'); + get_entry_headings('entries', 'redirects'); my $copyright_filename; if ( $#ARGV < 1) { @@ -149,6 +153,9 @@ } } + untie(%entry_headings); + untie(%redirects); + finalize_fpwparser('text' => \$fpwtext, 'heading' => \$fpwheading, 'word2' => \$fpwword2, @@ -201,17 +208,18 @@ my @keys = ($key); - if (defined($entry_headings{$key}) - && $entry_headings{$key} != 1) { - push (@keys, @{$entry_headings{$key}}); - } else { - @keys = ($key); + if (defined($redirects{$key})) { + push (@keys, split(/\t/, $redirects{$key})); } - + my @extended_keys; @extended_keys = register_search_entry_internal(@keys); foreach $key (@extended_keys) { + if ($key =~ /^(\xA1\xA1| |\xA1\xC7|\'|\xA1\xDD|-|\xA1\xA6|\xA1\xBE)+$/) { + next; + } + if(verbose_mode ()) { my $tmp = $key; from_to($tmp, 'euc-jp', 'utf-8'); @@ -660,28 +668,16 @@ } sub get_entry_headings { - my $filename = $_[0]; - - if (not -e $filename) { - die("$PROGRAM_NAME: '$filename' does not exist."); - } + my ($entries_filename, $redirects_filename) = @_; - my $entry_file = FileHandle->new(); - if (!$entry_file->open("$filename", 'r')) { - die "$PROGRAM_NAME: Failed to open the file, $ERRNO: $filename\n"; - } - - for (;;) { - $_ = $entry_file->getline(); - if (!defined($_)) { - last; - } - if ($_ =~ /\t/) { - $_ =~ s/^([^\t]+)\t//s; - $entry_headings{$1} = [split(/\t|\n/, $_)]; - } else { - $_ =~ /(.+)\n/; - $entry_headings{$1} = 1; - } + if (not -e $entries_filename) { + die("$PROGRAM_NAME: '$entries_filename' does not exist."); + } elsif (not -e $redirects_filename) { + die("$PROGRAM_NAME: '$redirects_filename' does not exist."); } + + tie(%entry_headings, 'DB_File', $entries_filename, O_RDONLY, + 0644, $DB_HASH); + tie(%redirects, 'DB_File', $redirects_filename, O_RDONLY, + 0644, $DB_HASH); } Index: preparser =================================================================== RCS file: D:/home/kazuhiro/CVSROOT/wikipedia-fpw/preparser,v retrieving revision 1.4 diff -u -r1.4 preparser --- preparser 28 May 2008 00:02:02 -0000 1.4 +++ preparser 18 Oct 2008 09:38:10 -0000 @@ -18,6 +18,9 @@ use English; use FileHandle; +use DB_File; +use Fcntl; + use vars qw(%fpwwikipedia_conf); require "wikipedia-fpw.conf"; @@ -27,13 +30,19 @@ my $entry_count = 0; my $wikipedia_filename = $ARGV[0]; - my $output_filename = $ARGV[1]; + my $entries_filename = $ARGV[1]; + my $redirects_filename = $ARGV[2]; my %headings; my %redirects; my $text; my $heading; + tie(%headings, 'DB_File', $entries_filename, O_CREAT | O_RDWR | O_TRUNC, + 0644, $DB_HASH); + tie(%redirects, 'DB_File', $redirects_filename, O_CREAT | O_RDWR | O_TRUNC, + 0644, $DB_HASH); + if (not -e $wikipedia_filename) { die("$PROGRAM_NAME: '$wikipedia_filename' does not exist."); } @@ -43,11 +52,6 @@ die "$PROGRAM_NAME: Failed to open the file, $ERRNO: $wikipedia_filename\n"; } - my $output = FileHandle->new(); - if (!$output->open("$output_filename", 'w')) { - die "$PROGRAM_NAME: Failed to open the file, $ERRNO: $output_filename\n"; - } - if(verbose_mode ()) { print "Skipping headers: $fpwwikipedia_conf{'skip_heading'}\n"; print "Skipping contents: $fpwwikipedia_conf{'skip_content'}\n"; @@ -96,9 +100,9 @@ print "Redirect: $page_count; $heading.\n"; if (defined($redirects{$_})) { - push(@{$redirects{$_}}, $heading); + $redirects{$_} .= "\t$heading"; } else { - $redirects{$_} = [$heading]; + $redirects{$_} = $heading; } next; } elsif (is_skipped_content($text)){ @@ -121,17 +125,15 @@ } } - foreach $heading (keys(%headings)) { - $text = $heading; - if (defined($redirects{$heading})) { - foreach $_ (@{$redirects{$heading}}) { - $text .= "\t$_"; - } + while ($heading = each(%redirects)) { + if (!defined($headings{$heading})) { + delete($redirects{$heading}); } - $text .= "\n"; - $output->print("$text"); } + untie %headings; + untie %redirects; + printf("$PROGRAM_NAME: Elapsed time : %8dsec.\n", time - $time); printf("$PROGRAM_NAME: Number of entries: %8d\n", $entry_count); } Index: wikipedia-fpw.conf =================================================================== RCS file: D:/home/kazuhiro/CVSROOT/wikipedia-fpw/wikipedia-fpw.conf,v retrieving revision 1.6 diff -u -r1.6 wikipedia-fpw.conf --- wikipedia-fpw.conf 30 Jul 2008 11:55:26 -0000 1.6 +++ wikipedia-fpw.conf 23 Oct 2008 06:43:12 -0000 @@ -15,9 +15,7 @@ %fpwwikipedia_conf = ( # -# 長い検索語の登録に対応したFreePWINGを使用する場合は 0 にします。 -# http://green.ribbon.to/~ikazuhiro/dic/dic.html#FREEPWING -# を参照して下さい。 +# 長い検索語の登録に対応した FreePWING 1.5 以降を使用する場合は 0 にします。 # 'trim_long_index' => 0, @@ -64,8 +62,8 @@ # 指定しない場合は全てマッチしなかったとみなします。 # my @skip_headings = ( - '^(Wikipedia|MediaWiki|Template|WP|Portal|Category|画像|Help):', - '^( | |\xEF\xBC\x8D|\xE2\x88\x92|‐|-|\'|’|・)+$', + '^(Wikipedia|MediaWiki|Template|WP|Portal|Category|画像|Help|Image):', + # '^( | |\xEF\xBC\x8D|\xE2\x88\x92|‐|-|\'|’|・)+$', '/履歴$', # '^Category:.*操り人形だと疑われるユーザー', );