#!/usr/pkg/bin/perl -w # 2003.11.09 msittig@freeshell.org # 2003.11.12 msittig@freeshell.org (tuning for speed) # 2003.11.13 msittig@freeshell.org (documentation) # 2003.11.14 msittig@freeshell.org (wubi tweaks) # 2003.11.16 msittig@freeshell.org (cosmetic tweaks) # 2003.11.19 msittig@freeshell.org ('markup' bug w/ Lingua module) # 2003.11.20 msittig@freeshell.org (commenting & clarifying) use strict; use utf8; use lib '/arpa/hm/m/msittig/lib/site_perl/5.6.1'; use Lingua::ZH::CEDICT; use Benchmark; use CGI::Pretty qw(:standard :html3); my $MAX_WORD_SIZE = param('max_word_size'); $MAX_WORD_SIZE ||= 4; my $DEBUG = param('debug'); $DEBUG ||= 0; my $ROMAN_CHARACTERS = '0-9a-zA-Z\'"!?~\n\r\t()., /\-=_;{}+*&\[\]·%'; my $SAFARI = param('safari'); $SAFARI ||= 0; my $utf8 = q{ # UTF-8 encoding reg-exp, for parsing text [\x00-\x7F] | [\xC2-\xDF][\x80-\xBF] | \xE0[\xA0-\xBF][\x80-\xBF] | [\xE1-\xEF][\x80-\xBF][\x80-\xBF] | \xF0[\x90-\xBF][\x80-\xBF][\x80-\xBF] | [\xF1-\xF7][\x80-\xBF][\x80-\xBF][\x80-\xBF] | \xF8[\x88-\xBF][\x80-\xBF][\x80-\xBF][\x80-\xBF] | [\xF9-\xFB][\x80-\xBF][\x80-\xBF][\x80-\xBF][\x80-\xBF] | \xFC[\x84-\xBF][\x80-\xBF][\x80-\xBF][\x80-\xBF][\x80-\xBF] | \xFD[\x80-\xBF][\x80-\xBF][\x80-\xBF][\x80-\xBF][\x80-\xBF] }; my @TOOLTIP_SCRIPTS = ""; # Safari doesn't pop-up tooltips, so we include some Javascript if ($SAFARI) { my %script_layer = ( '-type' => 'text/javascript', '-src' => 'layer.js' ); my %script_mouse = ( -type=>'text/javascript', -src=>'mouse.js' ); my %script_tooltip = ( -type=>'text/javascript', -src=>'tooltip.js' ); @TOOLTIP_SCRIPTS = [\%script_layer, \%script_mouse, \%script_tooltip]; }; # Predeclare these -- they should be global my %dict; my %should_ignore; # This stuff is for the fancy pinyin accents. my $dict = Lingua::ZH::CEDICT->new(); $dict->init(); # Start printing html code print header(-charset=>'utf-8'); use bytes; # Work around harmless but annoying "wide character" error. print start_html(-title=>'Chinese Tool', -encoding=>'utf-8', -script=>@TOOLTIP_SCRIPTS, -head=>meta({-http_equiv=>'Content-Type', -content=>'text/html; charset=utf-8'}), -style=>{-src=>'zhtool.css', -media=>'all'}), h1('中文 Tool'), start_form("POST", "index.cgi", "utf-8"), ol({-class=>'note'}, li(a({-href=>'../src/zhtool/'}, 'Source code')), li(a({-href=>'http://www.mandarintools.com/cedict.html'}, 'CEDICT')), li(a({-href=>'http://www.perl.org'}, 'Perl')), li(a({-href=>'http://sdf.lonestar.org'}, 'SDF')), li(a({-href=>'http://www.cnblog.org/blog/'}, 'CNBlog')), li(a({-href=>'http://popjisyo.com/'}, 'PopJisyo')), li(a({-href=>'http://wubi.org/'}, 'Wubi.org')), ), p("Enter Chinese text:"), textarea(-name=>'zhtext', -rows=>'10', -columns=>'50'), "
", submit, checkbox_group(-name=>'debug', -values=>'Debug'), checkbox_group(-name=>'safari', -values=>'Safari'), radio_group(-name=>'max_word_size', -values=>['4', '5'], -default=>'4'), end_form; no bytes; # Main parsing routine, executed when there is input if (param('zhtext')) { # For benchmarking purposes my $t0 = new Benchmark; print hr; my $time_string = localtime; error("

$time_string

"); # Read the dictionary files into a hash load_dictionary("cedict.gb.utf8", \%dict); load_dictionary("msdict.utf8", \%dict); load_dictionary("override.utf8", \%dict); # Read the list of ignored characters (mostly punctuation) load_dictionary("ignore.utf8", \%should_ignore); # Clean up the input my $text_query = param('zhtext'); chomp($text_query); # Start recursive lookup of last ("next") word my @text_arrayified = split('', $text_query); my ($word, $text_left) = next_word($MAX_WORD_SIZE, \@text_arrayified); my $parsed_text = markup($word); while(scalar @$text_left) { error("Adding $word to parsed_text.

"); ($word, $text_left) = next_word($MAX_WORD_SIZE, $text_left); $parsed_text = markup($word).$parsed_text; } error("

"); # Print output $parsed_text =~ s{(\n|\r\n)}{
}g; print h3("Result:"), p("$parsed_text"); # Benchmark output. my $t1 = new Benchmark; my $td = timediff($t1, $t0); print p({-class=>'benchmark'},"This took ",timestr($td),"."); } else { use bytes; print p("Try this: 毛主席万岁 and hover the mouse over the output."); no bytes; } print end_html; exit 1; ## Subroutines # load_dictionary: loads a file into memory for global use # TAKES: string of dictionary file name, # reference(hash) in which to store contents of dic # RETURNS: sub load_dictionary { my $dictionary_name = shift; my $hash_ref = shift; open(DIC, "$dictionary_name") or print p("Couldn't open $dictionary_name"); binmode(DIC); my $first_line = ; error("Opened dictionary $dictionary_name.
"); while() { my $line = $_; $line = m/([^ ]+) *(.*)/i; my ($word, $info) = ($1, $2); chomp($word, $info); $info = 1 if $dictionary_name =~ /ignore/; $$hash_ref{$word} = $info; } close DIC; } # its_a_word: checks if the chunk is a word in the dictionary # TAKES: string with the word to check # RETURNS: string, 1 or 0 sub its_a_word { if ($dict{"$_[0]"}) { return 1; } else { return 0; } } # next_word: # TAKES: scalar number of characters to try as next word # reference to hash of the remaining text # RETURNS: string with the next word # reference to hash of the remaining text, minus word # recursion in the hiz-ouse sub next_word { error("Passed arguments were: ".join(' and ', @_)."
"); my $max_chars = shift; my $text_toparse = shift; error("Text to parse this time: $text_toparse
"); my $is_this_a_word = ""; error("Building word"); foreach my $char (1 .. $max_chars) { if (! $$text_toparse[length(@$text_toparse) - 1]) { $max_chars -= 1; my $this_many = $max_chars - 1; error("... max_chars $this_many next time."); last; } $is_this_a_word = pop(@$text_toparse)."$is_this_a_word"; error("... $is_this_a_word"); } error("
"); error("max_chars = $max_chars, is_this_a_word = $is_this_a_word
"); error("Checking to see if \"$is_this_a_word\" is a word..."); if(its_a_word($is_this_a_word) || $max_chars == 1) { error("yes! (or just a single character)

"); return $is_this_a_word, $text_toparse; } else { error("nope.

"); push @$text_toparse, split('', $is_this_a_word); return next_word($max_chars - 1, $text_toparse); } } # word_so_far sub word_so_far { if ($_[0] ne "") { return $_[0]; } else { return ""; } } # split_utf8: splits a utf-8 string and returns array # TAKES: string to split, in mixed ascii/utf-8 # RETURNS: array of single characters # perl 5.8's split function doesn't seem to handle utf-8? sub split_utf8 { my $make_linux_happy = shift; error("Ready to split this: $_[0]
"); my @split = $_[0] =~ /$utf8/gox; return @split; } # markup: adds HTML markup to words with pinyin/definition # TAKES: string containing a single word # RETURNS: string containing the word, marked up in html sub markup { my $new_word = shift; my $tag_prepend = qq{$new_word}; if ($SAFARI) { $tag_prepend = qq{$new_word}; } if ($should_ignore{$new_word}) { #in dict of ignored punctuation error("Didn't markup because I'm supposed to ignore: $new_word. "); return $new_word; } elsif ($new_word =~ /[\x00-\x7F]/) { #single-bit ascii (western) error("Didn't markup because it's not multi-byte: $new_word. "); return $new_word; } else { my $tooltip_message = $dict->utf8Pinyin($dict{$new_word}) if $dict{$new_word}; $tooltip_message ||= "Not found."; my $marked_up = $tag_prepend.$tooltip_message.$tag_append; return $marked_up; } } # sub error prints error messages if DEBUG flag is not zero sub error { print "$_[0]" if $DEBUG; }