#!/usr/pkg/bin/perl -w
# 2003.11.09 msittig@freeshell.org
# 2003.11.12 msittig@freeshell.org (tuning for speed)
# 2003.11.13 msittig@freeshell.org (documentation)
# 2003.11.14 msittig@freeshell.org (wubi tweaks)
# 2003.11.16 msittig@freeshell.org (cosmetic tweaks)
# 2003.11.19 msittig@freeshell.org ('markup' bug w/ Lingua module)
# 2003.11.20 msittig@freeshell.org (commenting & clarifying)
use strict;
use utf8;
use lib '/arpa/hm/m/msittig/lib/site_perl/5.6.1';
use Lingua::ZH::CEDICT;
use Benchmark;
use CGI::Pretty qw(:standard :html3);
# --- Request parameters and tunables ---------------------------------------
# Maximum word size handed to next_word().  The value comes straight from the
# query string, so only a small positive integer (1-99) is accepted; anything
# else (missing, zero, negative, non-numeric, oversized) falls back to 4.
my $MAX_WORD_SIZE = param('max_word_size');
$MAX_WORD_SIZE = 4
    unless defined $MAX_WORD_SIZE && $MAX_WORD_SIZE =~ /\A[1-9][0-9]?\z/;
# Value of the "debug" form field when it was submitted, 0 otherwise.
my $DEBUG = param('debug') || 0;
# Non-Chinese characters.  NOTE(review): this looks like the body of a regex
# character class (ranges and escapes are kept literally by the single
# quotes) -- confirm at the point of use.
my $ROMAN_CHARACTERS = '0-9a-zA-Z\'"!?~\n\r\t()., /\-=_;{}+*&\[\]·%';
# Value of the "safari" form field when it was submitted, 0 otherwise.
my $SAFARI = param('safari') || 0;
# Source text of a regular expression that matches one UTF-8 encoded
# character, with one alternative per lead-byte range: single-byte ASCII,
# then the 2-, 3- and 4-byte forms, and finally the 5- and 6-byte forms
# (lead bytes F8-FD).
# NOTE(review): the text contains layout whitespace and a "#" comment, so it
# presumably has to be interpolated into a pattern compiled with /x --
# confirm at the point of use.
my $utf8 = q{ # UTF-8 encoding reg-exp, for parsing text
[\x00-\x7F]
| [\xC2-\xDF][\x80-\xBF]
| \xE0[\xA0-\xBF][\x80-\xBF]
| [\xE1-\xEF][\x80-\xBF][\x80-\xBF]
| \xF0[\x90-\xBF][\x80-\xBF][\x80-\xBF]
| [\xF1-\xF7][\x80-\xBF][\x80-\xBF][\x80-\xBF]
| \xF8[\x88-\xBF][\x80-\xBF][\x80-\xBF][\x80-\xBF]
| [\xF9-\xFB][\x80-\xBF][\x80-\xBF][\x80-\xBF][\x80-\xBF]
| \xFC[\x84-\xBF][\x80-\xBF][\x80-\xBF][\x80-\xBF][\x80-\xBF]
| \xFD[\x80-\xBF][\x80-\xBF][\x80-\xBF][\x80-\xBF][\x80-\xBF]
};
# Value handed to start_html's -script argument.  The array always holds
# exactly one element: an empty string by default, or a reference to the list
# of script descriptors when the Safari work-around is switched on.
my @TOOLTIP_SCRIPTS = ("");
# Safari doesn't pop up tooltips, so we include some Javascript.
if ($SAFARI) {
    # One { -type, -src } descriptor per helper script, in load order.
    my @descriptors = map { +{ -type => 'text/javascript', -src => $_ } }
                      qw(layer.js mouse.js tooltip.js);
    @TOOLTIP_SCRIPTS = (\@descriptors);
}
# File-scoped on purpose: these two hashes are meant to be global to the
# whole script.
my (%dict, %should_ignore);
# CEDICT object -- this stuff is for the fancy pinyin accents.
my $dict = Lingua::ZH::CEDICT->new;
$dict->init;
# Start printing html code: the HTTP header first, then the page head and the
# input form.
print header(-charset => 'utf-8');
{
    # Work around harmless but annoying "wide character" error.  The pragma
    # is confined to this bare block so it covers only the print below.
    use bytes;
    print start_html(
            -title    => 'Chinese Tool',
            -encoding => 'utf-8',
            # @TOOLTIP_SCRIPTS carries a single value (empty string or an
            # array ref of script descriptors).  Index it explicitly so the
            # key/value list cannot be knocked out of alignment if the array
            # ever holds zero or several elements.
            -script   => $TOOLTIP_SCRIPTS[0],
            -head     => meta({ -http_equiv => 'Content-Type',
                                -content    => 'text/html; charset=utf-8' }),
            -style    => { -src   => 'zhtool.css',
                           -media => 'all' },
        ),
        h1('中文 Tool'),
        # The third positional argument of start_form is the enctype, so the
        # former start_form("POST", "index.cgi", "utf-8") produced an invalid
        # enctype="utf-8".  Name the arguments instead: keep the URL-encoded
        # body and ask for UTF-8 through accept-charset.
        start_form(
            -method         => 'POST',
            -action         => 'index.cgi',
            -enctype        => 'application/x-www-form-urlencoded',
            -accept_charset => 'utf-8',
        ),
        ol({ -class => 'note' },
            li(a({ -href => '../src/zhtool/' }, 'Source code')),
            li(a({ -href => 'http://www.mandarintools.com/cedict.html' }, 'CEDICT')),
            li(a({ -href => 'http://www.perl.org' }, 'Perl')),
            li(a({ -href => 'http://sdf.lonestar.org' }, 'SDF')),
            li(a({ -href => 'http://www.cnblog.org/blog/' }, 'CNBlog')),
            li(a({ -href => 'http://popjisyo.com/' }, 'PopJisyo')),
            li(a({ -href => 'http://wubi.org/' }, 'Wubi.org')),
        ),
        p("Enter Chinese text:"),
        textarea(
            -name    => 'zhtext',
            -rows    => '10',
            -columns => '50',
        ),
        "\n",
        submit(),
        # These field names are read back through param() when the form is
        # submitted.
        checkbox_group(
            -name   => 'debug',
            -values => ['Debug'],
        ),
        checkbox_group(
            -name   => 'safari',
            -values => ['Safari'],
        ),
        radio_group(
            -name    => 'max_word_size',
            -values  => ['4', '5'],
            -default => '4',
        ),
        end_form();
}
# Main parsing routine, executed when there is input
if (param('zhtext')) {
# For benchmarking purposes
my $t0 = new Benchmark;
print hr;
my $time_string = localtime;
error("
$time_string
");
# Read the dictionary files into a hash
load_dictionary("cedict.gb.utf8", \%dict);
load_dictionary("msdict.utf8", \%dict);
load_dictionary("override.utf8", \%dict);
# Read the list of ignored characters (mostly punctuation)
load_dictionary("ignore.utf8", \%should_ignore);
# Clean up the input
my $text_query = param('zhtext');
chomp($text_query);
# Start recursive lookup of last ("next") word
my @text_arrayified = split('', $text_query);
my ($word, $text_left) = next_word($MAX_WORD_SIZE, \@text_arrayified);
my $parsed_text = markup($word);
while(scalar @$text_left) {
error("Adding $word to parsed_text.
");
($word, $text_left) = next_word($MAX_WORD_SIZE, $text_left);
$parsed_text = markup($word).$parsed_text;
}
error("