#!/usr/pkg/bin/perl -w
# 2003.11.09 msittig@freeshell.org
# 2003.11.12 msittig@freeshell.org (tuning for speed)
# 2003.11.13 msittig@freeshell.org (documentation)
# 2003.11.14 msittig@freeshell.org (wubi tweaks)
# 2003.11.16 msittig@freeshell.org (cosmetic tweaks)
# 2003.11.19 msittig@freeshell.org ('markup' bug w/ Lingua module)
# 2003.11.20 msittig@freeshell.org (commenting & clarifying)

use strict;
use utf8;
use lib '/arpa/hm/m/msittig/lib/site_perl/5.6.1';
use Lingua::ZH::CEDICT;
use Benchmark;
use CGI::Pretty qw(:standard :html3);

my $MAX_WORD_SIZE = param('max_word_size'); $MAX_WORD_SIZE ||= 4;
my $DEBUG = param('debug'); $DEBUG ||= 0;
my $ROMAN_CHARACTERS = '0-9a-zA-Z\'"!?~\n\r\t()., /\-=_;{}+*&\[\]·%';
my $SAFARI = param('safari'); $SAFARI ||= 0;

my $utf8 = q{ # UTF-8 encoding reg-exp, for parsing text
[\x00-\x7F]
| [\xC2-\xDF][\x80-\xBF]
| \xE0[\xA0-\xBF][\x80-\xBF]
| [\xE1-\xEF][\x80-\xBF][\x80-\xBF]
| \xF0[\x90-\xBF][\x80-\xBF][\x80-\xBF]
| [\xF1-\xF7][\x80-\xBF][\x80-\xBF][\x80-\xBF]
| \xF8[\x88-\xBF][\x80-\xBF][\x80-\xBF][\x80-\xBF]
| [\xF9-\xFB][\x80-\xBF][\x80-\xBF][\x80-\xBF][\x80-\xBF]
| \xFC[\x84-\xBF][\x80-\xBF][\x80-\xBF][\x80-\xBF][\x80-\xBF]
| \xFD[\x80-\xBF][\x80-\xBF][\x80-\xBF][\x80-\xBF][\x80-\xBF]
};

my @TOOLTIP_SCRIPTS = "";
# Safari doesn't pop-up tooltips, so we include some Javascript
if ($SAFARI) {
    my %script_layer = ( 
	'-type' => 'text/javascript',
	'-src'  => 'layer.js'
	);
    my %script_mouse = ( 
	-type=>'text/javascript',
	-src=>'mouse.js'
	);
    my %script_tooltip = ( 
	-type=>'text/javascript',
	-src=>'tooltip.js'
	);
    @TOOLTIP_SCRIPTS = [\%script_layer, \%script_mouse, \%script_tooltip];
};

# Predeclare these -- they should be global
my %dict;
my %should_ignore;

# This stuff is for the fancy pinyin accents.
my $dict = Lingua::ZH::CEDICT->new();
$dict->init();

# Start printing html code
print header(-charset=>'utf-8');
use bytes; # Work around harmless but annoying "wide character" error.
print start_html(-title=>'Chinese Tool',
		 -encoding=>'utf-8',
		 -script=>@TOOLTIP_SCRIPTS,
		 -head=>meta({-http_equiv=>'Content-Type',
			      -content=>'text/html; charset=utf-8'}),
		 -style=>{-src=>'zhtool.css',
			-media=>'all'}),

    h1('<strong lang="zh">中文</strong> Tool'),

    start_form("POST", "index.cgi", "utf-8"),
    ol({-class=>'note'},
    	li(a({-href=>'../src/zhtool/'}, 'Source code')),
    	li(a({-href=>'http://www.mandarintools.com/cedict.html'}, 'CEDICT')),
    	li(a({-href=>'http://www.perl.org'}, 'Perl')),
    	li(a({-href=>'http://sdf.lonestar.org'}, 'SDF')),
    	li(a({-href=>'http://www.cnblog.org/blog/'}, 'CNBlog')),
    	li(a({-href=>'http://popjisyo.com/'}, 'PopJisyo')),
    	li(a({-href=>'http://wubi.org/'}, 'Wubi.org')),
    ),
    p("Enter Chinese text:"),
    textarea(-name=>'zhtext',
	     -rows=>'10',
	     -columns=>'50'),
    "<br />",
    submit,
    checkbox_group(-name=>'debug',
		   -values=>'Debug'),
    checkbox_group(-name=>'safari',
		   -values=>'Safari'),
    radio_group(-name=>'max_word_size',
		-values=>['4', '5'],
		-default=>'4'),
    end_form;
no bytes;


# Main parsing routine, executed when there is input
if (param('zhtext')) {

    # For benchmarking purposes
    my $t0 = new Benchmark;

    print hr;
    
    my $time_string = localtime;
    error("<p>$time_string<br /><br />");

    # Read the dictionary files into a hash
    load_dictionary("cedict.gb.utf8", \%dict);
    load_dictionary("msdict.utf8", \%dict);
    load_dictionary("override.utf8", \%dict);

    # Read the list of ignored characters (mostly punctuation)
    load_dictionary("ignore.utf8", \%should_ignore);

    # Clean up the input
    my $text_query = param('zhtext');
    chomp($text_query);

    # Start recursive lookup of last ("next") word
    my @text_arrayified = split('', $text_query);
    my ($word, $text_left) = next_word($MAX_WORD_SIZE, \@text_arrayified);
    my $parsed_text = markup($word);
    while(scalar @$text_left) {
	error("Adding $word to parsed_text.<br /><br />");
	($word, $text_left) = next_word($MAX_WORD_SIZE, $text_left);
	$parsed_text = markup($word).$parsed_text;
    }

    error("</p>");

    # Print output
    $parsed_text =~ s{(\n|\r\n)}{<br />}g;
    print h3("Result:"),
    p("$parsed_text");

    # Benchmark output.
    my $t1 = new Benchmark;
    my $td = timediff($t1, $t0);
    print p({-class=>'benchmark'},"This took ",timestr($td),".");

} else {
    use bytes;
    print p("Try this: 毛主席万岁 and hover the mouse over the output.");
    no bytes;
}

print end_html;

exit 1;


## Subroutines

# load_dictionary: loads a file into memory for global use
#    TAKES:   string of dictionary file name, 
#             reference(hash) in which to store contents of dic
#    RETURNS: 

sub load_dictionary {
    my $dictionary_name = shift;
    my $hash_ref = shift;
    open(DIC, "$dictionary_name") or print p("Couldn't open $dictionary_name");
    binmode(DIC);
    my $first_line = <DIC>;
    error("Opened dictionary $dictionary_name.<br />");
    while(<DIC>) {
	my $line = $_;
	$line = m/([^ ]+) *(.*)/i;
	my ($word, $info) = ($1, $2);
	chomp($word, $info);
	$info = 1 if $dictionary_name =~ /ignore/;
	$$hash_ref{$word} = $info;
    }
    close DIC;
}

# its_a_word: checks if the chunk is a word in the dictionary
#    TAKES:   string with the word to check
#    RETURNS: string, 1 or 0

sub its_a_word {
    if ($dict{"$_[0]"}) {
	return 1;
    } else {
	return 0;
    }
}


# next_word:
#    TAKES:   scalar number of characters to try as next word
#             reference to hash of the remaining text
#    RETURNS: string with the next word
#             reference to hash of the remaining text, minus word
# recursion in the hiz-ouse

sub next_word {
    error("Passed arguments were: ".join(' and ', @_)."<br />");
    my $max_chars = shift;
    my $text_toparse = shift;
    error("Text to parse this time: $text_toparse<br />");
    my $is_this_a_word = "";
    error("Building word");
    foreach my $char (1 .. $max_chars) {
        if (! $$text_toparse[length(@$text_toparse) - 1]) {
	    $max_chars -= 1;
	    my $this_many = $max_chars - 1;
	    error("... max_chars $this_many next time.");
	    last;
	}
	$is_this_a_word = pop(@$text_toparse)."$is_this_a_word";
	error("... $is_this_a_word");
    }
    error("<br />");
    error("max_chars = $max_chars, is_this_a_word = $is_this_a_word<br />");
    error("Checking to see if \"$is_this_a_word\" is a word...");

    if(its_a_word($is_this_a_word) || $max_chars == 1) {
	error("yes! (or just a single character)<br /><br />");
	return $is_this_a_word, $text_toparse;
    } else {
	error("nope.<br /><br />");
	push @$text_toparse, split('', $is_this_a_word);
	return next_word($max_chars - 1, $text_toparse);
    }

}


# word_so_far
sub word_so_far {
    if ($_[0] ne "") {
	return $_[0];
    } else {
	return "";
    }
}

# split_utf8: splits a utf-8 string and returns array
#    TAKES:   string to split, in mixed ascii/utf-8
#    RETURNS: array of single characters
# perl 5.8's split function doesn't seem to handle utf-8?

sub split_utf8 {
    my $make_linux_happy = shift;
    error("Ready to split this: $_[0]<br />");
    my @split = $_[0] =~ /$utf8/gox;

    return @split;
}


# markup: adds HTML markup to words with pinyin/definition
#    TAKES:   string containing a single word
#    RETURNS: string containing the word, marked up in html

sub markup {
    my $new_word = shift;
    my $tag_prepend = qq{<span title=\"};
    my $tag_append = qq{\">$new_word</span>};
    if ($SAFARI) {
	$tag_prepend = qq{<span onmouseover="showTooltip(true, '};
    	$tag_append = qq{', this)" onmouseout="showTooltip(false)">$new_word</span>};
    }
    if ($should_ignore{$new_word}) { #in dict of ignored punctuation
	error("Didn't markup because I'm supposed to ignore: $new_word.  ");
	return $new_word;
    } elsif ($new_word =~ /[\x00-\x7F]/) { #single-bit ascii (western)
	error("Didn't markup because it's not multi-byte: $new_word.  ");
	return $new_word;
    } else {
	my $tooltip_message = $dict->utf8Pinyin($dict{$new_word}) if $dict{$new_word};
	$tooltip_message ||= "Not found.";
	my $marked_up = $tag_prepend.$tooltip_message.$tag_append;
	return $marked_up;
    }
}

# sub error prints error messages if DEBUG flag is not zero
sub error {
    print "$_[0]" if $DEBUG;
}