# Custom sorting of a list of Khmer strings (one per line) in Unicode # Written by Maurice Bauhahn (bauhahnm@clara.net). All rights reserved. 24 Jan 2004 $" = ''; # character inserted between parts when array or slice interpolated into a double-quoted string # import a tab separated list of characters primary priority, secondary priority, and teriary priority # Name: KhmerCollation.data # Syntax: Perl KhmerCollation.pl wordlist.txt sorted_wordlist.txt # Should preprocess longer text to detect CHOENG DA open (DATA, "<:utf8", "KhmerCollation4.data" ) or die "Couldn't open KhmerCollation4.data file"; $textfilename = shift @ARGV; $sortedoutfile = shift @ARGV; open (TEXTIN , "<:utf8", $textfilename) or die "Couldn't open input file: $!"; open (SORTOUT , ">:utf8", $sortedoutfile) or die "Couldn't open output file: $!"; $charPriority1 = {} ; $charPriority2 = {} ; $charPriority3 = {} ; while () { chomp($_); $CharSort=''; $Prim1=''; $Sec1=''; $Ter1=''; $Prim2=''; $Sec2=''; $Ter2=''; ($CharSort, $Prim1, $Sec1, $Ter1, $Prim2, $Sec2, $Ter2 ) = split (/\t/, $_) ; # print SORTOUT $CharSort . $Prim1 . $Sec1 . $Ter1 . $Prim2 . $Sec2 . $Ter2 . "\n"; This works # Convert from Unicode character to sorting numbers push @{$charPriority1{$CharSort} }, $Prim1, $Prim2 ; push @{$charPriority2{$CharSort} }, $Sec1, $Sec2 ; push @{$charPriority3{$CharSort}}, $Ter1, $Ter2 ; } close(DATA); sub prioritize1 { my $key = shift; my $x, $v, @t ; $x=''; @t = @{$charPriority1{$key}} ; for (my $i=0; $i < ($#t + 1) ; $i++ ) { $x = $x . $charPriority1{$key}[$i]; # print SORTOUT $v . "\t" . $i . $charPriority1{$key}[$i] . "\tkey: " . $key . "\n" ; } return $x ; } sub prioritize2 { my $key = shift; my $x, $v, @t ; $x=''; @t = @{$charPriority2{$key}} ; $v = @t ; for (my $i=0; $i < $v ; $i++ ) { $x = $x . $charPriority2{$key}[$i]; } return $x ; } sub prioritize3 { my $key = shift; my $x, $v, @t ; $x=''; @t = @{$charPriority3{$key}} ; $v = @t ; for (my $i=0; $i < $v ; $i++ ) { $x .= $charPriority3{$key}[$i]; } return $x ; } sub compute { # index is looking for tab; is absolute maximum...but only portion before tab is actually used my $al = index($a, "\t"); my $bl = index($b, "\t"); # pick the shortest if ($al > $bl) { my $c = $bl } else {$c = $al } # working on assumption that 0 means $a comes before $b or is equal; 1 means $a comes after $b for (my $i ; $i < $c; $i++) { if ((substr($a,$i,1) == "|") && (substr($b,$i,1) != "|" )) { return 0; } if ((substr($a,$i,1) != "|") && (substr($b,$i,1) == "|" )) { return 1; } if (substr($a,$i,1) < substr($b,$i,1)) { return 0 } ; if (substr($a,$i,1) > substr($b,$i,1)) { return 1 } ; } return 0; } $/= chr(0x0a); # LF; chr(8229) = Unicode LINE SEPARATOR for <> processing not permitted while (defined($_ = )) { $persistent = $_ ; # print SORTOUT $persistent . "\n"; This is properly taking one line at a time. my $singlechar, @linein, $transfer ; my @arrayout1, @arrayout2, @arrayout3 ; $prime = ''; $second = ''; $third = ''; $singlechar=''; $transfer=''; $#arrayout1=0; $#arrayout2=0; $#arrayout3=0; $#linein=0; chomp($persistent) ; # Dealing with vowel-NIKAHIT vowel-REAHMUK combinations at text input level if ((index($persistent, "\x{17c6}") ==-1) && (index($persistent, "\x{17c7}") ==-1)) { @linein = split('', $persistent) ; } else { my $outindex = 0; for (my $inindex=0; $inindex < length($persistent);$inindex++) { $mypair = substr($persistent,$inindex,2); if (($mypair =~ /([\x{17b6}..\x{17c5}][\x{17c7}])/) || ($mypair =~ /([\x{17bb}\x{17b6}][\x{17c6}])/)) { $linein[$outindex] = $mypair; $inindex++; } else { $linein[$outindex] = substr($persistent,$inindex,1); } $outindex++; } } # Handle Robat; should only have to do this if it is known Robat is in $persistent if (index($persistent, "\x{17cc}") !=-1) { for (my $i = 0; $i < ($#linein + 1); $i++ ) { $singlechar = $linein[$i]; if ($singlechar =~ /\x{17cc}/g ) { $transfer = $linein[$i-1]; $linein[$i] = $transfer; $linein[$i-1] = $singlechar; } else { $linein[$i] = $singlechar; } # print SORTOUT $linein[$i] . "\n"; } } # Map which handles one character at a time returning primary, secondary, and primary equivalents # In the case of some vowels combined with NIKAHIT or REAHMUK, however, there may be two @arrayout1 = map { &prioritize1($_) } @linein; @arrayout2 = map { &prioritize2($_) } @linein; @arrayout3 = map { &prioritize3($_) } @linein; # This mapping is working well according to # for (my $i=0; $i < ($#arrayout1 + 1); $i++) { # print SORTOUT $arrayout1[$i] . "\t" . $arrayout2[$i] . "\t" . $arrayout3[$i] . "\n" ; # } # Rearrange array to string: primaries first, secondaries next and teriaries last # no space between areas $" = ''; # local $_ ; for (my $j = 0; $j < ($#arrayout1 + 1); $j++ ) { $prime = $prime . @arrayout1[$j] ; $second = $second . @arrayout2[$j] ; # print SORTOUT $second . "\n" ; # This is adding properly $third = $third . @arrayout3[$j] ; } # print SORTOUT "What is going wrong here? $prime \n"; $line = $prime . $second . $third . "|" . "$persistent" ; push @unsorted , $line ; # Reunite with $persistent and export } @sorted = sort @unsorted; for ( my $j = 0; $j < ($#sorted + 1); $j++ ) { my $point = index($sorted[$j], "|") +1; print SORTOUT substr($sorted[$j],$point,) . "\n"; }