[ Index ] |
PHP Cross Reference of Unnamed Project |
[Summary view] [Print] [Text view]
1 package Unicode::Collate; 2 3 BEGIN { 4 unless ("A" eq pack('U', 0x41)) { 5 die "Unicode::Collate cannot stringify a Unicode code point\n"; 6 } 7 } 8 9 use 5.006; 10 use strict; 11 use warnings; 12 use Carp; 13 use File::Spec; 14 15 no warnings 'utf8'; 16 17 our $VERSION = '0.52'; 18 our $PACKAGE = __PACKAGE__; 19 20 my @Path = qw(Unicode Collate); 21 my $KeyFile = "allkeys.txt"; 22 23 # Perl's boolean 24 use constant TRUE => 1; 25 use constant FALSE => ""; 26 use constant NOMATCHPOS => -1; 27 28 # A coderef to get combining class imported from Unicode::Normalize 29 # (i.e. \&Unicode::Normalize::getCombinClass). 30 # This is also used as a HAS_UNICODE_NORMALIZE flag. 31 my $CVgetCombinClass; 32 33 # Supported Levels 34 use constant MinLevel => 1; 35 use constant MaxLevel => 4; 36 37 # Minimum weights at level 2 and 3, respectively 38 use constant Min2Wt => 0x20; 39 use constant Min3Wt => 0x02; 40 41 # Shifted weight at 4th level 42 use constant Shift4Wt => 0xFFFF; 43 44 # A boolean for Variable and 16-bit weights at 4 levels of Collation Element 45 # PROBLEM: The Default Unicode Collation Element Table 46 # has weights over 0xFFFF at the 4th level. 47 # The tie-breaking in the variable weights 48 # other than "shift" (as well as "shift-trimmed") is unreliable. 49 use constant VCE_TEMPLATE => 'Cn4'; 50 51 # A sort key: 16-bit weights 52 # See also the PROBLEM on VCE_TEMPLATE above. 53 use constant KEY_TEMPLATE => 'n*'; 54 55 # Level separator in a sort key: 56 # i.e. pack(KEY_TEMPLATE, 0) 57 use constant LEVEL_SEP => "\0\0"; 58 59 # As Unicode code point separator for hash keys. 60 # A joined code point string (denoted by JCPS below) 61 # like "65;768" is used for internal processing 62 # instead of Perl's Unicode string like "\x41\x{300}", 63 # as the native code point is different from the Unicode code point 64 # on EBCDIC platform. 65 # This character must not be included in any stringified 66 # representation of an integer. 67 use constant CODE_SEP => ';'; 68 69 # boolean values of variable weights 70 use constant NON_VAR => 0; # Non-Variable character 71 use constant VAR => 1; # Variable character 72 73 # specific code points 74 use constant Hangul_LBase => 0x1100; 75 use constant Hangul_LIni => 0x1100; 76 use constant Hangul_LFin => 0x1159; 77 use constant Hangul_LFill => 0x115F; 78 use constant Hangul_VBase => 0x1161; 79 use constant Hangul_VIni => 0x1160; # from Vowel Filler 80 use constant Hangul_VFin => 0x11A2; 81 use constant Hangul_TBase => 0x11A7; # from "no-final" codepoint 82 use constant Hangul_TIni => 0x11A8; 83 use constant Hangul_TFin => 0x11F9; 84 use constant Hangul_TCount => 28; 85 use constant Hangul_NCount => 588; 86 use constant Hangul_SBase => 0xAC00; 87 use constant Hangul_SIni => 0xAC00; 88 use constant Hangul_SFin => 0xD7A3; 89 use constant CJK_UidIni => 0x4E00; 90 use constant CJK_UidFin => 0x9FA5; 91 use constant CJK_UidF41 => 0x9FBB; 92 use constant CJK_ExtAIni => 0x3400; 93 use constant CJK_ExtAFin => 0x4DB5; 94 use constant CJK_ExtBIni => 0x20000; 95 use constant CJK_ExtBFin => 0x2A6D6; 96 use constant BMP_Max => 0xFFFF; 97 98 # Logical_Order_Exception in PropList.txt 99 my $DefaultRearrange = [ 0x0E40..0x0E44, 0x0EC0..0x0EC4 ]; 100 101 sub UCA_Version { "14" } 102 103 sub Base_Unicode_Version { "4.1.0" } 104 105 ###### 106 107 sub pack_U { 108 return pack('U*', @_); 109 } 110 111 sub unpack_U { 112 return unpack('U*', shift(@_).pack('U*')); 113 } 114 115 ###### 116 117 my (%VariableOK); 118 @VariableOK{ qw/ 119 blanked non-ignorable shifted shift-trimmed 120 / } = (); # keys lowercased 121 122 our @ChangeOK = qw/ 123 alternate backwards level normalization rearrange 124 katakana_before_hiragana upper_before_lower 125 overrideHangul overrideCJK preprocess UCA_Version 126 hangul_terminator variable 127 /; 128 129 our @ChangeNG = qw/ 130 entry mapping table maxlength 131 ignoreChar ignoreName undefChar undefName variableTable 132 versionTable alternateTable backwardsTable forwardsTable rearrangeTable 133 derivCode normCode rearrangeHash 134 backwardsFlag 135 /; 136 # The hash key 'ignored' is deleted at v 0.21. 137 # The hash key 'isShift' is deleted at v 0.23. 138 # The hash key 'combining' is deleted at v 0.24. 139 # The hash key 'entries' is deleted at v 0.30. 140 # The hash key 'L3_ignorable' is deleted at v 0.40. 141 142 sub version { 143 my $self = shift; 144 return $self->{versionTable} || 'unknown'; 145 } 146 147 my (%ChangeOK, %ChangeNG); 148 @ChangeOK{ @ChangeOK } = (); 149 @ChangeNG{ @ChangeNG } = (); 150 151 sub change { 152 my $self = shift; 153 my %hash = @_; 154 my %old; 155 if (exists $hash{variable} && exists $hash{alternate}) { 156 delete $hash{alternate}; 157 } 158 elsif (!exists $hash{variable} && exists $hash{alternate}) { 159 $hash{variable} = $hash{alternate}; 160 } 161 foreach my $k (keys %hash) { 162 if (exists $ChangeOK{$k}) { 163 $old{$k} = $self->{$k}; 164 $self->{$k} = $hash{$k}; 165 } 166 elsif (exists $ChangeNG{$k}) { 167 croak "change of $k via change() is not allowed!"; 168 } 169 # else => ignored 170 } 171 $self->checkCollator(); 172 return wantarray ? %old : $self; 173 } 174 175 sub _checkLevel { 176 my $level = shift; 177 my $key = shift; # 'level' or 'backwards' 178 MinLevel <= $level or croak sprintf 179 "Illegal level %d (in value for key '%s') lower than %d.", 180 $level, $key, MinLevel; 181 $level <= MaxLevel or croak sprintf 182 "Unsupported level %d (in value for key '%s') higher than %d.", 183 $level, $key, MaxLevel; 184 } 185 186 my %DerivCode = ( 187 8 => \&_derivCE_8, 188 9 => \&_derivCE_9, 189 11 => \&_derivCE_9, # 11 == 9 190 14 => \&_derivCE_14, 191 ); 192 193 sub checkCollator { 194 my $self = shift; 195 _checkLevel($self->{level}, "level"); 196 197 $self->{derivCode} = $DerivCode{ $self->{UCA_Version} } 198 or croak "Illegal UCA version (passed $self->{UCA_Version})."; 199 200 $self->{variable} ||= $self->{alternate} || $self->{variableTable} || 201 $self->{alternateTable} || 'shifted'; 202 $self->{variable} = $self->{alternate} = lc($self->{variable}); 203 exists $VariableOK{ $self->{variable} } 204 or croak "$PACKAGE unknown variable parameter name: $self->{variable}"; 205 206 if (! defined $self->{backwards}) { 207 $self->{backwardsFlag} = 0; 208 } 209 elsif (! ref $self->{backwards}) { 210 _checkLevel($self->{backwards}, "backwards"); 211 $self->{backwardsFlag} = 1 << $self->{backwards}; 212 } 213 else { 214 my %level; 215 $self->{backwardsFlag} = 0; 216 for my $b (@{ $self->{backwards} }) { 217 _checkLevel($b, "backwards"); 218 $level{$b} = 1; 219 } 220 for my $v (sort keys %level) { 221 $self->{backwardsFlag} += 1 << $v; 222 } 223 } 224 225 defined $self->{rearrange} or $self->{rearrange} = []; 226 ref $self->{rearrange} 227 or croak "$PACKAGE: list for rearrangement must be store in ARRAYREF"; 228 229 # keys of $self->{rearrangeHash} are $self->{rearrange}. 230 $self->{rearrangeHash} = undef; 231 232 if (@{ $self->{rearrange} }) { 233 @{ $self->{rearrangeHash} }{ @{ $self->{rearrange} } } = (); 234 } 235 236 $self->{normCode} = undef; 237 238 if (defined $self->{normalization}) { 239 eval { require Unicode::Normalize }; 240 $@ and croak "Unicode::Normalize is required to normalize strings"; 241 242 $CVgetCombinClass ||= \&Unicode::Normalize::getCombinClass; 243 244 if ($self->{normalization} =~ /^(?:NF)D\z/) { # tweak for default 245 $self->{normCode} = \&Unicode::Normalize::NFD; 246 } 247 elsif ($self->{normalization} ne 'prenormalized') { 248 my $norm = $self->{normalization}; 249 $self->{normCode} = sub { 250 Unicode::Normalize::normalize($norm, shift); 251 }; 252 eval { $self->{normCode}->("") }; # try 253 $@ and croak "$PACKAGE unknown normalization form name: $norm"; 254 } 255 } 256 return; 257 } 258 259 sub new 260 { 261 my $class = shift; 262 my $self = bless { @_ }, $class; 263 264 # If undef is passed explicitly, no file is read. 265 $self->{table} = $KeyFile if ! exists $self->{table}; 266 $self->read_table() if defined $self->{table}; 267 268 if ($self->{entry}) { 269 while ($self->{entry} =~ /([^\n]+)/g) { 270 $self->parseEntry($1); 271 } 272 } 273 274 $self->{level} ||= MaxLevel; 275 $self->{UCA_Version} ||= UCA_Version(); 276 277 $self->{overrideHangul} = FALSE 278 if ! exists $self->{overrideHangul}; 279 $self->{overrideCJK} = FALSE 280 if ! exists $self->{overrideCJK}; 281 $self->{normalization} = 'NFD' 282 if ! exists $self->{normalization}; 283 $self->{rearrange} = $self->{rearrangeTable} || 284 ($self->{UCA_Version} <= 11 ? $DefaultRearrange : []) 285 if ! exists $self->{rearrange}; 286 $self->{backwards} = $self->{backwardsTable} 287 if ! exists $self->{backwards}; 288 289 $self->checkCollator(); 290 291 return $self; 292 } 293 294 sub read_table { 295 my $self = shift; 296 297 my($f, $fh); 298 foreach my $d (@INC) { 299 $f = File::Spec->catfile($d, @Path, $self->{table}); 300 last if open($fh, $f); 301 $f = undef; 302 } 303 if (!defined $f) { 304 $f = File::Spec->catfile(@Path, $self->{table}); 305 croak("$PACKAGE: Can't locate $f in \@INC (\@INC contains: @INC)"); 306 } 307 308 while (my $line = <$fh>) { 309 next if $line =~ /^\s*#/; 310 unless ($line =~ s/^\s*\@//) { 311 $self->parseEntry($line); 312 next; 313 } 314 315 # matched ^\s*\@ 316 if ($line =~ /^version\s*(\S*)/) { 317 $self->{versionTable} ||= $1; 318 } 319 elsif ($line =~ /^variable\s+(\S*)/) { # since UTS #10-9 320 $self->{variableTable} ||= $1; 321 } 322 elsif ($line =~ /^alternate\s+(\S*)/) { # till UTS #10-8 323 $self->{alternateTable} ||= $1; 324 } 325 elsif ($line =~ /^backwards\s+(\S*)/) { 326 push @{ $self->{backwardsTable} }, $1; 327 } 328 elsif ($line =~ /^forwards\s+(\S*)/) { # parhaps no use 329 push @{ $self->{forwardsTable} }, $1; 330 } 331 elsif ($line =~ /^rearrange\s+(.*)/) { # (\S*) is NG 332 push @{ $self->{rearrangeTable} }, _getHexArray($1); 333 } 334 } 335 close $fh; 336 } 337 338 339 ## 340 ## get $line, parse it, and write an entry in $self 341 ## 342 sub parseEntry 343 { 344 my $self = shift; 345 my $line = shift; 346 my($name, $entry, @uv, @key); 347 348 return if $line !~ /^\s*[0-9A-Fa-f]/; 349 350 # removes comment and gets name 351 $name = $1 352 if $line =~ s/[#%]\s*(.*)//; 353 return if defined $self->{undefName} && $name =~ /$self->{undefName}/; 354 355 # gets element 356 my($e, $k) = split /;/, $line; 357 croak "Wrong Entry: <charList> must be separated by ';' from <collElement>" 358 if ! $k; 359 360 @uv = _getHexArray($e); 361 return if !@uv; 362 363 $entry = join(CODE_SEP, @uv); # in JCPS 364 365 if (defined $self->{undefChar} || defined $self->{ignoreChar}) { 366 my $ele = pack_U(@uv); 367 368 # regarded as if it were not entried in the table 369 return 370 if defined $self->{undefChar} && $ele =~ /$self->{undefChar}/; 371 372 # replaced as completely ignorable 373 $k = '[.0000.0000.0000.0000]' 374 if defined $self->{ignoreChar} && $ele =~ /$self->{ignoreChar}/; 375 } 376 377 # replaced as completely ignorable 378 $k = '[.0000.0000.0000.0000]' 379 if defined $self->{ignoreName} && $name =~ /$self->{ignoreName}/; 380 381 my $is_L3_ignorable = TRUE; 382 383 foreach my $arr ($k =~ /\[([^\[\]]+)\]/g) { # SPACEs allowed 384 my $var = $arr =~ /\*/; # exactly /^\*/ but be lenient. 385 my @wt = _getHexArray($arr); 386 push @key, pack(VCE_TEMPLATE, $var, @wt); 387 $is_L3_ignorable = FALSE 388 if $wt[0] || $wt[1] || $wt[2]; 389 # Conformance Test for 3.1.1 and 4.0.0 shows Level 3 ignorable 390 # is completely ignorable. 391 # For expansion, an entry $is_L3_ignorable 392 # if and only if "all" CEs are [.0000.0000.0000]. 393 } 394 395 $self->{mapping}{$entry} = $is_L3_ignorable ? [] : \@key; 396 397 if (@uv > 1) { 398 (!$self->{maxlength}{$uv[0]} || $self->{maxlength}{$uv[0]} < @uv) 399 and $self->{maxlength}{$uv[0]} = @uv; 400 } 401 } 402 403 404 ## 405 ## VCE = _varCE(variable term, VCE) 406 ## 407 sub _varCE 408 { 409 my $vbl = shift; 410 my $vce = shift; 411 if ($vbl eq 'non-ignorable') { 412 return $vce; 413 } 414 my ($var, @wt) = unpack VCE_TEMPLATE, $vce; 415 416 if ($var) { 417 return pack(VCE_TEMPLATE, $var, 0, 0, 0, 418 $vbl eq 'blanked' ? $wt[3] : $wt[0]); 419 } 420 elsif ($vbl eq 'blanked') { 421 return $vce; 422 } 423 else { 424 return pack(VCE_TEMPLATE, $var, @wt[0..2], 425 $vbl eq 'shifted' && $wt[0]+$wt[1]+$wt[2] ? Shift4Wt : 0); 426 } 427 } 428 429 sub viewSortKey 430 { 431 my $self = shift; 432 $self->visualizeSortKey($self->getSortKey(@_)); 433 } 434 435 sub visualizeSortKey 436 { 437 my $self = shift; 438 my $view = join " ", map sprintf("%04X", $_), unpack(KEY_TEMPLATE, shift); 439 440 if ($self->{UCA_Version} <= 8) { 441 $view =~ s/ ?0000 ?/|/g; 442 } else { 443 $view =~ s/\b0000\b/|/g; 444 } 445 return "[$view]"; 446 } 447 448 449 ## 450 ## arrayref of JCPS = splitEnt(string to be collated) 451 ## arrayref of arrayref[JCPS, ini_pos, fin_pos] = splitEnt(string, true) 452 ## 453 sub splitEnt 454 { 455 my $self = shift; 456 my $wLen = $_[1]; 457 458 my $code = $self->{preprocess}; 459 my $norm = $self->{normCode}; 460 my $map = $self->{mapping}; 461 my $max = $self->{maxlength}; 462 my $reH = $self->{rearrangeHash}; 463 my $ver9 = $self->{UCA_Version} >= 9 && $self->{UCA_Version} <= 11; 464 465 my ($str, @buf); 466 467 if ($wLen) { 468 $code and croak "Preprocess breaks character positions. " 469 . "Don't use with index(), match(), etc."; 470 $norm and croak "Normalization breaks character positions. " 471 . "Don't use with index(), match(), etc."; 472 $str = $_[0]; 473 } 474 else { 475 $str = $_[0]; 476 $str = &$code($str) if ref $code; 477 $str = &$norm($str) if ref $norm; 478 } 479 480 # get array of Unicode code point of string. 481 my @src = unpack_U($str); 482 483 # rearrangement: 484 # Character positions are not kept if rearranged, 485 # then neglected if $wLen is true. 486 if ($reH && ! $wLen) { 487 for (my $i = 0; $i < @src; $i++) { 488 if (exists $reH->{ $src[$i] } && $i + 1 < @src) { 489 ($src[$i], $src[$i+1]) = ($src[$i+1], $src[$i]); 490 $i++; 491 } 492 } 493 } 494 495 # remove a code point marked as a completely ignorable. 496 for (my $i = 0; $i < @src; $i++) { 497 $src[$i] = undef 498 if _isIllegal($src[$i]) || ($ver9 && 499 $map->{ $src[$i] } && @{ $map->{ $src[$i] } } == 0); 500 } 501 502 for (my $i = 0; $i < @src; $i++) { 503 my $jcps = $src[$i]; 504 505 # skip removed code point 506 if (! defined $jcps) { 507 if ($wLen && @buf) { 508 $buf[-1][2] = $i + 1; 509 } 510 next; 511 } 512 513 my $i_orig = $i; 514 515 # find contraction 516 if ($max->{$jcps}) { 517 my $temp_jcps = $jcps; 518 my $jcpsLen = 1; 519 my $maxLen = $max->{$jcps}; 520 521 for (my $p = $i + 1; $jcpsLen < $maxLen && $p < @src; $p++) { 522 next if ! defined $src[$p]; 523 $temp_jcps .= CODE_SEP . $src[$p]; 524 $jcpsLen++; 525 if ($map->{$temp_jcps}) { 526 $jcps = $temp_jcps; 527 $i = $p; 528 } 529 } 530 531 # not-contiguous contraction with Combining Char (cf. UTS#10, S2.1). 532 # This process requires Unicode::Normalize. 533 # If "normalization" is undef, here should be skipped *always* 534 # (in spite of bool value of $CVgetCombinClass), 535 # since canonical ordering cannot be expected. 536 # Blocked combining character should not be contracted. 537 538 if ($self->{normalization}) 539 # $self->{normCode} is false in the case of "prenormalized". 540 { 541 my $preCC = 0; 542 my $curCC = 0; 543 544 for (my $p = $i + 1; $p < @src; $p++) { 545 next if ! defined $src[$p]; 546 $curCC = $CVgetCombinClass->($src[$p]); 547 last unless $curCC; 548 my $tail = CODE_SEP . $src[$p]; 549 if ($preCC != $curCC && $map->{$jcps.$tail}) { 550 $jcps .= $tail; 551 $src[$p] = undef; 552 } else { 553 $preCC = $curCC; 554 } 555 } 556 } 557 } 558 559 # skip completely ignorable 560 if ($map->{$jcps} && @{ $map->{$jcps} } == 0) { 561 if ($wLen && @buf) { 562 $buf[-1][2] = $i + 1; 563 } 564 next; 565 } 566 567 push @buf, $wLen ? [$jcps, $i_orig, $i + 1] : $jcps; 568 } 569 return \@buf; 570 } 571 572 573 ## 574 ## list of VCE = getWt(JCPS) 575 ## 576 sub getWt 577 { 578 my $self = shift; 579 my $u = shift; 580 my $vbl = $self->{variable}; 581 my $map = $self->{mapping}; 582 my $der = $self->{derivCode}; 583 584 return if !defined $u; 585 return map(_varCE($vbl, $_), @{ $map->{$u} }) 586 if $map->{$u}; 587 588 # JCPS must not be a contraction, then it's a code point. 589 if (Hangul_SIni <= $u && $u <= Hangul_SFin) { 590 my $hang = $self->{overrideHangul}; 591 my @hangulCE; 592 if ($hang) { 593 @hangulCE = map(pack(VCE_TEMPLATE, NON_VAR, @$_), &$hang($u)); 594 } 595 elsif (!defined $hang) { 596 @hangulCE = $der->($u); 597 } 598 else { 599 my $max = $self->{maxlength}; 600 my @decH = _decompHangul($u); 601 602 if (@decH == 2) { 603 my $contract = join(CODE_SEP, @decH); 604 @decH = ($contract) if $map->{$contract}; 605 } else { # must be <@decH == 3> 606 if ($max->{$decH[0]}) { 607 my $contract = join(CODE_SEP, @decH); 608 if ($map->{$contract}) { 609 @decH = ($contract); 610 } else { 611 $contract = join(CODE_SEP, @decH[0,1]); 612 $map->{$contract} and @decH = ($contract, $decH[2]); 613 } 614 # even if V's ignorable, LT contraction is not supported. 615 # If such a situatution were required, NFD should be used. 616 } 617 if (@decH == 3 && $max->{$decH[1]}) { 618 my $contract = join(CODE_SEP, @decH[1,2]); 619 $map->{$contract} and @decH = ($decH[0], $contract); 620 } 621 } 622 623 @hangulCE = map({ 624 $map->{$_} ? @{ $map->{$_} } : $der->($_); 625 } @decH); 626 } 627 return map _varCE($vbl, $_), @hangulCE; 628 } 629 elsif (_isUIdeo($u, $self->{UCA_Version})) { 630 my $cjk = $self->{overrideCJK}; 631 return map _varCE($vbl, $_), 632 $cjk 633 ? map(pack(VCE_TEMPLATE, NON_VAR, @$_), &$cjk($u)) 634 : defined $cjk && $self->{UCA_Version} <= 8 && $u < 0x10000 635 ? _uideoCE_8($u) 636 : $der->($u); 637 } 638 else { 639 return map _varCE($vbl, $_), $der->($u); 640 } 641 } 642 643 644 ## 645 ## string sortkey = getSortKey(string arg) 646 ## 647 sub getSortKey 648 { 649 my $self = shift; 650 my $lev = $self->{level}; 651 my $rEnt = $self->splitEnt(shift); # get an arrayref of JCPS 652 my $v2i = $self->{UCA_Version} >= 9 && 653 $self->{variable} ne 'non-ignorable'; 654 655 my @buf; # weight arrays 656 if ($self->{hangul_terminator}) { 657 my $preHST = ''; 658 foreach my $jcps (@$rEnt) { 659 # weird things like VL, TL-contraction are not considered! 660 my $curHST = ''; 661 foreach my $u (split /;/, $jcps) { 662 $curHST .= getHST($u); 663 } 664 if ($preHST && !$curHST || # hangul before non-hangul 665 $preHST =~ /L\z/ && $curHST =~ /^T/ || 666 $preHST =~ /V\z/ && $curHST =~ /^L/ || 667 $preHST =~ /T\z/ && $curHST =~ /^[LV]/) { 668 669 push @buf, $self->getWtHangulTerm(); 670 } 671 $preHST = $curHST; 672 673 push @buf, $self->getWt($jcps); 674 } 675 $preHST # end at hangul 676 and push @buf, $self->getWtHangulTerm(); 677 } 678 else { 679 foreach my $jcps (@$rEnt) { 680 push @buf, $self->getWt($jcps); 681 } 682 } 683 684 # make sort key 685 my @ret = ([],[],[],[]); 686 my $last_is_variable; 687 688 foreach my $vwt (@buf) { 689 my($var, @wt) = unpack(VCE_TEMPLATE, $vwt); 690 691 # "Ignorable (L1, L2) after Variable" since track. v. 9 692 if ($v2i) { 693 if ($var) { 694 $last_is_variable = TRUE; 695 } 696 elsif (!$wt[0]) { # ignorable 697 next if $last_is_variable; 698 } 699 else { 700 $last_is_variable = FALSE; 701 } 702 } 703 foreach my $v (0..$lev-1) { 704 0 < $wt[$v] and push @{ $ret[$v] }, $wt[$v]; 705 } 706 } 707 708 # modification of tertiary weights 709 if ($self->{upper_before_lower}) { 710 foreach my $w (@{ $ret[2] }) { 711 if (0x8 <= $w && $w <= 0xC) { $w -= 6 } # lower 712 elsif (0x2 <= $w && $w <= 0x6) { $w += 6 } # upper 713 elsif ($w == 0x1C) { $w += 1 } # square upper 714 elsif ($w == 0x1D) { $w -= 1 } # square lower 715 } 716 } 717 if ($self->{katakana_before_hiragana}) { 718 foreach my $w (@{ $ret[2] }) { 719 if (0x0F <= $w && $w <= 0x13) { $w -= 2 } # katakana 720 elsif (0x0D <= $w && $w <= 0x0E) { $w += 5 } # hiragana 721 } 722 } 723 724 if ($self->{backwardsFlag}) { 725 for (my $v = MinLevel; $v <= MaxLevel; $v++) { 726 if ($self->{backwardsFlag} & (1 << $v)) { 727 @{ $ret[$v-1] } = reverse @{ $ret[$v-1] }; 728 } 729 } 730 } 731 732 join LEVEL_SEP, map pack(KEY_TEMPLATE, @$_), @ret; 733 } 734 735 736 ## 737 ## int compare = cmp(string a, string b) 738 ## 739 sub cmp { $_[0]->getSortKey($_[1]) cmp $_[0]->getSortKey($_[2]) } 740 sub eq { $_[0]->getSortKey($_[1]) eq $_[0]->getSortKey($_[2]) } 741 sub ne { $_[0]->getSortKey($_[1]) ne $_[0]->getSortKey($_[2]) } 742 sub lt { $_[0]->getSortKey($_[1]) lt $_[0]->getSortKey($_[2]) } 743 sub le { $_[0]->getSortKey($_[1]) le $_[0]->getSortKey($_[2]) } 744 sub gt { $_[0]->getSortKey($_[1]) gt $_[0]->getSortKey($_[2]) } 745 sub ge { $_[0]->getSortKey($_[1]) ge $_[0]->getSortKey($_[2]) } 746 747 ## 748 ## list[strings] sorted = sort(list[strings] arg) 749 ## 750 sub sort { 751 my $obj = shift; 752 return 753 map { $_->[1] } 754 sort{ $a->[0] cmp $b->[0] } 755 map [ $obj->getSortKey($_), $_ ], @_; 756 } 757 758 759 sub _derivCE_14 { 760 my $u = shift; 761 my $base = 762 (CJK_UidIni <= $u && $u <= CJK_UidF41) 763 ? 0xFB40 : # CJK 764 (CJK_ExtAIni <= $u && $u <= CJK_ExtAFin || 765 CJK_ExtBIni <= $u && $u <= CJK_ExtBFin) 766 ? 0xFB80 # CJK ext. 767 : 0xFBC0; # others 768 769 my $aaaa = $base + ($u >> 15); 770 my $bbbb = ($u & 0x7FFF) | 0x8000; 771 return 772 pack(VCE_TEMPLATE, NON_VAR, $aaaa, Min2Wt, Min3Wt, $u), 773 pack(VCE_TEMPLATE, NON_VAR, $bbbb, 0, 0, $u); 774 } 775 776 sub _derivCE_9 { 777 my $u = shift; 778 my $base = 779 (CJK_UidIni <= $u && $u <= CJK_UidFin) 780 ? 0xFB40 : # CJK 781 (CJK_ExtAIni <= $u && $u <= CJK_ExtAFin || 782 CJK_ExtBIni <= $u && $u <= CJK_ExtBFin) 783 ? 0xFB80 # CJK ext. 784 : 0xFBC0; # others 785 786 my $aaaa = $base + ($u >> 15); 787 my $bbbb = ($u & 0x7FFF) | 0x8000; 788 return 789 pack(VCE_TEMPLATE, NON_VAR, $aaaa, Min2Wt, Min3Wt, $u), 790 pack(VCE_TEMPLATE, NON_VAR, $bbbb, 0, 0, $u); 791 } 792 793 sub _derivCE_8 { 794 my $code = shift; 795 my $aaaa = 0xFF80 + ($code >> 15); 796 my $bbbb = ($code & 0x7FFF) | 0x8000; 797 return 798 pack(VCE_TEMPLATE, NON_VAR, $aaaa, 2, 1, $code), 799 pack(VCE_TEMPLATE, NON_VAR, $bbbb, 0, 0, $code); 800 } 801 802 sub _uideoCE_8 { 803 my $u = shift; 804 return pack(VCE_TEMPLATE, NON_VAR, $u, Min2Wt, Min3Wt, $u); 805 } 806 807 sub _isUIdeo { 808 my ($u, $uca_vers) = @_; 809 return( 810 (CJK_UidIni <= $u && 811 ($uca_vers >= 14 ? ( $u <= CJK_UidF41) : ($u <= CJK_UidFin))) 812 || 813 (CJK_ExtAIni <= $u && $u <= CJK_ExtAFin) 814 || 815 (CJK_ExtBIni <= $u && $u <= CJK_ExtBFin) 816 ); 817 } 818 819 820 sub getWtHangulTerm { 821 my $self = shift; 822 return _varCE($self->{variable}, 823 pack(VCE_TEMPLATE, NON_VAR, $self->{hangul_terminator}, 0,0,0)); 824 } 825 826 827 ## 828 ## "hhhh hhhh hhhh" to (dddd, dddd, dddd) 829 ## 830 sub _getHexArray { map hex, $_[0] =~ /([0-9a-fA-F]+)/g } 831 832 # 833 # $code *must* be in Hangul syllable. 834 # Check it before you enter here. 835 # 836 sub _decompHangul { 837 my $code = shift; 838 my $si = $code - Hangul_SBase; 839 my $li = int( $si / Hangul_NCount); 840 my $vi = int(($si % Hangul_NCount) / Hangul_TCount); 841 my $ti = $si % Hangul_TCount; 842 return ( 843 Hangul_LBase + $li, 844 Hangul_VBase + $vi, 845 $ti ? (Hangul_TBase + $ti) : (), 846 ); 847 } 848 849 sub _isIllegal { 850 my $code = shift; 851 return ! defined $code # removed 852 || ($code < 0 || 0x10FFFF < $code) # out of range 853 || (($code & 0xFFFE) == 0xFFFE) # ??FFF[EF] (cf. utf8.c) 854 || (0xD800 <= $code && $code <= 0xDFFF) # unpaired surrogates 855 || (0xFDD0 <= $code && $code <= 0xFDEF) # other non-characters 856 ; 857 } 858 859 # Hangul Syllable Type 860 sub getHST { 861 my $u = shift; 862 return 863 Hangul_LIni <= $u && $u <= Hangul_LFin || $u == Hangul_LFill ? "L" : 864 Hangul_VIni <= $u && $u <= Hangul_VFin ? "V" : 865 Hangul_TIni <= $u && $u <= Hangul_TFin ? "T" : 866 Hangul_SIni <= $u && $u <= Hangul_SFin ? 867 ($u - Hangul_SBase) % Hangul_TCount ? "LVT" : "LV" : ""; 868 } 869 870 871 ## 872 ## bool _nonIgnorAtLevel(arrayref weights, int level) 873 ## 874 sub _nonIgnorAtLevel($$) 875 { 876 my $wt = shift; 877 return if ! defined $wt; 878 my $lv = shift; 879 return grep($wt->[$_-1] != 0, MinLevel..$lv) ? TRUE : FALSE; 880 } 881 882 ## 883 ## bool _eqArray( 884 ## arrayref of arrayref[weights] source, 885 ## arrayref of arrayref[weights] substr, 886 ## int level) 887 ## * comparison of graphemes vs graphemes. 888 ## @$source >= @$substr must be true (check it before call this); 889 ## 890 sub _eqArray($$$) 891 { 892 my $source = shift; 893 my $substr = shift; 894 my $lev = shift; 895 896 for my $g (0..@$substr-1){ 897 # Do the $g'th graphemes have the same number of AV weigths? 898 return if @{ $source->[$g] } != @{ $substr->[$g] }; 899 900 for my $w (0..@{ $substr->[$g] }-1) { 901 for my $v (0..$lev-1) { 902 return if $source->[$g][$w][$v] != $substr->[$g][$w][$v]; 903 } 904 } 905 } 906 return 1; 907 } 908 909 ## 910 ## (int position, int length) 911 ## int position = index(string, substring, position, [undoc'ed grobal]) 912 ## 913 ## With "grobal" (only for the list context), 914 ## returns list of arrayref[position, length]. 915 ## 916 sub index 917 { 918 my $self = shift; 919 my $str = shift; 920 my $len = length($str); 921 my $subE = $self->splitEnt(shift); 922 my $pos = @_ ? shift : 0; 923 $pos = 0 if $pos < 0; 924 my $grob = shift; 925 926 my $lev = $self->{level}; 927 my $v2i = $self->{UCA_Version} >= 9 && 928 $self->{variable} ne 'non-ignorable'; 929 930 if (! @$subE) { 931 my $temp = $pos <= 0 ? 0 : $len <= $pos ? $len : $pos; 932 return $grob 933 ? map([$_, 0], $temp..$len) 934 : wantarray ? ($temp,0) : $temp; 935 } 936 $len < $pos 937 and return wantarray ? () : NOMATCHPOS; 938 my $strE = $self->splitEnt($pos ? substr($str, $pos) : $str, TRUE); 939 @$strE 940 or return wantarray ? () : NOMATCHPOS; 941 942 my(@strWt, @iniPos, @finPos, @subWt, @g_ret); 943 944 my $last_is_variable; 945 for my $vwt (map $self->getWt($_), @$subE) { 946 my($var, @wt) = unpack(VCE_TEMPLATE, $vwt); 947 my $to_be_pushed = _nonIgnorAtLevel(\@wt,$lev); 948 949 # "Ignorable (L1, L2) after Variable" since track. v. 9 950 if ($v2i) { 951 if ($var) { 952 $last_is_variable = TRUE; 953 } 954 elsif (!$wt[0]) { # ignorable 955 $to_be_pushed = FALSE if $last_is_variable; 956 } 957 else { 958 $last_is_variable = FALSE; 959 } 960 } 961 962 if (@subWt && !$var && !$wt[0]) { 963 push @{ $subWt[-1] }, \@wt if $to_be_pushed; 964 } else { 965 push @subWt, [ \@wt ]; 966 } 967 } 968 969 my $count = 0; 970 my $end = @$strE - 1; 971 972 $last_is_variable = FALSE; # reuse 973 for (my $i = 0; $i <= $end; ) { # no $i++ 974 my $found_base = 0; 975 976 # fetch a grapheme 977 while ($i <= $end && $found_base == 0) { 978 for my $vwt ($self->getWt($strE->[$i][0])) { 979 my($var, @wt) = unpack(VCE_TEMPLATE, $vwt); 980 my $to_be_pushed = _nonIgnorAtLevel(\@wt,$lev); 981 982 # "Ignorable (L1, L2) after Variable" since track. v. 9 983 if ($v2i) { 984 if ($var) { 985 $last_is_variable = TRUE; 986 } 987 elsif (!$wt[0]) { # ignorable 988 $to_be_pushed = FALSE if $last_is_variable; 989 } 990 else { 991 $last_is_variable = FALSE; 992 } 993 } 994 995 if (@strWt && !$var && !$wt[0]) { 996 push @{ $strWt[-1] }, \@wt if $to_be_pushed; 997 $finPos[-1] = $strE->[$i][2]; 998 } elsif ($to_be_pushed) { 999 push @strWt, [ \@wt ]; 1000 push @iniPos, $found_base ? NOMATCHPOS : $strE->[$i][1]; 1001 $finPos[-1] = NOMATCHPOS if $found_base; 1002 push @finPos, $strE->[$i][2]; 1003 $found_base++; 1004 } 1005 # else ===> no-op 1006 } 1007 $i++; 1008 } 1009 1010 # try to match 1011 while ( @strWt > @subWt || (@strWt == @subWt && $i > $end) ) { 1012 if ($iniPos[0] != NOMATCHPOS && 1013 $finPos[$#subWt] != NOMATCHPOS && 1014 _eqArray(\@strWt, \@subWt, $lev)) { 1015 my $temp = $iniPos[0] + $pos; 1016 1017 if ($grob) { 1018 push @g_ret, [$temp, $finPos[$#subWt] - $iniPos[0]]; 1019 splice @strWt, 0, $#subWt; 1020 splice @iniPos, 0, $#subWt; 1021 splice @finPos, 0, $#subWt; 1022 } 1023 else { 1024 return wantarray 1025 ? ($temp, $finPos[$#subWt] - $iniPos[0]) 1026 : $temp; 1027 } 1028 } 1029 shift @strWt; 1030 shift @iniPos; 1031 shift @finPos; 1032 } 1033 } 1034 1035 return $grob 1036 ? @g_ret 1037 : wantarray ? () : NOMATCHPOS; 1038 } 1039 1040 ## 1041 ## scalarref to matching part = match(string, substring) 1042 ## 1043 sub match 1044 { 1045 my $self = shift; 1046 if (my($pos,$len) = $self->index($_[0], $_[1])) { 1047 my $temp = substr($_[0], $pos, $len); 1048 return wantarray ? $temp : \$temp; 1049 # An lvalue ref \substr should be avoided, 1050 # since its value is affected by modification of its referent. 1051 } 1052 else { 1053 return; 1054 } 1055 } 1056 1057 ## 1058 ## arrayref matching parts = gmatch(string, substring) 1059 ## 1060 sub gmatch 1061 { 1062 my $self = shift; 1063 my $str = shift; 1064 my $sub = shift; 1065 return map substr($str, $_->[0], $_->[1]), 1066 $self->index($str, $sub, 0, 'g'); 1067 } 1068 1069 ## 1070 ## bool subst'ed = subst(string, substring, replace) 1071 ## 1072 sub subst 1073 { 1074 my $self = shift; 1075 my $code = ref $_[2] eq 'CODE' ? $_[2] : FALSE; 1076 1077 if (my($pos,$len) = $self->index($_[0], $_[1])) { 1078 if ($code) { 1079 my $mat = substr($_[0], $pos, $len); 1080 substr($_[0], $pos, $len, $code->($mat)); 1081 } else { 1082 substr($_[0], $pos, $len, $_[2]); 1083 } 1084 return TRUE; 1085 } 1086 else { 1087 return FALSE; 1088 } 1089 } 1090 1091 ## 1092 ## int count = gsubst(string, substring, replace) 1093 ## 1094 sub gsubst 1095 { 1096 my $self = shift; 1097 my $code = ref $_[2] eq 'CODE' ? $_[2] : FALSE; 1098 my $cnt = 0; 1099 1100 # Replacement is carried out from the end, then use reverse. 1101 for my $pos_len (reverse $self->index($_[0], $_[1], 0, 'g')) { 1102 if ($code) { 1103 my $mat = substr($_[0], $pos_len->[0], $pos_len->[1]); 1104 substr($_[0], $pos_len->[0], $pos_len->[1], $code->($mat)); 1105 } else { 1106 substr($_[0], $pos_len->[0], $pos_len->[1], $_[2]); 1107 } 1108 $cnt++; 1109 } 1110 return $cnt; 1111 } 1112 1113 1; 1114 __END__ 1115 1116 =head1 NAME 1117 1118 Unicode::Collate - Unicode Collation Algorithm 1119 1120 =head1 SYNOPSIS 1121 1122 use Unicode::Collate; 1123 1124 #construct 1125 $Collator = Unicode::Collate->new(%tailoring); 1126 1127 #sort 1128 @sorted = $Collator->sort(@not_sorted); 1129 1130 #compare 1131 $result = $Collator->cmp($a, $b); # returns 1, 0, or -1. 1132 1133 # If %tailoring is false (i.e. empty), 1134 # $Collator should do the default collation. 1135 1136 =head1 DESCRIPTION 1137 1138 This module is an implementation of Unicode Technical Standard #10 1139 (a.k.a. UTS #10) - Unicode Collation Algorithm (a.k.a. UCA). 1140 1141 =head2 Constructor and Tailoring 1142 1143 The C<new> method returns a collator object. 1144 1145 $Collator = Unicode::Collate->new( 1146 UCA_Version => $UCA_Version, 1147 alternate => $alternate, # deprecated: use of 'variable' is recommended. 1148 backwards => $levelNumber, # or \@levelNumbers 1149 entry => $element, 1150 hangul_terminator => $term_primary_weight, 1151 ignoreName => qr/$ignoreName/, 1152 ignoreChar => qr/$ignoreChar/, 1153 katakana_before_hiragana => $bool, 1154 level => $collationLevel, 1155 normalization => $normalization_form, 1156 overrideCJK => \&overrideCJK, 1157 overrideHangul => \&overrideHangul, 1158 preprocess => \&preprocess, 1159 rearrange => \@charList, 1160 table => $filename, 1161 undefName => qr/$undefName/, 1162 undefChar => qr/$undefChar/, 1163 upper_before_lower => $bool, 1164 variable => $variable, 1165 ); 1166 1167 =over 4 1168 1169 =item UCA_Version 1170 1171 If the tracking version number of UCA is given, 1172 behavior of that tracking version is emulated on collating. 1173 If omitted, the return value of C<UCA_Version()> is used. 1174 C<UCA_Version()> should return the latest tracking version supported. 1175 1176 The supported tracking version: 8, 9, 11, or 14. 1177 1178 UCA Unicode Standard DUCET (@version) 1179 --------------------------------------------------- 1180 8 3.1 3.0.1 (3.0.1d9) 1181 9 3.1 with Corrigendum 3 3.1.1 (3.1.1) 1182 11 4.0 4.0.0 (4.0.0) 1183 14 4.1.0 4.1.0 (4.1.0) 1184 1185 Note: Recent UTS #10 renames "Tracking Version" to "Revision." 1186 1187 =item alternate 1188 1189 -- see 3.2.2 Alternate Weighting, version 8 of UTS #10 1190 1191 For backward compatibility, C<alternate> (old name) can be used 1192 as an alias for C<variable>. 1193 1194 =item backwards 1195 1196 -- see 3.1.2 French Accents, UTS #10. 1197 1198 backwards => $levelNumber or \@levelNumbers 1199 1200 Weights in reverse order; ex. level 2 (diacritic ordering) in French. 1201 If omitted, forwards at all the levels. 1202 1203 =item entry 1204 1205 -- see 3.1 Linguistic Features; 3.2.1 File Format, UTS #10. 1206 1207 If the same character (or a sequence of characters) exists 1208 in the collation element table through C<table>, 1209 mapping to collation elements is overrided. 1210 If it does not exist, the mapping is defined additionally. 1211 1212 entry => <<'ENTRY', # for DUCET v4.0.0 (allkeys-4.0.0.txt) 1213 0063 0068 ; [.0E6A.0020.0002.0063] # ch 1214 0043 0068 ; [.0E6A.0020.0007.0043] # Ch 1215 0043 0048 ; [.0E6A.0020.0008.0043] # CH 1216 006C 006C ; [.0F4C.0020.0002.006C] # ll 1217 004C 006C ; [.0F4C.0020.0007.004C] # Ll 1218 004C 004C ; [.0F4C.0020.0008.004C] # LL 1219 00F1 ; [.0F7B.0020.0002.00F1] # n-tilde 1220 006E 0303 ; [.0F7B.0020.0002.00F1] # n-tilde 1221 00D1 ; [.0F7B.0020.0008.00D1] # N-tilde 1222 004E 0303 ; [.0F7B.0020.0008.00D1] # N-tilde 1223 ENTRY 1224 1225 entry => <<'ENTRY', # for DUCET v4.0.0 (allkeys-4.0.0.txt) 1226 00E6 ; [.0E33.0020.0002.00E6][.0E8B.0020.0002.00E6] # ae ligature as <a><e> 1227 00C6 ; [.0E33.0020.0008.00C6][.0E8B.0020.0008.00C6] # AE ligature as <A><E> 1228 ENTRY 1229 1230 B<NOTE:> The code point in the UCA file format (before C<';'>) 1231 B<must> be a Unicode code point (defined as hexadecimal), 1232 but not a native code point. 1233 So C<0063> must always denote C<U+0063>, 1234 but not a character of C<"\x63">. 1235 1236 Weighting may vary depending on collation element table. 1237 So ensure the weights defined in C<entry> will be consistent with 1238 those in the collation element table loaded via C<table>. 1239 1240 In DUCET v4.0.0, primary weight of C<C> is C<0E60> 1241 and that of C<D> is C<0E6D>. So setting primary weight of C<CH> to C<0E6A> 1242 (as a value between C<0E60> and C<0E6D>) 1243 makes ordering as C<C E<lt> CH E<lt> D>. 1244 Exactly speaking DUCET already has some characters between C<C> and C<D>: 1245 C<small capital C> (C<U+1D04>) with primary weight C<0E64>, 1246 C<c-hook/C-hook> (C<U+0188/U+0187>) with C<0E65>, 1247 and C<c-curl> (C<U+0255>) with C<0E69>. 1248 Then primary weight C<0E6A> for C<CH> makes C<CH> 1249 ordered between C<c-curl> and C<D>. 1250 1251 =item hangul_terminator 1252 1253 -- see 7.1.4 Trailing Weights, UTS #10. 1254 1255 If a true value is given (non-zero but should be positive), 1256 it will be added as a terminator primary weight to the end of 1257 every standard Hangul syllable. Secondary and any higher weights 1258 for terminator are set to zero. 1259 If the value is false or C<hangul_terminator> key does not exist, 1260 insertion of terminator weights will not be performed. 1261 1262 Boundaries of Hangul syllables are determined 1263 according to conjoining Jamo behavior in F<the Unicode Standard> 1264 and F<HangulSyllableType.txt>. 1265 1266 B<Implementation Note:> 1267 (1) For expansion mapping (Unicode character mapped 1268 to a sequence of collation elements), a terminator will not be added 1269 between collation elements, even if Hangul syllable boundary exists there. 1270 Addition of terminator is restricted to the next position 1271 to the last collation element. 1272 1273 (2) Non-conjoining Hangul letters 1274 (Compatibility Jamo, halfwidth Jamo, and enclosed letters) are not 1275 automatically terminated with a terminator primary weight. 1276 These characters may need terminator included in a collation element 1277 table beforehand. 1278 1279 =item ignoreChar 1280 1281 =item ignoreName 1282 1283 -- see 3.2.2 Variable Weighting, UTS #10. 1284 1285 Makes the entry in the table completely ignorable; 1286 i.e. as if the weights were zero at all level. 1287 1288 Through C<ignoreChar>, any character matching C<qr/$ignoreChar/> 1289 will be ignored. Through C<ignoreName>, any character whose name 1290 (given in the C<table> file as a comment) matches C<qr/$ignoreName/> 1291 will be ignored. 1292 1293 E.g. when 'a' and 'e' are ignorable, 1294 'element' is equal to 'lament' (or 'lmnt'). 1295 1296 =item katakana_before_hiragana 1297 1298 -- see 7.3.1 Tertiary Weight Table, UTS #10. 1299 1300 By default, hiragana is before katakana. 1301 If the parameter is made true, this is reversed. 1302 1303 B<NOTE>: This parameter simplemindedly assumes that any hiragana/katakana 1304 distinctions must occur in level 3, and their weights at level 3 must be 1305 same as those mentioned in 7.3.1, UTS #10. 1306 If you define your collation elements which violate this requirement, 1307 this parameter does not work validly. 1308 1309 =item level 1310 1311 -- see 4.3 Form Sort Key, UTS #10. 1312 1313 Set the maximum level. 1314 Any higher levels than the specified one are ignored. 1315 1316 Level 1: alphabetic ordering 1317 Level 2: diacritic ordering 1318 Level 3: case ordering 1319 Level 4: tie-breaking (e.g. in the case when variable is 'shifted') 1320 1321 ex.level => 2, 1322 1323 If omitted, the maximum is the 4th. 1324 1325 =item normalization 1326 1327 -- see 4.1 Normalize, UTS #10. 1328 1329 If specified, strings are normalized before preparation of sort keys 1330 (the normalization is executed after preprocess). 1331 1332 A form name C<Unicode::Normalize::normalize()> accepts will be applied 1333 as C<$normalization_form>. 1334 Acceptable names include C<'NFD'>, C<'NFC'>, C<'NFKD'>, and C<'NFKC'>. 1335 See C<Unicode::Normalize::normalize()> for detail. 1336 If omitted, C<'NFD'> is used. 1337 1338 C<normalization> is performed after C<preprocess> (if defined). 1339 1340 Furthermore, special values, C<undef> and C<"prenormalized">, can be used, 1341 though they are not concerned with C<Unicode::Normalize::normalize()>. 1342 1343 If C<undef> (not a string C<"undef">) is passed explicitly 1344 as the value for this key, 1345 any normalization is not carried out (this may make tailoring easier 1346 if any normalization is not desired). Under C<(normalization =E<gt> undef)>, 1347 only contiguous contractions are resolved; 1348 e.g. even if C<A-ring> (and C<A-ring-cedilla>) is ordered after C<Z>, 1349 C<A-cedilla-ring> would be primary equal to C<A>. 1350 In this point, 1351 C<(normalization =E<gt> undef, preprocess =E<gt> sub { NFD(shift) })> 1352 B<is not> equivalent to C<(normalization =E<gt> 'NFD')>. 1353 1354 In the case of C<(normalization =E<gt> "prenormalized")>, 1355 any normalization is not performed, but 1356 non-contiguous contractions with combining characters are performed. 1357 Therefore 1358 C<(normalization =E<gt> 'prenormalized', preprocess =E<gt> sub { NFD(shift) })> 1359 B<is> equivalent to C<(normalization =E<gt> 'NFD')>. 1360 If source strings are finely prenormalized, 1361 C<(normalization =E<gt> 'prenormalized')> may save time for normalization. 1362 1363 Except C<(normalization =E<gt> undef)>, 1364 B<Unicode::Normalize> is required (see also B<CAVEAT>). 1365 1366 =item overrideCJK 1367 1368 -- see 7.1 Derived Collation Elements, UTS #10. 1369 1370 By default, CJK Unified Ideographs are ordered in Unicode codepoint order 1371 but C<CJK Unified Ideographs> (if C<UCA_Version> is 8 to 11, its range is 1372 C<U+4E00..U+9FA5>; if C<UCA_Version> is 14, its range is C<U+4E00..U+9FBB>) 1373 are lesser than C<CJK Unified Ideographs Extension> (its range is 1374 C<U+3400..U+4DB5> and C<U+20000..U+2A6D6>). 1375 1376 Through C<overrideCJK>, ordering of CJK Unified Ideographs can be overrided. 1377 1378 ex. CJK Unified Ideographs in the JIS code point order. 1379 1380 overrideCJK => sub { 1381 my $u = shift; # get a Unicode codepoint 1382 my $b = pack('n', $u); # to UTF-16BE 1383 my $s = your_unicode_to_sjis_converter($b); # convert 1384 my $n = unpack('n', $s); # convert sjis to short 1385 [ $n, 0x20, 0x2, $u ]; # return the collation element 1386 }, 1387 1388 ex. ignores all CJK Unified Ideographs. 1389 1390 overrideCJK => sub {()}, # CODEREF returning empty list 1391 1392 # where ->eq("Pe\x{4E00}rl", "Perl") is true 1393 # as U+4E00 is a CJK Unified Ideograph and to be ignorable. 1394 1395 If C<undef> is passed explicitly as the value for this key, 1396 weights for CJK Unified Ideographs are treated as undefined. 1397 But assignment of weight for CJK Unified Ideographs 1398 in table or C<entry> is still valid. 1399 1400 =item overrideHangul 1401 1402 -- see 7.1 Derived Collation Elements, UTS #10. 1403 1404 By default, Hangul Syllables are decomposed into Hangul Jamo, 1405 even if C<(normalization =E<gt> undef)>. 1406 But the mapping of Hangul Syllables may be overrided. 1407 1408 This parameter works like C<overrideCJK>, so see there for examples. 1409 1410 If you want to override the mapping of Hangul Syllables, 1411 NFD, NFKD, and FCD are not appropriate, 1412 since they will decompose Hangul Syllables before overriding. 1413 1414 If C<undef> is passed explicitly as the value for this key, 1415 weight for Hangul Syllables is treated as undefined 1416 without decomposition into Hangul Jamo. 1417 But definition of weight for Hangul Syllables 1418 in table or C<entry> is still valid. 1419 1420 =item preprocess 1421 1422 -- see 5.1 Preprocessing, UTS #10. 1423 1424 If specified, the coderef is used to preprocess 1425 before the formation of sort keys. 1426 1427 ex. dropping English articles, such as "a" or "the". 1428 Then, "the pen" is before "a pencil". 1429 1430 preprocess => sub { 1431 my $str = shift; 1432 $str =~ s/\b(?:an?|the)\s+//gi; 1433 return $str; 1434 }, 1435 1436 C<preprocess> is performed before C<normalization> (if defined). 1437 1438 =item rearrange 1439 1440 -- see 3.1.3 Rearrangement, UTS #10. 1441 1442 Characters that are not coded in logical order and to be rearranged. 1443 If C<UCA_Version> is equal to or lesser than 11, default is: 1444 1445 rearrange => [ 0x0E40..0x0E44, 0x0EC0..0x0EC4 ], 1446 1447 If you want to disallow any rearrangement, pass C<undef> or C<[]> 1448 (a reference to empty list) as the value for this key. 1449 1450 If C<UCA_Version> is equal to 14, default is C<[]> (i.e. no rearrangement). 1451 1452 B<According to the version 9 of UCA, this parameter shall not be used; 1453 but it is not warned at present.> 1454 1455 =item table 1456 1457 -- see 3.2 Default Unicode Collation Element Table, UTS #10. 1458 1459 You can use another collation element table if desired. 1460 1461 The table file should locate in the F<Unicode/Collate> directory 1462 on C<@INC>. Say, if the filename is F<Foo.txt>, 1463 the table file is searched as F<Unicode/Collate/Foo.txt> in C<@INC>. 1464 1465 By default, F<allkeys.txt> (as the filename of DUCET) is used. 1466 If you will prepare your own table file, any name other than F<allkeys.txt> 1467 may be better to avoid namespace conflict. 1468 1469 If C<undef> is passed explicitly as the value for this key, 1470 no file is read (but you can define collation elements via C<entry>). 1471 1472 A typical way to define a collation element table 1473 without any file of table: 1474 1475 $onlyABC = Unicode::Collate->new( 1476 table => undef, 1477 entry => << 'ENTRIES', 1478 0061 ; [.0101.0020.0002.0061] # LATIN SMALL LETTER A 1479 0041 ; [.0101.0020.0008.0041] # LATIN CAPITAL LETTER A 1480 0062 ; [.0102.0020.0002.0062] # LATIN SMALL LETTER B 1481 0042 ; [.0102.0020.0008.0042] # LATIN CAPITAL LETTER B 1482 0063 ; [.0103.0020.0002.0063] # LATIN SMALL LETTER C 1483 0043 ; [.0103.0020.0008.0043] # LATIN CAPITAL LETTER C 1484 ENTRIES 1485 ); 1486 1487 If C<ignoreName> or C<undefName> is used, character names should be 1488 specified as a comment (following C<#>) on each line. 1489 1490 =item undefChar 1491 1492 =item undefName 1493 1494 -- see 6.3.4 Reducing the Repertoire, UTS #10. 1495 1496 Undefines the collation element as if it were unassigned in the table. 1497 This reduces the size of the table. 1498 If an unassigned character appears in the string to be collated, 1499 the sort key is made from its codepoint 1500 as a single-character collation element, 1501 as it is greater than any other assigned collation elements 1502 (in the codepoint order among the unassigned characters). 1503 But, it'd be better to ignore characters 1504 unfamiliar to you and maybe never used. 1505 1506 Through C<undefChar>, any character matching C<qr/$undefChar/> 1507 will be undefined. Through C<undefName>, any character whose name 1508 (given in the C<table> file as a comment) matches C<qr/$undefName/> 1509 will be undefined. 1510 1511 ex. Collation weights for beyond-BMP characters are not stored in object: 1512 1513 undefChar => qr/[^\0-\x{fffd}]/, 1514 1515 =item upper_before_lower 1516 1517 -- see 6.6 Case Comparisons, UTS #10. 1518 1519 By default, lowercase is before uppercase. 1520 If the parameter is made true, this is reversed. 1521 1522 B<NOTE>: This parameter simplemindedly assumes that any lowercase/uppercase 1523 distinctions must occur in level 3, and their weights at level 3 must be 1524 same as those mentioned in 7.3.1, UTS #10. 1525 If you define your collation elements which differs from this requirement, 1526 this parameter doesn't work validly. 1527 1528 =item variable 1529 1530 -- see 3.2.2 Variable Weighting, UTS #10. 1531 1532 This key allows to variable weighting for variable collation elements, 1533 which are marked with an ASTERISK in the table 1534 (NOTE: Many punction marks and symbols are variable in F<allkeys.txt>). 1535 1536 variable => 'blanked', 'non-ignorable', 'shifted', or 'shift-trimmed'. 1537 1538 These names are case-insensitive. 1539 By default (if specification is omitted), 'shifted' is adopted. 1540 1541 'Blanked' Variable elements are made ignorable at levels 1 through 3; 1542 considered at the 4th level. 1543 1544 'Non-Ignorable' Variable elements are not reset to ignorable. 1545 1546 'Shifted' Variable elements are made ignorable at levels 1 through 3 1547 their level 4 weight is replaced by the old level 1 weight. 1548 Level 4 weight for Non-Variable elements is 0xFFFF. 1549 1550 'Shift-Trimmed' Same as 'shifted', but all FFFF's at the 4th level 1551 are trimmed. 1552 1553 =back 1554 1555 =head2 Methods for Collation 1556 1557 =over 4 1558 1559 =item C<@sorted = $Collator-E<gt>sort(@not_sorted)> 1560 1561 Sorts a list of strings. 1562 1563 =item C<$result = $Collator-E<gt>cmp($a, $b)> 1564 1565 Returns 1 (when C<$a> is greater than C<$b>) 1566 or 0 (when C<$a> is equal to C<$b>) 1567 or -1 (when C<$a> is lesser than C<$b>). 1568 1569 =item C<$result = $Collator-E<gt>eq($a, $b)> 1570 1571 =item C<$result = $Collator-E<gt>ne($a, $b)> 1572 1573 =item C<$result = $Collator-E<gt>lt($a, $b)> 1574 1575 =item C<$result = $Collator-E<gt>le($a, $b)> 1576 1577 =item C<$result = $Collator-E<gt>gt($a, $b)> 1578 1579 =item C<$result = $Collator-E<gt>ge($a, $b)> 1580 1581 They works like the same name operators as theirs. 1582 1583 eq : whether $a is equal to $b. 1584 ne : whether $a is not equal to $b. 1585 lt : whether $a is lesser than $b. 1586 le : whether $a is lesser than $b or equal to $b. 1587 gt : whether $a is greater than $b. 1588 ge : whether $a is greater than $b or equal to $b. 1589 1590 =item C<$sortKey = $Collator-E<gt>getSortKey($string)> 1591 1592 -- see 4.3 Form Sort Key, UTS #10. 1593 1594 Returns a sort key. 1595 1596 You compare the sort keys using a binary comparison 1597 and get the result of the comparison of the strings using UCA. 1598 1599 $Collator->getSortKey($a) cmp $Collator->getSortKey($b) 1600 1601 is equivalent to 1602 1603 $Collator->cmp($a, $b) 1604 1605 =item C<$sortKeyForm = $Collator-E<gt>viewSortKey($string)> 1606 1607 Converts a sorting key into its representation form. 1608 If C<UCA_Version> is 8, the output is slightly different. 1609 1610 use Unicode::Collate; 1611 my $c = Unicode::Collate->new(); 1612 print $c->viewSortKey("Perl"),"\n"; 1613 1614 # output: 1615 # [0B67 0A65 0B7F 0B03 | 0020 0020 0020 0020 | 0008 0002 0002 0002 | FFFF FFFF FFFF FFFF] 1616 # Level 1 Level 2 Level 3 Level 4 1617 1618 =back 1619 1620 =head2 Methods for Searching 1621 1622 B<DISCLAIMER:> If C<preprocess> or C<normalization> parameter is true 1623 for C<$Collator>, calling these methods (C<index>, C<match>, C<gmatch>, 1624 C<subst>, C<gsubst>) is croaked, 1625 as the position and the length might differ 1626 from those on the specified string. 1627 (And C<rearrange> and C<hangul_terminator> parameters are neglected.) 1628 1629 The C<match>, C<gmatch>, C<subst>, C<gsubst> methods work 1630 like C<m//>, C<m//g>, C<s///>, C<s///g>, respectively, 1631 but they are not aware of any pattern, but only a literal substring. 1632 1633 =over 4 1634 1635 =item C<$position = $Collator-E<gt>index($string, $substring[, $position])> 1636 1637 =item C<($position, $length) = $Collator-E<gt>index($string, $substring[, $position])> 1638 1639 If C<$substring> matches a part of C<$string>, returns 1640 the position of the first occurrence of the matching part in scalar context; 1641 in list context, returns a two-element list of 1642 the position and the length of the matching part. 1643 1644 If C<$substring> does not match any part of C<$string>, 1645 returns C<-1> in scalar context and 1646 an empty list in list context. 1647 1648 e.g. you say 1649 1650 my $Collator = Unicode::Collate->new( normalization => undef, level => 1 ); 1651 # (normalization => undef) is REQUIRED. 1652 my $str = "Ich muß studieren Perl."; 1653 my $sub = "MÜSS"; 1654 my $match; 1655 if (my($pos,$len) = $Collator->index($str, $sub)) { 1656 $match = substr($str, $pos, $len); 1657 } 1658 1659 and get C<"muß"> in C<$match> since C<"muß"> 1660 is primary equal to C<"MÜSS">. 1661 1662 =item C<$match_ref = $Collator-E<gt>match($string, $substring)> 1663 1664 =item C<($match) = $Collator-E<gt>match($string, $substring)> 1665 1666 If C<$substring> matches a part of C<$string>, in scalar context, returns 1667 B<a reference to> the first occurrence of the matching part 1668 (C<$match_ref> is always true if matches, 1669 since every reference is B<true>); 1670 in list context, returns the first occurrence of the matching part. 1671 1672 If C<$substring> does not match any part of C<$string>, 1673 returns C<undef> in scalar context and 1674 an empty list in list context. 1675 1676 e.g. 1677 1678 if ($match_ref = $Collator->match($str, $sub)) { # scalar context 1679 print "matches [$$match_ref].\n"; 1680 } else { 1681 print "doesn't match.\n"; 1682 } 1683 1684 or 1685 1686 if (($match) = $Collator->match($str, $sub)) { # list context 1687 print "matches [$match].\n"; 1688 } else { 1689 print "doesn't match.\n"; 1690 } 1691 1692 =item C<@match = $Collator-E<gt>gmatch($string, $substring)> 1693 1694 If C<$substring> matches a part of C<$string>, returns 1695 all the matching parts (or matching count in scalar context). 1696 1697 If C<$substring> does not match any part of C<$string>, 1698 returns an empty list. 1699 1700 =item C<$count = $Collator-E<gt>subst($string, $substring, $replacement)> 1701 1702 If C<$substring> matches a part of C<$string>, 1703 the first occurrence of the matching part is replaced by C<$replacement> 1704 (C<$string> is modified) and return C<$count> (always equals to C<1>). 1705 1706 C<$replacement> can be a C<CODEREF>, 1707 taking the matching part as an argument, 1708 and returning a string to replace the matching part 1709 (a bit similar to C<s/(..)/$coderef-E<gt>($1)/e>). 1710 1711 =item C<$count = $Collator-E<gt>gsubst($string, $substring, $replacement)> 1712 1713 If C<$substring> matches a part of C<$string>, 1714 all the occurrences of the matching part is replaced by C<$replacement> 1715 (C<$string> is modified) and return C<$count>. 1716 1717 C<$replacement> can be a C<CODEREF>, 1718 taking the matching part as an argument, 1719 and returning a string to replace the matching part 1720 (a bit similar to C<s/(..)/$coderef-E<gt>($1)/eg>). 1721 1722 e.g. 1723 1724 my $Collator = Unicode::Collate->new( normalization => undef, level => 1 ); 1725 # (normalization => undef) is REQUIRED. 1726 my $str = "Camel donkey zebra came\x{301}l CAMEL horse cAm\0E\0L..."; 1727 $Collator->gsubst($str, "camel", sub { "<b>$_[0]</b>" }); 1728 1729 # now $str is "<b>Camel</b> donkey zebra <b>came\x{301}l</b> <b>CAMEL</b> horse <b>cAm\0E\0L</b>..."; 1730 # i.e., all the camels are made bold-faced. 1731 1732 =back 1733 1734 =head2 Other Methods 1735 1736 =over 4 1737 1738 =item C<%old_tailoring = $Collator-E<gt>change(%new_tailoring)> 1739 1740 Change the value of specified keys and returns the changed part. 1741 1742 $Collator = Unicode::Collate->new(level => 4); 1743 1744 $Collator->eq("perl", "PERL"); # false 1745 1746 %old = $Collator->change(level => 2); # returns (level => 4). 1747 1748 $Collator->eq("perl", "PERL"); # true 1749 1750 $Collator->change(%old); # returns (level => 2). 1751 1752 $Collator->eq("perl", "PERL"); # false 1753 1754 Not all C<(key,value)>s are allowed to be changed. 1755 See also C<@Unicode::Collate::ChangeOK> and C<@Unicode::Collate::ChangeNG>. 1756 1757 In the scalar context, returns the modified collator 1758 (but it is B<not> a clone from the original). 1759 1760 $Collator->change(level => 2)->eq("perl", "PERL"); # true 1761 1762 $Collator->eq("perl", "PERL"); # true; now max level is 2nd. 1763 1764 $Collator->change(level => 4)->eq("perl", "PERL"); # false 1765 1766 =item C<$version = $Collator-E<gt>version()> 1767 1768 Returns the version number (a string) of the Unicode Standard 1769 which the C<table> file used by the collator object is based on. 1770 If the table does not include a version line (starting with C<@version>), 1771 returns C<"unknown">. 1772 1773 =item C<UCA_Version()> 1774 1775 Returns the tracking version number of UTS #10 this module consults. 1776 1777 =item C<Base_Unicode_Version()> 1778 1779 Returns the version number of UTS #10 this module consults. 1780 1781 =back 1782 1783 =head1 EXPORT 1784 1785 No method will be exported. 1786 1787 =head1 INSTALL 1788 1789 Though this module can be used without any C<table> file, 1790 to use this module easily, it is recommended to install a table file 1791 in the UCA format, by copying it under the directory 1792 <a place in @INC>/Unicode/Collate. 1793 1794 The most preferable one is "The Default Unicode Collation Element Table" 1795 (aka DUCET), available from the Unicode Consortium's website: 1796 1797 http://www.unicode.org/Public/UCA/ 1798 1799 http://www.unicode.org/Public/UCA/latest/allkeys.txt (latest version) 1800 1801 If DUCET is not installed, it is recommended to copy the file 1802 from http://www.unicode.org/Public/UCA/latest/allkeys.txt 1803 to <a place in @INC>/Unicode/Collate/allkeys.txt 1804 manually. 1805 1806 =head1 CAVEATS 1807 1808 =over 4 1809 1810 =item Normalization 1811 1812 Use of the C<normalization> parameter requires the B<Unicode::Normalize> 1813 module (see L<Unicode::Normalize>). 1814 1815 If you need not it (say, in the case when you need not 1816 handle any combining characters), 1817 assign C<normalization =E<gt> undef> explicitly. 1818 1819 -- see 6.5 Avoiding Normalization, UTS #10. 1820 1821 =item Conformance Test 1822 1823 The Conformance Test for the UCA is available 1824 under L<http://www.unicode.org/Public/UCA/>. 1825 1826 For F<CollationTest_SHIFTED.txt>, 1827 a collator via C<Unicode::Collate-E<gt>new( )> should be used; 1828 for F<CollationTest_NON_IGNORABLE.txt>, a collator via 1829 C<Unicode::Collate-E<gt>new(variable =E<gt> "non-ignorable", level =E<gt> 3)>. 1830 1831 B<Unicode::Normalize is required to try The Conformance Test.> 1832 1833 =back 1834 1835 =head1 AUTHOR, COPYRIGHT AND LICENSE 1836 1837 The Unicode::Collate module for perl was written by SADAHIRO Tomoyuki, 1838 <SADAHIRO@cpan.org>. This module is Copyright(C) 2001-2005, 1839 SADAHIRO Tomoyuki. Japan. All rights reserved. 1840 1841 This module is free software; you can redistribute it and/or 1842 modify it under the same terms as Perl itself. 1843 1844 The file Unicode/Collate/allkeys.txt was copied directly 1845 from L<http://www.unicode.org/Public/UCA/4.1.0/allkeys.txt>. 1846 This file is Copyright (c) 1991-2005 Unicode, Inc. All rights reserved. 1847 Distributed under the Terms of Use in L<http://www.unicode.org/copyright.html>. 1848 1849 =head1 SEE ALSO 1850 1851 =over 4 1852 1853 =item Unicode Collation Algorithm - UTS #10 1854 1855 L<http://www.unicode.org/reports/tr10/> 1856 1857 =item The Default Unicode Collation Element Table (DUCET) 1858 1859 L<http://www.unicode.org/Public/UCA/latest/allkeys.txt> 1860 1861 =item The conformance test for the UCA 1862 1863 L<http://www.unicode.org/Public/UCA/latest/CollationTest.html> 1864 1865 L<http://www.unicode.org/Public/UCA/latest/CollationTest.zip> 1866 1867 =item Hangul Syllable Type 1868 1869 L<http://www.unicode.org/Public/UNIDATA/HangulSyllableType.txt> 1870 1871 =item Unicode Normalization Forms - UAX #15 1872 1873 L<http://www.unicode.org/reports/tr15/> 1874 1875 =back 1876 1877 =cut
title
Description
Body
title
Description
Body
title
Description
Body
title
Body
Generated: Tue Mar 17 22:47:18 2015 | Cross-referenced by PHPXref 0.7.1 |