PHPXRef 0.7.1 : Unnamed Project : /se3-unattended/var/se3/unattended/install/linuxaux/opt/perl/lib/5.10.0/Text/Balanced.pm source

[Summary view] [Print] [Text view]
   1  # EXTRACT VARIOUSLY DELIMITED TEXT SEQUENCES FROM STRINGS.
   2  # FOR FULL DOCUMENTATION SEE Balanced.pod
   3  
   4  use 5.005;
   5  use strict;
   6  
   7  package Text::Balanced;
   8  
   9  use Exporter;
  10  use SelfLoader;
  11  use vars qw { $VERSION @ISA %EXPORT_TAGS };
  12  
  13  use version; $VERSION = qv('2.0.0');
  14  @ISA        = qw ( Exporter );
  15               
  16  %EXPORT_TAGS    = ( ALL => [ qw(
  17                  &extract_delimited
  18                  &extract_bracketed
  19                  &extract_quotelike
  20                  &extract_codeblock
  21                  &extract_variable
  22                  &extract_tagged
  23                  &extract_multiple
  24  
  25                  &gen_delimited_pat
  26                  &gen_extract_tagged
  27  
  28                  &delimited_pat
  29                     ) ] );
  30  
  31  Exporter::export_ok_tags('ALL');
  32  
  33  # PROTOTYPES
  34  
  35  sub _match_bracketed($$$$$$);
  36  sub _match_variable($$);
  37  sub _match_codeblock($$$$$$$);
  38  sub _match_quotelike($$$$);
  39  
  40  # HANDLE RETURN VALUES IN VARIOUS CONTEXTS
  41  
  42  sub _failmsg {
  43      my ($message, $pos) = @_;
  44      $@ = bless { error=>$message, pos=>$pos }, "Text::Balanced::ErrorMsg";
  45  }
  46  
  47  sub _fail
  48  {
  49      my ($wantarray, $textref, $message, $pos) = @_;
  50      _failmsg $message, $pos if $message;
  51      return (undef,$$textref,undef) if $wantarray;
  52      return undef;
  53  }
  54  
  55  sub _succeed
  56  {
  57      $@ = undef;
  58      my ($wantarray,$textref) = splice @_, 0, 2;
  59      my ($extrapos, $extralen) = @_>18 ? splice(@_, -2, 2) : (0,0);
  60      my ($startlen, $oppos) = @_[5,6];
  61      my $remainderpos = $_[2];
  62      if ($wantarray)
  63      {
  64          my @res;
  65          while (my ($from, $len) = splice @_, 0, 2)
  66          {
  67              push @res, substr($$textref,$from,$len);
  68          }
  69          if ($extralen) {    # CORRECT FILLET
  70              my $extra = substr($res[0], $extrapos-$oppos, $extralen, "\n");
  71              $res[1] = "$extra$res[1]";
  72              eval { substr($$textref,$remainderpos,0) = $extra;
  73                     substr($$textref,$extrapos,$extralen,"\n")} ;
  74                  #REARRANGE HERE DOC AND FILLET IF POSSIBLE
  75              pos($$textref) = $remainderpos-$extralen+1; # RESET \G
  76          }
  77          else {
  78              pos($$textref) = $remainderpos;            # RESET \G
  79          }
  80          return @res;
  81      }
  82      else
  83      {
  84          my $match = substr($$textref,$_[0],$_[1]);
  85          substr($match,$extrapos-$_[0]-$startlen,$extralen,"") if $extralen;
  86          my $extra = $extralen
  87              ? substr($$textref, $extrapos, $extralen)."\n" : "";
  88          eval {substr($$textref,$_[4],$_[1]+$_[5])=$extra} ;    #CHOP OUT PREFIX & MATCH, IF POSSIBLE
  89          pos($$textref) = $_[4];                # RESET \G
  90          return $match;
  91      }
  92  }
  93  
  94  # BUILD A PATTERN MATCHING A SIMPLE DELIMITED STRING
  95  
  96  sub gen_delimited_pat($;$)  # ($delimiters;$escapes)
  97  {
  98      my ($dels, $escs) = @_;
  99      return "" unless $dels =~ /\S/;
 100      $escs = '\\' unless $escs;
 101      $escs .= substr($escs,-1) x (length($dels)-length($escs));
 102      my @pat = ();
 103      my $i;
 104      for ($i=0; $i<length $dels; $i++)
 105      {
 106          my $del = quotemeta substr($dels,$i,1);
 107          my $esc = quotemeta substr($escs,$i,1);
 108          if ($del eq $esc)
 109          {
 110              push @pat, "$del(?:[^$del]*(?:(?:$del$del)[^$del]*)*)$del";
 111          }
 112          else
 113          {
 114              push @pat, "$del(?:[^$esc$del]*(?:$esc.[^$esc$del]*)*)$del";
 115          }
 116      }
 117      my $pat = join '|', @pat;
 118      return "(?:$pat)";
 119  }
 120  
 121  *delimited_pat = \&gen_delimited_pat;
 122  
 123  
 124  # THE EXTRACTION FUNCTIONS
 125  
 126  sub extract_delimited (;$$$$)
 127  {
 128      my $textref = defined $_[0] ? \$_[0] : \$_;
 129      my $wantarray = wantarray;
 130      my $del  = defined $_[1] ? $_[1] : qq{\'\"\`};
 131      my $pre  = defined $_[2] ? $_[2] : '\s*';
 132      my $esc  = defined $_[3] ? $_[3] : qq{\\};
 133      my $pat = gen_delimited_pat($del, $esc);
 134      my $startpos = pos $$textref || 0;
 135      return _fail($wantarray, $textref, "Not a delimited pattern", 0)
 136          unless $$textref =~ m/\G($pre)($pat)/gc;
 137      my $prelen = length($1);
 138      my $matchpos = $startpos+$prelen;
 139      my $endpos = pos $$textref;
 140      return _succeed $wantarray, $textref,
 141              $matchpos, $endpos-$matchpos,        # MATCH
 142              $endpos,   length($$textref)-$endpos,    # REMAINDER
 143              $startpos, $prelen;            # PREFIX
 144  }
 145  
 146  sub extract_bracketed (;$$$)
 147  {
 148      my $textref = defined $_[0] ? \$_[0] : \$_;
 149      my $ldel = defined $_[1] ? $_[1] : '{([<';
 150      my $pre  = defined $_[2] ? $_[2] : '\s*';
 151      my $wantarray = wantarray;
 152      my $qdel = "";
 153      my $quotelike;
 154      $ldel =~ s/'//g and $qdel .= q{'};
 155      $ldel =~ s/"//g and $qdel .= q{"};
 156      $ldel =~ s/`//g and $qdel .= q{`};
 157      $ldel =~ s/q//g and $quotelike = 1;
 158      $ldel =~ tr/[](){}<>\0-\377/[[(({{<</ds;
 159      my $rdel = $ldel;
 160      unless ($rdel =~ tr/[({</])}>/)
 161          {
 162          return _fail $wantarray, $textref,
 163                   "Did not find a suitable bracket in delimiter: \"$_[1]\"",
 164                   0;
 165      }
 166      my $posbug = pos;
 167      $ldel = join('|', map { quotemeta $_ } split('', $ldel));
 168      $rdel = join('|', map { quotemeta $_ } split('', $rdel));
 169      pos = $posbug;
 170  
 171      my $startpos = pos $$textref || 0;
 172      my @match = _match_bracketed($textref,$pre, $ldel, $qdel, $quotelike, $rdel);
 173  
 174      return _fail ($wantarray, $textref) unless @match;
 175  
 176      return _succeed ( $wantarray, $textref,
 177                $match[2], $match[5]+2,    # MATCH
 178                @match[8,9],            # REMAINDER
 179                @match[0,1],            # PREFIX
 180              );
 181  }
 182  
 183  sub _match_bracketed($$$$$$)    # $textref, $pre, $ldel, $qdel, $quotelike, $rdel
 184  {
 185      my ($textref, $pre, $ldel, $qdel, $quotelike, $rdel) = @_;
 186      my ($startpos, $ldelpos, $endpos) = (pos $$textref = pos $$textref||0);
 187      unless ($$textref =~ m/\G$pre/gc)
 188      {
 189          _failmsg "Did not find prefix: /$pre/", $startpos;
 190          return;
 191      }
 192  
 193      $ldelpos = pos $$textref;
 194  
 195      unless ($$textref =~ m/\G($ldel)/gc)
 196      {
 197          _failmsg "Did not find opening bracket after prefix: \"$pre\"",
 198                   pos $$textref;
 199          pos $$textref = $startpos;
 200          return;
 201      }
 202  
 203      my @nesting = ( $1 );
 204      my $textlen = length $$textref;
 205      while (pos $$textref < $textlen)
 206      {
 207          next if $$textref =~ m/\G\\./gcs;
 208  
 209          if ($$textref =~ m/\G($ldel)/gc)
 210          {
 211              push @nesting, $1;
 212          }
 213          elsif ($$textref =~ m/\G($rdel)/gc)
 214          {
 215              my ($found, $brackettype) = ($1, $1);
 216              if ($#nesting < 0)
 217              {
 218                  _failmsg "Unmatched closing bracket: \"$found\"",
 219                       pos $$textref;
 220                  pos $$textref = $startpos;
 221                      return;
 222              }
 223              my $expected = pop(@nesting);
 224              $expected =~ tr/({[</)}]>/;
 225              if ($expected ne $brackettype)
 226              {
 227                  _failmsg qq{Mismatched closing bracket: expected "$expected" but found "$found"},
 228                       pos $$textref;
 229                  pos $$textref = $startpos;
 230                      return;
 231              }
 232              last if $#nesting < 0;
 233          }
 234          elsif ($qdel && $$textref =~ m/\G([$qdel])/gc)
 235          {
 236              $$textref =~ m/\G[^\\$1]*(?:\\.[^\\$1]*)*(\Q$1\E)/gsc and next;
 237              _failmsg "Unmatched embedded quote ($1)",
 238                   pos $$textref;
 239              pos $$textref = $startpos;
 240              return;
 241          }
 242          elsif ($quotelike && _match_quotelike($textref,"",1,0))
 243          {
 244              next;
 245          }
 246  
 247          else { $$textref =~ m/\G(?:[a-zA-Z0-9]+|.)/gcs }
 248      }
 249      if ($#nesting>=0)
 250      {
 251          _failmsg "Unmatched opening bracket(s): "
 252                  . join("..",@nesting)."..",
 253                   pos $$textref;
 254          pos $$textref = $startpos;
 255          return;
 256      }
 257  
 258      $endpos = pos $$textref;
 259      
 260      return (
 261          $startpos,  $ldelpos-$startpos,        # PREFIX
 262          $ldelpos,   1,                # OPENING BRACKET
 263          $ldelpos+1, $endpos-$ldelpos-2,        # CONTENTS
 264          $endpos-1,  1,                # CLOSING BRACKET
 265          $endpos,    length($$textref)-$endpos,    # REMAINDER
 266             );
 267  }
 268  
 269  sub _revbracket($)
 270  {
 271      my $brack = reverse $_[0];
 272      $brack =~ tr/[({</])}>/;
 273      return $brack;
 274  }
 275  
 276  my $XMLNAME = q{[a-zA-Z_:][a-zA-Z0-9_:.-]*};
 277  
 278  sub extract_tagged (;$$$$$) # ($text, $opentag, $closetag, $pre, \%options)
 279  {
 280      my $textref = defined $_[0] ? \$_[0] : \$_;
 281      my $ldel    = $_[1];
 282      my $rdel    = $_[2];
 283      my $pre     = defined $_[3] ? $_[3] : '\s*';
 284      my %options = defined $_[4] ? %{$_[4]} : ();
 285      my $omode   = defined $options{fail} ? $options{fail} : '';
 286      my $bad     = ref($options{reject}) eq 'ARRAY' ? join('|', @{$options{reject}})
 287              : defined($options{reject})           ? $options{reject}
 288              :                     ''
 289              ;
 290      my $ignore  = ref($options{ignore}) eq 'ARRAY' ? join('|', @{$options{ignore}})
 291              : defined($options{ignore})           ? $options{ignore}
 292              :                     ''
 293              ;
 294  
 295      if (!defined $ldel) { $ldel = '<\w+(?:' . gen_delimited_pat(q{'"}) . '|[^>])*>'; }
 296      $@ = undef;
 297  
 298      my @match = _match_tagged($textref, $pre, $ldel, $rdel, $omode, $bad, $ignore);
 299  
 300      return _fail(wantarray, $textref) unless @match;
 301      return _succeed wantarray, $textref,
 302              $match[2], $match[3]+$match[5]+$match[7],    # MATCH
 303              @match[8..9,0..1,2..7];                # REM, PRE, BITS
 304  }
 305  
 306  sub _match_tagged    # ($$$$$$$)
 307  {
 308      my ($textref, $pre, $ldel, $rdel, $omode, $bad, $ignore) = @_;
 309      my $rdelspec;
 310  
 311      my ($startpos, $opentagpos, $textpos, $parapos, $closetagpos, $endpos) = ( pos($$textref) = pos($$textref)||0 );
 312  
 313      unless ($$textref =~ m/\G($pre)/gc)
 314      {
 315          _failmsg "Did not find prefix: /$pre/", pos $$textref;
 316          goto failed;
 317      }
 318  
 319      $opentagpos = pos($$textref);
 320  
 321      unless ($$textref =~ m/\G$ldel/gc)
 322      {
 323          _failmsg "Did not find opening tag: /$ldel/", pos $$textref;
 324          goto failed;
 325      }
 326  
 327      $textpos = pos($$textref);
 328  
 329      if (!defined $rdel)
 330      {
 331          $rdelspec = substr($$textref, $-[0], $+[0] - $-[0]);
 332          unless ($rdelspec =~ s/\A([[(<{]+)($XMLNAME).*/ quotemeta "$1\/$2". _revbracket($1) /oes)
 333          {
 334              _failmsg "Unable to construct closing tag to match: $rdel",
 335                   pos $$textref;
 336              goto failed;
 337          }
 338      }
 339      else
 340      {
 341          $rdelspec = eval "qq{$rdel}" || do {
 342              my $del;
 343              for (qw,~ ! ^ & * ) _ + - = } ] : " ; ' > . ? / | ',)
 344                  { next if $rdel =~ /\Q$_/; $del = $_; last }
 345              unless ($del) {
 346                  use Carp;
 347                  croak "Can't interpolate right delimiter $rdel"
 348              }
 349              eval "qq$del$rdel$del";
 350          };
 351      }
 352  
 353      while (pos($$textref) < length($$textref))
 354      {
 355          next if $$textref =~ m/\G\\./gc;
 356  
 357          if ($$textref =~ m/\G(\n[ \t]*\n)/gc )
 358          {
 359              $parapos = pos($$textref) - length($1)
 360                  unless defined $parapos;
 361          }
 362          elsif ($$textref =~ m/\G($rdelspec)/gc )
 363          {
 364              $closetagpos = pos($$textref)-length($1);
 365              goto matched;
 366          }
 367          elsif ($ignore && $$textref =~ m/\G(?:$ignore)/gc)
 368          {
 369              next;
 370          }
 371          elsif ($bad && $$textref =~ m/\G($bad)/gcs)
 372          {
 373              pos($$textref) -= length($1);    # CUT OFF WHATEVER CAUSED THE SHORTNESS
 374              goto short if ($omode eq 'PARA' || $omode eq 'MAX');
 375              _failmsg "Found invalid nested tag: $1", pos $$textref;
 376              goto failed;
 377          }
 378          elsif ($$textref =~ m/\G($ldel)/gc)
 379          {
 380              my $tag = $1;
 381              pos($$textref) -= length($tag);    # REWIND TO NESTED TAG
 382              unless (_match_tagged(@_))    # MATCH NESTED TAG
 383              {
 384                  goto short if $omode eq 'PARA' || $omode eq 'MAX';
 385                  _failmsg "Found unbalanced nested tag: $tag",
 386                       pos $$textref;
 387                  goto failed;
 388              }
 389          }
 390          else { $$textref =~ m/./gcs }
 391      }
 392  
 393  short:
 394      $closetagpos = pos($$textref);
 395      goto matched if $omode eq 'MAX';
 396      goto failed unless $omode eq 'PARA';
 397  
 398      if (defined $parapos) { pos($$textref) = $parapos }
 399      else              { $parapos = pos($$textref) }
 400  
 401      return (
 402          $startpos,    $opentagpos-$startpos,        # PREFIX
 403          $opentagpos,  $textpos-$opentagpos,        # OPENING TAG
 404          $textpos,     $parapos-$textpos,        # TEXT
 405          $parapos,     0,                # NO CLOSING TAG
 406          $parapos,     length($$textref)-$parapos,    # REMAINDER
 407             );
 408      
 409  matched:
 410      $endpos = pos($$textref);
 411      return (
 412          $startpos,    $opentagpos-$startpos,        # PREFIX
 413          $opentagpos,  $textpos-$opentagpos,        # OPENING TAG
 414          $textpos,     $closetagpos-$textpos,        # TEXT
 415          $closetagpos, $endpos-$closetagpos,        # CLOSING TAG
 416          $endpos,      length($$textref)-$endpos,    # REMAINDER
 417             );
 418  
 419  failed:
 420      _failmsg "Did not find closing tag", pos $$textref unless $@;
 421      pos($$textref) = $startpos;
 422      return;
 423  }
 424  
 425  sub extract_variable (;$$)
 426  {
 427      my $textref = defined $_[0] ? \$_[0] : \$_;
 428      return ("","","") unless defined $$textref;
 429      my $pre  = defined $_[1] ? $_[1] : '\s*';
 430  
 431      my @match = _match_variable($textref,$pre);
 432  
 433      return _fail wantarray, $textref unless @match;
 434  
 435      return _succeed wantarray, $textref,
 436              @match[2..3,4..5,0..1];        # MATCH, REMAINDER, PREFIX
 437  }
 438  
 439  sub _match_variable($$)
 440  {
 441  #  $#
 442  #  $^
 443  #  $$
 444      my ($textref, $pre) = @_;
 445      my $startpos = pos($$textref) = pos($$textref)||0;
 446      unless ($$textref =~ m/\G($pre)/gc)
 447      {
 448          _failmsg "Did not find prefix: /$pre/", pos $$textref;
 449          return;
 450      }
 451      my $varpos = pos($$textref);
 452          unless ($$textref =~ m{\G\$\s*(?!::)(\d+|[][&`'+*./|,";%=~:?!\@<>()-]|\^[a-z]?)}gci)
 453      {
 454          unless ($$textref =~ m/\G((\$#?|[*\@\%]|\\&)+)/gc)
 455          {
 456          _failmsg "Did not find leading dereferencer", pos $$textref;
 457          pos $$textref = $startpos;
 458          return;
 459          }
 460          my $deref = $1;
 461  
 462          unless ($$textref =~ m/\G\s*(?:::|')?(?:[_a-z]\w*(?:::|'))*[_a-z]\w*/gci
 463              or _match_codeblock($textref, "", '\{', '\}', '\{', '\}', 0)
 464          or $deref eq '$#' or $deref eq '$$' )
 465          {
 466          _failmsg "Bad identifier after dereferencer", pos $$textref;
 467          pos $$textref = $startpos;
 468          return;
 469          }
 470      }
 471  
 472      while (1)
 473      {
 474          next if $$textref =~ m/\G\s*(?:->)?\s*[{]\w+[}]/gc;
 475          next if _match_codeblock($textref,
 476                       qr/\s*->\s*(?:[_a-zA-Z]\w+\s*)?/,
 477                       qr/[({[]/, qr/[)}\]]/,
 478                       qr/[({[]/, qr/[)}\]]/, 0);
 479          next if _match_codeblock($textref,
 480                       qr/\s*/, qr/[{[]/, qr/[}\]]/,
 481                       qr/[{[]/, qr/[}\]]/, 0);
 482          next if _match_variable($textref,'\s*->\s*');
 483          next if $$textref =~ m/\G\s*->\s*\w+(?![{([])/gc;
 484          last;
 485      }
 486      
 487      my $endpos = pos($$textref);
 488      return ($startpos, $varpos-$startpos,
 489          $varpos,   $endpos-$varpos,
 490          $endpos,   length($$textref)-$endpos
 491          );
 492  }
 493  
 494  sub extract_codeblock (;$$$$$)
 495  {
 496      my $textref = defined $_[0] ? \$_[0] : \$_;
 497      my $wantarray = wantarray;
 498      my $ldel_inner = defined $_[1] ? $_[1] : '{';
 499      my $pre        = defined $_[2] ? $_[2] : '\s*';
 500      my $ldel_outer = defined $_[3] ? $_[3] : $ldel_inner;
 501      my $rd         = $_[4];
 502      my $rdel_inner = $ldel_inner;
 503      my $rdel_outer = $ldel_outer;
 504      my $posbug = pos;
 505      for ($ldel_inner, $ldel_outer) { tr/[]()<>{}\0-\377/[[((<<{{/ds }
 506      for ($rdel_inner, $rdel_outer) { tr/[]()<>{}\0-\377/]]))>>}}/ds }
 507      for ($ldel_inner, $ldel_outer, $rdel_inner, $rdel_outer)
 508      {
 509          $_ = '('.join('|',map { quotemeta $_ } split('',$_)).')'
 510      }
 511      pos = $posbug;
 512  
 513      my @match = _match_codeblock($textref, $pre,
 514                       $ldel_outer, $rdel_outer,
 515                       $ldel_inner, $rdel_inner,
 516                       $rd);
 517      return _fail($wantarray, $textref) unless @match;
 518      return _succeed($wantarray, $textref,
 519              @match[2..3,4..5,0..1]    # MATCH, REMAINDER, PREFIX
 520                 );
 521  
 522  }
 523  
 524  sub _match_codeblock($$$$$$$)
 525  {
 526      my ($textref, $pre, $ldel_outer, $rdel_outer, $ldel_inner, $rdel_inner, $rd) = @_;
 527      my $startpos = pos($$textref) = pos($$textref) || 0;
 528      unless ($$textref =~ m/\G($pre)/gc)
 529      {
 530          _failmsg qq{Did not match prefix /$pre/ at"} .
 531                  substr($$textref,pos($$textref),20) .
 532                  q{..."},
 533                   pos $$textref;
 534          return; 
 535      }
 536      my $codepos = pos($$textref);
 537      unless ($$textref =~ m/\G($ldel_outer)/gc)    # OUTERMOST DELIMITER
 538      {
 539          _failmsg qq{Did not find expected opening bracket at "} .
 540                   substr($$textref,pos($$textref),20) .
 541                   q{..."},
 542                   pos $$textref;
 543          pos $$textref = $startpos;
 544          return;
 545      }
 546      my $closing = $1;
 547         $closing =~ tr/([<{/)]>}/;
 548      my $matched;
 549      my $patvalid = 1;
 550      while (pos($$textref) < length($$textref))
 551      {
 552          $matched = '';
 553          if ($rd && $$textref =~ m#\G(\Q(?)\E|\Q(s?)\E|\Q(s)\E)#gc)
 554          {
 555              $patvalid = 0;
 556              next;
 557          }
 558  
 559          if ($$textref =~ m/\G\s*#.*/gc)
 560          {
 561              next;
 562          }
 563  
 564          if ($$textref =~ m/\G\s*($rdel_outer)/gc)
 565          {
 566              unless ($matched = ($closing && $1 eq $closing) )
 567              {
 568                  next if $1 eq '>';    # MIGHT BE A "LESS THAN"
 569                  _failmsg q{Mismatched closing bracket at "} .
 570                           substr($$textref,pos($$textref),20) .
 571                           qq{...". Expected '$closing'},
 572                       pos $$textref;
 573              }
 574              last;
 575          }
 576  
 577          if (_match_variable($textref,'\s*') ||
 578              _match_quotelike($textref,'\s*',$patvalid,$patvalid) )
 579          {
 580              $patvalid = 0;
 581              next;
 582          }
 583  
 584  
 585          # NEED TO COVER MANY MORE CASES HERE!!!
 586          if ($$textref =~ m#\G\s*(?!$ldel_inner)
 587                      ( [-+*x/%^&|.]=?
 588                      | [!=]~
 589                      | =(?!>)
 590                      | (\*\*|&&|\|\||<<|>>)=?
 591                      | split|grep|map|return
 592                      | [([]
 593                      )#gcx)
 594          {
 595              $patvalid = 1;
 596              next;
 597          }
 598  
 599          if ( _match_codeblock($textref, '\s*', $ldel_inner, $rdel_inner, $ldel_inner, $rdel_inner, $rd) )
 600          {
 601              $patvalid = 1;
 602              next;
 603          }
 604  
 605          if ($$textref =~ m/\G\s*$ldel_outer/gc)
 606          {
 607              _failmsg q{Improperly nested codeblock at "} .
 608                       substr($$textref,pos($$textref),20) .
 609                       q{..."},
 610                   pos $$textref;
 611              last;
 612          }
 613  
 614          $patvalid = 0;
 615          $$textref =~ m/\G\s*(\w+|[-=>]>|.|\Z)/gc;
 616      }
 617      continue { $@ = undef }
 618  
 619      unless ($matched)
 620      {
 621          _failmsg 'No match found for opening bracket', pos $$textref
 622              unless $@;
 623          return;
 624      }
 625  
 626      my $endpos = pos($$textref);
 627      return ( $startpos, $codepos-$startpos,
 628           $codepos, $endpos-$codepos,
 629           $endpos,  length($$textref)-$endpos,
 630             );
 631  }
 632  
 633  
 634  my %mods   = (
 635          'none'    => '[cgimsox]*',
 636          'm'    => '[cgimsox]*',
 637          's'    => '[cegimsox]*',
 638          'tr'    => '[cds]*',
 639          'y'    => '[cds]*',
 640          'qq'    => '',
 641          'qx'    => '',
 642          'qw'    => '',
 643          'qr'    => '[imsx]*',
 644          'q'    => '',
 645           );
 646  
 647  sub extract_quotelike (;$$)
 648  {
 649      my $textref = $_[0] ? \$_[0] : \$_;
 650      my $wantarray = wantarray;
 651      my $pre  = defined $_[1] ? $_[1] : '\s*';
 652  
 653      my @match = _match_quotelike($textref,$pre,1,0);
 654      return _fail($wantarray, $textref) unless @match;
 655      return _succeed($wantarray, $textref,
 656              $match[2], $match[18]-$match[2],    # MATCH
 657              @match[18,19],                # REMAINDER
 658              @match[0,1],                # PREFIX
 659              @match[2..17],                # THE BITS
 660              @match[20,21],                # ANY FILLET?
 661                 );
 662  };
 663  
 664  sub _match_quotelike($$$$)    # ($textref, $prepat, $allow_raw_match)
 665  {
 666      my ($textref, $pre, $rawmatch, $qmark) = @_;
 667  
 668      my ($textlen,$startpos,
 669          $oppos,
 670          $preld1pos,$ld1pos,$str1pos,$rd1pos,
 671          $preld2pos,$ld2pos,$str2pos,$rd2pos,
 672          $modpos) = ( length($$textref), pos($$textref) = pos($$textref) || 0 );
 673  
 674      unless ($$textref =~ m/\G($pre)/gc)
 675      {
 676          _failmsg qq{Did not find prefix /$pre/ at "} .
 677                   substr($$textref, pos($$textref), 20) .
 678                   q{..."},
 679                   pos $$textref;
 680          return; 
 681      }
 682      $oppos = pos($$textref);
 683  
 684      my $initial = substr($$textref,$oppos,1);
 685  
 686      if ($initial && $initial =~ m|^[\"\'\`]|
 687               || $rawmatch && $initial =~ m|^/|
 688               || $qmark && $initial =~ m|^\?|)
 689      {
 690          unless ($$textref =~ m/ \Q$initial\E [^\\$initial]* (\\.[^\\$initial]*)* \Q$initial\E /gcsx)
 691          {
 692              _failmsg qq{Did not find closing delimiter to match '$initial' at "} .
 693                       substr($$textref, $oppos, 20) .
 694                       q{..."},
 695                   pos $$textref;
 696              pos $$textref = $startpos;
 697              return;
 698          }
 699          $modpos= pos($$textref);
 700          $rd1pos = $modpos-1;
 701  
 702          if ($initial eq '/' || $initial eq '?') 
 703          {
 704              $$textref =~ m/\G$mods{none}/gc
 705          }
 706  
 707          my $endpos = pos($$textref);
 708          return (
 709              $startpos,    $oppos-$startpos,    # PREFIX
 710              $oppos,        0,            # NO OPERATOR
 711              $oppos,        1,            # LEFT DEL
 712              $oppos+1,    $rd1pos-$oppos-1,    # STR/PAT
 713              $rd1pos,    1,            # RIGHT DEL
 714              $modpos,    0,            # NO 2ND LDEL
 715              $modpos,    0,            # NO 2ND STR
 716              $modpos,    0,            # NO 2ND RDEL
 717              $modpos,    $endpos-$modpos,    # MODIFIERS
 718              $endpos,     $textlen-$endpos,    # REMAINDER
 719                 );
 720      }
 721  
 722      unless ($$textref =~ m{\G(\b(?:m|s|qq|qx|qw|q|qr|tr|y)\b(?=\s*\S)|<<)}gc)
 723      {
 724          _failmsg q{No quotelike operator found after prefix at "} .
 725                   substr($$textref, pos($$textref), 20) .
 726                   q{..."},
 727                   pos $$textref;
 728          pos $$textref = $startpos;
 729          return;
 730      }
 731  
 732      my $op = $1;
 733      $preld1pos = pos($$textref);
 734      if ($op eq '<<') {
 735          $ld1pos = pos($$textref);
 736          my $label;
 737          if ($$textref =~ m{\G([A-Za-z_]\w*)}gc) {
 738              $label = $1;
 739          }
 740          elsif ($$textref =~ m{ \G ' ([^'\\]* (?:\\.[^'\\]*)*) '
 741                       | \G " ([^"\\]* (?:\\.[^"\\]*)*) "
 742                       | \G ` ([^`\\]* (?:\\.[^`\\]*)*) `
 743                       }gcsx) {
 744              $label = $+;
 745          }
 746          else {
 747              $label = "";
 748          }
 749          my $extrapos = pos($$textref);
 750          $$textref =~ m{.*\n}gc;
 751          $str1pos = pos($$textref)--;
 752          unless ($$textref =~ m{.*?\n(?=\Q$label\E\n)}gc) {
 753              _failmsg qq{Missing here doc terminator ('$label') after "} .
 754                       substr($$textref, $startpos, 20) .
 755                       q{..."},
 756                   pos $$textref;
 757              pos $$textref = $startpos;
 758              return;
 759          }
 760          $rd1pos = pos($$textref);
 761          $$textref =~ m{\Q$label\E\n}gc;
 762          $ld2pos = pos($$textref);
 763          return (
 764              $startpos,    $oppos-$startpos,    # PREFIX
 765              $oppos,        length($op),        # OPERATOR
 766              $ld1pos,    $extrapos-$ld1pos,    # LEFT DEL
 767              $str1pos,    $rd1pos-$str1pos,    # STR/PAT
 768              $rd1pos,    $ld2pos-$rd1pos,    # RIGHT DEL
 769              $ld2pos,    0,            # NO 2ND LDEL
 770              $ld2pos,    0,                    # NO 2ND STR
 771              $ld2pos,    0,                    # NO 2ND RDEL
 772              $ld2pos,    0,                      # NO MODIFIERS
 773              $ld2pos,    $textlen-$ld2pos,    # REMAINDER
 774              $extrapos,      $str1pos-$extrapos,    # FILLETED BIT
 775                 );
 776      }
 777  
 778      $$textref =~ m/\G\s*/gc;
 779      $ld1pos = pos($$textref);
 780      $str1pos = $ld1pos+1;
 781  
 782      unless ($$textref =~ m/\G(\S)/gc)    # SHOULD USE LOOKAHEAD
 783      {
 784          _failmsg "No block delimiter found after quotelike $op",
 785                   pos $$textref;
 786          pos $$textref = $startpos;
 787          return;
 788      }
 789      pos($$textref) = $ld1pos;    # HAVE TO DO THIS BECAUSE LOOKAHEAD BROKEN
 790      my ($ldel1, $rdel1) = ("\Q$1","\Q$1");
 791      if ($ldel1 =~ /[[(<{]/)
 792      {
 793          $rdel1 =~ tr/[({</])}>/;
 794          defined(_match_bracketed($textref,"",$ldel1,"","",$rdel1))
 795          || do { pos $$textref = $startpos; return };
 796          $ld2pos = pos($$textref);
 797          $rd1pos = $ld2pos-1;
 798      }
 799      else
 800      {
 801          $$textref =~ /\G$ldel1[^\\$ldel1]*(\\.[^\\$ldel1]*)*$ldel1/gcs
 802          || do { pos $$textref = $startpos; return };
 803          $ld2pos = $rd1pos = pos($$textref)-1;
 804      }
 805  
 806      my $second_arg = $op =~ /s|tr|y/ ? 1 : 0;
 807      if ($second_arg)
 808      {
 809          my ($ldel2, $rdel2);
 810          if ($ldel1 =~ /[[(<{]/)
 811          {
 812              unless ($$textref =~ /\G\s*(\S)/gc)    # SHOULD USE LOOKAHEAD
 813              {
 814                  _failmsg "Missing second block for quotelike $op",
 815                       pos $$textref;
 816                  pos $$textref = $startpos;
 817                  return;
 818              }
 819              $ldel2 = $rdel2 = "\Q$1";
 820              $rdel2 =~ tr/[({</])}>/;
 821          }
 822          else
 823          {
 824              $ldel2 = $rdel2 = $ldel1;
 825          }
 826          $str2pos = $ld2pos+1;
 827  
 828          if ($ldel2 =~ /[[(<{]/)
 829          {
 830              pos($$textref)--;    # OVERCOME BROKEN LOOKAHEAD 
 831              defined(_match_bracketed($textref,"",$ldel2,"","",$rdel2))
 832              || do { pos $$textref = $startpos; return };
 833          }
 834          else
 835          {
 836              $$textref =~ /[^\\$ldel2]*(\\.[^\\$ldel2]*)*$ldel2/gcs
 837              || do { pos $$textref = $startpos; return };
 838          }
 839          $rd2pos = pos($$textref)-1;
 840      }
 841      else
 842      {
 843          $ld2pos = $str2pos = $rd2pos = $rd1pos;
 844      }
 845  
 846      $modpos = pos $$textref;
 847  
 848      $$textref =~ m/\G($mods{$op})/gc;
 849      my $endpos = pos $$textref;
 850  
 851      return (
 852          $startpos,    $oppos-$startpos,    # PREFIX
 853          $oppos,        length($op),        # OPERATOR
 854          $ld1pos,    1,            # LEFT DEL
 855          $str1pos,    $rd1pos-$str1pos,    # STR/PAT
 856          $rd1pos,    1,            # RIGHT DEL
 857          $ld2pos,    $second_arg,        # 2ND LDEL (MAYBE)
 858          $str2pos,    $rd2pos-$str2pos,    # 2ND STR (MAYBE)
 859          $rd2pos,    $second_arg,        # 2ND RDEL (MAYBE)
 860          $modpos,    $endpos-$modpos,    # MODIFIERS
 861          $endpos,    $textlen-$endpos,    # REMAINDER
 862             );
 863  }
 864  
 865  my $def_func = 
 866  [
 867      sub { extract_variable($_[0], '') },
 868      sub { extract_quotelike($_[0],'') },
 869      sub { extract_codeblock($_[0],'{}','') },
 870  ];
 871  
 872  sub extract_multiple (;$$$$)    # ($text, $functions_ref, $max_fields, $ignoreunknown)
 873  {
 874      my $textref = defined($_[0]) ? \$_[0] : \$_;
 875      my $posbug = pos;
 876      my ($lastpos, $firstpos);
 877      my @fields = ();
 878  
 879      #for ($$textref)
 880      {
 881          my @func = defined $_[1] ? @{$_[1]} : @{$def_func};
 882          my $max  = defined $_[2] && $_[2]>0 ? $_[2] : 1_000_000_000;
 883          my $igunk = $_[3];
 884  
 885          pos $$textref ||= 0;
 886  
 887          unless (wantarray)
 888          {
 889              use Carp;
 890              carp "extract_multiple reset maximal count to 1 in scalar context"
 891                  if $^W && defined($_[2]) && $max > 1;
 892              $max = 1
 893          }
 894  
 895          my $unkpos;
 896          my $func;
 897          my $class;
 898  
 899          my @class;
 900          foreach $func ( @func )
 901          {
 902              if (ref($func) eq 'HASH')
 903              {
 904                  push @class, (keys %$func)[0];
 905                  $func = (values %$func)[0];
 906              }
 907              else
 908              {
 909                  push @class, undef;
 910              }
 911          }
 912  
 913          FIELD: while (pos($$textref) < length($$textref))
 914          {
 915              my ($field, $rem);
 916              my @bits;
 917              foreach my $i ( 0..$#func )
 918              {
 919                  my $pref;
 920                  $func = $func[$i];
 921                  $class = $class[$i];
 922                  $lastpos = pos $$textref;
 923                  if (ref($func) eq 'CODE')
 924                      { ($field,$rem,$pref) = @bits = $func->($$textref) }
 925                  elsif (ref($func) eq 'Text::Balanced::Extractor')
 926                      { @bits = $field = $func->extract($$textref) }
 927                  elsif( $$textref =~ m/\G$func/gc )
 928                      { @bits = $field = defined($1)
 929                                  ? $1
 930                                  : substr($$textref, $-[0], $+[0] - $-[0])
 931                      }
 932                  $pref ||= "";
 933                  if (defined($field) && length($field))
 934                  {
 935                      if (!$igunk) {
 936                          $unkpos = $lastpos
 937                              if length($pref) && !defined($unkpos);
 938                          if (defined $unkpos)
 939                          {
 940                              push @fields, substr($$textref, $unkpos, $lastpos-$unkpos).$pref;
 941                              $firstpos = $unkpos unless defined $firstpos;
 942                              undef $unkpos;
 943                              last FIELD if @fields == $max;
 944                          }
 945                      }
 946                      push @fields, $class
 947                          ? bless (\$field, $class)
 948                          : $field;
 949                      $firstpos = $lastpos unless defined $firstpos;
 950                      $lastpos = pos $$textref;
 951                      last FIELD if @fields == $max;
 952                      next FIELD;
 953                  }
 954              }
 955              if ($$textref =~ /\G(.)/gcs)
 956              {
 957                  $unkpos = pos($$textref)-1
 958                      unless $igunk || defined $unkpos;
 959              }
 960          }
 961          
 962          if (defined $unkpos)
 963          {
 964              push @fields, substr($$textref, $unkpos);
 965              $firstpos = $unkpos unless defined $firstpos;
 966              $lastpos = length $$textref;
 967          }
 968          last;
 969      }
 970  
 971      pos $$textref = $lastpos;
 972      return @fields if wantarray;
 973  
 974      $firstpos ||= 0;
 975      eval { substr($$textref,$firstpos,$lastpos-$firstpos)="";
 976             pos $$textref = $firstpos };
 977      return $fields[0];
 978  }
 979  
 980  
 981  sub gen_extract_tagged # ($opentag, $closetag, $pre, \%options)
 982  {
 983      my $ldel    = $_[0];
 984      my $rdel    = $_[1];
 985      my $pre     = defined $_[2] ? $_[2] : '\s*';
 986      my %options = defined $_[3] ? %{$_[3]} : ();
 987      my $omode   = defined $options{fail} ? $options{fail} : '';
 988      my $bad     = ref($options{reject}) eq 'ARRAY' ? join('|', @{$options{reject}})
 989              : defined($options{reject})           ? $options{reject}
 990              :                     ''
 991              ;
 992      my $ignore  = ref($options{ignore}) eq 'ARRAY' ? join('|', @{$options{ignore}})
 993              : defined($options{ignore})           ? $options{ignore}
 994              :                     ''
 995              ;
 996  
 997      if (!defined $ldel) { $ldel = '<\w+(?:' . gen_delimited_pat(q{'"}) . '|[^>])*>'; }
 998  
 999      my $posbug = pos;
1000      for ($ldel, $pre, $bad, $ignore) { $_ = qr/$_/ if $_ }
1001      pos = $posbug;
1002  
1003      my $closure = sub
1004      {
1005          my $textref = defined $_[0] ? \$_[0] : \$_;
1006          my @match = Text::Balanced::_match_tagged($textref, $pre, $ldel, $rdel, $omode, $bad, $ignore);
1007  
1008          return _fail(wantarray, $textref) unless @match;
1009          return _succeed wantarray, $textref,
1010                  $match[2], $match[3]+$match[5]+$match[7],    # MATCH
1011                  @match[8..9,0..1,2..7];                # REM, PRE, BITS
1012      };
1013  
1014      bless $closure, 'Text::Balanced::Extractor';
1015  }
1016  
1017  package Text::Balanced::Extractor;
1018  
1019  sub extract($$)    # ($self, $text)
1020  {
1021      &{$_[0]}($_[1]);
1022  }
1023  
1024  package Text::Balanced::ErrorMsg;
1025  
1026  use overload '""' => sub { "$_[0]->{error}, detected at offset $_[0]->{pos}" };
1027  
1028  1;
1029  
1030  __END__
1031  
1032  =head1 NAME
1033  
1034  Text::Balanced - Extract delimited text sequences from strings.
1035  
1036  
1037  =head1 SYNOPSIS
1038  
1039   use Text::Balanced qw (
1040              extract_delimited
1041              extract_bracketed
1042              extract_quotelike
1043              extract_codeblock
1044              extract_variable
1045              extract_tagged
1046              extract_multiple
1047  
1048              gen_delimited_pat
1049              gen_extract_tagged
1050                 );
1051  
1052   # Extract the initial substring of $text that is delimited by
1053   # two (unescaped) instances of the first character in $delim.
1054  
1055      ($extracted, $remainder) = extract_delimited($text,$delim);
1056  
1057  
1058   # Extract the initial substring of $text that is bracketed
1059   # with a delimiter(s) specified by $delim (where the string
1060   # in $delim contains one or more of '(){}[]<>').
1061  
1062      ($extracted, $remainder) = extract_bracketed($text,$delim);
1063  
1064  
1065   # Extract the initial substring of $text that is bounded by
1066   # an XML tag.
1067  
1068      ($extracted, $remainder) = extract_tagged($text);
1069  
1070  
1071   # Extract the initial substring of $text that is bounded by
1072   # a C<BEGIN>...C<END> pair. Don't allow nested C<BEGIN> tags
1073  
1074      ($extracted, $remainder) =
1075          extract_tagged($text,"BEGIN","END",undef,{bad=>["BEGIN"]});
1076  
1077  
1078   # Extract the initial substring of $text that represents a
1079   # Perl "quote or quote-like operation"
1080  
1081      ($extracted, $remainder) = extract_quotelike($text);
1082  
1083  
1084   # Extract the initial substring of $text that represents a block
1085   # of Perl code, bracketed by any of character(s) specified by $delim
1086   # (where the string $delim contains one or more of '(){}[]<>').
1087  
1088      ($extracted, $remainder) = extract_codeblock($text,$delim);
1089  
1090  
1091   # Extract the initial substrings of $text that would be extracted by
1092   # one or more sequential applications of the specified functions
1093   # or regular expressions
1094  
1095      @extracted = extract_multiple($text,
1096                        [ \&extract_bracketed,
1097                      \&extract_quotelike,
1098                      \&some_other_extractor_sub,
1099                      qr/[xyz]*/,
1100                      'literal',
1101                        ]);
1102  
1103  # Create a string representing an optimized pattern (a la Friedl)
1104  # that matches a substring delimited by any of the specified characters
1105  # (in this case: any type of quote or a slash)
1106  
1107      $patstring = gen_delimited_pat(q{'"`/});
1108  
1109  
1110  # Generate a reference to an anonymous sub that is just like extract_tagged
1111  # but pre-compiled and optimized for a specific pair of tags, and consequently
1112  # much faster (i.e. 3 times faster). It uses qr// for better performance on
1113  # repeated calls, so it only works under Perl 5.005 or later.
1114  
1115      $extract_head = gen_extract_tagged('<HEAD>','</HEAD>');
1116  
1117      ($extracted, $remainder) = $extract_head->($text);
1118  
1119  
1120  =head1 DESCRIPTION
1121  
1122  The various C<extract_...> subroutines may be used to
1123  extract a delimited substring, possibly after skipping a
1124  specified prefix string. By default, that prefix is
1125  optional whitespace (C</\s*/>), but you can change it to whatever
1126  you wish (see below).
1127  
1128  The substring to be extracted must appear at the
1129  current C<pos> location of the string's variable
1130  (or at index zero, if no C<pos> position is defined).
1131  In other words, the C<extract_...> subroutines I<don't>
1132  extract the first occurrence of a substring anywhere
1133  in a string (like an unanchored regex would). Rather,
1134  they extract an occurrence of the substring appearing
1135  immediately at the current matching position in the
1136  string (like a C<\G>-anchored regex would).
1137  
1138  
1139  
1140  =head2 General behaviour in list contexts
1141  
1142  In a list context, all the subroutines return a list, the first three
1143  elements of which are always:
1144  
1145  =over 4
1146  
1147  =item [0]
1148  
1149  The extracted string, including the specified delimiters.
1150  If the extraction fails C<undef> is returned.
1151  
1152  =item [1]
1153  
1154  The remainder of the input string (i.e. the characters after the
1155  extracted string). On failure, the entire string is returned.
1156  
1157  =item [2]
1158  
1159  The skipped prefix (i.e. the characters before the extracted string).
1160  On failure, C<undef> is returned.
1161  
1162  =back 
1163  
1164  Note that in a list context, the contents of the original input text (the first
1165  argument) are not modified in any way. 
1166  
1167  However, if the input text was passed in a variable, that variable's
1168  C<pos> value is updated to point at the first character after the
1169  extracted text. That means that in a list context the various
1170  subroutines can be used much like regular expressions. For example:
1171  
1172      while ( $next = (extract_quotelike($text))[0] )
1173      {
1174          # process next quote-like (in $next)
1175      }
1176  
1177  
1178  =head2 General behaviour in scalar and void contexts
1179  
1180  In a scalar context, the extracted string is returned, having first been
1181  removed from the input text. Thus, the following code also processes
1182  each quote-like operation, but actually removes them from $text:
1183  
1184      while ( $next = extract_quotelike($text) )
1185      {
1186          # process next quote-like (in $next)
1187      }
1188  
1189  Note that if the input text is a read-only string (i.e. a literal),
1190  no attempt is made to remove the extracted text.
1191  
1192  In a void context the behaviour of the extraction subroutines is
1193  exactly the same as in a scalar context, except (of course) that the
1194  extracted substring is not returned.
1195  
1196  =head2 A note about prefixes
1197  
1198  Prefix patterns are matched without any trailing modifiers (C</gimsox> etc.)
1199  This can bite you if you're expecting a prefix specification like
1200  '.*?(?=<H1>)' to skip everything up to the first <H1> tag. Such a prefix
1201  pattern will only succeed if the <H1> tag is on the current line, since
1202  . normally doesn't match newlines.
1203  
1204  To overcome this limitation, you need to turn on /s matching within
1205  the prefix pattern, using the C<(?s)> directive: '(?s).*?(?=<H1>)'
1206  
1207  
1208  =head2 C<extract_delimited>
1209  
1210  The C<extract_delimited> function formalizes the common idiom
1211  of extracting a single-character-delimited substring from the start of
1212  a string. For example, to extract a single-quote delimited string, the
1213  following code is typically used:
1214  
1215      ($remainder = $text) =~ s/\A('(\\.|[^'])*')//s;
1216      $extracted = $1;
1217  
1218  but with C<extract_delimited> it can be simplified to:
1219  
1220      ($extracted,$remainder) = extract_delimited($text, "'");
1221  
1222  C<extract_delimited> takes up to four scalars (the input text, the
1223  delimiters, a prefix pattern to be skipped, and any escape characters)
1224  and extracts the initial substring of the text that
1225  is appropriately delimited. If the delimiter string has multiple
1226  characters, the first one encountered in the text is taken to delimit
1227  the substring.
1228  The third argument specifies a prefix pattern that is to be skipped
1229  (but must be present!) before the substring is extracted.
1230  The final argument specifies the escape character to be used for each
1231  delimiter.
1232  
1233  All arguments are optional. If the escape characters are not specified,
1234  every delimiter is escaped with a backslash (C<\>).
1235  If the prefix is not specified, the
1236  pattern C<'\s*'> - optional whitespace - is used. If the delimiter set
1237  is also not specified, the set C</["'`]/> is used. If the text to be processed
1238  is not specified either, C<$_> is used.
1239  
1240  In list context, C<extract_delimited> returns a array of three
1241  elements, the extracted substring (I<including the surrounding
1242  delimiters>), the remainder of the text, and the skipped prefix (if
1243  any). If a suitable delimited substring is not found, the first
1244  element of the array is the empty string, the second is the complete
1245  original text, and the prefix returned in the third element is an
1246  empty string.
1247  
1248  In a scalar context, just the extracted substring is returned. In
1249  a void context, the extracted substring (and any prefix) are simply
1250  removed from the beginning of the first argument.
1251  
1252  Examples:
1253  
1254      # Remove a single-quoted substring from the very beginning of $text:
1255  
1256          $substring = extract_delimited($text, "'", '');
1257  
1258      # Remove a single-quoted Pascalish substring (i.e. one in which
1259      # doubling the quote character escapes it) from the very
1260      # beginning of $text:
1261  
1262          $substring = extract_delimited($text, "'", '', "'");
1263  
1264      # Extract a single- or double- quoted substring from the
1265      # beginning of $text, optionally after some whitespace
1266      # (note the list context to protect $text from modification):
1267  
1268          ($substring) = extract_delimited $text, q{"'};
1269  
1270  
1271      # Delete the substring delimited by the first '/' in $text:
1272  
1273          $text = join '', (extract_delimited($text,'/','[^/]*')[2,1];
1274  
1275  Note that this last example is I<not> the same as deleting the first
1276  quote-like pattern. For instance, if C<$text> contained the string:
1277  
1278      "if ('./cmd' =~ m/$UNIXCMD/s) { $cmd = $1; }"
1279      
1280  then after the deletion it would contain:
1281  
1282      "if ('.$UNIXCMD/s) { $cmd = $1; }"
1283  
1284  not:
1285  
1286      "if ('./cmd' =~ ms) { $cmd = $1; }"
1287      
1288  
1289  See L<"extract_quotelike"> for a (partial) solution to this problem.
1290  
1291  
1292  =head2 C<extract_bracketed>
1293  
1294  Like C<"extract_delimited">, the C<extract_bracketed> function takes
1295  up to three optional scalar arguments: a string to extract from, a delimiter
1296  specifier, and a prefix pattern. As before, a missing prefix defaults to
1297  optional whitespace and a missing text defaults to C<$_>. However, a missing
1298  delimiter specifier defaults to C<'{}()[]E<lt>E<gt>'> (see below).
1299  
1300  C<extract_bracketed> extracts a balanced-bracket-delimited
1301  substring (using any one (or more) of the user-specified delimiter
1302  brackets: '(..)', '{..}', '[..]', or '<..>'). Optionally it will also
1303  respect quoted unbalanced brackets (see below).
1304  
1305  A "delimiter bracket" is a bracket in list of delimiters passed as
1306  C<extract_bracketed>'s second argument. Delimiter brackets are
1307  specified by giving either the left or right (or both!) versions
1308  of the required bracket(s). Note that the order in which
1309  two or more delimiter brackets are specified is not significant.
1310  
1311  A "balanced-bracket-delimited substring" is a substring bounded by
1312  matched brackets, such that any other (left or right) delimiter
1313  bracket I<within> the substring is also matched by an opposite
1314  (right or left) delimiter bracket I<at the same level of nesting>. Any
1315  type of bracket not in the delimiter list is treated as an ordinary
1316  character.
1317  
1318  In other words, each type of bracket specified as a delimiter must be
1319  balanced and correctly nested within the substring, and any other kind of
1320  ("non-delimiter") bracket in the substring is ignored.
1321  
1322  For example, given the string:
1323  
1324      $text = "{ an '[irregularly :-(] {} parenthesized >:-)' string }";
1325  
1326  then a call to C<extract_bracketed> in a list context:
1327  
1328      @result = extract_bracketed( $text, '{}' );
1329  
1330  would return:
1331  
1332      ( "{ an '[irregularly :-(] {} parenthesized >:-)' string }" , "" , "" )
1333  
1334  since both sets of C<'{..}'> brackets are properly nested and evenly balanced.
1335  (In a scalar context just the first element of the array would be returned. In
1336  a void context, C<$text> would be replaced by an empty string.)
1337  
1338  Likewise the call in:
1339  
1340      @result = extract_bracketed( $text, '{[' );
1341  
1342  would return the same result, since all sets of both types of specified
1343  delimiter brackets are correctly nested and balanced.
1344  
1345  However, the call in:
1346  
1347      @result = extract_bracketed( $text, '{([<' );
1348  
1349  would fail, returning:
1350  
1351      ( undef , "{ an '[irregularly :-(] {} parenthesized >:-)' string }"  );
1352  
1353  because the embedded pairs of C<'(..)'>s and C<'[..]'>s are "cross-nested" and
1354  the embedded C<'E<gt>'> is unbalanced. (In a scalar context, this call would
1355  return an empty string. In a void context, C<$text> would be unchanged.)
1356  
1357  Note that the embedded single-quotes in the string don't help in this
1358  case, since they have not been specified as acceptable delimiters and are
1359  therefore treated as non-delimiter characters (and ignored).
1360  
1361  However, if a particular species of quote character is included in the
1362  delimiter specification, then that type of quote will be correctly handled.
1363  for example, if C<$text> is:
1364  
1365      $text = '<A HREF=">>>>">link</A>';
1366  
1367  then
1368  
1369      @result = extract_bracketed( $text, '<">' );
1370  
1371  returns:
1372  
1373      ( '<A HREF=">>>>">', 'link</A>', "" )
1374  
1375  as expected. Without the specification of C<"> as an embedded quoter:
1376  
1377      @result = extract_bracketed( $text, '<>' );
1378  
1379  the result would be:
1380  
1381      ( '<A HREF=">', '>>>">link</A>', "" )
1382  
1383  In addition to the quote delimiters C<'>, C<">, and C<`>, full Perl quote-like
1384  quoting (i.e. q{string}, qq{string}, etc) can be specified by including the
1385  letter 'q' as a delimiter. Hence:
1386  
1387      @result = extract_bracketed( $text, '<q>' );
1388  
1389  would correctly match something like this:
1390  
1391      $text = '<leftop: conj /and/ conj>';
1392  
1393  See also: C<"extract_quotelike"> and C<"extract_codeblock">.
1394  
1395  
1396  =head2 C<extract_variable>
1397  
1398  C<extract_variable> extracts any valid Perl variable or
1399  variable-involved expression, including scalars, arrays, hashes, array
1400  accesses, hash look-ups, method calls through objects, subroutine calls
1401  through subroutine references, etc.
1402  
1403  The subroutine takes up to two optional arguments:
1404  
1405  =over 4
1406  
1407  =item 1.
1408  
1409  A string to be processed (C<$_> if the string is omitted or C<undef>)
1410  
1411  =item 2.
1412  
1413  A string specifying a pattern to be matched as a prefix (which is to be
1414  skipped). If omitted, optional whitespace is skipped.
1415  
1416  =back
1417  
1418  On success in a list context, an array of 3 elements is returned. The
1419  elements are:
1420  
1421  =over 4
1422  
1423  =item [0]
1424  
1425  the extracted variable, or variablish expression
1426  
1427  =item [1]
1428  
1429  the remainder of the input text,
1430  
1431  =item [2]
1432  
1433  the prefix substring (if any),
1434  
1435  =back
1436  
1437  On failure, all of these values (except the remaining text) are C<undef>.
1438  
1439  In a scalar context, C<extract_variable> returns just the complete
1440  substring that matched a variablish expression. C<undef> is returned on
1441  failure. In addition, the original input text has the returned substring
1442  (and any prefix) removed from it.
1443  
1444  In a void context, the input text just has the matched substring (and
1445  any specified prefix) removed.
1446  
1447  
1448  =head2 C<extract_tagged>
1449  
1450  C<extract_tagged> extracts and segments text between (balanced)
1451  specified tags. 
1452  
1453  The subroutine takes up to five optional arguments:
1454  
1455  =over 4
1456  
1457  =item 1.
1458  
1459  A string to be processed (C<$_> if the string is omitted or C<undef>)
1460  
1461  =item 2.
1462  
1463  A string specifying a pattern to be matched as the opening tag.
1464  If the pattern string is omitted (or C<undef>) then a pattern
1465  that matches any standard XML tag is used.
1466  
1467  =item 3.
1468  
1469  A string specifying a pattern to be matched at the closing tag. 
1470  If the pattern string is omitted (or C<undef>) then the closing
1471  tag is constructed by inserting a C</> after any leading bracket
1472  characters in the actual opening tag that was matched (I<not> the pattern
1473  that matched the tag). For example, if the opening tag pattern
1474  is specified as C<'{{\w+}}'> and actually matched the opening tag 
1475  C<"{{DATA}}">, then the constructed closing tag would be C<"{{/DATA}}">.
1476  
1477  =item 4.
1478  
1479  A string specifying a pattern to be matched as a prefix (which is to be
1480  skipped). If omitted, optional whitespace is skipped.
1481  
1482  =item 5.
1483  
1484  A hash reference containing various parsing options (see below)
1485  
1486  =back
1487  
1488  The various options that can be specified are:
1489  
1490  =over 4
1491  
1492  =item C<reject =E<gt> $listref>
1493  
1494  The list reference contains one or more strings specifying patterns
1495  that must I<not> appear within the tagged text.
1496  
1497  For example, to extract
1498  an HTML link (which should not contain nested links) use:
1499  
1500          extract_tagged($text, '<A>', '</A>', undef, {reject => ['<A>']} );
1501  
1502  =item C<ignore =E<gt> $listref>
1503  
1504  The list reference contains one or more strings specifying patterns
1505  that are I<not> be be treated as nested tags within the tagged text
1506  (even if they would match the start tag pattern).
1507  
1508  For example, to extract an arbitrary XML tag, but ignore "empty" elements:
1509  
1510          extract_tagged($text, undef, undef, undef, {ignore => ['<[^>]*/>']} );
1511  
1512  (also see L<"gen_delimited_pat"> below).
1513  
1514  
1515  =item C<fail =E<gt> $str>
1516  
1517  The C<fail> option indicates the action to be taken if a matching end
1518  tag is not encountered (i.e. before the end of the string or some
1519  C<reject> pattern matches). By default, a failure to match a closing
1520  tag causes C<extract_tagged> to immediately fail.
1521  
1522  However, if the string value associated with <reject> is "MAX", then
1523  C<extract_tagged> returns the complete text up to the point of failure.
1524  If the string is "PARA", C<extract_tagged> returns only the first paragraph
1525  after the tag (up to the first line that is either empty or contains
1526  only whitespace characters).
1527  If the string is "", the the default behaviour (i.e. failure) is reinstated.
1528  
1529  For example, suppose the start tag "/para" introduces a paragraph, which then
1530  continues until the next "/endpara" tag or until another "/para" tag is
1531  encountered:
1532  
1533          $text = "/para line 1\n\nline 3\n/para line 4";
1534  
1535          extract_tagged($text, '/para', '/endpara', undef,
1536                                  {reject => '/para', fail => MAX );
1537  
1538          # EXTRACTED: "/para line 1\n\nline 3\n"
1539  
1540  Suppose instead, that if no matching "/endpara" tag is found, the "/para"
1541  tag refers only to the immediately following paragraph:
1542  
1543          $text = "/para line 1\n\nline 3\n/para line 4";
1544  
1545          extract_tagged($text, '/para', '/endpara', undef,
1546                          {reject => '/para', fail => MAX );
1547  
1548          # EXTRACTED: "/para line 1\n"
1549  
1550  Note that the specified C<fail> behaviour applies to nested tags as well.
1551  
1552  =back
1553  
1554  On success in a list context, an array of 6 elements is returned. The elements are:
1555  
1556  =over 4
1557  
1558  =item [0]
1559  
1560  the extracted tagged substring (including the outermost tags),
1561  
1562  =item [1]
1563  
1564  the remainder of the input text,
1565  
1566  =item [2]
1567  
1568  the prefix substring (if any),
1569  
1570  =item [3]
1571  
1572  the opening tag
1573  
1574  =item [4]
1575  
1576  the text between the opening and closing tags
1577  
1578  =item [5]
1579  
1580  the closing tag (or "" if no closing tag was found)
1581  
1582  =back
1583  
1584  On failure, all of these values (except the remaining text) are C<undef>.
1585  
1586  In a scalar context, C<extract_tagged> returns just the complete
1587  substring that matched a tagged text (including the start and end
1588  tags). C<undef> is returned on failure. In addition, the original input
1589  text has the returned substring (and any prefix) removed from it.
1590  
1591  In a void context, the input text just has the matched substring (and
1592  any specified prefix) removed.
1593  
1594  
1595  =head2 C<gen_extract_tagged>
1596  
1597  (Note: This subroutine is only available under Perl5.005)
1598  
1599  C<gen_extract_tagged> generates a new anonymous subroutine which
1600  extracts text between (balanced) specified tags. In other words,
1601  it generates a function identical in function to C<extract_tagged>.
1602  
1603  The difference between C<extract_tagged> and the anonymous
1604  subroutines generated by
1605  C<gen_extract_tagged>, is that those generated subroutines:
1606  
1607  =over 4
1608  
1609  =item * 
1610  
1611  do not have to reparse tag specification or parsing options every time
1612  they are called (whereas C<extract_tagged> has to effectively rebuild
1613  its tag parser on every call);
1614  
1615  =item *
1616  
1617  make use of the new qr// construct to pre-compile the regexes they use
1618  (whereas C<extract_tagged> uses standard string variable interpolation 
1619  to create tag-matching patterns).
1620  
1621  =back
1622  
1623  The subroutine takes up to four optional arguments (the same set as
1624  C<extract_tagged> except for the string to be processed). It returns
1625  a reference to a subroutine which in turn takes a single argument (the text to
1626  be extracted from).
1627  
1628  In other words, the implementation of C<extract_tagged> is exactly
1629  equivalent to:
1630  
1631          sub extract_tagged
1632          {
1633                  my $text = shift;
1634                  $extractor = gen_extract_tagged(@_);
1635                  return $extractor->($text);
1636          }
1637  
1638  (although C<extract_tagged> is not currently implemented that way, in order
1639  to preserve pre-5.005 compatibility).
1640  
1641  Using C<gen_extract_tagged> to create extraction functions for specific tags 
1642  is a good idea if those functions are going to be called more than once, since
1643  their performance is typically twice as good as the more general-purpose
1644  C<extract_tagged>.
1645  
1646  
1647  =head2 C<extract_quotelike>
1648  
1649  C<extract_quotelike> attempts to recognize, extract, and segment any
1650  one of the various Perl quotes and quotelike operators (see
1651  L<perlop(3)>) Nested backslashed delimiters, embedded balanced bracket
1652  delimiters (for the quotelike operators), and trailing modifiers are
1653  all caught. For example, in:
1654  
1655          extract_quotelike 'q # an octothorpe: \# (not the end of the q!) #'
1656          
1657          extract_quotelike '  "You said, \"Use sed\"."  '
1658  
1659          extract_quotelike ' s{([A-Z]{1,8}\.[A-Z]{3})} /\L$1\E/; '
1660  
1661          extract_quotelike ' tr/\\\/\\\\/\\\//ds; '
1662  
1663  the full Perl quotelike operations are all extracted correctly.
1664  
1665  Note too that, when using the /x modifier on a regex, any comment
1666  containing the current pattern delimiter will cause the regex to be
1667  immediately terminated. In other words:
1668  
1669          'm /
1670                  (?i)            # CASE INSENSITIVE
1671                  [a-z_]          # LEADING ALPHABETIC/UNDERSCORE
1672                  [a-z0-9]*       # FOLLOWED BY ANY NUMBER OF ALPHANUMERICS
1673             /x'
1674  
1675  will be extracted as if it were:
1676  
1677          'm /
1678                  (?i)            # CASE INSENSITIVE
1679                  [a-z_]          # LEADING ALPHABETIC/'
1680  
1681  This behaviour is identical to that of the actual compiler.
1682  
1683  C<extract_quotelike> takes two arguments: the text to be processed and
1684  a prefix to be matched at the very beginning of the text. If no prefix 
1685  is specified, optional whitespace is the default. If no text is given,
1686  C<$_> is used.
1687  
1688  In a list context, an array of 11 elements is returned. The elements are:
1689  
1690  =over 4
1691  
1692  =item [0]
1693  
1694  the extracted quotelike substring (including trailing modifiers),
1695  
1696  =item [1]
1697  
1698  the remainder of the input text,
1699  
1700  =item [2]
1701  
1702  the prefix substring (if any),
1703  
1704  =item [3]
1705  
1706  the name of the quotelike operator (if any),
1707  
1708  =item [4]
1709  
1710  the left delimiter of the first block of the operation,
1711  
1712  =item [5]
1713  
1714  the text of the first block of the operation
1715  (that is, the contents of
1716  a quote, the regex of a match or substitution or the target list of a
1717  translation),
1718  
1719  =item [6]
1720  
1721  the right delimiter of the first block of the operation,
1722  
1723  =item [7]
1724  
1725  the left delimiter of the second block of the operation
1726  (that is, if it is a C<s>, C<tr>, or C<y>),
1727  
1728  =item [8]
1729  
1730  the text of the second block of the operation 
1731  (that is, the replacement of a substitution or the translation list
1732  of a translation),
1733  
1734  =item [9]
1735  
1736  the right delimiter of the second block of the operation (if any),
1737  
1738  =item [10]
1739  
1740  the trailing modifiers on the operation (if any).
1741  
1742  =back
1743  
1744  For each of the fields marked "(if any)" the default value on success is
1745  an empty string.
1746  On failure, all of these values (except the remaining text) are C<undef>.
1747  
1748  
1749  In a scalar context, C<extract_quotelike> returns just the complete substring
1750  that matched a quotelike operation (or C<undef> on failure). In a scalar or
1751  void context, the input text has the same substring (and any specified
1752  prefix) removed.
1753  
1754  Examples:
1755  
1756          # Remove the first quotelike literal that appears in text
1757  
1758                  $quotelike = extract_quotelike($text,'.*?');
1759  
1760          # Replace one or more leading whitespace-separated quotelike
1761          # literals in $_ with "<QLL>"
1762  
1763                  do { $_ = join '<QLL>', (extract_quotelike)[2,1] } until $@;
1764  
1765  
1766          # Isolate the search pattern in a quotelike operation from $text
1767  
1768                  ($op,$pat) = (extract_quotelike $text)[3,5];
1769                  if ($op =~ /[ms]/)
1770                  {
1771                          print "search pattern: $pat\n";
1772                  }
1773                  else
1774                  {
1775                          print "$op is not a pattern matching operation\n";
1776                  }
1777  
1778  
1779  =head2 C<extract_quotelike> and "here documents"
1780  
1781  C<extract_quotelike> can successfully extract "here documents" from an input
1782  string, but with an important caveat in list contexts.
1783  
1784  Unlike other types of quote-like literals, a here document is rarely
1785  a contiguous substring. For example, a typical piece of code using
1786  here document might look like this:
1787  
1788          <<'EOMSG' || die;
1789          This is the message.
1790          EOMSG
1791          exit;
1792  
1793  Given this as an input string in a scalar context, C<extract_quotelike>
1794  would correctly return the string "<<'EOMSG'\nThis is the message.\nEOMSG",
1795  leaving the string " || die;\nexit;" in the original variable. In other words,
1796  the two separate pieces of the here document are successfully extracted and
1797  concatenated.
1798  
1799  In a list context, C<extract_quotelike> would return the list
1800  
1801  =over 4
1802  
1803  =item [0]
1804  
1805  "<<'EOMSG'\nThis is the message.\nEOMSG\n" (i.e. the full extracted here document,
1806  including fore and aft delimiters),
1807  
1808  =item [1]
1809  
1810  " || die;\nexit;" (i.e. the remainder of the input text, concatenated),
1811  
1812  =item [2]
1813  
1814  "" (i.e. the prefix substring -- trivial in this case),
1815  
1816  =item [3]
1817  
1818  "<<" (i.e. the "name" of the quotelike operator)
1819  
1820  =item [4]
1821  
1822  "'EOMSG'" (i.e. the left delimiter of the here document, including any quotes),
1823  
1824  =item [5]
1825  
1826  "This is the message.\n" (i.e. the text of the here document),
1827  
1828  =item [6]
1829  
1830  "EOMSG" (i.e. the right delimiter of the here document),
1831  
1832  =item [7..10]
1833  
1834  "" (a here document has no second left delimiter, second text, second right
1835  delimiter, or trailing modifiers).
1836  
1837  =back
1838  
1839  However, the matching position of the input variable would be set to
1840  "exit;" (i.e. I<after> the closing delimiter of the here document),
1841  which would cause the earlier " || die;\nexit;" to be skipped in any
1842  sequence of code fragment extractions.
1843  
1844  To avoid this problem, when it encounters a here document whilst
1845  extracting from a modifiable string, C<extract_quotelike> silently
1846  rearranges the string to an equivalent piece of Perl:
1847  
1848          <<'EOMSG'
1849          This is the message.
1850          EOMSG
1851          || die;
1852          exit;
1853  
1854  in which the here document I<is> contiguous. It still leaves the
1855  matching position after the here document, but now the rest of the line
1856  on which the here document starts is not skipped.
1857  
1858  To prevent <extract_quotelike> from mucking about with the input in this way
1859  (this is the only case where a list-context C<extract_quotelike> does so),
1860  you can pass the input variable as an interpolated literal:
1861  
1862          $quotelike = extract_quotelike("$var");
1863  
1864  
1865  =head2 C<extract_codeblock>
1866  
1867  C<extract_codeblock> attempts to recognize and extract a balanced
1868  bracket delimited substring that may contain unbalanced brackets
1869  inside Perl quotes or quotelike operations. That is, C<extract_codeblock>
1870  is like a combination of C<"extract_bracketed"> and
1871  C<"extract_quotelike">.
1872  
1873  C<extract_codeblock> takes the same initial three parameters as C<extract_bracketed>:
1874  a text to process, a set of delimiter brackets to look for, and a prefix to
1875  match first. It also takes an optional fourth parameter, which allows the
1876  outermost delimiter brackets to be specified separately (see below).
1877  
1878  Omitting the first argument (input text) means process C<$_> instead.
1879  Omitting the second argument (delimiter brackets) indicates that only C<'{'> is to be used.
1880  Omitting the third argument (prefix argument) implies optional whitespace at the start.
1881  Omitting the fourth argument (outermost delimiter brackets) indicates that the
1882  value of the second argument is to be used for the outermost delimiters.
1883  
1884  Once the prefix an dthe outermost opening delimiter bracket have been
1885  recognized, code blocks are extracted by stepping through the input text and
1886  trying the following alternatives in sequence:
1887  
1888  =over 4
1889  
1890  =item 1.
1891  
1892  Try and match a closing delimiter bracket. If the bracket was the same
1893  species as the last opening bracket, return the substring to that
1894  point. If the bracket was mismatched, return an error.
1895  
1896  =item 2.
1897  
1898  Try to match a quote or quotelike operator. If found, call
1899  C<extract_quotelike> to eat it. If C<extract_quotelike> fails, return
1900  the error it returned. Otherwise go back to step 1.
1901  
1902  =item 3.
1903  
1904  Try to match an opening delimiter bracket. If found, call
1905  C<extract_codeblock> recursively to eat the embedded block. If the
1906  recursive call fails, return an error. Otherwise, go back to step 1.
1907  
1908  =item 4.
1909  
1910  Unconditionally match a bareword or any other single character, and
1911  then go back to step 1.
1912  
1913  =back
1914  
1915  
1916  Examples:
1917  
1918          # Find a while loop in the text
1919  
1920                  if ($text =~ s/.*?while\s*\{/{/)
1921                  {
1922                          $loop = "while " . extract_codeblock($text);
1923                  }
1924  
1925          # Remove the first round-bracketed list (which may include
1926          # round- or curly-bracketed code blocks or quotelike operators)
1927  
1928                  extract_codeblock $text, "(){}", '[^(]*';
1929  
1930  
1931  The ability to specify a different outermost delimiter bracket is useful
1932  in some circumstances. For example, in the Parse::RecDescent module,
1933  parser actions which are to be performed only on a successful parse
1934  are specified using a C<E<lt>defer:...E<gt>> directive. For example:
1935  
1936          sentence: subject verb object
1937                          <defer: {$::theVerb = $item{verb}} >
1938  
1939  Parse::RecDescent uses C<extract_codeblock($text, '{}E<lt>E<gt>')> to extract the code
1940  within the C<E<lt>defer:...E<gt>> directive, but there's a problem.
1941  
1942  A deferred action like this:
1943  
1944                          <defer: {if ($count>10) {$count--}} >
1945  
1946  will be incorrectly parsed as:
1947  
1948                          <defer: {if ($count>
1949  
1950  because the "less than" operator is interpreted as a closing delimiter.
1951  
1952  But, by extracting the directive using
1953  S<C<extract_codeblock($text, '{}', undef, 'E<lt>E<gt>')>>
1954  the '>' character is only treated as a delimited at the outermost
1955  level of the code block, so the directive is parsed correctly.
1956  
1957  =head2 C<extract_multiple>
1958  
1959  The C<extract_multiple> subroutine takes a string to be processed and a 
1960  list of extractors (subroutines or regular expressions) to apply to that string.
1961  
1962  In an array context C<extract_multiple> returns an array of substrings
1963  of the original string, as extracted by the specified extractors.
1964  In a scalar context, C<extract_multiple> returns the first
1965  substring successfully extracted from the original string. In both
1966  scalar and void contexts the original string has the first successfully
1967  extracted substring removed from it. In all contexts
1968  C<extract_multiple> starts at the current C<pos> of the string, and
1969  sets that C<pos> appropriately after it matches.
1970  
1971  Hence, the aim of of a call to C<extract_multiple> in a list context
1972  is to split the processed string into as many non-overlapping fields as
1973  possible, by repeatedly applying each of the specified extractors
1974  to the remainder of the string. Thus C<extract_multiple> is
1975  a generalized form of Perl's C<split> subroutine.
1976  
1977  The subroutine takes up to four optional arguments:
1978  
1979  =over 4
1980  
1981  =item 1.
1982  
1983  A string to be processed (C<$_> if the string is omitted or C<undef>)
1984  
1985  =item 2.
1986  
1987  A reference to a list of subroutine references and/or qr// objects and/or
1988  literal strings and/or hash references, specifying the extractors
1989  to be used to split the string. If this argument is omitted (or
1990  C<undef>) the list:
1991  
1992          [
1993                  sub { extract_variable($_[0], '') },
1994                  sub { extract_quotelike($_[0],'') },
1995                  sub { extract_codeblock($_[0],'{}','') },
1996          ]
1997  
1998  is used.
1999  
2000  
2001  =item 3.
2002  
2003  An number specifying the maximum number of fields to return. If this
2004  argument is omitted (or C<undef>), split continues as long as possible.
2005  
2006  If the third argument is I<N>, then extraction continues until I<N> fields
2007  have been successfully extracted, or until the string has been completely 
2008  processed.
2009  
2010  Note that in scalar and void contexts the value of this argument is 
2011  automatically reset to 1 (under C<-w>, a warning is issued if the argument 
2012  has to be reset).
2013  
2014  =item 4.
2015  
2016  A value indicating whether unmatched substrings (see below) within the
2017  text should be skipped or returned as fields. If the value is true,
2018  such substrings are skipped. Otherwise, they are returned.
2019  
2020  =back
2021  
2022  The extraction process works by applying each extractor in
2023  sequence to the text string.
2024  
2025  If the extractor is a subroutine it is called in a list context and is
2026  expected to return a list of a single element, namely the extracted
2027  text. It may optionally also return two further arguments: a string
2028  representing the text left after extraction (like $' for a pattern
2029  match), and a string representing any prefix skipped before the
2030  extraction (like $` in a pattern match). Note that this is designed
2031  to facilitate the use of other Text::Balanced subroutines with
2032  C<extract_multiple>. Note too that the value returned by an extractor
2033  subroutine need not bear any relationship to the corresponding substring
2034  of the original text (see examples below).
2035  
2036  If the extractor is a precompiled regular expression or a string,
2037  it is matched against the text in a scalar context with a leading
2038  '\G' and the gc modifiers enabled. The extracted value is either
2039  $1 if that variable is defined after the match, or else the
2040  complete match (i.e. $&).
2041  
2042  If the extractor is a hash reference, it must contain exactly one element.
2043  The value of that element is one of the
2044  above extractor types (subroutine reference, regular expression, or string).
2045  The key of that element is the name of a class into which the successful
2046  return value of the extractor will be blessed.
2047  
2048  If an extractor returns a defined value, that value is immediately
2049  treated as the next extracted field and pushed onto the list of fields.
2050  If the extractor was specified in a hash reference, the field is also
2051  blessed into the appropriate class, 
2052  
2053  If the extractor fails to match (in the case of a regex extractor), or returns an empty list or an undefined value (in the case of a subroutine extractor), it is
2054  assumed to have failed to extract.
2055  If none of the extractor subroutines succeeds, then one
2056  character is extracted from the start of the text and the extraction
2057  subroutines reapplied. Characters which are thus removed are accumulated and
2058  eventually become the next field (unless the fourth argument is true, in which
2059  case they are discarded).
2060  
2061  For example, the following extracts substrings that are valid Perl variables:
2062  
2063          @fields = extract_multiple($text,
2064                                     [ sub { extract_variable($_[0]) } ],
2065                                     undef, 1);
2066  
2067  This example separates a text into fields which are quote delimited,
2068  curly bracketed, and anything else. The delimited and bracketed
2069  parts are also blessed to identify them (the "anything else" is unblessed):
2070  
2071          @fields = extract_multiple($text,
2072                     [
2073                          { Delim => sub { extract_delimited($_[0],q{'"}) } },
2074                          { Brack => sub { extract_bracketed($_[0],'{}') } },
2075                     ]);
2076  
2077  This call extracts the next single substring that is a valid Perl quotelike
2078  operator (and removes it from $text):
2079  
2080          $quotelike = extract_multiple($text,
2081                                        [
2082                                          sub { extract_quotelike($_[0]) },
2083                                        ], undef, 1);
2084  
2085  Finally, here is yet another way to do comma-separated value parsing:
2086  
2087          @fields = extract_multiple($csv_text,
2088                                    [
2089                                          sub { extract_delimited($_[0],q{'"}) },
2090                                          qr/([^,]+)(.*)/,
2091                                    ],
2092                                    undef,1);
2093  
2094  The list in the second argument means:
2095  I<"Try and extract a ' or " delimited string, otherwise extract anything up to a comma...">.
2096  The undef third argument means:
2097  I<"...as many times as possible...">,
2098  and the true value in the fourth argument means
2099  I<"...discarding anything else that appears (i.e. the commas)">.
2100  
2101  If you wanted the commas preserved as separate fields (i.e. like split
2102  does if your split pattern has capturing parentheses), you would
2103  just make the last parameter undefined (or remove it).
2104  
2105  
2106  =head2 C<gen_delimited_pat>
2107  
2108  The C<gen_delimited_pat> subroutine takes a single (string) argument and
2109     > builds a Friedl-style optimized regex that matches a string delimited
2110  by any one of the characters in the single argument. For example:
2111  
2112          gen_delimited_pat(q{'"})
2113  
2114  returns the regex:
2115  
2116          (?:\"(?:\\\"|(?!\").)*\"|\'(?:\\\'|(?!\').)*\')
2117  
2118  Note that the specified delimiters are automatically quotemeta'd.
2119  
2120  A typical use of C<gen_delimited_pat> would be to build special purpose tags
2121  for C<extract_tagged>. For example, to properly ignore "empty" XML elements
2122  (which might contain quoted strings):
2123  
2124          my $empty_tag = '<(' . gen_delimited_pat(q{'"}) . '|.)+/>';
2125  
2126          extract_tagged($text, undef, undef, undef, {ignore => [$empty_tag]} );
2127  
2128  
2129  C<gen_delimited_pat> may also be called with an optional second argument,
2130  which specifies the "escape" character(s) to be used for each delimiter.
2131  For example to match a Pascal-style string (where ' is the delimiter
2132  and '' is a literal ' within the string):
2133  
2134          gen_delimited_pat(q{'},q{'});
2135  
2136  Different escape characters can be specified for different delimiters.
2137  For example, to specify that '/' is the escape for single quotes
2138  and '%' is the escape for double quotes:
2139  
2140          gen_delimited_pat(q{'"},q{/%});
2141  
2142  If more delimiters than escape chars are specified, the last escape char
2143  is used for the remaining delimiters.
2144  If no escape char is specified for a given specified delimiter, '\' is used.
2145  
2146  =head2 C<delimited_pat>
2147  
2148  Note that C<gen_delimited_pat> was previously called C<delimited_pat>.
2149  That name may still be used, but is now deprecated.
2150          
2151  
2152  =head1 DIAGNOSTICS
2153  
2154  In a list context, all the functions return C<(undef,$original_text)>
2155  on failure. In a scalar context, failure is indicated by returning C<undef>
2156  (in this case the input text is not modified in any way).
2157  
2158  In addition, on failure in I<any> context, the C<$@> variable is set.
2159  Accessing C<$@-E<gt>{error}> returns one of the error diagnostics listed
2160  below.
2161  Accessing C<$@-E<gt>{pos}> returns the offset into the original string at
2162  which the error was detected (although not necessarily where it occurred!)
2163  Printing C<$@> directly produces the error message, with the offset appended.
2164  On success, the C<$@> variable is guaranteed to be C<undef>.
2165  
2166  The available diagnostics are:
2167  
2168  =over 4
2169  
2170  =item  C<Did not find a suitable bracket: "%s">
2171  
2172  The delimiter provided to C<extract_bracketed> was not one of
2173  C<'()[]E<lt>E<gt>{}'>.
2174  
2175  =item  C<Did not find prefix: /%s/>
2176  
2177  A non-optional prefix was specified but wasn't found at the start of the text.
2178  
2179  =item  C<Did not find opening bracket after prefix: "%s">
2180  
2181  C<extract_bracketed> or C<extract_codeblock> was expecting a
2182  particular kind of bracket at the start of the text, and didn't find it.
2183  
2184  =item  C<No quotelike operator found after prefix: "%s">
2185  
2186  C<extract_quotelike> didn't find one of the quotelike operators C<q>,
2187  C<qq>, C<qw>, C<qx>, C<s>, C<tr> or C<y> at the start of the substring
2188  it was extracting.
2189  
2190  =item  C<Unmatched closing bracket: "%c">
2191  
2192  C<extract_bracketed>, C<extract_quotelike> or C<extract_codeblock> encountered
2193  a closing bracket where none was expected.
2194  
2195  =item  C<Unmatched opening bracket(s): "%s">
2196  
2197  C<extract_bracketed>, C<extract_quotelike> or C<extract_codeblock> ran 
2198  out of characters in the text before closing one or more levels of nested
2199  brackets.
2200  
2201  =item C<Unmatched embedded quote (%s)>
2202  
2203  C<extract_bracketed> attempted to match an embedded quoted substring, but
2204  failed to find a closing quote to match it.
2205  
2206  =item C<Did not find closing delimiter to match '%s'>
2207  
2208  C<extract_quotelike> was unable to find a closing delimiter to match the
2209  one that opened the quote-like operation.
2210  
2211  =item  C<Mismatched closing bracket: expected "%c" but found "%s">
2212  
2213  C<extract_bracketed>, C<extract_quotelike> or C<extract_codeblock> found
2214  a valid bracket delimiter, but it was the wrong species. This usually
2215  indicates a nesting error, but may indicate incorrect quoting or escaping.
2216  
2217  =item  C<No block delimiter found after quotelike "%s">
2218  
2219  C<extract_quotelike> or C<extract_codeblock> found one of the
2220  quotelike operators C<q>, C<qq>, C<qw>, C<qx>, C<s>, C<tr> or C<y>
2221  without a suitable block after it.
2222  
2223  =item C<Did not find leading dereferencer>
2224  
2225  C<extract_variable> was expecting one of '$', '@', or '%' at the start of
2226  a variable, but didn't find any of them.
2227  
2228  =item C<Bad identifier after dereferencer>
2229  
2230  C<extract_variable> found a '$', '@', or '%' indicating a variable, but that
2231  character was not followed by a legal Perl identifier.
2232  
2233  =item C<Did not find expected opening bracket at %s>
2234  
2235  C<extract_codeblock> failed to find any of the outermost opening brackets
2236  that were specified.
2237  
2238  =item C<Improperly nested codeblock at %s>
2239  
2240  A nested code block was found that started with a delimiter that was specified
2241  as being only to be used as an outermost bracket.
2242  
2243  =item  C<Missing second block for quotelike "%s">
2244  
2245  C<extract_codeblock> or C<extract_quotelike> found one of the
2246  quotelike operators C<s>, C<tr> or C<y> followed by only one block.
2247  
2248  =item C<No match found for opening bracket>
2249  
2250  C<extract_codeblock> failed to find a closing bracket to match the outermost
2251  opening bracket.
2252  
2253  =item C<Did not find opening tag: /%s/>
2254  
2255  C<extract_tagged> did not find a suitable opening tag (after any specified
2256  prefix was removed).
2257  
2258  =item C<Unable to construct closing tag to match: /%s/>
2259  
2260  C<extract_tagged> matched the specified opening tag and tried to
2261  modify the matched text to produce a matching closing tag (because
2262  none was specified). It failed to generate the closing tag, almost
2263  certainly because the opening tag did not start with a
2264  bracket of some kind.
2265  
2266  =item C<Found invalid nested tag: %s>
2267  
2268  C<extract_tagged> found a nested tag that appeared in the "reject" list
2269  (and the failure mode was not "MAX" or "PARA").
2270  
2271  =item C<Found unbalanced nested tag: %s>
2272  
2273  C<extract_tagged> found a nested opening tag that was not matched by a
2274  corresponding nested closing tag (and the failure mode was not "MAX" or "PARA").
2275  
2276  =item C<Did not find closing tag>
2277  
2278  C<extract_tagged> reached the end of the text without finding a closing tag
2279  to match the original opening tag (and the failure mode was not
2280  "MAX" or "PARA").
2281  
2282  
2283  
2284  
2285  =back
2286  
2287  
2288  =head1 AUTHOR
2289  
2290  Damian Conway (damian@conway.org)
2291  
2292  
2293  =head1 BUGS AND IRRITATIONS
2294  
2295  There are undoubtedly serious bugs lurking somewhere in this code, if
2296  only because parts of it give the impression of understanding a great deal
2297  more about Perl than they really do. 
2298  
2299  Bug reports and other feedback are most welcome.
2300  
2301  
2302  =head1 COPYRIGHT
2303  
2304   Copyright (c) 1997-2001, Damian Conway. All Rights Reserved.
2305   This module is free software. It may be used, redistributed
2306       and/or modified under the same terms as Perl itself.
PHP Cross Reference of Unnamed Project

/se3-unattended/var/se3/unattended/install/linuxaux/opt/perl/lib/5.10.0/Text/ -> Balanced.pm (source)