use utf8; # (C) 2004 Stefan Ram # http://www.purl.org/stefan_ram/utf-8/ngtoken_all.pm use strict; use warnings; package ngtoken_all; # This package implements an algorithm to find all possible # tokens for a newsgroup. # # When then name is .xyzzy.foo-bar three possibilities are # taken into account: # # - The most common case is to abbreviate it as # "xf-b". # # - But sometimes the special character "-" is ommited # as in "xfb". # # - Sometimes the rule to just use the characters after a # dot is used, giving "xf". # # This package makes a name for a newsgroup combining all # the choices for every of the special characters "-" or # "+". So a newsgroup with two hyphens this gives nine # possible tokens. # # For a newsgroup with n hypens or plus signs a ternary # storage with n digits is created using the internal # data type ternary. To every state of this storage there # corresponds a newsgroup token. So by setting the state # to 0 and then incrementing it all possible newsgroup # tokens by the rules mentioned above can be created. # # This package does not check for duplicates or assigns # weights to the tokens. This is the responsibility of # another package "ngtoken". use strict; use warnings; # implement ternary numbers; { # create a ternary number of size kvl_size sub ksf_makebin($) { my( $kvl_size )= @_; my $kvl_result = []; while( $kvl_size-- ) { push @$kvl_result, "0"; } $kvl_result; } # increment a ternary number $kvl_bin sub ksf_incbin($) { my( $kvl_bin )= @_; my $kvl_result; my $i = 0; my $kvl_size = $#$kvl_bin + 1; while( $i < $kvl_size && $$kvl_bin[ $i ]==2 ) { $$kvl_bin[ $i++ ]= 0; } if( $i == $kvl_size ) { $kvl_result = 0; } else { $kvl_result = 1; $$kvl_bin[ $i ]++; } return $kvl_result; } # print a binary number $kvl_bin # not used in production code # only for testing purposes sub ksf_printbin($) { my( $kvl_bin )= @_; my $kvl_size = $#$kvl_bin + 1; my $i = $kvl_size - 1; while( $i >= 0 ) { print "p\n"; print $$kvl_bin[ $i-- ]; }} } # how many hyphens (i.e., "-" or "+") are there in a # name $kvl_name? sub ksf_hyphencount($) { my( $kvl_name )= @_; my $kvl_result = 0; while( $kvl_name =~ /[+\-]/g ) { ++$kvl_result; } $kvl_result; } # Depending on the ternary number given perform one of # three actions: # # 0: Leave the character as it is (it plus the character # following it will appear in the token as it is, # e.g., giving "xf-b") # # 1: Replace the character by a dot (it will not appear # in the token, but the character following it will # appear as it is, e.g., giving "xfb") # # 2: Replace the character by the letter "a" (The # character nor the letter following it will appear # in the resulting token, e.g., giving "xf".) # # The actual token can be obtained by invoking ksf_token # on the result of ksf_modname. # sub ksf_modname($$) { my( $kvl_name, $kvl_bin )= @_; my $kvl_text = $kvl_name; my $kvl_size = $#$kvl_bin + 1; my $i = $kvl_size - 1; while( $kvl_name =~ /[+\-]/g ) { if( $$kvl_bin[ $i ] == 1 ) { substr( $kvl_text, ( pos $kvl_name ) - 1, 1 )= "."; } if( $$kvl_bin[ $i ] == 2 ) { substr( $kvl_text, ( pos $kvl_name ) - 1, 1 )= "a"; } $i--; } $kvl_text; } # Make a token from a text. # Replace all letter sequences preceded by a special # character by their initial letter and then remove # all dots. sub ksf_token($) { my( $kvl_name )= @_; $kvl_name = "." . $kvl_name; $kvl_name =~ s/([.+\-])([a-zA-Z0-9])[^.+\-]*/$1$2/g; $kvl_name =~ s/([.+\-])([a-zA-Z0-9])[^.+\-]*/$1$2/g; $kvl_name =~ s/([.+\-])([a-zA-Z0-9])[^.+\-]*/$1$2/g; $kvl_name =~ s/([.+\-])([a-zA-Z0-9])[^.+\-]*/$1$2/g; $kvl_name =~ s/\.//g; return $kvl_name; } # Make all tokens for a newsgroup # Create a ternary number and then create a token # for every possible ternary number. sub ksf_ngtoken($) { my( $kvl_name )= @_; my $ksf_result = []; $kvl_name =~ s/\+\+//; # for cases like "c++" my $kvl_hyphencount = ksf_hyphencount( $kvl_name ); my $kvl_bin = ksf_makebin( $kvl_hyphencount ); push @$ksf_result, ksf_token( $kvl_name ); while( ksf_incbin( $kvl_bin )) { push @$ksf_result, ksf_token( ksf_modname( $kvl_name, $kvl_bin )); } $ksf_result; } # perl -mngtoken_all -e "ngtoken_all::ksf_test()" sub ksf_test() { my $kvl_name = "de.alt.anime+this-example"; my $kvl_array = ksf_ngtoken( $kvl_name ); for( @$kvl_array ){ print $_ . "\n" ; } # daa+t-e # daa+te # daa+t # daat-e # daate # daat # daa-e # daae # daa } 1;