File:  [Local Repository] / db / prgsrc / updateRS2.pl
Revision 1.3: download - view: text, annotated - select for diffs - revision graph
Sat Aug 28 23:47:41 2004 UTC (19 years, 8 months ago) by roma7
Branches: MAIN
CVS tags: HEAD
PassCriteria is added

    1: #!/usr/bin/perl -w
    2: 
    3: =head1 NAME
    4: 
    5: updateRS.pl - Скрипт для занесения в таблицы русского поиска базы 
    6: B<$base> информации о вопросах. Использует Tie::Hash
    7: 
    8: =head1 SYNOPSIS
    9: 
   10: updateRS1.pl Questions_per_cicle cicles_number
   11: 
   12: updateRS.pl Questions_per_cicle
   13: 
   14: updateRS.pl
   15: 
   16: 
   17: =head1 DESCRIPTION
   18: 
   19: Скрипт ищет в таблице Questions вопросы с нулевым ProcessedBySearch,
   20: добавляет информацию в таблицы word2question, nests, nf. Поле 
   21: ProcessedBySearch устанавливается в 1. Обрабатывает 
   22: Questions_per_cicle*cicles_number вопросов, сбрасывая 
   23: информацию в базу каждые Questions_per_cicle вопросов.
   24: Значения по умолчанию:
   25: Questions_per_cicle=500;
   26: cicles_number=1.
   27: 
   28: 
   29: =head1 AUTHOR
   30: 
   31: Роман Семизаров
   32: 
   33: 
   34: =cut
   35: 
   36: 
   37: 
   38: use locale;
   39: use DBI;
   40: use POSIX qw (locale_h);
   41: use lib "../lib";
   42: use chgkfiles;
   43: use Tie::Hash;
   44: use dbchgk;
   45: 
   46: my $DUMPDIR = $ENV{DUMPDIR} || "../dump";
   47: open (STDERR,">$DUMPDIR/errors");
   48: my $initime=time;
   49: open TIME, ">$DUMPDIR/time"; 
   50: do "common.pl";
   51: do "chgk.cnf";
   52: 
   53: 
   54: 
   55:     package NewStdHash;
   56:     require Tie::Hash;
   57: 
   58:     @ISA = (Tie::StdHash);
   59: 
   60: 
   61: package main;
   62: 
   63: 
   64: 
   65: $qlimit=shift||500;
   66: $times=shift||1;
   67: 
   68: die "Undefined \$maxwsize! check your chgk.cnf" unless $maxwsize;
   69: 
   70: require "check.pl";
   71: 
   72: open (STDERR,">$stderr") if $stderr;
   73: 
   74: 
   75: open (UNKNOWN,">$unknown");
   76: 
   77: 
   78:           
   79: my $nf;
   80: 
   81: #open WARN, ">$warnings";
   82: 
   83: #%forbidden=checktable('equalto')? getequalto : ();
   84: 
   85: 
   86: if ((uc 'а') ne 'А') {die "!Koi8-r locale not installed!\n"};
   87: 
   88: getquestions(QuestionId, Question, Answer, PassCriteria, Comments, Authors, Sources,"ProcessedBySearch IS NULL");
   89: 
   90: 
   91: 
   92: print "Loading dictionaries\n";
   93: 
   94: die "No dictionaries! Check your chgk.cnf" unless scalar @dictionaries;
   95: 
   96: foreach $d(@dictionaries)
   97: {
   98:      print "Loading $d\n";
   99:      open (DICT, $d) || print "              Not found\n";
  100:      while ( <DICT> )
  101:      {
  102:      	chomp;
  103:      	s/\s*$//;
  104:        	($aa,$b)=split(/\//);
  105:        	$aa=~tr/ёЁ/еЕ/;
  106:     	$a= uc $aa;
  107:     	$words{$a}.=$b || "!";
  108:      }
  109:      close(DICT);
  110: }
  111: 
  112: die "No dictionaries found! Check your chgk.cnf" unless scalar keys %words;
  113: 
  114: 
  115: print "Getting words...\n";
  116: 
  117: print TIME "\t\t".(time-$initime)."\n";
  118: 
  119: for my $commonI(1..$times)
  120: {
  121: 
  122: 
  123: print "loading nests\n";
  124: 
  125: tie %nf, 'NewStdHash';
  126:                    
  127: %nf=getnests;
  128: 
  129: %nfnumber=getnfnumbers;
  130: 
  131: print "Getting words...\n";
  132: 
  133: getquestions(QuestionId, Question, Answer, PassCriteria, Comments, Authors, Sources,"ProcessedBySearch IS NULL");
  134: $sch=0;
  135: while ((++$sch<=$qlimit) && (($id, @ss) = getrow, $id)) 
  136:                              # берём по одному вопросу
  137:                              # и вешаем в массив, индексы которого -- словоформы,
  138:                              # а значения -- списки вопросов. 
  139: {
  140: 
  141:    print "$sch $id\n" ;#unless (++$sch % 1);
  142:    searchmark($id);
  143:    if ($forbidden{$id}) {next};
  144:    foreach $fieldnumber (0..$#ss) #перебираем поля
  145:    {
  146:       $text=$ss[$fieldnumber];
  147:       next unless $text;
  148:       $text=~tr/ёЁ/еЕ/;
  149:       $text=~s/(${RLrl})p(${RLrl})/$1p$2/gom;
  150:       $text=~s/p(${RLrl})/р$1/gom;
  151:       $text=~s/(${RLrl})p/$1р/gom;
  152:       $text=~s/\s+/ /gmo;
  153:       @list= $text=~m/(?:(?:${RLrl})+)|(?:[A-Za-z0-9]+)/gom;
  154: 
  155: foreach $wordnumber(0..$#list)
  156:       {
  157:            $word=uc $list[$wordnumber];
  158:            next if length $word>$maxwsize;
  159:            if (my $n=$nf{$word}||$newnf{$word}) 
  160:            {
  161:                 @n= split ' ',$n;
  162:                 $nfkvo{$_}++ foreach @n;
  163:                 $neww2k{$_}.=packword($fieldnumber, $id,$wordnumber%256)
  164:                      foreach (@n);
  165:            }
  166:            else {
  167:              if ($word=~/^${RLrl}+$/o){ # Русское слово
  168:             # проанализировать по таблице аффиксов, 
  169:             # проверить наличие начальных форм в 
  170:             # nf, а если таких нет, то 
  171:             # и по словарю. 
  172:               
  173:                $nf=&checkit(uc $word,\%words);
  174:                if (!$nf) {
  175:                            $nf=(uc $word)."/!";
  176:                            print UNKNOWN "$nf\n" if $unknown;
  177:                          }
  178: 
  179:               }  else {# нерусское слово
  180:                          $nf=(uc $word)."/!";
  181:                          print UNKNOWN "$nf\n" if $unknown;
  182:                       }
  183: 
  184:                foreach $n (split ' ', $nf)
  185:                {
  186:                    ($f,$flag)=split '/', $n;
  187:                    if ($nfnumber=$nfnumber{$f})
  188:                    {
  189:                        $newnf{$word}.=" $nfnumber";
  190:                        $nfkvo{$nfnumber}++;
  191:                        $a=\$neww2k{$nfnumber};
  192:                        $$a.=packword($fieldnumber, $id,$wordnumber);
  193:                        if (length $$a>100) {flushw2k($nfnumber)}
  194:                    }
  195:                    else 
  196:                    {
  197:                       $nfnumber=addnf(0, $f, $flag,1);
  198:                       $newnf{uc $word}.=" $nfnumber";
  199:                       $neww2k{$nfnumber}.=packword($fieldnumber, $id,$wordnumber);
  200:                    }
  201:                }
  202:            }
  203: 
  204:       }
  205:    }
  206: 
  207: }
  208: 
  209: 
  210: print "Filling word2question...\n";
  211: 
  212: foreach (keys %neww2k)
  213: { 
  214:    updateword2question($_,$neww2k{$_});
  215:    delete $neww2k{$_};
  216: }
  217: 
  218: %neww2k=();
  219: 
  220: print "Filling nf...\n";
  221: $sch=0;
  222: 
  223: incnf($_,$nfkvo{$_})  foreach (keys %nfkvo);
  224: 
  225: %nfkvo=();
  226: 
  227: print "Filling nests...\n";
  228: $sch=0;
  229: 
  230: 
  231: foreach $w (keys %newnf)
  232: {
  233:   print "$sch\n" unless (++$sch % 1000);
  234:   @nf=split ' ',$newnf{$w};
  235:   addnest($w,$_) foreach @nf;
  236: }
  237: print "$sch nests added\n";
  238: 
  239: print TIME "$commonI: \t$sch ";
  240: print TIME "\t".(time-$initime)."\n";
  241: %newnf=();
  242: 
  243: }
  244: 
  245: sub flushw2k
  246: {
  247:    my ($n)=@_;
  248:    updateword2question($n,$neww2k{$n});
  249:    delete $neww2k{$_};
  250: }

FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>