File:  [Local Repository] / db / prgsrc / updateRS1.pl
Revision 1.10: download - view: text, annotated - select for diffs - revision graph
Thu Feb 13 16:27:32 2003 UTC (21 years, 3 months ago) by boris
Branches: MAIN
CVS tags: HEAD
Added $DUMPDIR

    1: #!/usr/bin/perl -w
    2: 
    3: =head1 NAME
    4: 
    5: updateRS.pl - Скрипт для занесения в таблицы русского поиска базы 
    6: B<$base> информации о вопросах 
    7: 
    8: =head1 SYNOPSIS
    9: 
   10: updateRS1.pl Questions_per_cicle cicles_number
   11: 
   12: updateRS.pl Questions_per_cicle
   13: 
   14: updateRS.pl
   15: 
   16: 
   17: =head1 DESCRIPTION
   18: 
   19: Скрипт ищет в таблице Questions вопросы с нулевым ProcessedBySearch,
   20: добавляет информацию в таблицы word2question, nests, nf. Поле 
   21: ProcessedBySearch устанавливается в 1. Обрабатывает 
   22: Questions_per_cicle*cicles_number вопросов, сбрасывая 
   23: информацию в базу каждые Questions_per_cicle вопросов.
   24: Значения по умолчанию:
   25: Questions_per_cicle=500;
   26: cicles_number=1.
   27: 
   28: 
   29: =head1 AUTHOR
   30: 
   31: Роман Семизаров
   32: 
   33: 
   34: =cut
   35: 
   36: 
   37: use locale;
   38: use DBI;
   39: use POSIX qw (locale_h);
   40: use lib "../lib";
   41: use chgkfiles;
   42: use dbchgk;
   43: my $DUMPDIR = $ENV{DUMPDIR} || "../dump";
   44: 
   45: open (STDERR,">$DUMPDIR/errors");
   46: my $initime=time;
   47: open TIME, ">$DUMPDIR/time"; 
   48: do "common.pl";
   49: do "chgk.cnf";
   50: 
   51: $qlimit=shift||500;
   52: $times=shift||1;
   53: 
   54: die "Undefined \$maxwsize! check your chgk.cnf" unless $maxwsize;
   55: 
   56: require "check.pl";
   57: 
   58: open (STDERR,">$stderr") if $stderr;
   59: 
   60: 
   61: open (UNKNOWN,">$unknown");
   62: 
   63: 
   64:           
   65: my $nf;
   66: 
   67: #open WARN, ">$warnings";
   68: 
   69: #%forbidden=checktable('equalto')? getequalto : ();
   70: 
   71: 
   72: if ((uc 'а') ne 'А') {die "!Koi8-r locale not installed!\n"};
   73: 
   74: getquestions(QuestionId, Question, Answer, Comments, Authors, Sources,"ProcessedBySearch IS NULL");
   75: 
   76: 
   77: 
   78: print "Loading dictionaries\n";
   79: 
   80: die "No dictionaries! Check your chgk.cnf" unless scalar @dictionaries;
   81: 
   82: foreach $d(@dictionaries)
   83: {
   84:      print "Loading $d\n";
   85:      open (DICT, $d) || print "              Not found\n";
   86:      while ( <DICT> )
   87:      {
   88:      	chomp;
   89:      	s/\s*$//;
   90:        	($aa,$b)=split(/\//);
   91:        	$aa=~tr/ёЁ/еЕ/;
   92:     	$a= uc $aa;
   93:     	$words{$a}.=$b || "!";
   94:      }
   95:      close(DICT);
   96: }
   97: 
   98: die "No dictionaries found! Check your chgk.cnf" unless scalar keys %words;
   99: 
  100: 
  101: print "Getting words...\n";
  102: 
  103: print TIME "\t\t".(time-$initime)."\n";
  104: my $broken=0;
  105: 
  106: for my $commonI(1..$times)
  107: {
  108: last if $broken;
  109: 
  110: print "loading nests\n";
  111: 
  112: %nf=getnests;
  113: 
  114: %nfnumber=getnfnumbers;
  115: 
  116: print "Getting words...\n";
  117: 
  118: getquestions(QuestionId, Question, Answer, Comments, Authors, Sources,"ProcessedBySearch IS NULL");
  119: $sch=0;
  120: while ((++$sch<=$qlimit) && (($id, @ss) = getrow, $id)) 
  121:                              # берём по одному вопросу
  122:                              # и вешаем в массив, индексы которого -- словоформы,
  123:                              # а значения -- списки вопросов. 
  124: {
  125:    if (-e "RS_stop")  {unlink("RS_stop"); $broken=1;last}
  126:    print "$sch $id\n" unless ($sch % 10);#unless (++$sch % 1);
  127:    searchmark($id);
  128:    if ($forbidden{$id}) {next};
  129:    foreach $fieldnumber (0..$#ss) #перебираем поля
  130:    {
  131:       $text=$ss[$fieldnumber];
  132:       next unless $text;
  133:       $text=~tr/ёЁ/еЕ/;
  134:       $text=~s/(${RLrl})p(${RLrl})/$1p$2/gom;
  135:       $text=~s/p(${RLrl})/р$1/gom;
  136:       $text=~s/(${RLrl})p/$1р/gom;
  137:       $text=~s/\s+/ /gmo;
  138:       @list= $text=~m/(?:(?:${RLrl})+)|(?:[A-Za-z0-9]+)/gom;
  139: 
  140: foreach $wordnumber(0..$#list)
  141:       {
  142:            $word=uc $list[$wordnumber];
  143:            next if length $word>$maxwsize;
  144:            if (my $n=$nf{$word}||$newnf{$word}) 
  145:            {
  146:                 @n= split ' ',$n;
  147:                 $nfkvo{$_}++ foreach @n;
  148:                 my $r=packword($fieldnumber, $id,$wordnumber%256);
  149:                 $neww2k{$_}.=$r foreach (@n);
  150:            }
  151:            else {
  152:              if ($word=~/^${RLrl}+$/o){ # Русское слово
  153:             # проанализировать по таблице аффиксов, 
  154:             # проверить наличие начальных форм в 
  155:             # nf, а если таких нет, то 
  156:             # и по словарю. 
  157:               
  158:                $nf=&checkit(uc $word,\%words);
  159:                if (!$nf) {
  160:                            $nf=(uc $word)."/!";
  161:                            print UNKNOWN "$nf\n" if $unknown;
  162:                          }
  163: 
  164:               }  else {# нерусское слово
  165:                          $nf=(uc $word)."/!";
  166:                          print UNKNOWN "$nf\n" if $unknown;
  167:                       }
  168: 
  169:                foreach $n (split ' ', $nf)
  170:                {
  171:                    ($f,$flag)=split '/', $n;
  172:                    if ($nfnumber=$nfnumber{$f})
  173:                    {
  174:                        $newnf{$word}.=" $nfnumber";
  175:                        $nfkvo{$nfnumber}++;
  176:                        $a=\$neww2k{$nfnumber};
  177:                        my $r=packword($fieldnumber, $id,$wordnumber);
  178:                        $$a.=$r;
  179:                        if (length $$a>100) {flushw2k($nfnumber)}
  180:                    }
  181:                    else 
  182:                    {
  183:                       $nfnumber=addnf(0, $f, $flag,1);
  184:                       $newnf{uc $word}.=" $nfnumber";
  185:                        my $r=packword($fieldnumber, $id,$wordnumber);
  186:                       $neww2k{$nfnumber}.=$r;
  187:                    }
  188:                }
  189:            }
  190: 
  191:       }
  192:    }
  193: 
  194: }
  195: 
  196: 
  197: print "Filling word2question...\n";
  198: 
  199: foreach (keys %neww2k)
  200: { 
  201:    updateword2question($_,$neww2k{$_});
  202:    delete $neww2k{$_};
  203: }
  204: 
  205: %neww2k=();
  206: 
  207: print "Filling nf...\n";
  208: $sch=0;
  209: 
  210: incnf($_,$nfkvo{$_})  foreach (keys %nfkvo);
  211: 
  212: %nfkvo=();
  213: 
  214: print "Filling nests...\n";
  215: $sch=0;
  216: 
  217: 
  218: foreach $w (keys %newnf)
  219: {
  220:   print "$sch\n" unless (++$sch % 1000);
  221:   @nf=split ' ',$newnf{$w};
  222:   addnest($w,$_) foreach @nf;
  223: }
  224: print "$sch nests added\n";
  225: 
  226: print TIME "$commonI: \t$sch ";
  227: print TIME "\t".(time-$initime)."\n";
  228: %newnf=();
  229: 
  230: }
  231: 
  232: sub flushw2k
  233: {
  234:    my ($n)=@_;
  235:    updateword2question($n,$neww2k{$n});
  236:    delete $neww2k{$_};
  237: }

FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>