db/prgsrc/updateRS1.pl - diff

Return to updateRS1.pl CVS log

Up to [Local Repository] / db / prgsrc

Diff for /db/prgsrc/updateRS1.pl between versions 1.1 and 1.10

-version 1.1, 2001/10/31 03:00:10
+version 1.10, 2003/02/13 16:27:32
  Line 1
- #!/usr/local/bin/perl -w
+ #!/usr/bin/perl -w
  =head1 NAME
  updateRS.pl - Скрипт для занесения в таблицы русского поиска базы
  B<$base> информации о вопросах
  =head1 SYNOPSIS
  updateRS1.pl Questions_per_cicle cicles_number
  updateRS.pl Questions_per_cicle
  updateRS.pl
  =head1 DESCRIPTION
  Скрипт ищет в таблице Questions вопросы с нулевым ProcessedBySearch,
  добавляет информацию в таблицы word2question, nests, nf. Поле
  ProcessedBySearch устанавливается в 1. Обрабатывает
  Questions_per_cicle*cicles_number вопросов, сбрасывая
  информацию в базу каждые Questions_per_cicle вопросов.
  Значения по умолчанию:
  Questions_per_cicle=500;
  cicles_number=1.
  =head1 AUTHOR
  Роман Семизаров
  =cut
  use locale;
  use DBI;
  use POSIX qw (locale_h);
- use chgkfiles;
+ use lib "../lib";
- use dbchgk;
+ use chgkfiles;
- open (STDERR,">errors");
+ use dbchgk;
- my $initime=time;
+ my $DUMPDIR = $ENV{DUMPDIR} || "../dump";
- open TIME, ">time";
- do "common.pl";
+ open (STDERR,">$DUMPDIR/errors");
- do "chgk.cnf";
+ my $initime=time;
+ open TIME, ">$DUMPDIR/time";
- $qlimit=shift||500;
+ do "common.pl";
- $times=shift||1;
+ do "chgk.cnf";
- die "Undefined \$maxwsize! check your chgk.cnf" unless $maxwsize;
+ $qlimit=shift||500;
+ $times=shift||1;
- require "check.pl";
+ die "Undefined \$maxwsize! check your chgk.cnf" unless $maxwsize;
- open (STDERR,">$stderr") if $stderr;
+ require "check.pl";
- open (UNKNOWN,">$unknown");
+ open (STDERR,">$stderr") if $stderr;
+ open (UNKNOWN,">$unknown");
- my $nf;
- #open WARN, ">$warnings";
+ my $nf;
- %forbidden=checktable('equalto')? getequalto : ();
+ #open WARN, ">$warnings";
- if ((uc 'а') ne 'А') {die "!Koi8-r locale not installed!\n"};
+ #%forbidden=checktable('equalto')? getequalto : ();
- getquestions(QuestionId, Question, Answer, Comments, Authors, Sources,"ProcessedBySearch IS NULL");
+ if ((uc 'а') ne 'А') {die "!Koi8-r locale not installed!\n"};
+ getquestions(QuestionId, Question, Answer, Comments, Authors, Sources,"ProcessedBySearch IS NULL");
- print "Loading dictionaries\n";
- die "No dictionaries! Check your chgk.cnf" unless scalar @dictionaries;
+ print "Loading dictionaries\n";
- foreach $d(@dictionaries)
- {
+ die "No dictionaries! Check your chgk.cnf" unless scalar @dictionaries;
-      print "Loading $d\n";
-      open (DICT, $d) || print "              Not found\n";
+ foreach $d(@dictionaries)
-      while ( <DICT> )
+ {
-      {
+      print "Loading $d\n";
-         chomp;
+      open (DICT, $d) || print "              Not found\n";
-         s/\s*$//;
+      while ( <DICT> )
-         ($aa,$b)=split(/\//);
+      {
-         $a= uc $aa;
+         chomp;
-         $words{$a}.=$b || "!";
+         s/\s*$//;
-      }
+         ($aa,$b)=split(/\//);
-      close(DICT);
+         $aa=~tr/ёЁ/еЕ/;
- }
+         $a= uc $aa;
+         $words{$a}.=$b || "!";
- die "No dictionaries found! Check your chgk.cnf" unless scalar keys %words;
+      }
+      close(DICT);
+ }
- print "Getting words...\n";
+ die "No dictionaries found! Check your chgk.cnf" unless scalar keys %words;
- print TIME "\t\t".(time-$initime)."\n";
- for my $commonI(1..$times)
+ print "Getting words...\n";
- {
+ print TIME "\t\t".(time-$initime)."\n";
+ my $broken=0;
- print "loading nests\n";
+ for my $commonI(1..$times)
- %nf=getnests;
+ {
+ last if $broken;
- %nfnumber=getnfnumbers;
+ print "loading nests\n";
- print "Getting words...\n";
+ %nf=getnests;
- getquestions(QuestionId, Question, Answer, Comments, Authors, Sources,"ProcessedBySearch IS NULL");
- $sch=0;
+ %nfnumber=getnfnumbers;
- while ((++$sch<=$qlimit) && (($id, @ss) = getrow, $id))
-                              # берём по одному вопросу
+ print "Getting words...\n";
-                              # и вешаем в массив, индексы которого -- словоформы,
-                              # а значения -- списки вопросов.
+ getquestions(QuestionId, Question, Answer, Comments, Authors, Sources,"ProcessedBySearch IS NULL");
- {
+ $sch=0;
+ while ((++$sch<=$qlimit) && (($id, @ss) = getrow, $id))
-    print "$sch $id\n" ;#unless (++$sch % 1);
+                              # берём по одному вопросу
-    searchmark($id);
+                              # и вешаем в массив, индексы которого -- словоформы,
-    if ($forbidden{$id}) {next};
+                              # а значения -- списки вопросов.
-    foreach $fieldnumber (0..$#ss) #перебираем поля
+ {
-    {
+    if (-e "RS_stop")  {unlink("RS_stop"); $broken=1;last}
-       $text=$ss[$fieldnumber];
+    print "$sch $id\n" unless ($sch % 10);#unless (++$sch % 1);
-       next unless $text;
+    searchmark($id);
-       $text=~tr/ёЁ/еЕ/;
+    if ($forbidden{$id}) {next};
-       $text=~s/(${RLrl})p(${RLrl})/$1p$2/gom;
+    foreach $fieldnumber (0..$#ss) #перебираем поля
-       $text=~s/p(${RLrl})/р$1/gom;
+    {
-       $text=~s/(${RLrl})p/$1р/gom;
+       $text=$ss[$fieldnumber];
-       $text=~s/\s+/ /gmo;
+       next unless $text;
-       @list= $text=~m/(?:(?:${RLrl})+)|(?:[A-Za-z0-9]+)/gom;
+       $text=~tr/ёЁ/еЕ/;
+       $text=~s/(${RLrl})p(${RLrl})/$1p$2/gom;
- foreach $wordnumber(0..$#list)
+       $text=~s/p(${RLrl})/р$1/gom;
-       {
+       $text=~s/(${RLrl})p/$1р/gom;
-            $word=uc $list[$wordnumber];
+       $text=~s/\s+/ /gmo;
-            next if length $word>$maxwsize;
+       @list= $text=~m/(?:(?:${RLrl})+)|(?:[A-Za-z0-9]+)/gom;
-            if (my $n=$nf{$word}||$newnf{$word})
-            {
+ foreach $wordnumber(0..$#list)
-                 @n= split ' ',$n;
+       {
-                 $nfkvo{$_}++ foreach @n;
+            $word=uc $list[$wordnumber];
-                 $neww2k{$_}.=packword($fieldnumber, $id,$wordnumber%256)
+            next if length $word>$maxwsize;
-                      foreach (@n);
+            if (my $n=$nf{$word}||$newnf{$word})
-            }
+            {
-            else {
+                 @n= split ' ',$n;
-              if ($word=~/^${RLrl}+$/o){ # Русское слово
+                 $nfkvo{$_}++ foreach @n;
-             # проанализировать по таблице аффиксов,
+                 my $r=packword($fieldnumber, $id,$wordnumber%256);
-             # проверить наличие начальных форм в
+                 $neww2k{$_}.=$r foreach (@n);
-             # nf, а если таких нет, то
+            }
-             # и по словарю.
+            else {
+              if ($word=~/^${RLrl}+$/o){ # Русское слово
-                $nf=&checkit(uc $word,\%words);
+             # проанализировать по таблице аффиксов,
-                if (!$nf) {
+             # проверить наличие начальных форм в
-                            $nf=(uc $word)."/!";
+             # nf, а если таких нет, то
-                            print UNKNOWN "$nf\n" if $unknown;
+             # и по словарю.
-                          }
+                $nf=&checkit(uc $word,\%words);
-               }  else {# нерусское слово
+                if (!$nf) {
-                          $nf=(uc $word)."/!";
+                            $nf=(uc $word)."/!";
-                          print UNKNOWN "$nf\n" if $unknown;
+                            print UNKNOWN "$nf\n" if $unknown;
-                       }
+                          }
-                foreach $n (split ' ', $nf)
+               }  else {# нерусское слово
-                {
+                          $nf=(uc $word)."/!";
-                    ($f,$flag)=split '/', $n;
+                          print UNKNOWN "$nf\n" if $unknown;
-                    if ($nfnumber=$nfnumber{$f})
+                       }
-                    {
-                        $newnf{$word}.=" $nfnumber";
+                foreach $n (split ' ', $nf)
-                        $nfkvo{$nfnumber}++;
+                {
-                        $a=\$neww2k{$nfnumber};
+                    ($f,$flag)=split '/', $n;
-                        $$a.=packword($fieldnumber, $id,$wordnumber);
+                    if ($nfnumber=$nfnumber{$f})
-                        if (length $$a>100) {flushw2k($nfnumber)}
+                    {
-                    }
+                        $newnf{$word}.=" $nfnumber";
-                    else
+                        $nfkvo{$nfnumber}++;
-                    {
+                        $a=\$neww2k{$nfnumber};
-                       $nfnumber=addnf(0, $f, $flag,1);
+                        my $r=packword($fieldnumber, $id,$wordnumber);
-                       $newnf{uc $word}.=" $nfnumber";
+                        $$a.=$r;
-                       $neww2k{$nfnumber}.=packword($fieldnumber, $id,$wordnumber);
+                        if (length $$a>100) {flushw2k($nfnumber)}
                     }
-                }
+                    else
-            }
+                    {
+                       $nfnumber=addnf(0, $f, $flag,1);
-       }
+                       $newnf{uc $word}.=" $nfnumber";
-    }
+                        my $r=packword($fieldnumber, $id,$wordnumber);
+                       $neww2k{$nfnumber}.=$r;
- }
+                    }
+                }
+            }
- print "Filling word2question...\n";
+       }
- foreach (keys %neww2k)
+    }
- {
-    updateword2question($_,$neww2k{$_});
+ }
-    delete $neww2k{$_};
- }
+ print "Filling word2question...\n";
- %neww2k=();
+ foreach (keys %neww2k)
- print "Filling nf...\n";
+ {
- $sch=0;
+    updateword2question($_,$neww2k{$_});
+    delete $neww2k{$_};
- incnf($_,$nfkvo{$_})  foreach (keys %nfkvo);
+ }
- %nfkvo=();
+ %neww2k=();
- print "Filling nests...\n";
+ print "Filling nf...\n";
  $sch=0;
+ incnf($_,$nfkvo{$_})  foreach (keys %nfkvo);
- foreach $w (keys %newnf)
- {
+ %nfkvo=();
-   print "$sch\n" unless (++$sch % 1000);
-   @nf=split ' ',$newnf{$w};
+ print "Filling nests...\n";
-   addnest($w,$_) foreach @nf;
+ $sch=0;
- }
- print "$sch nests added\n";
+ foreach $w (keys %newnf)
- print TIME "$commonI: \t$sch ";
+ {
- print TIME "\t".(time-$initime)."\n";
+   print "$sch\n" unless (++$sch % 1000);
- %newnf=();
+   @nf=split ' ',$newnf{$w};
+   addnest($w,$_) foreach @nf;
  }
+ print "$sch nests added\n";
- sub flushw2k
- {
+ print TIME "$commonI: \t$sch ";
-    my ($n)=@_;
+ print TIME "\t".(time-$initime)."\n";
-    updateword2question($n,$neww2k{$n});
+ %newnf=();
-    delete $neww2k{$_};
  }
+ sub flushw2k
+ {
+    my ($n)=@_;
+    updateword2question($n,$neww2k{$n});
+    delete $neww2k{$_};
+ }

FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>

Removed from v.1.1
changed lines
	Added in v.1.10