--- db/prgsrc/updateRS1.pl	2001/10/31 03:00:10	1.1
+++ db/prgsrc/updateRS1.pl	2001/10/31 03:07:29	1.2
@@ -1,230 +1,230 @@
-#!/usr/local/bin/perl -w
-
-=head1 NAME
-
-updateRS.pl - Скрипт для занесения в таблицы русского поиска базы 
-B<$base> информации о вопросах 
-
-=head1 SYNOPSIS
-
-updateRS1.pl Questions_per_cicle cicles_number
-
-updateRS.pl Questions_per_cicle
-
-updateRS.pl
-
-
-=head1 DESCRIPTION
-
-Скрипт ищет в таблице Questions вопросы с нулевым ProcessedBySearch,
-добавляет информацию в таблицы word2question, nests, nf. Поле 
-ProcessedBySearch устанавливается в 1. Обрабатывает 
-Questions_per_cicle*cicles_number вопросов, сбрасывая 
-информацию в базу каждые Questions_per_cicle вопросов.
-Значения по умолчанию:
-Questions_per_cicle=500;
-cicles_number=1.
-
-
-=head1 AUTHOR
-
-Роман Семизаров
-
-
-=cut
-
-
-use locale;
-use DBI;
-use POSIX qw (locale_h);
-use chgkfiles;
-use dbchgk;
-open (STDERR,">errors");
-my $initime=time;
-open TIME, ">time"; 
-do "common.pl";
-do "chgk.cnf";
-
-$qlimit=shift||500;
-$times=shift||1;
-
-die "Undefined \$maxwsize! check your chgk.cnf" unless $maxwsize;
-
-require "check.pl";
-
-open (STDERR,">$stderr") if $stderr;
-
-
-open (UNKNOWN,">$unknown");
-
-
-          
-my $nf;
-
-#open WARN, ">$warnings";
-
-%forbidden=checktable('equalto')? getequalto : ();
-
-
-if ((uc 'а') ne 'А') {die "!Koi8-r locale not installed!\n"};
-
-getquestions(QuestionId, Question, Answer, Comments, Authors, Sources,"ProcessedBySearch IS NULL");
-
-
-
-print "Loading dictionaries\n";
-
-die "No dictionaries! Check your chgk.cnf" unless scalar @dictionaries;
-
-foreach $d(@dictionaries)
-{
-     print "Loading $d\n";
-     open (DICT, $d) || print "              Not found\n";
-     while ( <DICT> )
-     {
-     	chomp;
-     	s/\s*$//;
-       	($aa,$b)=split(/\//);
-    	$a= uc $aa;
-    	$words{$a}.=$b || "!";
-     }
-     close(DICT);
-}
-
-die "No dictionaries found! Check your chgk.cnf" unless scalar keys %words;
-
-
-print "Getting words...\n";
-
-print TIME "\t\t".(time-$initime)."\n";
-
-for my $commonI(1..$times)
-{
-
-
-print "loading nests\n";
-
-%nf=getnests;
-
-%nfnumber=getnfnumbers;
-
-print "Getting words...\n";
-
-getquestions(QuestionId, Question, Answer, Comments, Authors, Sources,"ProcessedBySearch IS NULL");
-$sch=0;
-while ((++$sch<=$qlimit) && (($id, @ss) = getrow, $id)) 
-                             # берём по одному вопросу
-                             # и вешаем в массив, индексы которого -- словоформы,
-                             # а значения -- списки вопросов. 
-{
-
-   print "$sch $id\n" ;#unless (++$sch % 1);
-   searchmark($id);
-   if ($forbidden{$id}) {next};
-   foreach $fieldnumber (0..$#ss) #перебираем поля
-   {
-      $text=$ss[$fieldnumber];
-      next unless $text;
-      $text=~tr/ёЁ/еЕ/;
-      $text=~s/(${RLrl})p(${RLrl})/$1p$2/gom;
-      $text=~s/p(${RLrl})/р$1/gom;
-      $text=~s/(${RLrl})p/$1р/gom;
-      $text=~s/\s+/ /gmo;
-      @list= $text=~m/(?:(?:${RLrl})+)|(?:[A-Za-z0-9]+)/gom;
-
-foreach $wordnumber(0..$#list)
-      {
-           $word=uc $list[$wordnumber];
-           next if length $word>$maxwsize;
-           if (my $n=$nf{$word}||$newnf{$word}) 
-           {
-                @n= split ' ',$n;
-                $nfkvo{$_}++ foreach @n;
-                $neww2k{$_}.=packword($fieldnumber, $id,$wordnumber%256)
-                     foreach (@n);
-           }
-           else {
-             if ($word=~/^${RLrl}+$/o){ # Русское слово
-            # проанализировать по таблице аффиксов, 
-            # проверить наличие начальных форм в 
-            # nf, а если таких нет, то 
-            # и по словарю. 
-              
-               $nf=&checkit(uc $word,\%words);
-               if (!$nf) {
-                           $nf=(uc $word)."/!";
-                           print UNKNOWN "$nf\n" if $unknown;
-                         }
-
-              }  else {# нерусское слово
-                         $nf=(uc $word)."/!";
-                         print UNKNOWN "$nf\n" if $unknown;
-                      }
-
-               foreach $n (split ' ', $nf)
-               {
-                   ($f,$flag)=split '/', $n;
-                   if ($nfnumber=$nfnumber{$f})
-                   {
-                       $newnf{$word}.=" $nfnumber";
-                       $nfkvo{$nfnumber}++;
-                       $a=\$neww2k{$nfnumber};
-                       $$a.=packword($fieldnumber, $id,$wordnumber);
-                       if (length $$a>100) {flushw2k($nfnumber)}
-                   }
-                   else 
-                   {
-                      $nfnumber=addnf(0, $f, $flag,1);
-                      $newnf{uc $word}.=" $nfnumber";
-                      $neww2k{$nfnumber}.=packword($fieldnumber, $id,$wordnumber);
-                   }
-               }
-           }
-
-      }
-   }
-
-}
-
-
-print "Filling word2question...\n";
-
-foreach (keys %neww2k)
-{ 
-   updateword2question($_,$neww2k{$_});
-   delete $neww2k{$_};
-}
-
-%neww2k=();
-
-print "Filling nf...\n";
-$sch=0;
-
-incnf($_,$nfkvo{$_})  foreach (keys %nfkvo);
-
-%nfkvo=();
-
-print "Filling nests...\n";
-$sch=0;
-
-
-foreach $w (keys %newnf)
-{
-  print "$sch\n" unless (++$sch % 1000);
-  @nf=split ' ',$newnf{$w};
-  addnest($w,$_) foreach @nf;
-}
-print "$sch nests added\n";
-
-print TIME "$commonI: \t$sch ";
-print TIME "\t".(time-$initime)."\n";
-%newnf=();
-
-}
-
-sub flushw2k
-{
-   my ($n)=@_;
-   updateword2question($n,$neww2k{$n});
-   delete $neww2k{$_};
-}
+#!/usr/bin/perl -w
+
+=head1 NAME
+
+updateRS.pl - Скрипт для занесения в таблицы русского поиска базы 
+B<$base> информации о вопросах 
+
+=head1 SYNOPSIS
+
+updateRS1.pl Questions_per_cicle cicles_number
+
+updateRS.pl Questions_per_cicle
+
+updateRS.pl
+
+
+=head1 DESCRIPTION
+
+Скрипт ищет в таблице Questions вопросы с нулевым ProcessedBySearch,
+добавляет информацию в таблицы word2question, nests, nf. Поле 
+ProcessedBySearch устанавливается в 1. Обрабатывает 
+Questions_per_cicle*cicles_number вопросов, сбрасывая 
+информацию в базу каждые Questions_per_cicle вопросов.
+Значения по умолчанию:
+Questions_per_cicle=500;
+cicles_number=1.
+
+
+=head1 AUTHOR
+
+Роман Семизаров
+
+
+=cut
+
+
+use locale;
+use DBI;
+use POSIX qw (locale_h);
+use chgkfiles;
+use dbchgk;
+open (STDERR,">errors");
+my $initime=time;
+open TIME, ">time"; 
+do "common.pl";
+do "chgk.cnf";
+
+$qlimit=shift||500;
+$times=shift||1;
+
+die "Undefined \$maxwsize! check your chgk.cnf" unless $maxwsize;
+
+require "check.pl";
+
+open (STDERR,">$stderr") if $stderr;
+
+
+open (UNKNOWN,">$unknown");
+
+
+          
+my $nf;
+
+#open WARN, ">$warnings";
+
+%forbidden=checktable('equalto')? getequalto : ();
+
+
+if ((uc 'а') ne 'А') {die "!Koi8-r locale not installed!\n"};
+
+getquestions(QuestionId, Question, Answer, Comments, Authors, Sources,"ProcessedBySearch IS NULL");
+
+
+
+print "Loading dictionaries\n";
+
+die "No dictionaries! Check your chgk.cnf" unless scalar @dictionaries;
+
+foreach $d(@dictionaries)
+{
+     print "Loading $d\n";
+     open (DICT, $d) || print "              Not found\n";
+     while ( <DICT> )
+     {
+     	chomp;
+     	s/\s*$//;
+       	($aa,$b)=split(/\//);
+    	$a= uc $aa;
+    	$words{$a}.=$b || "!";
+     }
+     close(DICT);
+}
+
+die "No dictionaries found! Check your chgk.cnf" unless scalar keys %words;
+
+
+print "Getting words...\n";
+
+print TIME "\t\t".(time-$initime)."\n";
+
+for my $commonI(1..$times)
+{
+
+
+print "loading nests\n";
+
+%nf=getnests;
+
+%nfnumber=getnfnumbers;
+
+print "Getting words...\n";
+
+getquestions(QuestionId, Question, Answer, Comments, Authors, Sources,"ProcessedBySearch IS NULL");
+$sch=0;
+while ((++$sch<=$qlimit) && (($id, @ss) = getrow, $id)) 
+                             # берём по одному вопросу
+                             # и вешаем в массив, индексы которого -- словоформы,
+                             # а значения -- списки вопросов. 
+{
+
+   print "$sch $id\n" ;#unless (++$sch % 1);
+   searchmark($id);
+   if ($forbidden{$id}) {next};
+   foreach $fieldnumber (0..$#ss) #перебираем поля
+   {
+      $text=$ss[$fieldnumber];
+      next unless $text;
+      $text=~tr/ёЁ/еЕ/;
+      $text=~s/(${RLrl})p(${RLrl})/$1p$2/gom;
+      $text=~s/p(${RLrl})/р$1/gom;
+      $text=~s/(${RLrl})p/$1р/gom;
+      $text=~s/\s+/ /gmo;
+      @list= $text=~m/(?:(?:${RLrl})+)|(?:[A-Za-z0-9]+)/gom;
+
+foreach $wordnumber(0..$#list)
+      {
+           $word=uc $list[$wordnumber];
+           next if length $word>$maxwsize;
+           if (my $n=$nf{$word}||$newnf{$word}) 
+           {
+                @n= split ' ',$n;
+                $nfkvo{$_}++ foreach @n;
+                $neww2k{$_}.=packword($fieldnumber, $id,$wordnumber%256)
+                     foreach (@n);
+           }
+           else {
+             if ($word=~/^${RLrl}+$/o){ # Русское слово
+            # проанализировать по таблице аффиксов, 
+            # проверить наличие начальных форм в 
+            # nf, а если таких нет, то 
+            # и по словарю. 
+              
+               $nf=&checkit(uc $word,\%words);
+               if (!$nf) {
+                           $nf=(uc $word)."/!";
+                           print UNKNOWN "$nf\n" if $unknown;
+                         }
+
+              }  else {# нерусское слово
+                         $nf=(uc $word)."/!";
+                         print UNKNOWN "$nf\n" if $unknown;
+                      }
+
+               foreach $n (split ' ', $nf)
+               {
+                   ($f,$flag)=split '/', $n;
+                   if ($nfnumber=$nfnumber{$f})
+                   {
+                       $newnf{$word}.=" $nfnumber";
+                       $nfkvo{$nfnumber}++;
+                       $a=\$neww2k{$nfnumber};
+                       $$a.=packword($fieldnumber, $id,$wordnumber);
+                       if (length $$a>100) {flushw2k($nfnumber)}
+                   }
+                   else 
+                   {
+                      $nfnumber=addnf(0, $f, $flag,1);
+                      $newnf{uc $word}.=" $nfnumber";
+                      $neww2k{$nfnumber}.=packword($fieldnumber, $id,$wordnumber);
+                   }
+               }
+           }
+
+      }
+   }
+
+}
+
+
+print "Filling word2question...\n";
+
+foreach (keys %neww2k)
+{ 
+   updateword2question($_,$neww2k{$_});
+   delete $neww2k{$_};
+}
+
+%neww2k=();
+
+print "Filling nf...\n";
+$sch=0;
+
+incnf($_,$nfkvo{$_})  foreach (keys %nfkvo);
+
+%nfkvo=();
+
+print "Filling nests...\n";
+$sch=0;
+
+
+foreach $w (keys %newnf)
+{
+  print "$sch\n" unless (++$sch % 1000);
+  @nf=split ' ',$newnf{$w};
+  addnest($w,$_) foreach @nf;
+}
+print "$sch nests added\n";
+
+print TIME "$commonI: \t$sch ";
+print TIME "\t".(time-$initime)."\n";
+%newnf=();
+
+}
+
+sub flushw2k
+{
+   my ($n)=@_;
+   updateword2question($n,$neww2k{$n});
+   delete $neww2k{$_};
+}