Annotation of db/prgsrc/updateRS1.pl, revision 1.8
1.2 boris 1: #!/usr/bin/perl -w
2:
3: =head1 NAME
4:
5: updateRS.pl - Скрипт для занесения в таблицы русского поиска базы
6: B<$base> информации о вопросах
7:
8: =head1 SYNOPSIS
9:
10: updateRS1.pl Questions_per_cicle cicles_number
11:
12: updateRS.pl Questions_per_cicle
13:
14: updateRS.pl
15:
16:
17: =head1 DESCRIPTION
18:
19: Скрипт ищет в таблице Questions вопросы с нулевым ProcessedBySearch,
20: добавляет информацию в таблицы word2question, nests, nf. Поле
21: ProcessedBySearch устанавливается в 1. Обрабатывает
22: Questions_per_cicle*cicles_number вопросов, сбрасывая
23: информацию в базу каждые Questions_per_cicle вопросов.
24: Значения по умолчанию:
25: Questions_per_cicle=500;
26: cicles_number=1.
27:
28:
29: =head1 AUTHOR
30:
31: Роман Семизаров
32:
33:
34: =cut
35:
36:
37: use locale;
38: use DBI;
39: use POSIX qw (locale_h);
1.3 boris 40: use lib "../lib";
1.2 boris 41: use chgkfiles;
42: use dbchgk;
43: open (STDERR,">errors");
44: my $initime=time;
45: open TIME, ">time";
46: do "common.pl";
47: do "chgk.cnf";
48:
49: $qlimit=shift||500;
50: $times=shift||1;
51:
52: die "Undefined \$maxwsize! check your chgk.cnf" unless $maxwsize;
53:
54: require "check.pl";
55:
56: open (STDERR,">$stderr") if $stderr;
57:
58:
59: open (UNKNOWN,">$unknown");
60:
61:
62:
63: my $nf;
64:
65: #open WARN, ">$warnings";
66:
1.5 roma7 67: #%forbidden=checktable('equalto')? getequalto : ();
1.2 boris 68:
69:
70: if ((uc 'а') ne 'А') {die "!Koi8-r locale not installed!\n"};
71:
72: getquestions(QuestionId, Question, Answer, Comments, Authors, Sources,"ProcessedBySearch IS NULL");
73:
74:
75:
76: print "Loading dictionaries\n";
77:
78: die "No dictionaries! Check your chgk.cnf" unless scalar @dictionaries;
79:
80: foreach $d(@dictionaries)
81: {
82: print "Loading $d\n";
83: open (DICT, $d) || print " Not found\n";
84: while ( <DICT> )
85: {
86: chomp;
87: s/\s*$//;
88: ($aa,$b)=split(/\//);
1.4 roma7 89: $aa=~tr/ёЁ/еЕ/;
1.2 boris 90: $a= uc $aa;
91: $words{$a}.=$b || "!";
92: }
93: close(DICT);
94: }
95:
96: die "No dictionaries found! Check your chgk.cnf" unless scalar keys %words;
97:
98:
99: print "Getting words...\n";
100:
101: print TIME "\t\t".(time-$initime)."\n";
102:
103: for my $commonI(1..$times)
104: {
105:
106:
107: print "loading nests\n";
108:
109: %nf=getnests;
110:
111: %nfnumber=getnfnumbers;
112:
113: print "Getting words...\n";
114:
115: getquestions(QuestionId, Question, Answer, Comments, Authors, Sources,"ProcessedBySearch IS NULL");
116: $sch=0;
117: while ((++$sch<=$qlimit) && (($id, @ss) = getrow, $id))
118: # берём по одному вопросу
119: # и вешаем в массив, индексы которого -- словоформы,
120: # а значения -- списки вопросов.
121: {
122:
123: print "$sch $id\n" ;#unless (++$sch % 1);
124: searchmark($id);
125: if ($forbidden{$id}) {next};
126: foreach $fieldnumber (0..$#ss) #перебираем поля
127: {
128: $text=$ss[$fieldnumber];
129: next unless $text;
130: $text=~tr/ёЁ/еЕ/;
131: $text=~s/(${RLrl})p(${RLrl})/$1p$2/gom;
132: $text=~s/p(${RLrl})/р$1/gom;
133: $text=~s/(${RLrl})p/$1р/gom;
134: $text=~s/\s+/ /gmo;
135: @list= $text=~m/(?:(?:${RLrl})+)|(?:[A-Za-z0-9]+)/gom;
136:
137: foreach $wordnumber(0..$#list)
138: {
139: $word=uc $list[$wordnumber];
140: next if length $word>$maxwsize;
141: if (my $n=$nf{$word}||$newnf{$word})
142: {
143: @n= split ' ',$n;
144: $nfkvo{$_}++ foreach @n;
1.6 roma7 145: my $r=packword($fieldnumber, $id,$wordnumber%256);
146: $neww2k{$_}.=$r foreach (@n);
1.2 boris 147: }
148: else {
149: if ($word=~/^${RLrl}+$/o){ # Русское слово
150: # проанализировать по таблице аффиксов,
151: # проверить наличие начальных форм в
152: # nf, а если таких нет, то
153: # и по словарю.
154:
155: $nf=&checkit(uc $word,\%words);
156: if (!$nf) {
157: $nf=(uc $word)."/!";
158: print UNKNOWN "$nf\n" if $unknown;
159: }
160:
161: } else {# нерусское слово
162: $nf=(uc $word)."/!";
163: print UNKNOWN "$nf\n" if $unknown;
164: }
165:
166: foreach $n (split ' ', $nf)
167: {
168: ($f,$flag)=split '/', $n;
169: if ($nfnumber=$nfnumber{$f})
170: {
171: $newnf{$word}.=" $nfnumber";
172: $nfkvo{$nfnumber}++;
173: $a=\$neww2k{$nfnumber};
1.6 roma7 174: my $r=packword($fieldnumber, $id,$wordnumber);
175: $$a.=$r;
1.2 boris 176: if (length $$a>100) {flushw2k($nfnumber)}
177: }
178: else
179: {
180: $nfnumber=addnf(0, $f, $flag,1);
181: $newnf{uc $word}.=" $nfnumber";
1.6 roma7 182: my $r=packword($fieldnumber, $id,$wordnumber);
183: $neww2k{$nfnumber}.=$r;
1.2 boris 184: }
185: }
186: }
187:
188: }
189: }
190:
191: }
192:
193:
194: print "Filling word2question...\n";
195:
196: foreach (keys %neww2k)
197: {
198: updateword2question($_,$neww2k{$_});
199: delete $neww2k{$_};
200: }
201:
202: %neww2k=();
203:
204: print "Filling nf...\n";
205: $sch=0;
206:
207: incnf($_,$nfkvo{$_}) foreach (keys %nfkvo);
208:
209: %nfkvo=();
210:
211: print "Filling nests...\n";
212: $sch=0;
213:
214:
215: foreach $w (keys %newnf)
216: {
217: print "$sch\n" unless (++$sch % 1000);
218: @nf=split ' ',$newnf{$w};
219: addnest($w,$_) foreach @nf;
220: }
221: print "$sch nests added\n";
222:
223: print TIME "$commonI: \t$sch ";
224: print TIME "\t".(time-$initime)."\n";
225: %newnf=();
226:
227: }
228:
229: sub flushw2k
230: {
231: my ($n)=@_;
232: updateword2question($n,$neww2k{$n});
233: delete $neww2k{$_};
234: }
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>