1
#!/usr/bin/perl
2
# vim: set ai et sw=3 ts=3 nu:
3
#
4
# Updates Solr with repository metadata
5
#
6
# by Pascal Bleser <pascal.bleser@opensuse.org>
7
#
8
#     This library is free software; you can redistribute it and/or modify it
9
#     under the terms of the GNU Lesser General Public License as published by
10
#     the Free Software Foundation; either version 2.1 of the License, or (at
11
#     your option) any later version.
12
#                 
13
#     This library is distributed in the hope that it will be useful, but
14
#     WITHOUT ANY WARRANTY; without even the implied warranty of
15
#     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
16
#     Lesser General Public License for more details.
17
#      
18
#     You should have received a copy of the GNU Lesser General Public
19
#     License along with this library; if not, write to the Free Software
20
#     Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307,
21
#     USA.
22
23
use strict;
24
use warnings;
25
use LWP::UserAgent;
26
use HTTP::Date;
27
use File::Spec;
28
use File::Basename;
29
use HTML::Entities ();
30
use XML::LibXML;
31
use POSIX;
32
use IO::Uncompress::Gunzip;
33
use Term::ProgressBar;
34
use WebService::Solr;
35
use Getopt::Long;
36
37
use lib './lib';
38
use RPM_MD;
39
use YaST2_MD;
40
41
my $repos = "./repos.d";
42
my $cache_dir = "./cache.d";
43
my $verbose = undef;
44
my $force = undef;
45
my $validate_introspect = undef;
46
47
GetOptions(
48
   'v|verbose' => \$verbose,
49
   'f|force'   => \$force,
50
);
51
52
my $solr_escape_chars = quotemeta( '+-&|!(){}[]^"~*?:\\' );
53
my @repos = ();
54
55
my @rfiles = ();
56
if (scalar(@ARGV) > 0) {
57
   push(@rfiles, @ARGV);
58
} else {
59
   @rfiles = grep { -f } glob($repos.'/*.conf');
60
}
61
62
foreach my $rfile (@rfiles) {
63
   open(my $fh, '<', $rfile) or die "failed to open $rfile: $!";
64
   while (<$fh>) {
65
      chomp;
66
      s/#.*$//;
67
      s/^\s*//;
68
      s/\s*//;
69
      next if /^$/;
70
      if (/^(\S+)\s+(\S+)\s+(\S+)(?:\s+(\S+))?(?:\s+(\S+))?$/) {
71
         my $r = {
72
            repoid         => $1,
73
            distribution   => $2,
74
            baseurl        => $3,
75
            configfile     => $rfile,
76
         };
77
         $r->{mdtype} = defined $4 ? $4 : 'rpmmd';
78
         if (defined $5) {
79
            my %flags = map { $_ => 1 } split(/\s*,\s*/, $5);
80
            $r->{flags} = \%flags;
81
         } else {
82
            $r->{flags} = {};
83
         }
84
85
         push(@repos, $r);
86
      } else {
87
         die "invalid repo spec in $rfile at line $.";
88
      }
89
   }
90
   close($fh);
91
}
92
93
my $ua = LWP::UserAgent->new(
94
   timeout      => 10,
95
   agent        => "webpin-repomanager/1.0",
96
   max_redirect => 4,
97
);
98
$ua->env_proxy();
99
100
my $solr = WebService::Solr->new("http://localhost:8983/solr", {
101
    autocommit => 0,
102
});
103
$solr->ping() or die "failed to ping Solr";
104
105
sub f($$) {
106
    my $name = shift;
107
    my $value = shift;
108
    my $field = WebService::Solr::Field->new($name => $value);
109
    return $field;
110
}
111
112
sub solr_escape($) {
113
   my $v = shift;
114
   die "null value passed to solr_escape" unless defined $v;
115
   $v =~ s{([$solr_escape_chars])}{\\$1}g;
116
   return $v;
117
}
118
119
my @gone = ();
120
my $total = 0;
121
foreach my $r (@repos) {
122
   print $r->{repoid}, "\n" if $verbose;
123
124
   my $cache = File::Spec->catfile($cache_dir, $r->{repoid}.".cache");
125
   {
126
      my $dir = dirname($cache);
127
      mkdir($dir, 0750) unless -d $dir;
128
   }
129
130
   my $timestamp = undef;
131
   my $last_modified = undef;
132
   my $etag = undef;
133
   {
134
      if (-e $cache) {
135
         open(my $fh, '<', $cache) or die "failed to open cache file $cache: $!";
136
         chomp($timestamp = <$fh>);
137
         chomp($last_modified = <$fh>);
138
         chomp($etag = <$fh>);
139
         close($fh);
140
      }
141
   }
142
143
   my $h = {
144
      last_modified => $last_modified,
145
      etag          => $etag,
146
      timestamp     => $timestamp,
147
   };
148
149
   my @docs = ();
150
   my $packages = undef;
151
   my $repoheaders = undef;
152
   {
153
      my $pr = undef;
154
      if ($r->{mdtype} eq 'rpmmd' or $r->{mdtype} eq 'rpm-md') {
155
         eval {
156
            $pr = parse_rpmmd($r, $h, $ua, $verbose);
157
            #($packages, $repoheaders) = parse_rpmmd($r, $h, $ua, $verbose);
158
         };
159
         if ($@) {
160
            warn "failed to parse repository ".$r->{repoid}.": ".$@;
161
            next;
162
         }
163
      } elsif ($r->{mdtype} eq 'yast2') {
164
         eval {
165
            $pr = parse_y2md($r, $h, $ua, $verbose);
166
         };
167
         if ($@) {
168
            warn "failed to parse repository ".$r->{repoid}.": ".$@;
169
            next;
170
         }
171
      } else {
172
         warn "unsupported repository type \"".$r->{mdtype}."\"";
173
         next;
174
      }
175
176
      if (ref($pr) eq 'ARRAY') {
177
         $packages = $pr->[0];
178
         $repoheaders = $pr->[1];
179
      } elsif (ref($pr) eq 'HASH') {
180
         push(@gone, $r);
181
         next;
182
      } elsif (not defined($pr)) {
183
         next;
184
      } else {
185
         warn "unsupported scalar returned by parser: $pr";
186
         next;
187
      }
188
189
      if ($validate_introspect) {
190
         foreach my $p (@$packages) {
191
            die "missing summary in ".join('-', map { $p->{$_} } qw(name version release arch)) unless exists $p->{summary};
192
            while (my ($k, $v) = each(%$p)) {
193
               die "found undef for $k in package ".join('-', map { $p->{$_} } qw(name version release arch)) unless defined $v;
194
               if (ref($v) eq 'ARRAY') {
195
                  foreach (@$v) {
196
                     die "found undef in list $k in package ".join('-', map { $p->{$_} } qw(name version release arch)) unless defined $_;
197
                  }
198
               }
199
            }
200
         }
201
         foreach my $p (@$packages) {
202
            if (not exists $p->{id} or not defined $p->{id}) {
203
               use Data::Dumper;
204
               die "no id: " . Dumper($p);
205
            }
206
         }
207
      }
208
209
      my %source_rpm_index = ();
210
      foreach my $p (@$packages) {
211
         if ($p->{arch} eq "src") {
212
            my $k = basename($p->{location});
213
            die "no id for $k" unless exists $p->{id} and defined $p->{id};
214
            $source_rpm_index{$k} = $p;
215
         }
216
      }
217
218
      foreach my $p (@$packages) {
219
         foreach (qw(repoid distribution)) {
220
            $p->{$_} = $r->{$_};
221
         }
222
         $p->{repourl} = $r->{baseurl};
223
224
         if (exists $p->{sourcerpm} and $p->{arch} ne "src" and $p->{arch} ne "nosrc") {
225
            my $srpm = $p->{sourcerpm};
226
            my $s = $source_rpm_index{$srpm};
227
            if (defined $s) {
228
               my $sid = $s->{id};
229
               die "undefined id for $srpm" unless defined $sid;
230
               $p->{sourcerpmid} = $sid;
231
            } else {
232
               #warn "failed to find source rpm $srpm";
233
            }
234
         }
235
236
         {
237
            my $g = $p->{rpmgroup};
238
            delete $p->{rpmgroup};
239
            $p->{group_exact} = $g;
240
            $p->{group_last} = $g;
241
         }
242
243
         # post-process
244
         if (exists $p->{description} and defined $p->{description}) {
245
            $p->{description} =~ s/\s*\bAuthors?:?.*$//ms;
246
         }
247
         
248
         if (exists $p->{packager} and defined $p->{packager}) {
249
            $p->{packager} =~ s/\s*<.+@.+>//;
250
            $p->{packager} =~ s/\w.+@.+\w//;
251
            $p->{packager} = HTML::Entities::encode_numeric($p->{packager});
252
         }
253
254
         {
255
            foreach my $tag (qw(requires provides)) {
256
               my @pp = grep { not /^(rpmlib|libc\.so|debuginfo\()/ } @{$p->{$tag}};
257
               $p->{$tag} = \@pp;
258
            }
259
         }
260
261
         $p->{mime} = [];
262
         $p->{perl} = [];
263
         foreach ($p->{provides}) {
264
            push(@{$p->{mime}}, $1) if /^(?:mimetype|mimehandler)\(.+?\)/;
265
            push(@{$p->{perl}}, $1) if /^perl\(.+?\)/;
266
         }
267
268
         $p->{tag} = [];
269
         push(@{$p->{tag}}, 'doc') if $p->{name} =~ /-doc$/;
270
         push(@{$p->{tag}}, 'lang') if $p->{name} =~ /-lang$/;
271
         push(@{$p->{tag}}, 'devel') if $p->{name} =~ /-devel$/;
272
         push(@{$p->{tag}}, 'perl') if $p->{name} =~ /^perl-\D$/;
273
         push(@{$p->{tag}}, 'python') if $p->{name} =~ /^python-\D$/;
274
         push(@{$p->{tag}}, 'ruby') if $p->{name} =~ /^ruby(gem)?-\D$/;
275
         push(@{$p->{tag}}, 'lib') if $p->{name} =~ /^lib/;
276
         push(@{$p->{tag}}, 'debug') if $p->{name} =~ /\-debug(info|source)$/;
277
         push(@{$p->{tag}}, 'src') if $p->{arch} eq "src" or $p->{arch} eq "nosrc";
278
279
         # make a Solr document from that
280
         my @fields = ();
281
         while (my ($k, $v) = each(%$p)) {
282
            next if $k eq 'configfile';
283
284
            if (ref($v) eq 'ARRAY') {
285
               foreach (@$v) {
286
                  die "undef found for $k in ".join("-", ($p->{name}, $p->{version}, $p->{release})) if not defined $_;
287
                  push(@fields, f($k, $_));
288
               }
289
            } elsif (ref($v) eq '') {
290
               push(@fields, f($k, $v));
291
            } else {
292
               die "wtf, a ref ? ($k)";
293
            }
294
         }
295
         my $doc = WebService::Solr::Document->new;
296
         $doc->add_fields(@fields);
297
         push(@docs, $doc);
298
      }
299
300
      my $solr_repoid = solr_escape($r->{repoid});
301
302
      my @missing;
303
      my @newones;
304
      {
305
         # load all the documents in Solr for the current repoid
306
         my $exres = $solr->search('', {
307
            'q.alt'  => 'repoid:'.$solr_repoid,
308
            'fl'     => 'id,name,version,release,arch,sha',
309
            'rows'   => '9999999',
310
            'start'  => 0,
311
         });
312
         my @exdocs = $exres->docs;
313
314
         {
315
            my %index = map { $_->value_for('sha') => 1 } @docs;
316
            @missing = grep { not exists $index{$_->value_for('sha')} } @exdocs;
317
         }
318
         {
319
            my %index = map { $_->value_for('sha') => 1 } @exdocs;
320
            @newones = grep { not exists $index{$_->value_for('sha')} } @docs;
321
         }
322
      }
323
      if ($verbose) {
324
         print "\n";
325
         print "\t", scalar(@newones), " new packages", "\n";
326
         print "\t", scalar(@missing), " packages have gone missing", "\n";
327
      }
328
329
      if (scalar(@missing) > 0) {
330
         print "\t", "deleting ", scalar(@missing), " packages", "\n" if $verbose;
331
         foreach my $p (@missing) {
332
            my $solr_id = solr_escape($p->value_for('id'));
333
            $solr->delete_by_query('id:'.$solr_id) or die "failed to delete id:".$solr_id;
334
         }
335
      }
336
      if (scalar(@newones) > 0) {
337
         my $progress = undef;
338
         if ($verbose) {
339
            print "\t", "adding ", scalar(@newones), " documents to Solr", "\n";
340
            $progress = Term::ProgressBar->new({
341
               count => scalar(@newones),
342
               name  => "adding to Solr",
343
               ETA   => 'linear',
344
            });
345
            $progress->minor(0);
346
         }
347
         my $chunk = [];
348
         my $i = 0;
349
         foreach my $d (@newones) {
350
            push(@$chunk, $d);
351
            if (($i % 10) == 0) {
352
               $solr->add($chunk, { overwrite => 1 });
353
               $chunk = [];
354
               $progress->update($i) if $progress;
355
            }
356
            $i++;
357
         }
358
         if (scalar(@$chunk) > 0) {
359
            $solr->add($chunk, { overwrite => 1 });
360
         }
361
         $progress->update(scalar(@newones)) if $progress;
362
363
         #$solr->add(\@newones, { overwrite => 1 });
364
         $total += scalar(@newones);
365
      }
366
      print "\t", "committing Solr", "\n" if $verbose;
367
      $solr->commit();
368
      print "\n" if $verbose;
369
   }
370
371
   # save to cache
372
   if (exists $repoheaders->{timestamp} and defined $repoheaders->{timestamp} and exists $repoheaders->{last_modified} and defined $repoheaders->{last_modified}) {
373
      open(my $fh, '>', $cache) or die "failed to open cache for write: $cache: $!";
374
      print $fh $repoheaders->{timestamp}, "\n";
375
      print $fh $repoheaders->{last_modified}, "\n";
376
      if (exists $repoheaders->{etag} and defined $repoheaders->{etag}) {
377
         print $fh $repoheaders->{etag}, "\n"; #->header("etag"), "\n";
378
      } else {
379
         print $fh "\n";
380
      }
381
      close($fh);
382
      print "\t", "saved cache to ", $cache, "\n" if $verbose;
383
   }
384
385
}
386
387
if ($total > 0) {
388
   print "\n", "optimizing Solr index", "\n" if $verbose;
389
   $solr->optimize();
390
}
391
392
if (scalar(@gone) > 0) {
393
   print "The following repositories have disappeared:", "\n";
394
   foreach my $r (@gone) {
395
      print join("    ", map { $r->{$_} } qw(configfile repoid distribution baseurl)), "\n";
396
   }
397
}