summary refs log tree commit diff
diff options
context:
space:
mode:
author Robin H. Johnson <robbat2@gentoo.org> 2008-03-20 17:40:37 -0700
committer Robin H. Johnson <robbat2@gentoo.org> 2008-03-21 20:13:06 -0700
commit 9a14f7fe3eb10585ead82f99a43802582ee4d5ba (patch)
tree 76f49810ccd16b407ceaec1b8449e3c193c3466d /read-index.pl
parent Alter the fields slightly. (diff)
download distindex-9a14f7fe3eb10585ead82f99a43802582ee4d5ba.tar.gz
distindex-9a14f7fe3eb10585ead82f99a43802582ee4d5ba.tar.bz2
distindex-9a14f7fe3eb10585ead82f99a43802582ee4d5ba.zip
Add index creation script.
Diffstat (limited to 'read-index.pl')
-rw-r--r-- read-index.pl 64
1 files changed, 59 insertions, 5 deletions
diff --git a/read-index.pl b/read-index.pl
index c279750..3041a1c 100644
--- a/read-index.pl
+++ b/read-index.pl
@@ -2,11 +2,17 @@
use strict;
use warnings;
+use Lucene;
+use File::Basename;
+
+# Lucene stuff by Robin H. Johnson <robbat2@gentoo.org>
+
+
my $filename = "sample.out";
open(my $fh, $filename) or die "could not open $filename";
-my %documents;
+my %rawdocs;
while (my $line=<$fh>) {
$line =~ /File-([^-]+)-([^:]+): ([^\n]*)\n/s;
my $fileid = $1; # numeric or "dist"
@@ -15,11 +21,11 @@ while (my $line=<$fh>) {
#print "Fileid: ". $fileid . "\n";
#print "field: ". $field . "\n";
#print "Value: ". $value . "\n";
-
- if ( ! $documents{$fileid} ) {
- $documents{$fileid} = { $field => $value };
+
+ if ( ! $rawdocs{$fileid} ) {
+ $rawdocs{$fileid} = { $field => $value };
} else {
- $documents{$fileid}{$field} = $value;
+ $rawdocs{$fileid}{$field} = $value;
}
}
close($fh);
@@ -41,3 +47,51 @@ close($fh);
# i would split up by [/.-_] at least. technically, using
# (\W|_|\d) as the class of split characters might be reasonable
+# Set up the Lucene index writer that stores its files under ./data.
+# NOTE(review): the trailing 1 passed to IndexWriter->new is presumably the
+# "create a fresh index" flag, and the 0 passed to getDirectory presumably
+# means "do not create" -- confirm both against the Lucene binding's docs.
+my $analyzer = Lucene::Analysis::Standard::StandardAnalyzer->new();
+# Only call mkdir when the directory is missing, so that a genuine failure
+# (permissions, a plain file in the way) is reported here with $! instead of
+# surfacing later as an obscure error inside Lucene.
+unless (-d "data") {
+    mkdir "data" or die "could not create index directory data: $!";
+}
+my $store = Lucene::Store::FSDirectory->getDirectory("data", 0);
+my $writer = Lucene::Index::IndexWriter->new($store, $analyzer, 1);
+# Writer tuning knobs; the values are kept exactly as originally chosen.
+$writer->setMergeFactor(100);
+$writer->setUseCompoundFile(0);
+$writer->setMaxFieldLength(2048);
+$writer->setMinMergeDocs(10);
+$writer->setMaxMergeDocs(100);
+
+# Add Documents here
+# Build a Lucene::Document for one parsed record.
+#
+#   $distfile - value stored in the "distfile" field of every document
+#   $rawdoc   - hashref of field name => value for a single record
+#
+# A record with a true "isdistfile" value gets the fields origin, cat, pn and
+# cpv (as Text) and pv, pr and pf (as Keyword). Any other record gets "path",
+# "filename" and "directory" fields derived from its "name". The md5, sha1,
+# mtime and size fields are added for both kinds. Every optional field is
+# skipped when the record does not define it. Returns the new document.
+sub createdoc {
+    my ($distfile, $rawdoc) = @_;
+    # An undefined value is simply false here, so no defined() test is needed.
+    my $isdist = $rawdoc->{isdistfile} ? 1 : 0;
+    my $doc = Lucene::Document->new;
+    $doc->add(Lucene::Document::Field->Text("distfile", $distfile));
+    $doc->add(Lucene::Document::Field->Keyword("isdistfile", $isdist));
+    if ($isdist) {
+        for my $f (qw(origin cat pn cpv)) {
+            $doc->add(Lucene::Document::Field->Text($f, $rawdoc->{$f})) if defined($rawdoc->{$f});
+        }
+        for my $f (qw(pv pr pf)) {
+            $doc->add(Lucene::Document::Field->Keyword($f, $rawdoc->{$f})) if defined($rawdoc->{$f});
+        }
+    } else {
+        my $name = $rawdoc->{name};
+        # File::Basename croaks on an undefined path, which would abort the
+        # whole indexing run for one incomplete record; skip the path-derived
+        # fields instead, like every other optional field.
+        if (defined $name) {
+            $doc->add(Lucene::Document::Field->Text("path", $name));
+            $doc->add(Lucene::Document::Field->Text("filename", basename($name)));
+            $doc->add(Lucene::Document::Field->Text("directory", dirname($name)));
+        }
+    }
+    for my $f (qw(md5 sha1 mtime size)) {
+        $doc->add(Lucene::Document::Field->Keyword($f, $rawdoc->{$f})) if defined($rawdoc->{$f});
+    }
+    return $doc;
+}
+
+# Test for the "dist" record with exists() before reading its name: a bare
+# $rawdocs{dist}{name} would autovivify an empty "dist" entry when none was
+# parsed, and the loop below would then index it as a bogus document.
+my $distfile = exists($rawdocs{dist}) ? $rawdocs{dist}{name} : undef;
+# Every document carries the distfile name, so indexing without one would
+# only produce unusable entries.
+defined $distfile or die "no dist record with a name found in $filename";
+foreach my $fileid (keys(%rawdocs)) {
+    # Print each record id as it is indexed.
+    printf "%s\n", $fileid;
+    my $doc = createdoc($distfile, $rawdocs{$fileid});
+    $writer->addDocument($doc);
+}
+
+# End of Document adding
+# Finish the index: run the writer's optimize pass, close the writer, then
+# drop the last reference to it.
+$writer->optimize;
+$writer->close();
+$writer = undef;
+