mirror of
https://github.com/yacy/yacy_search_server.git
synced 2025-07-19 08:44:42 -04:00
first try to implement a rci-computation from cr-files
git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@1103 6c8d7289-2bf4-0310-a012-ef5d649a1542
This commit is contained in:
bin
source/de/anomic
@ -1,2 +1,3 @@
|
||||
cd `dirname $0`/..
|
||||
java -classpath source:classes de.anomic.plasma.plasmaRankingCRFile -accumulate .
|
||||
java -classpath source:classes de.anomic.plasma.plasmaRankingCRProcess -accumulate .
|
||||
java -classpath source:classes de.anomic.plasma.plasmaRankingCRProcess -recycle . 168
|
||||
|
2
bin/cr_genrci
Executable file
2
bin/cr_genrci
Executable file
@ -0,0 +1,2 @@
|
||||
# Generate the reverse citation index (RCI) from the accumulated CR file.
# Change to the directory above this script; the quoting keeps paths with
# spaces intact, and the script aborts if the directory cannot be entered so
# that java is never started with "." pointing at the wrong place.
cd "$(dirname "$0")/.." || exit 1
java -Xms300m -Xmx900m -classpath source:classes de.anomic.plasma.plasmaRankingCRProcess -genrci .
|
@ -1,2 +0,0 @@
|
||||
cd `dirname $0`/..
|
||||
java -classpath source:classes de.anomic.plasma.plasmaRankingCRFile -recycle . 168
|
@ -80,7 +80,7 @@ public class kelondroAttrSeq {
|
||||
this.structure = null;
|
||||
this.created = 0;
|
||||
this.name = "";
|
||||
this.entries = readPropFile(file);
|
||||
this.entries = readAttrFile(file);
|
||||
}
|
||||
|
||||
public kelondroAttrSeq(String name, String struct) {
|
||||
@ -95,14 +95,21 @@ public class kelondroAttrSeq {
|
||||
this.theLogger = newLogger;
|
||||
}
|
||||
|
||||
public void logWarning(String message) {
|
||||
public void logInfo(String message) {
|
||||
if (this.theLogger == null)
|
||||
System.err.println("KELONDRO WARNING for file " + this.file + ": " + message);
|
||||
System.err.println("ATTRSEQ INFO for file " + this.file + ": " + message);
|
||||
else
|
||||
this.theLogger.warning("KELONDRO WARNING for file " + this.file + ": " + message);
|
||||
this.theLogger.info("ATTRSEQ INFO for file " + this.file + ": " + message);
|
||||
}
|
||||
|
||||
private TreeMap readPropFile(File file) throws IOException {
|
||||
public void logWarning(String message) {
|
||||
if (this.theLogger == null)
|
||||
System.err.println("ATTRSEQ WARNING for file " + this.file + ": " + message);
|
||||
else
|
||||
this.theLogger.warning("ATTRSEQ WARNING for file " + this.file + ": " + message);
|
||||
}
|
||||
|
||||
private TreeMap readAttrFile(File file) throws IOException {
|
||||
TreeMap entries = new TreeMap();
|
||||
BufferedReader br = null;
|
||||
int p;
|
||||
@ -173,6 +180,10 @@ public class kelondroAttrSeq {
|
||||
return entries.keySet().iterator();
|
||||
}
|
||||
|
||||
public Entry newEntry(String pivot) {
|
||||
return new Entry(pivot, new HashMap(), new TreeSet());
|
||||
}
|
||||
|
||||
public Entry newEntry(String pivot, HashMap props, TreeSet seq) {
|
||||
return new Entry(pivot, props, seq);
|
||||
}
|
||||
@ -334,6 +345,10 @@ public class kelondroAttrSeq {
|
||||
this.seq = seq;
|
||||
}
|
||||
|
||||
public void addSeq(String s) {
|
||||
this.seq.add(s);
|
||||
}
|
||||
|
||||
public String toString() {
|
||||
// creates only the attribute field and the sequence, not the pivot
|
||||
StringBuffer sb = new StringBuffer(70);
|
||||
@ -377,28 +392,4 @@ public class kelondroAttrSeq {
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
Class-A File format:
|
||||
|
||||
UDate : latest update timestamp of the URL (as virtual date, hours since epoch)
|
||||
VDate : last visit timestamp of the URL (as virtual date, hours since epoch)
|
||||
LCount : count of links to local resources
|
||||
GCount : count of links to global resources
|
||||
ICount : count of links to images (in document)
|
||||
DCount : count of links to other documents
|
||||
TLength: length of the plain text content (bytes)
|
||||
WACount: total number of all words in content
|
||||
WUCount: number of unique words in content (removed doubles)
|
||||
Flags : Flags (0=update, 1=popularity, 2=attention, 3=vote)
|
||||
|
||||
Class-a File format is an extension of Class-A plus the following attributes
|
||||
FUDate : first update timestamp of the URL
|
||||
FDDate : first update timestamp of the domain
|
||||
LUDate : latest update timestamp of the URL
|
||||
UCount : Update Counter (of 'latest update timestamp')
|
||||
PCount : Popularity Counter (proxy clicks)
|
||||
ACount : Attention Counter (search result clicks)
|
||||
VCount : Votes
|
||||
Vita : Vitality (normed number of updates per time)
|
||||
*/
|
||||
}
|
||||
|
@ -1,4 +1,4 @@
|
||||
// plasmaCRFile.java
|
||||
// plasmaCRProcess.java
|
||||
// -----------------------
|
||||
// part of YaCy
|
||||
// (C) by Michael Peter Christen; mc@anomic.de
|
||||
@ -54,7 +54,7 @@ import de.anomic.server.serverCodings;
|
||||
import de.anomic.server.serverFileUtils;
|
||||
import de.anomic.tools.bitfield;
|
||||
|
||||
public class plasmaRankingCRFile {
|
||||
public class plasmaRankingCRProcess {
|
||||
|
||||
/*
|
||||
header.append("# Name=YaCy " + ((type.equals("crl")) ? "Local" : "Global") + " Citation Reference Ticket"); header.append((char) 13); header.append((char) 10);
|
||||
@ -202,8 +202,52 @@ public class plasmaRankingCRFile {
|
||||
|
||||
}
|
||||
|
||||
public static long crFileCreated(File f) throws IOException {
|
||||
return (new kelondroAttrSeq(f)).created();
|
||||
public static void genrci(File cr_in, File rci_out) throws IOException {
|
||||
if (!(cr_in.exists())) return;
|
||||
kelondroAttrSeq cr = new kelondroAttrSeq(cr_in);
|
||||
kelondroAttrSeq rci;
|
||||
if (!(rci_out.exists())) {
|
||||
rci = new kelondroAttrSeq("Global Ranking Reverse Citation Index",
|
||||
"<AnchorDom-6>,'='," +
|
||||
"<UDate-3>," +
|
||||
"'|',*<Referee-12>");
|
||||
rci.toFile(rci_out);
|
||||
}
|
||||
rci = new kelondroAttrSeq(rci_out);
|
||||
|
||||
// loop over all referees
|
||||
Iterator i = cr.keys();
|
||||
String referee, anchor, anchorDom;
|
||||
kelondroAttrSeq.Entry cr_entry, rci_entry;
|
||||
long cr_UDate, rci_UDate;
|
||||
while (i.hasNext()) {
|
||||
referee = (String) i.next();
|
||||
cr_entry = cr.getEntry(referee);
|
||||
cr_UDate = cr_entry.getAttr("UDate", 0);
|
||||
|
||||
// loop over all anchors
|
||||
Iterator j = cr_entry.getSeq().iterator();
|
||||
while (j.hasNext()) {
|
||||
// get domain of anchors
|
||||
anchor = (String) j.next();
|
||||
if (anchor.length() == 6) anchorDom = anchor; else anchorDom = anchor.substring(6);
|
||||
|
||||
// update domain-specific entry
|
||||
rci_entry = rci.removeEntry(anchorDom);
|
||||
if (rci_entry == null) rci_entry = rci.newEntry(anchorDom);
|
||||
rci_entry.addSeq(referee);
|
||||
|
||||
// update Update-Date
|
||||
rci_UDate = rci_entry.getAttr("UDate", 0);
|
||||
if (cr_UDate > rci_UDate) rci_entry.setAttr("UDate", cr_UDate);
|
||||
|
||||
// insert entry
|
||||
rci.addEntry(rci_entry);
|
||||
}
|
||||
}
|
||||
|
||||
// finished. write to file
|
||||
rci.toFile(rci_out);
|
||||
}
|
||||
|
||||
public static void main(String[] args) {
|
||||
@ -243,7 +287,7 @@ public class plasmaRankingCRFile {
|
||||
for (int i = 0; i < list.length; i++) {
|
||||
f = new File(acc_dir, list[i]);
|
||||
try {
|
||||
d = (System.currentTimeMillis() - crFileCreated(f)) / 3600000;
|
||||
d = (System.currentTimeMillis() - (new kelondroAttrSeq(f)).created()) / 3600000;
|
||||
if (d > max_age_hours) {
|
||||
// file is considered to be too old, it is not recycled
|
||||
System.out.println("file " + f.getName() + " is old (" + d + " hours) and not recycled, only moved to backup");
|
||||
@ -261,6 +305,13 @@ public class plasmaRankingCRFile {
|
||||
}
|
||||
}
|
||||
}
|
||||
if ((args.length == 2) && (args[0].equals("-genrci"))) {
|
||||
File root_path = new File(args[1]);
|
||||
File cr_file = new File(root_path, "DATA/RANKING/GLOBAL/020_con0/CRG-a-acc.cr.gz");
|
||||
File rci_file = new File(root_path, "DATA/RANKING/GLOBAL/030_rci0/RCI-0.rci.gz");
|
||||
rci_file.getParentFile().mkdirs();
|
||||
genrci(cr_file, rci_file);
|
||||
}
|
||||
} catch (IOException e) {
|
||||
e.printStackTrace();
|
||||
}
|
Reference in New Issue
Block a user