mirror of
https://github.com/yacy/yacy_search_server.git
synced 2025-07-23 09:24:39 -04:00
refactoring of search / preparation for better search methods
git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@921 6c8d7289-2bf4-0310-a012-ef5d649a1542
This commit is contained in:
@ -54,6 +54,7 @@ import java.util.HashSet;
|
||||
import java.util.HashMap;
|
||||
import java.util.Iterator;
|
||||
import java.util.TreeMap;
|
||||
|
||||
import de.anomic.htmlFilter.htmlFilterContentScraper;
|
||||
import de.anomic.http.httpHeader;
|
||||
import de.anomic.plasma.plasmaCrawlLURL;
|
||||
@ -150,9 +151,7 @@ public class IndexControl_p {
|
||||
// generate an urlx array
|
||||
plasmaWordIndexEntity index = null;
|
||||
try {
|
||||
HashSet keyhashes = new HashSet();
|
||||
keyhashes.add(keyhash);
|
||||
index = switchboard.searchManager.searchHashes(keyhashes, 10000);
|
||||
index = switchboard.wordIndex.getEntity(keyhash, true);
|
||||
Enumeration en = index.elements(true);
|
||||
int i = 0;
|
||||
urlx = new String[index.size()];
|
||||
@ -437,9 +436,7 @@ public class IndexControl_p {
|
||||
// search for a word hash and generate a list of url links
|
||||
plasmaWordIndexEntity index = null;
|
||||
try {
|
||||
final HashSet keyhashes = new HashSet();
|
||||
keyhashes.add(keyhash);
|
||||
index = switchboard.searchManager.searchHashes(keyhashes, 10000);
|
||||
index = switchboard.wordIndex.getEntity(keyhash, true);
|
||||
|
||||
final StringBuffer result = new StringBuffer(1024);
|
||||
if (index.size() == 0) {
|
||||
|
@ -139,8 +139,8 @@ public class index {
|
||||
(yacyCore.seedDB.mySeed != null) &&
|
||||
(yacyCore.seedDB.mySeed.getAddress() != null));
|
||||
|
||||
final String order1 = (order.equals("Quality-Date")) ? "quality" : "date";
|
||||
final String order2 = (order.equals("Quality-Date")) ? "date" : "quality";
|
||||
final String order1 = (order.equals("Quality-Date")) ? plasmaSearchQuery.ORDER_QUALITY : plasmaSearchQuery.ORDER_DATE;
|
||||
final String order2 = (order.equals("Quality-Date")) ? plasmaSearchQuery.ORDER_DATE : plasmaSearchQuery.ORDER_QUALITY;
|
||||
String urlmask = "";
|
||||
if (post.containsKey("urlmask") && post.get("urlmask").equals("no")) {
|
||||
urlmask = ".*";
|
||||
@ -149,7 +149,7 @@ public class index {
|
||||
}
|
||||
|
||||
// do the search
|
||||
plasmaSearchQuery thisSearch = new plasmaSearchQuery(query, referer, new String[]{order1, order2}, count, searchtime, urlmask,
|
||||
plasmaSearchQuery thisSearch = new plasmaSearchQuery(query, new String[]{order1, order2}, count, searchtime, urlmask, referer,
|
||||
((global) && (yacyonline) && (!(env.getConfig("last-search","").equals(querystring)))) ? plasmaSearchQuery.SEARCHDOM_GLOBALDHT : plasmaSearchQuery.SEARCHDOM_LOCAL,
|
||||
"", 20);
|
||||
final serverObjects prop = sb.searchFromLocal(thisSearch);
|
||||
|
@ -51,6 +51,7 @@ import java.util.HashSet;
|
||||
import de.anomic.http.httpHeader;
|
||||
import de.anomic.plasma.plasmaSwitchboard;
|
||||
import de.anomic.plasma.plasmaWordIndexEntry;
|
||||
import de.anomic.plasma.plasmaSearchQuery;
|
||||
import de.anomic.server.serverObjects;
|
||||
import de.anomic.server.serverSwitch;
|
||||
import de.anomic.yacy.yacyCore;
|
||||
@ -89,7 +90,11 @@ public final class search {
|
||||
keyhashes.add(query.substring(i * plasmaWordIndexEntry.wordHashLength, (i + 1) * plasmaWordIndexEntry.wordHashLength));
|
||||
}
|
||||
final long timestamp = System.currentTimeMillis();
|
||||
prop = sb.searchFromRemote(keyhashes, count, global, duetime);
|
||||
|
||||
plasmaSearchQuery squery = new plasmaSearchQuery(keyhashes, new String[]{plasmaSearchQuery.ORDER_QUALITY, plasmaSearchQuery.ORDER_DATE},
|
||||
count, duetime, ".*");
|
||||
|
||||
prop = sb.searchFromRemote(squery);
|
||||
prop.put("searchtime", Long.toString(System.currentTimeMillis() - timestamp));
|
||||
|
||||
final int links = Integer.parseInt(prop.get("linkcount","0"));
|
||||
|
@ -113,89 +113,19 @@ public final class plasmaSearch {
|
||||
//System.out.println("DEBUG: plasmaSearch.addPageIndex: added " + condenser.getWords().size() + " words, flushed " + c + " entries");
|
||||
return condenser.getWords().size();
|
||||
}
|
||||
|
||||
/*
|
||||
public plasmaWordIndexEntity searchWords(Set words, long time) throws IOException {
|
||||
// search for the set of words and return an array of urlEntry elements
|
||||
return searchHashes(plasmaSearchQuery.words2hashes(words), time);
|
||||
}
|
||||
|
||||
}
|
||||
*/
|
||||
/*
|
||||
public plasmaWordIndexEntity searchHashes(Set hashes, long time) throws IOException {
|
||||
// search for the set of hashes and return an array of urlEntry elements
|
||||
|
||||
long stamp = System.currentTimeMillis();
|
||||
TreeMap map = new TreeMap();
|
||||
String singleHash;
|
||||
plasmaWordIndexEntity singleResult;
|
||||
Iterator i = hashes.iterator();
|
||||
while (i.hasNext()) {
|
||||
// get next hash:
|
||||
singleHash = (String) i.next();
|
||||
|
||||
// retrieve index
|
||||
singleResult = wordIndex.getEntity(singleHash, true);
|
||||
|
||||
// check result
|
||||
if ((singleResult == null) || (singleResult.size() == 0)) return new plasmaWordIndexEntity(null); // as this is a cunjunction of searches, we have no result if any word is not known
|
||||
|
||||
// store result in order of result size
|
||||
map.put(serverCodings.enhancedCoder.encodeHex(singleResult.size(), 8) + singleHash, singleResult);
|
||||
}
|
||||
|
||||
// check if there is any result
|
||||
if (map.size() == 0) return new plasmaWordIndexEntity(null); // no result, nothing found
|
||||
|
||||
// the map now holds the search results in order of number of hits per word
|
||||
// we now must pairwise build up a conjunction of these sets
|
||||
String k = (String) map.firstKey(); // the smallest, which means, the one with the least entries
|
||||
plasmaWordIndexEntity searchA, searchB, searchResult = (plasmaWordIndexEntity) map.remove(k);
|
||||
while ((map.size() > 0) && (searchResult.size() > 0) && (time > 0)) {
|
||||
// take the first element of map which is a result and combine it with result
|
||||
k = (String) map.firstKey(); // the next smallest...
|
||||
time -= (System.currentTimeMillis() - stamp); stamp = System.currentTimeMillis();
|
||||
searchA = searchResult;
|
||||
searchB = (plasmaWordIndexEntity) map.remove(k);
|
||||
searchResult = plasmaWordIndexEntity.joinConstructive(searchA, searchB, 2 * time / (map.size() + 1));
|
||||
// close the input files/structures
|
||||
if (searchA != searchResult) searchA.close();
|
||||
if (searchB != searchResult) searchB.close();
|
||||
}
|
||||
searchA = null; // free resources
|
||||
searchB = null; // free resources
|
||||
|
||||
// in 'searchResult' is now the combined search result
|
||||
if (searchResult.size() == 0) return new plasmaWordIndexEntity(null);
|
||||
return searchResult;
|
||||
}
|
||||
|
||||
*/
|
||||
/*
|
||||
public plasmaSearchResult order(plasmaWordIndexEntity searchResult, Set searchhashes, Set stopwords, char[] priority, long maxTime, int minEntries) throws IOException {
|
||||
// we collect the urlhashes from it and construct a List with urlEntry objects
|
||||
// attention: if minEntries is too high, this method will not terminate within the maxTime
|
||||
|
||||
plasmaSearchResult acc = new plasmaSearchResult(searchhashes, stopwords, priority);
|
||||
if (searchResult == null) return acc; // strange case where searchResult is not proper: acc is then empty
|
||||
if (searchResult.size() == 0) return acc; // case that we have nothing to do
|
||||
|
||||
Enumeration e = searchResult.elements(true);
|
||||
plasmaWordIndexEntry entry;
|
||||
long startCreateTime = System.currentTimeMillis();
|
||||
plasmaCrawlLURL.Entry page;
|
||||
try {
|
||||
while (e.hasMoreElements()) {
|
||||
if ((acc.sizeFetched() >= minEntries) &&
|
||||
(System.currentTimeMillis() - startCreateTime >= maxTime)) break;
|
||||
entry = (plasmaWordIndexEntry) e.nextElement();
|
||||
// find the url entry
|
||||
page = urlStore.getEntry(entry.getUrlHash());
|
||||
// add a result
|
||||
acc.addResult(entry, page);
|
||||
}
|
||||
} catch (kelondroException ee) {
|
||||
serverLog.logSevere("PLASMA", "Database Failure during plasmaSearch.order: " + ee.getMessage(), ee);
|
||||
}
|
||||
long startSortTime = System.currentTimeMillis();
|
||||
acc.sortResults();
|
||||
serverLog.logFine("PLASMA", "plasmaSearch.order: minEntries = " + minEntries + ", effectiveEntries = " + acc.sizeOrdered() + ", demanded Time = " + maxTime + ", effectiveTime = " + (System.currentTimeMillis() - startCreateTime) + ", createTime = " + (startSortTime - startCreateTime) + ", sortTime = " + (System.currentTimeMillis() - startSortTime));
|
||||
return acc;
|
||||
}
|
||||
|
||||
*/
|
||||
}
|
||||
|
@ -43,13 +43,98 @@
|
||||
package de.anomic.plasma;
|
||||
|
||||
import java.util.Iterator;
|
||||
import java.util.Set;
|
||||
import java.util.HashSet;
|
||||
import java.util.TreeMap;
|
||||
import java.util.Enumeration;
|
||||
import java.io.IOException;
|
||||
|
||||
import de.anomic.kelondro.kelondroException;
|
||||
import de.anomic.server.logging.serverLog;
|
||||
import de.anomic.server.serverCodings;
|
||||
|
||||
public final class plasmaSearchEvent {
|
||||
|
||||
private serverLog log;
|
||||
private plasmaSearchQuery query;
|
||||
|
||||
public plasmaSearchEvent(plasmaSearchQuery query) {
|
||||
private plasmaWordIndex wordIndex;
|
||||
private plasmaCrawlLURL urlStore;
|
||||
private plasmaSnippetCache snippetCache;
|
||||
|
||||
public plasmaSearchEvent(plasmaSearchQuery query, serverLog log, plasmaWordIndex wordIndex, plasmaCrawlLURL urlStore, plasmaSnippetCache snippetCache) {
|
||||
this.log = log;
|
||||
this.wordIndex = wordIndex;
|
||||
this.query = query;
|
||||
this.urlStore = urlStore;
|
||||
this.snippetCache = snippetCache;
|
||||
}
|
||||
|
||||
public plasmaWordIndexEntity search(long time) throws IOException {
|
||||
// search for the set of hashes and return an array of urlEntry elements
|
||||
|
||||
long stamp = System.currentTimeMillis();
|
||||
|
||||
// retrieve entities that belong to the hashes
|
||||
Set entities = wordIndex.getEntities(query.queryHashes, true, true);
|
||||
|
||||
// since this is a conjunction we return an empty entity if any word is not known
|
||||
if (entities == null) return new plasmaWordIndexEntity(null);
|
||||
|
||||
// join the result
|
||||
return plasmaWordIndexEntity.joinEntities(entities, time - (System.currentTimeMillis() - stamp));
|
||||
}
|
||||
|
||||
public plasmaSearchResult order(plasmaWordIndexEntity searchResult, long maxTime, int minEntries) throws IOException {
|
||||
// we collect the urlhashes from it and construct a List with urlEntry objects
|
||||
// attention: if minEntries is too high, this method will not terminate within the maxTime
|
||||
|
||||
plasmaSearchResult acc = new plasmaSearchResult(query);
|
||||
if (searchResult == null) return acc; // strange case where searchResult is not proper: acc is then empty
|
||||
if (searchResult.size() == 0) return acc; // case that we have nothing to do
|
||||
|
||||
Enumeration e = searchResult.elements(true);
|
||||
plasmaWordIndexEntry entry;
|
||||
long startCreateTime = System.currentTimeMillis();
|
||||
plasmaCrawlLURL.Entry page;
|
||||
try {
|
||||
while (e.hasMoreElements()) {
|
||||
if ((acc.sizeFetched() >= minEntries) &&
|
||||
(System.currentTimeMillis() - startCreateTime >= maxTime)) break;
|
||||
entry = (plasmaWordIndexEntry) e.nextElement();
|
||||
// find the url entry
|
||||
page = urlStore.getEntry(entry.getUrlHash());
|
||||
// add a result
|
||||
acc.addResult(entry, page);
|
||||
}
|
||||
} catch (kelondroException ee) {
|
||||
serverLog.logSevere("PLASMA", "Database Failure during plasmaSearch.order: " + ee.getMessage(), ee);
|
||||
}
|
||||
long startSortTime = System.currentTimeMillis();
|
||||
acc.sortResults();
|
||||
serverLog.logFine("PLASMA", "plasmaSearch.order: minEntries = " + minEntries + ", effectiveEntries = " + acc.sizeOrdered() + ", demanded Time = " + maxTime + ", effectiveTime = " + (System.currentTimeMillis() - startCreateTime) + ", createTime = " + (startSortTime - startCreateTime) + ", sortTime = " + (System.currentTimeMillis() - startSortTime));
|
||||
return acc;
|
||||
}
|
||||
|
||||
/*
|
||||
public void preSearch() {
|
||||
plasmaWordIndexEntity idx = null;
|
||||
try {
|
||||
// search the database locally
|
||||
log.logFine("presearch: started job");
|
||||
idx = searchHashes(query.queryHashes, time);
|
||||
log.logFine("presearch: found " + idx.size() + " results");
|
||||
plasmaSearchResult acc = order(idx, queryhashes, order, time, searchcount);
|
||||
if (acc == null) return;
|
||||
log.logFine("presearch: ordered results, now " + acc.sizeOrdered() + " URLs ready for fetch");
|
||||
|
||||
// take some elements and fetch the snippets
|
||||
snippetCache.fetch(acc, queryhashes, urlmask, fetchcount);
|
||||
} catch (IOException e) {
|
||||
log.logSevere("presearch: failed", e);
|
||||
} finally {
|
||||
if (idx != null) try { idx.close(); } catch (Exception e){}
|
||||
}
|
||||
log.logFine("presearch: job terminated");
|
||||
}
|
||||
*/
|
||||
}
|
||||
|
@ -52,6 +52,9 @@ import de.anomic.server.serverByteBuffer;
|
||||
|
||||
public final class plasmaSearchQuery {
|
||||
|
||||
public static final String ORDER_QUALITY = "quality";
|
||||
public static final String ORDER_DATE = "date";
|
||||
|
||||
public static final int SEARCHDOM_LOCAL = 0;
|
||||
public static final int SEARCHDOM_GROUPDHT = 1;
|
||||
public static final int SEARCHDOM_GROUPALL = 2;
|
||||
@ -69,21 +72,35 @@ public final class plasmaSearchQuery {
|
||||
public String domGroupName;
|
||||
public int domMaxTargets;
|
||||
|
||||
public plasmaSearchQuery(Set queryWords, String referrer,
|
||||
public plasmaSearchQuery(Set queryWords,
|
||||
String[] order, int wantedResults, long maximumTime, String urlMask,
|
||||
String referrer,
|
||||
int domType, String domGroupName, int domMaxTargets) {
|
||||
this.queryWords = queryWords;
|
||||
this.queryHashes = words2hashes(queryWords);
|
||||
this.referrer = referrer;
|
||||
this.order = order;
|
||||
this.wantedResults = wantedResults;
|
||||
this.maximumTime = maximumTime;
|
||||
this.urlMask = urlMask;
|
||||
this.referrer = referrer;
|
||||
this.domType = domType;
|
||||
this.domGroupName = domGroupName;
|
||||
this.domMaxTargets = domMaxTargets;
|
||||
}
|
||||
|
||||
/**
 * Creates a query from word hashes that have already been computed.
 *
 * No plain query words are available in this form, so queryWords is
 * null. No referrer is passed in either, so referrer is null as well.
 * The domain fields are set to -1 / null.
 * NOTE(review): -1 / null presumably mean "no search domain" - confirm
 * against the code that reads domType.
 *
 * @param queryHashes   the word hashes to search for
 * @param order         the ordering criteria, e.g. ORDER_QUALITY and ORDER_DATE
 * @param wantedResults the number of results the caller wants
 * @param maximumTime   the time budget for the search
 * @param urlMask       the url mask to apply to results
 */
public plasmaSearchQuery(Set queryHashes,
                         String[] order, int wantedResults, long maximumTime, String urlMask) {
    this.queryWords = null;
    this.queryHashes = queryHashes;
    this.order = order;
    this.wantedResults = wantedResults;
    this.maximumTime = maximumTime;
    this.urlMask = urlMask;
    // this constructor has no referrer parameter; the former
    // "this.referrer = referrer" only assigned the field to itself
    this.referrer = null;
    this.domType = -1;
    this.domGroupName = null;
    this.domMaxTargets = -1;
}
|
||||
|
||||
public static Set words2hashes(String[] words) {
|
||||
TreeSet hashes = new TreeSet();
|
||||
@ -117,4 +134,13 @@ public final class plasmaSearchQuery {
|
||||
return query;
|
||||
}
|
||||
|
||||
public void filterOut(Set blueList) {
|
||||
// filter out words that appear in this set
|
||||
Iterator it = queryWords.iterator();
|
||||
String word;
|
||||
while (it.hasNext()) {
|
||||
word = (String) it.next();
|
||||
if (blueList.contains(word)) it.remove();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -54,29 +54,23 @@ import de.anomic.server.serverCodings;
|
||||
|
||||
public final class plasmaSearchResult {
|
||||
|
||||
public static final char O_QUALITY = 'q';
|
||||
public static final char O_AGE = 'a';
|
||||
public static final String splitrex = " |/|\\(|\\)|-|\\:|_|\\.|,|\\?|!|'|" + '"';
|
||||
|
||||
private TreeMap pageAcc; // key = order hash; value = plasmaLURL.entry
|
||||
private kelondroMScoreCluster ref; // reference score computation for the commonSense heuristic
|
||||
private Set searchhashes; // hashes that are searched here
|
||||
private Set stopwords; // words that are excluded from the commonSense heuristic
|
||||
private char[] order; // order of heuristics
|
||||
private ArrayList results; // this is a buffer for plasmaWordIndexEntry + plasmaCrawlLURL.entry - objects
|
||||
private plasmaSearchQuery query;
|
||||
|
||||
public plasmaSearchResult(Set searchhashes, Set stopwords, char[] order) {
|
||||
public plasmaSearchResult(plasmaSearchQuery query) {
|
||||
this.pageAcc = new TreeMap();
|
||||
ref = new kelondroMScoreCluster();
|
||||
this.searchhashes = searchhashes;
|
||||
this.stopwords = stopwords;
|
||||
this.order = order;
|
||||
this.ref = new kelondroMScoreCluster();
|
||||
this.results = new ArrayList();
|
||||
this.query = query;
|
||||
}
|
||||
|
||||
public plasmaSearchResult cloneSmart() {
|
||||
// clones only the top structure
|
||||
plasmaSearchResult theClone = new plasmaSearchResult(this.searchhashes, this.stopwords, this.order);
|
||||
plasmaSearchResult theClone = new plasmaSearchResult(query);
|
||||
theClone.pageAcc = (TreeMap) this.pageAcc.clone();
|
||||
theClone.ref = this.ref;
|
||||
theClone.results = this.results;
|
||||
@ -149,10 +143,10 @@ public final class plasmaSearchResult {
|
||||
|
||||
// apply pre-calculated order attributes
|
||||
ranking = 0;
|
||||
if (order[0] == O_QUALITY) ranking = 4096 * indexEntry.getQuality();
|
||||
else if (order[0] == O_AGE) ranking = 4096 * indexEntry.getVirtualAge();
|
||||
if (order[1] == O_QUALITY) ranking += indexEntry.getQuality();
|
||||
else if (order[1] == O_AGE) ranking += indexEntry.getVirtualAge();
|
||||
if (query.order[0].equals(plasmaSearchQuery.ORDER_QUALITY)) ranking = 4096 * indexEntry.getQuality();
|
||||
else if (query.order[0].equals(plasmaSearchQuery.ORDER_DATE)) ranking = 4096 * indexEntry.getVirtualAge();
|
||||
if (query.order[1].equals(plasmaSearchQuery.ORDER_QUALITY)) ranking += indexEntry.getQuality();
|
||||
else if (query.order[1].equals(plasmaSearchQuery.ORDER_DATE)) ranking += indexEntry.getVirtualAge();
|
||||
|
||||
// apply 'common-sense' heuristic using references
|
||||
for (int j = 0; j < urlcomps.length; j++) if (commonSense.contains(urlcomps[j])) ranking += inc;
|
||||
@ -161,7 +155,7 @@ public final class plasmaSearchResult {
|
||||
// apply query-in-result matching
|
||||
Set urlcomph = plasmaSearchQuery.words2hashes(urlcomps);
|
||||
Set descrcomph = plasmaSearchQuery.words2hashes(descrcomps);
|
||||
Iterator shi = searchhashes.iterator();
|
||||
Iterator shi = query.queryHashes.iterator();
|
||||
while (shi.hasNext()) {
|
||||
queryhash = (String) shi.next();
|
||||
if (urlcomph.contains(queryhash)) ranking += 10 * inc;
|
||||
@ -187,9 +181,8 @@ public final class plasmaSearchResult {
|
||||
for (int i = 0; i < words.length; i++) {
|
||||
word = words[i].toLowerCase();
|
||||
if ((word.length() > 2) &&
|
||||
(!(stopwords.contains(word))) &&
|
||||
("http_html_php_ftp_www_com_org_net_gov_edu_index_home_page_for_usage_the_and_".indexOf(word) < 0) &&
|
||||
(!(searchhashes.contains(plasmaWordIndexEntry.word2hash(word)))))
|
||||
("http_html_php_ftp_www_com_org_net_gov_edu_index_home_page_for_usage_the_and_".indexOf(word) < 0) &&
|
||||
(!(query.queryHashes.contains(plasmaWordIndexEntry.word2hash(word)))))
|
||||
ref.incScore(word);
|
||||
}
|
||||
}
|
||||
|
@ -1394,13 +1394,14 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
|
||||
if (date == null) return ""; else return DateFormatter.format(date);
|
||||
}
|
||||
|
||||
/*
|
||||
public class presearch extends Thread {
|
||||
Set queryhashes;
|
||||
char[] order;
|
||||
String urlmask;
|
||||
long time;
|
||||
int searchcount, fetchcount;
|
||||
public presearch(Set queryhashes, char[] order, long time /*milliseconds*/, String urlmask, int searchcount, int fetchcount) {
|
||||
public presearch(Set queryhashes, char[] order, long time, String urlmask, int searchcount, int fetchcount) {
|
||||
this.queryhashes = queryhashes;
|
||||
this.order = order;
|
||||
this.urlmask = urlmask;
|
||||
@ -1430,38 +1431,34 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
|
||||
}
|
||||
}
|
||||
|
||||
*/
|
||||
|
||||
//public serverObjects searchFromLocal(Set querywords, String order1, String order2, int count, boolean global, long time /*milliseconds*/, String urlmask) {
|
||||
public serverObjects searchFromLocal(plasmaSearchQuery query) {
|
||||
|
||||
// tell all threads to do nothing for a specific time
|
||||
wordIndex.intermission(query.maximumTime);
|
||||
intermissionAllThreads(query.maximumTime);
|
||||
wordIndex.intermission(2 * query.maximumTime);
|
||||
intermissionAllThreads(2 * query.maximumTime);
|
||||
|
||||
serverObjects prop = new serverObjects();
|
||||
try {
|
||||
char[] order = new char[2];
|
||||
if (query.order[0].equals("quality")) order[0] = plasmaSearchResult.O_QUALITY; else order[0] = plasmaSearchResult.O_AGE;
|
||||
if (query.order[1].equals("quality")) order[1] = plasmaSearchResult.O_QUALITY; else order[1] = plasmaSearchResult.O_AGE;
|
||||
//char[] order = new char[2];
|
||||
//if (query.order[0].equals("quality")) order[0] = plasmaSearchResult.O_QUALITY; else order[0] = plasmaSearchResult.O_AGE;
|
||||
//if (query.order[1].equals("quality")) order[1] = plasmaSearchResult.O_QUALITY; else order[1] = plasmaSearchResult.O_AGE;
|
||||
|
||||
// filter out words that appear in bluelist
|
||||
Iterator it = query.queryWords.iterator();
|
||||
String word, gs = "";
|
||||
while (it.hasNext()) {
|
||||
word = (String) it.next();
|
||||
if (blueList.contains(word)) it.remove(); else gs += "+" + word;
|
||||
}
|
||||
if (gs.length() > 0) gs = gs.substring(1);
|
||||
query.filterOut(blueList);
|
||||
|
||||
// log
|
||||
log.logInfo("INIT WORD SEARCH: " + gs + ":" + query.queryHashes + " - " + query.wantedResults + " links, " + (query.maximumTime / 1000) + " seconds");
|
||||
log.logInfo("INIT WORD SEARCH: " + query.queryWords + ":" + query.queryHashes + " - " + query.wantedResults + " links, " + (query.maximumTime / 1000) + " seconds");
|
||||
long timestamp = System.currentTimeMillis();
|
||||
|
||||
// start a presearch, which makes only sense if we idle afterwards.
|
||||
// this is especially the case if we start a global search and idle until search
|
||||
if (query.domType == plasmaSearchQuery.SEARCHDOM_GLOBALDHT) {
|
||||
Thread preselect = new presearch(query.queryHashes, order, query.maximumTime / 10, query.urlMask, 10, 3);
|
||||
preselect.start();
|
||||
}
|
||||
//if (query.domType == plasmaSearchQuery.SEARCHDOM_GLOBALDHT) {
|
||||
// Thread preselect = new presearch(query.queryHashes, order, query.maximumTime / 10, query.urlMask, 10, 3);
|
||||
// preselect.start();
|
||||
//}
|
||||
|
||||
// do global fetching
|
||||
int globalresults = 0;
|
||||
@ -1479,13 +1476,14 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
|
||||
|
||||
// now search locally (the global results should be now in the local db)
|
||||
long remainingTime = query.maximumTime - (System.currentTimeMillis() - timestamp);
|
||||
plasmaWordIndexEntity idx = searchManager.searchHashes(query.queryHashes, remainingTime * 8 / 10); // the search
|
||||
plasmaSearchEvent theSearch = new plasmaSearchEvent(query, log, wordIndex, urlPool.loadedURL, snippetCache);
|
||||
plasmaWordIndexEntity idx = theSearch.search(remainingTime * 8 / 10);
|
||||
log.logFine("SEARCH TIME AFTER FINDING " + idx.size() + " ELEMENTS: " + ((System.currentTimeMillis() - timestamp) / 1000) + " seconds");
|
||||
|
||||
remainingTime = query.maximumTime - (System.currentTimeMillis() - timestamp);
|
||||
if (remainingTime < 500) remainingTime = 500;
|
||||
if (remainingTime > 3000) remainingTime = 3000;
|
||||
plasmaSearchResult acc = searchManager.order(idx, query.queryHashes, stopwords, order, remainingTime, 10);
|
||||
plasmaSearchResult acc = theSearch.order(idx, remainingTime, 10);
|
||||
if (query.domType != plasmaSearchQuery.SEARCHDOM_GLOBALDHT)
|
||||
snippetCache.fetch(acc.cloneSmart(), query.queryHashes, query.urlMask, 10);
|
||||
log.logFine("SEARCH TIME AFTER ORDERING OF SEARCH RESULT: " + ((System.currentTimeMillis() - timestamp) / 1000) + " seconds");
|
||||
@ -1595,7 +1593,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
|
||||
}
|
||||
|
||||
// log
|
||||
log.logInfo("EXIT WORD SEARCH: " + gs + " - " +
|
||||
log.logInfo("EXIT WORD SEARCH: " + query.queryWords + " - " +
|
||||
prop.get("totalcount", "0") + " links found, " +
|
||||
prop.get("orderedcount", "0") + " links ordered, " +
|
||||
prop.get("linkcount", "?") + " links selected, " +
|
||||
@ -1607,21 +1605,21 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
|
||||
}
|
||||
}
|
||||
|
||||
public serverObjects searchFromRemote(Set hashes, int count, boolean global, long duetime) {
|
||||
public serverObjects searchFromRemote(plasmaSearchQuery query) {
|
||||
|
||||
// tell all threads to do nothing for a specific time
|
||||
wordIndex.intermission(duetime);
|
||||
intermissionAllThreads(duetime);
|
||||
wordIndex.intermission(2 * query.maximumTime);
|
||||
intermissionAllThreads(2 * query.maximumTime);
|
||||
|
||||
if (hashes == null) hashes = new HashSet();
|
||||
serverObjects prop = new serverObjects();
|
||||
try {
|
||||
log.logInfo("INIT HASH SEARCH: " + hashes + " - " + count + " links");
|
||||
log.logInfo("INIT HASH SEARCH: " + query.queryHashes + " - " + query.wantedResults + " links");
|
||||
long timestamp = System.currentTimeMillis();
|
||||
plasmaWordIndexEntity idx = searchManager.searchHashes(hashes, duetime * 8 / 10); // a nameless temporary index, not sorted by special order but by hash
|
||||
long remainingTime = duetime - (System.currentTimeMillis() - timestamp);
|
||||
plasmaSearchEvent theSearch = new plasmaSearchEvent(query, log, wordIndex, urlPool.loadedURL, snippetCache);
|
||||
plasmaWordIndexEntity idx = theSearch.search(query.maximumTime * 8 / 10);
|
||||
long remainingTime = query.maximumTime - (System.currentTimeMillis() - timestamp);
|
||||
if (remainingTime < 500) remainingTime = 500;
|
||||
plasmaSearchResult acc = searchManager.order(idx, hashes, stopwords, new char[]{plasmaSearchResult.O_QUALITY, plasmaSearchResult.O_AGE}, remainingTime, 10);
|
||||
plasmaSearchResult acc = theSearch.order(idx, remainingTime, 10);
|
||||
|
||||
// result is a List of urlEntry elements
|
||||
if (acc == null) {
|
||||
@ -1636,9 +1634,9 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
|
||||
//plasmaIndexEntry pie;
|
||||
plasmaCrawlLURL.Entry urlentry;
|
||||
plasmaSnippetCache.result snippet;
|
||||
while ((acc.hasMoreElements()) && (i < count)) {
|
||||
while ((acc.hasMoreElements()) && (i < query.wantedResults)) {
|
||||
urlentry = acc.nextElement();
|
||||
snippet = snippetCache.retrieve(urlentry.url(), hashes, false, 260);
|
||||
snippet = snippetCache.retrieve(urlentry.url(), query.queryHashes, false, 260);
|
||||
if (snippet.source == plasmaSnippetCache.ERROR_NO_MATCH) {
|
||||
// suppress line: there is no match in that resource
|
||||
} else {
|
||||
@ -1669,7 +1667,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
|
||||
prop.put("fwrec", ""); // peers that would have helped to construct this result (recommendations)
|
||||
|
||||
// log
|
||||
log.logInfo("EXIT HASH SEARCH: " + hashes + " - " +
|
||||
log.logInfo("EXIT HASH SEARCH: " + query.queryHashes + " - " +
|
||||
((idx == null) ? "0" : (""+idx.size())) + " links found, " +
|
||||
prop.get("linkcount", "?") + " links selected, " +
|
||||
((System.currentTimeMillis() - timestamp) / 1000) + " seconds");
|
||||
|
@ -53,6 +53,8 @@ import java.util.ArrayList;
|
||||
import java.util.Comparator;
|
||||
import java.util.Iterator;
|
||||
import java.util.TreeSet;
|
||||
import java.util.HashSet;
|
||||
import java.util.Set;
|
||||
|
||||
import de.anomic.kelondro.kelondroMSetTools;
|
||||
import de.anomic.server.logging.serverLog;
|
||||
@ -105,6 +107,28 @@ public final class plasmaWordIndex {
|
||||
return ramCache.getIndex(wordHash, deleteIfEmpty);
|
||||
}
|
||||
|
||||
public Set getEntities(Set wordHashes, boolean deleteIfEmpty, boolean interruptIfEmpty) {
|
||||
|
||||
// retrieve entities that belong to the hashes
|
||||
HashSet entities = new HashSet();
|
||||
String singleHash;
|
||||
plasmaWordIndexEntity singleEntity;
|
||||
Iterator i = wordHashes.iterator();
|
||||
while (i.hasNext()) {
|
||||
// get next hash:
|
||||
singleHash = (String) i.next();
|
||||
|
||||
// retrieve index
|
||||
singleEntity = getEntity(singleHash, true);
|
||||
|
||||
// check result
|
||||
if (((singleEntity == null) || (singleEntity.size() == 0)) && (interruptIfEmpty)) return null;
|
||||
|
||||
entities.add(singleEntity);
|
||||
}
|
||||
return entities;
|
||||
}
|
||||
|
||||
public int size() {
|
||||
return ramCache.size();
|
||||
}
|
||||
|
@ -46,6 +46,7 @@ import java.io.IOException;
|
||||
import java.util.Enumeration;
|
||||
import java.util.Iterator;
|
||||
import java.util.TreeMap;
|
||||
import java.util.Set;
|
||||
|
||||
import de.anomic.kelondro.kelondroRecords;
|
||||
import de.anomic.kelondro.kelondroTree;
|
||||
@ -293,6 +294,54 @@ public final class plasmaWordIndexEntity {
|
||||
return l;
|
||||
}
|
||||
|
||||
public static plasmaWordIndexEntity joinEntities(Set entities, long time) throws IOException {
|
||||
|
||||
long stamp = System.currentTimeMillis();
|
||||
|
||||
// order entities by their size
|
||||
TreeMap map = new TreeMap();
|
||||
plasmaWordIndexEntity singleEntity;
|
||||
Iterator i = entities.iterator();
|
||||
int count = 0;
|
||||
while (i.hasNext()) {
|
||||
// get next entity:
|
||||
singleEntity = (plasmaWordIndexEntity) i.next();
|
||||
|
||||
// check result
|
||||
if ((singleEntity == null) || (singleEntity.size() == 0)) return new plasmaWordIndexEntity(null); // as this is a cunjunction of searches, we have no result if any word is not known
|
||||
|
||||
// store result in order of result size
|
||||
map.put(new Long(singleEntity.size() * 1000 + count), singleEntity);
|
||||
count++;
|
||||
}
|
||||
|
||||
// check if there is any result
|
||||
if (map.size() == 0) return new plasmaWordIndexEntity(null); // no result, nothing found
|
||||
|
||||
// the map now holds the search results in order of number of hits per word
|
||||
// we now must pairwise build up a conjunction of these sets
|
||||
Long k = (Long) map.firstKey(); // the smallest, which means, the one with the least entries
|
||||
plasmaWordIndexEntity searchA, searchB, searchResult = (plasmaWordIndexEntity) map.remove(k);
|
||||
while ((map.size() > 0) && (searchResult.size() > 0) && (time > 0)) {
|
||||
// take the first element of map which is a result and combine it with result
|
||||
k = (Long) map.firstKey(); // the next smallest...
|
||||
time -= (System.currentTimeMillis() - stamp); stamp = System.currentTimeMillis();
|
||||
searchA = searchResult;
|
||||
searchB = (plasmaWordIndexEntity) map.remove(k);
|
||||
searchResult = plasmaWordIndexEntity.joinConstructive(searchA, searchB, 2 * time / (map.size() + 1));
|
||||
// close the input files/structures
|
||||
if (searchA != searchResult) searchA.close();
|
||||
if (searchB != searchResult) searchB.close();
|
||||
}
|
||||
searchA = null; // free resources
|
||||
searchB = null; // free resources
|
||||
|
||||
// in 'searchResult' is now the combined search result
|
||||
if (searchResult.size() == 0) return new plasmaWordIndexEntity(null);
|
||||
return searchResult;
|
||||
}
|
||||
|
||||
|
||||
public static plasmaWordIndexEntity joinConstructive(plasmaWordIndexEntity i1, plasmaWordIndexEntity i2, long time) throws IOException {
|
||||
if ((i1 == null) || (i2 == null)) return null;
|
||||
if ((i1.size() == 0) || (i2.size() == 0)) return new plasmaWordIndexEntity(null);
|
||||
|
@ -238,8 +238,10 @@ public abstract class serverAbstractThread extends Thread implements serverThrea
|
||||
|
||||
while (running) {
|
||||
if (this.intermission > 0) {
|
||||
if (this.intermission > System.currentTimeMillis()) {
|
||||
ratz(this.intermission - System.currentTimeMillis());
|
||||
long itime = this.intermission - System.currentTimeMillis();
|
||||
if (itime > 0) {
|
||||
logSystem("thread '" + this.getName() + "' breaks for intermission: " + (itime / 1000) + " seconds");
|
||||
ratz(itime);
|
||||
}
|
||||
this.intermission = 0;
|
||||
}
|
||||
|
Reference in New Issue
Block a user