mirror of
https://github.com/yacy/yacy_search_server.git
synced 2025-07-22 09:14:38 -04:00
fixed all possible problems with nullpointer exception for LURLs
git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@2513 6c8d7289-2bf4-0310-a012-ef5d649a1542
This commit is contained in:
htroot
source/de/anomic
@ -46,7 +46,6 @@
|
||||
// if the shell's current path is HTROOT
|
||||
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
import java.net.MalformedURLException;
|
||||
import java.util.HashSet;
|
||||
import java.util.Iterator;
|
||||
@ -135,8 +134,8 @@ public class Bookmarks {
|
||||
bookmarksDB.Bookmark bookmark = switchboard.bookmarksDB.getBookmark(urlHash);
|
||||
if (bookmark == null) {
|
||||
// try to get the bookmark from the LURL database
|
||||
try {
|
||||
plasmaCrawlLURL.Entry urlentry = switchboard.urlPool.loadedURL.load(urlHash, null);
|
||||
plasmaCrawlLURL.Entry urlentry = switchboard.urlPool.loadedURL.load(urlHash, null);
|
||||
if (urlentry != null) {
|
||||
prop.put("mode_edit", 0); // create mode
|
||||
if (urlentry != null) {
|
||||
prop.put("mode_title", urlentry.descr());
|
||||
@ -145,8 +144,6 @@ public class Bookmarks {
|
||||
}
|
||||
prop.put("mode_tags", "");
|
||||
prop.put("mode_public", 0);
|
||||
} catch (IOException e) {
|
||||
e.printStackTrace();
|
||||
}
|
||||
} else {
|
||||
// get from the bookmark database
|
||||
|
@ -55,7 +55,6 @@ import java.util.TreeSet;
|
||||
import de.anomic.htmlFilter.htmlFilterContentScraper;
|
||||
import de.anomic.htmlFilter.htmlFilterOutputStream;
|
||||
import de.anomic.http.httpHeader;
|
||||
import de.anomic.index.indexURL;
|
||||
import de.anomic.plasma.plasmaHTCache;
|
||||
import de.anomic.plasma.plasmaParserDocument;
|
||||
import de.anomic.plasma.plasmaSwitchboard;
|
||||
|
@ -211,8 +211,10 @@ public class IndexControl_p {
|
||||
}
|
||||
|
||||
if (post.containsKey("urlhashdelete")) {
|
||||
try {
|
||||
plasmaCrawlLURL.Entry entry = switchboard.urlPool.loadedURL.load(urlhash, null);
|
||||
plasmaCrawlLURL.Entry entry = switchboard.urlPool.loadedURL.load(urlhash, null);
|
||||
if (entry == null) {
|
||||
prop.put("result", "No Entry for URL hash " + urlhash + "; nothing deleted.");
|
||||
} else {
|
||||
if (entry != null) {
|
||||
URL url = entry.url();
|
||||
urlstring = url.toNormalform();
|
||||
@ -222,8 +224,6 @@ public class IndexControl_p {
|
||||
} else {
|
||||
prop.put("result", "No Entry for URL hash " + urlhash + "; nothing deleted.");
|
||||
}
|
||||
} catch (IOException e) {
|
||||
prop.put("result", "No Entry for URL hash " + urlhash + "; nothing deleted.");
|
||||
}
|
||||
}
|
||||
|
||||
@ -265,16 +265,12 @@ public class IndexControl_p {
|
||||
plasmaCrawlLURL.Entry lurl;
|
||||
while (urlIter.hasNext()) {
|
||||
iEntry = (indexEntry) urlIter.next();
|
||||
try {
|
||||
lurl = switchboard.urlPool.loadedURL.load(iEntry.urlHash(), null);
|
||||
if ((lurl == null)||(lurl.toString() == null)) {
|
||||
unknownURLEntries.add(iEntry.urlHash());
|
||||
urlIter.remove();
|
||||
} else {
|
||||
knownURLs.put(iEntry.urlHash(), lurl);
|
||||
}
|
||||
} catch (IOException e) {
|
||||
lurl = switchboard.urlPool.loadedURL.load(iEntry.urlHash(), null);
|
||||
if (lurl.toString() == null) {
|
||||
unknownURLEntries.add(iEntry.urlHash());
|
||||
urlIter.remove();
|
||||
} else {
|
||||
knownURLs.put(iEntry.urlHash(), lurl);
|
||||
}
|
||||
}
|
||||
// use whats remaining
|
||||
@ -313,22 +309,26 @@ public class IndexControl_p {
|
||||
if (post.containsKey("urlstringsearch")) {
|
||||
try {
|
||||
URL url = new URL(urlstring);
|
||||
urlhash = indexURL.urlHash(url);
|
||||
prop.put("urlhash", urlhash);
|
||||
urlhash = indexURL.urlHash(url);
|
||||
prop.put("urlhash", urlhash);
|
||||
plasmaCrawlLURL.Entry entry = switchboard.urlPool.loadedURL.load(urlhash, null);
|
||||
prop.put("result", genUrlProfile(switchboard, entry, urlhash));
|
||||
if (entry == null) {
|
||||
prop.put("urlstring", "unknown url: " + urlstring);
|
||||
prop.put("urlhash", "");
|
||||
} else {
|
||||
prop.put("result", genUrlProfile(switchboard, entry, urlhash));
|
||||
}
|
||||
} catch (MalformedURLException e) {
|
||||
prop.put("urlstring", "bad url: " + urlstring);
|
||||
prop.put("urlhash", "");
|
||||
} catch (IOException e) {
|
||||
prop.put("urlstring", "unknown url: " + urlstring);
|
||||
prop.put("urlhash", "");
|
||||
}
|
||||
}
|
||||
|
||||
if (post.containsKey("urlhashsearch")) {
|
||||
try {
|
||||
plasmaCrawlLURL.Entry entry = switchboard.urlPool.loadedURL.load(urlhash, null);
|
||||
plasmaCrawlLURL.Entry entry = switchboard.urlPool.loadedURL.load(urlhash, null);
|
||||
if (entry == null) {
|
||||
prop.put("result", "No Entry for URL hash " + urlhash);
|
||||
} else {
|
||||
if (entry != null) {
|
||||
URL url = entry.url();
|
||||
urlstring = url.toString();
|
||||
@ -337,8 +337,6 @@ public class IndexControl_p {
|
||||
} else {
|
||||
prop.put("result", "No Entry for URL hash " + urlhash);
|
||||
}
|
||||
} catch (IOException e) {
|
||||
prop.put("result", "No Entry for URL hash " + urlhash);
|
||||
}
|
||||
}
|
||||
|
||||
@ -394,15 +392,11 @@ public class IndexControl_p {
|
||||
if (entry == null) { return "No entry found for URL-hash " + urlhash; }
|
||||
URL url = entry.url();
|
||||
String referrer = null;
|
||||
try {
|
||||
plasmaCrawlLURL.Entry referrerEntry = switchboard.urlPool.loadedURL.load(entry.referrerHash(), null);
|
||||
if (referrerEntry != null) {
|
||||
referrer = referrerEntry.url().toString();
|
||||
} else {
|
||||
referrer = "<unknown>";
|
||||
}
|
||||
} catch (IOException e) {
|
||||
plasmaCrawlLURL.Entry le = switchboard.urlPool.loadedURL.load(entry.referrerHash(), null);
|
||||
if (le == null) {
|
||||
referrer = "<unknown>";
|
||||
} else {
|
||||
referrer = le.url().toString();
|
||||
}
|
||||
if (url == null) { return "No entry found for URL-hash " + urlhash; }
|
||||
String result = "<table>" +
|
||||
@ -456,16 +450,13 @@ public class IndexControl_p {
|
||||
while (en.hasNext()) {
|
||||
xi = (indexEntry) en.next();
|
||||
uh = new String[]{xi.urlHash(), Integer.toString(xi.posintext())};
|
||||
try {
|
||||
plasmaCrawlLURL.Entry entry = switchboard.urlPool.loadedURL.load(uh[0], null);
|
||||
if (entry != null) {
|
||||
us = entry.url().toString();
|
||||
tm.put(us, uh);
|
||||
} else {
|
||||
tm.put(uh[0], uh);
|
||||
}
|
||||
} catch (IOException e) {
|
||||
plasmaCrawlLURL.Entry le = switchboard.urlPool.loadedURL.load(uh[0], null);
|
||||
if (le == null) {
|
||||
tm.put(uh[0], uh);
|
||||
} else {
|
||||
us = le.url().toString();
|
||||
tm.put(us, uh);
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -106,9 +106,8 @@ public class ViewFile {
|
||||
|
||||
// getting the urlEntry that belongs to the url hash
|
||||
Entry urlEntry = null;
|
||||
try {
|
||||
urlEntry = sb.urlPool.loadedURL.load(urlHash, null);
|
||||
} catch (IOException e) {
|
||||
urlEntry = sb.urlPool.loadedURL.load(urlHash, null);
|
||||
if (urlEntry == null) {
|
||||
prop.put("error",2);
|
||||
prop.put("viewMode",VIEW_MODE_NO_TEXT);
|
||||
return prop;
|
||||
|
@ -45,7 +45,6 @@
|
||||
// You must compile this file with
|
||||
// javac -classpath .:../classes crawlOrder.java
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Date;
|
||||
import de.anomic.http.httpHeader;
|
||||
@ -249,8 +248,11 @@ public final class crawlOrder {
|
||||
// case where we have already the url loaded;
|
||||
reason = reasonString;
|
||||
// send lurl-Entry as response
|
||||
try {
|
||||
plasmaCrawlLURL.Entry entry = switchboard.urlPool.loadedURL.load(indexURL.urlHash(url), null);
|
||||
plasmaCrawlLURL.Entry entry = switchboard.urlPool.loadedURL.load(indexURL.urlHash(url), null);
|
||||
if (entry == null) {
|
||||
response = "rejected";
|
||||
lurl = "";
|
||||
} else {
|
||||
if (entry != null) {
|
||||
response = "double";
|
||||
switchboard.urlPool.loadedURL.notifyGCrawl(entry.hash(), iam, youare);
|
||||
@ -259,9 +261,6 @@ public final class crawlOrder {
|
||||
response = "rejected";
|
||||
lurl = "";
|
||||
}
|
||||
} catch (IOException e) {
|
||||
response = "rejected";
|
||||
lurl = "";
|
||||
}
|
||||
} else {
|
||||
response = "rejected";
|
||||
|
@ -92,7 +92,6 @@ import java.util.zip.GZIPOutputStream;
|
||||
import de.anomic.htmlFilter.htmlFilterContentTransformer;
|
||||
import de.anomic.htmlFilter.htmlFilterOutputStream;
|
||||
import de.anomic.htmlFilter.htmlFilterTransformer;
|
||||
import de.anomic.index.indexURL;
|
||||
import de.anomic.plasma.plasmaHTCache;
|
||||
import de.anomic.plasma.plasmaParser;
|
||||
import de.anomic.plasma.plasmaSwitchboard;
|
||||
|
@ -351,6 +351,11 @@ public class kelondroRow {
|
||||
case kelondroColumn.encoder_none:
|
||||
throw new kelondroException("ROW", "getColLong has celltype none, no encoder given");
|
||||
case kelondroColumn.encoder_b64e:
|
||||
// start - fix for badly stored parameters
|
||||
boolean maxvalue = true;
|
||||
for (int i = 0; i < length; i++) if (rowinstance[offset + i] != '_') {maxvalue = false; break;}
|
||||
if (maxvalue) return 0;
|
||||
// stop - fix for badly stored parameters
|
||||
return kelondroBase64Order.enhancedCoder.decodeLong(rowinstance, offset, length);
|
||||
case kelondroColumn.encoder_b256:
|
||||
return kelondroNaturalOrder.decodeLong(rowinstance, offset, length);
|
||||
|
@ -160,7 +160,7 @@ public final class plasmaCrawlLURL extends indexURL {
|
||||
gcrawlResultStack.add(urlHash + initiatorHash + executorHash);
|
||||
}
|
||||
|
||||
public Entry load(String urlHash, indexEntry searchedWord) throws IOException {
|
||||
public Entry load(String urlHash, indexEntry searchedWord) {
|
||||
// generates an plasmaLURLEntry using the url hash
|
||||
// to speed up the access, the url-hashes are buffered
|
||||
// in the hash cache.
|
||||
@ -169,9 +169,13 @@ public final class plasmaCrawlLURL extends indexURL {
|
||||
// - look into the filed properties
|
||||
// if the url cannot be found, this returns null
|
||||
kelondroRow.Entry entry = urlIndexCache.get(urlHash.getBytes());
|
||||
if (entry == null) entry = urlIndexFile.get(urlHash.getBytes());
|
||||
if (entry == null) return null;
|
||||
return new Entry(entry, searchedWord);
|
||||
try {
|
||||
if (entry == null) entry = urlIndexFile.get(urlHash.getBytes());
|
||||
if (entry == null) return null;
|
||||
return new Entry(entry, searchedWord);
|
||||
} catch (IOException e) {
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
public void store(Entry entry, boolean cached) throws IOException {
|
||||
|
@ -379,14 +379,10 @@ public final class plasmaCrawlStacker {
|
||||
String nexturlhash = indexURL.urlHash(nexturl);
|
||||
String dbocc = this.sb.urlPool.exists(nexturlhash);
|
||||
plasmaCrawlLURL.Entry oldEntry = null;
|
||||
if (dbocc != null) try {
|
||||
oldEntry = this.sb.urlPool.loadedURL.load(nexturlhash, null);
|
||||
} catch (IOException e) {}
|
||||
boolean recrawl = (oldEntry != null) &&
|
||||
(((System.currentTimeMillis() - oldEntry.loaddate().getTime()) / 60000) > profile.recrawlIfOlder());
|
||||
oldEntry = this.sb.urlPool.loadedURL.load(nexturlhash, null);
|
||||
boolean recrawl = (oldEntry != null) && (((System.currentTimeMillis() - oldEntry.loaddate().getTime()) / 60000) > profile.recrawlIfOlder());
|
||||
if ((dbocc != null) && (!(recrawl))) {
|
||||
reason = plasmaCrawlEURL.DOUBLE_REGISTERED + dbocc + ")";
|
||||
|
||||
this.log.logFine("URL '" + nexturlString + "' is double registered in '" + dbocc + "'. " +
|
||||
"Stack processing time: " + (System.currentTimeMillis()-startTime) + "ms");
|
||||
return reason;
|
||||
|
@ -227,22 +227,16 @@ public class plasmaDHTChunk {
|
||||
// iterate over indexes to fetch url entries and store them in the urlCache
|
||||
while ((urlIter.hasNext()) && (maxcount > refcount)) {
|
||||
iEntry = (indexEntry) urlIter.next();
|
||||
try {
|
||||
lurl = lurls.load(iEntry.urlHash(), iEntry);
|
||||
if ((lurl == null) || (lurl.url() == null)) {
|
||||
//yacyCore.log.logFine("DEBUG selectTransferContainersResource: not-bound url hash '" + iEntry.urlHash() + "' for word hash " + container.getWordHash());
|
||||
notBoundCounter++;
|
||||
urlIter.remove();
|
||||
wordIndex.removeEntry(container.getWordHash(), iEntry.urlHash(), true);
|
||||
} else {
|
||||
urlCache.put(iEntry.urlHash(), lurl);
|
||||
//yacyCore.log.logFine("DEBUG selectTransferContainersResource: added url hash '" + iEntry.urlHash() + "' to urlCache for word hash " + container.getWordHash());
|
||||
refcount++;
|
||||
}
|
||||
} catch (IOException e) {
|
||||
lurl = lurls.load(iEntry.urlHash(), iEntry);
|
||||
if ((lurl == null) || (lurl.url() == null)) {
|
||||
//yacyCore.log.logFine("DEBUG selectTransferContainersResource: not-bound url hash '" + iEntry.urlHash() + "' for word hash " + container.getWordHash());
|
||||
notBoundCounter++;
|
||||
urlIter.remove();
|
||||
wordIndex.removeEntry(container.getWordHash(), iEntry.urlHash(), true);
|
||||
} else {
|
||||
urlCache.put(iEntry.urlHash(), lurl);
|
||||
//yacyCore.log.logFine("DEBUG selectTransferContainersResource: added url hash '" + iEntry.urlHash() + "' to urlCache for word hash " + container.getWordHash());
|
||||
refcount++;
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -45,7 +45,6 @@ package de.anomic.plasma;
|
||||
import java.util.Iterator;
|
||||
import java.util.Set;
|
||||
import java.util.HashSet;
|
||||
import java.io.IOException;
|
||||
|
||||
import de.anomic.kelondro.kelondroException;
|
||||
import de.anomic.server.logging.serverLog;
|
||||
@ -242,13 +241,9 @@ public final class plasmaSearchEvent extends Thread implements Runnable {
|
||||
if (System.currentTimeMillis() >= postorderLimitTime) break;
|
||||
entry = preorder.next();
|
||||
// find the url entry
|
||||
try {
|
||||
page = urlStore.load(entry.urlHash(), entry);
|
||||
// add a result
|
||||
if (page != null) acc.addResult(entry, page);
|
||||
} catch (IOException e) {
|
||||
// result was not found
|
||||
}
|
||||
page = urlStore.load(entry.urlHash(), entry);
|
||||
// add a result
|
||||
if (page != null) acc.addResult(entry, page);
|
||||
}
|
||||
} catch (kelondroException ee) {
|
||||
serverLog.logSevere("PLASMA", "Database Failure during plasmaSearch.order: " + ee.getMessage(), ee);
|
||||
@ -298,13 +293,9 @@ public final class plasmaSearchEvent extends Thread implements Runnable {
|
||||
if (System.currentTimeMillis() >= postorderLimitTime) break;
|
||||
entry = preorder.next();
|
||||
// find the url entry
|
||||
try {
|
||||
page = urlStore.load(entry.urlHash(), entry);
|
||||
// add a result
|
||||
if (page != null) acc.addResult(entry, page);
|
||||
} catch (IOException e) {
|
||||
// result was not found
|
||||
}
|
||||
page = urlStore.load(entry.urlHash(), entry);
|
||||
// add a result
|
||||
if (page != null) acc.addResult(entry, page);
|
||||
}
|
||||
} catch (kelondroException ee) {
|
||||
serverLog.logSevere("PLASMA", "Database Failure during plasmaSearch.order: " + ee.getMessage(), ee);
|
||||
|
@ -45,7 +45,6 @@ package de.anomic.plasma;
|
||||
import java.io.IOException;
|
||||
import de.anomic.net.URL;
|
||||
import de.anomic.plasma.cache.IResourceInfo;
|
||||
import de.anomic.plasma.crawler.http.CrawlWorker;
|
||||
|
||||
import java.util.Enumeration;
|
||||
import java.util.HashMap;
|
||||
@ -53,7 +52,6 @@ import java.util.HashSet;
|
||||
import java.util.Iterator;
|
||||
import java.util.Set;
|
||||
|
||||
import de.anomic.http.httpHeader;
|
||||
import de.anomic.kelondro.kelondroMScoreCluster;
|
||||
import de.anomic.server.logging.serverLog;
|
||||
import de.anomic.yacy.yacySearch;
|
||||
|
@ -2157,25 +2157,18 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
|
||||
// finally, delete the url entry
|
||||
|
||||
// determine the url string
|
||||
try {
|
||||
plasmaCrawlLURL.Entry entry = urlPool.loadedURL.load(urlhash, null);
|
||||
if (entry == null)
|
||||
return 0;
|
||||
URL url = entry.url();
|
||||
if (url == null)
|
||||
return 0;
|
||||
// get set of words
|
||||
// Set words = plasmaCondenser.getWords(getText(getResource(url,
|
||||
// fetchOnline)));
|
||||
Iterator witer = plasmaCondenser.getWords(snippetCache.parseDocument(url, snippetCache.getResource(url, fetchOnline, 10000)).getText());
|
||||
// delete all word references
|
||||
int count = removeReferences(urlhash, witer);
|
||||
// finally delete the url entry itself
|
||||
urlPool.loadedURL.remove(urlhash);
|
||||
return count;
|
||||
} catch (IOException e) {
|
||||
return 0;
|
||||
}
|
||||
plasmaCrawlLURL.Entry entry = urlPool.loadedURL.load(urlhash, null);
|
||||
if (entry == null) return 0;
|
||||
URL url = entry.url();
|
||||
if (url == null) return 0;
|
||||
// get set of words
|
||||
// Set words = plasmaCondenser.getWords(getText(getResource(url, fetchOnline)));
|
||||
Iterator witer = plasmaCondenser.getWords(snippetCache.parseDocument(url, snippetCache.getResource(url, fetchOnline, 10000)).getText());
|
||||
// delete all word references
|
||||
int count = removeReferences(urlhash, witer);
|
||||
// finally delete the url entry itself
|
||||
urlPool.loadedURL.remove(urlhash);
|
||||
return count;
|
||||
}
|
||||
|
||||
public int removeReferences(URL url, Set words) {
|
||||
|
@ -328,12 +328,8 @@ public class plasmaSwitchboardQueue {
|
||||
public URL referrerURL() {
|
||||
if (referrerURL == null) {
|
||||
if ((referrerHash == null) || (referrerHash.equals(indexURL.dummyHash))) return null;
|
||||
try {
|
||||
plasmaCrawlLURL.Entry entry = lurls.load(referrerHash, null);
|
||||
if (entry == null) referrerURL = null; else referrerURL = entry.url();
|
||||
} catch (IOException e) {
|
||||
referrerURL = null;
|
||||
}
|
||||
plasmaCrawlLURL.Entry entry = lurls.load(referrerHash, null);
|
||||
if (entry == null) referrerURL = null; else referrerURL = entry.url();
|
||||
}
|
||||
return referrerURL;
|
||||
}
|
||||
|
@ -81,10 +81,8 @@ public class plasmaURLPool {
|
||||
if (urlhash.equals(indexURL.dummyHash)) return null;
|
||||
plasmaCrawlNURL.Entry ne = noticeURL.getEntry(urlhash);
|
||||
if (ne != null) return ne.url();
|
||||
try {
|
||||
plasmaCrawlLURL.Entry le = loadedURL.load(urlhash, null);
|
||||
if (le != null) return le.url();
|
||||
} catch (IOException e) {}
|
||||
plasmaCrawlLURL.Entry le = loadedURL.load(urlhash, null);
|
||||
if (le != null) return le.url();
|
||||
plasmaCrawlEURL.Entry ee = errorURL.getEntry(urlhash);
|
||||
if (ee != null) return ee.url();
|
||||
return null;
|
||||
|
@ -689,20 +689,15 @@ public final class plasmaWordIndex extends indexAbstractRI implements indexRI {
|
||||
while (containerIterator.hasNext() && run) {
|
||||
waiter();
|
||||
entry = (indexEntry) containerIterator.next();
|
||||
// System.out.println("Wordhash: "+wordHash+" UrlHash:
|
||||
// "+entry.getUrlHash());
|
||||
try {
|
||||
plasmaCrawlLURL.Entry lurlEntry = lurl.load(entry.urlHash(), null);
|
||||
if (lurlEntry != null) {
|
||||
url = lurlEntry.url();
|
||||
if ((url == null) || (plasmaSwitchboard.urlBlacklist.isListed(plasmaURLPattern.BLACKLIST_CRAWLER, url) == true)) {
|
||||
urlHashs.add(entry.urlHash());
|
||||
}
|
||||
} else {
|
||||
// System.out.println("Wordhash: "+wordHash+" UrlHash: "+entry.getUrlHash());
|
||||
plasmaCrawlLURL.Entry ue = lurl.load(entry.urlHash(), null);
|
||||
if (ue == null) {
|
||||
urlHashs.add(entry.urlHash());
|
||||
} else {
|
||||
url = ue.url();
|
||||
if ((url == null) || (plasmaSwitchboard.urlBlacklist.isListed(plasmaURLPattern.BLACKLIST_CRAWLER, url) == true)) {
|
||||
urlHashs.add(entry.urlHash());
|
||||
}
|
||||
} catch (IOException e) {
|
||||
urlHashs.add(entry.urlHash());
|
||||
}
|
||||
}
|
||||
if (urlHashs.size() > 0) {
|
||||
|
Reference in New Issue
Block a user