mirror of
https://github.com/yacy/yacy_search_server.git
synced 2025-07-22 09:14:38 -04:00
removed lowercase of snippets (and other things):

- added new sentence parser to condenser
- sentence parsing can now handle charsets

to do: charsets must be handed over to new sentence parser

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@2712 6c8d7289-2bf4-0310-a012-ef5d649a1542
This commit is contained in:
htroot
source/de/anomic/plasma
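For orientation, a rough usage sketch (not part of the commit) of the changed API: callers now walk an Enumeration of sentence Strings instead of indexing a String[]. The class name SentenceDumpSketch and the charset parameter are placeholders; every caller touched by this commit still passes null, as the FIXME notes in the hunks below show.

    import java.util.Enumeration;
    import de.anomic.plasma.plasmaParserDocument;

    // Sketch only, not part of the commit: how a caller walks the new sentence API.
    // "charset" is a hypothetical value; the callers below still pass null.
    class SentenceDumpSketch {
        static void printSentences(plasmaParserDocument document, String charset) {
            Enumeration sentences = document.getSentences(charset);
            int i = 0;
            if (sentences != null) while (sentences.hasMoreElements()) {
                String sentence = (String) sentences.nextElement();
                System.out.println("line " + i + ": " + sentence);
                i++;
            }
        }
    }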
@@ -48,6 +48,7 @@
 import java.io.File;
 import java.io.Writer;
+import java.util.Enumeration;
 import java.util.Iterator;
 import java.util.Map;
 import java.util.TreeSet;
@@ -128,9 +129,9 @@ public class CacheAdmin_p {
                     .append("<b>EMAIL:</b><br>").append(formatAnchor(document.getEmaillinks())).append("<br>")
                     .append("<b>TEXT:</b><br><span class=\"small\">").append(new String(scraper.getText())).append("</span><br>")
                     .append("<b>LINES:</b><br><span class=\"small\">");
-                final String[] sentences = document.getSentences();
-                for (int i = 0; i < sentences.length; i++) {
-                    info.append(sentences[i]).append("<br>");
+                final Enumeration sentences = document.getSentences(null); // FIXME: apply correct charset
+                if (sentences != null) while (sentences.hasMoreElements()) {
+                    info.append((String) sentences.nextElement()).append("<br>");
                 }
                 info.append("</span><br>");
                 if (document != null) document.close();
@@ -49,6 +49,7 @@ import java.io.InputStream;
 import java.io.UnsupportedEncodingException;
 import java.net.URLDecoder;
 import java.net.URLEncoder;
+import java.util.Enumeration;

 import de.anomic.data.wikiCode;
 import de.anomic.http.httpHeader;
@@ -262,11 +263,12 @@ public class ViewFile {
             prop.put("viewMode_parsedText",content);
         } else {
             prop.put("viewMode",VIEW_MODE_AS_PARSED_SENTENCES);
-            String[] sentences = document.getSentences();
+            final Enumeration sentences = document.getSentences(null); // FIXME: apply correct charset

             boolean dark = true;
-            for (int i=0; i < sentences.length; i++) {
-                String currentSentence = wikiCode.replaceHTML(sentences[i]);
+            int i = 0;
+            if (sentences != null) while (sentences.hasMoreElements()) {
+                String currentSentence = wikiCode.replaceHTML((String) sentences.nextElement());

                 // Search word highlighting
                 String words = post.get("words",null);
@@ -286,8 +288,9 @@ public class ViewFile {
                 prop.put("viewMode_sentences_" + i + "_nr",Integer.toString(i+1));
                 prop.put("viewMode_sentences_" + i + "_text",currentSentence);
                 prop.put("viewMode_sentences_" + i + "_dark",((dark) ? 1 : 0) ); dark=!dark;
+                i++;
             }
-            prop.put("viewMode_sentences",sentences.length);
+            prop.put("viewMode_sentences", i);

         }
         if (document != null) document.close();
@@ -51,6 +51,8 @@ import java.io.FileWriter;
 import java.io.IOException;
 import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.io.Reader;
 import java.io.UnsupportedEncodingException;
 import java.util.Enumeration;
 import java.util.HashMap;
 import java.util.HashSet;
@@ -469,7 +471,7 @@ public final class plasmaCondenser {
     }

     protected final static boolean punctuation(char c) {
-        return ("!?.".indexOf(c) >= 0);
+        return (c == '.') || (c == '!') || (c == '?');
     }

     public final static boolean invisible(char c) {
@@ -648,7 +650,89 @@ public final class plasmaCondenser {
         return counter;
     }
     }

+    public static Enumeration sentencesFromInputStream(InputStream is, String charset) {
+        try {
+            return new sentencesFromInputStreamEnum(is, charset);
+        } catch (UnsupportedEncodingException e) {
+            return null;
+        }
+    }
+
+    private static class sentencesFromInputStreamEnum implements Enumeration {
+        // read sentences from a given input stream
+        // this enumerates String objects
+
+        Object buffer = null;
+        BufferedReader raf;
+        int counter = 0;
+
+        public sentencesFromInputStreamEnum(InputStream is, String charset) throws UnsupportedEncodingException {
+            raf = new BufferedReader((charset == null) ? new InputStreamReader(is) : new InputStreamReader(is, charset));
+            buffer = nextElement0();
+            counter = 0;
+        }
+
+        private Object nextElement0() {
+            try {
+                String s = readSentence(raf);
+                if (s == null) {
+                    raf.close();
+                    return null;
+                }
+                return s;
+            } catch (IOException e) {
+                try {
+                    raf.close();
+                } catch (Exception ee) {
+                }
+                return null;
+            }
+        }
+
+        public boolean hasMoreElements() {
+            return buffer != null;
+        }
+
+        public Object nextElement() {
+            if (buffer == null) {
+                return null;
+            } else {
+                counter = counter + ((String) buffer).length() + 1;
+                Object r = buffer;
+                buffer = nextElement0();
+                return r;
+            }
+        }
+
+        public int count() {
+            return counter;
+        }
+    }
+
+    static String readSentence(Reader reader) throws IOException {
+        StringBuffer s = new StringBuffer();
+        int nextChar;
+        char c;
+
+        // find sentence end
+        for (;;) {
+            nextChar = reader.read();
+            if (nextChar < 0) return null;
+            c = (char) nextChar;
+            s.append(c);
+            if (punctuation(c)) break;
+        }
+
+        // replace line endings and tabs by blanks
+        for (int i = 0; i < s.length(); i++) {
+            if ((s.charAt(i) == (char) 10) || (s.charAt(i) == (char) 13) || (s.charAt(i) == (char) 8)) s.setCharAt(i, ' ');
+        }
+        // remove all double-spaces
+        int p; while ((p = s.indexOf("  ")) >= 0) s.deleteCharAt(p);
+        return new String(s);
+
+    }
     /*
     private static void addLineSearchProp(Properties prop, String s, String[] searchwords, HashSet foundsearch) {
         // we store lines containing a key in search vector
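A minimal, hypothetical demo of the stream-based sentence enumerator added above. The class name SentenceEnumDemo, the sample text, and the UTF-8 charset are invented for illustration; only sentencesFromInputStream and its null-on-unsupported-charset behaviour come from the hunk.

    import java.io.ByteArrayInputStream;
    import java.util.Enumeration;
    import de.anomic.plasma.plasmaCondenser;

    public class SentenceEnumDemo {
        public static void main(String[] args) throws Exception {
            // invented sample input; any InputStream will do
            ByteArrayInputStream is =
                new ByteArrayInputStream("First sentence. Second one! A third?".getBytes("UTF-8"));
            // sentencesFromInputStream returns null when the named charset is not supported
            Enumeration sentences = plasmaCondenser.sentencesFromInputStream(is, "UTF-8");
            if (sentences != null) while (sentences.hasMoreElements()) {
                System.out.println((String) sentences.nextElement());
            }
        }
    }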
@@ -55,6 +55,7 @@ import java.io.InputStream;
 import java.net.MalformedURLException;
 import java.net.URI;
 import java.util.Arrays;
+import java.util.Enumeration;
 import java.util.HashMap;
 import java.util.HashSet;
 import java.util.Hashtable;
@@ -815,12 +816,12 @@ public final class plasmaParser {
             System.out.println(document.getMainLongTitle());

             // found text
-            String[] sentences = document.getSentences();
-            if (sentences != null) {
-                for (int i = 0; i < sentences.length; i++) {
+            final Enumeration sentences = document.getSentences(null); // FIXME: apply correct charset
+            int i = 0;
+            if (sentences != null) while (sentences.hasMoreElements()) {
                 System.out.print("line " + i + ": ");
-                System.out.println(sentences[i]);
-            }
+                System.out.println((String) sentences.nextElement());
+                i++;
             }

             // found links
@@ -50,6 +50,7 @@ import java.io.InputStream;
 import java.net.MalformedURLException;
 import de.anomic.server.serverFileUtils;

+import java.util.Enumeration;
 import java.util.HashMap;
 import java.util.Iterator;
 import java.util.Map;
@@ -189,13 +190,9 @@ public class plasmaParserDocument {
         return -1;
     }

-    public plasmaCondenser getCondenser() {
-        if (condenser == null) condenser = new plasmaCondenser(getText(), 0, 0);
-        return condenser;
-    }
-
-    public String[] getSentences() {
-        return getCondenser().sentences();
+    public Enumeration getSentences(String charset) {
+        if (this.text == null) return null;
+        return plasmaCondenser.sentencesFromInputStream(getText(), charset);
     }

     public String getKeywords(char separator) {
@@ -47,6 +47,7 @@ package de.anomic.plasma;
 import java.io.ByteArrayInputStream;
 import java.io.IOException;
 import java.io.InputStream;
 import java.util.ArrayList;
+import java.util.Enumeration;
 import java.util.HashMap;
 import java.util.HashSet;
@@ -249,10 +250,10 @@ public class plasmaSnippetCache {
         if (document == null) return new Snippet(null, ERROR_PARSER_FAILED, "parser error/failed"); // cannot be parsed

         //System.out.println("loaded document for URL " + url);
-        String[] sentences = document.getSentences();
+        final Enumeration sentences = document.getSentences(null); // FIXME: apply correct charset
         document.close();
         //System.out.println("----" + url.toString()); for (int l = 0; l < sentences.length; l++) System.out.println(sentences[l]);
-        if ((sentences == null) || (sentences.length == 0)) {
+        if (sentences == null) {
             //System.out.println("found no sentences in url " + url);
             return new Snippet(null, ERROR_PARSER_NO_LINES, "parser returned no sentences");
         }
@@ -357,26 +358,30 @@ public class plasmaSnippetCache {
         return (String) snippetsCache.get(key);
     }

-    private String computeSnippet(String[] sentences, Set queryhashes, int minLength, int maxLength) {
+    private String computeSnippet(Enumeration sentences, Set queryhashes, int minLength, int maxLength) {
         try {
-            if ((sentences == null) || (sentences.length == 0)) return null;
+            if (sentences == null) return null;
             if ((queryhashes == null) || (queryhashes.size() == 0)) return null;
             kelondroMScoreCluster hitTable = new kelondroMScoreCluster();
             Iterator j;
             HashMap hs;
             String hash;
-            for (int i = 0; i < sentences.length; i++) {
+            ArrayList sb = new ArrayList();
+            String sentence;
+            while (sentences.hasMoreElements()) {
+                sentence = (String) sentences.nextElement();
                 //System.out.println("Sentence " + i + ": " + sentences[i]);
-                if (sentences[i].length() > minLength) {
-                    hs = hashSentence(sentences[i]);
+                if (sentence.length() > minLength) {
+                    hs = hashSentence(sentence);
                     j = queryhashes.iterator();
                     while (j.hasNext()) {
                         hash = (String) j.next();
                         if (hs.containsKey(hash)) {
                             //System.out.println("hash " + hash + " appears in line " + i);
-                            hitTable.incScore(new Integer(i));
+                            hitTable.incScore(new Integer(sb.size()));
                         }
                     }
+                    sb.add(sentence);
                 }
             }
             int score = hitTable.getMaxScore(); // best number of hits
@@ -385,15 +390,14 @@ public class plasmaSnippetCache {
             // now find the shortest line of these hits
             int shortLineIndex = -1;
             int shortLineLength = Integer.MAX_VALUE;
-            for (int i = 0; i < sentences.length; i++) {
-                if ((hitTable.getScore(new Integer(i)) == score) &&
-                    (sentences[i].length() < shortLineLength)) {
+            for (int i = 0; i < sb.size(); i++) {
+                if ((hitTable.getScore(new Integer(i)) == score) && (((String) sb.get(i)).length() < shortLineLength)) {
                     shortLineIndex = i;
-                    shortLineLength = sentences[i].length();
+                    shortLineLength = ((String) sb.get(i)).length();
                 }
             }
             // find a first result
-            String result = sentences[shortLineIndex];
+            String result = (String) sb.get(shortLineIndex);
             // remove all hashes that appear in the result
             hs = hashSentence(result);
             j = queryhashes.iterator();