mirror of
https://github.com/yacy/yacy_search_server.git
synced 2025-05-15 22:29:34 -04:00
Check if the character is a minus sign and is followed by a letter or a
digit. Treat it as part of the word/number.
This commit is contained in:
parent
5db97a8928
commit
0689f4f0ae
source/net/yacy/document
@ -27,11 +27,6 @@ package net.yacy.document;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
import java.util.SortedMap;
|
||||
import java.util.TreeMap;
|
||||
|
||||
import net.yacy.cora.order.Base64Order;
|
||||
import net.yacy.kelondro.data.word.Word;
|
||||
|
||||
/**
|
||||
* Read sentences from a given text.
|
||||
@ -127,15 +122,28 @@ public class SentenceReader implements Iterator<StringBuilder>, Iterable<StringB
|
||||
|| type == Character.MODIFIER_LETTER
|
||||
|| type == Character.OTHER_LETTER
|
||||
|| type == Character.TITLECASE_LETTER
|
||||
|| punctuation(c));
|
||||
|| punctuation(c) || digitsep(c));
|
||||
}
|
||||
|
||||
public final static boolean punctuation(final char c) {
|
||||
return c == '.' || c == '!' || c == '?';
|
||||
switch (c) {
|
||||
case '.':
|
||||
case '!':
|
||||
case '?':
|
||||
return true;
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
public final static boolean digitsep(final char c) {
|
||||
return c == '.' || c == ',';
|
||||
switch (c) {
|
||||
case '.':
|
||||
case ',':
|
||||
return true;
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
@ -178,7 +186,7 @@ public class SentenceReader implements Iterator<StringBuilder>, Iterable<StringB
|
||||
}
|
||||
|
||||
public static void main(String[] args) {
|
||||
String s = "a b 1.5 ccc 4,7 d. so o et, qu. 4.7Ohm 2.54inch.";
|
||||
String s = "a b 1.5 ccc -4,7 d. so -o et, qu. 4.7Ohm 2.54inch.";
|
||||
SentenceReader sr = new SentenceReader(s);
|
||||
for (StringBuilder a: sr) System.out.println(a);
|
||||
sr = new SentenceReader(s);
|
||||
|
@ -155,6 +155,12 @@ public class WordTokenizer implements Enumeration<StringBuilder> {
|
||||
for (int i = 0; i < r.length(); i++) { // tokenize one sentence
|
||||
c = r.charAt(i);
|
||||
|
||||
// Check if the character is a minus sign and is followed by a letter or a digit. Treat it as part of the word/number.
|
||||
if (c == '-' && i < r.length() - 1 && (Character.isLetter(r.charAt(i + 1)) || Character.isDigit(r.charAt(i + 1)))) {
|
||||
sb.append(c);
|
||||
continue; // Skip further checks and continue to the next character.
|
||||
}
|
||||
|
||||
// Check if the current character is a digit separator within a number.
|
||||
if (SentenceReader.digitsep(c) && i > 0 && Character.isDigit(r.charAt(i - 1)) && (i < r.length() - 1) && Character.isDigit(r.charAt(i + 1))) {
|
||||
sb.append(c); // Add the digit separator to the current token.
|
||||
|
Loading…
x
Reference in New Issue
Block a user