Added cleaning of the kanji meaning data and checked uniqueness of the word set
This commit is contained in:
parent
dd044a93ab
commit
b67da1803d
|
@ -111,7 +111,27 @@ public class KanjiDict {
|
||||||
}
|
}
|
||||||
} else if ("meaning".equals(qName)) {
|
} else if ("meaning".equals(qName)) {
|
||||||
if (mLang == null) {
|
if (mLang == null) {
|
||||||
literal.meaning.add(bufChar.toString());
|
String mean = bufChar.toString();
|
||||||
|
if (mean.startsWith("(")) {
|
||||||
|
mean = mean.replaceAll("\\(", "");
|
||||||
|
mean = mean.replaceAll("\\)", "");
|
||||||
|
}
|
||||||
|
mean = mean.replaceAll("\\(.*\\)", "");
|
||||||
|
mean = mean.replaceAll("\\?", "");
|
||||||
|
mean = mean.replaceAll("\\!", "");
|
||||||
|
mean = mean.replaceAll("\\-", "");
|
||||||
|
mean = mean.split(",")[0];
|
||||||
|
mean = mean.split("\'")[0];
|
||||||
|
mean = mean.split("&")[0];
|
||||||
|
mean = mean.trim();
|
||||||
|
for (String rm : new String[] {"*",".", "/"}) {
|
||||||
|
if (mean.contains(rm)) {
|
||||||
|
mean = "";
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (!mean.isEmpty()) {
|
||||||
|
literal.meaning.add(mean);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
mLang = null;
|
mLang = null;
|
||||||
}
|
}
|
||||||
|
|
|
@ -22,8 +22,8 @@
|
||||||
*/
|
*/
|
||||||
package org.x4o.fc18.cake2.fcdoc.kanji;
|
package org.x4o.fc18.cake2.fcdoc.kanji;
|
||||||
|
|
||||||
import java.util.HashSet;
|
import java.util.ArrayList;
|
||||||
import java.util.Set;
|
import java.util.List;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Kanji KuTen and FourCorner and codePoint and meaning of a literal.
|
* Kanji KuTen and FourCorner and codePoint and meaning of a literal.
|
||||||
|
@ -37,5 +37,5 @@ public class KanjiDictLiteral {
|
||||||
String kuTen213;
|
String kuTen213;
|
||||||
Integer fourCorner;
|
Integer fourCorner;
|
||||||
Integer codePoint;
|
Integer codePoint;
|
||||||
final Set<String> meaning = new HashSet<>();
|
final List<String> meaning = new ArrayList<>();
|
||||||
}
|
}
|
||||||
|
|
|
@ -25,6 +25,8 @@ package org.x4o.fc18.cake2.fcdoc.kanji;
|
||||||
import java.io.File;
|
import java.io.File;
|
||||||
import java.io.FileInputStream;
|
import java.io.FileInputStream;
|
||||||
import java.io.InputStream;
|
import java.io.InputStream;
|
||||||
|
import java.util.HashSet;
|
||||||
|
import java.util.Set;
|
||||||
import java.util.stream.Collectors;
|
import java.util.stream.Collectors;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -46,18 +48,39 @@ public class KanjiDictTest {
|
||||||
dict.parseXML(in);
|
dict.parseXML(in);
|
||||||
}
|
}
|
||||||
System.out.println("=======================================");
|
System.out.println("=======================================");
|
||||||
for (String key : dict.mapKuTen208.keySet()) {
|
|
||||||
KanjiDictLiteral lit = dict.mapKuTen208.get(key);
|
|
||||||
int meanCnt = lit.meaning.size();
|
|
||||||
String meaning = lit.meaning.stream().collect(Collectors.joining(" "));
|
|
||||||
System.out.println(String.format("%06d - %s - %d = %d %s", lit.codePoint, lit.kuTen208, lit.fourCorner, meanCnt, meaning));
|
|
||||||
}
|
|
||||||
System.out.println("=======================================");
|
|
||||||
for (Integer key : dict.mapFourCorner.keySet()) {
|
for (Integer key : dict.mapFourCorner.keySet()) {
|
||||||
KanjiDictLiteral lit = dict.mapFourCorner.get(key);
|
KanjiDictLiteral lit = dict.mapFourCorner.get(key);
|
||||||
int meanCnt = lit.meaning.size();
|
int meanCnt = lit.meaning.size();
|
||||||
String meaning = lit.meaning.stream().collect(Collectors.joining(" "));
|
String meaning = lit.meaning.stream().map(v -> v.replaceAll(" ", "_").toUpperCase()).collect(Collectors.joining(" "));
|
||||||
System.out.println(String.format("%06d - %s - %d = %d %s", lit.codePoint, lit.kuTen208, lit.fourCorner, meanCnt, meaning));
|
System.out.print(new StringBuilder().appendCodePoint(lit.codePoint).toString());
|
||||||
|
System.out.println(String.format(" %s FC %5d = %2d %s", lit.kuTen208, lit.fourCorner, meanCnt, meaning));
|
||||||
|
}
|
||||||
|
|
||||||
|
int meanEmpty = 0;
|
||||||
|
int meanFail = 0;
|
||||||
|
int meanUse2 = 0;
|
||||||
|
Set<String> uniqMean = new HashSet<>();
|
||||||
|
for (Integer key : dict.mapFourCorner.keySet()) {
|
||||||
|
KanjiDictLiteral lit = dict.mapFourCorner.get(key);
|
||||||
|
if (lit.meaning.isEmpty()) {
|
||||||
|
meanEmpty++;
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
String meanFirst = lit.meaning.get(0);
|
||||||
|
if (uniqMean.contains(meanFirst)) {
|
||||||
|
if (lit.meaning.size() > 1) {
|
||||||
|
String meanSecond = lit.meaning.get(1);
|
||||||
|
if (uniqMean.contains(meanSecond)) {
|
||||||
|
meanFail++;
|
||||||
|
} else {
|
||||||
|
meanUse2++;
|
||||||
|
uniqMean.add(meanSecond);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
meanFail++;
|
||||||
|
} else {
|
||||||
|
uniqMean.add(meanFirst);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// JIS X 0208 prescribes a set of 6879 graphical characters
|
// JIS X 0208 prescribes a set of 6879 graphical characters
|
||||||
|
@ -66,9 +89,20 @@ public class KanjiDictTest {
|
||||||
System.out.println("mapFourCorner: " + dict.mapFourCorner.size());
|
System.out.println("mapFourCorner: " + dict.mapFourCorner.size());
|
||||||
System.out.println("mapKuTen208: " + dict.mapKuTen208.size());
|
System.out.println("mapKuTen208: " + dict.mapKuTen208.size());
|
||||||
System.out.println("mapKuTen213: " + dict.mapKuTen213.size());
|
System.out.println("mapKuTen213: " + dict.mapKuTen213.size());
|
||||||
|
System.out.println("=======================================");
|
||||||
|
System.out.println("meanUniq: " + uniqMean.size());
|
||||||
|
System.out.println("meanEmpty: " + meanEmpty);
|
||||||
|
System.out.println("meanFail: " + meanFail);
|
||||||
|
System.out.println("meanUse2: " + meanUse2);
|
||||||
// 15642334 bytes or 538402 lines gives;
|
// 15642334 bytes or 538402 lines gives;
|
||||||
|
// =======================================
|
||||||
// mapFourCorner: 3936
|
// mapFourCorner: 3936
|
||||||
// mapKuTen208: 6355
|
// mapKuTen208: 6355
|
||||||
// mapKuTen213: 3695
|
// mapKuTen213: 3695
|
||||||
|
// =======================================
|
||||||
|
// meanUniq: 3332
|
||||||
|
// meanEmpty: 1
|
||||||
|
// meanFail: 1228
|
||||||
|
// meanUse2: 405
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in a new issue