Added cleaning of meaning data of kanji and checked uniqueness of word set

This commit is contained in:
Willem Cazander 2025-01-24 12:46:31 +01:00
parent dd044a93ab
commit b67da1803d
3 changed files with 67 additions and 13 deletions

View file

@ -111,7 +111,27 @@ public class KanjiDict {
} }
} else if ("meaning".equals(qName)) { } else if ("meaning".equals(qName)) {
if (mLang == null) { if (mLang == null) {
literal.meaning.add(bufChar.toString()); String mean = bufChar.toString();
if (mean.startsWith("(")) {
mean = mean.replaceAll("\\(", "");
mean = mean.replaceAll("\\)", "");
}
mean = mean.replaceAll("\\(.*\\)", "");
mean = mean.replaceAll("\\?", "");
mean = mean.replaceAll("\\!", "");
mean = mean.replaceAll("\\-", "");
mean = mean.split(",")[0];
mean = mean.split("\'")[0];
mean = mean.split("&")[0];
mean = mean.trim();
for (String rm : new String[] {"*",".", "/"}) {
if (mean.contains(rm)) {
mean = "";
}
}
if (!mean.isEmpty()) {
literal.meaning.add(mean);
}
} }
mLang = null; mLang = null;
} }

View file

@ -22,8 +22,8 @@
*/ */
package org.x4o.fc18.cake2.fcdoc.kanji; package org.x4o.fc18.cake2.fcdoc.kanji;
import java.util.HashSet; import java.util.ArrayList;
import java.util.Set; import java.util.List;
/** /**
* Kanji KuTen and FourCorner and codePoint and meaning of a literal. * Kanji KuTen and FourCorner and codePoint and meaning of a literal.
@ -37,5 +37,5 @@ public class KanjiDictLiteral {
String kuTen213; String kuTen213;
Integer fourCorner; Integer fourCorner;
Integer codePoint; Integer codePoint;
final Set<String> meaning = new HashSet<>(); final List<String> meaning = new ArrayList<>();
} }

View file

@ -25,6 +25,8 @@ package org.x4o.fc18.cake2.fcdoc.kanji;
import java.io.File; import java.io.File;
import java.io.FileInputStream; import java.io.FileInputStream;
import java.io.InputStream; import java.io.InputStream;
import java.util.HashSet;
import java.util.Set;
import java.util.stream.Collectors; import java.util.stream.Collectors;
/** /**
@ -46,18 +48,39 @@ public class KanjiDictTest {
dict.parseXML(in); dict.parseXML(in);
} }
System.out.println("======================================="); System.out.println("=======================================");
for (String key : dict.mapKuTen208.keySet()) {
KanjiDictLiteral lit = dict.mapKuTen208.get(key);
int meanCnt = lit.meaning.size();
String meaning = lit.meaning.stream().collect(Collectors.joining(" "));
System.out.println(String.format("%06d - %s - %d = %d %s", lit.codePoint, lit.kuTen208, lit.fourCorner, meanCnt, meaning));
}
System.out.println("=======================================");
for (Integer key : dict.mapFourCorner.keySet()) { for (Integer key : dict.mapFourCorner.keySet()) {
KanjiDictLiteral lit = dict.mapFourCorner.get(key); KanjiDictLiteral lit = dict.mapFourCorner.get(key);
int meanCnt = lit.meaning.size(); int meanCnt = lit.meaning.size();
String meaning = lit.meaning.stream().collect(Collectors.joining(" ")); String meaning = lit.meaning.stream().map(v -> v.replaceAll(" ", "_").toUpperCase()).collect(Collectors.joining(" "));
System.out.println(String.format("%06d - %s - %d = %d %s", lit.codePoint, lit.kuTen208, lit.fourCorner, meanCnt, meaning)); System.out.print(new StringBuilder().appendCodePoint(lit.codePoint).toString());
System.out.println(String.format(" %s FC %5d = %2d %s", lit.kuTen208, lit.fourCorner, meanCnt, meaning));
}
int meanEmpty = 0;
int meanFail = 0;
int meanUse2 = 0;
Set<String> uniqMean = new HashSet<>();
for (Integer key : dict.mapFourCorner.keySet()) {
KanjiDictLiteral lit = dict.mapFourCorner.get(key);
if (lit.meaning.isEmpty()) {
meanEmpty++;
continue;
}
String meanFirst = lit.meaning.get(0);
if (uniqMean.contains(meanFirst)) {
if (lit.meaning.size() > 1) {
String meanSecond = lit.meaning.get(1);
if (uniqMean.contains(meanSecond)) {
meanFail++;
} else {
meanUse2++;
uniqMean.add(meanSecond);
}
}
meanFail++;
} else {
uniqMean.add(meanFirst);
}
} }
// JIS X 0208 prescribes a set of 6879 graphical characters // JIS X 0208 prescribes a set of 6879 graphical characters
@ -66,9 +89,20 @@ public class KanjiDictTest {
System.out.println("mapFourCorner: " + dict.mapFourCorner.size()); System.out.println("mapFourCorner: " + dict.mapFourCorner.size());
System.out.println("mapKuTen208: " + dict.mapKuTen208.size()); System.out.println("mapKuTen208: " + dict.mapKuTen208.size());
System.out.println("mapKuTen213: " + dict.mapKuTen213.size()); System.out.println("mapKuTen213: " + dict.mapKuTen213.size());
System.out.println("=======================================");
System.out.println("meanUniq: " + uniqMean.size());
System.out.println("meanEmpty: " + meanEmpty);
System.out.println("meanFail: " + meanFail);
System.out.println("meanUse2: " + meanUse2);
// 15642334 bytes or 538402 lines gives; // 15642334 bytes or 538402 lines gives;
// =======================================
// mapFourCorner: 3936 // mapFourCorner: 3936
// mapKuTen208: 6355 // mapKuTen208: 6355
// mapKuTen213: 3695 // mapKuTen213: 3695
// =======================================
// meanUniq: 3332
// meanEmpty: 1
// meanFail: 1228
// meanUse2: 405
} }
} }