Fixed empty meaning literal and matched the last meaning first for uniqueness

This commit is contained in:
Willem Cazander 2025-01-24 13:23:04 +01:00
parent f108053f73
commit 365505afc6
2 changed files with 37 additions and 21 deletions

View file

@ -123,8 +123,9 @@ public class KanjiDict {
mean = mean.split(",")[0]; mean = mean.split(",")[0];
mean = mean.split("\'")[0]; mean = mean.split("\'")[0];
mean = mean.split("&")[0]; mean = mean.split("&")[0];
mean = mean.split("/")[0];
mean = mean.trim(); mean = mean.trim();
for (String rm : new String[] {"*",".", "/"}) { for (String rm : new String[] {"*","."}) {
if (mean.contains(rm)) { if (mean.contains(rm)) {
mean = ""; mean = "";
} }

View file

@ -25,7 +25,9 @@ package org.x4o.fc18.cake2.fcdoc.kanji;
import java.io.File; import java.io.File;
import java.io.FileInputStream; import java.io.FileInputStream;
import java.io.InputStream; import java.io.InputStream;
import java.util.HashMap;
import java.util.HashSet; import java.util.HashSet;
import java.util.Map;
import java.util.Set; import java.util.Set;
import java.util.stream.Collectors; import java.util.stream.Collectors;
@ -59,33 +61,44 @@ public class KanjiDictTest {
int meanEmpty = 0; int meanEmpty = 0;
int meanFail = 0; int meanFail = 0;
int meanUse2 = 0; int meanUse2 = 0;
Set<String> uniqMean = new HashSet<>(); int meanUse1 = 0;
Map<String, KanjiDictLiteral> uniqMean = new HashMap<>();
for (Integer key : dict.mapFourCorner.keySet()) { for (Integer key : dict.mapFourCorner.keySet()) {
KanjiDictLiteral lit = dict.mapFourCorner.get(key); KanjiDictLiteral lit = dict.mapFourCorner.get(key);
if (lit.meaning.isEmpty()) { if (lit.meaning.isEmpty()) {
meanEmpty++; meanEmpty++;
continue; continue;
} }
String meanFirst = lit.meaning.get(0); String meanLast = lit.meaning.get(lit.meaning.size() - 1);
if (uniqMean.contains(meanFirst)) { if (!uniqMean.containsKey(meanLast)) {
if (lit.meaning.size() > 1) { uniqMean.put(meanLast, lit);
String meanSecond = lit.meaning.get(1); continue;
if (uniqMean.contains(meanSecond)) {
meanFail++;
} else {
meanUse2++;
uniqMean.add(meanSecond);
}
} else {
meanFail++;
}
} else {
uniqMean.add(meanFirst);
} }
if (lit.meaning.size() > 1) {
String meanSecond = lit.meaning.get(1);
if (!uniqMean.containsKey(meanSecond)) {
uniqMean.put(meanSecond, lit);
meanUse2++;
continue;
}
}
String meanFirst = lit.meaning.get(0);
if (!uniqMean.containsKey(meanFirst)) {
uniqMean.put(meanFirst, lit);
meanUse1++;
continue;
}
meanFail++;
} }
// for (String litMean : uniqMean.keySet()) {
// KanjiDictLiteral lit = uniqMean.get(litMean);
// System.out.print(new StringBuilder().appendCodePoint(lit.codePoint).toString());
// System.out.println(" = " + litMean);
// }
// JIS X 0208 prescribes a set of 6879 graphical characters // JIS X 0208 prescribes a set of 6879 graphical characters
// JIS X 0213 + 0208 have total characters 11233 // JIS X 0213 + 0208 have total characters 11233
// Japanese engineers have defined
System.out.println("======================================="); System.out.println("=======================================");
System.out.println("mapFourCorner: " + dict.mapFourCorner.size()); System.out.println("mapFourCorner: " + dict.mapFourCorner.size());
System.out.println("mapKuTen208: " + dict.mapKuTen208.size()); System.out.println("mapKuTen208: " + dict.mapKuTen208.size());
@ -95,15 +108,17 @@ public class KanjiDictTest {
System.out.println("meanEmpty: " + meanEmpty); System.out.println("meanEmpty: " + meanEmpty);
System.out.println("meanFail: " + meanFail); System.out.println("meanFail: " + meanFail);
System.out.println("meanUse2: " + meanUse2); System.out.println("meanUse2: " + meanUse2);
System.out.println("meanUse1: " + meanUse1);
// 15642334 bytes or 538402 lines gives; // 15642334 bytes or 538402 lines gives;
// ======================================= // =======================================
// mapFourCorner: 3936 // mapFourCorner: 3936
// mapKuTen208: 6355 // mapKuTen208: 6355
// mapKuTen213: 3695 // mapKuTen213: 3695
// ======================================= // =======================================
// meanUniq: 3332 // meanUniq: 3440
// meanEmpty: 1 // meanEmpty: 0
// meanFail: 603 // meanFail: 496
// meanUse2: 405 // meanUse2: 233
// meanUse1: 300
} }
} }