From b67da1803db6cedeaf24529ad1bac9ec6fc70bff Mon Sep 17 00:00:00 2001 From: Willem Date: Fri, 24 Jan 2025 12:46:31 +0100 Subject: [PATCH] Added cleaning of meaning data of kanji and checked uniqueness of word set --- .../x4o/fc18/cake2/fcdoc/kanji/KanjiDict.java | 22 +++++++- .../cake2/fcdoc/kanji/KanjiDictLiteral.java | 6 +-- .../fc18/cake2/fcdoc/kanji/KanjiDictTest.java | 52 +++++++++++++++---- 3 files changed, 67 insertions(+), 13 deletions(-) diff --git a/nx01-x4o-fc18/src/test/java/org/x4o/fc18/cake2/fcdoc/kanji/KanjiDict.java b/nx01-x4o-fc18/src/test/java/org/x4o/fc18/cake2/fcdoc/kanji/KanjiDict.java index 5baf8eb..2429074 100644 --- a/nx01-x4o-fc18/src/test/java/org/x4o/fc18/cake2/fcdoc/kanji/KanjiDict.java +++ b/nx01-x4o-fc18/src/test/java/org/x4o/fc18/cake2/fcdoc/kanji/KanjiDict.java @@ -111,7 +111,27 @@ public class KanjiDict { } } else if ("meaning".equals(qName)) { if (mLang == null) { - literal.meaning.add(bufChar.toString()); + String mean = bufChar.toString(); + if (mean.startsWith("(")) { + mean = mean.replaceAll("\\(", ""); + mean = mean.replaceAll("\\)", ""); + } + mean = mean.replaceAll("\\(.*\\)", ""); + mean = mean.replaceAll("\\?", ""); + mean = mean.replaceAll("\\!", ""); + mean = mean.replaceAll("\\-", ""); + mean = mean.split(",")[0]; + mean = mean.split("\'")[0]; + mean = mean.split("&")[0]; + mean = mean.trim(); + for (String rm : new String[] {"*",".", "/"}) { + if (mean.contains(rm)) { + mean = ""; + } + } + if (!mean.isEmpty()) { + literal.meaning.add(mean); + } } mLang = null; } diff --git a/nx01-x4o-fc18/src/test/java/org/x4o/fc18/cake2/fcdoc/kanji/KanjiDictLiteral.java b/nx01-x4o-fc18/src/test/java/org/x4o/fc18/cake2/fcdoc/kanji/KanjiDictLiteral.java index 60e9b75..fc5df5e 100644 --- a/nx01-x4o-fc18/src/test/java/org/x4o/fc18/cake2/fcdoc/kanji/KanjiDictLiteral.java +++ b/nx01-x4o-fc18/src/test/java/org/x4o/fc18/cake2/fcdoc/kanji/KanjiDictLiteral.java @@ -22,8 +22,8 @@ */ package org.x4o.fc18.cake2.fcdoc.kanji; -import 
java.util.HashSet; -import java.util.Set; +import java.util.ArrayList; +import java.util.List; /** * Kanji KuTen and FourCorner and codePoint and meaning of a literal. @@ -37,5 +37,5 @@ public class KanjiDictLiteral { String kuTen213; Integer fourCorner; Integer codePoint; - final Set meaning = new HashSet<>(); + final List meaning = new ArrayList<>(); } diff --git a/nx01-x4o-fc18/src/test/java/org/x4o/fc18/cake2/fcdoc/kanji/KanjiDictTest.java b/nx01-x4o-fc18/src/test/java/org/x4o/fc18/cake2/fcdoc/kanji/KanjiDictTest.java index cfc9879..7b0432e 100644 --- a/nx01-x4o-fc18/src/test/java/org/x4o/fc18/cake2/fcdoc/kanji/KanjiDictTest.java +++ b/nx01-x4o-fc18/src/test/java/org/x4o/fc18/cake2/fcdoc/kanji/KanjiDictTest.java @@ -25,6 +25,8 @@ package org.x4o.fc18.cake2.fcdoc.kanji; import java.io.File; import java.io.FileInputStream; import java.io.InputStream; +import java.util.HashSet; +import java.util.Set; import java.util.stream.Collectors; /** @@ -46,18 +48,39 @@ public class KanjiDictTest { dict.parseXML(in); } System.out.println("======================================="); - for (String key : dict.mapKuTen208.keySet()) { - KanjiDictLiteral lit = dict.mapKuTen208.get(key); - int meanCnt = lit.meaning.size(); - String meaning = lit.meaning.stream().collect(Collectors.joining(" ")); - System.out.println(String.format("%06d - %s - %d = %d %s", lit.codePoint, lit.kuTen208, lit.fourCorner, meanCnt, meaning)); - } - System.out.println("======================================="); for (Integer key : dict.mapFourCorner.keySet()) { KanjiDictLiteral lit = dict.mapFourCorner.get(key); int meanCnt = lit.meaning.size(); - String meaning = lit.meaning.stream().collect(Collectors.joining(" ")); - System.out.println(String.format("%06d - %s - %d = %d %s", lit.codePoint, lit.kuTen208, lit.fourCorner, meanCnt, meaning)); + String meaning = lit.meaning.stream().map(v -> v.replaceAll(" ", "_").toUpperCase()).collect(Collectors.joining(" ")); + System.out.print(new 
StringBuilder().appendCodePoint(lit.codePoint).toString()); + System.out.println(String.format(" %s FC %5d = %2d %s", lit.kuTen208, lit.fourCorner, meanCnt, meaning)); + } + + int meanEmpty = 0; + int meanFail = 0; + int meanUse2 = 0; + Set uniqMean = new HashSet<>(); + for (Integer key : dict.mapFourCorner.keySet()) { + KanjiDictLiteral lit = dict.mapFourCorner.get(key); + if (lit.meaning.isEmpty()) { + meanEmpty++; + continue; + } + String meanFirst = lit.meaning.get(0); + if (uniqMean.contains(meanFirst)) { + if (lit.meaning.size() > 1) { + String meanSecond = lit.meaning.get(1); + if (uniqMean.contains(meanSecond)) { + meanFail++; + } else { + meanUse2++; + uniqMean.add(meanSecond); + } + } + meanFail++; + } else { + uniqMean.add(meanFirst); + } } // JIS X 0208 prescribes a set of 6879 graphical characters @@ -66,9 +89,20 @@ public class KanjiDictTest { System.out.println("mapFourCorner: " + dict.mapFourCorner.size()); System.out.println("mapKuTen208: " + dict.mapKuTen208.size()); System.out.println("mapKuTen213: " + dict.mapKuTen213.size()); + System.out.println("======================================="); + System.out.println("meanUniq: " + uniqMean.size()); + System.out.println("meanEmpty: " + meanEmpty); + System.out.println("meanFail: " + meanFail); + System.out.println("meanUse2: " + meanUse2); // 15642334 bytes or 538402 lines gives; + // ======================================= // mapFourCorner: 3936 // mapKuTen208: 6355 // mapKuTen213: 3695 + // ======================================= + // meanUniq: 3332 + // meanEmpty: 1 + // meanFail: 1228 + // meanUse2: 405 } }