diff --git a/nx01-x4o-fc18/src/test/java/org/x4o/fc18/cake2/fcdoc/kanji/KanjiDict.java b/nx01-x4o-fc18/src/test/java/org/x4o/fc18/cake2/fcdoc/kanji/KanjiDict.java new file mode 100644 index 0000000..e14a863 --- /dev/null +++ b/nx01-x4o-fc18/src/test/java/org/x4o/fc18/cake2/fcdoc/kanji/KanjiDict.java @@ -0,0 +1,156 @@ +/* + * Copyright (c) 2004-2014, Willem Cazander + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, are permitted provided + * that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, this list of conditions and the + * following disclaimer. + * * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and + * the following disclaimer in the documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT + * OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR + * TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ +package org.x4o.fc18.cake2.fcdoc.kanji; + +import java.io.IOException; +import java.io.InputStream; +import java.util.HashMap; +import java.util.Map; + +import javax.xml.parsers.ParserConfigurationException; +import javax.xml.parsers.SAXParser; +import javax.xml.parsers.SAXParserFactory; + +import org.xml.sax.Attributes; +import org.xml.sax.ContentHandler; +import org.xml.sax.InputSource; +import org.xml.sax.Locator; +import org.xml.sax.SAXException; +import org.xml.sax.XMLReader; + +/** + * Simple dict to lookup kanji. + * + * @author Willem Cazander + * @version 1.0 Jan 23, 2025 + */ +public class KanjiDict { + + Map mapKuTen208 = new HashMap<>(); + Map mapKuTen213 = new HashMap<>(); + Map mapFourCorner = new HashMap<>(); + + public KanjiDict() { + } + + public void parseXML(InputStream input) throws ParserConfigurationException, SAXException, IOException { + SAXParserFactory factory = SAXParserFactory.newInstance(); + SAXParser parser = factory.newSAXParser(); + XMLReader reader = parser.getXMLReader(); + KanjiDictContentHandler handler = new KanjiDictContentHandler(); + reader.setContentHandler(handler); + InputSource inputSource = new InputSource(input); + reader.parse(inputSource); + } + + class KanjiDictContentHandler implements ContentHandler { + + StringBuilder bufChar = new StringBuilder(); + KanjiDictLiteral literal; + String cpType; + String qcType; + + private void addLiteral() { + if (literal.kuTen208 != null) { + mapKuTen208.put(literal.kuTen208, literal); + } + if (literal.kuTen213 != null) { + mapKuTen213.put(literal.kuTen213, literal); + } + if (literal.fourCorner != null) { + mapFourCorner.put(literal.fourCorner, literal); + } + literal = null; + } + + @Override + public void characters(char[] ch, int start, int length) throws SAXException { + bufChar.append(new String(ch,start,length)); + } + + @Override + public void endDocument() throws SAXException { + } + + @Override + public void endElement(String namespaceURI, String localName, 
String qName) throws SAXException { + if ("character".equals(qName)) { + addLiteral(); + } else if ("cp_value".equals(qName)) { + if ("ucs".equals(cpType)) { + literal.codePoint = Integer.parseInt(bufChar.toString(), 16); + } else if ("jis208".equals(cpType)) { + literal.kuTen208 = bufChar.toString(); + } else if ("jis213".equals(cpType)) { + literal.kuTen213 = bufChar.toString(); + } + } else if ("q_code".equals(qName)) { + if ("four_corner".equals(qcType)) { + literal.fourCorner = Integer.parseInt(bufChar.toString().replaceAll("\\.", "")); + } + } else if ("meaning".equals(qName)) { + literal.meaning.add(bufChar.toString()); + } + bufChar = new StringBuilder(); + } + + @Override + public void endPrefixMapping(String arg0) throws SAXException { + } + + @Override + public void ignorableWhitespace(char[] arg0, int arg1, int arg2) throws SAXException { + } + + @Override + public void processingInstruction(String arg0, String arg1) throws SAXException { + } + + @Override + public void setDocumentLocator(Locator arg0) { + } + + @Override + public void skippedEntity(String arg0) throws SAXException { + } + + @Override + public void startDocument() throws SAXException { + } + + @Override + public void startElement(String namespaceURI, String localName, String qName, Attributes atts) throws SAXException { + if ("character".equals(qName)) { + literal = new KanjiDictLiteral(); + } else if ("cp_value".equals(qName)) { + cpType = atts.getValue("cp_type"); + } else if ("q_code".equals(qName)) { + qcType = atts.getValue("qc_type"); + } + } + + @Override + public void startPrefixMapping(String arg0, String arg1) throws SAXException { + } + } +} diff --git a/nx01-x4o-fc18/src/test/java/org/x4o/fc18/cake2/fcdoc/kanji/KanjiDictLiteral.java b/nx01-x4o-fc18/src/test/java/org/x4o/fc18/cake2/fcdoc/kanji/KanjiDictLiteral.java new file mode 100644 index 0000000..60e9b75 --- /dev/null +++ b/nx01-x4o-fc18/src/test/java/org/x4o/fc18/cake2/fcdoc/kanji/KanjiDictLiteral.java @@ -0,0 +1,41 @@ 
import java.util.HashSet;
import java.util.Set;

/**
 * Kanji KuTen and FourCorner and codePoint and meaning of a literal.
 *
 * Plain value holder; every field except {@link #meaning} stays {@code null}
 * until a value has been assigned.
 *
 * @author Willem Cazander
 * @version 1.0 Jan 23, 2025
 */
public class KanjiDictLiteral {

    /** KuTen value for JIS X 0208, or {@code null} when unknown. */
    String kuTen208;

    /** KuTen value for JIS X 0213, or {@code null} when unknown. */
    String kuTen213;

    /** Four corner code as a number, or {@code null} when unknown. */
    Integer fourCorner;

    /** Unicode code point, or {@code null} when unknown. */
    Integer codePoint;

    /** Distinct meanings of this literal; never {@code null}, no defined order. */
    final Set<String> meaning = new HashSet<>();
}
+ */ +package org.x4o.fc18.cake2.fcdoc.kanji; + +import java.io.File; +import java.io.FileInputStream; +import java.io.InputStream; +import java.util.stream.Collectors; + +/** + * Write data files from "kanjidic2.xml" + * + * @author Willem Cazander + * @version 1.0 Jan 23, 2025 + */ +public class KanjiDictTest { + + static public void main(String[] args) throws Exception { + if (args.length == 0) { + System.err.println("No argument file given"); + System.exit(1); + return; + } + KanjiDict dict = new KanjiDict(); + try (InputStream in = new FileInputStream(new File(args[0]))) { + dict.parseXML(in); + } + System.out.println("======================================="); + for (String key : dict.mapKuTen208.keySet()) { + KanjiDictLiteral lit = dict.mapKuTen208.get(key); + int meanCnt = lit.meaning.size(); + String meaning = lit.meaning.stream().collect(Collectors.joining(" ")); + System.out.println(String.format("%06d - %s - %d = %d %s", lit.codePoint, lit.kuTen208, lit.fourCorner, meanCnt, meaning)); + } + System.out.println("======================================="); + for (Integer key : dict.mapFourCorner.keySet()) { + KanjiDictLiteral lit = dict.mapFourCorner.get(key); + int meanCnt = lit.meaning.size(); + String meaning = lit.meaning.stream().collect(Collectors.joining(" ")); + System.out.println(String.format("%06d - %s - %d = %d %s", lit.codePoint, lit.kuTen208, lit.fourCorner, meanCnt, meaning)); + } + + // JIS X 0208 prescribes a set of 6879 graphical characters + // JIS X 0213 + 0208 have total characters 11233 + System.out.println("======================================="); + System.out.println("mapFourCorner: " + dict.mapFourCorner.size()); + System.out.println("mapKuTen208: " + dict.mapKuTen208.size()); + System.out.println("mapKuTen213: " + dict.mapKuTen213.size()); + // 15642334 bytes or 538402 lines gives; + // mapFourCorner: 3936 + // mapKuTen208: 6355 + // mapKuTen213: 3695 + } +}