Added first part of test code to read the four corner meaning from kanji

This commit is contained in:
Willem Cazander 2025-01-24 10:53:16 +01:00
parent 661a179c23
commit 77e27954f1
3 changed files with 271 additions and 0 deletions

View file

@ -0,0 +1,156 @@
/*
* Copyright (c) 2004-2014, Willem Cazander
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without modification, are permitted provided
* that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice, this list of conditions and the
* following disclaimer.
* * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and
* the following disclaimer in the documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL
* THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT
* OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR
* TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
package org.x4o.fc18.cake2.fcdoc.kanji;
import java.io.IOException;
import java.io.InputStream;
import java.util.HashMap;
import java.util.Map;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;
import org.xml.sax.Attributes;
import org.xml.sax.ContentHandler;
import org.xml.sax.InputSource;
import org.xml.sax.Locator;
import org.xml.sax.SAXException;
import org.xml.sax.XMLReader;
/**
 * Simple dict to lookup kanji.
 * <p>
 * The lookup maps are filled by {@link #parseXML(InputStream)} from an XML document which
 * uses the elements "character", "cp_value" (attribute "cp_type"), "q_code" (attribute
 * "qc_type") and "meaning". All other elements are ignored.
 * </p>
 *
 * @author Willem Cazander
 * @version 1.0 Jan 23, 2025
 */
public class KanjiDict {

	/** Literals keyed by the text of their "jis208" cp_value. */
	Map<String, KanjiDictLiteral> mapKuTen208 = new HashMap<>();

	/** Literals keyed by the text of their "jis213" cp_value. */
	Map<String, KanjiDictLiteral> mapKuTen213 = new HashMap<>();

	/**
	 * Literals keyed by their "four_corner" q_code with the dot removed and parsed as decimal.
	 * A literal parsed later replaces an earlier one which has the same code.
	 */
	Map<Integer, KanjiDictLiteral> mapFourCorner = new HashMap<>();

	public KanjiDict() {
	}

	/**
	 * Parses the given XML stream and adds every "character" element to the lookup maps.
	 * The stream is not closed by this method.
	 *
	 * @param input the XML document to read.
	 * @throws ParserConfigurationException when no suitable SAX parser can be created.
	 * @throws SAXException when the document can not be parsed.
	 * @throws IOException when the stream can not be read.
	 */
	public void parseXML(InputStream input) throws ParserConfigurationException, SAXException, IOException {
		SAXParserFactory factory = SAXParserFactory.newInstance();
		// The dictionary data never needs external entities, so refuse to resolve them (XXE).
		// An internal DOCTYPE stays allowed.
		factory.setFeature("http://xml.org/sax/features/external-general-entities", false);
		factory.setFeature("http://xml.org/sax/features/external-parameter-entities", false);
		SAXParser parser = factory.newSAXParser();
		XMLReader reader = parser.getXMLReader();
		reader.setContentHandler(new KanjiDictContentHandler());
		reader.parse(new InputSource(input));
	}

	/**
	 * SAX handler which builds one {@link KanjiDictLiteral} per "character" element and
	 * registers it in the maps of the enclosing dict when that element ends.
	 */
	class KanjiDictContentHandler implements ContentHandler {

		/** Text of the element which is currently open. */
		StringBuilder bufChar = new StringBuilder();

		/** Literal of the open "character" element, or null when outside of one. */
		KanjiDictLiteral literal;

		/** The "cp_type" attribute of the last started "cp_value" element. */
		String cpType;

		/** The "qc_type" attribute of the last started "q_code" element. */
		String qcType;

		/**
		 * Registers the current literal under every key it has and clears it.
		 */
		private void addLiteral() {
			if (literal == null) {
				return;
			}
			if (literal.kuTen208 != null) {
				mapKuTen208.put(literal.kuTen208, literal);
			}
			if (literal.kuTen213 != null) {
				mapKuTen213.put(literal.kuTen213, literal);
			}
			if (literal.fourCorner != null) {
				mapFourCorner.put(literal.fourCorner, literal);
			}
			literal = null;
		}

		@Override
		public void characters(char[] ch, int start, int length) throws SAXException {
			// May be called several times for one element, so only collect here.
			bufChar.append(ch, start, length);
		}

		@Override
		public void endDocument() throws SAXException {
		}

		@Override
		public void endElement(String namespaceURI, String localName, String qName) throws SAXException {
			if ("character".equals(qName)) {
				addLiteral();
			} else if (literal != null) {
				// Elements outside of a "character" element have no literal to fill and are skipped.
				String text = bufChar.toString();
				if ("cp_value".equals(qName)) {
					String code = text.trim();
					if ("ucs".equals(cpType)) {
						literal.codePoint = Integer.parseInt(code, 16);
					} else if ("jis208".equals(cpType)) {
						literal.kuTen208 = code;
					} else if ("jis213".equals(cpType)) {
						literal.kuTen213 = code;
					}
				} else if ("q_code".equals(qName)) {
					if ("four_corner".equals(qcType)) {
						// Stored as number, thus leading zeros of the code are not kept.
						literal.fourCorner = Integer.parseInt(text.trim().replace(".", ""));
					}
				} else if ("meaning".equals(qName)) {
					literal.meaning.add(text);
				}
			}
			bufChar.setLength(0);
		}

		@Override
		public void endPrefixMapping(String arg0) throws SAXException {
		}

		@Override
		public void ignorableWhitespace(char[] arg0, int arg1, int arg2) throws SAXException {
		}

		@Override
		public void processingInstruction(String arg0, String arg1) throws SAXException {
		}

		@Override
		public void setDocumentLocator(Locator arg0) {
		}

		@Override
		public void skippedEntity(String arg0) throws SAXException {
		}

		@Override
		public void startDocument() throws SAXException {
		}

		@Override
		public void startElement(String namespaceURI, String localName, String qName, Attributes atts) throws SAXException {
			// Drop text collected before this element (like indentation which the parser reported
			// as characters), else it would end up in front of the value of a leaf element.
			bufChar.setLength(0);
			if ("character".equals(qName)) {
				literal = new KanjiDictLiteral();
			} else if ("cp_value".equals(qName)) {
				cpType = atts.getValue("cp_type");
			} else if ("q_code".equals(qName)) {
				qcType = atts.getValue("qc_type");
			}
		}

		@Override
		public void startPrefixMapping(String arg0, String arg1) throws SAXException {
		}
	}
}

View file

@ -0,0 +1,41 @@
/*
* Copyright (c) 2004-2014, Willem Cazander
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without modification, are permitted provided
* that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice, this list of conditions and the
* following disclaimer.
* * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and
* the following disclaimer in the documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL
* THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT
* OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR
* TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
package org.x4o.fc18.cake2.fcdoc.kanji;
import java.util.HashSet;
import java.util.Set;
/**
 * Kanji KuTen and FourCorner and codePoint and meaning of a literal.
 * <p>
 * Plain data holder, every field except {@link #meaning} stays null until it is assigned.
 * </p>
 *
 * @author Willem Cazander
 * @version 1.0 Jan 23, 2025
 */
public class KanjiDictLiteral {

	/** The "jis208" code of this literal, or null when not set. */
	String kuTen208;

	/** The "jis213" code of this literal, or null when not set. */
	String kuTen213;

	/** The four corner code of this literal as number, or null when not set. */
	Integer fourCorner;

	/** The code point of this literal, or null when not set. */
	Integer codePoint;

	/**
	 * The distinct meanings of this literal.
	 * Uses a linked set so that iteration follows the order in which the meanings were added,
	 * which keeps printed output stable and in source order.
	 */
	final Set<String> meaning = new java.util.LinkedHashSet<>();
}

View file

@ -0,0 +1,74 @@
/*
* Copyright (c) 2004-2014, Willem Cazander
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without modification, are permitted provided
* that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice, this list of conditions and the
* following disclaimer.
* * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and
* the following disclaimer in the documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL
* THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT
* OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR
* TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
package org.x4o.fc18.cake2.fcdoc.kanji;
import java.io.File;
import java.io.FileInputStream;
import java.io.InputStream;
import java.util.stream.Collectors;
/**
 * Command line check which parses the XML file given as first argument (like "kanjidic2.xml")
 * into a {@link KanjiDict} and prints the content and the sizes of its lookup maps.
 *
 * @author Willem Cazander
 * @version 1.0 Jan 23, 2025
 */
public class KanjiDictTest {

	/** Line printed between the output sections. */
	private static final String SEPARATOR = "=======================================";

	/** Layout of one literal line; code point, kuTen208, four corner, meaning count and meanings. */
	private static final String LINE_FORMAT = "%06d - %s - %d = %d %s";

	/**
	 * Parses the file and dumps the result to standard out.
	 *
	 * @param args the path of the XML file as first element.
	 * @throws Exception when the file can not be read or parsed.
	 */
	public static void main(String[] args) throws Exception {
		if (args.length < 1) {
			System.err.println("No argument file given");
			System.exit(1);
			return;
		}
		File xmlFile = new File(args[0]);
		KanjiDict dict = new KanjiDict();
		try (InputStream xmlStream = new FileInputStream(xmlFile)) {
			dict.parseXML(xmlStream);
		}

		System.out.println(SEPARATOR);
		printLiterals(dict.mapKuTen208.values());

		System.out.println(SEPARATOR);
		printLiterals(dict.mapFourCorner.values());

		// JIS X 0208 prescribes a set of 6879 graphical characters
		// JIS X 0213 + 0208 have total characters 11233
		System.out.println(SEPARATOR);
		System.out.println("mapFourCorner: " + dict.mapFourCorner.size());
		System.out.println("mapKuTen208: " + dict.mapKuTen208.size());
		System.out.println("mapKuTen213: " + dict.mapKuTen213.size());
		// 15642334 bytes or 538402 lines gives;
		// mapFourCorner: 3936
		// mapKuTen208: 6355
		// mapKuTen213: 3695
	}

	/**
	 * Prints one line per literal, with its meanings joined by a space.
	 *
	 * @param literals the literals to print, in iteration order.
	 */
	private static void printLiterals(Iterable<KanjiDictLiteral> literals) {
		for (KanjiDictLiteral entry : literals) {
			String joined = String.join(" ", entry.meaning);
			String line = String.format(LINE_FORMAT, entry.codePoint, entry.kuTen208, entry.fourCorner, entry.meaning.size(), joined);
			System.out.println(line);
		}
	}
}