Added first part of test code to read the four corner meaning from kanji
This commit is contained in:
parent
661a179c23
commit
77e27954f1
|
@ -0,0 +1,156 @@
|
||||||
|
/*
|
||||||
|
* Copyright (c) 2004-2014, Willem Cazander
|
||||||
|
* All rights reserved.
|
||||||
|
*
|
||||||
|
* Redistribution and use in source and binary forms, with or without modification, are permitted provided
|
||||||
|
* that the following conditions are met:
|
||||||
|
*
|
||||||
|
* * Redistributions of source code must retain the above copyright notice, this list of conditions and the
|
||||||
|
* following disclaimer.
|
||||||
|
* * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and
|
||||||
|
* the following disclaimer in the documentation and/or other materials provided with the distribution.
|
||||||
|
*
|
||||||
|
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY
|
||||||
|
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
|
||||||
|
* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL
|
||||||
|
* THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||||
|
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT
|
||||||
|
* OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
|
||||||
|
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR
|
||||||
|
* TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||||
|
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
*/
|
||||||
|
package org.x4o.fc18.cake2.fcdoc.kanji;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.io.InputStream;
|
||||||
|
import java.util.HashMap;
|
||||||
|
import java.util.Map;
|
||||||
|
|
||||||
|
import javax.xml.parsers.ParserConfigurationException;
|
||||||
|
import javax.xml.parsers.SAXParser;
|
||||||
|
import javax.xml.parsers.SAXParserFactory;
|
||||||
|
|
||||||
|
import org.xml.sax.Attributes;
|
||||||
|
import org.xml.sax.ContentHandler;
|
||||||
|
import org.xml.sax.InputSource;
|
||||||
|
import org.xml.sax.Locator;
|
||||||
|
import org.xml.sax.SAXException;
|
||||||
|
import org.xml.sax.XMLReader;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Simple dict to lookup kanji.
|
||||||
|
*
|
||||||
|
* @author Willem Cazander
|
||||||
|
* @version 1.0 Jan 23, 2025
|
||||||
|
*/
|
||||||
|
public class KanjiDict {
|
||||||
|
|
||||||
|
Map<String, KanjiDictLiteral> mapKuTen208 = new HashMap<>();
|
||||||
|
Map<String, KanjiDictLiteral> mapKuTen213 = new HashMap<>();
|
||||||
|
Map<Integer, KanjiDictLiteral> mapFourCorner = new HashMap<>();
|
||||||
|
|
||||||
|
public KanjiDict() {
|
||||||
|
}
|
||||||
|
|
||||||
|
public void parseXML(InputStream input) throws ParserConfigurationException, SAXException, IOException {
|
||||||
|
SAXParserFactory factory = SAXParserFactory.newInstance();
|
||||||
|
SAXParser parser = factory.newSAXParser();
|
||||||
|
XMLReader reader = parser.getXMLReader();
|
||||||
|
KanjiDictContentHandler handler = new KanjiDictContentHandler();
|
||||||
|
reader.setContentHandler(handler);
|
||||||
|
InputSource inputSource = new InputSource(input);
|
||||||
|
reader.parse(inputSource);
|
||||||
|
}
|
||||||
|
|
||||||
|
class KanjiDictContentHandler implements ContentHandler {
|
||||||
|
|
||||||
|
StringBuilder bufChar = new StringBuilder();
|
||||||
|
KanjiDictLiteral literal;
|
||||||
|
String cpType;
|
||||||
|
String qcType;
|
||||||
|
|
||||||
|
private void addLiteral() {
|
||||||
|
if (literal.kuTen208 != null) {
|
||||||
|
mapKuTen208.put(literal.kuTen208, literal);
|
||||||
|
}
|
||||||
|
if (literal.kuTen213 != null) {
|
||||||
|
mapKuTen213.put(literal.kuTen213, literal);
|
||||||
|
}
|
||||||
|
if (literal.fourCorner != null) {
|
||||||
|
mapFourCorner.put(literal.fourCorner, literal);
|
||||||
|
}
|
||||||
|
literal = null;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void characters(char[] ch, int start, int length) throws SAXException {
|
||||||
|
bufChar.append(new String(ch,start,length));
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void endDocument() throws SAXException {
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void endElement(String namespaceURI, String localName, String qName) throws SAXException {
|
||||||
|
if ("character".equals(qName)) {
|
||||||
|
addLiteral();
|
||||||
|
} else if ("cp_value".equals(qName)) {
|
||||||
|
if ("ucs".equals(cpType)) {
|
||||||
|
literal.codePoint = Integer.parseInt(bufChar.toString(), 16);
|
||||||
|
} else if ("jis208".equals(cpType)) {
|
||||||
|
literal.kuTen208 = bufChar.toString();
|
||||||
|
} else if ("jis213".equals(cpType)) {
|
||||||
|
literal.kuTen213 = bufChar.toString();
|
||||||
|
}
|
||||||
|
} else if ("q_code".equals(qName)) {
|
||||||
|
if ("four_corner".equals(qcType)) {
|
||||||
|
literal.fourCorner = Integer.parseInt(bufChar.toString().replaceAll("\\.", ""));
|
||||||
|
}
|
||||||
|
} else if ("meaning".equals(qName)) {
|
||||||
|
literal.meaning.add(bufChar.toString());
|
||||||
|
}
|
||||||
|
bufChar = new StringBuilder();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void endPrefixMapping(String arg0) throws SAXException {
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void ignorableWhitespace(char[] arg0, int arg1, int arg2) throws SAXException {
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void processingInstruction(String arg0, String arg1) throws SAXException {
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void setDocumentLocator(Locator arg0) {
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void skippedEntity(String arg0) throws SAXException {
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void startDocument() throws SAXException {
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void startElement(String namespaceURI, String localName, String qName, Attributes atts) throws SAXException {
|
||||||
|
if ("character".equals(qName)) {
|
||||||
|
literal = new KanjiDictLiteral();
|
||||||
|
} else if ("cp_value".equals(qName)) {
|
||||||
|
cpType = atts.getValue("cp_type");
|
||||||
|
} else if ("q_code".equals(qName)) {
|
||||||
|
qcType = atts.getValue("qc_type");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void startPrefixMapping(String arg0, String arg1) throws SAXException {
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,41 @@
|
||||||
|
/*
|
||||||
|
* Copyright (c) 2004-2014, Willem Cazander
|
||||||
|
* All rights reserved.
|
||||||
|
*
|
||||||
|
* Redistribution and use in source and binary forms, with or without modification, are permitted provided
|
||||||
|
* that the following conditions are met:
|
||||||
|
*
|
||||||
|
* * Redistributions of source code must retain the above copyright notice, this list of conditions and the
|
||||||
|
* following disclaimer.
|
||||||
|
* * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and
|
||||||
|
* the following disclaimer in the documentation and/or other materials provided with the distribution.
|
||||||
|
*
|
||||||
|
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY
|
||||||
|
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
|
||||||
|
* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL
|
||||||
|
* THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||||
|
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT
|
||||||
|
* OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
|
||||||
|
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR
|
||||||
|
* TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||||
|
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
*/
|
||||||
|
package org.x4o.fc18.cake2.fcdoc.kanji;
|
||||||
|
|
||||||
|
import java.util.HashSet;
|
||||||
|
import java.util.Set;
|
||||||
|
|
||||||
|
/**
 * Kanji KuTen and FourCorner and codePoint and meaning of a literal.
 *
 * Plain data holder, every field except {@link #meaning} is null until it is assigned.
 *
 * @author Willem Cazander
 * @version 1.0 Jan 23, 2025
 */
public class KanjiDictLiteral {

    /** JIS X 0208 kuten text, or null when not set. */
    String kuTen208;

    /** JIS X 0213 kuten text, or null when not set. */
    String kuTen213;

    /** Four corner code as number, or null when not set. */
    Integer fourCorner;

    /** Unicode code point, or null when not set. */
    Integer codePoint;

    /** Unique meanings, iterated in the order in which they were added. */
    final Set<String> meaning = new java.util.LinkedHashSet<>();
}
|
|
@ -0,0 +1,74 @@
|
||||||
|
/*
|
||||||
|
* Copyright (c) 2004-2014, Willem Cazander
|
||||||
|
* All rights reserved.
|
||||||
|
*
|
||||||
|
* Redistribution and use in source and binary forms, with or without modification, are permitted provided
|
||||||
|
* that the following conditions are met:
|
||||||
|
*
|
||||||
|
* * Redistributions of source code must retain the above copyright notice, this list of conditions and the
|
||||||
|
* following disclaimer.
|
||||||
|
* * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and
|
||||||
|
* the following disclaimer in the documentation and/or other materials provided with the distribution.
|
||||||
|
*
|
||||||
|
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY
|
||||||
|
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
|
||||||
|
* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL
|
||||||
|
* THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||||
|
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT
|
||||||
|
* OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
|
||||||
|
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR
|
||||||
|
* TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||||
|
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
*/
|
||||||
|
package org.x4o.fc18.cake2.fcdoc.kanji;
|
||||||
|
|
||||||
|
import java.io.File;
|
||||||
|
import java.io.FileInputStream;
|
||||||
|
import java.io.InputStream;
|
||||||
|
import java.util.stream.Collectors;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Write data files from "kanjidic2.xml"
|
||||||
|
*
|
||||||
|
* @author Willem Cazander
|
||||||
|
* @version 1.0 Jan 23, 2025
|
||||||
|
*/
|
||||||
|
public class KanjiDictTest {
|
||||||
|
|
||||||
|
static public void main(String[] args) throws Exception {
|
||||||
|
if (args.length == 0) {
|
||||||
|
System.err.println("No argument file given");
|
||||||
|
System.exit(1);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
KanjiDict dict = new KanjiDict();
|
||||||
|
try (InputStream in = new FileInputStream(new File(args[0]))) {
|
||||||
|
dict.parseXML(in);
|
||||||
|
}
|
||||||
|
System.out.println("=======================================");
|
||||||
|
for (String key : dict.mapKuTen208.keySet()) {
|
||||||
|
KanjiDictLiteral lit = dict.mapKuTen208.get(key);
|
||||||
|
int meanCnt = lit.meaning.size();
|
||||||
|
String meaning = lit.meaning.stream().collect(Collectors.joining(" "));
|
||||||
|
System.out.println(String.format("%06d - %s - %d = %d %s", lit.codePoint, lit.kuTen208, lit.fourCorner, meanCnt, meaning));
|
||||||
|
}
|
||||||
|
System.out.println("=======================================");
|
||||||
|
for (Integer key : dict.mapFourCorner.keySet()) {
|
||||||
|
KanjiDictLiteral lit = dict.mapFourCorner.get(key);
|
||||||
|
int meanCnt = lit.meaning.size();
|
||||||
|
String meaning = lit.meaning.stream().collect(Collectors.joining(" "));
|
||||||
|
System.out.println(String.format("%06d - %s - %d = %d %s", lit.codePoint, lit.kuTen208, lit.fourCorner, meanCnt, meaning));
|
||||||
|
}
|
||||||
|
|
||||||
|
// JIS X 0208 prescribes a set of 6879 graphical characters
|
||||||
|
// JIS X 0213 + 0208 have total characters 11233
|
||||||
|
System.out.println("=======================================");
|
||||||
|
System.out.println("mapFourCorner: " + dict.mapFourCorner.size());
|
||||||
|
System.out.println("mapKuTen208: " + dict.mapKuTen208.size());
|
||||||
|
System.out.println("mapKuTen213: " + dict.mapKuTen213.size());
|
||||||
|
// 15642334 bytes or 538402 lines gives;
|
||||||
|
// mapFourCorner: 3936
|
||||||
|
// mapKuTen208: 6355
|
||||||
|
// mapKuTen213: 3695
|
||||||
|
}
|
||||||
|
}
|
Loading…
Reference in a new issue