Added raw code for reading unicode.xml dict information

This commit is contained in:
Willem Cazander 2025-06-01 20:28:12 +02:00
parent 3d6edc8773
commit df85b66fe0
4 changed files with 486 additions and 0 deletions

View file

@ -0,0 +1,192 @@
/*
* Copyright (c) 2004-2014, Willem Cazander
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without modification, are permitted provided
* that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice, this list of conditions and the
* following disclaimer.
* * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and
* the following disclaimer in the documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL
* THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT
* OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR
* TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
package love.distributedrebirth.nx01.mushroom.mais.fc18.unicode;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;
import org.xml.sax.Attributes;
import org.xml.sax.ContentHandler;
import org.xml.sax.InputSource;
import org.xml.sax.Locator;
import org.xml.sax.SAXException;
import org.xml.sax.XMLReader;
/**
 * In-memory dictionary of unicode literals, grouped by unicode block, filled by
 * parsing a "unicode.xml" document with SAX.
 *
 * Blocks are collected from {@code block} elements and literals from
 * {@code character} elements; every literal is stored under the first known
 * block whose inclusive start/end range contains its decimal code point.
 *
 * @author Willem Cazander
 * @version 1.0 Jun 01, 2025
 */
public class UnicodeDict {

    /** All known blocks, in registration order (built-in blocks first, then parsed ones). */
    private final List<UnicodeDictLiteralBlock> blocks = new ArrayList<>();

    /** Literals per block; a block has no entry until its first literal is stored. */
    private final Map<UnicodeDictLiteralBlock, List<UnicodeDictLiteral>> blockChars = new HashMap<>();

    /**
     * Creates a dictionary that already contains two placeholder blocks,
     * "Unknown-1" (1D800-1E7FF) and "Unknown-2" (1F900-1FFFF).
     * Because they are registered first, they are also matched first on lookup.
     */
    public UnicodeDict() {
        addBlock("Unknown-1", 0x1D800, 0x1E7FF);
        addBlock("Unknown-2", 0x1F900, 0x1FFFF);
    }

    /** Registers a new block with the given name and inclusive code point range. */
    private void addBlock(String name, int start, int end) {
        UnicodeDictLiteralBlock block = new UnicodeDictLiteralBlock();
        block.setName(name);
        block.setStart(start);
        block.setEnd(end);
        blocks.add(block);
    }

    /**
     * @return the live list of all registered blocks (not a copy).
     */
    public List<UnicodeDictLiteralBlock> getBlocks() {
        return blocks;
    }

    /**
     * @param block the block to get the literals of.
     * @return the literals stored for the block, or {@code null} when no literal was stored for it.
     */
    public List<UnicodeDictLiteral> getLiterals(UnicodeDictLiteralBlock block) {
        return blockChars.get(block);
    }

    /**
     * Finds the first registered block whose range contains the given code point.
     *
     * @param dec the decimal code point to look up.
     * @return the matching block.
     * @throws IllegalArgumentException when no registered block contains the code point.
     */
    private UnicodeDictLiteralBlock findLiteralBlock(Integer dec) {
        for (UnicodeDictLiteralBlock block : blocks) {
            if (dec >= block.getStart() && dec <= block.getEnd()) {
                return block;
            }
        }
        throw new IllegalArgumentException("Unmapped decimal unicode number: " + dec);
    }

    /**
     * Parses the given XML stream and adds its blocks and literals to this dictionary.
     * The stream is not closed by this method.
     *
     * NOTE(review): the parser is created with the factory defaults, so any DTD or
     * external entity declared by the document is handled by those defaults; confirm
     * that only trusted input is passed in.
     *
     * @param input the XML document to read.
     * @throws ParserConfigurationException when no SAX parser can be created.
     * @throws SAXException when the document can not be parsed.
     * @throws IOException when reading the stream fails.
     */
    public void parseXML(InputStream input) throws ParserConfigurationException, SAXException, IOException {
        SAXParserFactory factory = SAXParserFactory.newInstance();
        SAXParser parser = factory.newSAXParser();
        XMLReader reader = parser.getXMLReader();
        reader.setContentHandler(new UnicodeDictContentHandler());
        reader.parse(new InputSource(input));
    }

    /**
     * SAX handler which converts {@code block}, {@code character}, {@code unicodedata},
     * {@code latex} and {@code description} elements into dictionary entries.
     */
    class UnicodeDictContentHandler implements ContentHandler {

        /** Text collected since the most recent element start. */
        StringBuilder bufChar = new StringBuilder();

        /** The character currently being read, or null when outside a character or when it is skipped. */
        UnicodeDictLiteral literal;

        @Override
        public void characters(char[] ch, int start, int length) throws SAXException {
            // Append directly; no need for an intermediate String copy.
            bufChar.append(ch, start, length);
        }

        @Override
        public void endDocument() throws SAXException {
        }

        @Override
        public void endElement(String namespaceURI, String localName, String qName) throws SAXException {
            if (literal == null) {
                return;
            }
            if ("character".equals(qName)) {
                UnicodeDictLiteralBlock block = findLiteralBlock(literal.getDec());
                blockChars.computeIfAbsent(block, key -> new ArrayList<>()).add(literal);
                // The literal is complete; drop the reference so that elements which
                // follow outside of a character can not modify the stored entry.
                literal = null;
            } else if ("latex".equals(qName)) {
                literal.setLatex(bufChar.toString());
            } else if ("description".equals(qName)) {
                literal.setDescription(bufChar.toString());
            }
        }

        @Override
        public void endPrefixMapping(String arg0) throws SAXException {
        }

        @Override
        public void ignorableWhitespace(char[] arg0, int arg1, int arg2) throws SAXException {
        }

        @Override
        public void processingInstruction(String arg0, String arg1) throws SAXException {
        }

        @Override
        public void setDocumentLocator(Locator arg0) {
        }

        @Override
        public void skippedEntity(String arg0) throws SAXException {
        }

        @Override
        public void startDocument() throws SAXException {
        }

        @Override
        public void startElement(String namespaceURI, String localName, String qName, Attributes atts) throws SAXException {
            // Every element starts with an empty text buffer.
            bufChar = new StringBuilder();
            if ("character".equals(qName)) {
                String dec = atts.getValue("dec");
                // Skip characters without a single code point: a missing "dec"
                // attribute, or a value with "-" which can not be one number.
                if (dec == null || dec.contains("-")) {
                    literal = null;
                    return;
                }
                literal = new UnicodeDictLiteral();
                literal.setId(atts.getValue("id"));
                literal.setDec(Integer.parseInt(dec));
                literal.setImage(atts.getValue("image"));
                literal.setMode(atts.getValue("mode"));
                literal.setType(atts.getValue("type"));
            } else if ("unicodedata".equals(qName)) {
                if (literal == null) {
                    return;
                }
                literal.setDataCategory(atts.getValue("category"));
                // The W3C unicode.xml DTD is understood to spell these attributes
                // "combclass" and "bidi" (TODO confirm against unicode.dtd); the
                // spellings read before are kept as fallback.
                literal.setDataCombClass(firstValue(atts, "combclass", "combClass"));
                literal.setDataBibi(firstValue(atts, "bidi", "bibi"));
                literal.setDataDecomp(atts.getValue("decomp"));
                literal.setDataNumeric(atts.getValue("numeric"));
                literal.setDataMirror(atts.getValue("mirror"));
                literal.setDataComment(atts.getValue("comment"));
                literal.setDataMathClass(atts.getValue("mathclass"));
            } else if ("block".equals(qName)) {
                // Block boundaries are written as hexadecimal numbers.
                UnicodeDictLiteralBlock block = new UnicodeDictLiteralBlock();
                block.setStart(Integer.parseInt(atts.getValue("start"), 16));
                block.setEnd(Integer.parseInt(atts.getValue("end"), 16));
                block.setName(atts.getValue("name"));
                blocks.add(block);
            }
        }

        @Override
        public void startPrefixMapping(String arg0, String arg1) throws SAXException {
        }

        /** Returns the value of the primary attribute, or of the fallback attribute when the primary is absent. */
        private String firstValue(Attributes atts, String primary, String fallback) {
            String value = atts.getValue(primary);
            return value != null ? value : atts.getValue(fallback);
        }
    }
}

View file

@ -0,0 +1,171 @@
/*
* Copyright (c) 2004-2014, Willem Cazander
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without modification, are permitted provided
* that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice, this list of conditions and the
* following disclaimer.
* * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and
* the following disclaimer in the documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL
* THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT
* OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR
* TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
package love.distributedrebirth.nx01.mushroom.mais.fc18.unicode;
/**
 * Mutable value holder for one unicode character literal and its meta data.
 *
 * All properties start as {@code null} and are plain get/set properties without
 * validation.
 *
 * @author Willem Cazander
 * @version 1.0 Jun 01, 2025
 */
public class UnicodeDictLiteral {

    // ----- properties of the character itself
    private String id;
    private Integer dec;
    private String image;
    private String mode;
    private String type;

    // ----- text values attached to the character
    private String latex;
    private String description;

    // ----- "data" properties
    private String dataCategory;
    private String dataCombClass;
    // NOTE(review): "Bibi" is presumably a misspelling of "bidi"; name kept for API compatibility.
    private String dataBibi;
    private String dataDecomp;
    private String dataNumeric;
    private String dataMirror;
    private String dataComment;
    private String dataMathClass;

    /**
     * Creates an empty literal.
     */
    public UnicodeDictLiteral() {
    }

    /** @return the identifier of this literal. */
    public String getId() {
        return this.id;
    }

    /** @param id the identifier of this literal. */
    public void setId(String id) {
        this.id = id;
    }

    /** @return the decimal code point number. */
    public Integer getDec() {
        return this.dec;
    }

    /** @param dec the decimal code point number. */
    public void setDec(Integer dec) {
        this.dec = dec;
    }

    /** @return the image value. */
    public String getImage() {
        return this.image;
    }

    /** @param image the image value. */
    public void setImage(String image) {
        this.image = image;
    }

    /** @return the mode value. */
    public String getMode() {
        return this.mode;
    }

    /** @param mode the mode value. */
    public void setMode(String mode) {
        this.mode = mode;
    }

    /** @return the type value. */
    public String getType() {
        return this.type;
    }

    /** @param type the type value. */
    public void setType(String type) {
        this.type = type;
    }

    /** @return the latex text. */
    public String getLatex() {
        return this.latex;
    }

    /** @param latex the latex text. */
    public void setLatex(String latex) {
        this.latex = latex;
    }

    /** @return the description text. */
    public String getDescription() {
        return this.description;
    }

    /** @param description the description text. */
    public void setDescription(String description) {
        this.description = description;
    }

    /** @return the data category. */
    public String getDataCategory() {
        return this.dataCategory;
    }

    /** @param dataCategory the data category. */
    public void setDataCategory(String dataCategory) {
        this.dataCategory = dataCategory;
    }

    /** @return the data combining class. */
    public String getDataCombClass() {
        return this.dataCombClass;
    }

    /** @param dataCombClass the data combining class. */
    public void setDataCombClass(String dataCombClass) {
        this.dataCombClass = dataCombClass;
    }

    /** @return the "bibi" data value. */
    public String getDataBibi() {
        return this.dataBibi;
    }

    /** @param dataBibi the "bibi" data value. */
    public void setDataBibi(String dataBibi) {
        this.dataBibi = dataBibi;
    }

    /** @return the data decomposition value. */
    public String getDataDecomp() {
        return this.dataDecomp;
    }

    /** @param dataDecomp the data decomposition value. */
    public void setDataDecomp(String dataDecomp) {
        this.dataDecomp = dataDecomp;
    }

    /** @return the data numeric value. */
    public String getDataNumeric() {
        return this.dataNumeric;
    }

    /** @param dataNumeric the data numeric value. */
    public void setDataNumeric(String dataNumeric) {
        this.dataNumeric = dataNumeric;
    }

    /** @return the data mirror value. */
    public String getDataMirror() {
        return this.dataMirror;
    }

    /** @param dataMirror the data mirror value. */
    public void setDataMirror(String dataMirror) {
        this.dataMirror = dataMirror;
    }

    /** @return the data comment. */
    public String getDataComment() {
        return this.dataComment;
    }

    /** @param dataComment the data comment. */
    public void setDataComment(String dataComment) {
        this.dataComment = dataComment;
    }

    /** @return the data math class. */
    public String getDataMathClass() {
        return this.dataMathClass;
    }

    /** @param dataMathClass the data math class. */
    public void setDataMathClass(String dataMathClass) {
        this.dataMathClass = dataMathClass;
    }
}

View file

@ -0,0 +1,63 @@
/*
* Copyright (c) 2004-2014, Willem Cazander
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without modification, are permitted provided
* that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice, this list of conditions and the
* following disclaimer.
* * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and
* the following disclaimer in the documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL
* THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT
* OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR
* TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
package love.distributedrebirth.nx01.mushroom.mais.fc18.unicode;
/**
 * Mutable description of one named unicode block: a name plus a start and an end
 * code point number.
 *
 * All properties start as {@code null}; no ordering of start and end is enforced.
 *
 * @author Willem Cazander
 * @version 1.0 Jun 01, 2025
 */
public class UnicodeDictLiteralBlock {

    private String name;
    private Integer start;
    private Integer end;

    /**
     * Creates an empty block.
     */
    public UnicodeDictLiteralBlock() {
    }

    /** @return the name of this block. */
    public String getName() {
        return this.name;
    }

    /** @param name the name of this block. */
    public void setName(String name) {
        this.name = name;
    }

    /** @return the first code point number of this block. */
    public Integer getStart() {
        return this.start;
    }

    /** @param start the first code point number of this block. */
    public void setStart(Integer start) {
        this.start = start;
    }

    /** @return the last code point number of this block. */
    public Integer getEnd() {
        return this.end;
    }

    /** @param end the last code point number of this block. */
    public void setEnd(Integer end) {
        this.end = end;
    }
}

View file

@ -0,0 +1,60 @@
/*
* Copyright (c) 2004-2014, Willem Cazander
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without modification, are permitted provided
* that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice, this list of conditions and the
* following disclaimer.
* * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and
* the following disclaimer in the documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL
* THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT
* OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR
* TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
package love.distributedrebirth.nx01.mushroom.mais.fc18.unicode;
import java.io.File;
import java.io.FileInputStream;
import java.io.InputStream;
import java.util.List;
/**
 * Command line check which parses a "unicode.xml" file and prints, for every
 * block that received at least one literal, the number of literals and the block name.
 *
 * @author Willem Cazander
 * @version 1.0 Jun 01, 2025
 */
public class UnicodeDictTest {

    /**
     * Parses the file named by the first argument and prints the block summary.
     * Exits with status 1 when no argument is given.
     *
     * @param args the command line arguments; {@code args[0]} is the path of the XML file.
     * @throws Exception when the file can not be read or parsed.
     */
    public static void main(String[] args) throws Exception {
        if (args.length == 0) {
            System.err.println("No argument file given");
            System.exit(1);
            return;
        }
        UnicodeDict dict = new UnicodeDict();
        File xmlFile = new File(args[0]);
        try (InputStream xmlStream = new FileInputStream(xmlFile)) {
            dict.parseXML(xmlStream);
        }
        System.out.println("=======================================");
        printBlockSizes(dict);
    }

    /** Prints one line per block that has literals; blocks without literals are left out. */
    private static void printBlockSizes(UnicodeDict dict) {
        for (UnicodeDictLiteralBlock block : dict.getBlocks()) {
            List<UnicodeDictLiteral> literals = dict.getLiterals(block);
            if (literals != null) {
                System.out.println("Block size=" + literals.size() + " name=" + block.getName());
            }
        }
    }
}