Added raw code for reading unicode.xml dict information
This commit is contained in:
parent
3d6edc8773
commit
df85b66fe0
4 changed files with 486 additions and 0 deletions
|
|
@ -0,0 +1,192 @@
|
|||
/*
|
||||
* Copyright (c) 2004-2014, Willem Cazander
|
||||
* All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without modification, are permitted provided
|
||||
* that the following conditions are met:
|
||||
*
|
||||
* * Redistributions of source code must retain the above copyright notice, this list of conditions and the
|
||||
* following disclaimer.
|
||||
* * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and
|
||||
* the following disclaimer in the documentation and/or other materials provided with the distribution.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY
|
||||
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
|
||||
* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL
|
||||
* THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT
|
||||
* OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
|
||||
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR
|
||||
* TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
package love.distributedrebirth.nx01.mushroom.mais.fc18.unicode;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
import javax.xml.parsers.ParserConfigurationException;
|
||||
import javax.xml.parsers.SAXParser;
|
||||
import javax.xml.parsers.SAXParserFactory;
|
||||
|
||||
import org.xml.sax.Attributes;
|
||||
import org.xml.sax.ContentHandler;
|
||||
import org.xml.sax.InputSource;
|
||||
import org.xml.sax.Locator;
|
||||
import org.xml.sax.SAXException;
|
||||
import org.xml.sax.XMLReader;
|
||||
|
||||
/**
|
||||
* Simple dict to lookup kanji.
|
||||
*
|
||||
* @author Willem Cazander
|
||||
* @version 1.0 Jun 01, 2025
|
||||
*/
|
||||
public class UnicodeDict {
|
||||
|
||||
private final List<UnicodeDictLiteralBlock> blocks = new ArrayList<>();
|
||||
private final Map<UnicodeDictLiteralBlock, List<UnicodeDictLiteral>> blockChars = new HashMap<>();
|
||||
|
||||
public UnicodeDict() {
|
||||
UnicodeDictLiteralBlock block;
|
||||
block = new UnicodeDictLiteralBlock();
|
||||
block.setName("Unknown-1");
|
||||
block.setStart(Integer.parseInt("1D800", 16));
|
||||
block.setEnd(Integer.parseInt("1E7FF", 16));
|
||||
blocks.add(block);
|
||||
block = new UnicodeDictLiteralBlock();
|
||||
block.setName("Unknown-2");
|
||||
block.setStart(Integer.parseInt("1F900", 16));
|
||||
block.setEnd(Integer.parseInt("1FFFF", 16));
|
||||
blocks.add(block);
|
||||
}
|
||||
|
||||
public List<UnicodeDictLiteralBlock> getBlocks() {
|
||||
return blocks;
|
||||
}
|
||||
|
||||
public List<UnicodeDictLiteral> getLiterals(UnicodeDictLiteralBlock block) {
|
||||
return blockChars.get(block);
|
||||
}
|
||||
|
||||
private UnicodeDictLiteralBlock findLiteralBlock(Integer dec) {
|
||||
for (UnicodeDictLiteralBlock block : blocks) {
|
||||
if (dec >= block.getStart() && dec <= block.getEnd()) {
|
||||
return block;
|
||||
}
|
||||
}
|
||||
throw new IllegalArgumentException("Unmapped decimal unicode number: " + dec);
|
||||
}
|
||||
|
||||
public void parseXML(InputStream input) throws ParserConfigurationException, SAXException, IOException {
|
||||
SAXParserFactory factory = SAXParserFactory.newInstance();
|
||||
SAXParser parser = factory.newSAXParser();
|
||||
XMLReader reader = parser.getXMLReader();
|
||||
UnicodeDictContentHandler handler = new UnicodeDictContentHandler();
|
||||
reader.setContentHandler(handler);
|
||||
InputSource inputSource = new InputSource(input);
|
||||
reader.parse(inputSource);
|
||||
}
|
||||
|
||||
class UnicodeDictContentHandler implements ContentHandler {
|
||||
|
||||
StringBuilder bufChar = new StringBuilder();
|
||||
UnicodeDictLiteral literal;
|
||||
|
||||
@Override
|
||||
public void characters(char[] ch, int start, int length) throws SAXException {
|
||||
bufChar.append(new String(ch,start,length));
|
||||
}
|
||||
|
||||
@Override
|
||||
public void endDocument() throws SAXException {
|
||||
}
|
||||
|
||||
@Override
|
||||
public void endElement(String namespaceURI, String localName, String qName) throws SAXException {
|
||||
if (literal == null) {
|
||||
return;
|
||||
}
|
||||
if ("character".equals(qName)) {
|
||||
UnicodeDictLiteralBlock block = findLiteralBlock(literal.getDec());
|
||||
List<UnicodeDictLiteral> charList = blockChars.get(block);
|
||||
if (charList == null) {
|
||||
charList = new ArrayList<>();
|
||||
blockChars.put(block, charList);
|
||||
}
|
||||
charList.add(literal);
|
||||
} else if ("latex".equals(qName)) {
|
||||
literal.setLatex(bufChar.toString());
|
||||
} else if ("description".equals(qName)) {
|
||||
literal.setDescription(bufChar.toString());
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void endPrefixMapping(String arg0) throws SAXException {
|
||||
}
|
||||
|
||||
@Override
|
||||
public void ignorableWhitespace(char[] arg0, int arg1, int arg2) throws SAXException {
|
||||
}
|
||||
|
||||
@Override
|
||||
public void processingInstruction(String arg0, String arg1) throws SAXException {
|
||||
}
|
||||
|
||||
@Override
|
||||
public void setDocumentLocator(Locator arg0) {
|
||||
}
|
||||
|
||||
@Override
|
||||
public void skippedEntity(String arg0) throws SAXException {
|
||||
}
|
||||
|
||||
@Override
|
||||
public void startDocument() throws SAXException {
|
||||
}
|
||||
|
||||
@Override
|
||||
public void startElement(String namespaceURI, String localName, String qName, Attributes atts) throws SAXException {
|
||||
bufChar = new StringBuilder();
|
||||
if ("character".equals(qName)) {
|
||||
if (atts.getValue("dec").contains("-")) {
|
||||
literal = null;
|
||||
return;
|
||||
}
|
||||
literal = new UnicodeDictLiteral();
|
||||
literal.setId(atts.getValue("id"));
|
||||
literal.setDec(Integer.parseInt(atts.getValue("dec")));
|
||||
literal.setImage(atts.getValue("image"));
|
||||
literal.setMode(atts.getValue("mode"));
|
||||
literal.setType(atts.getValue("type"));
|
||||
} else if ("unicodedata".equals(qName)) {
|
||||
if (literal == null) {
|
||||
return;
|
||||
}
|
||||
literal.setDataCategory(atts.getValue("category"));
|
||||
literal.setDataCombClass(atts.getValue("combClass"));
|
||||
literal.setDataBibi(atts.getValue("bibi"));
|
||||
literal.setDataDecomp(atts.getValue("decomp"));
|
||||
literal.setDataNumeric(atts.getValue("numeric"));
|
||||
literal.setDataMirror(atts.getValue("mirror"));
|
||||
literal.setDataComment(atts.getValue("comment"));
|
||||
literal.setDataMathClass(atts.getValue("mathclass"));
|
||||
} else if ("block".equals(qName)) {
|
||||
UnicodeDictLiteralBlock block = new UnicodeDictLiteralBlock();
|
||||
block.setStart(Integer.parseInt(atts.getValue("start"), 16));
|
||||
block.setEnd(Integer.parseInt(atts.getValue("end"), 16));
|
||||
block.setName(atts.getValue("name"));
|
||||
blocks.add(block);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void startPrefixMapping(String arg0, String arg1) throws SAXException {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,171 @@
|
|||
/*
|
||||
* Copyright (c) 2004-2014, Willem Cazander
|
||||
* All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without modification, are permitted provided
|
||||
* that the following conditions are met:
|
||||
*
|
||||
* * Redistributions of source code must retain the above copyright notice, this list of conditions and the
|
||||
* following disclaimer.
|
||||
* * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and
|
||||
* the following disclaimer in the documentation and/or other materials provided with the distribution.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY
|
||||
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
|
||||
* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL
|
||||
* THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT
|
||||
* OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
|
||||
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR
|
||||
* TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
package love.distributedrebirth.nx01.mushroom.mais.fc18.unicode;
|
||||
|
||||
/**
 * Mutable value holder for a single unicode character literal.
 *
 * All properties start as {@code null} and are only stored, never validated.
 *
 * @author Willem Cazander
 * @version 1.0 Jun 01, 2025
 */
public class UnicodeDictLiteral {

	// Properties of the literal itself.
	private String id;
	private Integer dec;
	private String image;
	private String mode;
	private String type;
	private String latex;
	private String description;

	// Properties with the "data" prefix.
	private String dataCategory;
	private String dataCombClass;
	private String dataBibi;
	private String dataDecomp;
	private String dataNumeric;
	private String dataMirror;
	private String dataComment;
	private String dataMathClass;

	/**
	 * Creates a literal with all properties unset.
	 */
	public UnicodeDictLiteral() {
	}

	/** @return The id of this literal. */
	public String getId() {
		return this.id;
	}

	/** @param id The id to store. */
	public void setId(String id) {
		this.id = id;
	}

	/** @return The decimal number of this literal. */
	public Integer getDec() {
		return this.dec;
	}

	/** @param dec The decimal number to store. */
	public void setDec(Integer dec) {
		this.dec = dec;
	}

	/** @return The image value of this literal. */
	public String getImage() {
		return this.image;
	}

	/** @param image The image value to store. */
	public void setImage(String image) {
		this.image = image;
	}

	/** @return The mode of this literal. */
	public String getMode() {
		return this.mode;
	}

	/** @param mode The mode to store. */
	public void setMode(String mode) {
		this.mode = mode;
	}

	/** @return The type of this literal. */
	public String getType() {
		return this.type;
	}

	/** @param type The type to store. */
	public void setType(String type) {
		this.type = type;
	}

	/** @return The latex text of this literal. */
	public String getLatex() {
		return this.latex;
	}

	/** @param latex The latex text to store. */
	public void setLatex(String latex) {
		this.latex = latex;
	}

	/** @return The description of this literal. */
	public String getDescription() {
		return this.description;
	}

	/** @param description The description to store. */
	public void setDescription(String description) {
		this.description = description;
	}

	/** @return The data category. */
	public String getDataCategory() {
		return this.dataCategory;
	}

	/** @param dataCategory The data category to store. */
	public void setDataCategory(String dataCategory) {
		this.dataCategory = dataCategory;
	}

	/** @return The data comb class. */
	public String getDataCombClass() {
		return this.dataCombClass;
	}

	/** @param dataCombClass The data comb class to store. */
	public void setDataCombClass(String dataCombClass) {
		this.dataCombClass = dataCombClass;
	}

	/** @return The data bibi value. */
	public String getDataBibi() {
		return this.dataBibi;
	}

	/** @param dataBibi The data bibi value to store. */
	public void setDataBibi(String dataBibi) {
		this.dataBibi = dataBibi;
	}

	/** @return The data decomp value. */
	public String getDataDecomp() {
		return this.dataDecomp;
	}

	/** @param dataDecomp The data decomp value to store. */
	public void setDataDecomp(String dataDecomp) {
		this.dataDecomp = dataDecomp;
	}

	/** @return The data numeric value. */
	public String getDataNumeric() {
		return this.dataNumeric;
	}

	/** @param dataNumeric The data numeric value to store. */
	public void setDataNumeric(String dataNumeric) {
		this.dataNumeric = dataNumeric;
	}

	/** @return The data mirror value. */
	public String getDataMirror() {
		return this.dataMirror;
	}

	/** @param dataMirror The data mirror value to store. */
	public void setDataMirror(String dataMirror) {
		this.dataMirror = dataMirror;
	}

	/** @return The data comment. */
	public String getDataComment() {
		return this.dataComment;
	}

	/** @param dataComment The data comment to store. */
	public void setDataComment(String dataComment) {
		this.dataComment = dataComment;
	}

	/** @return The data math class. */
	public String getDataMathClass() {
		return this.dataMathClass;
	}

	/** @param dataMathClass The data math class to store. */
	public void setDataMathClass(String dataMathClass) {
		this.dataMathClass = dataMathClass;
	}
}
|
||||
|
|
@ -0,0 +1,63 @@
|
|||
/*
|
||||
* Copyright (c) 2004-2014, Willem Cazander
|
||||
* All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without modification, are permitted provided
|
||||
* that the following conditions are met:
|
||||
*
|
||||
* * Redistributions of source code must retain the above copyright notice, this list of conditions and the
|
||||
* following disclaimer.
|
||||
* * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and
|
||||
* the following disclaimer in the documentation and/or other materials provided with the distribution.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY
|
||||
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
|
||||
* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL
|
||||
* THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT
|
||||
* OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
|
||||
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR
|
||||
* TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
package love.distributedrebirth.nx01.mushroom.mais.fc18.unicode;
|
||||
|
||||
/**
 * Mutable value holder for a named unicode block with a start and an end number.
 *
 * All properties start as {@code null} and are only stored, never validated.
 *
 * @author Willem Cazander
 * @version 1.0 Jun 01, 2025
 */
public class UnicodeDictLiteralBlock {

	private String name;
	private Integer start;
	private Integer end;

	/**
	 * Creates a block with all properties unset.
	 */
	public UnicodeDictLiteralBlock() {
	}

	/** @return The name of this block. */
	public String getName() {
		return this.name;
	}

	/** @param name The name to store. */
	public void setName(String name) {
		this.name = name;
	}

	/** @return The start number of this block. */
	public Integer getStart() {
		return this.start;
	}

	/** @param start The start number to store. */
	public void setStart(Integer start) {
		this.start = start;
	}

	/** @return The end number of this block. */
	public Integer getEnd() {
		return this.end;
	}

	/** @param end The end number to store. */
	public void setEnd(Integer end) {
		this.end = end;
	}
}
|
||||
|
|
@ -0,0 +1,60 @@
|
|||
/*
|
||||
* Copyright (c) 2004-2014, Willem Cazander
|
||||
* All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without modification, are permitted provided
|
||||
* that the following conditions are met:
|
||||
*
|
||||
* * Redistributions of source code must retain the above copyright notice, this list of conditions and the
|
||||
* following disclaimer.
|
||||
* * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and
|
||||
* the following disclaimer in the documentation and/or other materials provided with the distribution.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY
|
||||
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
|
||||
* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL
|
||||
* THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT
|
||||
* OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
|
||||
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR
|
||||
* TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
package love.distributedrebirth.nx01.mushroom.mais.fc18.unicode;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.FileInputStream;
|
||||
import java.io.InputStream;
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* Write data files from "unicode.xml"
|
||||
*
|
||||
* @author Willem Cazander
|
||||
* @version 1.0 Jun 01, 2025
|
||||
*/
|
||||
public class UnicodeDictTest {
|
||||
|
||||
static public void main(String[] args) throws Exception {
|
||||
if (args.length == 0) {
|
||||
System.err.println("No argument file given");
|
||||
System.exit(1);
|
||||
return;
|
||||
}
|
||||
UnicodeDict dict = new UnicodeDict();
|
||||
try (InputStream in = new FileInputStream(new File(args[0]))) {
|
||||
dict.parseXML(in);
|
||||
}
|
||||
System.out.println("=======================================");
|
||||
for (UnicodeDictLiteralBlock block : dict.getBlocks()) {
|
||||
List<UnicodeDictLiteral> charList = dict.getLiterals(block);
|
||||
if (charList == null) {
|
||||
continue;
|
||||
}
|
||||
System.out.println("Block size=" + charList.size() + " name=" + block.getName());
|
||||
//for (UnicodeDictLiteral literal : charList) {
|
||||
// System.out.println("Literal: " + literal.getId() + " desc=" + literal.getDescription());
|
||||
//}
|
||||
}
|
||||
}
|
||||
}
|
||||
Loading…
Add table
Add a link
Reference in a new issue