国标GB2312编码自学教程 - v3.13, by 杨和荣
从Unicode到GB2312转换表制作程式
本章介绍了UnicodeGB2312.java源程序。它可以用来制作Unicode编码到GB2312编码的转换表。
在我发表了GB2312到Unicode的转换表以后,收到了许多读者信件,寻求Unicode到GB2312的转换表。
下面的程式便可以用来制作这样的转换表。程式的输出结果收入下一章之中。
/* UnicodeGB2312.java - Copyright (c) 2016, HerongYang.com, All Rights Reserved. */
import java.io.*;
import java.nio.*;
import java.nio.charset.*;

/**
 * Builds a Unicode-to-GB2312 conversion table and writes it to the file
 * {@code unicode_gb2312.gb}.
 *
 * <p>Every 16-bit code unit from 0x0000 to 0xFFFF is run through the JDK "GBK"
 * encoder. Results accepted by {@link #validGB(ByteBuffer)} are written as
 * "Unicode hex, GB hex, raw GB bytes" entries, five entries per output line,
 * wrapped in a {@code <pre>} element.
 */
class UnicodeGB2312 {

    /** Number of table entries written on each output line. */
    private static final int COLUMNS = 5;

    /** Stream every write helper sends its bytes to; assigned by {@link #main}. */
    static OutputStream out = null;

    /** Upper-case hexadecimal digits indexed by nibble value. */
    static char[] hexDigit = {'0', '1', '2', '3', '4', '5', '6', '7',
                              '8', '9', 'A', 'B', 'C', 'D', 'E', 'F'};

    /**
     * Inclusive lower bounds of the code ranges rejected by {@link #validGB}.
     * Codes are expressed as {@code row * 100 + cell}; entry {@code k} pairs
     * with {@code e_out[k]}.
     */
    static int[] b_out = {201, 267, 279, 293, 484, 587, 625, 657, 734, 782, 827,
                          874, 901, 980, 1001, 5590, 8801};

    /** Inclusive upper bounds matching {@link #b_out}, in the same encoding. */
    static int[] e_out = {216, 268, 280, 294, 494, 594, 632, 694, 748, 794, 836,
                          894, 903, 994, 1594, 5594, 9494};

    /**
     * Program entry point: creates the output file, writes the table and
     * closes the file. Any I/O failure is reported on standard output.
     *
     * @param a command-line arguments (unused)
     */
    public static void main(String[] a) {
        // try-with-resources closes (and flushes) the file even when
        // writeCode() fails; buffering avoids one OS write per byte.
        try (OutputStream file =
                 new BufferedOutputStream(new FileOutputStream("unicode_gb2312.gb"))) {
            out = file;
            writeCode();
        } catch (IOException e) {
            System.out.println(e.toString());
        }
    }

    /**
     * Writes the header, one entry per accepted code unit, and the footer to
     * {@link #out}, then prints the number of entries to standard output.
     *
     * @throws IOException if writing to {@link #out} fails
     */
    public static void writeCode() throws IOException {
        CharsetEncoder gbec = Charset.forName("GBK").newEncoder();
        char[] ca = new char[1];

        writeHeader();
        int count = 0;
        for (int i = 0; i < 0x010000; i++) {
            ca[0] = (char) i;
            ByteBuffer gbbb;
            try {
                gbbb = gbec.encode(CharBuffer.wrap(ca));
            } catch (CharacterCodingException e) {
                // The encoder cannot represent this code unit; validGB(null)
                // rejects it below.
                gbbb = null;
            }
            if (!validGB(gbbb)) {
                continue;
            }

            count++;
            // Unicode value as four hex digits.
            writeHex((byte) (ca[0] >>> 8));
            writeHex((byte) (ca[0] & 0xff));
            writeString(" ");
            // GB code as hex digits.
            writeByteBuffer(gbbb, 2);
            writeString(" ");
            // The raw two GB bytes, i.e. the character itself.
            writeByte(gbbb.get(0));
            writeByte(gbbb.get(1));
            if (count % COLUMNS == 0) {
                writeln();
            } else {
                writeString(" ");
            }
        }
        // Terminate a partially filled last line.
        if (count % COLUMNS != 0) {
            writeln();
        }
        writeFooter();
        System.out.println("Number of GB characters wrote: " + count);
    }

    /**
     * Decides whether an encoder result is accepted for the table.
     *
     * <p>The buffer must hold exactly two bytes. Subtracting 0xA0 from each
     * byte yields a row and a cell that must both lie in 1..94, and
     * {@code row * 100 + cell} must not fall inside any
     * {@code b_out[k]..e_out[k]} range.
     *
     * @param gbbb encoder output, or {@code null} when encoding failed
     * @return {@code true} if the two bytes pass all checks above
     */
    public static boolean validGB(ByteBuffer gbbb) {
        if (gbbb == null || gbbb.limit() != 2) {
            return false;
        }
        int row = (gbbb.get(0) & 0xFF) - 0xA0;
        int cell = (gbbb.get(1) & 0xFF) - 0xA0;
        // Bytes below 0xA1 give row/cell < 1, so this also covers them.
        if (row < 1 || row > 94 || cell < 1 || cell > 94) {
            return false;
        }
        int code = row * 100 + cell;
        for (int k = 0; k < b_out.length; k++) {
            if (code >= b_out[k] && code <= e_out[k]) {
                return false;
            }
        }
        return true;
    }

    /**
     * Writes the opening {@code <pre>} line, one heading per table column
     * (separated by a single space) and a blank line.
     *
     * @throws IOException if writing to {@link #out} fails
     */
    public static void writeHeader() throws IOException {
        writeString("<pre>");
        writeln();
        for (int col = 0; col < COLUMNS; col++) {
            if (col > 0) {
                writeString(" ");
            }
            writeString("Uni. GB ");
            writeGBSpace();
        }
        writeln();
        writeln();
    }

    /**
     * Writes the closing {@code </pre>} line.
     *
     * @throws IOException if writing to {@link #out} fails
     */
    public static void writeFooter() throws IOException {
        writeString("</pre>");
        writeln();
    }

    /**
     * Writes a CR LF (0x0D 0x0A) line terminator.
     *
     * @throws IOException if writing to {@link #out} fails
     */
    public static void writeln() throws IOException {
        out.write(0x0D);
        out.write(0x0A);
    }

    /**
     * Writes the two bytes 0xA1 0xA1 (row 1, cell 1 under the mapping used by
     * {@link #validGB}).
     *
     * @throws IOException if writing to {@link #out} fails
     */
    public static void writeGBSpace() throws IOException {
        out.write(0xA1);
        out.write(0xA1);
    }

    /**
     * Writes each byte of {@code b} (indices 0 to {@code limit() - 1}) as two
     * hex digits, then one space for every entry by which the buffer is
     * shorter than {@code l}. A {@code null} buffer is written as the text
     * "null" and counted as two entries.
     *
     * @param b buffer to dump, may be {@code null}
     * @param l minimum number of entries to account for
     * @throws IOException if writing to {@link #out} fails
     */
    public static void writeByteBuffer(ByteBuffer b, int l) throws IOException {
        int written;
        if (b == null) {
            writeString("null");
            written = 2;
        } else {
            written = b.limit();
            for (int i = 0; i < written; i++) {
                writeHex(b.get(i));
            }
        }
        for (int j = written; j < l; j++) {
            writeString(" ");
        }
    }

    /**
     * Writes the low 8 bits of every character of {@code s}; a {@code null}
     * string writes nothing.
     *
     * @param s text to write, may be {@code null}
     * @throws IOException if writing to {@link #out} fails
     */
    public static void writeString(String s) throws IOException {
        if (s == null) {
            return;
        }
        for (int i = 0; i < s.length(); i++) {
            out.write(s.charAt(i) & 0xFF);
        }
    }

    /**
     * Writes {@code b} as two upper-case hexadecimal digits, high nibble first.
     *
     * @param b byte to format
     * @throws IOException if writing to {@link #out} fails
     */
    public static void writeHex(byte b) throws IOException {
        out.write(hexDigit[(b >> 4) & 0x0F]);
        out.write(hexDigit[b & 0x0F]);
    }

    /**
     * Writes {@code b} unchanged as a single byte.
     *
     * @param b byte to write
     * @throws IOException if writing to {@link #out} fails
     */
    public static void writeByte(byte b) throws IOException {
        out.write(b & 0xFF);
    }
}
上面的程式发表后,又有许多读者来信要求对程式加以说明,以便理解。其实这个程式的逻辑很简单,阅读时仅需注意以下几点:
一,Unicode字符集的全体编码都在0x0000和0xFFFF之间,所以子程式writeCode()使用了一个循环语句,以变量i走遍了Unicode的全体可能编码。
二,把单个Unicode编码转换成GB2312编码的关键语句是:gbec.encode(cb),它使用了JDK中CharsetEncoder的中文编码功能。注意,GBK是由GB2312扩张而成。JDK只提供GBK编码功能。
三,由于Unicode字符集比GB2312大,gbec.encode(cb)输出的编码有许多是废码,或者是GBK的扩张码,所以要用子程式validGB()进行验证。
四,程式的其它部分主要是用于输出的列表制作。
Table of Contents