大五碼(Big5)編碼自學筆記 - v3.14,楊和榮
從大五碼到 Unicode 轉換表製作程式
本章介紹了 Big5Unicode.java 源程式。它可以用來製作大五碼 (Big5) 到 Unicode 編碼轉換表。
本書列出的從大五碼到 Unicode 編碼轉換表由下面的程式所產生。這個程式採用了 Java 內部編碼轉換函數 CharsetDecoder.decode() 和 CharsetEncoder.encode():
/* Big5Unicode.java - Copyright (c) 2015, HerongYang.com, All Rights Reserved. */
import java.io.*;
import java.nio.*;
import java.nio.charset.*;

/**
 * Generates a Big5-to-Unicode mapping table.
 *
 * <p>For every lead byte (row) that belongs to one of the three configured
 * Big5 blocks, the program walks the trail bytes 0x40-0xFF, decodes each
 * two-byte code with the JDK "Big5" charset, re-encodes the result as
 * UTF-16BE, and writes one table cell per code to {@link #out}. Each row is
 * emitted as an HTML {@code <p>} heading followed by a {@code <pre>} section
 * holding four cells per line. A per-block count of successfully converted
 * codes is printed to standard output at the end.
 *
 * <p>All text is written as single bytes; the raw Big5 bytes of each
 * character are written unchanged, so the output file is itself Big5 encoded.
 */
class Big5Unicode {

    /** Destination of all table output; {@link #main} points it at the output file. */
    static OutputStream out = null;

    /** Upper-case hexadecimal digits, indexed by nibble value. */
    static final char[] hexDigit = {'0', '1', '2', '3', '4', '5', '6', '7',
                                    '8', '9', 'A', 'B', 'C', 'D', 'E', 'F'};

    /** Display name of each block. The blk_* arrays are parallel, indexed by block number. */
    static final String[] blk_name = {"Special Symbols", "Level 1 Characters",
                                      "Level 2 Characters"};

    /** First two-byte code of each block (lead byte in the high 8 bits). */
    static final int[] blk_first = {0xA140, 0xA440, 0xC940};

    /** Last two-byte code of each block (lead byte in the high 8 bits). */
    static final int[] blk_last = {0xA3BF, 0xC67E, 0xF9D5};

    /** Expected number of codes per block; only used in the final summary. */
    static final int[] blk_size = {408, 5401, 7652};

    /** Number of codes per block that were converted successfully. */
    static final int[] blk_count = {0, 0, 0};

    /**
     * Writes the mapping table to the file {@code big5-unicode.big5} in the
     * current directory. An {@link IOException} is reported on standard output.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        // try-with-resources closes (and flushes) the file even when
        // writeCode() fails part-way; buffering avoids one system call per byte.
        try (OutputStream file =
                 new BufferedOutputStream(new FileOutputStream("big5-unicode.big5"))) {
            out = file;
            writeCode();
        } catch (IOException e) {
            System.out.println(e.toString());
        }
    }

    /**
     * Writes the complete table to {@link #out} and prints the per-block
     * conversion counts to standard output.
     *
     * @throws IOException if writing to {@link #out} fails
     */
    public static void writeCode() throws IOException {
        CharsetDecoder b5dc = Charset.forName("Big5").newDecoder();
        CharsetEncoder uxec = Charset.forName("UTF-16BE").newEncoder();

        for (int i = 0xA1; i <= 0xFF; i++) {
            int blk = getBlock(i);
            if (blk == -1) {
                continue; // lead byte is outside every configured block
            }

            writeln();
            writeString("<p><b>Row ");
            writeHex((byte) i);
            writeString(": " + blk_name[blk] + "</b></p>");
            writeln();
            writeln();
            writeHeader();

            for (int j = 0x40; j <= 0xFF; j++) {
                writeCell(b5dc, uxec, i, j, blk);
                // Four cells per output line.
                if ((j + 1) % 4 == 0) {
                    writeln();
                } else {
                    writeString(" ");
                }
            }
            writeFooter();
        }

        for (int l = 0; l < blk_name.length; l++) {
            System.out.println(blk_name[l] + ": " + blk_count[l] + " of " + blk_size[l]);
        }
    }

    /**
     * Writes one table cell for the code (i, j): the raw character bytes, the
     * Big5 code in hex, and the UTF-16BE bytes in hex. Codes rejected by
     * {@link #validBig5} are written as a placeholder marked "null"; codes the
     * JDK cannot convert are written as a placeholder marked "fail".
     */
    private static void writeCell(CharsetDecoder b5dc, CharsetEncoder uxec,
                                  int i, int j, int blk) throws IOException {
        byte hi = (byte) i;
        byte lo = (byte) j;
        ByteBuffer uxbb = null;

        if (!validBig5(i, j, blk)) {
            writeBig5Space();
            writeString(" null");
        } else {
            uxbb = toUtf16(b5dc, uxec, hi, lo);
            if (uxbb == null) {
                writeBig5Space();
                writeString(" fail");
            } else {
                writeByte(hi);
                writeByte(lo);
                writeString(" ");
                writeHex(hi);
                writeHex(lo);
                blk_count[blk] = blk_count[blk] + 1;
            }
        }

        writeString(" ");
        writeByteBuffer(uxbb, 2);
    }

    /**
     * Decodes the two-byte Big5 code and re-encodes it as UTF-16BE.
     *
     * @return the encoded bytes, or {@code null} when the JDK reports the code
     *         as malformed or unmappable
     */
    private static ByteBuffer toUtf16(CharsetDecoder b5dc, CharsetEncoder uxec,
                                      byte hi, byte lo) {
        try {
            CharBuffer cb = b5dc.decode(ByteBuffer.wrap(new byte[] {hi, lo}));
            return uxec.encode(cb);
        } catch (CharacterCodingException ignored) {
            // Not an I/O failure: the caller records this code as "fail".
            return null;
        }
    }

    /**
     * Writes a CR LF line terminator.
     *
     * @throws IOException if writing to {@link #out} fails
     */
    public static void writeln() throws IOException {
        out.write(0x0D);
        out.write(0x0A);
    }

    /**
     * Writes a single raw byte.
     *
     * @param b the byte to write
     * @throws IOException if writing to {@link #out} fails
     */
    public static void writeByte(byte b) throws IOException {
        out.write(b & 0xFF);
    }

    /**
     * Writes the bytes of a buffer as hex digits, padded to a fixed width.
     * Bytes at indexes 0 to {@code b.limit() - 1} are written; a {@code null}
     * buffer is written as the text "null" and counts as two bytes. One
     * padding string is appended for every byte short of {@code l}.
     *
     * @param b the buffer to dump, or {@code null}
     * @param l the minimum number of bytes the field should represent
     * @throws IOException if writing to {@link #out} fails
     */
    public static void writeByteBuffer(ByteBuffer b, int l) throws IOException {
        int written;
        if (b == null) {
            writeString("null");
            written = 2;
        } else {
            written = b.limit();
            for (int i = 0; i < written; i++) {
                writeHex(b.get(i));
            }
        }
        for (int j = written; j < l; j++) {
            writeString(" ");
        }
    }

    /**
     * Writes the byte pair 0xA1 0x40, used as the placeholder in cells that
     * have no character to show.
     *
     * @throws IOException if writing to {@link #out} fails
     */
    public static void writeBig5Space() throws IOException {
        out.write(0xA1);
        out.write(0x40);
    }

    /**
     * Writes a string as one byte per character, keeping only the low 8 bits
     * of each character. A {@code null} string writes nothing.
     *
     * @param s the text to write, or {@code null}
     * @throws IOException if writing to {@link #out} fails
     */
    public static void writeString(String s) throws IOException {
        if (s == null) {
            return;
        }
        for (int i = 0; i < s.length(); i++) {
            out.write((int) (s.charAt(i) & 0xFF));
        }
    }

    /**
     * Writes the last two characters of the decimal form of {@code i} after
     * prefixing it with "00" (so single-digit values are zero padded).
     *
     * @param i the number to write
     * @throws IOException if writing to {@link #out} fails
     */
    public static void writeNumber(int i) throws IOException {
        String s = "00" + String.valueOf(i);
        writeString(s.substring(s.length() - 2, s.length()));
    }

    /**
     * Writes a byte as two upper-case hexadecimal digits.
     *
     * @param b the byte to write
     * @throws IOException if writing to {@link #out} fails
     */
    public static void writeHex(byte b) throws IOException {
        // The mask discards the sign extension produced by >> on a negative byte.
        out.write((int) hexDigit[(b >> 4) & 0x0F]);
        out.write((int) hexDigit[b & 0x0F]);
    }

    /**
     * Opens the {@code <pre>} section of a row and writes the four column
     * captions followed by a blank line.
     *
     * @throws IOException if writing to {@link #out} fails
     */
    public static void writeHeader() throws IOException {
        writeString("<pre class=\"chinese\">");
        for (int column = 0; column < 4; column++) {
            if (column > 0) {
                writeString(" ");
            }
            writeBig5Space();
            writeString(" Big5 Uni.");
        }
        writeln();
        writeln();
    }

    /**
     * Closes the {@code <pre>} section of a row.
     *
     * @throws IOException if writing to {@link #out} fails
     */
    public static void writeFooter() throws IOException {
        writeString("</pre>");
        writeln();
    }

    /**
     * Tells whether (i, j) is a code that should be converted.
     * The trail byte must lie in 0x40-0x7E or 0xA1-0xFE, and in the last row
     * of the block it must not exceed the trail byte of the block's last code.
     *
     * @param i   the lead byte
     * @param j   the trail byte
     * @param blk index of the block that row {@code i} belongs to; must be a
     *            valid index into {@link #blk_last}
     * @return {@code true} if the code should be converted
     */
    public static boolean validBig5(int i, int j, int blk) {
        boolean inLowRange = j >= 0x40 && j <= 0x7E;
        boolean inHighRange = j >= 0xA1 && j <= 0xFE;
        if (!inLowRange && !inHighRange) {
            return false;
        }
        int last_i = blk_last[blk] >> 8;
        int last_j = blk_last[blk] & 0xFF;
        return !(i == last_i && j > last_j);
    }

    /**
     * Finds the block whose lead-byte range contains {@code i}.
     *
     * @param i the lead byte
     * @return the block index, or -1 if no block contains the row
     */
    public static int getBlock(int i) {
        for (int l = 0; l < blk_first.length; l++) {
            int first = blk_first[l] >> 8;
            int last = blk_last[l] >> 8;
            if (i >= first && i <= last) {
                return l;
            }
        }
        return -1;
    }
}
關於程式的幾點註解:
程式可以在 JDK 8 到 JDK 20 的任何一個版本中編譯和執行。 程式的輸出結果如下:
herong$ javac Big5Unicode.java
herong$ java Big5Unicode
Special Symbols: 406 of 408
Level 1 Characters: 5401 of 5401
Level 2 Characters: 7652 of 7652
結果顯示,JDK 無法處理兩個大五碼符號: 0xA1C3 ( ̄) 和 0xA1C5 (ˍ)。 我們只好在輸出的轉換表中,做下列修改:
 ̄ A1C3 U+FFE3
ˍ A1C5 U+02CD
另外,JDK 還有三個解碼錯誤,需要修改:
A1FE: Java bug - wrong mapping (/ A1FE U+2571, ╱ A2AC U+2571)
It should be: / A1FE U+FF0F
A240: Java bug - wrong mapping (\ A240 U+2572, ╲ A2AD U+2572)
It should be: \ A240 U+FF3C
A15A: Java bug - wrong mapping (╴ A15A U+FF3F, _ A1C4 U+FF3F)
It should be: ╴ A15A U+2574
程式輸出的轉換表將列入本書的後面部分。
Table of Contents