UnicodeBig5.java - Unicode to Big5 Mapping

UnicodeBig5.java is a Java program that generates a table to map all Big5 characters from Unicode Codes to Big5 Codes.

After publishing my Big5 to Unicode conversion table, I got requests from readers asking for a Unicode to Big5 conversion table. So I wrote another Java program, UnicodeBig5.java, to produce a table that maps a Unicode code point to a Big5 code point.

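The output of the program includes 3 columns per character: the Unicode code point in hexadecimal, the Big5 code in hexadecimal, and the character itself written as raw Big5 bytes. Here is the source code: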

/* UnicodeBig5.java
 - Copyright (c) 2015, HerongYang.com, All Rights Reserved.
 */
import java.io.*;
import java.nio.*;
import java.nio.charset.*;
import java.util.*;

/**
 * Generates a Unicode-to-Big5 mapping table.
 *
 * <p>The program walks every UTF-16 code unit from U+0000 to U+FFFF, asks
 * the JDK "Big5" charset encoder for its byte sequence, and writes one entry
 * per character that lands inside one of the three Big5 blocks described by
 * {@link #blk_first} / {@link #blk_last}. Output goes to the file
 * {@code unicode_big5.big5} as HTML {@code <pre>} sections, 5 entries per
 * line and 250 entries per section. A summary of how many codes of each
 * block were reached, plus the Big5 codes that no Unicode character mapped
 * to, is printed on standard output.
 */
class UnicodeBig5 {
  /** Destination stream shared by all write helpers; assigned in main. */
  static OutputStream out = null;

  /** Upper-case hexadecimal digits indexed by nibble value. */
  static final char[] hexDigit = {'0', '1', '2', '3', '4', '5', '6', '7',
                                  '8', '9', 'A', 'B', 'C', 'D', 'E', 'F'};

  /** Display names of the Big5 blocks; parallel to the other blk_* arrays. */
  static final String[] blk_name = {"Special Symbols",
        "Level 1 Characters", "Level 2 Characters"};
  /** First Big5 code (lead byte in the high 8 bits) of each block. */
  static final int[] blk_first = {0xA140, 0xA440, 0xC940};
  /** Last Big5 code (lead byte in the high 8 bits) of each block. */
  static final int[] blk_last = {0xA3BF, 0xC67E, 0xF9D5};
  /** Expected number of codes in each block, used only in the summary. */
  static final int[] blk_size = {408, 5401, 7652};
  /** Number of accepted encoder results per block; see validBig5Bytes. */
  static final int[] blk_count = {0, 0, 0};

  /**
   * Big5 codes not yet produced by the encoder. Keys are the lower-case hex
   * of the lead byte followed by the lower-case hex of the trail byte
   * (e.g. "a140"); the value is always 1 and carries no meaning.
   */
  static Hashtable<String, Integer> code_map
    = new Hashtable<String, Integer>();

  /**
   * Entry point: fills the code map, then writes the table to
   * "unicode_big5.big5". An I/O failure is reported on standard output.
   *
   * @param a command line arguments (ignored)
   */
  public static void main(String[] a) {
    setupMap();
    // try-with-resources closes (and flushes) the file even when writeCode()
    // fails part-way; buffering avoids one system call per byte written.
    try (OutputStream file = new BufferedOutputStream(
           new FileOutputStream("unicode_big5.big5"))) {
      out = file;
      writeCode();
    } catch (IOException e) {
      System.out.println(e.toString());
    }
  }

  /**
   * Writes the whole mapping table to {@link #out} and prints the summary
   * (per-block counts and remaining unmapped Big5 codes) to standard output.
   *
   * @throws IOException if writing to {@link #out} fails
   */
  public static void writeCode() throws IOException {
    CharsetEncoder b5ec = Charset.forName("Big5").newEncoder();
    char[] ca = new char[1];
    CharBuffer cb = null;
    ByteBuffer b5bb = null;
    writeHeader();
    int count = 0;
    for (int i = 0; i < 0x010000; i++) {
      ca[0] = (char) i;
      cb = CharBuffer.wrap(ca);
      try {
        b5bb = b5ec.encode(cb);
      } catch (CharacterCodingException e) {
        // Not encodable in Big5 (or a lone surrogate): treated as "no mapping".
        b5bb = null;
      }
      if (validBig5Bytes(b5bb)) {
        count++;
        // Column 1: Unicode code unit as 4 hex digits.
        writeHex((byte) (ca[0] >>> 8));
        writeHex((byte) (ca[0] & 0xff));
        writeString(" ");
        // Column 2: Big5 code as hex digits.
        writeByteBuffer(b5bb, 2);
        writeString(" ");
        // Column 3: the character itself as raw Big5 bytes.
        writeByte(b5bb.get(0));
        writeByte(b5bb.get(1));
        if (count % 5 == 0) {
          writeln();
        } else {
          writeString("   ");
        }
        // Start a new <pre> section every 250 entries.
        if (count % 250 == 0) {
          writeFooter();
          writeHeader();
        }
      }
    }
    // Terminate a partially filled last line.
    if (count % 5 != 0) {
      writeln();
    }
    writeFooter();

    for (int l = 0; l < blk_name.length; l++) {
      System.out.println(blk_name[l] + ": "
        + blk_count[l] + " of " + blk_size[l]);
    }

    System.out.println("Remaining Big5 codes:");
    Enumeration<String> e = code_map.keys();
    while (e.hasMoreElements()) {
      System.out.println("   " + e.nextElement());
    }

  }

  /**
   * Inserts into {@link #code_map} every (lead, trail) byte pair accepted by
   * {@link #validBig5(int, int, int)} for a lead byte that belongs to a block.
   */
  public static void setupMap() {
    for (int i = 0xA1; i <= 0xFF; i++) {
      int blk = getBlock(i);
      if (blk == -1) {
        continue;
      }

      for (int j = 0x40; j <= 0xFF; j++) {
        if (validBig5(i, j, blk)) {
          updateMap(i, j, "insert");
        }
      }
    }
  }

  /**
   * Adds or removes one Big5 code in {@link #code_map}.
   *
   * @param i lead byte value
   * @param j trail byte value
   * @param action "insert" to add the code, "remove" to delete it; any other
   *        value (including null) leaves the map unchanged
   */
  public static void updateMap(int i, int j, String action) {
    String code = Integer.toHexString(i) + Integer.toHexString(j);
    // Compare by content: == only matched when the caller happened to pass
    // the very same interned literal.
    if ("insert".equals(action)) {
      code_map.put(code, 1);
    } else if ("remove".equals(action)) {
      code_map.remove(code);
    }
  }

  /**
   * Finds the block whose lead-byte range contains the given lead byte.
   *
   * @param i lead byte value
   * @return index into the blk_* arrays, or -1 if no block contains it
   */
  public static int getBlock(int i) {
    for (int l = 0; l < blk_first.length; l++) {
      int first = blk_first[l] >> 8;
      int last = blk_last[l] >> 8;
      if (i >= first && i <= last) {
        return l;
      }
    }
    return -1;
  }

  /**
   * Checks a trail byte against the allowed trail ranges and against the end
   * of the given block.
   *
   * @param i lead byte value
   * @param j trail byte value
   * @param blk block index as returned by {@link #getBlock(int)}
   * @return true if j is in 0x40-0x7E or 0xA1-0xFE and, on the block's last
   *         lead byte, does not exceed the block's last trail byte
   */
  public static boolean validBig5(int i, int j, int blk) {
    // valid ranges for j: 0x40 - 0x7E and 0xA1 - 0xFE.
    if (j < 0x40) {
      return false;
    }
    if (j > 0x7E && j < 0xA1) {
      return false;
    }
    if (j > 0xFE) {
      return false;
    }

    // The last row of a block is only partially populated.
    int last_i = blk_last[blk] >> 8;
    int last_j = blk_last[blk] & 0xFF;
    if (i == last_i && j > last_j) {
      return false;
    }

    return true;
  }

  /**
   * Decides whether an encoder result is a 2-byte code inside one of the
   * Big5 blocks.
   *
   * <p>Side effects on acceptance: increments the block's entry in
   * {@link #blk_count} and removes the code from {@link #code_map}. The
   * counter is incremented on every accepted call, so a Big5 code reached
   * from two different Unicode characters is counted twice.
   *
   * @param b5bb encoder output, or null when encoding failed
   * @return true if the buffer holds exactly 2 bytes forming a valid code
   */
  public static boolean validBig5Bytes(ByteBuffer b5bb) {
    if (b5bb == null) {
      return false;
    }
    if (b5bb.limit() != 2) {
      return false;
    }

    // Mask to read the signed bytes as unsigned values 0..255.
    int i = b5bb.get(0) & 0xFF;
    int j = b5bb.get(1) & 0xFF;

    int blk = getBlock(i);
    if (blk == -1) {
      return false;
    }
    if (!validBig5(i, j, blk)) {
      return false;
    }

    blk_count[blk] = blk_count[blk] + 1;
    updateMap(i, j, "remove");
    return true;
  }

  /**
   * Opens a {@code <pre class="chinese">} section and writes the column
   * captions for the 5 entries of a line, followed by an empty line.
   *
   * @throws IOException if writing to {@link #out} fails
   */
  public static void writeHeader() throws IOException {
    writeString("<pre class=\"chinese\">");
    writeln();
    for (int col = 0; col < 5; col++) {
      // Same 3-space separator that writeCode() puts between entries.
      if (col > 0) {
        writeString("   ");
      }
      writeString("Uni. Big5 ");
      writeBig5Space();
    }
    writeln();
    writeln();
  }

  /**
   * Closes the current {@code <pre>} section and writes an empty line.
   *
   * @throws IOException if writing to {@link #out} fails
   */
  public static void writeFooter() throws IOException {
    writeString("</pre>");
    writeln();
    writeln();
  }

  /**
   * Writes a CR LF line terminator.
   *
   * @throws IOException if writing to {@link #out} fails
   */
  public static void writeln() throws IOException {
    out.write(0x0D);
    out.write(0x0A);
  }

  /**
   * Writes the two bytes 0xA1 0x40, the first code of the Special Symbols
   * block, used in the header in place of the character column.
   *
   * @throws IOException if writing to {@link #out} fails
   */
  public static void writeBig5Space() throws IOException {
    out.write(0xA1);
    out.write(0x40);
  }

  /**
   * Writes the bytes of a buffer as hex digits, padded with two spaces per
   * missing byte up to the requested width.
   *
   * @param b buffer to dump (bytes 0 .. limit-1); null is written as "null"
   * @param l minimum width in bytes
   * @throws IOException if writing to {@link #out} fails
   */
  public static void writeByteBuffer(ByteBuffer b, int l)
    throws IOException {
    int i = 0;
    if (b == null) {
      writeString("null");
      // "null" occupies the width of 2 hex-encoded bytes.
      i = 2;
    } else {
      for (i = 0; i < b.limit(); i++) {
        writeHex(b.get(i));
      }
    }
    for (int j = i; j < l; j++) {
      writeString("  ");
    }
  }

  /**
   * Writes the low 8 bits of each character of a string; null writes nothing.
   *
   * @param s text to write, or null
   * @throws IOException if writing to {@link #out} fails
   */
  public static void writeString(String s) throws IOException {
    if (s != null) {
      for (int i = 0; i < s.length(); i++) {
        out.write((int) (s.charAt(i) & 0xFF));
      }
    }
  }

  /**
   * Writes one byte as two upper-case hex digits.
   *
   * @param b byte to write
   * @throws IOException if writing to {@link #out} fails
   */
  public static void writeHex(byte b) throws IOException {
    out.write((int) hexDigit[(b >> 4) & 0x0F]);
    out.write((int) hexDigit[b & 0x0F]);
  }

  /**
   * Writes one raw byte.
   *
   * @param b byte to write
   * @throws IOException if writing to {@link #out} fails
   */
  public static void writeByte(byte b) throws IOException {
    out.write(b & 0xFF);
  }
}

Notes on the Java source code:

You can compile and run this Java program with any JDK version from JDK 8 to JDK 20. Here is the execution output:

herong$ javac UnicodeBig5.java
herong$ java UnicodeBig5

Special Symbols: 401 of 408
Level 1 Characters: 5401 of 5401
Level 2 Characters: 7652 of 7652

Remaining Big5 codes:
   a2ce
   a2cc
   a1fe
   a1c5
   a1c3
   a240
   a15a

As you can see from the output, there were 7 Big5 codes left unmapped in the Special Symbols block. They were missed for different reasons:

A2CC: Duplicate Big5 codes ( A2CC U+5341,  A451 U+5341)
A2CE: Duplicate Big5 codes ( A2CE U+5345,  A4CA U+5345)
A1FE: Java bug - wrong mapping ( A1FE U+2571,  A2AC U+2571)
   It should be:  A1FE U+FF0F
A240: Java bug - wrong mapping ( A240 U+2572,  A2AD U+2572)
   It should be:  A240 U+FF3C
A15A: Java bug - wrong mapping ( A15A U+FF3F, _ A1C4 U+FF3F)
   It should be:  A15A U+2574
A1C3: Java bug - no mapping ( A1C3 U+FFE3)
A1C5: Java bug - no mapping (ˍ A1C5 U+02CD)

The entire output of this program is included later in the book.

Table of Contents

 About This Book

 Introduction to Big5

 Big5Unicode.java - Big5 to Unicode Mapping

 Big5 to Unicode Mapping - Special Symbols

 Big5 to Unicode Mapping - Level 1 Characters

 Big5 to Unicode Mapping - Level 2 Characters

UnicodeBig5.java - Unicode to Big5 Mapping

 Unicode to Big5 Mapping - All 13,461 Characters

 References of This Book - Big5 Tutorials

 Full Version in PDF/ePUB