Character Counter Program for Any Given Encoding

JDK Tutorials - Herong's Tutorial Examples

∟Character Counter Program for Any Given Encoding

This section provides a tutorial example on how to write a simple program to count valid characters in a given character set encoding.

As mentioned in the previous chapter, JDK supports many built-in character set encodings.

Of course, each encoding is designed for a specific character set only. As a simple exercise, I want to write a sample program that counts the number of characters in the character set of a given encoding.

The sample program, EncodingCounter.java, counts the number of code points in the 0x0000 - 0xFFFF range that are mapped to valid byte sequences by a given encoding:

/* EncodingCounter.java
 * Copyright (c) HerongYang.com. All Rights Reserved.
 */
import java.io.*;
import java.nio.*;
import java.nio.charset.*;
/**
 * Counts how many code points in the 0x0000 - 0xFFFF range can be encoded
 * by a given character set encoding, and prints the ranges of consecutive
 * code points whose encoded byte sequences have the same length.
 * A code point that cannot be encoded is treated as having length 0 and is
 * printed with the placeholder byte sequence "XX".
 */
class EncodingCounter {
   /** Upper-case hexadecimal digits, indexed by nibble value (0 - 15). */
   static final char hexDigit[] = {'0', '1', '2', '3', '4', '5', '6', '7',
                                   '8', '9', 'A', 'B', 'C', 'D', 'E', 'F'};

   /**
    * Program entry point.
    *
    * @param a command line arguments; a[0], if present, is the encoding
    *          name, otherwise "CP1252" is used
    */
   public static void main(String[] a) {
      String charset = "CP1252";
      if (a.length>0) charset = a[0];
      System.out.println(charset+" encoding:");

      // Resolve the encoding once, before the loop: a bad name is reported
      // a single time instead of once per code point, and the charset
      // lookup is not repeated 65536 times.
      CharsetEncoder encoder = newEncoderOrNull(charset);
      if (encoder==null) return;

      byte[] startSequence = null;
      char startChar = 0;
      byte[] endSequence = null;
      char endChar = 0;
      int lastLength = 0;
      int validCount = 0;
      int subCount = 0;   // code points collected in the current range
      int totalCount = 0x010000;
      for (int i=0; i<totalCount; i++) {
         char c = (char) i;
         byte[] b = encodeChar(encoder,c);
         int l = 0;
         if (b!=null) {
            l = b.length;
            validCount++;
         }
         if (i==0) {
            // the very first code point opens the first range
            startSequence = b;
            startChar = c;
         } else if (l!=lastLength) {
            // encoded length changed: close the previous range and
            // open a new one starting at the current code point
            printRange(startChar,startSequence,endChar,endSequence,
               subCount);
            startSequence = b;
            startChar = c;
            subCount = 0;
         }
         subCount++;
         endSequence = b;
         endChar = c;
         lastLength = l;
      }
      // the last range is still open when the loop ends
      printRange(startChar,startSequence,endChar,endSequence,subCount);
      System.out.println("Total characters = "+totalCount);
      System.out.println("Valid characters = "+validCount);
      System.out.println("Invalid characters = "
         +(totalCount-validCount));
   }

   /**
    * Encodes a single char with the named encoding.
    *
    * @param c  the char to encode
    * @param cs the encoding name
    * @return the encoded bytes, or null if the char cannot be encoded or
    *         the encoding name is illegal, unsupported, or decode-only
    *         (a message is printed in the latter cases)
    */
   public static byte[] encodeByEncoder(char c, String cs) {
      CharsetEncoder e = newEncoderOrNull(cs);
      if (e==null) return null;
      return encodeChar(e,c);
   }

   /**
    * Prints the bytes as space-prefixed hex pairs, or " XX" for null.
    *
    * @param b the byte sequence to print; may be null
    */
   public static void printBytes(byte[] b) {
      if (b!=null) {
         for (int j=0; j<b.length; j++)
            System.out.print(" "+byteToHex(b[j]));
      } else {
         System.out.print(" XX");
      }
   }

   /**
    * Returns a new array of length l holding the leading bytes of a.
    * If l exceeds a.length, the remaining bytes are left as zero.
    *
    * @param a the source array
    * @param l the length of the returned array
    * @return the new array
    */
   public static byte[] copyBytes(byte[] a, int l) {
      byte[] b = new byte[l];
      System.arraycopy(a,0,b,0,Math.min(l,a.length));
      return b;
   }

   /**
    * Formats a byte as two upper-case hex digits.
    *
    * @param b the byte to format
    * @return the two-character hex string
    */
   public static String byteToHex(byte b) {
      char[] a = { hexDigit[(b >> 4) & 0x0f], hexDigit[b & 0x0f] };
      return new String(a);
   }

   /**
    * Formats a char as four upper-case hex digits.
    *
    * @param c the char to format
    * @return the four-character hex string
    */
   public static String charToHex(char c) {
      byte hi = (byte) (c >>> 8);
      byte lo = (byte) (c & 0xff);
      return byteToHex(hi) + byteToHex(lo);
   }

   /**
    * Creates an encoder for the named encoding, printing a message and
    * returning null if the name is illegal, the encoding is not
    * supported, or the encoding cannot encode.
    */
   private static CharsetEncoder newEncoderOrNull(String cs) {
      try {
         Charset cso = Charset.forName(cs);
         if (!cso.canEncode()) {
            System.out.println(cs+" does not support encoding");
            return null;
         }
         return cso.newEncoder();
      } catch (IllegalArgumentException e) {
         // covers both IllegalCharsetNameException and
         // UnsupportedCharsetException (and a null name)
         System.out.println(e.toString());
         return null;
      }
   }

   /**
    * Encodes one char with the given encoder; returns null if the char
    * cannot be encoded or encodes to an empty sequence.
    */
   private static byte[] encodeChar(CharsetEncoder e, char c) {
      try {
         // encode(CharBuffer) resets the encoder before encoding, so the
         // same encoder can be reused for every char
         ByteBuffer bb = e.encode(CharBuffer.wrap(new char[] {c}));
         if (bb.remaining()==0) return null;
         byte[] b = new byte[bb.remaining()];
         bb.get(b);
         return b;
      } catch (CharacterCodingException ex) {
         // invalid character, return null
         return null;
      }
   }

   /** Prints one report line for a range of code points. */
   private static void printRange(char startChar, byte[] startSequence,
         char endChar, byte[] endSequence, int count) {
      System.out.print(charToHex(startChar)+" >");
      printBytes(startSequence);
      System.out.print(" - "+charToHex(endChar)+" >");
      printBytes(endSequence);
      System.out.println(" = "+count);
   }
}

Note that:

CharsetEncoder.encode() is used to encode the code points stored as "char" type.
Since a single Java "char" can only hold code points in the 0x0000 - 0xFFFF range, this program covers only a subset of the character set for some encodings, like UTF-8, which can encode code points up to 0x10FFFF.
The encoding name should be specified as a command line argument. If it is omitted, CP1252 is used.