Kjetil's Information Center: A Blog About My Projects

Katakana to ASCII Converter

The Japanese Katakana script is typically used to write words borrowed from foreign languages. This means it is possible to transliterate it directly into Latin letters and still understand some of the meaning.

So I wrote this filter in C that converts UTF-8 encoded Katakana text to ASCII. Take a look:

#include <stdio.h>

/*
 * One transliteration entry: a Unicode code point and the ASCII text that
 * is printed in its place.
 *
 * The text buffer holds up to three characters plus the terminating NUL.
 * It has to be four bytes: with only three, a three-letter initializer
 * such as "shi" fills the array completely, leaves it unterminated, and
 * makes fputs() read past the end of the member.
 */
typedef struct unicode_s {
  int code;     /* Unicode code point. */
  char text[4]; /* NUL-terminated ASCII replacement, at most 3 letters. */
} unicode_t;

/* Number of entries in the table below (U+30A0 through U+30FF). */
#define KATAKANA_SIZE 96

/*
 * Transliteration table for the Unicode Katakana block, one entry per
 * code point in ascending order.  Small and full-size forms of the same
 * kana share one spelling; punctuation-like marks map to a single ASCII
 * symbol.
 */
static unicode_t katakana[KATAKANA_SIZE] = {
  /* U+30A0 .. U+30AF */
  {0x30A0, "="},   /* double hyphen */
  {0x30A1, "a"},   /* small a */
  {0x30A2, "a"},
  {0x30A3, "i"},   /* small i */
  {0x30A4, "i"},
  {0x30A5, "u"},   /* small u */
  {0x30A6, "u"},
  {0x30A7, "e"},   /* small e */
  {0x30A8, "e"},
  {0x30A9, "o"},   /* small o */
  {0x30AA, "o"},
  {0x30AB, "ka"},
  {0x30AC, "ga"},
  {0x30AD, "ki"},
  {0x30AE, "gi"},
  {0x30AF, "ku"},

  /* U+30B0 .. U+30BF */
  {0x30B0, "gu"},
  {0x30B1, "ke"},
  {0x30B2, "ge"},
  {0x30B3, "ko"},
  {0x30B4, "go"},
  {0x30B5, "sa"},
  {0x30B6, "za"},
  {0x30B7, "shi"},
  {0x30B8, "ji"},
  {0x30B9, "su"},
  {0x30BA, "zu"},
  {0x30BB, "se"},
  {0x30BC, "ze"},
  {0x30BD, "so"},
  {0x30BE, "zo"},
  {0x30BF, "ta"},

  /* U+30C0 .. U+30CF */
  {0x30C0, "da"},
  {0x30C1, "chi"},
  {0x30C2, "di"},
  {0x30C3, "tsu"}, /* small tsu */
  {0x30C4, "tsu"},
  {0x30C5, "dzu"},
  {0x30C6, "te"},
  {0x30C7, "de"},
  {0x30C8, "to"},
  {0x30C9, "do"},
  {0x30CA, "na"},
  {0x30CB, "ni"},
  {0x30CC, "nu"},
  {0x30CD, "ne"},
  {0x30CE, "no"},
  {0x30CF, "ha"},

  /* U+30D0 .. U+30DF */
  {0x30D0, "ba"},
  {0x30D1, "pa"},
  {0x30D2, "hi"},
  {0x30D3, "bi"},
  {0x30D4, "pi"},
  {0x30D5, "fu"},
  {0x30D6, "bu"},
  {0x30D7, "pu"},
  {0x30D8, "he"},
  {0x30D9, "be"},
  {0x30DA, "pe"},
  {0x30DB, "ho"},
  {0x30DC, "bo"},
  {0x30DD, "po"},
  {0x30DE, "ma"},
  {0x30DF, "mi"},

  /* U+30E0 .. U+30EF */
  {0x30E0, "mu"},
  {0x30E1, "me"},
  {0x30E2, "mo"},
  {0x30E3, "ya"},  /* small ya */
  {0x30E4, "ya"},
  {0x30E5, "yu"},  /* small yu */
  {0x30E6, "yu"},
  {0x30E7, "yo"},  /* small yo */
  {0x30E8, "yo"},
  {0x30E9, "ra"},
  {0x30EA, "ri"},
  {0x30EB, "ru"},
  {0x30EC, "re"},
  {0x30ED, "ro"},
  {0x30EE, "wa"},  /* small wa */
  {0x30EF, "wa"},

  /* U+30F0 .. U+30FF */
  {0x30F0, "wi"},
  {0x30F1, "we"},
  {0x30F2, "wo"},
  {0x30F3, "n"},
  {0x30F4, "vu"},
  {0x30F5, "ka"},  /* small ka */
  {0x30F6, "ke"},  /* small ke */
  {0x30F7, "va"},
  {0x30F8, "vi"},
  {0x30F9, "ve"},
  {0x30FA, "vo"},
  {0x30FB, "."},   /* middle dot */
  {0x30FC, "-"},   /* prolonged sound mark */
  {0x30FD, ","},   /* iteration mark */
  {0x30FE, ","},   /* voiced iteration mark */
  {0x30FF, "|"},   /* digraph koto */
};

/*
 * Count the consecutive 1 bits at the top of a byte, starting from the
 * most significant bit.
 *
 * Returns 0 for a byte whose top bit is clear, 1 for 10xxxxxx, 2 for
 * 110xxxxx and so on, up to 8 for 0xFF.  For a UTF-8 lead byte this count
 * is the total number of bytes in the sequence.
 */
static int multibyte_len(unsigned char byte)
{
  int ones = 0;
  unsigned int probe;

  /* Walk from bit 7 downwards and stop at the first 0 bit. */
  for (probe = 0x80; probe != 0 && (byte & probe) != 0; probe >>= 1) {
    ones++;
  }
  return ones;
}

/*
 * Extract the payload bits of a UTF-8 lead or continuation byte: the bits
 * that follow the run of leading 1 bits and the 0 bit that ends it.
 *
 * Returns -1 when the byte carries no payload in this scheme: a byte with
 * the top bit clear, or one with seven or eight leading 1 bits (0xFE and
 * 0xFF).
 */
static int multibyte_data(unsigned char byte)
{
  int ones = 0;
  unsigned int probe = 0x80;

  /* Measure the run of leading 1 bits. */
  while (probe != 0 && (byte & probe) != 0) {
    ones++;
    probe >>= 1;
  }

  if (ones == 0 || ones > 6) {
    return -1;
  }

  /* 0x7f >> ones keeps exactly the bits below the terminating 0 bit. */
  return byte & (0x7f >> ones);
}

static char *katakana_to_ascii(int unicode)
{
  int i;
  for (i = 0; i < KATAKANA_SIZE; i++) {
    if (katakana[i].code == unicode) {
      return katakana[i].text;
    }
  }
  return "?";
}

/*
 * Filter: copy stdin to stdout, replacing every UTF-8 multibyte character
 * with its ASCII transliteration from katakana_to_ascii() ("?" when the
 * character is not in the table).  Bytes with the top bit clear are
 * passed through unchanged.
 *
 * Malformed input is handled explicitly so that one bad byte cannot
 * corrupt the characters that follow it:
 *   - a continuation byte (10xxxxxx) outside a sequence prints "?";
 *   - a lead byte that cannot start a sequence (0xFE, 0xFF) prints "?";
 *   - a sequence cut short by a non-continuation byte or by end of input
 *     prints "?", and the interrupting byte is then processed normally.
 * Overlong encodings are not rejected.
 *
 * Always returns 0.
 */
int main(void)
{
  int c, in_utf8, len, unicode;

  in_utf8 = 0;
  len = 0;
  unicode = 0;
  while ((c = fgetc(stdin)) != EOF) {
    if (in_utf8 && (c & 0xC0) == 0x80) {
      /* Genuine continuation byte: append its six payload bits. */
      unicode = (unicode << 6) | multibyte_data(c);
      len--;
      if (len <= 0) {
        fputs(katakana_to_ascii(unicode), stdout);
        in_utf8 = 0;
      }
      continue;
    }

    if (in_utf8) {
      /* Sequence ended early; report it and fall through to resync on c. */
      fputs("?", stdout);
      in_utf8 = 0;
    }

    if ((c & 0x80) == 0) {
      fputc(c, stdout); /* Plain single-byte character. */
      continue;
    }

    /*
     * Start of a new sequence.  Only lead bytes announcing 2 to 6 bytes
     * are accepted: a count of 1 is a stray continuation byte, and 7 or 8
     * (0xFE, 0xFF) carry no payload, so multibyte_data() would return -1
     * and shifting that negative value would be undefined behaviour.
     */
    len = multibyte_len(c) - 1; /* More multibytes to read. */
    if (len < 1 || len > 5) {
      fputs("?", stdout);
    } else {
      in_utf8 = 1;
      unicode = multibyte_data(c);
    }
  }

  if (in_utf8) {
    fputs("?", stdout); /* Input ended in the middle of a sequence. */
  }

  return 0;
}
          


Topic: Scripts and Code, by Kjetil @ 06/07-2015, Article Link