Logo Search packages:      
Sourcecode: w3m version File versions  Download package

charset.c

#include <stdlib.h>
#include <ctype.h>
#include <gc.h>
#define New_N(type,n) ((type*)GC_MALLOC((n)*sizeof(type)))

#include "wc.h"

/*
 * Locale hint shared by the charset guessers in this file.
 * wc_locale_to_ces() assigns it when a locale's codeset is the bare name
 * "euc" (resetting it to 0 for languages it does not recognize), and
 * wc_charset_to_ces() reads it to choose an EUC variant when an "euc"
 * charset name is not followed by 'j', 'c', 't' or 'k'.
 */
wc_locale WcLocale = 0;

/*
 * Fallback encodings keyed by the two-letter language prefix of a locale
 * name.  wc_locale_to_ces() scans this table when the locale carries no
 * explicit codeset.  The list is terminated by an entry whose lang is NULL.
 *
 * The table is only ever read, so both the array and the strings it points
 * to are const-qualified.
 */
static const struct {
    const char *lang;
    wc_ces ces;
} lang_ces_table[] = {
    { "cs", WC_CES_ISO_8859_2 },    /* cs_CZ */
    { "el", WC_CES_ISO_8859_7 },    /* el_GR */
    { "iw", WC_CES_ISO_8859_8 },    /* iw_IL */
    { "ja", WC_CES_EUC_JP },        /* ja_JP */
    { "ko", WC_CES_EUC_KR },        /* ko_KR */
    { "hu", WC_CES_ISO_8859_2 },    /* hu_HU */
    { "pl", WC_CES_ISO_8859_2 },    /* pl_PL */
    { "ro", WC_CES_ISO_8859_2 },    /* ro_RO */
    { "ru", WC_CES_ISO_8859_5 },    /* ru_SU */
    { "sk", WC_CES_ISO_8859_2 },    /* sk_SK */
    { "sl", WC_CES_ISO_8859_2 },    /* sl_CS */
    { "tr", WC_CES_ISO_8859_9 },    /* tr_TR */
    { "zh", WC_CES_EUC_CN },        /* zh_CN */
    { NULL, 0 }
};

/*
 * Resolve a charset name to a CES, with `orig` as the fallback.
 *
 * `orig` is returned when `charset` is NULL or empty, or when
 * wc_charset_to_ces() returns 0 for it; otherwise the value from
 * wc_charset_to_ces() is returned.
 */
wc_ces
wc_guess_charset(char *charset, wc_ces orig)
{
    wc_ces found;

    if (charset != NULL && *charset != '\0') {
        found = wc_charset_to_ces(charset);
        if (found)
            return found;
    }
    return orig;
}

/*
 * Resolve a (possibly abbreviated) charset name to a CES, with `orig` as
 * the fallback.
 *
 * `orig` is returned when `charset` is NULL or empty, or when
 * wc_charset_short_to_ces() returns 0 for it; otherwise the value from
 * wc_charset_short_to_ces() is returned.
 */
wc_ces
wc_guess_charset_short(char *charset, wc_ces orig)
{
    wc_ces found;

    if (charset != NULL && *charset != '\0') {
        found = wc_charset_short_to_ces(charset);
        if (found)
            return found;
    }
    return orig;
}

/*
 * Resolve a locale name to a CES, with `orig` as the fallback.
 *
 * `orig` is returned when `locale` is NULL or empty, or when
 * wc_locale_to_ces() returns 0 for it; otherwise the value from
 * wc_locale_to_ces() is returned.
 */
wc_ces
wc_guess_locale_charset(char *locale, wc_ces orig)
{
    wc_ces found;

    if (locale != NULL && *locale != '\0') {
        found = wc_locale_to_ces(locale);
        if (found)
            return found;
    }
    return orig;
}

/*
 * Map a charset name to its CES identifier.
 *
 * An optional leading "x-" (either case of 'x') is skipped.  Up to 15
 * characters of the rest are then copied into a scratch buffer in lower
 * case, dropping '_', '-' and every byte <= 0x20, so that spellings such as
 * "Shift_JIS", "shift-jis" and "SHIFTJIS" normalize to the same string.
 * The normalized name is matched by prefix.
 *
 * A bare "euc" name (not followed by 'j', 'c', 't' or 'k') is resolved
 * through the global WcLocale, defaulting to EUC-JP.
 *
 * Returns 0 when `charset` is NULL or the name is not recognized.
 */
wc_ces
wc_charset_to_ces(char *charset)
{
    char *p = charset;
    char buf[16];
    int n;
    long num;

    if (p == NULL)
        return 0;

    /* <ctype.h> functions require a value representable as unsigned char. */
    if (tolower((unsigned char)*p) == 'x' && *(p+1) == '-')
        p += 2;
    for (n = 0; *p && n < 15; p++) {
        if ((unsigned char)*p > 0x20 && *p != '_' && *p != '-')
            buf[n++] = (char)tolower((unsigned char)*p);
    }
    buf[n] = 0;
    p = buf;
    switch (*p) {
    case 'e':
        if (! strncmp(p, "euc", 3)) {
            p += 3;
            switch (*p) {
            case 'j': return WC_CES_EUC_JP;
            case 'c': return WC_CES_EUC_CN;
            case 't': return WC_CES_EUC_TW;
            case 'k': return WC_CES_EUC_KR;
            }
            /* No usable suffix: fall back on the locale hint. */
            switch (WcLocale) {
            case WC_LOCALE_JA_JP: return WC_CES_EUC_JP;
            case WC_LOCALE_ZH_CN: return WC_CES_EUC_CN;
            case WC_LOCALE_ZH_TW: return WC_CES_EUC_TW;
            case WC_LOCALE_ZH_HK: return WC_CES_EUC_CN;
            case WC_LOCALE_KO_KR: return WC_CES_EUC_KR;
            }
            return WC_CES_EUC_JP;
        }
        break;
    case 'i':
        if (! strncmp(p, "iso2022", 7)) {
            p += 7;
            switch (*p) {
            case 'j':
                if (! strncmp(p, "jp2", 3))
                    return WC_CES_ISO_2022_JP_2;
                if (! strncmp(p, "jp3", 3))
                    return WC_CES_ISO_2022_JP_3;
                return WC_CES_ISO_2022_JP;
            case 'c': return WC_CES_ISO_2022_CN;
            case 'k': return WC_CES_ISO_2022_KR;
            }
            return WC_CES_ISO_2022_JP;
        } else if (! strncmp(p, "iso8859", 7)) {
            /* strtol() saturates on overflow instead of invoking UB. */
            num = strtol(p + 7, NULL, 10);
            if (num >= 1 && num <= 16 && num != 12)
                return (WC_CES_E_ISO_8859 | (int)num);
            return WC_CES_ISO_8859_1;
        }
        break;
    case 'j':
        if (! strncmp(p, "johab", 5))
            return WC_CES_JOHAB;
        if (! strncmp(p, "jis", 3))
            return WC_CES_ISO_2022_JP;
        break;
    case 's':
        /* Test the longer x0213 names first; they share the shorter prefix. */
        if (! strncmp(p, "shiftjisx0213", 13) ||
            ! strncmp(p, "sjisx0213", 9))
            return WC_CES_SHIFT_JISX0213;
        if (! strncmp(p, "shiftjis", 8) ||
            ! strncmp(p, "sjis", 4))
            return WC_CES_SHIFT_JIS;
        break;
    case 'g':
        if (! strncmp(p, "gb18030", 7) ||
            ! strncmp(p, "gbk2k", 5))
            return WC_CES_GB18030;
        if (! strncmp(p, "gbk", 3))
            return WC_CES_GBK;
        if (! strncmp(p, "gb2312", 6))
            return WC_CES_EUC_CN;
        break;
    case 'b':
        if (! strncmp(p, "big5hkscs", 9))
            return WC_CES_HKSCS;
        if (! strncmp(p, "big5", 4))
            return WC_CES_BIG5;
        break;
    case 'h':
        if (! strncmp(p, "hz", 2))
            return WC_CES_HZ_GB_2312;
        if (! strncmp(p, "hkscs", 5))
            return WC_CES_HKSCS;
        break;
    case 'k':
        if (! strncmp(p, "koi8r", 5))
            return WC_CES_KOI8_R;
        if (! strncmp(p, "koi8u", 5))
            return WC_CES_KOI8_U;
        if (! strncmp(p, "ksx1001", 7))
            return WC_CES_EUC_KR;
        if (! strncmp(p, "ksc5601", 7))
            return WC_CES_EUC_KR;
        break;
    case 't':
        if (! strncmp(p, "tis620", 6))
            return WC_CES_TIS_620;
        if (! strncmp(p, "tcvn", 4))
            return WC_CES_TCVN_5712;
        break;
    case 'n':
        if (! strncmp(p, "next", 4))
            return WC_CES_NEXTSTEP;
        break;
    case 'v':
        if (! strncmp(p, "viet", 4)) {
            p += 4;
            if (! strncmp(p, "tcvn", 4))
                return WC_CES_TCVN_5712;
        }
        /* After a "viet" prefix these tests apply to the remainder. */
        if (! strncmp(p, "viscii", 6))
            return WC_CES_VISCII_11;
        if (! strncmp(p, "vps", 3))
            return WC_CES_VPS;
        break;
    case 'u':
#ifdef USE_UNICODE
        if (! strncmp(p, "utf8", 4))
            return WC_CES_UTF_8;
        if (! strncmp(p, "utf7", 4))
            return WC_CES_UTF_7;
#endif
        if (! strncmp(p, "uhc", 3))
            return WC_CES_UHC;
        if (! strncmp(p, "ujis", 4))
            return WC_CES_EUC_JP;
        if (! strncmp(p, "usascii", 7))
            return WC_CES_US_ASCII;
        break;
    case 'a':
        if (! strncmp(p, "ascii", 5))
            return WC_CES_US_ASCII;
        break;
    case 'c':
        if (! strncmp(p, "cngb", 4))
            return WC_CES_EUC_CN;
        if (*(p+1) != 'p')
            break;
        /* "cp" followed by a code page number. */
        num = strtol(p + 2, NULL, 10);
        switch (num) {
        case 437: return WC_CES_CP437;
        case 737: return WC_CES_CP737;
        case 775: return WC_CES_CP775;
        case 850: return WC_CES_CP850;
        case 852: return WC_CES_CP852;
        case 855: return WC_CES_CP855;
        case 856: return WC_CES_CP856;
        case 857: return WC_CES_CP857;
        case 860: return WC_CES_CP860;
        case 861: return WC_CES_CP861;
        case 862: return WC_CES_CP862;
        case 863: return WC_CES_CP863;
        case 864: return WC_CES_CP864;
        case 865: return WC_CES_CP865;
        case 866: return WC_CES_CP866;
        case 869: return WC_CES_CP869;
        case 874: return WC_CES_CP874;
        case 932: return WC_CES_CP932;          /* CP932 = Shift_JIS */
        case 936: return WC_CES_CP936;          /* CP936 = GBK > EUC_CN */
        case 949: return WC_CES_CP949;          /* CP949 = UHC > EUC_KR */
        case 950: return WC_CES_CP950;          /* CP950 = Big5 */
        case 1006: return WC_CES_CP1006;
        case 1250: return WC_CES_CP1250;
        case 1251: return WC_CES_CP1251;
        case 1252: return WC_CES_CP1252;
        case 1253: return WC_CES_CP1253;
        case 1254: return WC_CES_CP1254;
        case 1255: return WC_CES_CP1255;
        case 1256: return WC_CES_CP1256;
        case 1257: return WC_CES_CP1257;
        case 1258: return WC_CES_CP1258;
        }
        break;
    case 'w':
        if (strncmp(p, "windows", 7))
            break;
        /*
         * Step past the "windows" prefix before looking at the suffix.
         * Testing "31j" against the start of the name could never match,
         * so "Windows-31J" used to fall through and return 0.
         */
        p += 7;
        if (! strncmp(p, "31j", 3))
            return WC_CES_CP932;
        num = strtol(p, NULL, 10);
        switch (num) {
        case 1250: return WC_CES_CP1250;
        case 1251: return WC_CES_CP1251;
        case 1252: return WC_CES_CP1252;
        case 1253: return WC_CES_CP1253;
        case 1254: return WC_CES_CP1254;
        case 1255: return WC_CES_CP1255;
        case 1256: return WC_CES_CP1256;
        case 1257: return WC_CES_CP1257;
        case 1258: return WC_CES_CP1258;
        }
        break;
    }
    return 0;
}

/*
 * Map an abbreviated charset name to its CES identifier.
 *
 * The full-name parser wc_charset_to_ces() is tried first.  Only when it
 * returns 0 is the name normalized here (up to 15 characters, lower-cased,
 * with '_', '-' and bytes <= 0x20 dropped) and dispatched on its first one
 * or two characters.
 *
 * Returns 0 when `charset` is NULL, when the first character selects no
 * case, or when a 'w' name is not followed by a known code page number.
 */
wc_ces
wc_charset_short_to_ces(char *charset)
{
    char *p = charset;
    char buf[16];
    wc_ces ces;
    int n;
    long num;

    if (charset == NULL)
        return 0;

    ces = wc_charset_to_ces(charset);
    if (ces)
        return ces;

    for (n = 0; *p && n < 15; p++) {
        /* <ctype.h> functions require a value representable as unsigned char. */
        if ((unsigned char)*p > 0x20 && *p != '_' && *p != '-')
            buf[n++] = (char)tolower((unsigned char)*p);
    }
    buf[n] = 0;
    p = buf;
    switch (*p) {
    case 'e':
        switch (*(p+1)) {
        case 'j': return WC_CES_EUC_JP;
        case 'c': return WC_CES_EUC_CN;
        case 't': return WC_CES_EUC_TW;
        case 'k': return WC_CES_EUC_KR;
        }
        return WC_CES_EUC_JP;
    case 'j':
        p++;
        if (*p == 'o')
            return WC_CES_JOHAB;
        /* An optional 'p' may precede the ISO-2022-JP variant digit. */
        if (*p == 'p')
            p++;
        if (*p == '2')
            return WC_CES_ISO_2022_JP_2;
        if (*p == '3')
            return WC_CES_ISO_2022_JP_3;
        return WC_CES_ISO_2022_JP;
    case 's':
        return WC_CES_SHIFT_JIS;
    case 'g':
        return WC_CES_EUC_CN;
    case 'b':
        return WC_CES_BIG5;
    case 'h':
        if (*(p+1) == 'k')
            return WC_CES_HKSCS;
        return WC_CES_HZ_GB_2312;
    case 'k':
        if (*(p+1) == 'o')
            return WC_CES_KOI8_R;
        return WC_CES_ISO_2022_KR;
    case 'l':
        /* 'l' followed by an ISO-8859 part number; defaults to part 1. */
        num = strtol(p + 1, NULL, 10);
        if (num >= 1 && num <= 16 && num != 12)
            return (WC_CES_E_ISO_8859 | (int)num);
        return WC_CES_ISO_8859_1;
    case 't':
        if (*(p+1) == 'c')
            return WC_CES_TCVN_5712;
        return WC_CES_TIS_620;
    case 'n':
        return WC_CES_NEXTSTEP;
    case 'v':
        if (*(p+1) == 'p')
            return WC_CES_VPS;
        return WC_CES_VISCII_11;
#ifdef USE_UNICODE
    case 'u':
        if (*(p+1) == '7')
            return WC_CES_UTF_7;
        return WC_CES_UTF_8;
#endif
    case 'a':
        return WC_CES_US_ASCII;
    case 'c':
        return WC_CES_ISO_2022_CN;
    case 'w':
        /* 'w' followed by a code page number; strtol() cannot overflow into UB. */
        num = strtol(p + 1, NULL, 10);
        switch (num) {
        case 1250: return WC_CES_CP1250;
        case 1251: return WC_CES_CP1251;
        case 1252: return WC_CES_CP1252;
        case 1253: return WC_CES_CP1253;
        case 1254: return WC_CES_CP1254;
        case 1255: return WC_CES_CP1255;
        case 1256: return WC_CES_CP1256;
        case 1257: return WC_CES_CP1257;
        case 1258: return WC_CES_CP1258;
        }
        break;
    case 'r':
        return WC_CES_RAW;
    }
    return 0;
}

/*
 * Map a locale name to a CES identifier.
 *
 *  - "C" maps to US-ASCII.
 *  - If the name contains ".codeset", the codeset is handed to
 *    wc_charset_to_ces() and its result (possibly 0) is returned.  When the
 *    codeset is exactly "euc" (any case), the global WcLocale is first set
 *    from the language part so that wc_charset_to_ces() can pick the EUC
 *    variant; it is reset to 0 for other languages.
 *  - Otherwise the language part selects an encoding: "zh_tw"/"zh_hk" give
 *    Big5, then lang_ces_table is searched by two-letter prefix, and
 *    ISO-8859-1 is the final default.
 *
 * `locale` must not be NULL; it is dereferenced immediately.
 */
wc_ces
wc_locale_to_ces(char *locale)
{
    char *p = locale;
    char buf[6];
    int n;

    if (*p == 'C' && *(p+1) == '\0')
        return WC_CES_US_ASCII;
    /*
     * Only the first 5 significant characters ("ll_cc") are kept, but the
     * scan always continues to the '.' or the end of the string.  Stopping
     * as soon as the buffer was full left p short of the '.', so a locale
     * with a longer language part (e.g. "japanese.euc") lost its codeset.
     */
    for (n = 0; *p && *p != '.'; p++) {
        /* <ctype.h> functions require a value representable as unsigned char. */
        if ((unsigned char)*p > 0x20 && n < 5)
            buf[n++] = (char)tolower((unsigned char)*p);
    }
    buf[n] = 0;
    if (*p == '.') {
        p++;
        if (! strcasecmp(p, "euc")) {
            switch (buf[0]) {
            case 'j':
                WcLocale = WC_LOCALE_JA_JP;
                break;
            case 'k':
                WcLocale = WC_LOCALE_KO_KR;
                break;
            case 'z':
                if (!strcmp(buf, "zh_tw"))
                    WcLocale = WC_LOCALE_ZH_TW;
                else if (!strcmp(buf, "zh_hk"))
                    WcLocale = WC_LOCALE_ZH_HK;
                else
                    WcLocale = WC_LOCALE_ZH_CN;
                break;
            default:
                WcLocale = 0;
                break;
            }
        }
        return wc_charset_to_ces(p);
    }

    /*
     * NOTE(review): buf holds at most 5 characters, so this comparison with
     * the 8-character "japanese" can never succeed; such locales fall
     * through to the "ja" entry of lang_ces_table.  Left as is because
     * making it reachable would change the result for existing "japanese"
     * locales - confirm the intended encoding before enabling it.
     */
    if (!strcmp(buf, "japanese"))
        return WC_CES_SHIFT_JIS;
    if (!strcmp(buf, "zh_tw") ||
        !strcmp(buf, "zh_hk"))
        return WC_CES_BIG5;
    for (n = 0; lang_ces_table[n].lang; n++) {
        if (!strncmp(buf, lang_ces_table[n].lang, 2))
            return lang_ces_table[n].ces;
    }
    return WC_CES_ISO_8859_1;
}

/*
 * Return the charset name for `ces`.
 * WC_CES_WTF is special-cased to "WTF"; every other value is looked up in
 * WcCesInfo at WC_CES_INDEX(ces) without a range check.
 */
char *
wc_ces_to_charset(wc_ces ces)
{
    return (ces == WC_CES_WTF) ? "WTF" : WcCesInfo[WC_CES_INDEX(ces)].name;
}

/*
 * Return the descriptive text for `ces`.
 * WC_CES_WTF is special-cased to "W3M Transfer Format"; every other value
 * is looked up in WcCesInfo at WC_CES_INDEX(ces) without a range check.
 */
char *
wc_ces_to_charset_desc(wc_ces ces)
{
    return (ces == WC_CES_WTF) ? "W3M Transfer Format"
                               : WcCesInfo[WC_CES_INDEX(ces)].desc;
}

/*
 * Replace selected encodings with an 8-bit counterpart:
 *   ISO-2022-JP / -JP-2 / -JP-3  -> EUC-JP
 *   ISO-2022-KR                  -> EUC-KR
 *   ISO-2022-CN, HZ-GB-2312      -> EUC-CN
 *   US-ASCII                     -> ISO-8859-1
 * Any other value is returned unchanged.
 */
wc_ces
wc_guess_8bit_charset(wc_ces orig)
{
    wc_ces result = orig;

    switch (orig) {
    case WC_CES_ISO_2022_JP:
    case WC_CES_ISO_2022_JP_2:
    case WC_CES_ISO_2022_JP_3:
        result = WC_CES_EUC_JP;
        break;
    case WC_CES_ISO_2022_KR:
        result = WC_CES_EUC_KR;
        break;
    case WC_CES_ISO_2022_CN:
    case WC_CES_HZ_GB_2312:
        result = WC_CES_EUC_CN;
        break;
    case WC_CES_US_ASCII:
        result = WC_CES_ISO_8859_1;
        break;
    default:
        break;
    }
    return result;
}

/*
 * Report whether `ces` is a known CES: its index must not exceed
 * WC_CES_END and the WcCesInfo entry at that index must carry the same id.
 */
wc_bool
wc_check_ces(wc_ces ces)
{
    size_t idx = WC_CES_INDEX(ces);

    /* Reject the index before it is used to subscript WcCesInfo. */
    if (idx > WC_CES_END)
        return 0;
    return WcCesInfo[idx].id == ces;
}

static int
wc_ces_list_cmp(const void *a, const void *b)
{
    return strcasecmp(((wc_ces_list *)a)->desc, ((wc_ces_list *)b)->desc);
}

/* Cached result of wc_get_ces_list(); built on the first call. */
static wc_ces_list *list = NULL;

/*
 * Return the list of CES entries from WcCesInfo that have a name.
 *
 * The array is built once, allocated through New_N (GC_MALLOC), and cached
 * in `list`; later calls return the same pointer.  Entries are sorted by
 * description, case-insensitively, and followed by a terminator whose id
 * is 0 and whose name and desc are NULL.
 */
wc_ces_list *
wc_get_ces_list(void)
{
    wc_ces_info *entry;
    size_t count = 0;
    size_t out = 0;

    if (list != NULL)
        return list;

    /* First pass: count the named entries to size the array. */
    for (entry = WcCesInfo; entry->id; entry++) {
        if (entry->name != NULL)
            count++;
    }

    /* One extra slot for the terminator. */
    list = New_N(wc_ces_list, count + 1);

    /* Second pass: copy the named entries. */
    for (entry = WcCesInfo; entry->id; entry++) {
        if (entry->name == NULL)
            continue;
        list[out].id = entry->id;
        list[out].name = entry->name;
        list[out].desc = entry->desc;
        out++;
    }
    list[out].id = 0;
    list[out].name = NULL;
    list[out].desc = NULL;

    /* The terminator is excluded from the sort. */
    qsort(list, out, sizeof(wc_ces_list), wc_ces_list_cmp);
    return list;
}

Generated by  Doxygen 1.6.0   Back to index