/* ------------------------------------------------------------------------

   unicodedata -- Provides access to the Unicode 3.2 data base.

   Data was extracted from the Unicode 3.2 UnicodeData.txt file.

   Written by Marc-Andre Lemburg (mal@lemburg.com).
   Modified for Python 2.0 by Fredrik Lundh (fredrik@pythonware.com)
   Modified by Martin v. Löwis (martin@v.loewis.de)

   Copyright (c) Corporation for National Research Initiatives.

   ------------------------------------------------------------------------ */

#include "Python.h"
#include "ucnhash.h"

/* character properties */

typedef struct {
    const unsigned char category;         /* index into
                                             _PyUnicode_CategoryNames */
    const unsigned char combining;        /* combining class value 0 - 255 */
    const unsigned char bidirectional;    /* index into
                                             _PyUnicode_BidirectionalNames */
    const unsigned char mirrored;         /* true if mirrored in bidir mode */
    const unsigned char east_asian_width; /* index into
                                             _PyUnicode_EastAsianWidthNames */
} _PyUnicode_DatabaseRecord;

/* data file generated by Tools/unicode/makeunicodedata.py */
#include "unicodedata_db.h"

/* Look up the database record for a code point.  Code points >= 0x110000
   use record index 0.

   NOTE(review): this copy of the file is damaged inside this function.  The
   text after "index2[(index<" is missing up to "category;", which is the tail
   of a different function (it returns a category name).  Lost along with it
   are the definitions of _getrecord(), unicodedata_decimal(),
   unicodedata_digit(), unicodedata_numeric() and the head of
   unicodedata_category(), all of which are still referenced further down.
   The damaged text is kept verbatim; restore it from the upstream source
   rather than reconstructing it by hand. */
static const _PyUnicode_DatabaseRecord*
_getrecord_ex(Py_UCS4 code)
{
    int index;
    if (code >= 0x110000)
        index = 0;
    else {
        index = index1[(code>>SHIFT)];
        index = index2[(index<category;
    return PyString_FromString(_PyUnicode_CategoryNames[index]);
}

/* bidirectional(unichr) -> string

   Returns the entry of _PyUnicode_BidirectionalNames selected by the
   character's database record.  Sets TypeError and returns NULL unless the
   argument is a unicode object of length 1. */
static PyObject *
unicodedata_bidirectional(PyObject *self, PyObject *args)
{
    PyUnicodeObject *v;
    int index;

    if (!PyArg_ParseTuple(args, "O!:bidirectional",
                          &PyUnicode_Type, &v))
        return NULL;

    if (PyUnicode_GET_SIZE(v) != 1) {
        PyErr_SetString(PyExc_TypeError,
                        "need a single Unicode character as parameter");
        return NULL;
    }
    index = (int) _getrecord(v)->bidirectional;
    return PyString_FromString(_PyUnicode_BidirectionalNames[index]);
}

/* combining(unichr) -> int

   Returns the combining class stored in the character's database record.
   Sets TypeError and returns NULL unless the argument is a unicode object
   of length 1. */
static PyObject *
unicodedata_combining(PyObject *self, PyObject *args)
{
    PyUnicodeObject *v;

    if (!PyArg_ParseTuple(args, "O!:combining",
                          &PyUnicode_Type, &v))
        return NULL;

    if (PyUnicode_GET_SIZE(v) != 1) {
        PyErr_SetString(PyExc_TypeError,
                        "need a single Unicode character as parameter");
        return NULL;
    }
    return PyInt_FromLong((int) _getrecord(v)->combining);
}

/* mirrored(unichr) -> int

   Returns the "mirrored" flag stored in the character's database record.
   Sets TypeError and returns NULL unless the argument is a unicode object
   of length 1. */
static PyObject *
unicodedata_mirrored(PyObject *self, PyObject *args)
{
    PyUnicodeObject *v;

    if (!PyArg_ParseTuple(args, "O!:mirrored",
                          &PyUnicode_Type, &v))
        return NULL;

    if (PyUnicode_GET_SIZE(v) != 1) {
        PyErr_SetString(PyExc_TypeError,
                        "need a single Unicode character as parameter");
        return NULL;
    }
    return PyInt_FromLong((int) _getrecord(v)->mirrored);
}

/* east_asian_width(unichr) -> string

   Returns the entry of _PyUnicode_EastAsianWidthNames selected by the
   character's database record.  Sets TypeError and returns NULL unless the
   argument is a unicode object of length 1. */
static PyObject *
unicodedata_east_asian_width(PyObject *self, PyObject *args)
{
    PyUnicodeObject *v;
    int index;

    if (!PyArg_ParseTuple(args, "O!:east_asian_width",
                          &PyUnicode_Type, &v))
        return NULL;

    if (PyUnicode_GET_SIZE(v) != 1) {
        PyErr_SetString(PyExc_TypeError,
                        "need a single Unicode character as parameter");
        return NULL;
    }
    index = (int) _getrecord(v)->east_asian_width;
    return PyString_FromString(_PyUnicode_EastAsianWidthNames[index]);
}

/* decomposition(unichr) -> string

   Formats the character's decomposition as the prefix from decomp_prefix
   followed by space-separated "%04X" code points read from decomp_data.
   Sets TypeError and returns NULL unless the argument is a unicode object
   of length 1.

   NOTE(review): this copy of the file is damaged inside this function.  The
   text after "decomp_index2[(index<" is missing up to "> 8;" (the end of the
   index computation and, presumably, the statement that sets `count`).  The
   damaged line is kept verbatim; restore it from the upstream source. */
static PyObject *
unicodedata_decomposition(PyObject *self, PyObject *args)
{
    PyUnicodeObject *v;
    char decomp[256];
    int code, index, count, i;
    unsigned int prefix_index;

    if (!PyArg_ParseTuple(args, "O!:decomposition",
                          &PyUnicode_Type, &v))
        return NULL;

    if (PyUnicode_GET_SIZE(v) != 1) {
        PyErr_SetString(PyExc_TypeError,
                        "need a single Unicode character as parameter");
        return NULL;
    }

    code = (int) *PyUnicode_AS_UNICODE(v);

    if (code < 0 || code >= 0x110000)
        index = 0;
    else {
        index = decomp_index1[(code>>DECOMP_SHIFT)];
        index = decomp_index2[(index<> 8;

    /* XXX: could allocate the PyString up front instead
       (strlen(prefix) + 5 * count + 1 bytes) */

    /* Based on how index is calculated above and decomp_data is generated
       from Tools/unicode/makeunicodedata.py, it should not be possible
       to overflow decomp_prefix. */
    prefix_index = decomp_data[index] & 255;
    assert(prefix_index < (sizeof(decomp_prefix)/sizeof(*decomp_prefix)));

    /* copy prefix */
    i = strlen(decomp_prefix[prefix_index]);
    memcpy(decomp, decomp_prefix[prefix_index], i);

    while (count-- > 0) {
        if (i)
            decomp[i++] = ' ';
        assert((size_t)i < sizeof(decomp));
        PyOS_snprintf(decomp + i, sizeof(decomp) - i, "%04X",
                      decomp_data[++index]);
        i += strlen(decomp + i);
    }

    decomp[i] = '\0';

    return PyString_FromString(decomp);
}

/* Fetch the decomposition record for `code`.

   On return *prefix holds the low byte of the record header and *index has
   been advanced past the header, so that (as nfd_nfkd uses it)
   decomp_data[*index] is the first decomposed code point and *count is the
   number of code points.  Code points >= 0x110000 start from index 0.

   NOTE(review): this copy of the file is damaged inside this function.  The
   text after "decomp_index2[(*index<" is missing up to "> 8;" (the end of the
   index computation and, presumably, the statement that sets *count).  The
   damaged line is kept verbatim; restore it from the upstream source. */
void
get_decomp_record(Py_UCS4 code, int *index, int *prefix, int *count)
{
    if (code >= 0x110000) {
        *index = 0;
    } else {
        *index = decomp_index1[(code>>DECOMP_SHIFT)];
        *index = decomp_index2[(*index<> 8;
    *prefix = decomp_data[*index] & 255;

    (*index)++;
}

/* Constants for algorithmic Hangul syllable (de)composition: base code
   points of the syllable block and of the leading (L), vowel (V) and
   trailing (T) jamo, plus the number of each. */
#define SBase   0xAC00
#define LBase   0x1100
#define VBase   0x1161
#define TBase   0x11A7
#define LCount  19
#define VCount  21
#define TCount  28
#define NCount  (VCount*TCount)
#define SCount  (LCount*NCount)

/* Return a new unicode object holding the decomposition of `input`:
   canonical only when k is 0, canonical plus compatibility when k is
   non-zero.  Returns NULL if allocating or resizing the result fails.

   The function works in two passes: it first expands every character
   (Hangul syllables arithmetically, everything else through
   get_decomp_record), then reorders combining characters by combining
   class. */
static PyObject*
nfd_nfkd(PyObject *input, int k)
{
    PyObject *result;
    Py_UNICODE *i, *end, *o;
    /* Longest decomposition in Unicode 3.2: U+FDFA */
    Py_UNICODE stack[20];
    int space, stackptr, isize;
    int index, prefix, count;
    unsigned char prev, cur;

    stackptr = 0;
    isize = PyUnicode_GET_SIZE(input);
    /* Overallocate at most 10 characters. */
    space = (isize > 10 ? 10 : isize) + isize;
    result = PyUnicode_FromUnicode(NULL, space);
    if (!result)
        return NULL;
    i = PyUnicode_AS_UNICODE(input);
    end = i + isize;
    o = PyUnicode_AS_UNICODE(result);

    while (i < end) {
        stack[stackptr++] = *i++;
        while(stackptr) {
            Py_UNICODE code = stack[--stackptr];
            /* Hangul Decomposition adds three characters in
               a single step, so we need at least that much room. */
            if (space < 3) {
                /* result is a unicode object, so its length must be read
                   with the unicode accessor (this used to call
                   PyString_GET_SIZE on it). */
                int newsize = PyUnicode_GET_SIZE(result) + 10;
                space += 10;
                if (PyUnicode_Resize(&result, newsize) == -1)
                    return NULL;
                /* The buffer may have moved; re-derive the write cursor
                   from the amount of free space left at the end. */
                o = PyUnicode_AS_UNICODE(result) + newsize - space;
            }
            /* Hangul Decomposition. */
            if (SBase <= code && code < (SBase+SCount)) {
                int SIndex = code - SBase;
                int L = LBase + SIndex / NCount;
                int V = VBase + (SIndex % NCount) / TCount;
                int T = TBase + SIndex % TCount;
                *o++ = L;
                *o++ = V;
                space -= 2;
                if (T != TBase) {
                    *o++ = T;
                    space --;
                }
                continue;
            }
            /* Other decompositions. */
            get_decomp_record(code, &index, &prefix, &count);

            /* Copy character if it is not decomposable, or has a
               compatibility decomposition, but we do NFD. */
            if (!count || (prefix && !k)) {
                *o++ = code;
                space--;
                continue;
            }
            /* Copy decomposition onto the stack, in reverse
               order.  */
            while(count) {
                code = decomp_data[index + (--count)];
                stack[stackptr++] = code;
            }
        }
    }

    /* Drop overallocation. Cannot fail. */
    PyUnicode_Resize(&result, PyUnicode_GET_SIZE(result) - space);

    /* Sort canonically. */
    i = PyUnicode_AS_UNICODE(result);
    prev = _getrecord_ex(*i)->combining;
    end = i + PyUnicode_GET_SIZE(result);
    for (i++; i < end; i++) {
        cur = _getrecord_ex(*i)->combining;
        if (prev == 0 || cur == 0 || prev <= cur) {
            prev = cur;
            continue;
        }
        /* Non-canonical order. Need to switch *i with previous. */
        o = i - 1;
        while (1) {
            Py_UNICODE tmp = o[1];
            o[1] = o[0];
            o[0] = tmp;
            o--;
            if (o < PyUnicode_AS_UNICODE(result))
                break;
            prev = _getrecord_ex(*o)->combining;
            if (prev == 0 || prev <= cur)
                break;
        }
        prev = _getrecord_ex(*i)->combining;
    }
    return result;
}

/* Map `code` to its index in a composition table.

   `nfc` is an array of {start, count, index} ranges terminated by an entry
   whose start is 0; the loop gives up as soon as `code` is below a range's
   start, so the ranges are expected in ascending order.  Returns
   index + (code - start) for the range containing `code`, or -1 if there is
   none. */
static int
find_nfc_index(struct reindex* nfc, Py_UNICODE code)
{
    int index;
    for (index = 0; nfc[index].start; index++) {
        int start = nfc[index].start;
        if (code < start)
            return -1;
        if (code <= start + nfc[index].count) {
            int delta = code - start;
            return nfc[index].index + delta;
        }
    }
    return -1;
}

/* Return a new unicode object holding the composed form of `input`: the
   string is first decomposed with nfd_nfkd(input, k) and then recomposed in
   place.  Returns NULL if the decomposition step fails.

   NOTE(review): this copy of the file is damaged inside this function.  The
   text after "comp_data[(index1<" is missing up to ">24) & 0xff)) ...", which
   is the tail of a different function (it returns `h`).  Lost along with it
   are the rest of this function, the whole of unicodedata_normalize() and
   the head of _gethash(), which are still referenced further down.  The
   damaged line is kept verbatim; restore it from the upstream source. */
static PyObject*
nfc_nfkc(PyObject *input, int k)
{
    PyObject *result;
    Py_UNICODE *i, *i1, *o, *end;
    int f,l,index,index1,comb;
    Py_UNICODE code;
    Py_UNICODE *skipped[20];
    int cskipped = 0;

    result = nfd_nfkd(input, k);
    if (!result)
        return NULL;

    /* We are going to modify result in-place.
       If nfd_nfkd is changed to sometimes return the input,
       this code needs to be reviewed. */
    assert(result != input);

    i = PyUnicode_AS_UNICODE(result);
    end = i + PyUnicode_GET_SIZE(result);
    o = PyUnicode_AS_UNICODE(result);

  again:
    while (i < end) {
      for (index = 0; index < cskipped; index++) {
          if (skipped[index] == i) {
              /* *i character is skipped.
                 Remove from list. */
              skipped[index] = skipped[cskipped-1];
              cskipped--;
              i++;
              goto again; /* continue while */
          }
      }
      /* Hangul Composition. We don't need to check for <LV,T>
         pairs, since we always have decomposed data. */
      /* The vowel range is half-open: VBase+VCount (U+1176) is not one of
         the VCount composable vowels, and accepting it would compute the
         code of a different syllable. */
      if (LBase <= *i && *i < (LBase+LCount) &&
          i + 1 < end &&
          VBase <= i[1] && i[1] < (VBase+VCount)) {
          int LIndex, VIndex;
          LIndex = i[0] - LBase;
          VIndex = i[1] - VBase;
          code = SBase + (LIndex*VCount+VIndex)*TCount;
          i+=2;
          /* The trailing range is open at both ends: TBase itself (U+11A7)
             stands for "no trailing consonant" and would be silently
             swallowed, and TBase+TCount (U+11C3) is past the last
             composable one. */
          if (i < end &&
              TBase < *i && *i < (TBase+TCount)) {
              code += *i-TBase;
              i++;
          }
          *o++ = code;
          continue;
      }

      f = find_nfc_index(nfc_first, *i);
      if (f == -1) {
          *o++ = *i++;
          continue;
      }
      /* Find next unblocked character. */
      i1 = i+1;
      comb = 0;
      while (i1 < end) {
          int comb1 = _getrecord_ex(*i1)->combining;
          if (comb1 && comb == comb1) {
              /* Character is blocked. */
              i1++;
              continue;
          }
          l = find_nfc_index(nfc_last, *i1);
          /* *i1 cannot be combined with *i. If *i1
             is a starter, we don't need to look further.
             Otherwise, record the combining class. */
          if (l == -1) {
            not_combinable:
              if (comb1 == 0)
                  break;
              comb = comb1;
              i1++;
              continue;
          }
          index = f*TOTAL_LAST + l;
          index1 = comp_index[index >> COMP_SHIFT];
          code = comp_data[(index1<>24) & 0xff)) & 0x00ffffff; } return h; }

/* Short names of the Hangul jamo, indexed [n][column]: column 0 holds the
   LCount leading consonants, column 1 the VCount vowels and column 2 the
   TCount trailing consonants.  Unused slots are 0. */
static char *hangul_syllables[][3] = {
    { "G",  "A",   ""   },
    { "GG", "AE",  "G"  },
    { "N",  "YA",  "GG" },
    { "D",  "YAE", "GS" },
    { "DD", "EO",  "N"  },
    { "R",  "E",   "NJ" },
    { "M",  "YEO", "NH" },
    { "B",  "YE",  "D"  },
    { "BB", "O",   "L"  },
    { "S",  "WA",  "LG" },
    { "SS", "WAE", "LM" },
    { "",   "OE",  "LB" },
    { "J",  "YO",  "LS" },
    { "JJ", "U",   "LT" },
    { "C",  "WEO", "LP" },
    { "K",  "WE",  "LH" },
    { "T",  "WI",  "M"  },
    { "P",  "YU",  "B"  },
    { "H",  "EU",  "BS" },
    { 0,    "YI",  "S"  },
    { 0,    "I",   "SS" },
    { 0,    0,     "NG" },
    { 0,    0,     "J"  },
    { 0,    0,     "C"  },
    { 0,    0,     "K"  },
    { 0,    0,     "T"  },
    { 0,    0,     "P"  },
    { 0,    0,     "H"  }
};

/* Return non-zero if `code` lies in one of the CJK unified ideograph ranges
   listed below. */
static int
is_unified_ideograph(Py_UCS4 code)
{
    return (
        (0x3400 <= code && code <= 0x4DB5) || /* CJK Ideograph Extension A */
        (0x4E00 <= code && code <= 0x9FA5) || /* CJK Ideograph */
        (0x20000 <= code && code <= 0x2A6D6));/* CJK Ideograph Extension B */
}

/* Write the NUL-terminated character name of `code` into `buffer`, which
   has room for `buflen` bytes.  Returns 1 on success and 0 if the code
   point is >= 0x110000 or the name does not fit (a 0 return is also used by
   callers to mean "no such name").

   Hangul syllable and CJK unified ideograph names are generated; all other
   names are assembled word by word from the phrasebook/lexicon tables.

   NOTE(review): this copy of the file is damaged inside this function.  The
   text after "phrasebook_offset2[(offset<" is missing up to ">= 0) {" (the
   end of the offset computation, the initialisation of `i`, and the head of
   the word loop).  The damaged line is kept verbatim; restore it from the
   upstream source. */
static int
_getucname(Py_UCS4 code, char* buffer, int buflen)
{
    int offset;
    int i;
    int word;
    unsigned char* w;

    if (SBase <= code && code < SBase+SCount) {
        /* Hangul syllable. */
        int SIndex = code - SBase;
        int L = SIndex / NCount;
        int V = (SIndex % NCount) / TCount;
        int T = SIndex % TCount;

        if (buflen < 27)
            /* Worst case: HANGUL SYLLABLE <10chars>. */
            return 0;
        strcpy(buffer, "HANGUL SYLLABLE ");
        buffer += 16;
        strcpy(buffer, hangul_syllables[L][0]);
        buffer += strlen(hangul_syllables[L][0]);
        strcpy(buffer, hangul_syllables[V][1]);
        buffer += strlen(hangul_syllables[V][1]);
        strcpy(buffer, hangul_syllables[T][2]);
        buffer += strlen(hangul_syllables[T][2]);
        *buffer = '\0';
        return 1;
    }

    if (is_unified_ideograph(code)) {
        if (buflen < 28)
            /* Worst case: CJK UNIFIED IDEOGRAPH-20000 */
            return 0;
        sprintf(buffer, "CJK UNIFIED IDEOGRAPH-%X", code);
        return 1;
    }

    if (code >= 0x110000)
        return 0;

    /* get offset into phrasebook */
    offset = phrasebook_offset1[(code>>phrasebook_shift)];
    offset = phrasebook_offset2[(offset<= 0) {
            word = (word << 8) + phrasebook[offset+1];
            offset += 2;
        } else
            word = phrasebook[offset++];
        if (i) {
            /* buffer[buflen] is already out of bounds, so the separator
               needs the same ">=" test as the character writes below. */
            if (i >= buflen)
                return 0; /* buffer overflow */
            buffer[i++] = ' ';
        }
        /* copy word string from lexicon.  the last character in the
           word has bit 7 set.  the last word in a string ends with
           0x80 */
        w = lexicon + lexicon_offset[word];
        while (*w < 128) {
            if (i >= buflen)
                return 0; /* buffer overflow */
            buffer[i++] = *w++;
        }
        if (i >= buflen)
            return 0; /* buffer overflow */
        buffer[i++] = *w & 127;
        if (*w == 128)
            break; /* end of word */
    }

    return 1;
}

/* Return non-zero if the name of `code` equals the first `namelen` bytes of
   `name`, comparing after upper-casing `name`. */
static int
_cmpname(int code, const char* name, int namelen)
{
    /* check if code corresponds to the given name */
    int i;
    char buffer[NAME_MAXLEN];

    /* A candidate that cannot fit in buffer together with its terminator
       cannot match; rejecting it up front also keeps the buffer[namelen]
       read below in bounds. */
    if (namelen < 0 || namelen >= (int) sizeof(buffer))
        return 0;
    if (!_getucname(code, buffer, sizeof(buffer)))
        return 0;
    for (i = 0; i < namelen; i++) {
        /* toupper() is only defined for unsigned char values and EOF;
           plain char may be negative. */
        if (toupper((unsigned char) name[i]) != buffer[i])
            return 0;
    }
    return buffer[namelen] == '\0';
}

/* Find the longest entry among the first `count` rows of column `column`
   of hangul_syllables that is a prefix of `str`.  Stores its length in *len
   and its row in *pos; if nothing matches, stores 0 and -1. */
static void
find_syllable(const char *str, int *len, int *pos, int count, int column)
{
    int i, len1;
    *len = -1;
    for (i = 0; i < count; i++) {
        char *s = hangul_syllables[i][column];
        len1 = strlen(s);
        if (len1 <= *len)
            continue;
        if (strncmp(str, s, len1) == 0) {
            *len = len1;
            *pos = i;
        }
    }
    if (*len == -1) {
        *len = 0;
        *pos = -1;
    }
}

/* Look up a character by name.  On success stores the code point in *code
   and returns 1; returns 0 if the name is unknown.

   "HANGUL SYLLABLE ..." and "CJK UNIFIED IDEOGRAPH-..." names are parsed
   directly; everything else goes through the code_hash table, using
   _cmpname to confirm each candidate. */
static int
_getcode(const char* name, int namelen, Py_UCS4* code)
{
    unsigned int h, v;
    unsigned int mask = code_size-1;
    unsigned int i, incr;

    /* Check for hangul syllables. */
    if (strncmp(name, "HANGUL SYLLABLE ", 16) == 0) {
        int L, V, T, len;
        const char *pos = name + 16;
        find_syllable(pos, &len, &L, LCount, 0);
        pos += len;
        find_syllable(pos, &len, &V, VCount, 1);
        pos += len;
        find_syllable(pos, &len, &T, TCount, 2);
        pos += len;
        /* All three parts must be found and must consume the whole name. */
        if (L != -1 && V != -1 && T != -1 && pos-name == namelen) {
            *code = SBase + (L*VCount+V)*TCount + T;
            return 1;
        }
        /* Otherwise, it's an illegal syllable name. */
        return 0;
    }

    /* Check for unified ideographs. */
    if (strncmp(name, "CJK UNIFIED IDEOGRAPH-", 22) == 0) {
        /* Four or five hexdigits must follow. */
        v = 0;
        name += 22;
        namelen -= 22;
        if (namelen != 4 && namelen != 5)
            return 0;
        while (namelen--) {
            v *= 16;
            if (*name >= '0' && *name <= '9')
                v += *name - '0';
            else if (*name >= 'A' && *name <= 'F')
                v += *name - 'A' + 10;
            else
                return 0;
            name++;
        }
        if (!is_unified_ideograph(v))
            return 0;
        *code = v;
        return 1;
    }

    /* the following is the same as python's dictionary lookup, with
       only minor changes.  see the makeunicodedata script for more
       details */

    h = (unsigned int) _gethash(name, namelen, code_magic);
    i = (~h) & mask;
    v = code_hash[i];
    if (!v)
        return 0;
    if (_cmpname(v, name, namelen)) {
        *code = v;
        return 1;
    }
    incr = (h ^ (h >> 3)) & mask;
    if (!incr)
        incr = mask;
    for (;;) {
        i = (i + incr) & mask;
        v = code_hash[i];
        if (!v)
            return 0;
        if (_cmpname(v, name, namelen)) {
            *code = v;
            return 1;
        }
        incr = incr << 1;
        if (incr > mask)
            incr = incr ^ code_poly;
    }
}

/* Name lookup entry points, exported from the module as "ucnhash_CAPI". */
static const _PyUnicode_Name_CAPI hashAPI =
{
    sizeof(_PyUnicode_Name_CAPI),
    _getucname,
    _getcode
};

/* -------------------------------------------------------------------- */
/* Python bindings */

/* name(unichr[, default]) -> string

   Returns the character's name.  If the character has no name, returns
   `default` when one was given and otherwise sets ValueError.  Sets
   TypeError unless the first argument is a unicode object of length 1. */
static PyObject *
unicodedata_name(PyObject* self, PyObject* args)
{
    char name[NAME_MAXLEN];

    PyUnicodeObject* v;
    PyObject* defobj = NULL;
    if (!PyArg_ParseTuple(args, "O!|O:name", &PyUnicode_Type, &v, &defobj))
        return NULL;

    if (PyUnicode_GET_SIZE(v) != 1) {
        PyErr_SetString(PyExc_TypeError,
                        "need a single Unicode character as parameter");
        return NULL;
    }

    if (!_getucname((Py_UCS4) *PyUnicode_AS_UNICODE(v),
                    name, sizeof(name))) {
        if (defobj == NULL) {
            PyErr_SetString(PyExc_ValueError, "no such name");
            return NULL;
        }
        else {
            Py_INCREF(defobj);
            return defobj;
        }
    }

    return Py_BuildValue("s", name);
}

/* lookup(name) -> unichr

   Returns the one-character unicode string whose name is `name`, or sets
   KeyError ("undefined character name '...'") if there is none. */
static PyObject *
unicodedata_lookup(PyObject* self, PyObject* args)
{
    Py_UCS4 code;
    Py_UNICODE str[1];

    char* name;
    int namelen;
    if (!PyArg_ParseTuple(args, "s#:lookup", &name, &namelen))
        return NULL;

    if (!_getcode(name, namelen, &code)) {
        /* PyErr_Format sizes the message itself, so no scratch buffer or
           truncating fallback is needed. */
        PyErr_Format(PyExc_KeyError, "undefined character name '%s'", name);
        return NULL;
    }

    str[0] = (Py_UNICODE) code;
    return PyUnicode_FromUnicode(str, 1);
}

/* XXX Add doc strings. */

static PyMethodDef unicodedata_functions[] = {
    {"decimal", unicodedata_decimal, METH_VARARGS},
    {"digit", unicodedata_digit, METH_VARARGS},
    {"numeric", unicodedata_numeric, METH_VARARGS},
    {"category", unicodedata_category, METH_VARARGS},
    {"bidirectional", unicodedata_bidirectional, METH_VARARGS},
    {"combining", unicodedata_combining, METH_VARARGS},
    {"mirrored", unicodedata_mirrored, METH_VARARGS},
    {"east_asian_width", unicodedata_east_asian_width, METH_VARARGS},
    {"decomposition",unicodedata_decomposition, METH_VARARGS},
    {"name", unicodedata_name, METH_VARARGS},
    {"lookup", unicodedata_lookup, METH_VARARGS},
    {"normalize", unicodedata_normalize, METH_VARARGS},
    {NULL, NULL}                /* sentinel */
};

PyDoc_STRVAR(unicodedata_docstring, "unicode character database");

/* Module initialisation: registers the functions above, publishes the
   Unicode database version as "unidata_version" and exports the name
   lookup API as "ucnhash_CAPI". */
PyMODINIT_FUNC
initunicodedata(void)
{
    PyObject *m, *v;

    m = Py_InitModule3(
        "unicodedata", unicodedata_functions, unicodedata_docstring);
    if (!m)
        return;

    PyModule_AddStringConstant(m, "unidata_version", UNIDATA_VERSION);

    /* Export C API */
    v = PyCObject_FromVoidPtr((void *) &hashAPI, NULL);
    if (v != NULL)
        PyModule_AddObject(m, "ucnhash_CAPI", v);
}

/*
Local variables:
c-basic-offset: 4
indent-tabs-mode: nil
End:
*/