-
Notifications
You must be signed in to change notification settings - Fork 2
/
unicode.c
100 lines (88 loc) · 1.81 KB
/
unicode.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
#include <stdio.h>
#include <stdlib.h>
#include <inttypes.h>
#include "unicode.h"
/*
 * Tell whether a code point falls inside the inclusive range
 * 0xAC00..0xD7A3.
 *
 * Returns 1 when it does, 0 otherwise.
 */
int isKoreanUnicode (int32_t unicode) {
	if (unicode < 0xAC00) {
		return 0;
	}
	if (unicode > 0xD7A3) {
		return 0;
	}
	return 1;
}
/*
 * Derive the length of a UTF-8 sequence from its first byte.
 *
 * The result is the number of consecutive 1-bits at the top of *p
 * (2 for 110xxxxx, 3 for 1110xxxx, ...), except that a byte with no
 * leading 1-bit counts as a sequence of length 1. Only the byte at p
 * is inspected; a byte of the form 10xxxxxx therefore yields 1 and
 * 0xFF yields 8.
 */
int getUTF8Len (char *p)
{
	const unsigned char lead = (unsigned char)*p;
	unsigned int mask = 0x80;
	int ones = 0;

	/* Walk from the most significant bit down until a 0-bit shows up. */
	while (mask != 0 && (lead & mask) != 0) {
		ones++;
		mask >>= 1;
	}

	if (ones == 0) {
		return 1;
	}
	return ones;
}
/*
 * Decode the UTF-8 sequence that starts at p into a code point.
 *
 * The sequence length is taken from the lead byte:
 *   0xxxxxxx / 10xxxxxx -> the byte itself is returned
 *   110xxxxx            -> 2 bytes
 *   1110xxxx            -> 3 bytes
 *   11110xxx            -> 4 bytes
 *   anything else       -> 0 is returned
 *
 * p must point at enough readable bytes for the sequence announced by
 * the lead byte; continuation bytes are not validated, only their low
 * six bits are used.
 *
 * Returns the decoded code point.
 */
int32_t getUnicode (char *p)
{
	/* Work on unsigned bytes so high-bit values never sign-extend. */
	const unsigned char *x = (const unsigned char *)p;
	int32_t res = 0;

	if (x[0] < 0xC0) {
		/* ASCII, or a stray continuation byte passed through unchanged. */
		res = x[0];
	} else if (x[0] < 0xE0) {
		/* Strip the 110 marker with a mask: 5 payload bits + 6. */
		res = ((int32_t)(x[0] & 0x1F) << 6)
		    | (int32_t)(x[1] & 0x3F);
	} else if (x[0] < 0xF0) {
		/* 4 payload bits + 6 + 6. */
		res = ((int32_t)(x[0] & 0x0F) << 12)
		    | ((int32_t)(x[1] & 0x3F) << 6)
		    | (int32_t)(x[2] & 0x3F);
	} else if (x[0] < 0xF8) {
		/* 3 payload bits + 6 + 6 + 6. */
		res = ((int32_t)(x[0] & 0x07) << 18)
		    | ((int32_t)(x[1] & 0x3F) << 12)
		    | ((int32_t)(x[2] & 0x3F) << 6)
		    | (int32_t)(x[3] & 0x3F);
	}
	/* Lead bytes 0xF8..0xFF do not start a 1-4 byte sequence: res stays 0. */

	return res;
}
/*
 * Number of UTF-8 bytes used for a code point.
 *
 * Values below 0x80 (negative values included) take 1 byte, below
 * 0x800 take 2, below 0x10000 take 3, and everything else takes 4.
 */
int getUnicodeLen (int32_t unicode)
{
	if (unicode < 0x80) {
		return 1;
	}
	if (unicode < 0x800) {
		return 2;
	}
	if (unicode < 0x10000) {
		return 3;
	}
	return 4;
}
/*
 * Encode a code point as UTF-8 into a newly allocated buffer.
 *
 * unicode: the code point to encode. Values below 0x80 (negative
 *          values included) are stored as a single byte; values of
 *          0x10000 and above always produce a 4-byte sequence.
 *
 * Returns a NUL-terminated string of 1 to 4 bytes obtained from
 * calloc(); the caller owns it and must free() it. Returns NULL when
 * the allocation fails.
 */
char *getUTF8 (int32_t unicode)
{
	/* Longest sequence is 4 bytes; the 5th byte keeps the result NUL-terminated. */
	unsigned char *utf8 = calloc(5, sizeof *utf8);
	if (!utf8) {
		return NULL;
	}

	/*
	 * The lead-byte markers are written as explicit unsigned constants
	 * so the output does not depend on whether plain char is signed.
	 */
	if (unicode < 0x80) {
		utf8[0] = (unsigned char)unicode;
	} else if (unicode < 0x800) {
		utf8[0] = (unsigned char)(0xC0 | ((unicode >> 6) & 0x1F));
		utf8[1] = (unsigned char)(0x80 | (unicode & 0x3F));
	} else if (unicode < 0x10000) {
		utf8[0] = (unsigned char)(0xE0 | ((unicode >> 12) & 0x0F));
		utf8[1] = (unsigned char)(0x80 | ((unicode >> 6) & 0x3F));
		utf8[2] = (unsigned char)(0x80 | (unicode & 0x3F));
	} else {
		utf8[0] = (unsigned char)(0xF0 | ((unicode >> 18) & 0x07));
		utf8[1] = (unsigned char)(0x80 | ((unicode >> 12) & 0x3F));
		utf8[2] = (unsigned char)(0x80 | ((unicode >> 6) & 0x3F));
		utf8[3] = (unsigned char)(0x80 | (unicode & 0x3F));
	}

	return (char *)utf8;
}
/*
 * Read the next code point from ifp, using buffer as a chunk cache.
 *
 * buffer: caller-provided storage of len bytes; buffer[*pos] must be a
 *         readable byte on entry (a NUL there means "refill me").
 * len:    size of buffer in bytes, as handed to fgets().
 * pos:    in/out cursor into buffer; advanced past the sequence that
 *         was decoded, reset to 0 whenever the buffer is refilled.
 * ifp:    stream the chunks are read from with fgets().
 *
 * Returns the decoded code point, or 0 when fgets() can deliver no more
 * data, when a sequence is cut off by the end of input, or when the
 * sequence announced by the lead byte can never fit into buffer.
 */
int32_t getcUnicode (char *buffer, const int len, int *pos, FILE *ifp)
{
	char *p = &buffer[*pos];

	/* Cursor sits on the terminator: the current chunk is used up. */
	while (*p == 0)
	{
		p = fgets(buffer, len, ifp);
		if (!p) return 0;
		*pos = 0;
	}

	int count = getUTF8Len(p);
	if (*pos + count >= len) {
		/*
		 * The sequence would run into the last buffer byte, i.e. fgets()
		 * split it. A sequence that does not even fit an empty buffer can
		 * never be completed, so give up instead of retrying forever.
		 */
		if (count >= len) return 0;

		/* Slide the partial sequence to the front (forward copy, overlap-safe). */
		int offset = 0;
		while (*p) buffer[offset++] = *p++;
		/* Terminate it so a failed read cannot expose stale bytes. */
		buffer[offset] = 0;
		*pos = 0;

		/* Append the rest of the sequence right behind the moved bytes. */
		if (!fgets(buffer + offset, len - offset, ifp)) {
			/* Input ended inside a sequence: drop the fragment. */
			buffer[0] = 0;
			return 0;
		}
		p = buffer;
	}

	int32_t unicode = getUnicode(p);
	*pos += count;
	return unicode;
}