diff options
| -rw-r--r-- | .gitignore | 3 | ||||
| -rw-r--r-- | Blocks.parse.c | 136 | ||||
| -rw-r--r-- | LICENSE.UNICODE | 39 | ||||
| -rw-r--r-- | Makefile | 31 | ||||
| -rw-r--r-- | Scripts.parse.c | 260 | ||||
| -rw-r--r-- | config.mk | 2 | ||||
| -rw-r--r-- | libcmap.h | 4 | ||||
| -rw-r--r-- | libcmap_block_list.c | 5 | ||||
| -rw-r--r-- | libcmap_find_in_no_block.c | 20 | ||||
| -rw-r--r-- | libcmap_script_list.c | 5 | ||||
| -rw-r--r-- | parse-common.c | 123 |
11 files changed, 618 insertions, 10 deletions
@@ -12,3 +12,6 @@ *.gcov *.gcno *.gcda +*.txt +*.txt.c +*.parse diff --git a/Blocks.parse.c b/Blocks.parse.c new file mode 100644 index 0000000..bb61704 --- /dev/null +++ b/Blocks.parse.c @@ -0,0 +1,136 @@ +/* See LICENSE file for copyright and license details. */ +#include "parse-common.c" + + +struct block { + char *name; + unsigned long int low, high; +}; + + +static struct block *blocks = NULL; +static size_t nblocks = 0; + + +static void +parse_line(char *text, size_t lineno) +{ + unsigned long int low, high; + char *name; + size_t i; + + errno = 0; + + if (!isxdigit(*text)) { + malformat: + fprintf(stderr, "%s: line %zu in is malformatted\n", argv0, lineno); + exit(1); + } + + high = low = strtoul(text, &text, 16); + if (errno || low > 0x10FFFFUL) + goto malformat; + if (text[0] == '.' && text[1] == '.') { + if (!isxdigit(text[2])) + goto malformat; + high = strtoul(&text[2], &text, 16); + if (errno || high > 0x10FFFFUL || high < low) + goto malformat; + } + while (isspace(*text)) + text++; + + if (*text++ != ';') + goto malformat; + + while (isspace(*text)) + text++; + name = text; + while (*text && *text != ';') + text++; + for (i = 1U; isspace(text[-i]); i++) + text[-i] = '\0'; + + if (*text == ';') { + static int warned = 0; + if (!warned) { + warned = 1; + fprintf(stderr, "%s: unrecognised column detected in <stdin>\n", argv0); + } + *text++ = '\0'; + } else if (*text) { + goto malformat; + } + + i = nblocks++; + blocks = realloc(blocks, nblocks * sizeof(*blocks)); + if (!blocks) { + fprintf(stderr, "%s: realloc %zu: %s\n", argv0, nblocks * sizeof(*blocks), strerror(errno)); + exit(1); + } + blocks[i].name = strdup(name); + if (!blocks[i].name) { + fprintf(stderr, "%s: strdup: %s\n", argv0, strerror(errno)); + exit(1); + } + blocks[i].low = low; + blocks[i].high = high; +} + + +static int +blockcmp_name(const void *av, const void *bv) +{ + const struct block *a = av, *b = bv; + return strcmp(a->name, b->name); +} + + +static int +blockcmp_range(const 
void *av, const void *bv) +{ + const struct block *a = av, *b = bv; + return a->low < b->low ? -1 : +1; +} + + +static int +output(void) +{ + size_t i; + int x = 0; + + qsort(blocks, nblocks, sizeof(*blocks), &blockcmp_name); + + x |= printf("static const struct libcmap_block list[] = {\n"); + for (i = 0; i < nblocks;) { + x |= printf("\t{\"%s\", {0x%04lX, 0x%04lX}}", blocks[i].name, blocks[i].low, blocks[i].high); + free(blocks[i].name); + x |= printf("%s\n", ++i < nblocks ? "," : ""); + } + x |= printf("};\n\n"); + + qsort(blocks, nblocks, sizeof(*blocks), &blockcmp_range); + if (!nblocks || blocks[0].low) + abort(); + for (i = 1U; i < nblocks; i++) + if (blocks[i].low > blocks[i - 1U].high + 1U) + break; + if (i == nblocks && blocks[i - 1U].high == 0x10FFFFUL) { + x |= printf("const struct libcmap_script libcmap_no_block = {\"No Block\", NULL, 0};\n"); + } else { + x |= printf("static const struct libcmap_range No_Block[] = {\n"); + x |= printf("\t{0x%04lX, 0x%04lX}", blocks[i - 1U].high + 1U, blocks[i].low - 1U); + for (i++; i < nblocks; i++) + if (blocks[i].low > blocks[i - 1U].high + 1) + x |= printf(",\n\t{0x%04lX, 0x%04lX}", blocks[i - 1U].high + 1U, blocks[i].low - 1U); + if (blocks[i - 1U].high < 0x10FFFFUL) + x |= printf(",\n\t{0x%04lX, 0x10FFFFUL}", blocks[i - 1U].high + 1U); + x |= printf("\n};\n"); + x |= printf("const struct libcmap_script libcmap_no_block = "); + x |= printf("{\"No Block\", No_Block, sizeof(No_Block) / sizeof(*No_Block)};\n"); + } + + free(blocks); + return x; +} diff --git a/LICENSE.UNICODE b/LICENSE.UNICODE new file mode 100644 index 0000000..c16d7cc --- /dev/null +++ b/LICENSE.UNICODE @@ -0,0 +1,39 @@ +UNICODE LICENSE V3 + +COPYRIGHT AND PERMISSION NOTICE + +Copyright © 1991-2025 Unicode, Inc. + +NOTICE TO USER: Carefully read the following legal agreement. 
BY +DOWNLOADING, INSTALLING, COPYING OR OTHERWISE USING DATA FILES, AND/OR +SOFTWARE, YOU UNEQUIVOCALLY ACCEPT, AND AGREE TO BE BOUND BY, ALL OF THE +TERMS AND CONDITIONS OF THIS AGREEMENT. IF YOU DO NOT AGREE, DO NOT +DOWNLOAD, INSTALL, COPY, DISTRIBUTE OR USE THE DATA FILES OR SOFTWARE. + +Permission is hereby granted, free of charge, to any person obtaining a +copy of data files and any associated documentation (the "Data Files") or +software and any associated documentation (the "Software") to deal in the +Data Files or Software without restriction, including without limitation +the rights to use, copy, modify, merge, publish, distribute, and/or sell +copies of the Data Files or Software, and to permit persons to whom the +Data Files or Software are furnished to do so, provided that either (a) +this copyright and permission notice appear with all copies of the Data +Files or Software, or (b) this copyright and permission notice appear in +associated Documentation. + +THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY +KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF +THIRD PARTY RIGHTS. + +IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS NOTICE +BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, +OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, +WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, +ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THE DATA +FILES OR SOFTWARE. + +Except as contained in this notice, the name of a copyright holder shall +not be used in advertising or otherwise to promote the sale, use or other +dealings in these Data Files or Software without prior written +authorization of the copyright holder. 
@@ -15,10 +15,13 @@ LIB_MINOR = 0 LIB_VERSION = $(LIB_MAJOR).$(LIB_MINOR) LIB_NAME = cmap +UNICODE_VERSION = 17.0.0 + OBJ =\ libcmap_block_list.o\ libcmap_script_list.o\ + libcmap_find_in_no_block.o\ libcmap_find_block.o\ libcmap_find_script.o @@ -27,10 +30,18 @@ HDR =\ LOBJ = $(OBJ:.o=.lo) +PARSERS =\ + Scripts.parse\ + Blocks.parse + all: libcmap.a libcmap.$(LIBEXT) $(OBJ): $(HDR) $(LOBJ): $(HDR) +$(PARSERS:=.o): parse-common.c + +libcmap_script_list.o: Scripts.txt.c +libcmap_block_list.o: Blocks.txt.c .c.o: $(CC) -c -o $@ $< $(CFLAGS) $(CPPFLAGS) @@ -46,6 +57,18 @@ libcmap.a: $(OBJ) libcmap.$(LIBEXT): $(LOBJ) $(CC) $(LIBFLAGS) -o $@ $(LOBJ) $(LDFLAGS) +Scripts.$(UNICODE_VERSION).txt: + $(DOWNLOAD) 'https://www.unicode.org/Public/17.0.0/ucd/Scripts.txt' > $@ + +Blocks.$(UNICODE_VERSION).txt: + $(DOWNLOAD) 'https://www.unicode.org/Public/17.0.0/ucd/Blocks.txt' > $@ + +Scripts.txt.c: Scripts.$(UNICODE_VERSION).txt Scripts.parse + ./Scripts.parse < $(@:.txt.c=).$(UNICODE_VERSION).txt > $@ + +Blocks.txt.c: Blocks.$(UNICODE_VERSION).txt Blocks.parse + ./Blocks.parse < $(@:.txt.c=).$(UNICODE_VERSION).txt > $@ + install: libcmap.a libcmap.$(LIBEXT) mkdir -p -- "$(DESTDIR)$(PREFIX)/lib" mkdir -p -- "$(DESTDIR)$(PREFIX)/include" @@ -66,8 +89,14 @@ uninstall: clean: -rm -f -- *.o *.a *.lo *.su *.so *.so.* *.dll *.dylib -rm -f -- *.gch *.gcov *.gcno *.gcda *.$(LIBEXT) + -rm -f -- *.txt.c *.parse + +clean-downloads: + -rm -f -- *.txt + +clean-all: clean clean-downloads .SUFFIXES: .SUFFIXES: .lo .o .c -.PHONY: all install uninstall clean +.PHONY: all install uninstall clean clean-downloads clean-all diff --git a/Scripts.parse.c b/Scripts.parse.c new file mode 100644 index 0000000..d5d167b --- /dev/null +++ b/Scripts.parse.c @@ -0,0 +1,260 @@ +/* See LICENSE file for copyright and license details. 
*/ +#include "parse-common.c" + + +struct range { + unsigned long int low, high; +}; + +struct script { + char *cname; + char *hname; + struct range *ranges; + size_t nranges; +}; + + +static struct script *scripts = NULL; +static size_t nscripts = 0; + + +static const char * +fixed_script_name(const char *hname) +{ + if (!strcmp(hname, "NKo")) + return "N'Ko"; + return hname; +} + + +static struct script * +find_script(char *cname) +{ + size_t i, j; + + for (i = nscripts; i--;) + if (!strcmp(cname, scripts[i].cname)) + return &scripts[i]; + + i = nscripts++; + scripts = realloc(scripts, nscripts * sizeof(*scripts)); + if (!scripts) { + fprintf(stderr, "%s: realloc %zu: %s\n", argv0, nscripts * sizeof(*scripts), strerror(errno)); + exit(1); + } + scripts[i].cname = strdup(cname); + if (!scripts[i].cname) { + fprintf(stderr, "%s: strdup: %s\n", argv0, strerror(errno)); + exit(1); + } + scripts[i].ranges = NULL; + scripts[i].nranges = 0; + for (j = 0; cname[j]; j++) + if (cname[j] == '_') + cname[j] = ' '; + scripts[i].hname = strdup(fixed_script_name(cname)); + if (!scripts[i].hname) { + fprintf(stderr, "%s: strdup: %s\n", argv0, strerror(errno)); + exit(1); + } + + return &scripts[i]; +} + + +static void +parse_line(char *text, size_t lineno) +{ + unsigned long int low, high; + struct script *script; + char *name; + size_t i; + + errno = 0; + + if (!isxdigit(*text)) { + malformat: + fprintf(stderr, "%s: line %zu in is malformatted\n", argv0, lineno); + exit(1); + } + + high = low = strtoul(text, &text, 16); + if (errno || low > 0x10FFFFUL) + goto malformat; + if (text[0] == '.' 
&& text[1] == '.') { + if (!isxdigit(text[2])) + goto malformat; + high = strtoul(&text[2], &text, 16); + if (errno || high > 0x10FFFFUL || high < low) + goto malformat; + } + while (isspace(*text)) + text++; + + if (*text++ != ';') + goto malformat; + + while (isspace(*text)) + text++; + name = text; + while (*text && !isspace(*text) && *text != ';') + text++; + while (isspace(*text)) + *text++ = '\0'; + + if (*text == ';') { + static int warned = 0; + if (!warned) { + warned = 1; + fprintf(stderr, "%s: unrecognised column detected in <stdin>\n", argv0); + } + *text++ = '\0'; + } else if (*text) { + goto malformat; + } + + script = find_script(name); + i = script->nranges++; + script->ranges = realloc(script->ranges, script->nranges * sizeof(*script->ranges)); + if (!script->ranges) { + fprintf(stderr, "%s: realloc %zu: %s\n", argv0, script->nranges * sizeof(*script->ranges), strerror(errno)); + exit(1); + } + script->ranges[i].low = low; + script->ranges[i].high = high; +} + + +static int +scriptcmp(const void *av, const void *bv) +{ + const struct script *a = av, *b = bv; + return strcmp(a->hname, b->hname); +} + + +static int +rangecmp(const void *av, const void *bv) +{ + const struct range *a = av, *b = bv; + return a->low < b->low ? -1 : a->low > b->low ? +1 : a->high < b->high ? 
-1 : a->high > b->high; +} + + +static size_t +join_ranges(struct range *ranges, size_t n) +{ + size_t r, w; + + if (!n) + abort(); + + qsort(ranges, n, sizeof(*ranges), &rangecmp); + for (r = w = 1U; r < n; r++) { + if (ranges[r].low == ranges[w - 1U].low) + ranges[w - 1U].high = ranges[r].high; + else if (ranges[r].low <= ranges[w - 1U].high + 1U) + ranges[w - 1U].high = ranges[r].high; + else + ranges[w++] = ranges[r]; + } + + return w; +} + + +static void +range_minus(struct range **rangesp, size_t *np, const struct range *xranges, size_t xn) +{ + struct range *ranges = *rangesp; + size_t i, j, n = *np; + + for (i = 0, j = 0; i < n && j < xn;) { + if (xranges[j].high < ranges[i].low) { + j++; + } else if (xranges[j].low > ranges[i].high) { + i++; + } else if (xranges[j].low <= ranges[i].low && xranges[j].high >= ranges[i].high) { + memmove(&ranges[i], &ranges[i + 1U], (--n - i) * sizeof(*ranges)); + } else if (xranges[j].low <= ranges[i].low && xranges[j].high < ranges[i].high) { + ranges[i].low = xranges[j++].high + 1U; + } else if (xranges[j].high >= ranges[i].high && xranges[j].low > ranges[i].low) { + ranges[i++].high = xranges[j].low - 1U; + } else if (xranges[j].low > ranges[i].low && xranges[j].high < ranges[i].high) { + ranges = realloc(ranges, (n + 1U) * sizeof(*ranges)); + if (!ranges) { + fprintf(stderr, "%s: realloc %zu: %s\n", argv0, (n + 1U) * sizeof(*ranges), strerror(errno)); + exit(1); + } + memmove(&ranges[i + 1U], &ranges[i], (n++ - i) * sizeof(*ranges)); + ranges[i].low = ranges[i + 1U].low; + ranges[i].high = xranges[j].low - 1U; + ranges[i + 1U].low = xranges[j].high + 1U; + i++; + } else { + abort(); + } + } + + *rangesp = ranges; + *np = n; +} + + +static int +output(void) +{ + size_t i, j; + int x = 0; + const char *prefix; + struct script *unknown; + struct range *ranges; + size_t nranges; + + ranges = malloc(sizeof(*unknown->ranges)); + if (!ranges) { + fprintf(stderr, "%s: malloc %zu: %s\n", argv0, sizeof(*unknown->ranges), 
strerror(errno)); + exit(1); + } + nranges = 1U; + ranges[0].low = 0; + ranges[0].high = 0x10FFFF; + + for (i = 0; i < nscripts; i++) { + scripts[i].nranges = join_ranges(scripts[i].ranges, scripts[i].nranges); + range_minus(&ranges, &nranges, scripts[i].ranges, scripts[i].nranges); + } + + qsort(scripts, nscripts, sizeof(*scripts), &scriptcmp); + + if (nranges) { + unknown = find_script((char []){"Unknown"}); + unknown->ranges = ranges; + unknown->nranges = nranges; + } else { + free(ranges); + } + + for (i = 0; i < nscripts; i++) { + x |= printf("static const struct libcmap_range %s[] = {", scripts[i].cname); + for (j = 0; j < scripts[i].nranges;) { + prefix = j % 5U /* no more than 93!! */ == 0U ? "\n\t" : " "; + x |= printf("%s{0x%04lX, 0x%04lX}", prefix, scripts[i].ranges[j].low, scripts[i].ranges[j].high); + if (++j < scripts[i].nranges) + x |= printf(","); + } + x |= printf("\n};\n"); + } + x |= printf("\nstatic const struct libcmap_script list[] = {\n"); + for (i = 0; i < nscripts;) { + x |= printf("\t{\"%s\", %s, %zu}", scripts[i].hname, scripts[i].cname, scripts[i].nranges); + free(scripts[i].cname); + free(scripts[i].hname); + free(scripts[i].ranges); + x |= printf("%s\n", ++i < nscripts ? 
"," : ""); + } + x |= printf("};\n"); + free(scripts); + + return x; +} @@ -6,3 +6,5 @@ CC = c99 CPPFLAGS = -D_DEFAULT_SOURCE -D_BSD_SOURCE -D_XOPEN_SOURCE=700 -D_GNU_SOURCE CFLAGS = LDFLAGS = + +DOWNLOAD = curl @@ -26,11 +26,12 @@ struct libcmap_block { struct libcmap_script { const char *name; - struct libcmap_range *ranges; + const struct libcmap_range *ranges; size_t nranges; }; +extern const struct libcmap_script libcmap_no_block; extern const struct libcmap_block *const libcmap_block_list; extern const size_t libcmap_block_list_size; @@ -38,6 +39,7 @@ extern const struct libcmap_script *const libcmap_script_list; extern const size_t libcmap_script_list_size; +int libcmap_find_in_no_block(uint32_t codepoint, size_t *offset_out, size_t *subrange_out); const struct libcmap_block *libcmap_find_block(uint32_t codepoint, size_t *offset_out); const struct libcmap_script *libcmap_find_script(uint32_t codepoint, size_t *offset_out, size_t *subrange_out); diff --git a/libcmap_block_list.c b/libcmap_block_list.c index e0d3b93..c0d9214 100644 --- a/libcmap_block_list.c +++ b/libcmap_block_list.c @@ -2,10 +2,7 @@ #include "libcmap.h" -static const struct libcmap_block list[] = { - {"Phony block", {0, 0}} -}; - +#include "Blocks.txt.c" const struct libcmap_block *const libcmap_block_list = list; const size_t libcmap_block_list_size = sizeof(list) / sizeof(*list); diff --git a/libcmap_find_in_no_block.c b/libcmap_find_in_no_block.c new file mode 100644 index 0000000..216aaa0 --- /dev/null +++ b/libcmap_find_in_no_block.c @@ -0,0 +1,20 @@ +/* See LICENSE file for copyright and license details. 
*/ +#include "libcmap.h" + + +int +libcmap_find_in_no_block(uint32_t codepoint, size_t *offset_out, size_t *subrange_out) +{ + size_t i, skipped = 0; + for (i = 0; i < libcmap_no_block.nranges; i++) { + if (libcmap_no_block.ranges[i].first <= codepoint && libcmap_no_block.ranges[i].last <= codepoint) { + if (offset_out) + *offset_out = skipped + (size_t)(codepoint - libcmap_no_block.ranges[i].first); + if (subrange_out) + *subrange_out = i; + return 1; + } + skipped += (size_t)(libcmap_no_block.ranges[i].last - libcmap_no_block.ranges[i].first) + 1U; + } + return 0; +} diff --git a/libcmap_script_list.c b/libcmap_script_list.c index 7464f5c..b692073 100644 --- a/libcmap_script_list.c +++ b/libcmap_script_list.c @@ -2,10 +2,7 @@ #include "libcmap.h" -static const struct libcmap_script list[] = { - {"Phony script", NULL, 0} -}; - +#include "Scripts.txt.c" const struct libcmap_script *const libcmap_script_list = list; const size_t libcmap_script_list_size = sizeof(list) / sizeof(*list); diff --git a/parse-common.c b/parse-common.c new file mode 100644 index 0000000..2333a1c --- /dev/null +++ b/parse-common.c @@ -0,0 +1,123 @@ +/* See LICENSE file for copyright and license details. 
/* See LICENSE file for copyright and license details. */
#include <ctype.h>
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>


/* Program name used as the prefix of every diagnostic. */
static const char *argv0;


/* Supplied by the file that includes this one. */
static void parse_line(char *text, size_t lineno);
static int output(void);


/**
 * Consume as many complete lines as `buf` holds and hand every data
 * line to parse_line().
 *
 * Blank lines and lines consisting only of a '#' comment are skipped;
 * a trailing '#' comment is cut off a data line before it is passed
 * on. LF, CR and CRLF are all accepted as line terminators. The line
 * counter and the "last terminator was CR" flag persist across calls,
 * so the input may be fed in arbitrary chunks.
 *
 * @param   buf  Input bytes; modified in place (terminators and '#'
 *               are overwritten with NUL)
 * @param   len  Number of valid bytes in `buf`
 * @return       Number of bytes consumed; the caller must present the
 *               remaining bytes again at the start of the next call
 *
 * Terminates the process if a data line contains a NUL byte.
 */
static size_t
parse(char *buf, size_t len)
{
	static int prev_was_cr = 0;
	static size_t lineno = 1U;
	size_t ret = 0;
	size_t off = 0;
	size_t she;

beginning:
	/*
	 * Swallow the LF of a CRLF pair. The flag is dropped as soon as
	 * the byte after the CR has been seen, so that a blank line at
	 * the start of a later chunk is not mistaken for that LF.
	 */
	if (prev_was_cr && off < len) {
		if (buf[off] == '\n')
			off++;
		prev_was_cr = 0;
	}
	/* skip indentation; <ctype.h> functions need an unsigned char value */
	while (off < len && buf[off] != '\n' && buf[off] != '\r' && isspace((unsigned char)buf[off]))
		off++;
	/* everything before this point is consumed for good */
	ret = off;
	if (off == len)
		return ret;
	if (buf[off] == '#') {
		while (off < len && buf[off] != '\n' && buf[off] != '\r')
			off++;
		/* comment not complete yet: retry once more input has arrived */
		if (off == len)
			return ret;
		goto newline;
	} else if (buf[off] == '\n' || buf[off] == '\r') {
	newline:
		prev_was_cr = buf[off] == '\r';
		off++;
		lineno++;
		goto beginning;
	}

	/* find the end of the data part of the line */
	for (; off < len; off++) {
		if (buf[off] == '\0') {
			fprintf(stderr, "%s: NUL byte found in <stdin> on line %zu\n", argv0, lineno);
			exit(1);
		} else if (buf[off] == '\n' || buf[off] == '\r' || buf[off] == '#') {
			break;
		}
	}
	if (off == len)
		return ret;

	if (buf[off] == '\n' || buf[off] == '\r') {
		prev_was_cr = buf[off] == '\r';
		buf[off++] = '\0';
	} else if (buf[off] == '#') {
		/* remember where the comment starts, then find the line's end */
		she = off++;
		while (off < len && buf[off] != '\n' && buf[off] != '\r')
			off++;
		if (off == len)
			return ret;
		buf[she] = '\0';
		prev_was_cr = buf[off++] == '\r';
	} else {
		abort();
	}

	parse_line(&buf[ret], lineno);
	lineno++;
	goto beginning;
}


/**
 * Read all of standard input, feed it through parse(), then let
 * output() write the generated source to standard output.
 *
 * @return  0 on success; the process exits with status 1 on any failure
 */
int
main(int argc, char *argv[])
{
	char *buf = NULL;
	size_t bufsize = 0;
	size_t len = 0;
	size_t parsed;
	ssize_t r;
	void *new;

	/* argv[0] may legitimately be NULL */
	argv0 = (argc > 0 && argv[0]) ? argv[0] : "parse";

	for (;;) {
		/* always leave room for at least one more byte */
		if (len == bufsize) {
			new = realloc(buf, bufsize += 8192U);
			if (!new) {
				fprintf(stderr, "%s: realloc %zu: %s\n", argv0, bufsize, strerror(errno));
				exit(1);
			}
			buf = new;
		}
		r = read(STDIN_FILENO, &buf[len], bufsize - len);
		if (r <= 0) {
			if (!r)
				break;
			if (errno == EINTR)
				continue;
			fprintf(stderr, "%s: read <stdin>: %s\n", argv0, strerror(errno));
			exit(1);
		}
		len += (size_t)r;
		/* keep only the unfinished tail for the next round */
		parsed = parse(buf, len);
		memmove(&buf[0], &buf[parsed], len -= parsed);
	}
	/* terminate a final line that lacks a line terminator; len < bufsize holds here */
	buf[len++] = '\n';
	parse(buf, len);
	free(buf);

	if (output() < 0 || fflush(stdout)) {
		fprintf(stderr, "%s: failed to write to <stdout>\n", argv0);
		exit(1);
	}

	return 0;
}
