diff options
| -rw-r--r-- | libcmap.h | 216 |
1 files changed, 216 insertions, 0 deletions
@@ -14,42 +14,258 @@ #endif +/** + * The last valid Unicode codepoint + */ #define LIBCMAP_ULTIMATE_CODEPOINT UINT32_C(0x10FFFF) + +/** + * The range of all valid Unicode codepoints, including + * assigned, unassigned, reserved, and surrogates. + */ #define LIBCMAP_UNIVERSE_RANGE {0, LIBCMAP_ULTIMATE_CODEPOINT} +/** + * Calculate the number of Unicode codepoints in a range + * + * @param RANGE:const libcmap_range * The codepoint range + * @return :size_t The number of covered codepoints + */ #define LIBCMAP_RANGE_SIZE(RANGE) ((size_t)((RANGE)->last - (RANGE)->first + 1U)) +/** + * Contiguous range of Unicode codepoints + * + * @seealso LIBCMAP_RANGE_SIZE + * @seealso libcmap_sprint_range + */ struct libcmap_range { + /** + * The lowest codepoint within the range + */ uint32_t first; + + /** + * The highest codepoint within the range + */ uint32_t last; }; +/** + * Unicode character block + * + * @seealso libcmap_block_list + * @seealso libcmap_find_block + */ struct libcmap_block { + /** + * The name of the block + */ const char *name; + + /** + * The range of codepoints the block covers + */ struct libcmap_range range; }; +/** + * Script and the codepoints used for it + * + * @seealso libcmap_script_list + * @seealso libcmap_find_script + */ struct libcmap_script { + /** + * The name of the script + * + * Replace any space with an underscore and remove + * all other non-alphanumeric characters to get + * the name as declared by Unicode + */ const char *name; + + /** + * Sorted list of disjoint ranges of codepoints + * used for the script + * + * The list is in ascending order + * + * There is a non-zero gap of codepoints between + * each range + */ const struct libcmap_range *ranges; + + /** + * The number of elements in `.ranges` + */ size_t nranges; }; +/** + * Listing of all Unicode codepoints not assigned any block + * + * This listing can potentially be empty + * + * This listing is named "No Block", in accordance + * with the Unicode specification 
of it being called "No_Block" + * + * @seealso libcmap_block_list + * @seealso libcmap_find_in_no_block + */ extern const struct libcmap_script libcmap_no_block; + +/** + * Listing of all Unicode blocks + * + * The blocks are sorted, in ascending order by name + * + * @seealso libcmap_block_list_size + * @seealso libcmap_no_block + * @seealso libcmap_find_block + * @seealso libcmap_script_list + */ extern const struct libcmap_block *const libcmap_block_list; + +/** + * The number of elements in `libcmap_block_list` + */ extern const size_t libcmap_block_list_size; +/** + * Listing of all so-called scripts as divided by Unicode + * + * The scripts are sorted, in ascending order by name, + * except "Unknown" is added as the last script + * + * It contains three special "scripts": "Common", "Inherited", + * and "Unknown". "Unknown" is an implicitly listed script + * in Unicode, assigned to any codepoint that is not covered + * by the other so-called scripts; therefore it has been added + * to the end of the list; however future versions of this + * library may include it in order. 
+ * + * @seealso libcmap_script_list_size + * @seealso libcmap_script_block + * @seealso libcmap_block_list + */ extern const struct libcmap_script *const libcmap_script_list; + +/** + * The number of elements in `libcmap_script_list` + */ extern const size_t libcmap_script_list_size; +/** + * Find a codepoint within `libcmap_no_block` + * + * @param codepoint The codepoint to locate + * @param offset_out Output parameter for the codepoint's index within + * `libcmap_no_block`; which is the sum of the number + * of codepoints in all ranges within `libcmap_no_block` + * up to but excluding `*subrange_out`, plus + * `codepoint - libcmap_no_block.ranges[*subrange_out].first`; + * may be `NULL` + * @param subrange_out Output parameter for the index of the range in + * `libcmap_no_block` which covers the codepoint; + * may be `NULL` + * @return 1 if `libcmap_no_block` covers the codepoint, + * 0 otherwise + * + * If 0 is returned, `*offset_out` and `*subrange_out` will not be set + * + * @seealso libcmap_find_block + * @seealso libcmap_find_script + */ int libcmap_find_in_no_block(uint32_t codepoint, size_t *offset_out, size_t *subrange_out); + +/** + * Find the Unicode block that covers a specified codepoint + * + * @param codepoint The codepoint to find the block for + * @param offset_out Output parameter for the codepoint's index within + * the block; that is, the offset from the block's + * lowest codepoint; may be `NULL` + * @return The block containing the codepoint, + * `NULL` if not found + * + * If `NULL` is returned, `*offset_out` will not be set + * + * @seealso libcmap_find_in_no_block + * @seealso libcmap_find_script + */ const struct libcmap_block *libcmap_find_block(uint32_t codepoint, size_t *offset_out); + +/** + * Find the primary script, as assigned and divided by Unicode, + * of a specified codepoint + * + * @param codepoint The codepoint to find the primary script for + * @param offset_out Output parameter for the codepoint's index within + * the script; 
which is the sum of the number of + * codepoints in all ranges within the script up to but + * excluding `*subrange_out`, plus + * `codepoint - script->ranges[*subrange_out].first`, + * where `script` is the returned script; + * may be `NULL` + * @param subrange_out Output parameter for the index of the range in + * the returned script which covers the codepoint; + * may be `NULL` + * @return The codepoint's primary script, `NULL` if not found + * (should only be possible if the codepoint is + * greater than `LIBCMAP_ULTIMATE_CODEPOINT`) + * + * If `NULL` is returned, `*offset_out` and `*subrange_out` will not be set + * + * @seealso libcmap_find_block + */ const struct libcmap_script *libcmap_find_script(uint32_t codepoint, size_t *offset_out, size_t *subrange_out); +/** + * Print a string representing a Unicode codepoint range + * + * @param buf Output buffer to print to, must be sufficiently large; + * or `NULL` to only measure the length of the string + * @param range The range to print a representation of + * @param endash Desired delimiter between the first and the last + * codepoint, or `NULL` for the default (not ASCII) + * @return The number of bytes in the string, excluding the + * terminating NUL byte + * + * A NUL byte will be added to the end of the string, but this byte + * will not be counted in the return value + */ int libcmap_sprint_range(char *buf, const struct libcmap_range *range, const char *endash); + +/** + * Print a string representing a Unicode codepoint range + * + * @param buf Output buffer to print to + * @param bufsize The maximum number of bytes to write to `buf`, + * including the terminating NUL byte + * @param range The range to print a representation of + * @param endash Desired delimiter between the first and the last + * codepoint, or `NULL` for the default (not ASCII) + * @return The number of bytes in the string, excluding the + * terminating NUL byte + * + * A NUL byte will be added to the end of the string, but this byte + * will not be counted in 
the return value + * + * If `bufsize` is not sufficiently large, the string printed to + * `buf` will be truncated to `bufsize - 1U` bytes, and a NUL + * byte will be added to the end (at `buf[bufsize - 1U]`); + * however if `bufsize` is 0, nothing is written to `buf`. + * The function will always return the size of the full string, + * even if it was truncated; therefore the return value can + * be equal to or exceed `bufsize`. + * + * NB! If `buf` is `NULL`, `bufsize` shall be 0, as the function + * does not check if `buf` is `NULL` + */ int libcmap_snprint_range(char *buf, size_t bufsize, const struct libcmap_range *range, const char *endash); |
