aboutsummaryrefslogtreecommitdiffstats
path: root/libcmap.h
diff options
context:
space:
mode:
authorMattias Andrée <m@maandree.se>2025-12-21 15:10:23 +0100
committerMattias Andrée <m@maandree.se>2025-12-21 15:10:23 +0100
commit768c066357c3b3a9cde618a4310138428ddeddcc (patch)
tree132a18e5766c4c14893820b950384f33ccbea7d2 /libcmap.h
parentUse LIBCMAP_RANGE_SIZE, and do not *offset_out and *subrange_out when the codepoint is not found (diff)
downloadlibcmap-768c066357c3b3a9cde618a4310138428ddeddcc.tar.gz
libcmap-768c066357c3b3a9cde618a4310138428ddeddcc.tar.bz2
libcmap-768c066357c3b3a9cde618a4310138428ddeddcc.tar.xz
Add documentation
Signed-off-by: Mattias Andrée <m@maandree.se>
Diffstat (limited to '')
-rw-r--r--libcmap.h216
1 files changed, 216 insertions, 0 deletions
diff --git a/libcmap.h b/libcmap.h
index c02857b..324f790 100644
--- a/libcmap.h
+++ b/libcmap.h
@@ -14,42 +14,258 @@
#endif
+/**
+ * The last (highest) valid Unicode codepoint, U+10FFFF
+ */
#define LIBCMAP_ULTIMATE_CODEPOINT UINT32_C(0x10FFFF)
+
+/**
+ * Initializer for the range of all valid Unicode codepoints,
+ * including assigned, unassigned, reserved, and surrogates.
+ */
#define LIBCMAP_UNIVERSE_RANGE {0, LIBCMAP_ULTIMATE_CODEPOINT}
+/**
+ * Calculate the number of Unicode codepoints in a range
+ *
+ * @param RANGE:const struct libcmap_range * The codepoint range (evaluated twice)
+ * @return :size_t The number of covered codepoints
+ */
#define LIBCMAP_RANGE_SIZE(RANGE) ((size_t)((RANGE)->last - (RANGE)->first + 1U))
+/**
+ * Contiguous range of Unicode codepoints; both endpoints are inclusive
+ *
+ * @seealso LIBCMAP_RANGE_SIZE
+ * @seealso libcmap_sprint_range
+ */
struct libcmap_range {
+ /**
+ * The lowest codepoint within the range
+ */
uint32_t first;
+
+ /**
+ * The highest codepoint within the range
+ */
uint32_t last;
};
+/**
+ * Unicode character block: a named, contiguous range of codepoints
+ *
+ * @seealso libcmap_block_list
+ * @seealso libcmap_find_block
+ */
struct libcmap_block {
+ /**
+ * The name of the block
+ */
const char *name;
+
+ /**
+ * The range of codepoints the block covers
+ */
struct libcmap_range range;
};
+/**
+ * Script and the codepoints used for it
+ *
+ * @seealso libcmap_script_list
+ * @seealso libcmap_find_script
+ */
struct libcmap_script {
+ /**
+ * The name of the script
+ *
+ * Replace any space with an underscore and remove
+ * all other non-alphanumeric characters to get
+ * the name as declared by Unicode
+ */
const char *name;
+
+ /**
+ * Sorted list of disjoint ranges of codepoints
+ * used for the script
+ *
+ * The list is in ascending order
+ *
+ * There is a non-zero gap of codepoints between
+ * each pair of consecutive ranges
+ */
const struct libcmap_range *ranges;
+
+ /**
+ * The number of elements in `.ranges`
+ */
size_t nranges;
};
+/**
+ * Listing of all Unicode codepoints not assigned to any block
+ *
+ * This listing can potentially be empty
+ *
+ * This listing is named "No Block", in accordance with the
+ * Unicode specification, which calls it "No_Block"
+ *
+ * @seealso libcmap_block_list
+ * @seealso libcmap_find_in_no_block
+ */
extern const struct libcmap_script libcmap_no_block;
+
+/**
+ * Listing of all Unicode blocks
+ *
+ * The blocks are sorted in ascending order by name
+ *
+ * @seealso libcmap_block_list_size
+ * @seealso libcmap_no_block
+ * @seealso libcmap_find_block
+ * @seealso libcmap_script_list
+ */
extern const struct libcmap_block *const libcmap_block_list;
+
+/**
+ * The number of elements in `libcmap_block_list`
+ */
extern const size_t libcmap_block_list_size;
+/**
+ * Listing of all so-called scripts as divided by Unicode
+ *
+ * The scripts are sorted in ascending order by name,
+ * except that "Unknown" is added as the last script
+ *
+ * It contains three special "scripts": "Common", "Inherited",
+ * and "Unknown". "Unknown" is an implicitly listed script
+ * in Unicode, assigned to any codepoint that is not covered
+ * by the other so-called scripts; therefore it has been added
+ * to the end of the list; however, future versions of this
+ * library may include it in order.
+ *
+ * @seealso libcmap_script_list_size
+ * @seealso libcmap_find_script
+ * @seealso libcmap_block_list
+ */
extern const struct libcmap_script *const libcmap_script_list;
+
+/**
+ * The number of elements in `libcmap_script_list`
+ */
extern const size_t libcmap_script_list_size;
+/**
+ * Find a codepoint within `libcmap_no_block`
+ *
+ * @param codepoint The codepoint to locate
+ * @param offset_out Output parameter for the codepoint's index within
+ * `libcmap_no_block`; which is the sum of the number
+ * of codepoints in all ranges within `libcmap_no_block`
+ * up to but excluding `*subrange_out`, plus
+ * `codepoint - libcmap_no_block.ranges[*subrange_out].first`;
+ * may be `NULL`
+ * @param subrange_out Output parameter for the index of the range in
+ * `libcmap_no_block` which covers the codepoint;
+ * may be `NULL`
+ * @return 1 if `libcmap_no_block` covers the codepoint,
+ * 0 otherwise
+ *
+ * If 0 is returned, `*offset_out` and `*subrange_out` will not be set
+ *
+ * @seealso libcmap_find_block
+ * @seealso libcmap_find_script
+ */
int libcmap_find_in_no_block(uint32_t codepoint, size_t *offset_out, size_t *subrange_out);
+
+/**
+ * Find the Unicode block that covers a specified codepoint
+ *
+ * @param codepoint The codepoint to find the block for
+ * @param offset_out Output parameter for the codepoint's index within
+ * the block; that is, the offset from the block's
+ * lowest codepoint; may be `NULL`
+ * @return The block containing the codepoint,
+ * `NULL` if not found
+ *
+ * If `NULL` is returned, `*offset_out` will not be set
+ *
+ * @seealso libcmap_find_in_no_block
+ * @seealso libcmap_find_script
+ */
const struct libcmap_block *libcmap_find_block(uint32_t codepoint, size_t *offset_out);
+
+/**
+ * Find the primary script, as assigned and divided by Unicode,
+ * of a specified codepoint
+ *
+ * @param codepoint The codepoint to find the primary script for
+ * @param offset_out Output parameter for the codepoint's index within
+ * the script; which is the sum of the number of
+ * codepoints in all ranges within the script up to but
+ * excluding `*subrange_out`, plus
+ * `codepoint - script->ranges[*subrange_out].first`,
+ * where `script` is the returned script;
+ * may be `NULL`
+ * @param subrange_out Output parameter for the index of the range in
+ * the returned script which covers the codepoint;
+ * may be `NULL`
+ * @return The codepoint's primary script, `NULL` if not found
+ * (should only be possible if the codepoint is
+ * greater than `LIBCMAP_ULTIMATE_CODEPOINT`)
+ *
+ * If `NULL` is returned, `*offset_out` and `*subrange_out` will not be set
+ *
+ * @seealso libcmap_find_block
+ */
const struct libcmap_script *libcmap_find_script(uint32_t codepoint, size_t *offset_out, size_t *subrange_out);
+/**
+ * Print a string representing a Unicode codepoint range
+ *
+ * @param buf Output buffer to print to, must be sufficiently large;
+ * or `NULL` to only measure the length of the string
+ * @param range The range to print a representation of
+ * @param endash Desired delimiter between the first and the last
+ * codepoint, or `NULL` for the default (not ASCII)
+ * @return The number of bytes in the string, excluding the
+ * terminating NUL byte
+ *
+ * A NUL byte will be added to the end of the string, but this byte
+ * will not be counted in the return value
+ */
int libcmap_sprint_range(char *buf, const struct libcmap_range *range, const char *endash);
+
+/**
+ * Print a string representing a Unicode codepoint range
+ *
+ * @param buf Output buffer to print to
+ * @param bufsize The maximum number of bytes to write to `buf`,
+ * including the terminating NUL byte
+ * @param range The range to print a representation of
+ * @param endash Desired delimiter between the first and the last
+ * codepoint, or `NULL` for the default (not ASCII)
+ * @return The number of bytes in the string, excluding the
+ * terminating NUL byte
+ *
+ * A NUL byte will be added to the end of the string, but this byte
+ * will not be counted in the return value
+ *
+ * If `bufsize` is not sufficiently large, the string printed to
+ * `buf` will be truncated to `bufsize - 1U` bytes, and a NUL
+ * byte will be added to the end (at `buf[bufsize - 1U]`);
+ * however, if `bufsize` is 0, nothing is written to `buf`.
+ * The function will always return the size of the full string,
+ * even if it was truncated; therefore the return value can
+ * be equal to or exceed `bufsize`.
+ *
+ * NB! if `buf` is `NULL`, `bufsize` shall be 0, as the function
+ * does not check if `buf` is `NULL`
+ */
int libcmap_snprint_range(char *buf, size_t bufsize, const struct libcmap_range *range, const char *endash);