diff options
| author | Mattias Andrée <m@maandree.se> | 2025-02-23 15:48:02 +0100 |
|---|---|---|
| committer | Mattias Andrée <m@maandree.se> | 2025-02-23 15:48:02 +0100 |
| commit | 120d0259be2e9796757572ea5b7e43ef5715402e (patch) | |
| tree | 80a8c2af9e01c5702d6a6c3ed747bbad16467104 | |
| parent | First commit (diff) | |
| download | libcharsets-master.tar.gz libcharsets-master.tar.bz2 libcharsets-master.tar.xz | |
Signed-off-by: Mattias Andrée <m@maandree.se>
| -rw-r--r-- | TODO | 2 | ||||
| -rw-r--r-- | maps/SYNTAX | 52 | ||||
| -rw-r--r-- | maps/iso-ir-014 | 5 | ||||
| -rw-r--r-- | maps/ucs-2-be | 1 | ||||
| -rw-r--r-- | maps/ucs-2-le | 1 | ||||
| -rw-r--r-- | maps/utf-1 | 35 | ||||
| -rw-r--r-- | maps/utf-16-be | 4 | ||||
| -rw-r--r-- | maps/utf-16-le | 4 | ||||
| -rw-r--r-- | maps/utf-32-be | 1 | ||||
| -rw-r--r-- | maps/utf-32-le | 1 | ||||
| -rw-r--r-- | maps/utf-8 | 26 | ||||
| -rw-r--r-- | maps/wtf-8 | 56 |
12 files changed, 188 insertions, 0 deletions
@@ -46,3 +46,5 @@ Iran System encoding Kamenický encoding MIK ELOT 927 + +IBM-437 has a number of characters that can be mapped into more than one Unicode code point diff --git a/maps/SYNTAX b/maps/SYNTAX new file mode 100644 index 0000000..a3ad110 --- /dev/null +++ b/maps/SYNTAX @@ -0,0 +1,52 @@ +"include" <file> + Include the contents of <file>. + +"unmap" <sequence range> + Remove mapping for byte sequence range. + +"remap" <sequence range> "to" "U+"<code point> ["using" <endian>] + Like "map", except assume all sequences are already defined + and redefine them instead. + +["readonly"] "map" <sequence range> "to" "U+"<code point> ["using" <endian>] + Map a byte sequence range to a code point range. + If a sequence is already defined, fail. + <code point> (hexadecimal) shall be the code point for the + lowest valued sequence in <sequence range>. + If multiple byte sequences, <endian> may be "big-endian" or "little-endian" + to specify the significance of the bytes in the sequence, alternative + value calculations can be specified in <sequence range>. + If "readonly" is present, the mapping should only be used when decoding, + not when encoding. For each mapped code point, there must exist exactly + one mapping that is not "readonly". + +<sequence range> + <byte> + Single byte range. + <byte> [<value calculation>] (", " <byte> [<value calculation>]) ... + A sequence of bytes, in storage order. + <value calculation> shall appear on all or no <byte>, and + shall appear if and only if <endian> was not used but at + least two <byte>s are valued (have multiple legal values). + +<byte> + <value> + Single byte, not valued. + <value>(["-"|"+"]<value>)... + The <value>s should be ordered by mapping value. + The first <value> is the lowest valued byte in the domain (valued 0). 
+ "-" creates a range from the previous <value> up to and including + the <value> behind it. + "+" creates a domain disjunction, skipping over the values + strictly between (that is, excluding) the <value>s + on either side of it. + +<value calculation> + "<<<" <positive decimal> + Set the relative significance of the byte, 0 is least significant. + +<value> + <positive decimal> + 0x<positive hexadecimal> + +\ may be used at the end of a line to replace the <newline> with a space. diff --git a/maps/iso-ir-014 b/maps/iso-ir-014 new file mode 100644 index 0000000..1e660e3 --- /dev/null +++ b/maps/iso-ir-014 @@ -0,0 +1,5 @@ +map 0x20-0x5B to U+0020 +map 0x5C to U+00A5 +map 0x5D-0x7D to U+005D +map 0x7E to U+00AF +map 0x7F to U+007F diff --git a/maps/ucs-2-be b/maps/ucs-2-be new file mode 100644 index 0000000..97719bb --- /dev/null +++ b/maps/ucs-2-be @@ -0,0 +1 @@ +map 0x00-0xFF, 0x00-0xFF to U+0000 using big-endian diff --git a/maps/ucs-2-le b/maps/ucs-2-le new file mode 100644 index 0000000..1ed78ee --- /dev/null +++ b/maps/ucs-2-le @@ -0,0 +1 @@ +map 0x00-0xFF, 0x00-0xFF to U+0000 using little-endian diff --git a/maps/utf-1 b/maps/utf-1 new file mode 100644 index 0000000..f81a7b6 --- /dev/null +++ b/maps/utf-1 @@ -0,0 +1,35 @@ +map 0x00-0x9F to U+0000 + +map 0xA0,\ + 0xA0-0xFF to U+00A0 + +map 0xA1-0xF5,\ + 0x21-0x7E+0xA0-0xFF to U+0100 using big-endian + +map 0xF6-0xFB,\ + 0x21-0x7E+0xA0-0xFF,\ + 0x21-0x7E+0xA0-0xFF to U+4016 using big-endian + +map 0xFC,\ + 0x21-0x7E+0xA0-0xFF,\ + 0x21-0x7E+0xA0-0xFF,\ + 0x21-0x7E+0xA0-0xFF,\ + 0x21-0x7E+0xA0-0xFF to U+00038E2E using big-endian + +map 0xFD,\ + 0x21-0x7E+0xA0-0xBC,\ + 0x21-0x7E+0xA0-0xFF,\ + 0x21-0x7E+0xA0-0xFF,\ + 0x21-0x7E+0xA0-0xFF to U+4DB0F63E using big-endian + +map 0xFD, 0xBC,\ + 0x21-0x2A,\ + 0x21-0x7E+0xA0-0xFF,\ + 0x21-0x7E+0xA0-0xFF to U+7FFA2566 using big-endian + +map 0xFD, 0xBC, 0x2B,\ + 0x21-0x7E+0xA0-0xB8,\ + 0x21-0x7E+0xA0-0xFF to U+7FFFA78E using big-endian + +map 0xFD, 0xBC, 0x2B, 0xB9,\ + 
0x21-0x40 to U+7FFFFFE0 diff --git a/maps/utf-16-be b/maps/utf-16-be new file mode 100644 index 0000000..7199108 --- /dev/null +++ b/maps/utf-16-be @@ -0,0 +1,4 @@ +map 0x00-0xD7, 0x00-0xFF to U+0000 using big-endian +map 0xE0-0xFF, 0x00-0xFF to U+E000 using big-endian +map 0xD8-0xDB, 0x00-0xFF,\ + 0xDC-0xDF, 0x00-0xFF to U+010000 using big-endian diff --git a/maps/utf-16-le b/maps/utf-16-le new file mode 100644 index 0000000..0985f1f --- /dev/null +++ b/maps/utf-16-le @@ -0,0 +1,4 @@ +map 0x00-0xFF, 0x00-0xD7 to U+0000 using little-endian +map 0x00-0xFF, 0xE0-0xFF to U+E000 using little-endian +map 0x00-0xFF <<< 2, 0xD8-0xDB <<< 3,\ + 0x00-0xFF <<< 0, 0xDC-0xDF <<< 1 to U+010000 diff --git a/maps/utf-32-be b/maps/utf-32-be new file mode 100644 index 0000000..27a74ec --- /dev/null +++ b/maps/utf-32-be @@ -0,0 +1 @@ +map 0x00-0x7F, 0x00-0xFF, 0x00-0xFF, 0x00-0xFF to U+00000000 using big-endian diff --git a/maps/utf-32-le b/maps/utf-32-le new file mode 100644 index 0000000..994884e --- /dev/null +++ b/maps/utf-32-le @@ -0,0 +1 @@ +map 0x00-0xFF, 0x00-0xFF, 0x00-0xFF, 0x00-0x7F to U+00000000 using little-endian diff --git a/maps/utf-8 b/maps/utf-8 new file mode 100644 index 0000000..f707552 --- /dev/null +++ b/maps/utf-8 @@ -0,0 +1,26 @@ +map 0x00-0x7F to U+0000 + +map 0xC2-0xDF,\ + 0x80-0xBF to U+0080 using big-endian + +map 0xE0-0xEF,\ + 0x80-0xBF,\ + 0x80-0xBF to U+0800 using big-endian + +map 0xF0-0xF7,\ + 0x80-0xBF,\ + 0x80-0xBF,\ + 0x80-0xBF to U+010000 using big-endian + +map 0xF8-0xFB,\ + 0x80-0xBF,\ + 0x80-0xBF,\ + 0x80-0xBF,\ + 0x80-0xBF to U+200000 using big-endian + +map 0xFC-0xFD,\ + 0x80-0xBF,\ + 0x80-0xBF,\ + 0x80-0xBF,\ + 0x80-0xBF,\ + 0x80-0xBF to U+04000000 using big-endian diff --git a/maps/wtf-8 b/maps/wtf-8 new file mode 100644 index 0000000..43757ee --- /dev/null +++ b/maps/wtf-8 @@ -0,0 +1,56 @@ +map 0x00-0x7F to U+0000 + +map 0xC2-0xDF,\ + 0x80-0xBF to U+0080 using big-endian + +map 0xE0-0xEC,\ + 0x80-0xBF,\ + 0x80-0xBF to U+0800 using 
big-endian + +map 0xED,\ + 0x80-0x9F,\ + 0x80-0xBF to U+D000 using big-endian + +map 0xED,\ + 0xA0-0xAF,\ + 0x80-0xBF,\ + 0xED,\ + 0xB0-0xBF,\ + 0x80-0xBF to U+010000 using big-endian + +map 0xEE-0xEF,\ + 0x80-0xBF,\ + 0x80-0xBF to U+E000 using big-endian + +readonly map 0xF0-0xF3,\ + 0x80-0xBF,\ + 0x80-0xBF,\ + 0x80-0xBF to U+010000 using big-endian + +readonly map 0xF4,\ + 0x80-0x8F,\ + 0x80-0xBF,\ + 0x80-0xBF to U+100000 using big-endian + +map 0xF4,\ + 0x90-0xBF,\ + 0x80-0xBF,\ + 0x80-0xBF to U+110000 using big-endian + +map 0xF5-0xF7,\ + 0x80-0xBF,\ + 0x80-0xBF,\ + 0x80-0xBF to U+140000 using big-endian + +map 0xF8-0xFB,\ + 0x80-0xBF,\ + 0x80-0xBF,\ + 0x80-0xBF,\ + 0x80-0xBF to U+200000 using big-endian + +map 0xFC-0xFD,\ + 0x80-0xBF,\ + 0x80-0xBF,\ + 0x80-0xBF,\ + 0x80-0xBF,\ + 0x80-0xBF to U+04000000 using big-endian |
