summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--TODO2
-rw-r--r--maps/SYNTAX52
-rw-r--r--maps/iso-ir-0145
-rw-r--r--maps/ucs-2-be1
-rw-r--r--maps/ucs-2-le1
-rw-r--r--maps/utf-135
-rw-r--r--maps/utf-16-be4
-rw-r--r--maps/utf-16-le4
-rw-r--r--maps/utf-32-be1
-rw-r--r--maps/utf-32-le1
-rw-r--r--maps/utf-826
-rw-r--r--maps/wtf-856
12 files changed, 188 insertions, 0 deletions
diff --git a/TODO b/TODO
index 00348ea..1494220 100644
--- a/TODO
+++ b/TODO
@@ -46,3 +46,5 @@ Iran System encoding
Kamenický encoding
MIK
ELOT 927
+
+IBM-437 has a number of characters that can be mapped into more than one Unicode code point
diff --git a/maps/SYNTAX b/maps/SYNTAX
new file mode 100644
index 0000000..a3ad110
--- /dev/null
+++ b/maps/SYNTAX
@@ -0,0 +1,52 @@
+"include" <file>
+ Include the contents of <file>.
+
+"unmap" <sequence range>
+ Remove mapping for byte sequence range.
+
+"remap" <sequence range> "to" "U+"<code point> ["using" <endian>]
+ Like "map", except that every sequence in the range is assumed to be
+ already defined, and is redefined instead.
+
+["readonly"] "map" <sequence range> "to" "U+"<code point> ["using" <endian>]
+ Map a byte sequence range to a code point range.
+ If a sequence is already defined, fail.
+ <code point> (hexadecimal) shall be the code point for the
+ lowest-valued sequence in <sequence range>.
+ For multi-byte sequences, <endian> may be "big-endian" or "little-endian"
+ to specify the significance of the bytes in the sequence; alternatively,
+ value calculations can be specified in <sequence range>.
+ If "readonly" is present, the mapping should only be used when decoding,
+ not when encoding. For each mapped code point, there must exist exactly
+ one mapping that is not "readonly".
+
+<sequence range>
+ <byte>
+ Single byte range.
+ <byte> [<value calculation>] (", " <byte> [<value calculation>]) ...
+ A sequence of bytes, in storage order.
+ <value calculation> shall appear on all or no <byte>, and
+ shall appear if and only if <endian> was not used but at
+ least two <byte>s are valued (have multiple legal values).
+
+<byte>
+ <value>
+ Single byte, not valued.
+ <value>(["-"|"+"]<value>)...
+ The <value>s should be ordered by mapping value.
+ The first <value> is the lowest-valued byte in the domain (valued 0).
+ "-" creates a range from the previous <value> up to and including
+ the <value> after it.
+ "+" creates a domain disjunction, skipping over the values that
+ lie strictly between the <value>s on either side of it; those
+ two <value>s themselves remain in the domain.
+
+<value calculation>
+ "<<<" <positive decimal>
+ Set the relative significance of the byte; 0 is least significant.
+
+<value>
+ <positive decimal>
+ 0x<positive hexadecimal>
+
+\ may be used at the end of a line to replace the <newline> with a space.
diff --git a/maps/iso-ir-014 b/maps/iso-ir-014
new file mode 100644
index 0000000..1e660e3
--- /dev/null
+++ b/maps/iso-ir-014
@@ -0,0 +1,5 @@
+map 0x20-0x5B to U+0020
+map 0x5C to U+00A5
+map 0x5D-0x7D to U+005D
+map 0x7E to U+00AF
+map 0x7F to U+007F
diff --git a/maps/ucs-2-be b/maps/ucs-2-be
new file mode 100644
index 0000000..97719bb
--- /dev/null
+++ b/maps/ucs-2-be
@@ -0,0 +1 @@
+map 0x00-0xFF, 0x00-0xFF to U+0000 using big-endian
diff --git a/maps/ucs-2-le b/maps/ucs-2-le
new file mode 100644
index 0000000..1ed78ee
--- /dev/null
+++ b/maps/ucs-2-le
@@ -0,0 +1 @@
+map 0x00-0xFF, 0x00-0xFF to U+0000 using little-endian
diff --git a/maps/utf-1 b/maps/utf-1
new file mode 100644
index 0000000..f81a7b6
--- /dev/null
+++ b/maps/utf-1
@@ -0,0 +1,35 @@
+map 0x00-0x9F to U+0000
+
+map 0xA0,\
+ 0xA0-0xFF to U+00A0
+
+map 0xA1-0xF5,\
+ 0x21-0x7E+0xA0-0xFF to U+0100 using big-endian
+
+map 0xF6-0xFB,\
+ 0x21-0x7E+0xA0-0xFF,\
+ 0x21-0x7E+0xA0-0xFF to U+4016 using big-endian
+
+map 0xFC,\
+ 0x21-0x7E+0xA0-0xFF,\
+ 0x21-0x7E+0xA0-0xFF,\
+ 0x21-0x7E+0xA0-0xFF,\
+ 0x21-0x7E+0xA0-0xFF to U+00038E2E using big-endian
+
+map 0xFD,\
+ 0x21-0x7E+0xA0-0xBC,\
+ 0x21-0x7E+0xA0-0xFF,\
+ 0x21-0x7E+0xA0-0xFF,\
+ 0x21-0x7E+0xA0-0xFF to U+4DB0F63E using big-endian
+
+map 0xFD, 0xBC,\
+ 0x21-0x2A,\
+ 0x21-0x7E+0xA0-0xFF,\
+ 0x21-0x7E+0xA0-0xFF to U+7FFA2566 using big-endian
+
+map 0xFD, 0xBC, 0x2B,\
+ 0x21-0x7E+0xA0-0xB8,\
+ 0x21-0x7E+0xA0-0xFF to U+7FFFA78E using big-endian
+
+map 0xFD, 0xBC, 0x2B, 0xB9,\
+ 0x21-0x40 to U+7FFFFFE0
diff --git a/maps/utf-16-be b/maps/utf-16-be
new file mode 100644
index 0000000..7199108
--- /dev/null
+++ b/maps/utf-16-be
@@ -0,0 +1,4 @@
+map 0x00-0xD7, 0x00-0xFF to U+0000 using big-endian
+map 0xE0-0xFF, 0x00-0xFF to U+E000 using big-endian
+map 0xD8-0xDB, 0x00-0xFF,\
+ 0xDC-0xDF, 0x00-0xFF to U+010000 using big-endian
diff --git a/maps/utf-16-le b/maps/utf-16-le
new file mode 100644
index 0000000..0985f1f
--- /dev/null
+++ b/maps/utf-16-le
@@ -0,0 +1,4 @@
+map 0x00-0xFF, 0x00-0xD7 to U+0000 using little-endian
+map 0x00-0xFF, 0xE0-0xFF to U+E000 using little-endian
+map 0x00-0xFF <<< 2, 0xD8-0xDB <<< 3,\
+ 0x00-0xFF <<< 0, 0xDC-0xDF <<< 1 to U+010000
diff --git a/maps/utf-32-be b/maps/utf-32-be
new file mode 100644
index 0000000..27a74ec
--- /dev/null
+++ b/maps/utf-32-be
@@ -0,0 +1 @@
+map 0x00-0x7F, 0x00-0xFF, 0x00-0xFF, 0x00-0xFF to U+00000000 using big-endian
diff --git a/maps/utf-32-le b/maps/utf-32-le
new file mode 100644
index 0000000..994884e
--- /dev/null
+++ b/maps/utf-32-le
@@ -0,0 +1 @@
+map 0x00-0xFF, 0x00-0xFF, 0x00-0xFF, 0x00-0x7F to U+00000000 using little-endian
diff --git a/maps/utf-8 b/maps/utf-8
new file mode 100644
index 0000000..f707552
--- /dev/null
+++ b/maps/utf-8
@@ -0,0 +1,26 @@
+map 0x00-0x7F to U+0000
+
+map 0xC2-0xDF,\
+ 0x80-0xBF to U+0080 using big-endian
+
+map 0xE0-0xEF,\
+ 0x80-0xBF,\
+ 0x80-0xBF to U+0800 using big-endian
+
+map 0xF0-0xF7,\
+ 0x80-0xBF,\
+ 0x80-0xBF,\
+ 0x80-0xBF to U+010000 using big-endian
+
+map 0xF8-0xFB,\
+ 0x80-0xBF,\
+ 0x80-0xBF,\
+ 0x80-0xBF,\
+ 0x80-0xBF to U+200000 using big-endian
+
+map 0xFC-0xFD,\
+ 0x80-0xBF,\
+ 0x80-0xBF,\
+ 0x80-0xBF,\
+ 0x80-0xBF,\
+ 0x80-0xBF to U+04000000 using big-endian
diff --git a/maps/wtf-8 b/maps/wtf-8
new file mode 100644
index 0000000..43757ee
--- /dev/null
+++ b/maps/wtf-8
@@ -0,0 +1,56 @@
+map 0x00-0x7F to U+0000
+
+map 0xC2-0xDF,\
+ 0x80-0xBF to U+0080 using big-endian
+
+map 0xE0-0xEC,\
+ 0x80-0xBF,\
+ 0x80-0xBF to U+0800 using big-endian
+
+map 0xED,\
+ 0x80-0x9F,\
+ 0x80-0xBF to U+D000 using big-endian
+
+map 0xED,\
+ 0xA0-0xAF,\
+ 0x80-0xBF,\
+ 0xED,\
+ 0xB0-0xBF,\
+ 0x80-0xBF to U+010000 using big-endian
+
+map 0xEE-0xEF,\
+ 0x80-0xBF,\
+ 0x80-0xBF to U+E000 using big-endian
+
+readonly map 0xF0-0xF3,\
+ 0x80-0xBF,\
+ 0x80-0xBF,\
+ 0x80-0xBF to U+010000 using big-endian
+
+readonly map 0xF4,\
+ 0x80-0x8F,\
+ 0x80-0xBF,\
+ 0x80-0xBF to U+100000 using big-endian
+
+map 0xF4,\
+ 0x90-0xBF,\
+ 0x80-0xBF,\
+ 0x80-0xBF to U+110000 using big-endian
+
+map 0xF5-0xF7,\
+ 0x80-0xBF,\
+ 0x80-0xBF,\
+ 0x80-0xBF to U+140000 using big-endian
+
+map 0xF8-0xFB,\
+ 0x80-0xBF,\
+ 0x80-0xBF,\
+ 0x80-0xBF,\
+ 0x80-0xBF to U+200000 using big-endian
+
+map 0xFC-0xFD,\
+ 0x80-0xBF,\
+ 0x80-0xBF,\
+ 0x80-0xBF,\
+ 0x80-0xBF,\
+ 0x80-0xBF to U+04000000 using big-endian