add support for BOM-determined-endian UCS2, UTF-16, and UTF-32 to iconv

previously, charset names that did not specify an endianness were always interpreted as big endian. unicode specifies that UTF-16 and UTF-32 have BOM-determined endianness if a BOM is present, and are otherwise big endian. since commit 5b546faa67544af395d6407553762b37e9711157 added support for stateful encodings, it is now possible to implement BOM support via the conversion descriptor state. for conversions to these charsets, the output is always big endian and does not have a BOM.
author: Rich Felker <dalias@aerifal.cx> 2017-12-18 22:08:54 -0500
committer: Rich Felker <dalias@aerifal.cx> 2017-12-18 22:31:18 -0500
commit: 95c6044e2ae85846330814c4ac5ebf4102dbe02c (patch)
tree: af2c6f65ebcb37cca79713f72a678faf54117e9f /src
parent: 9d4d0ee41b06acf68dac40332f53be7bfbde7404 (diff)
download: musl-95c6044e2ae85846330814c4ac5ebf4102dbe02c.tar.gz
1 files changed, 40 insertions, 3 deletions
diff --git a/src/locale/iconv.c b/src/locale/iconv.c
index 1784dc9d..c5dd122f 100644
--- a/src/locale/iconv.c
+++ b/src/locale/iconv.c
@@ -16,6 +16,9 @@
 #define WCHAR_T     0306
 #define US_ASCII    0307
 #define UTF_8       0310
+#define UTF_16      0312
+#define UTF_32      0313
+#define UCS2        0314
 #define EUC_JP      0320
 #define SHIFT_JIS   0321
 #define ISO2022_JP  0322
@@ -35,13 +38,16 @@
 static const unsigned char charmaps[] =
 "utf8\0char\0\0\310"
 "wchart\0\0\306"
-"ucs2\0ucs2be\0\0\304"
+"ucs2be\0\0\304"
 "ucs2le\0\0\305"
-"utf16\0utf16be\0\0\302"
+"utf16be\0\0\302"
 "utf16le\0\0\301"
-"ucs4\0ucs4be\0utf32\0utf32be\0\0\300"
+"ucs4be\0utf32be\0\0\300"
 "ucs4le\0utf32le\0\0\303"
 "ascii\0usascii\0iso646\0iso646us\0\0\307"
+"utf16\0\0\312"
+"ucs4\0utf32\0\0\313"
+"ucs2\0\0\314"
 "eucjp\0\0\320"
 "shiftjis\0sjis\0\0\321"
 "iso2022jp\0\0\322"
@@ -145,6 +151,9 @@ iconv_t iconv_open(const char *to, const char *from)
 	iconv_t cd = combine_to_from(t, f);
 
 	switch (charmaps[f]) {
+	case UTF_16:
+	case UTF_32:
+	case UCS2:
 	case ISO2022_JP:
 		scd = malloc(sizeof *scd);
 		if (!scd) return (iconv_t)-1;
@@ -285,6 +294,31 @@ size_t iconv(iconv_t cd, char **restrict in, size_t *restrict inb, char **restri
 				c = ((c-0xd7c0)<<10) + (d-0xdc00);
 			}
 			break;
+		case UCS2:
+		case UTF_16:
+			l = 0;
+			if (!scd->state) {
+				if (*inb < 2) goto starved;
+				c = get_16((void *)*in, 0);
+				scd->state = type==UCS2
+					? c==0xfffe ? UCS2LE : UCS2BE
+					: c==0xfffe ? UTF_16LE : UTF_16BE;
+				if (c == 0xfffe || c == 0xfeff)
+					l = 2;
+			}
+			type = scd->state;
+			continue;
+		case UTF_32:
+			l = 0;
+			if (!scd->state) {
+				if (*inb < 4) goto starved;
+				c = get_32((void *)*in, 0);
+				scd->state = c==0xfffe0000 ? UTF_32LE : UTF_32BE;
+				if (c == 0xfffe0000 || c == 0xfeff)
+					l = 4;
+			}
+			type = scd->state;
+			continue;
 		case SHIFT_JIS:
 			if (c < 128) break;
 			if (c-0xa1 <= 0xdf-0xa1) {
@@ -589,8 +623,11 @@ size_t iconv(iconv_t cd, char **restrict in, size_t *restrict inb, char **restri
 			*(*out)++ = 'B';
 			*outb -= 8;
 			break;
+		case UCS2:
+			totype = UCS2BE;
 		case UCS2BE:
 		case UCS2LE:
+		case UTF_16:
 		case UTF_16BE:
 		case UTF_16LE:
 			if (c < 0x10000 || type-UCS2BE < 2U) {
author	Rich Felker <dalias@aerifal.cx>	2017-12-18 22:08:54 -0500
committer	Rich Felker <dalias@aerifal.cx>	2017-12-18 22:31:18 -0500
commit	95c6044e2ae85846330814c4ac5ebf4102dbe02c (patch)
tree	af2c6f65ebcb37cca79713f72a678faf54117e9f /src
parent	9d4d0ee41b06acf68dac40332f53be7bfbde7404 (diff)
download	musl-95c6044e2ae85846330814c4ac5ebf4102dbe02c.tar.gz