unicode.c (9406B)
1/* 2 * unicode.c 3 * 4 * PURPOSE 5 * Routines for converting between UTF-8 and OSTA Compressed Unicode. 6 * Also handles filename mangling 7 * 8 * DESCRIPTION 9 * OSTA Compressed Unicode is explained in the OSTA UDF specification. 10 * http://www.osta.org/ 11 * UTF-8 is explained in the IETF RFC XXXX. 12 * ftp://ftp.internic.net/rfc/rfcxxxx.txt 13 * 14 * COPYRIGHT 15 * This file is distributed under the terms of the GNU General Public 16 * License (GPL). Copies of the GPL can be obtained from: 17 * ftp://prep.ai.mit.edu/pub/gnu/GPL 18 * Each contributing author retains all rights to their own work. 19 */ 20 21#include "udfdecl.h" 22 23#include <linux/kernel.h> 24#include <linux/string.h> /* for memset */ 25#include <linux/nls.h> 26#include <linux/crc-itu-t.h> 27#include <linux/slab.h> 28 29#include "udf_sb.h" 30 31#define PLANE_SIZE 0x10000 32#define UNICODE_MAX 0x10ffff 33#define SURROGATE_MASK 0xfffff800 34#define SURROGATE_PAIR 0x0000d800 35#define SURROGATE_LOW 0x00000400 36#define SURROGATE_CHAR_BITS 10 37#define SURROGATE_CHAR_MASK ((1 << SURROGATE_CHAR_BITS) - 1) 38 39#define ILLEGAL_CHAR_MARK '_' 40#define EXT_MARK '.' 41#define CRC_MARK '#' 42#define EXT_SIZE 5 43/* Number of chars we need to store generated CRC to make filename unique */ 44#define CRC_LEN 5 45 46static unicode_t get_utf16_char(const uint8_t *str_i, int str_i_max_len, 47 int str_i_idx, int u_ch, unicode_t *ret) 48{ 49 unicode_t c; 50 int start_idx = str_i_idx; 51 52 /* Expand OSTA compressed Unicode to Unicode */ 53 c = str_i[str_i_idx++]; 54 if (u_ch > 1) 55 c = (c << 8) | str_i[str_i_idx++]; 56 if ((c & SURROGATE_MASK) == SURROGATE_PAIR) { 57 unicode_t next; 58 59 /* Trailing surrogate char */ 60 if (str_i_idx >= str_i_max_len) { 61 c = UNICODE_MAX + 1; 62 goto out; 63 } 64 65 /* Low surrogate must follow the high one... */ 66 if (c & SURROGATE_LOW) { 67 c = UNICODE_MAX + 1; 68 goto out; 69 } 70 71 WARN_ON_ONCE(u_ch != 2); 72 next = str_i[str_i_idx++] << 8; 73 next |= str_i[str_i_idx++]; 74 if ((next & SURROGATE_MASK) != SURROGATE_PAIR || 75 !(next & SURROGATE_LOW)) { 76 c = UNICODE_MAX + 1; 77 goto out; 78 } 79 80 c = PLANE_SIZE + 81 ((c & SURROGATE_CHAR_MASK) << SURROGATE_CHAR_BITS) + 82 (next & SURROGATE_CHAR_MASK); 83 } 84out: 85 *ret = c; 86 return str_i_idx - start_idx; 87} 88 89 90static int udf_name_conv_char(uint8_t *str_o, int str_o_max_len, 91 int *str_o_idx, 92 const uint8_t *str_i, int str_i_max_len, 93 int *str_i_idx, 94 int u_ch, int *needsCRC, 95 int (*conv_f)(wchar_t, unsigned char *, int), 96 int translate) 97{ 98 unicode_t c; 99 int illChar = 0; 100 int len, gotch = 0; 101 102 while (!gotch && *str_i_idx < str_i_max_len) { 103 if (*str_o_idx >= str_o_max_len) { 104 *needsCRC = 1; 105 return gotch; 106 } 107 108 len = get_utf16_char(str_i, str_i_max_len, *str_i_idx, u_ch, 109 &c); 110 /* These chars cannot be converted. Replace them. */ 111 if (c == 0 || c > UNICODE_MAX || (conv_f && c > MAX_WCHAR_T) || 112 (translate && c == '/')) { 113 illChar = 1; 114 if (!translate) 115 gotch = 1; 116 } else if (illChar) 117 break; 118 else 119 gotch = 1; 120 *str_i_idx += len; 121 } 122 if (illChar) { 123 *needsCRC = 1; 124 c = ILLEGAL_CHAR_MARK; 125 gotch = 1; 126 } 127 if (gotch) { 128 if (conv_f) { 129 len = conv_f(c, &str_o[*str_o_idx], 130 str_o_max_len - *str_o_idx); 131 } else { 132 len = utf32_to_utf8(c, &str_o[*str_o_idx], 133 str_o_max_len - *str_o_idx); 134 if (len < 0) 135 len = -ENAMETOOLONG; 136 } 137 /* Valid character? */ 138 if (len >= 0) 139 *str_o_idx += len; 140 else if (len == -ENAMETOOLONG) { 141 *needsCRC = 1; 142 gotch = 0; 143 } else { 144 str_o[(*str_o_idx)++] = ILLEGAL_CHAR_MARK; 145 *needsCRC = 1; 146 } 147 } 148 return gotch; 149} 150 151static int udf_name_from_CS0(struct super_block *sb, 152 uint8_t *str_o, int str_max_len, 153 const uint8_t *ocu, int ocu_len, 154 int translate) 155{ 156 uint32_t c; 157 uint8_t cmp_id; 158 int idx, len; 159 int u_ch; 160 int needsCRC = 0; 161 int ext_i_len, ext_max_len; 162 int str_o_len = 0; /* Length of resulting output */ 163 int ext_o_len = 0; /* Extension output length */ 164 int ext_crc_len = 0; /* Extension output length if used with CRC */ 165 int i_ext = -1; /* Extension position in input buffer */ 166 int o_crc = 0; /* Rightmost possible output pos for CRC+ext */ 167 unsigned short valueCRC; 168 uint8_t ext[EXT_SIZE * NLS_MAX_CHARSET_SIZE + 1]; 169 uint8_t crc[CRC_LEN]; 170 int (*conv_f)(wchar_t, unsigned char *, int); 171 172 if (str_max_len <= 0) 173 return 0; 174 175 if (ocu_len == 0) { 176 memset(str_o, 0, str_max_len); 177 return 0; 178 } 179 180 if (UDF_SB(sb)->s_nls_map) 181 conv_f = UDF_SB(sb)->s_nls_map->uni2char; 182 else 183 conv_f = NULL; 184 185 cmp_id = ocu[0]; 186 if (cmp_id != 8 && cmp_id != 16) { 187 memset(str_o, 0, str_max_len); 188 pr_err("unknown compression code (%u)\n", cmp_id); 189 return -EINVAL; 190 } 191 u_ch = cmp_id >> 3; 192 193 ocu++; 194 ocu_len--; 195 196 if (ocu_len % u_ch) { 197 pr_err("incorrect filename length (%d)\n", ocu_len + 1); 198 return -EINVAL; 199 } 200 201 if (translate) { 202 /* Look for extension */ 203 for (idx = ocu_len - u_ch, ext_i_len = 0; 204 (idx >= 0) && (ext_i_len < EXT_SIZE); 205 idx -= u_ch, ext_i_len++) { 206 c = ocu[idx]; 207 if (u_ch > 1) 208 c = (c << 8) | ocu[idx + 1]; 209 210 if (c == EXT_MARK) { 211 if (ext_i_len) 212 i_ext = idx; 213 break; 214 } 215 } 216 if (i_ext >= 0) { 217 /* Convert extension */ 218 ext_max_len = min_t(int, sizeof(ext), str_max_len); 219 ext[ext_o_len++] = EXT_MARK; 220 idx = i_ext + u_ch; 221 while (udf_name_conv_char(ext, ext_max_len, &ext_o_len, 222 ocu, ocu_len, &idx, 223 u_ch, &needsCRC, 224 conv_f, translate)) { 225 if ((ext_o_len + CRC_LEN) < str_max_len) 226 ext_crc_len = ext_o_len; 227 } 228 } 229 } 230 231 idx = 0; 232 while (1) { 233 if (translate && (idx == i_ext)) { 234 if (str_o_len > (str_max_len - ext_o_len)) 235 needsCRC = 1; 236 break; 237 } 238 239 if (!udf_name_conv_char(str_o, str_max_len, &str_o_len, 240 ocu, ocu_len, &idx, 241 u_ch, &needsCRC, conv_f, translate)) 242 break; 243 244 if (translate && 245 (str_o_len <= (str_max_len - ext_o_len - CRC_LEN))) 246 o_crc = str_o_len; 247 } 248 249 if (translate) { 250 if (str_o_len <= 2 && str_o[0] == '.' && 251 (str_o_len == 1 || str_o[1] == '.')) 252 needsCRC = 1; 253 if (needsCRC) { 254 str_o_len = o_crc; 255 valueCRC = crc_itu_t(0, ocu, ocu_len); 256 crc[0] = CRC_MARK; 257 crc[1] = hex_asc_upper_hi(valueCRC >> 8); 258 crc[2] = hex_asc_upper_lo(valueCRC >> 8); 259 crc[3] = hex_asc_upper_hi(valueCRC); 260 crc[4] = hex_asc_upper_lo(valueCRC); 261 len = min_t(int, CRC_LEN, str_max_len - str_o_len); 262 memcpy(&str_o[str_o_len], crc, len); 263 str_o_len += len; 264 ext_o_len = ext_crc_len; 265 } 266 if (ext_o_len > 0) { 267 memcpy(&str_o[str_o_len], ext, ext_o_len); 268 str_o_len += ext_o_len; 269 } 270 } 271 272 return str_o_len; 273} 274 275static int udf_name_to_CS0(struct super_block *sb, 276 uint8_t *ocu, int ocu_max_len, 277 const uint8_t *str_i, int str_len) 278{ 279 int i, len; 280 unsigned int max_val; 281 int u_len, u_ch; 282 unicode_t uni_char; 283 int (*conv_f)(const unsigned char *, int, wchar_t *); 284 285 if (ocu_max_len <= 0) 286 return 0; 287 288 if (UDF_SB(sb)->s_nls_map) 289 conv_f = UDF_SB(sb)->s_nls_map->char2uni; 290 else 291 conv_f = NULL; 292 293 memset(ocu, 0, ocu_max_len); 294 ocu[0] = 8; 295 max_val = 0xff; 296 u_ch = 1; 297 298try_again: 299 u_len = 1; 300 for (i = 0; i < str_len; i += len) { 301 /* Name didn't fit? */ 302 if (u_len + u_ch > ocu_max_len) 303 return 0; 304 if (conv_f) { 305 wchar_t wchar; 306 307 len = conv_f(&str_i[i], str_len - i, &wchar); 308 if (len > 0) 309 uni_char = wchar; 310 } else { 311 len = utf8_to_utf32(&str_i[i], str_len - i, 312 &uni_char); 313 } 314 /* Invalid character, deal with it */ 315 if (len <= 0 || uni_char > UNICODE_MAX) { 316 len = 1; 317 uni_char = '?'; 318 } 319 320 if (uni_char > max_val) { 321 unicode_t c; 322 323 if (max_val == 0xff) { 324 max_val = 0xffff; 325 ocu[0] = 0x10; 326 u_ch = 2; 327 goto try_again; 328 } 329 /* 330 * Use UTF-16 encoding for chars outside we 331 * cannot encode directly. 332 */ 333 if (u_len + 2 * u_ch > ocu_max_len) 334 return 0; 335 336 uni_char -= PLANE_SIZE; 337 c = SURROGATE_PAIR | 338 ((uni_char >> SURROGATE_CHAR_BITS) & 339 SURROGATE_CHAR_MASK); 340 ocu[u_len++] = (uint8_t)(c >> 8); 341 ocu[u_len++] = (uint8_t)(c & 0xff); 342 uni_char = SURROGATE_PAIR | SURROGATE_LOW | 343 (uni_char & SURROGATE_CHAR_MASK); 344 } 345 346 if (max_val == 0xffff) 347 ocu[u_len++] = (uint8_t)(uni_char >> 8); 348 ocu[u_len++] = (uint8_t)(uni_char & 0xff); 349 } 350 351 return u_len; 352} 353 354/* 355 * Convert CS0 dstring to output charset. Warning: This function may truncate 356 * input string if it is too long as it is used for informational strings only 357 * and it is better to truncate the string than to refuse mounting a media. 358 */ 359int udf_dstrCS0toChar(struct super_block *sb, uint8_t *utf_o, int o_len, 360 const uint8_t *ocu_i, int i_len) 361{ 362 int s_len = 0; 363 364 if (i_len > 0) { 365 s_len = ocu_i[i_len - 1]; 366 if (s_len >= i_len) { 367 pr_warn("incorrect dstring lengths (%d/%d)," 368 " truncating\n", s_len, i_len); 369 s_len = i_len - 1; 370 /* 2-byte encoding? Need to round properly... */ 371 if (ocu_i[0] == 16) 372 s_len -= (s_len - 1) & 2; 373 } 374 } 375 376 return udf_name_from_CS0(sb, utf_o, o_len, ocu_i, s_len, 0); 377} 378 379int udf_get_filename(struct super_block *sb, const uint8_t *sname, int slen, 380 uint8_t *dname, int dlen) 381{ 382 int ret; 383 384 if (!slen) 385 return -EIO; 386 387 if (dlen <= 0) 388 return 0; 389 390 ret = udf_name_from_CS0(sb, dname, dlen, sname, slen, 1); 391 /* Zero length filename isn't valid... */ 392 if (ret == 0) 393 ret = -EINVAL; 394 return ret; 395} 396 397int udf_put_filename(struct super_block *sb, const uint8_t *sname, int slen, 398 uint8_t *dname, int dlen) 399{ 400 return udf_name_to_CS0(sb, dname, dlen, sname, slen); 401} 402