513 lines
		
	
	
	
		
			12 KiB
		
	
	
	
		
			C
		
	
	
	
	
	
		
		
			
		
	
	
			513 lines
		
	
	
	
		
			12 KiB
		
	
	
	
		
			C
		
	
	
	
	
	
|  | /* Copyright  (C) 2010-2020 The RetroArch team
 | ||
|  |  * | ||
|  |  * --------------------------------------------------------------------------------------- | ||
|  |  * The following license statement only applies to this file (encoding_utf.c). | ||
|  |  * --------------------------------------------------------------------------------------- | ||
|  |  * | ||
|  |  * Permission is hereby granted, free of charge, | ||
|  |  * to any person obtaining a copy of this software and associated documentation files (the "Software"), | ||
|  |  * to deal in the Software without restriction, including without limitation the rights to | ||
|  |  * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, | ||
|  |  * and to permit persons to whom the Software is furnished to do so, subject to the following conditions: | ||
|  |  * | ||
|  |  * The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. | ||
|  |  * | ||
|  |  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, | ||
|  |  * INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | ||
|  |  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. | ||
|  |  * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, | ||
|  |  * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | ||
|  |  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. | ||
|  |  */ | ||
|  | 
 | ||
|  | #include <stdint.h>
 | ||
|  | #include <stdlib.h>
 | ||
|  | #include <stddef.h>
 | ||
|  | #include <string.h>
 | ||
|  | 
 | ||
|  | #include <boolean.h>
 | ||
|  | #include <compat/strl.h>
 | ||
|  | #include <retro_inline.h>
 | ||
|  | 
 | ||
|  | #include <encodings/utf.h>
 | ||
|  | 
 | ||
|  | #if defined(_WIN32) && !defined(_XBOX)
 | ||
|  | #include <windows.h>
 | ||
|  | #elif defined(_XBOX)
 | ||
|  | #include <xtl.h>
 | ||
|  | #endif
 | ||
|  | 
 | ||
|  | #define UTF8_WALKBYTE(string) (*((*(string))++))
 | ||
|  | 
 | ||
|  | static unsigned leading_ones(uint8_t c) | ||
|  | { | ||
|  |    unsigned ones = 0; | ||
|  |    while (c & 0x80) | ||
|  |    { | ||
|  |       ones++; | ||
|  |       c <<= 1; | ||
|  |    } | ||
|  | 
 | ||
|  |    return ones; | ||
|  | } | ||
|  | 
 | ||
|  | /* Simple implementation. Assumes the sequence is
 | ||
|  |  * properly synchronized and terminated. */ | ||
|  | 
 | ||
|  | size_t utf8_conv_utf32(uint32_t *out, size_t out_chars, | ||
|  |       const char *in, size_t in_size) | ||
|  | { | ||
|  |    unsigned i; | ||
|  |    size_t ret = 0; | ||
|  |    while (in_size && out_chars) | ||
|  |    { | ||
|  |       unsigned extra, shift; | ||
|  |       uint32_t c; | ||
|  |       uint8_t first = *in++; | ||
|  |       unsigned ones = leading_ones(first); | ||
|  | 
 | ||
|  |       if (ones > 6 || ones == 1) /* Invalid or desync. */ | ||
|  |          break; | ||
|  | 
 | ||
|  |       extra = ones ? ones - 1 : ones; | ||
|  |       if (1 + extra > in_size) /* Overflow. */ | ||
|  |          break; | ||
|  | 
 | ||
|  |       shift = (extra - 1) * 6; | ||
|  |       c     = (first & ((1 << (7 - ones)) - 1)) << (6 * extra); | ||
|  | 
 | ||
|  |       for (i = 0; i < extra; i++, in++, shift -= 6) | ||
|  |          c |= (*in & 0x3f) << shift; | ||
|  | 
 | ||
|  |       *out++ = c; | ||
|  |       in_size -= 1 + extra; | ||
|  |       out_chars--; | ||
|  |       ret++; | ||
|  |    } | ||
|  | 
 | ||
|  |    return ret; | ||
|  | } | ||
|  | 
 | ||
|  | bool utf16_conv_utf8(uint8_t *out, size_t *out_chars, | ||
|  |      const uint16_t *in, size_t in_size) | ||
|  | { | ||
|  |    size_t out_pos            = 0; | ||
|  |    size_t in_pos             = 0; | ||
|  |    static const  | ||
|  |       uint8_t utf8_limits[5] = { 0xC0, 0xE0, 0xF0, 0xF8, 0xFC }; | ||
|  | 
 | ||
|  |    for (;;) | ||
|  |    { | ||
|  |       unsigned num_adds; | ||
|  |       uint32_t value; | ||
|  | 
 | ||
|  |       if (in_pos == in_size) | ||
|  |       { | ||
|  |          *out_chars = out_pos; | ||
|  |          return true; | ||
|  |       } | ||
|  |       value = in[in_pos++]; | ||
|  |       if (value < 0x80) | ||
|  |       { | ||
|  |          if (out) | ||
|  |             out[out_pos] = (char)value; | ||
|  |          out_pos++; | ||
|  |          continue; | ||
|  |       } | ||
|  | 
 | ||
|  |       if (value >= 0xD800 && value < 0xE000) | ||
|  |       { | ||
|  |          uint32_t c2; | ||
|  | 
 | ||
|  |          if (value >= 0xDC00 || in_pos == in_size) | ||
|  |             break; | ||
|  |          c2 = in[in_pos++]; | ||
|  |          if (c2 < 0xDC00 || c2 >= 0xE000) | ||
|  |             break; | ||
|  |          value = (((value - 0xD800) << 10) | (c2 - 0xDC00)) + 0x10000; | ||
|  |       } | ||
|  | 
 | ||
|  |       for (num_adds = 1; num_adds < 5; num_adds++) | ||
|  |          if (value < (((uint32_t)1) << (num_adds * 5 + 6))) | ||
|  |             break; | ||
|  |       if (out) | ||
|  |          out[out_pos] = (char)(utf8_limits[num_adds - 1] | ||
|  |                + (value >> (6 * num_adds))); | ||
|  |       out_pos++; | ||
|  |       do | ||
|  |       { | ||
|  |          num_adds--; | ||
|  |          if (out) | ||
|  |             out[out_pos] = (char)(0x80 | ||
|  |                   + ((value >> (6 * num_adds)) & 0x3F)); | ||
|  |          out_pos++; | ||
|  |       }while (num_adds != 0); | ||
|  |    } | ||
|  | 
 | ||
|  |    *out_chars = out_pos; | ||
|  |    return false; | ||
|  | } | ||
|  | 
 | ||
|  | /* Acts mostly like strlcpy.
 | ||
|  |  * | ||
|  |  * Copies the given number of UTF-8 characters, | ||
|  |  * but at most d_len bytes. | ||
|  |  * | ||
|  |  * Always NULL terminates. | ||
|  |  * Does not copy half a character. | ||
|  |  * | ||
|  |  * Returns number of bytes. 's' is assumed valid UTF-8. | ||
|  |  * Use only if 'chars' is considerably less than 'd_len'. */ | ||
|  | size_t utf8cpy(char *d, size_t d_len, const char *s, size_t chars) | ||
|  | { | ||
|  |    const uint8_t *sb     = (const uint8_t*)s; | ||
|  |    const uint8_t *sb_org = sb; | ||
|  | 
 | ||
|  |    if (!s) | ||
|  |       return 0; | ||
|  | 
 | ||
|  |    while (*sb && chars-- > 0) | ||
|  |    { | ||
|  |       sb++; | ||
|  |       while ((*sb & 0xC0) == 0x80) | ||
|  |          sb++; | ||
|  |    } | ||
|  | 
 | ||
|  |    if ((size_t)(sb - sb_org) > d_len-1 /* NUL */) | ||
|  |    { | ||
|  |       sb = sb_org + d_len-1; | ||
|  |       while ((*sb & 0xC0) == 0x80) | ||
|  |          sb--; | ||
|  |    } | ||
|  | 
 | ||
|  |    memcpy(d, sb_org, sb-sb_org); | ||
|  |    d[sb-sb_org] = '\0'; | ||
|  | 
 | ||
|  |    return sb-sb_org; | ||
|  | } | ||
|  | 
 | ||
|  | const char *utf8skip(const char *str, size_t chars) | ||
|  | { | ||
|  |    const uint8_t *strb = (const uint8_t*)str; | ||
|  | 
 | ||
|  |    if (!chars) | ||
|  |       return str; | ||
|  | 
 | ||
|  |    do | ||
|  |    { | ||
|  |       strb++; | ||
|  |       while ((*strb & 0xC0)==0x80) | ||
|  |          strb++; | ||
|  |       chars--; | ||
|  |    }while (chars); | ||
|  | 
 | ||
|  |    return (const char*)strb; | ||
|  | } | ||
|  | 
 | ||
|  | size_t utf8len(const char *string) | ||
|  | { | ||
|  |    size_t ret = 0; | ||
|  | 
 | ||
|  |    if (!string) | ||
|  |       return 0; | ||
|  | 
 | ||
|  |    while (*string) | ||
|  |    { | ||
|  |       if ((*string & 0xC0) != 0x80) | ||
|  |          ret++; | ||
|  |       string++; | ||
|  |    } | ||
|  |    return ret; | ||
|  | } | ||
|  | 
 | ||
|  | /* Does not validate the input, returns garbage if it's not UTF-8. */ | ||
|  | uint32_t utf8_walk(const char **string) | ||
|  | { | ||
|  |    uint8_t first = UTF8_WALKBYTE(string); | ||
|  |    uint32_t ret  = 0; | ||
|  | 
 | ||
|  |    if (first < 128) | ||
|  |       return first; | ||
|  | 
 | ||
|  |    ret    = (ret << 6) | (UTF8_WALKBYTE(string) & 0x3F); | ||
|  |    if (first >= 0xE0) | ||
|  |    { | ||
|  |       ret = (ret << 6) | (UTF8_WALKBYTE(string) & 0x3F); | ||
|  |       if (first >= 0xF0) | ||
|  |       { | ||
|  |          ret = (ret << 6) | (UTF8_WALKBYTE(string) & 0x3F); | ||
|  |          return ret | (first & 7) << 18; | ||
|  |       } | ||
|  |       return ret | (first & 15) << 12; | ||
|  |    } | ||
|  | 
 | ||
|  |    return ret | (first & 31) << 6; | ||
|  | } | ||
|  | 
 | ||
|  | static bool utf16_to_char(uint8_t **utf_data, | ||
|  |       size_t *dest_len, const uint16_t *in) | ||
|  | { | ||
|  |    unsigned len    = 0; | ||
|  | 
 | ||
|  |    while (in[len] != '\0') | ||
|  |       len++; | ||
|  | 
 | ||
|  |    utf16_conv_utf8(NULL, dest_len, in, len); | ||
|  |    *dest_len  += 1; | ||
|  |    *utf_data   = (uint8_t*)malloc(*dest_len); | ||
|  |    if (*utf_data == 0) | ||
|  |       return false; | ||
|  | 
 | ||
|  |    return utf16_conv_utf8(*utf_data, dest_len, in, len); | ||
|  | } | ||
|  | 
 | ||
|  | bool utf16_to_char_string(const uint16_t *in, char *s, size_t len) | ||
|  | { | ||
|  |    size_t     dest_len  = 0; | ||
|  |    uint8_t *utf16_data  = NULL; | ||
|  |    bool            ret  = utf16_to_char(&utf16_data, &dest_len, in); | ||
|  | 
 | ||
|  |    if (ret) | ||
|  |    { | ||
|  |       utf16_data[dest_len] = 0; | ||
|  |       strlcpy(s, (const char*)utf16_data, len); | ||
|  |    } | ||
|  | 
 | ||
|  |    free(utf16_data); | ||
|  |    utf16_data = NULL; | ||
|  | 
 | ||
|  |    return ret; | ||
|  | } | ||
|  | 
 | ||
|  | #if defined(_WIN32) && !defined(_XBOX) && !defined(UNICODE)
 | ||
|  | /* Returned pointer MUST be freed by the caller if non-NULL. */ | ||
|  | static char *mb_to_mb_string_alloc(const char *str, | ||
|  |       enum CodePage cp_in, enum CodePage cp_out) | ||
|  | { | ||
|  |    wchar_t *path_buf_wide = NULL; | ||
|  |    int path_buf_wide_len  = MultiByteToWideChar(cp_in, 0, str, -1, NULL, 0); | ||
|  | 
 | ||
|  |    /* Windows 95 will return 0 from these functions with 
 | ||
|  |     * a UTF8 codepage set without MSLU. | ||
|  |     * | ||
|  |     * From an unknown MSDN version (others omit this info): | ||
|  |     *   - CP_UTF8 Windows 98/Me, Windows NT 4.0 and later:  | ||
|  |     *   Translate using UTF-8. When this is set, dwFlags must be zero. | ||
|  |     *   - Windows 95: Under the Microsoft Layer for Unicode,  | ||
|  |     *   MultiByteToWideChar also supports CP_UTF7 and CP_UTF8. | ||
|  |     */ | ||
|  | 
 | ||
|  |    if (!path_buf_wide_len) | ||
|  |       return strdup(str); | ||
|  | 
 | ||
|  |    path_buf_wide = (wchar_t*) | ||
|  |       calloc(path_buf_wide_len + sizeof(wchar_t), sizeof(wchar_t)); | ||
|  | 
 | ||
|  |    if (path_buf_wide) | ||
|  |    { | ||
|  |       MultiByteToWideChar(cp_in, 0, | ||
|  |             str, -1, path_buf_wide, path_buf_wide_len); | ||
|  | 
 | ||
|  |       if (*path_buf_wide) | ||
|  |       { | ||
|  |          int path_buf_len = WideCharToMultiByte(cp_out, 0, | ||
|  |                path_buf_wide, -1, NULL, 0, NULL, NULL); | ||
|  | 
 | ||
|  |          if (path_buf_len) | ||
|  |          { | ||
|  |             char *path_buf = (char*) | ||
|  |                calloc(path_buf_len + sizeof(char), sizeof(char)); | ||
|  | 
 | ||
|  |             if (path_buf) | ||
|  |             { | ||
|  |                WideCharToMultiByte(cp_out, 0, | ||
|  |                      path_buf_wide, -1, path_buf, | ||
|  |                      path_buf_len, NULL, NULL); | ||
|  | 
 | ||
|  |                free(path_buf_wide); | ||
|  | 
 | ||
|  |                if (*path_buf) | ||
|  |                   return path_buf; | ||
|  | 
 | ||
|  |                free(path_buf); | ||
|  |                return NULL; | ||
|  |             } | ||
|  |          } | ||
|  |          else | ||
|  |          { | ||
|  |             free(path_buf_wide); | ||
|  |             return strdup(str); | ||
|  |          } | ||
|  |       } | ||
|  | 
 | ||
|  |       free(path_buf_wide); | ||
|  |    } | ||
|  | 
 | ||
|  |    return NULL; | ||
|  | } | ||
|  | #endif
 | ||
|  | 
 | ||
|  | /* Returned pointer MUST be freed by the caller if non-NULL. */ | ||
|  | char* utf8_to_local_string_alloc(const char *str) | ||
|  | { | ||
|  |    if (str && *str) | ||
|  |    { | ||
|  | #if defined(_WIN32) && !defined(_XBOX) && !defined(UNICODE)
 | ||
|  |       return mb_to_mb_string_alloc(str, CODEPAGE_UTF8, CODEPAGE_LOCAL); | ||
|  | #else
 | ||
|  |       /* assume string needs no modification if not on Windows */ | ||
|  |       return strdup(str); | ||
|  | #endif
 | ||
|  |    } | ||
|  |    return NULL; | ||
|  | } | ||
|  | 
 | ||
|  | /* Returned pointer MUST be freed by the caller if non-NULL. */ | ||
|  | char* local_to_utf8_string_alloc(const char *str) | ||
|  | { | ||
|  |    if (str && *str) | ||
|  |    { | ||
|  | #if defined(_WIN32) && !defined(_XBOX) && !defined(UNICODE)
 | ||
|  |       return mb_to_mb_string_alloc(str, CODEPAGE_LOCAL, CODEPAGE_UTF8); | ||
|  | #else
 | ||
|  |       /* assume string needs no modification if not on Windows */ | ||
|  |       return strdup(str); | ||
|  | #endif
 | ||
|  |    } | ||
|  |    return NULL; | ||
|  | } | ||
|  | 
 | ||
|  | /* Returned pointer MUST be freed by the caller if non-NULL. */ | ||
|  | wchar_t* utf8_to_utf16_string_alloc(const char *str) | ||
|  | { | ||
|  | #ifdef _WIN32
 | ||
|  |    int len        = 0; | ||
|  |    int out_len    = 0; | ||
|  | #else
 | ||
|  |    size_t len     = 0; | ||
|  |    size_t out_len = 0; | ||
|  | #endif
 | ||
|  |    wchar_t *buf   = NULL; | ||
|  | 
 | ||
|  |    if (!str || !*str) | ||
|  |       return NULL; | ||
|  | 
 | ||
|  | #ifdef _WIN32
 | ||
|  |    len = MultiByteToWideChar(CP_UTF8, 0, str, -1, NULL, 0); | ||
|  | 
 | ||
|  |    if (len) | ||
|  |    { | ||
|  |       buf = (wchar_t*)calloc(len, sizeof(wchar_t)); | ||
|  | 
 | ||
|  |       if (!buf) | ||
|  |          return NULL; | ||
|  | 
 | ||
|  |       out_len = MultiByteToWideChar(CP_UTF8, 0, str, -1, buf, len); | ||
|  |    } | ||
|  |    else | ||
|  |    { | ||
|  |       /* fallback to ANSI codepage instead */ | ||
|  |       len = MultiByteToWideChar(CP_ACP, 0, str, -1, NULL, 0); | ||
|  | 
 | ||
|  |       if (len) | ||
|  |       { | ||
|  |          buf = (wchar_t*)calloc(len, sizeof(wchar_t)); | ||
|  | 
 | ||
|  |          if (!buf) | ||
|  |             return NULL; | ||
|  | 
 | ||
|  |          out_len = MultiByteToWideChar(CP_ACP, 0, str, -1, buf, len); | ||
|  |       } | ||
|  |    } | ||
|  | 
 | ||
|  |    if (out_len < 0) | ||
|  |    { | ||
|  |       free(buf); | ||
|  |       return NULL; | ||
|  |    } | ||
|  | #else
 | ||
|  |    /* NOTE: For now, assume non-Windows platforms' locale is already UTF-8. */ | ||
|  |    len = mbstowcs(NULL, str, 0) + 1; | ||
|  | 
 | ||
|  |    if (len) | ||
|  |    { | ||
|  |       buf = (wchar_t*)calloc(len, sizeof(wchar_t)); | ||
|  | 
 | ||
|  |       if (!buf) | ||
|  |          return NULL; | ||
|  | 
 | ||
|  |       out_len = mbstowcs(buf, str, len); | ||
|  |    } | ||
|  | 
 | ||
|  |    if (out_len == (size_t)-1) | ||
|  |    { | ||
|  |       free(buf); | ||
|  |       return NULL; | ||
|  |    } | ||
|  | #endif
 | ||
|  | 
 | ||
|  |    return buf; | ||
|  | } | ||
|  | 
 | ||
|  | /* Returned pointer MUST be freed by the caller if non-NULL. */ | ||
|  | char* utf16_to_utf8_string_alloc(const wchar_t *str) | ||
|  | { | ||
|  | #ifdef _WIN32
 | ||
|  |    int len        = 0; | ||
|  | #else
 | ||
|  |    size_t len     = 0; | ||
|  | #endif
 | ||
|  |    char *buf      = NULL; | ||
|  | 
 | ||
|  |    if (!str || !*str) | ||
|  |       return NULL; | ||
|  | 
 | ||
|  | #ifdef _WIN32
 | ||
|  |    { | ||
|  |       UINT code_page = CP_UTF8; | ||
|  |       len            = WideCharToMultiByte(code_page, | ||
|  |             0, str, -1, NULL, 0, NULL, NULL); | ||
|  | 
 | ||
|  |       /* fallback to ANSI codepage instead */ | ||
|  |       if (!len) | ||
|  |       { | ||
|  |          code_page   = CP_ACP; | ||
|  |          len         = WideCharToMultiByte(code_page, | ||
|  |                0, str, -1, NULL, 0, NULL, NULL); | ||
|  |       } | ||
|  | 
 | ||
|  |       buf = (char*)calloc(len, sizeof(char)); | ||
|  | 
 | ||
|  |       if (!buf) | ||
|  |          return NULL; | ||
|  | 
 | ||
|  |       if (WideCharToMultiByte(code_page, | ||
|  |             0, str, -1, buf, len, NULL, NULL) < 0) | ||
|  |       { | ||
|  |          free(buf); | ||
|  |          return NULL; | ||
|  |       } | ||
|  |    } | ||
|  | #else
 | ||
|  |    /* NOTE: For now, assume non-Windows platforms' 
 | ||
|  |     * locale is already UTF-8. */ | ||
|  |    len = wcstombs(NULL, str, 0) + 1; | ||
|  | 
 | ||
|  |    if (len) | ||
|  |    { | ||
|  |       buf = (char*)calloc(len, sizeof(char)); | ||
|  | 
 | ||
|  |       if (!buf) | ||
|  |          return NULL; | ||
|  | 
 | ||
|  |       if (wcstombs(buf, str, len) == (size_t)-1) | ||
|  |       { | ||
|  |          free(buf); | ||
|  |          return NULL; | ||
|  |       } | ||
|  |    } | ||
|  | #endif
 | ||
|  | 
 | ||
|  |    return buf; | ||
|  | } |