/**
|
* \file unicode.c
|
*
|
* This file contains general Unicode string manipulation functions.
|
* It mainly consists of functions for converting between UCS-2 (used on
|
* the devices) and UTF-8 (used by several applications).
|
*
|
* For a deeper understanding of Unicode encoding formats see the
|
* Wikipedia entries for
|
* <a href="http://en.wikipedia.org/wiki/UTF-16/UCS-2">UTF-16/UCS-2</a>
|
* and <a href="http://en.wikipedia.org/wiki/UTF-8">UTF-8</a>.
|
*
|
* Copyright (C) 2005-2009 Linus Walleij <triad@df.lth.se>
|
*
|
* This library is free software; you can redistribute it and/or
|
* modify it under the terms of the GNU Lesser General Public
|
* License as published by the Free Software Foundation; either
|
* version 2 of the License, or (at your option) any later version.
|
*
|
* This library is distributed in the hope that it will be useful,
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
* Lesser General Public License for more details.
|
*
|
* You should have received a copy of the GNU Lesser General Public
|
* License along with this library; if not, write to the
|
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
|
* Boston, MA 02111-1307, USA.
|
*
|
*/
|
|
#include "config.h"
|
|
#include <stdlib.h>
|
#include <string.h>
|
#ifdef HAVE_ICONV
|
#include "iconv.h"
|
#else
|
#error "libmtp unicode.c needs fixing to work without iconv()!"
|
#endif
|
#include "libmtp.h"
|
#include "unicode.h"
|
#include "util.h"
|
#include "ptp.h"
|
|
/**
 * Capacity, in UCS-2 characters (terminator not counted), of the
 * fixed-size local buffers that utf16_to_utf8() and utf8_to_utf16()
 * convert through before copying the result to the heap. The byte
 * sizes of those buffers are derived from this value: three bytes per
 * character plus one for UTF-8 output, two bytes per character plus
 * two for UCS-2 output.
 */
|
#define STRING_BUFFER_LENGTH 1024
|
|
/**
 * Counts the 16-bit characters in a UCS-2 string, not including the
 * terminating 0x0000. For example the byte sequence
 * 0x00 0x41 0x00 0x00 yields 1.
 *
 * @param unicstr a UCS-2 Unicode string terminated by a 16-bit zero
 * @return the number of characters before the terminator. The size in
 *         bytes including the terminator is (result + 1) * 2.
 */
int ucs2_strlen(uint16_t const * const unicstr)
{
  const uint16_t *cursor = unicstr;

  /* Walk forward until the 16-bit zero terminator is found. */
  while (*cursor != 0x0000U) {
    cursor++;
  }
  return (int) (cursor - unicstr);
}
|
|
/**
|
* Converts a big-endian UTF-16 2-byte string
|
* to a UTF-8 string. Actually just a UCS-2 internal conversion
|
* routine that strips off the BOM if there is one.
|
*
|
* @param device a pointer to the current device.
|
* @param unicstr the UTF-16 unicode string to convert
|
* @return a UTF-8 string.
|
*/
|
char *utf16_to_utf8(LIBMTP_mtpdevice_t *device, const uint16_t *unicstr)
|
{
|
PTPParams *params = (PTPParams *) device->params;
|
char *stringp = (char *) unicstr;
|
char loclstr[STRING_BUFFER_LENGTH*3+1]; // UTF-8 encoding is max 3 bytes per UCS2 char.
|
char *locp = loclstr;
|
size_t nconv;
|
size_t convlen = (ucs2_strlen(unicstr)+1) * sizeof(uint16_t); // UCS-2 is 16 bit wide, include terminator
|
size_t convmax = STRING_BUFFER_LENGTH*3;
|
|
loclstr[0]='\0';
|
/* Do the conversion. */
|
nconv = iconv(params->cd_ucs2_to_locale, &stringp, &convlen, &locp, &convmax);
|
if (nconv == (size_t) -1) {
|
// Return partial string anyway.
|
*locp = '\0';
|
}
|
loclstr[STRING_BUFFER_LENGTH*3] = '\0';
|
// Strip off any BOM, it's totally useless...
|
if ((uint8_t) loclstr[0] == 0xEFU && (uint8_t) loclstr[1] == 0xBBU && (uint8_t) loclstr[2] == 0xBFU) {
|
return strdup(loclstr+3);
|
}
|
return strdup(loclstr);
|
}
|
|
/**
|
* Converts a UTF-8 string to a big-endian UTF-16 2-byte string
|
* Actually just a UCS-2 internal conversion.
|
*
|
* @param device a pointer to the current device.
|
* @param localstr the UTF-8 unicode string to convert
|
* @return a UTF-16 string.
|
*/
|
uint16_t *utf8_to_utf16(LIBMTP_mtpdevice_t *device, const char *localstr)
|
{
|
PTPParams *params = (PTPParams *) device->params;
|
char *stringp = (char *) localstr; // cast away "const"
|
char unicstr[(STRING_BUFFER_LENGTH+1)*2]; // UCS2 encoding is 2 bytes per UTF-8 char.
|
char *unip = unicstr;
|
size_t nconv = 0;
|
size_t convlen = strlen(localstr)+1; // utf8 bytes, include terminator
|
size_t convmax = STRING_BUFFER_LENGTH*2;
|
|
unicstr[0]='\0';
|
unicstr[1]='\0';
|
|
/* Do the conversion. */
|
nconv = iconv(params->cd_locale_to_ucs2, &stringp, &convlen, &unip, &convmax);
|
|
if (nconv == (size_t) -1) {
|
// Return partial string anyway.
|
unip[0] = '\0';
|
unip[1] = '\0';
|
}
|
// make sure the string is null terminated
|
unicstr[STRING_BUFFER_LENGTH*2] = '\0';
|
unicstr[STRING_BUFFER_LENGTH*2+1] = '\0';
|
|
// allocate the string to be returned
|
// Note: can't use strdup since every other byte is a null byte
|
int ret_len = ucs2_strlen((uint16_t*)unicstr)*sizeof(uint16_t)+2;
|
uint16_t* ret = malloc(ret_len);
|
memcpy(ret,unicstr,(size_t)ret_len);
|
return ret;
|
}
|
|
/**
 * Replaces every run of consecutive bytes above 0x7F in a string with
 * a single underscore, in place. In UTF-8 such a run encodes one or
 * more non-ASCII characters, so an entire run collapses to one '_'.
 * The result is never longer than the input, so no extra buffer is
 * needed.
 *
 * @param str the zero-terminated string to rewrite in place
 */
void strip_7bit_from_utf8(char *str)
{
  const char *src = str; /* next byte to examine */
  char *dst = str;       /* next position to write; never ahead of src */

  while (*src != '\0') {
    if ((uint8_t) *src > 0x7FU) {
      *dst++ = '_';
      /* Swallow the rest of the run; the NUL terminator ends it. */
      do {
        src++;
      } while ((uint8_t) *src > 0x7FU);
    } else {
      *dst++ = *src++;
    }
  }
  /* Terminate the (possibly shortened) string. */
  *dst = '\0';
}
|