/******************************************************************************
|
|
@File PVRTUnicode.cpp
|
|
@Title PVRTUnicode
|
|
@Version @Version
|
|
@Copyright Copyright (c) Imagination Technologies Limited.
|
|
@Platform All
|
|
@Description A small collection of functions used to decode Unicode formats to
|
individual code points.
|
|
******************************************************************************/
|
#include "PVRTUnicode.h"
|
#include <string.h>
|
|
/****************************************************************************
|
** Constants
|
****************************************************************************/
|
// U+FFFD, the Unicode REPLACEMENT CHARACTER code point.
// NOTE(review): not referenced by any function in this file - confirm whether it is still needed.
const PVRTuint32 c_u32ReplChar = 0xFFFD;
|
|
// --- UTF8 decoding -----------------------------------------------------------
#define VALID_ASCII             0x80        // Bytes below this value are handled as single-byte (ASCII) characters.
#define TAIL_MASK               0x3F        // Payload bits of a tail byte; shifted right to get the lead-byte payload mask.
#define BYTES_PER_TAIL          6           // Shift applied per tail byte (despite the name, this is a bit count).

// --- UTF16 surrogate ranges --------------------------------------------------
#define UTF16_SURG_H_MARK       0xD800      // First high (leading) surrogate.
#define UTF16_SURG_H_END        0xDBFF      // Last high (leading) surrogate.
#define UTF16_SURG_L_MARK       0xDC00      // First low (trailing) surrogate.
#define UTF16_SURG_L_END        0xDFFF      // Last low (trailing) surrogate.

// --- Code point validity -----------------------------------------------------
#define UNICODE_NONCHAR_MARK    0xFDD0      // Start of the non-character range rejected by CheckGenericUnicode.
#define UNICODE_NONCHAR_END     0xFDEF      // End of that non-character range.
#define UNICODE_RESERVED        0xFFFE      // Used as a bit mask: matches values whose low 16 bits are 0xFFFE or 0xFFFF.
#define UNICODE_MAX             0x10FFFF    // Largest code point accepted.

// --- Limits ------------------------------------------------------------------
#define MAX_LEN                 0x8FFF      // Maximum number of 16-bit units the UTF16 routines will scan.
|
|
/****************************************************************************
|
** A table which allows quick lookup to determine the number of bytes of a
|
** UTF8 code point.
|
****************************************************************************/
|
const PVRTuint8 c_u8UTF8Lengths[256] =
|
{
|
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
|
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
|
2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
|
3,3,3,3,3,3,3,3,0,0,0,0,0,0,0,0,
|
};
|
|
/****************************************************************************
|
** A table which allows quick lookup to determine whether a UTF8 sequence
|
** is 'overlong'.
|
****************************************************************************/
|
const PVRTuint32 c_u32MinVals[4] =
|
{
|
0x00000000, // 0 tail bytes
|
0x00000080, // 1 tail bytes
|
0x00000800, // 2 tail bytes
|
0x00010000, // 3 tail bytes
|
};
|
|
/*!***************************************************************************
|
@Function CheckGenericUnicode
|
@Input c32 A UTF32 character/Unicode code point
|
@Returns Success or failure.
|
@Description Checks that the decoded code point is valid.
|
*****************************************************************************/
|
static bool CheckGenericUnicode(PVRTuint32 c32)
|
{
|
// Check that this value isn't a UTF16 surrogate mask.
|
if(c32 >= UTF16_SURG_H_MARK && c32 <= UTF16_SURG_L_END)
|
return false;
|
// Check non-char values
|
if(c32 >= UNICODE_NONCHAR_MARK && c32 <= UNICODE_NONCHAR_END)
|
return false;
|
// Check reserved values
|
if((c32 & UNICODE_RESERVED) == UNICODE_RESERVED)
|
return false;
|
// Check max value.
|
if(c32 > UNICODE_MAX)
|
return false;
|
|
return true;
|
}
|
|
/*!***************************************************************************
|
@Function PVRTUnicodeUTF8ToUTF32
|
@Input pUTF8 A UTF8 string, which is null terminated.
|
@Output aUTF32 An array of Unicode code points.
|
@Returns Success or failure.
|
@Description Decodes a UTF8-encoded string in to Unicode code points
|
(UTF32). If pUTF8 is not null terminated, the results are
|
undefined.
|
*****************************************************************************/
|
EPVRTError PVRTUnicodeUTF8ToUTF32(const PVRTuint8* const pUTF8, CPVRTArray<PVRTuint32>& aUTF32)
|
{
|
unsigned int uiTailLen, uiIndex;
|
unsigned int uiBytes = (unsigned int) strlen((const char*)pUTF8);
|
PVRTuint32 c32;
|
|
const PVRTuint8* pC = pUTF8;
|
while(*pC)
|
{
|
// Quick optimisation for ASCII characters
|
while(*pC && *pC < VALID_ASCII)
|
{
|
aUTF32.Append(*pC++);
|
}
|
// Done
|
if(!*pC)
|
break;
|
|
c32 = *pC++;
|
uiTailLen = c_u8UTF8Lengths[c32];
|
|
// Check for invalid tail length. Maximum 4 bytes for each UTF8 character.
|
// Also check to make sure the tail length is inside the provided buffer.
|
if(uiTailLen == 0 || (pC + uiTailLen > pUTF8 + uiBytes))
|
return PVR_OVERFLOW;
|
|
c32 &= (TAIL_MASK >> uiTailLen); // Get the data out of the first byte. This depends on the length of the tail.
|
|
// Get the data out of each tail byte
|
uiIndex = 0;
|
while(uiIndex < uiTailLen)
|
{
|
if((pC[uiIndex] & 0xC0) != 0x80)
|
return PVR_FAIL; // Invalid tail byte!
|
|
c32 = (c32 << BYTES_PER_TAIL) + (pC[uiIndex] & TAIL_MASK);
|
uiIndex++;
|
}
|
|
pC += uiIndex;
|
|
// Check overlong values.
|
if(c32 < c_u32MinVals[uiTailLen])
|
return PVR_FAIL;
|
|
if(!CheckGenericUnicode(c32))
|
return PVR_FAIL;
|
|
// OK
|
aUTF32.Append(c32);
|
}
|
|
return PVR_SUCCESS;
|
}
|
|
/*!***************************************************************************
|
@Function PVRTUnicodeUTF16ToUTF32
|
@Input pUTF16 A UTF16 string, which is null terminated.
|
@Output aUTF32 An array of Unicode code points.
|
@Returns Success or failure.
|
@Description Decodes a UTF16-encoded string in to Unicode code points
|
(UTF32). If pUTF16 is not null terminated, the results are
|
undefined.
|
*****************************************************************************/
|
EPVRTError PVRTUnicodeUTF16ToUTF32(const PVRTuint16* const pUTF16, CPVRTArray<PVRTuint32>& aUTF32)
|
{
|
const PVRTuint16* pC = pUTF16;
|
|
// Determine the number of shorts
|
while(*++pC && (pC - pUTF16) < MAX_LEN);
|
unsigned int uiBufferLen = (unsigned int) (pC - pUTF16);
|
|
if(uiBufferLen == MAX_LEN)
|
return PVR_OVERFLOW; // Probably not NULL terminated.
|
|
// Reset to start.
|
pC = pUTF16;
|
|
PVRTuint32 c32;
|
while(*pC)
|
{
|
// Straight copy. We'll check for surrogate pairs next...
|
c32 = *pC++;
|
|
// Check surrogate pair
|
if(c32 >= UTF16_SURG_H_MARK && c32 <= UTF16_SURG_H_END)
|
{
|
// Make sure the next 2 bytes are in range...
|
if(pC + 1 > pUTF16 + uiBufferLen || *pC == 0)
|
return PVR_OVERFLOW;
|
|
// Check that the next value is in the low surrogate range
|
if(*pC < UTF16_SURG_L_MARK || *pC > UTF16_SURG_L_END)
|
return PVR_FAIL;
|
|
// Decode
|
c32 = ((c32 - UTF16_SURG_H_MARK) << 10) + (*pC - UTF16_SURG_L_MARK) + 0x10000;
|
pC++;
|
}
|
|
if(!CheckGenericUnicode(c32))
|
return PVR_FAIL;
|
|
// OK
|
aUTF32.Append(c32);
|
}
|
|
return PVR_SUCCESS;
|
}
|
|
/*!***************************************************************************
|
@Function PVRTUnicodeUTF8Length
|
@Input pUTF8 A UTF8 string, which is null terminated.
|
@Returns The length of the string, in Unicode code points.
|
@Description Calculates the length of a UTF8 string. If pUTF8 is
|
not null terminated, the results are undefined.
|
*****************************************************************************/
|
unsigned int PVRTUnicodeUTF8Length(const PVRTuint8* const pUTF8)
|
{
|
const PVRTuint8* pC = pUTF8;
|
|
unsigned int charCount = 0;
|
unsigned int mask;
|
while(*pC)
|
{
|
// Quick optimisation for ASCII characters
|
const PVRTuint8* pStart = pC;
|
while(*pC && *pC < VALID_ASCII)
|
pC++;
|
|
charCount += (unsigned int) (pC - pStart);
|
|
// Done
|
if(!*pC)
|
break;
|
|
mask = *pC & 0xF0;
|
switch(mask)
|
{
|
case 0xF0: pC++;
|
case 0xE0: pC++;
|
case 0xC0: pC++;
|
break;
|
default:
|
_ASSERT(!"Invalid tail byte!");
|
return 0;
|
}
|
|
pC++;
|
charCount++;
|
}
|
|
return charCount;
|
}
|
|
/*!***************************************************************************
|
@Function PVRTUnicodeUTF16Length
|
@Input pUTF16 A UTF16 string, which is null terminated.
|
@Returns The length of the string, in Unicode code points.
|
@Description Calculates the length of a UTF16 string.
|
If pUTF16 is not null terminated, the results are
|
undefined.
|
*****************************************************************************/
|
unsigned int PVRTUnicodeUTF16Length(const PVRTuint16* const pUTF16)
|
{
|
const PVRTuint16* pC = pUTF16;
|
unsigned int charCount = 0;
|
while(*pC && (pC - pUTF16) < MAX_LEN)
|
{
|
if( pC[0] >= UTF16_SURG_H_MARK && pC[0] <= UTF16_SURG_H_END
|
&& pC[1] >= UTF16_SURG_L_MARK && pC[0] <= UTF16_SURG_L_END)
|
{
|
pC += 2;
|
}
|
else
|
{
|
pC += 1;
|
}
|
|
charCount++;
|
}
|
|
return charCount;
|
}
|
|
/*!***************************************************************************
|
@Function PVRTUnicodeValidUTF8
|
@Input pUTF8 A UTF8 string, which is null terminated.
|
@Returns true or false
|
@Description Checks whether the encoding of a UTF8 string is valid.
|
If pUTF8 is not null terminated, the results are undefined.
|
*****************************************************************************/
|
bool PVRTUnicodeValidUTF8(const PVRTuint8* const pUTF8)
|
{
|
unsigned int uiTailLen, uiIndex;
|
unsigned int uiBytes = (unsigned int) strlen((const char*)pUTF8);
|
const PVRTuint8* pC = pUTF8;
|
while(*pC)
|
{
|
// Quick optimisation for ASCII characters
|
while(*pC && *pC < VALID_ASCII) pC++;
|
// Done?
|
if(!*pC)
|
break;
|
|
PVRTuint32 c32 = *pC++;
|
uiTailLen = c_u8UTF8Lengths[c32];
|
|
// Check for invalid tail length. Maximum 4 bytes for each UTF8 character.
|
// Also check to make sure the tail length is inside the provided buffer.
|
if(uiTailLen == 0 || (pC + uiTailLen > pUTF8 + uiBytes))
|
return false;
|
|
// Get the data out of each tail byte
|
uiIndex = 0;
|
while(uiIndex < uiTailLen)
|
{
|
if((pC[uiIndex] & 0xC0) != 0x80)
|
return false; // Invalid tail byte!
|
|
c32 = (c32 << BYTES_PER_TAIL) + (pC[uiIndex] & TAIL_MASK);
|
uiIndex++;
|
}
|
|
pC += uiIndex;
|
|
// Check overlong values.
|
if(c32 < c_u32MinVals[uiTailLen])
|
return false;
|
if(!CheckGenericUnicode(c32))
|
return false;
|
}
|
|
return true;
|
}
|
|
/*****************************************************************************
|
End of file (PVRTUnicode.cpp)
|
*****************************************************************************/
|