// ~lzh/A133.git

/******************************************************************************
 
 @File         PVRTUnicode.cpp
 
 @Title        PVRTUnicode
 
 @Version       @Version      
 
 @Copyright    Copyright (c) Imagination Technologies Limited.
 
 @Platform     All
 
 @Description  A small collection of functions used to decode Unicode formats to
               individual code points.
 
******************************************************************************/
#include "PVRTUnicode.h"
#include <string.h>
 
/****************************************************************************
** Constants
****************************************************************************/
// 0xFFFD is the Unicode REPLACEMENT CHARACTER code point. It is not referenced
// by any of the functions shown in this file.
const PVRTuint32 c_u32ReplChar = 0xFFFD;
 
// First byte value that is NOT treated as a single-byte character. Bytes below
// this value are passed through unchanged by the UTF8 functions.
#define VALID_ASCII 0x80
// Selects the six payload bits of a UTF8 tail byte. Shifted right by the tail
// count it also selects the payload bits of the lead byte.
#define TAIL_MASK 0x3F
// Despite the name this is a BIT count: the number of payload bits contributed
// by each tail byte. It is used as a shift amount while decoding.
#define BYTES_PER_TAIL 6
 
// Inclusive ranges of the UTF16 high (lead) and low (trail) surrogates.
#define UTF16_SURG_H_MARK 0xD800
#define UTF16_SURG_H_END  0xDBFF
#define UTF16_SURG_L_MARK 0xDC00
#define UTF16_SURG_L_END  0xDFFF
 
// Inclusive range of values rejected by CheckGenericUnicode as non-characters.
#define UNICODE_NONCHAR_MARK 0xFDD0
#define UNICODE_NONCHAR_END  0xFDEF
// Used by CheckGenericUnicode as both mask and comparison value. Because bit 0
// is clear in the mask, values whose low 16 bits are 0xFFFE or 0xFFFF match.
#define UNICODE_RESERVED     0xFFFE
// Largest value accepted as a code point.
#define UNICODE_MAX             0x10FFFF
 
// Upper bound on the number of 16-bit units the UTF16 functions will scan
// while looking for the terminator.
#define MAX_LEN 0x8FFF
 
/****************************************************************************
** A table, indexed by the first byte of a UTF8 sequence, which allows quick
** lookup of the number of TAIL (continuation) bytes that follow that byte.
** A value of 0 means the byte cannot start a multi-byte sequence. The
** functions in this file handle bytes below 0x80 before consulting the
** table, so for them 0 is never looked up.
****************************************************************************/
const PVRTuint8 c_u8UTF8Lengths[256] = 
{
    // 0x00 - 0x7F: single-byte characters, no tail bytes
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    // 0x80 - 0xBF: tail bytes, not valid as the first byte of a sequence
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    // 0xC0 - 0xDF: lead byte followed by 1 tail byte
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    // 0xE0 - 0xEF: lead byte followed by 2 tail bytes
    2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
    // 0xF0 - 0xF7: lead byte followed by 3 tail bytes; 0xF8 - 0xFF: rejected
    3,3,3,3,3,3,3,3,0,0,0,0,0,0,0,0,
};
 
/****************************************************************************
** A table which allows quick lookup to determine whether a UTF8 sequence
** is 'overlong'.
****************************************************************************/
const PVRTuint32 c_u32MinVals[4] =
{
    0x00000000,        // 0 tail bytes
    0x00000080,        // 1 tail bytes
    0x00000800,        // 2 tail bytes
    0x00010000,        // 3 tail bytes
};
 
/*!***************************************************************************
 @Function            CheckGenericUnicode
 @Input                c32            A UTF32 character/Unicode code point
 @Returns            Success or failure. 
 @Description        Checks that the decoded code point is valid.
*****************************************************************************/
static bool CheckGenericUnicode(PVRTuint32 c32)
{
    // Check that this value isn't a UTF16 surrogate mask.
    if(c32 >= UTF16_SURG_H_MARK && c32 <= UTF16_SURG_L_END)
        return false;
    // Check non-char values
    if(c32 >= UNICODE_NONCHAR_MARK && c32 <= UNICODE_NONCHAR_END)
        return false;
    // Check reserved values
    if((c32 & UNICODE_RESERVED) == UNICODE_RESERVED)
        return false;
    // Check max value.
    if(c32 > UNICODE_MAX)
        return false;
 
    return true;
}
 
/*!***************************************************************************
 @Function            PVRTUnicodeUTF8ToUTF32
 @Input                pUTF8            A UTF8 string, which is null terminated.
 @Output            aUTF32            An array of Unicode code points.
 @Returns            Success or failure. 
 @Description        Decodes a UTF8-encoded string in to Unicode code points
                    (UTF32). If pUTF8 is not null terminated, the results are 
                    undefined.
*****************************************************************************/
EPVRTError PVRTUnicodeUTF8ToUTF32(const PVRTuint8* const pUTF8, CPVRTArray<PVRTuint32>& aUTF32)                        
{
    unsigned int uiTailLen, uiIndex;
    unsigned int uiBytes = (unsigned int) strlen((const char*)pUTF8);
    PVRTuint32 c32;
 
    const PVRTuint8* pC = pUTF8;
    while(*pC)
    {
        // Quick optimisation for ASCII characters
        while(*pC && *pC < VALID_ASCII)
        {
            aUTF32.Append(*pC++);
        }
        // Done
        if(!*pC)            
            break;
 
        c32 = *pC++;
        uiTailLen = c_u8UTF8Lengths[c32];
 
        // Check for invalid tail length. Maximum 4 bytes for each UTF8 character.
        // Also check to make sure the tail length is inside the provided buffer.
        if(uiTailLen == 0 || (pC + uiTailLen > pUTF8 + uiBytes))
            return PVR_OVERFLOW;
 
        c32 &= (TAIL_MASK >> uiTailLen);    // Get the data out of the first byte. This depends on the length of the tail.
 
        // Get the data out of each tail byte
        uiIndex = 0;
        while(uiIndex < uiTailLen)
        {
            if((pC[uiIndex] & 0xC0) != 0x80)
                return PVR_FAIL;        // Invalid tail byte!
 
            c32 = (c32 << BYTES_PER_TAIL) + (pC[uiIndex] & TAIL_MASK);
            uiIndex++;
        }
 
        pC += uiIndex;
 
        // Check overlong values.
        if(c32 < c_u32MinVals[uiTailLen])
            return PVR_FAIL;        
        
        if(!CheckGenericUnicode(c32))
            return PVR_FAIL;
 
        // OK
        aUTF32.Append(c32);
    }
 
    return PVR_SUCCESS;
}
 
/*!***************************************************************************
 @Function            PVRTUnicodeUTF16ToUTF32
 @Input                pUTF16            A UTF16 string, which is null terminated.
 @Output            aUTF32            An array of Unicode code points.
 @Returns            Success or failure. 
 @Description        Decodes a UTF16-encoded string in to Unicode code points
                    (UTF32). If pUTF16 is not null terminated, the results are 
                    undefined.
*****************************************************************************/
EPVRTError PVRTUnicodeUTF16ToUTF32(const PVRTuint16* const pUTF16, CPVRTArray<PVRTuint32>& aUTF32)
{
    const PVRTuint16* pC = pUTF16;
 
    // Determine the number of shorts
    while(*++pC && (pC - pUTF16) < MAX_LEN);
    unsigned int uiBufferLen = (unsigned int) (pC - pUTF16);
 
    if(uiBufferLen == MAX_LEN)
        return PVR_OVERFLOW;        // Probably not NULL terminated.    
 
    // Reset to start.
    pC = pUTF16;
 
    PVRTuint32 c32;
    while(*pC)
    {
        // Straight copy. We'll check for surrogate pairs next...
        c32 = *pC++;
 
        // Check surrogate pair
        if(c32 >= UTF16_SURG_H_MARK && c32 <= UTF16_SURG_H_END)
        {
            // Make sure the next 2 bytes are in range...
            if(pC + 1 > pUTF16 + uiBufferLen || *pC == 0)
                return PVR_OVERFLOW;
 
            // Check that the next value is in the low surrogate range
            if(*pC < UTF16_SURG_L_MARK || *pC > UTF16_SURG_L_END)
                return PVR_FAIL;
 
            // Decode
            c32 = ((c32 - UTF16_SURG_H_MARK) << 10) + (*pC - UTF16_SURG_L_MARK) + 0x10000;
            pC++;
        }
 
        if(!CheckGenericUnicode(c32))
            return PVR_FAIL;
 
        // OK
        aUTF32.Append(c32);
    }
 
    return PVR_SUCCESS;
}
 
/*!***************************************************************************
 @Function            PVRTUnicodeUTF8Length
 @Input                pUTF8            A UTF8 string, which is null terminated.
 @Returns            The length of the string, in Unicode code points.
 @Description        Calculates the length of a UTF8 string. If pUTF8 is 
                    not null terminated, the results are undefined.
*****************************************************************************/
unsigned int PVRTUnicodeUTF8Length(const PVRTuint8* const pUTF8)
{
    const PVRTuint8* pC = pUTF8;
 
    unsigned int charCount = 0;
    unsigned int mask;
    while(*pC)
    {
        // Quick optimisation for ASCII characters
        const PVRTuint8* pStart = pC;
        while(*pC && *pC < VALID_ASCII)
            pC++;
 
        charCount += (unsigned int) (pC - pStart);
 
        // Done
        if(!*pC)    
            break;
        
        mask = *pC & 0xF0;
        switch(mask)
        {
        case 0xF0: pC++;
        case 0xE0: pC++;
        case 0xC0: pC++;
            break;
        default:
            _ASSERT(!"Invalid tail byte!");
            return 0;
        }
 
        pC++;
        charCount++;
    }
 
    return charCount;
}
 
/*!***************************************************************************
 @Function            PVRTUnicodeUTF16Length
 @Input                pUTF16            A UTF16 string, which is null terminated.
 @Returns            The length of the string, in Unicode code points.
 @Description        Calculates the length of a UTF16 string.
                    If pUTF16 is not null terminated, the results are 
                    undefined.
*****************************************************************************/
unsigned int PVRTUnicodeUTF16Length(const PVRTuint16* const pUTF16)
{
    const PVRTuint16* pC = pUTF16;    
    unsigned int charCount = 0;
    while(*pC && (pC - pUTF16) < MAX_LEN)
    {
        if(    pC[0] >= UTF16_SURG_H_MARK && pC[0] <= UTF16_SURG_H_END
         && pC[1] >= UTF16_SURG_L_MARK && pC[0] <= UTF16_SURG_L_END)
        {
            pC += 2;
        }
        else
        {
            pC += 1;
        }
 
        charCount++;
    }
 
    return charCount;
}
 
/*!***************************************************************************
 @Function            PVRTUnicodeValidUTF8
 @Input                pUTF8            A UTF8 string, which is null terminated.
 @Returns            true or false
 @Description        Checks whether the encoding of a UTF8 string is valid.
                    If pUTF8 is not null terminated, the results are undefined.
*****************************************************************************/
bool PVRTUnicodeValidUTF8(const PVRTuint8* const pUTF8)
{
    unsigned int uiTailLen, uiIndex;
    unsigned int uiBytes = (unsigned int) strlen((const char*)pUTF8);
    const PVRTuint8* pC = pUTF8;
    while(*pC)
    {
        // Quick optimisation for ASCII characters
        while(*pC && *pC < VALID_ASCII)    pC++;
        // Done?
        if(!*pC)            
            break;
 
        PVRTuint32 c32 = *pC++;
        uiTailLen = c_u8UTF8Lengths[c32];
 
        // Check for invalid tail length. Maximum 4 bytes for each UTF8 character.
        // Also check to make sure the tail length is inside the provided buffer.
        if(uiTailLen == 0 || (pC + uiTailLen > pUTF8 + uiBytes))
            return false;
 
        // Get the data out of each tail byte
        uiIndex = 0;
        while(uiIndex < uiTailLen)
        {
            if((pC[uiIndex] & 0xC0) != 0x80)
                return false;        // Invalid tail byte!
            
            c32 = (c32 << BYTES_PER_TAIL) + (pC[uiIndex] & TAIL_MASK);
            uiIndex++;
        }
        
        pC += uiIndex;
 
        // Check overlong values.
        if(c32 < c_u32MinVals[uiTailLen])
            return false;        
        if(!CheckGenericUnicode(c32))
            return false;
    }
 
    return true;
}
 
/*****************************************************************************
 End of file (PVRTUnicode.cpp)
*****************************************************************************/