C语言判断 char* 是否是标准 UTF-8 字符串，同时把不标准的字节替换为 '.'（0x2e）。

/* Byte written over every position that is not part of a well-formed
 * sequence; 0x2e is ASCII '.'. */
#define UNDEFINECHAR (0x2e)

/* WK_TRUE is not defined anywhere in this snippet; it presumably comes from a
 * project header. This fallback only takes effect when no definition is
 * visible, so the code also builds standalone.
 * NOTE(review): assumes the project's WK_TRUE is 1 -- confirm. */
#ifndef WK_TRUE
#define WK_TRUE 1
#endif

/*
 * Length in bytes of the UTF-8 sequence introduced by `lead`:
 *   0xxxxxxx -> 1 (ASCII), 110xxxxx -> 2, 1110xxxx -> 3.
 * Returns 0 for every other byte: stray continuation bytes (10xxxxxx) and
 * all lead bytes >= 0xF0.
 */
static int Utf8SequenceLength(unsigned char lead)
{
    if ((lead & 0x80) == 0x00)
    {
        return 1;
    }
    if ((lead & 0xe0) == 0xc0)
    {
        return 2;
    }
    if ((lead & 0xf0) == 0xe0)
    {
        return 3;
    }
    return 0;
}

/*
 * Checks that the first `size` bytes of `pBuffer` consist only of 1-, 2- and
 * 3-byte UTF-8 sequences, and overwrites every offending byte with
 * UNDEFINECHAR so that the buffer is well-formed afterwards.
 *
 * Parameters:
 *   pBuffer - bytes to check. Although declared const, the buffer IS modified
 *             in place when invalid bytes are found, so the caller must pass
 *             writable memory.
 *   size    - number of bytes to examine; no terminator is required.
 *
 * Returns WK_TRUE when nothing had to be replaced (including a NULL buffer or
 * size <= 0), and 0 when at least one byte was replaced.
 *
 * Replacement rules:
 *   - a byte that cannot start a sequence is replaced on its own;
 *   - a lead byte whose continuation bytes are not all 10xxxxxx is replaced
 *     on its own, and scanning resumes at the very next byte;
 *   - a sequence cut off by the end of the buffer has all of its remaining
 *     bytes replaced.
 *
 * NOTE(review): 4-byte sequences (lead bytes 0xF0 and above) are treated as
 * invalid, and overlong or surrogate encodings are not rejected. Both match
 * the previous behaviour; confirm whether that is intended.
 */
int IsUTF8V2(const void* pBuffer, int size)
{
    unsigned char* cur;
    unsigned char* end;
    int isUTF8 = WK_TRUE;

    /* Nothing to scan; also avoids forming an out-of-range end pointer. */
    if (!pBuffer || size <= 0)
    {
        return WK_TRUE;
    }

    cur = (unsigned char*)pBuffer;
    end = cur + size;

    while (cur < end)
    {
        int len = Utf8SequenceLength(*cur);
        int wellFormed = 1;
        int i;

        if (len == 0)
        {
            /* Not a valid first byte of any supported sequence. */
            *cur++ = UNDEFINECHAR;
            isUTF8 = 0;
            continue;
        }

        if (end - cur < len)
        {
            /* Sequence is truncated by the end of the buffer: blank out the
             * tail and stop. Stopping here is essential -- the continuation
             * bytes that would be inspected next lie outside the buffer. */
            while (cur < end)
            {
                *cur++ = UNDEFINECHAR;
            }
            isUTF8 = 0;
            break;
        }

        for (i = 1; i < len; i++)
        {
            if ((cur[i] & 0xc0) != 0x80)
            {
                wellFormed = 0;
                break;
            }
        }

        if (!wellFormed)
        {
            /* Only the lead byte is replaced; the following bytes are
             * re-examined on their own because they may start a valid
             * sequence themselves. */
            *cur++ = UNDEFINECHAR;
            isUTF8 = 0;
            continue;
        }

        cur += len;
    }

    return isUTF8;
}