Adding utility functions to convert invalid utf8 to wtf8 encoding

This is to deal with windows clients who pass in cp1252 as if it
were utf8
This commit is contained in:
Kristján Valur Jónsson
2019-02-15 15:57:03 +00:00
parent 951761ee2c
commit ca2d1d873d
4 changed files with 160 additions and 0 deletions

View File

@ -451,6 +451,96 @@ unsigned char* s3fs_decode64(const char* input, size_t* plength)
return result;
}
/* handle invalid utf8 by creating surrogate escape pairs.
* this converts the data into the so-called wtf-8 encoding.
* It is necessary if we are given data that isn't proper utf8
* but the aws api requires proper utf8 for object names
*/
string s3fs_surrogateescape(const string &s)
{
// Pass valid utf8 code through
string result;
for (unsigned i = 0; i < s.length(); i++) {
unsigned char c = s[i];
// single byte encoding
if (c <= 0x7f) {
result += c;
continue;
}
// two byte encoding
if ((c & 0xe0) == 0xc0) {
if ((i + 1) < s.length() && (s[i+1] & 0xc0) == 0x80) {
// printf("two bytes %02x at %d\n", c, i);
result += c;
result += s[++i];
continue;
}
}
// three byte encoding
if ((c & 0xf0) == 0xe0) {
if ((i + 2) < s.length() && (s[i+1] & 0xc0) == 0x80 && (s[i+2] & 0xc0) == 0x80) {
// printf("three bytes %02x at %d\n", c, i);
result += c;
result += s[++i];
result += s[++i];
continue;
}
}
// four byte encoding
if ((c & 0xf8) == 0xf0) {
if ((i + 3) < s.length() && (s[i+1] & 0xc0) == 0x80 && (s[i+2] & 0xc0) == 0x80 && (s[i+3] & 0xc0) == 0x80) {
// printf("four bytes %02x at %d\n", c, i);
result += c;
result += s[++i];
result += s[++i];
result += s[++i];
continue;
}
}
// printf("invalid %02x at %d\n", c, i);
// Invalid utf8 code. Convert to the surrogate pair (also known as wtf-8 encoding)
// we use lone surrogates, UDC80-UDCFF for this.
// if the byte is below 128, we cannot do this so we just pass the byte through and hope
// for the best, but really, this should be an error
if (c < 128) {
result += c;
continue;
}
// output the lone surrogate as utf8 encoded. This is a three byte utf8 encoding:
unsigned surr = 0xdc00 + c;
result += 0xe0 | ((surr >> 12) & 0x0f);
result += 0x80 | ((surr >> 06) & 0x3f);
result += 0x80 | ((surr >> 00) & 0x3f);
}
return result;
}
string s3fs_surrogatedecode(const string &s)
{
// the reverse operation. Look for lone surrogates and replace them
string result;
for (unsigned i = 0; i < s.length(); i++) {
unsigned char c = s[i];
// look for a three byte encoding matching a lone surrogate
// three byte encoding
if ((c & 0xf0) == 0xe0) {
if ((i + 2) < s.length() && (s[i+1] & 0xc0) == 0x80 && (s[i+2] & 0xc0) == 0x80) {
unsigned surr = (c & 0x0f) << 12;
surr |= (s[i+1] & 0x3f) << 6;
surr |= (s[i+2] & 0x3f) << 0;
if (surr >= 0xdc80 && surr <= 0xdcff) {
// convert back
result += surr & 0xff;
i+=2;
continue;
}
}
}
result += c;
}
return result;
}
/*
* Local variables:
* tab-width: 4