/*
 * Surrogate escaping of byte strings (the so-called wtf-8 encoding).
 *
 * Object names are expected to be utf8, but some clients hand us other
 * data (for example cp1252 passed in as if it were utf8).
 * s3fs_surrogateescape() rewrites every byte that is not part of a
 * structurally complete utf8 sequence as a lone low surrogate
 * U+DC00 + byte (U+DC80..U+DCFF), stored as a three byte utf8 sequence.
 * s3fs_surrogatedecode() performs the reverse mapping.
 *
 * Only the structure of a sequence (lead byte followed by the announced
 * number of continuation bytes) is checked; overlong forms and code points
 * above U+10FFFF are passed through unchanged.
 */

// Lone surrogates used to carry the escaped bytes 0x80..0xff.
static const unsigned S3FS_ESCAPE_BASE  = 0xdc00;
static const unsigned S3FS_ESCAPE_FIRST = 0xdc80;
static const unsigned S3FS_ESCAPE_LAST  = 0xdcff;

// True when ch has the 10xxxxxx form of a utf8 continuation byte.
static bool s3fs_is_utf8_continuation(char ch)
{
    return (static_cast<unsigned char>(ch) & 0xc0) == 0x80;
}

// Number of continuation bytes announced by a lead byte, 0 if the byte
// is not the lead byte of a two, three or four byte sequence.
static std::string::size_type s3fs_utf8_tail_length(unsigned char lead)
{
    if ((lead & 0xe0) == 0xc0) {
        return 1;
    }
    if ((lead & 0xf0) == 0xe0) {
        return 2;
    }
    if ((lead & 0xf8) == 0xf0) {
        return 3;
    }
    return 0;
}

// Code point stored in the three byte sequence starting at pos.
// The caller guarantees that pos + 2 is inside the string.
static unsigned s3fs_utf8_decode3(const std::string &s, std::string::size_type pos)
{
    unsigned cp = (static_cast<unsigned char>(s[pos]) & 0x0fu) << 12;
    cp |= (static_cast<unsigned char>(s[pos + 1]) & 0x3fu) << 6;
    cp |= (static_cast<unsigned char>(s[pos + 2]) & 0x3fu);
    return cp;
}

/*
 * Escape every byte of s that is not part of a complete utf8 sequence.
 *
 * A three byte sequence that already encodes a code point in the escape
 * range U+DC80..U+DCFF is treated as invalid and escaped byte by byte.
 * Passing it through would make it indistinguishable from an escaped byte,
 * and s3fs_surrogatedecode() would then collapse it into a single byte.
 *
 * Returns the escaped string; input consisting only of complete sequences
 * outside the escape range is returned unchanged.
 */
std::string s3fs_surrogateescape(const std::string &s)
{
    const std::string::size_type len = s.length();
    std::string result;
    result.reserve(len);

    for (std::string::size_type i = 0; i < len; i++) {
        const unsigned char c = static_cast<unsigned char>(s[i]);

        // single byte encoding
        if (c <= 0x7f) {
            result += s[i];
            continue;
        }

        // multi byte encoding: all announced continuation bytes must be present
        const std::string::size_type tail = s3fs_utf8_tail_length(c);
        bool complete = (tail != 0) && (tail < len - i);
        for (std::string::size_type k = 1; complete && k <= tail; k++) {
            complete = s3fs_is_utf8_continuation(s[i + k]);
        }
        // an already encoded escape surrogate must not pass through as is
        if (complete && tail == 2) {
            const unsigned cp = s3fs_utf8_decode3(s, i);
            if (cp >= S3FS_ESCAPE_FIRST && cp <= S3FS_ESCAPE_LAST) {
                complete = false;
            }
        }
        if (complete) {
            result.append(s, i, tail + 1);
            i += tail;
            continue;
        }

        // Invalid byte (always >= 0x80 here). Output the lone surrogate
        // as a three byte utf8 encoding and resume with the next byte.
        const unsigned surr = S3FS_ESCAPE_BASE + c;
        result += static_cast<char>(0xe0 | ((surr >> 12) & 0x0f));
        result += static_cast<char>(0x80 | ((surr >> 6) & 0x3f));
        result += static_cast<char>(0x80 | (surr & 0x3f));
    }
    return result;
}

/*
 * The reverse operation: every three byte sequence encoding a lone
 * surrogate in U+DC80..U+DCFF is replaced by the single byte it stands
 * for (the low eight bits of the code point). All other bytes are copied
 * unchanged.
 */
std::string s3fs_surrogatedecode(const std::string &s)
{
    const std::string::size_type len = s.length();
    std::string result;
    result.reserve(len);

    for (std::string::size_type i = 0; i < len; i++) {
        const unsigned char c = static_cast<unsigned char>(s[i]);

        // look for a three byte encoding matching a lone surrogate
        const bool three_bytes = (c & 0xf0) == 0xe0
                              && 2 < len - i
                              && s3fs_is_utf8_continuation(s[i + 1])
                              && s3fs_is_utf8_continuation(s[i + 2]);
        if (three_bytes) {
            const unsigned surr = s3fs_utf8_decode3(s, i);
            if (surr >= S3FS_ESCAPE_FIRST && surr <= S3FS_ESCAPE_LAST) {
                // convert back to the original byte
                result += static_cast<char>(surr & 0xff);
                i += 2;
                continue;
            }
        }
        result += s[i];
    }
    return result;
}
a/src/test_string_util.cpp +++ b/src/test_string_util.cpp @@ -87,10 +87,35 @@ void test_strtoofft() ASSERT_EQUALS(s3fs_strtoofft("deadbeef", /*is_base_16=*/ true), static_cast(3735928559L)); } +void test_surrogateescape() +{ + std::string ascii("normal string"); + std::string utf8("Hyld\xc3\xbdpi \xc3\xbej\xc3\xb3\xc3\xb0""f\xc3\xa9lagsins vex \xc3\xbar k\xc3\xa6rkomnu b\xc3\xb6li \xc3\xad \xc3\xa1st"); + std::string cp1252("Hyld\xfdpi \xfej\xf3\xf0""f\xe9lagsins vex \xfar k\xe6rkomnu b\xf6li \xed \xe1st"); + std::string broken = utf8; + broken[14] = 0x97; + std::string mixed = ascii + utf8 + cp1252; + + ASSERT_EQUALS(s3fs_surrogateescape(ascii), ascii); + ASSERT_EQUALS(s3fs_surrogatedecode(ascii), ascii); + ASSERT_EQUALS(s3fs_surrogateescape(utf8), utf8); + ASSERT_EQUALS(s3fs_surrogatedecode(utf8), utf8); + + ASSERT_NEQUALS(s3fs_surrogateescape(cp1252), cp1252); + ASSERT_EQUALS(s3fs_surrogatedecode(s3fs_surrogateescape(cp1252)), cp1252); + + ASSERT_NEQUALS(s3fs_surrogateescape(broken), broken); + ASSERT_EQUALS(s3fs_surrogatedecode(s3fs_surrogateescape(broken)), broken); + + ASSERT_NEQUALS(s3fs_surrogateescape(mixed), mixed); + ASSERT_EQUALS(s3fs_surrogatedecode(s3fs_surrogateescape(mixed)), mixed); +} + int main(int argc, char *argv[]) { test_trim(); test_base64(); test_strtoofft(); + test_surrogateescape(); return 0; } diff --git a/src/test_util.h b/src/test_util.h index 9a83edd..f405297 100644 --- a/src/test_util.h +++ b/src/test_util.h @@ -20,11 +20,50 @@ #include #include +#include template void assert_equals(const T &x, const T &y, const char *file, int line) { if (x != y) { std::cerr << x << " != " << y << " at " << file << ":" << line << std::endl; + std::cerr << std::endl; + std::exit(1); + } +} + +template <> void assert_equals(const std::string &x, const std::string &y, const char *file, int line) +{ + if (x != y) { + std::cerr << x << " != " << y << " at " << file << ":" << line << std::endl; + for (unsigned i=0; i void assert_nequals(const T &x, const 
T &y, const char *file, int line) +{ + if (x == y) { + std::cerr << x << " == " << y << " at " << file << ":" << line << std::endl; + std::exit(1); + } +} + +template <> void assert_nequals(const std::string &x, const std::string &y, const char *file, int line) +{ + if (x == y) { + std::cerr << x << " == " << y << " at " << file << ":" << line << std::endl; + for (unsigned i=0; i