diff --git a/.gitmodules b/.gitmodules index b62ed1f30..40b89e748 100644 --- a/.gitmodules +++ b/.gitmodules @@ -28,3 +28,7 @@ [submodule "third-party/googletest"] path = third-party/googletest url = https://github.com/google/googletest.git +[submodule "third-party/utfcpp"] + branch = post-3.2.1-transmission + path = third-party/utfcpp + url = https://github.com/transmission/utfcpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 69a55ca56..5e4304384 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -142,6 +142,7 @@ if(WIN32) endforeach() endif() +find_package(UtfCpp) find_package(Threads) find_package(PkgConfig QUIET) diff --git a/Transmission.xcodeproj/project.pbxproj b/Transmission.xcodeproj/project.pbxproj index 5f428ad20..74c4e8e4d 100644 --- a/Transmission.xcodeproj/project.pbxproj +++ b/Transmission.xcodeproj/project.pbxproj @@ -206,8 +206,6 @@ A2A1CB7A0BF29D5500AE959F /* PeerProgressIndicatorCell.mm in Sources */ = {isa = PBXBuildFile; fileRef = A2A1CB780BF29D5500AE959F /* PeerProgressIndicatorCell.mm */; }; A2A4E9210DE0F7E9000CE197 /* web.h in Headers */ = {isa = PBXBuildFile; fileRef = A29EBE530DC01FC9006CEE80 /* web.h */; }; A2A4E9220DE0F7EB000CE197 /* web.cc in Sources */ = {isa = PBXBuildFile; fileRef = A29EBE520DC01FC9006CEE80 /* web.cc */; }; - A2A4EA0E0DE106EB000CE197 /* ConvertUTF.c in Sources */ = {isa = PBXBuildFile; fileRef = A2A4EA0A0DE106E8000CE197 /* ConvertUTF.c */; }; - A2A4EA0F0DE106EE000CE197 /* ConvertUTF.h in Headers */ = {isa = PBXBuildFile; fileRef = A2A4EA0B0DE106E8000CE197 /* ConvertUTF.h */; }; A2A6321B0CD9751700E3DA60 /* BadgeView.mm in Sources */ = {isa = PBXBuildFile; fileRef = A2A6321A0CD9751700E3DA60 /* BadgeView.mm */; }; A2A7B32A164F87D400B98C65 /* jsonsl.c in Sources */ = {isa = PBXBuildFile; fileRef = A2A7B328164F87D400B98C65 /* jsonsl.c */; }; A2A7B32B164F87D400B98C65 /* jsonsl.h in Headers */ = {isa = PBXBuildFile; fileRef = A2A7B329164F87D400B98C65 /* jsonsl.h */; }; @@ -830,8 +828,6 @@ A2A1C81D142EC032008C17BF /* nl */ = {isa = PBXFileReference; lastKnownFileType = file.xib; name = nl; path = nl.lproj/GlobalOptionsPopover.xib; sourceTree = ""; }; A2A1CB770BF29D5500AE959F /* PeerProgressIndicatorCell.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = PeerProgressIndicatorCell.h; sourceTree = ""; }; A2A1CB780BF29D5500AE959F /* PeerProgressIndicatorCell.mm */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.objcpp; path = PeerProgressIndicatorCell.mm; sourceTree = ""; }; - A2A4EA0A0DE106E8000CE197 /* ConvertUTF.c */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.c; path = ConvertUTF.c; sourceTree = ""; }; - A2A4EA0B0DE106E8000CE197 /* ConvertUTF.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = ConvertUTF.h; sourceTree = ""; }; A2A632190CD9751700E3DA60 /* BadgeView.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = BadgeView.h; sourceTree = ""; }; A2A6321A0CD9751700E3DA60 /* BadgeView.mm */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.objcpp; path = BadgeView.mm; sourceTree = ""; }; A2A7B328164F87D400B98C65 /* jsonsl.c */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.c; path = jsonsl.c; sourceTree = ""; }; @@ -1511,8 +1507,6 @@ A2EA52301686AC0D00180493 /* quark.h */, A2AF23C616B44FA0003BC59E /* log.cc */, A2AF23C716B44FA0003BC59E /* log.h */, - A2A4EA0B0DE106E8000CE197 /* ConvertUTF.h */, - A2A4EA0A0DE106E8000CE197 /* ConvertUTF.c */, 4DB74F070E8CD75100AEB1A8 /* wildmat.c */, C1FEE5751C3223CC00D62832 /* watchdir.cc */, C1FEE5761C3223CC00D62832 /* watchdir.h */, @@ -1884,7 +1878,6 @@ C1FEE57B1C3223CC00D62832 /* watchdir.h in Headers */, A2AAB6650DE0D08B00E04DDA /* blocklist.h in Headers */, A2A4E9210DE0F7E9000CE197 /* web.h in Headers */, - A2A4EA0F0DE106EE000CE197 /* ConvertUTF.h in Headers */, A25E03E20E4015380086C225 /* tr-getopt.h in Headers */, A21FBBAB0EDA78C300BC3C51 /* bandwidth.h in Headers */, A22CFCA90FC24ED80009BD3E /* tr-dht.h in Headers */, @@ -2477,7 +2470,6 @@ 4D4ADFC70DA1631500A68297 /* blocklist.cc in Sources */, A29DF8B90DB2544C00D04E5A /* resume.cc in Sources */, A2A4E9220DE0F7EB000CE197 /* web.cc in Sources */, - A2A4EA0E0DE106EB000CE197 /* ConvertUTF.c in Sources */, A292A6E80DFB45FC004B9C0A /* webseed.cc in Sources */, A25E03E30E4015380086C225 /* tr-getopt.cc in Sources */, C1305EBE186A13B100F03351 /* file.cc in Sources */, @@ -3034,6 +3026,7 @@ "third-party/libb64/include", "third-party/libevent/include", "third-party/libutp", + "third-party/utfcpp/source", ); OTHER_CFLAGS = ( "$(inherited)", @@ -3227,6 +3220,7 @@ "third-party/libb64/include", "third-party/libevent/include", "third-party/libutp", + "third-party/utfcpp/source", ); OTHER_CFLAGS = ( "$(inherited)", @@ -3477,6 +3471,7 @@ "third-party/libb64/include", "third-party/libevent/include", "third-party/libutp", + "third-party/utfcpp/source", ); OTHER_CFLAGS = ( "$(inherited)", diff --git a/cmake/FindUtfCpp.cmake b/cmake/FindUtfCpp.cmake new file mode 100644 index 000000000..80384d66a --- /dev/null +++ b/cmake/FindUtfCpp.cmake @@ -0,0 +1 @@ +set(UTFCPP_INCLUDE_DIRS ${CMAKE_SOURCE_DIR}/third-party/utfcpp/source) diff --git a/libtransmission/CMakeLists.txt b/libtransmission/CMakeLists.txt index c57d45fa6..82d6cf0d6 100644 --- a/libtransmission/CMakeLists.txt +++ b/libtransmission/CMakeLists.txt @@ -87,7 +87,6 @@ foreach(FILE ${PROJECT_FILES}) endforeach() set(THIRD_PARTY_FILES - ConvertUTF.c jsonsl.c wildmat.c ) @@ -144,7 +143,6 @@ set(${PROJECT_NAME}_PUBLIC_HEADERS ) set(${PROJECT_NAME}_PRIVATE_HEADERS - ConvertUTF.h announcer-common.h announcer.h bandwidth.h @@ -244,6 +242,7 @@ include_directories( include_directories( SYSTEM + ${UTFCPP_INCLUDE_DIRS} ${ZLIB_INCLUDE_DIRS} ${CRYPTO_INCLUDE_DIRS} ${CURL_INCLUDE_DIRS} diff --git a/libtransmission/ConvertUTF.c b/libtransmission/ConvertUTF.c deleted file mode 100644 index 517714503..000000000 --- a/libtransmission/ConvertUTF.c +++ /dev/null @@ -1,602 +0,0 @@ -/* - * Copyright 2001-2004 Unicode, Inc. - * - * Disclaimer - * - * This source code is provided as is by Unicode, Inc. No claims are - * made as to fitness for any particular purpose. No warranties of any - * kind are expressed or implied. The recipient agrees to determine - * applicability of information provided. If this file has been - * purchased on magnetic or optical media from Unicode, Inc., the - * sole remedy for any claim will be exchange of defective media - * within 90 days of receipt. - * - * Limitations on Rights to Redistribute This Code - * - * Unicode, Inc. hereby grants the right to freely use the information - * supplied in this file in the creation of products supporting the - * Unicode Standard, and to make copies of this file in any form - * for internal or external distribution as long as this notice - * remains attached. - */ - -/* --------------------------------------------------------------------- - - Conversions between UTF32, UTF-16, and UTF-8. Source code file. - Author: Mark E. Davis, 1994. - Rev History: Rick McGowan, fixes & updates May 2001. - Sept 2001: fixed const & error conditions per - mods suggested by S. Parent & A. Lillich. - June 2002: Tim Dodd added detection and handling of incomplete - source sequences, enhanced error detection, added casts - to eliminate compiler warnings. - July 2003: slight mods to back out aggressive FFFE detection. - Jan 2004: updated switches in from-UTF8 conversions. - Oct 2004: updated to use UNI_MAX_LEGAL_UTF32 in UTF-32 conversions. - May 2006: updated isLegalUTF8Sequence. - - See the header file "ConvertUTF.h" for complete documentation. - ------------------------------------------------------------------------- */ - -#ifdef CVTUTF_DEBUG - #include -#endif -#include /* strlen () */ - -#include "ConvertUTF.h" - -static const int halfShift = 10; /* used for shifting by 10 bits */ - -static const UTF32 halfBase = 0x0010000UL; -static const UTF32 halfMask = 0x3FFUL; - -#define UNI_SUR_HIGH_START (UTF32)0xD800 -#define UNI_SUR_HIGH_END (UTF32)0xDBFF -#define UNI_SUR_LOW_START (UTF32)0xDC00 -#define UNI_SUR_LOW_END (UTF32)0xDFFF -#define false 0 -#define true 1 - -/* --------------------------------------------------------------------- */ - -ConversionResult ConvertUTF32toUTF16 ( - const UTF32** sourceStart, const UTF32* sourceEnd, - UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags) { - ConversionResult result = conversionOK; - const UTF32* source = *sourceStart; - UTF16* target = *targetStart; - while (source < sourceEnd) { - UTF32 ch; - if (target >= targetEnd) { - result = targetExhausted; break; - } - ch = *source++; - if (ch <= UNI_MAX_BMP) { /* Target is a character <= 0xFFFF */ - /* UTF-16 surrogate values are illegal in UTF-32; 0xffff or 0xfffe are both reserved values */ - if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) { - if (flags == strictConversion) { - --source; /* return to the illegal value itself */ - result = sourceIllegal; - break; - } else { - *target++ = UNI_REPLACEMENT_CHAR; - } - } else { - *target++ = (UTF16)ch; /* normal case */ - } - } else if (ch > UNI_MAX_LEGAL_UTF32) { - if (flags == strictConversion) { - result = sourceIllegal; - } else { - *target++ = UNI_REPLACEMENT_CHAR; - } - } else { - /* target is a character in range 0xFFFF - 0x10FFFF. */ - if (target + 1 >= targetEnd) { - --source; /* Back up source pointer! */ - result = targetExhausted; break; - } - ch -= halfBase; - *target++ = (UTF16)((ch >> halfShift) + UNI_SUR_HIGH_START); - *target++ = (UTF16)((ch & halfMask) + UNI_SUR_LOW_START); - } - } - *sourceStart = source; - *targetStart = target; - return result; -} - -/* --------------------------------------------------------------------- */ - -ConversionResult ConvertUTF16toUTF32 ( - const UTF16** sourceStart, const UTF16* sourceEnd, - UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags) { - ConversionResult result = conversionOK; - const UTF16* source = *sourceStart; - UTF32* target = *targetStart; - UTF32 ch, ch2; - while (source < sourceEnd) { - const UTF16* oldSource = source; /* In case we have to back up because of target overflow. */ - ch = *source++; - /* If we have a surrogate pair, convert to UTF32 first. */ - if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) { - /* If the 16 bits following the high surrogate are in the source buffer... */ - if (source < sourceEnd) { - ch2 = *source; - /* If it's a low surrogate, convert to UTF32. */ - if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) { - ch = ((ch - UNI_SUR_HIGH_START) << halfShift) - + (ch2 - UNI_SUR_LOW_START) + halfBase; - ++source; - } else if (flags == strictConversion) { /* it's an unpaired high surrogate */ - --source; /* return to the illegal value itself */ - result = sourceIllegal; - break; - } - } else { /* We don't have the 16 bits following the high surrogate. */ - --source; /* return to the high surrogate */ - result = sourceExhausted; - break; - } - } else if (flags == strictConversion) { - /* UTF-16 surrogate values are illegal in UTF-32 */ - if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) { - --source; /* return to the illegal value itself */ - result = sourceIllegal; - break; - } - } - if (target >= targetEnd) { - source = oldSource; /* Back up source pointer! */ - result = targetExhausted; break; - } - *target++ = ch; - } - *sourceStart = source; - *targetStart = target; -#ifdef CVTUTF_DEBUG -if (result == sourceIllegal) { - fprintf (stderr, "ConvertUTF16toUTF32 illegal seq 0x%04x,%04x\n", ch, ch2); - fflush (stderr); -} -#endif - return result; -} - -/* --------------------------------------------------------------------- */ - -/* - * Index into the table below with the first byte of a UTF-8 sequence to - * get the number of trailing bytes that are supposed to follow it. - * Note that *legal* UTF-8 values can't have 4 or 5-bytes. The table is - * left as-is for anyone who may want to do such conversion, which was - * allowed in earlier algorithms. - */ -static const char trailingBytesForUTF8[256] = { - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, - 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, - 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5 -}; - -/* - * Magic values subtracted from a buffer value during UTF8 conversion. - * This table contains as many values as there might be trailing bytes - * in a UTF-8 sequence. - */ -static const UTF32 offsetsFromUTF8[6] = { 0x00000000UL, 0x00003080UL, 0x000E2080UL, - 0x03C82080UL, 0xFA082080UL, 0x82082080UL }; - -/* - * Once the bits are split out into bytes of UTF-8, this is a mask OR-ed - * into the first byte, depending on how many bytes follow. There are - * as many entries in this table as there are UTF-8 sequence types. - * (I.e., one byte sequence, two byte... etc.). Remember that sequencs - * for *legal* UTF-8 will be 4 or fewer bytes total. - */ -static const UTF8 firstByteMark[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC }; - -/* --------------------------------------------------------------------- */ - -/* The interface converts a whole buffer to avoid function-call overhead. - * Constants have been gathered. Loops & conditionals have been removed as - * much as possible for efficiency, in favor of drop-through switches. - * (See "Note A" at the bottom of the file for equivalent code.) - * If your compiler supports it, the "isLegalUTF8" call can be turned - * into an inline function. - */ - -/* --------------------------------------------------------------------- */ - -ConversionResult ConvertUTF16toUTF8 ( - const UTF16** sourceStart, const UTF16* sourceEnd, - UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags) { - ConversionResult result = conversionOK; - const UTF16* source = *sourceStart; - UTF8* target = *targetStart; - while (source < sourceEnd) { - UTF32 ch; - unsigned short bytesToWrite = 0; - const UTF32 byteMask = 0xBF; - const UTF32 byteMark = 0x80; - const UTF16* oldSource = source; /* In case we have to back up because of target overflow. */ - ch = *source++; - /* If we have a surrogate pair, convert to UTF32 first. */ - if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) { - /* If the 16 bits following the high surrogate are in the source buffer... */ - if (source < sourceEnd) { - UTF32 ch2 = *source; - /* If it's a low surrogate, convert to UTF32. */ - if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) { - ch = ((ch - UNI_SUR_HIGH_START) << halfShift) - + (ch2 - UNI_SUR_LOW_START) + halfBase; - ++source; - } else if (flags == strictConversion) { /* it's an unpaired high surrogate */ - --source; /* return to the illegal value itself */ - result = sourceIllegal; - break; - } - } else { /* We don't have the 16 bits following the high surrogate. */ - --source; /* return to the high surrogate */ - result = sourceExhausted; - break; - } - } else if (flags == strictConversion) { - /* UTF-16 surrogate values are illegal in UTF-32 */ - if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) { - --source; /* return to the illegal value itself */ - result = sourceIllegal; - break; - } - } - /* Figure out how many bytes the result will require */ - if (ch < (UTF32)0x80) { bytesToWrite = 1; - } else if (ch < (UTF32)0x800) { bytesToWrite = 2; - } else if (ch < (UTF32)0x10000) { bytesToWrite = 3; - } else if (ch < (UTF32)0x110000) { bytesToWrite = 4; - } else { bytesToWrite = 3; - ch = UNI_REPLACEMENT_CHAR; - } - - target += bytesToWrite; - if (target > targetEnd) { - source = oldSource; /* Back up source pointer! */ - target -= bytesToWrite; result = targetExhausted; break; - } - switch (bytesToWrite) { /* note: everything falls through. */ - case 4: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6; - case 3: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6; - case 2: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6; - case 1: *--target = (UTF8)(ch | firstByteMark[bytesToWrite]); - } - target += bytesToWrite; - } - *sourceStart = source; - *targetStart = target; - return result; -} - -/* --------------------------------------------------------------------- */ - -/* - * Utility routine to tell whether a sequence of bytes is legal UTF-8. - * This must be called with the length pre-determined by the first byte. - * If not calling this from ConvertUTF8to*, then the length can be set by: - * length = trailingBytesForUTF8[*source]+1; - * and the sequence is illegal right away if there aren't that many bytes - * available. - * If presented with a length > 4, this returns false. The Unicode - * definition of UTF-8 goes up to 4-byte sequences. - */ - -static Boolean isLegalUTF8 (const UTF8 *source, int length) { - UTF8 a; - const UTF8 *srcptr = source+length; - switch (length) { - default: return false; - /* Everything else falls through when "true"... */ - case 4: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false; - case 3: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false; - case 2: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false; - - switch (*source) { - /* no fall-through in this inner switch */ - case 0xE0: if (a < 0xA0) return false; break; - case 0xED: if (a > 0x9F) return false; break; - case 0xF0: if (a < 0x90) return false; break; - case 0xF4: if (a > 0x8F) return false; break; - default: if (a < 0x80) return false; - } - - case 1: if (*source >= 0x80 && *source < 0xC2) return false; - } - if (*source > 0xF4) return false; - return true; -} - -/* --------------------------------------------------------------------- */ - -/* - * Exported function to return whether a UTF-8 sequence is legal or not. - * This is not used here; it's just exported. - */ - -Boolean isLegalUTF8Sequence (const UTF8 *source, const UTF8 *sourceEnd) { - int length; - if (source == sourceEnd) { - return true; - } - while (true) { - length = trailingBytesForUTF8[*source]+1; - if (source+length > sourceEnd) { - return false; - } - if (!isLegalUTF8 (source, length)) { - return false; - } - source += length; - if (source >= sourceEnd) { - return true; - } - } -} - -/** - * This is a variation of isLegalUTF8Sequence () that behaves like g_utf8_validate (). - * In addition to knowing if the sequence is legal, it also tells you the last good character. - */ -Boolean -tr_utf8_validate (const char * str, size_t max_len, const char ** end) -{ - const UTF8* source = (const UTF8*) str; - const UTF8* sourceEnd; - - if (max_len == 0) - return true; - - if (str == NULL) - return false; - - sourceEnd = source + (max_len == ((size_t)-1) ? strlen (str) : max_len); - - if (source == sourceEnd) - { - if (end != NULL) - *end = (const char*) source; - return true; - } - - for (;;) - { - const int length = trailingBytesForUTF8[*source] + 1; - if (source + length > sourceEnd) { - if (end != NULL) - *end = (const char*) source; - return false; - } - if (!isLegalUTF8 (source, length)) { - if (end != NULL) - *end = (const char*) source; - return false; - } - source += length; - if (source >= sourceEnd) { - if (end != NULL) - *end = (const char*) source; - return true; - } - } -} - - -/* --------------------------------------------------------------------- */ - -ConversionResult ConvertUTF8toUTF16 ( - const UTF8** sourceStart, const UTF8* sourceEnd, - UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags) { - ConversionResult result = conversionOK; - const UTF8* source = *sourceStart; - UTF16* target = *targetStart; - while (source < sourceEnd) { - UTF32 ch = 0; - unsigned short extraBytesToRead = trailingBytesForUTF8[*source]; - if (extraBytesToRead >= sourceEnd - source) { - result = sourceExhausted; break; - } - /* Do this check whether lenient or strict */ - if (! isLegalUTF8 (source, extraBytesToRead+1)) { - result = sourceIllegal; - break; - } - /* - * The cases all fall through. See "Note A" below. - */ - switch (extraBytesToRead) { - case 5: ch += *source++; ch <<= 6; /* remember, illegal UTF-8 */ - case 4: ch += *source++; ch <<= 6; /* remember, illegal UTF-8 */ - case 3: ch += *source++; ch <<= 6; - case 2: ch += *source++; ch <<= 6; - case 1: ch += *source++; ch <<= 6; - case 0: ch += *source++; - } - ch -= offsetsFromUTF8[extraBytesToRead]; - - if (target >= targetEnd) { - source -= (extraBytesToRead+1); /* Back up source pointer! */ - result = targetExhausted; break; - } - if (ch <= UNI_MAX_BMP) { /* Target is a character <= 0xFFFF */ - /* UTF-16 surrogate values are illegal in UTF-32 */ - if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) { - if (flags == strictConversion) { - source -= (extraBytesToRead+1); /* return to the illegal value itself */ - result = sourceIllegal; - break; - } else { - *target++ = UNI_REPLACEMENT_CHAR; - } - } else { - *target++ = (UTF16)ch; /* normal case */ - } - } else if (ch > UNI_MAX_UTF16) { - if (flags == strictConversion) { - result = sourceIllegal; - source -= (extraBytesToRead+1); /* return to the start */ - break; /* Bail out; shouldn't continue */ - } else { - *target++ = UNI_REPLACEMENT_CHAR; - } - } else { - /* target is a character in range 0xFFFF - 0x10FFFF. */ - if (target + 1 >= targetEnd) { - source -= (extraBytesToRead+1); /* Back up source pointer! */ - result = targetExhausted; break; - } - ch -= halfBase; - *target++ = (UTF16)((ch >> halfShift) + UNI_SUR_HIGH_START); - *target++ = (UTF16)((ch & halfMask) + UNI_SUR_LOW_START); - } - } - *sourceStart = source; - *targetStart = target; - return result; -} - -/* --------------------------------------------------------------------- */ - -ConversionResult ConvertUTF32toUTF8 ( - const UTF32** sourceStart, const UTF32* sourceEnd, - UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags) { - ConversionResult result = conversionOK; - const UTF32* source = *sourceStart; - UTF8* target = *targetStart; - while (source < sourceEnd) { - UTF32 ch; - unsigned short bytesToWrite = 0; - const UTF32 byteMask = 0xBF; - const UTF32 byteMark = 0x80; - ch = *source++; - if (flags == strictConversion) { - /* UTF-16 surrogate values are illegal in UTF-32 */ - if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) { - --source; /* return to the illegal value itself */ - result = sourceIllegal; - break; - } - } - /* - * Figure out how many bytes the result will require. Turn any - * illegally large UTF32 things (> Plane 17) into replacement chars. - */ - if (ch < (UTF32)0x80) { bytesToWrite = 1; - } else if (ch < (UTF32)0x800) { bytesToWrite = 2; - } else if (ch < (UTF32)0x10000) { bytesToWrite = 3; - } else if (ch <= UNI_MAX_LEGAL_UTF32) { bytesToWrite = 4; - } else { bytesToWrite = 3; - ch = UNI_REPLACEMENT_CHAR; - result = sourceIllegal; - } - - target += bytesToWrite; - if (target > targetEnd) { - --source; /* Back up source pointer! */ - target -= bytesToWrite; result = targetExhausted; break; - } - switch (bytesToWrite) { /* note: everything falls through. */ - case 4: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6; - case 3: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6; - case 2: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6; - case 1: *--target = (UTF8)(ch | firstByteMark[bytesToWrite]); - } - target += bytesToWrite; - } - *sourceStart = source; - *targetStart = target; - return result; -} - -/* --------------------------------------------------------------------- */ - -ConversionResult ConvertUTF8toUTF32 ( - const UTF8** sourceStart, const UTF8* sourceEnd, - UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags) { - ConversionResult result = conversionOK; - const UTF8* source = *sourceStart; - UTF32* target = *targetStart; - while (source < sourceEnd) { - UTF32 ch = 0; - unsigned short extraBytesToRead = trailingBytesForUTF8[*source]; - if (extraBytesToRead >= sourceEnd - source) { - result = sourceExhausted; break; - } - /* Do this check whether lenient or strict */ - if (! isLegalUTF8 (source, extraBytesToRead+1)) { - result = sourceIllegal; - break; - } - /* - * The cases all fall through. See "Note A" below. - */ - switch (extraBytesToRead) { - case 5: ch += *source++; ch <<= 6; - case 4: ch += *source++; ch <<= 6; - case 3: ch += *source++; ch <<= 6; - case 2: ch += *source++; ch <<= 6; - case 1: ch += *source++; ch <<= 6; - case 0: ch += *source++; - } - ch -= offsetsFromUTF8[extraBytesToRead]; - - if (target >= targetEnd) { - source -= (extraBytesToRead+1); /* Back up the source pointer! */ - result = targetExhausted; break; - } - if (ch <= UNI_MAX_LEGAL_UTF32) { - /* - * UTF-16 surrogate values are illegal in UTF-32, and anything - * over Plane 17 (> 0x10FFFF) is illegal. - */ - if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) { - if (flags == strictConversion) { - source -= (extraBytesToRead+1); /* return to the illegal value itself */ - result = sourceIllegal; - break; - } else { - *target++ = UNI_REPLACEMENT_CHAR; - } - } else { - *target++ = ch; - } - } else { /* i.e., ch > UNI_MAX_LEGAL_UTF32 */ - result = sourceIllegal; - *target++ = UNI_REPLACEMENT_CHAR; - } - } - *sourceStart = source; - *targetStart = target; - return result; -} - -/* --------------------------------------------------------------------- - - Note A. - The fall-through switches in UTF-8 reading code save a - temp variable, some decrements & conditionals. The switches - are equivalent to the following loop: - { - int tmpBytesToRead = extraBytesToRead+1; - do { - ch += *source++; - --tmpBytesToRead; - if (tmpBytesToRead) ch <<= 6; - } while (tmpBytesToRead > 0); - } - In UTF-8 writing code, the switches on "bytesToWrite" are - similarly unrolled loops. - - --------------------------------------------------------------------- */ diff --git a/libtransmission/ConvertUTF.h b/libtransmission/ConvertUTF.h deleted file mode 100644 index c73b72796..000000000 --- a/libtransmission/ConvertUTF.h +++ /dev/null @@ -1,160 +0,0 @@ -#pragma once - -#ifndef __TRANSMISSION__ - #error only libtransmission should #include this header. -#endif - -/* - * Copyright 2001-2004 Unicode, Inc. - * - * Disclaimer - * - * This source code is provided as is by Unicode, Inc. No claims are - * made as to fitness for any particular purpose. No warranties of any - * kind are expressed or implied. The recipient agrees to determine - * applicability of information provided. If this file has been - * purchased on magnetic or optical media from Unicode, Inc., the - * sole remedy for any claim will be exchange of defective media - * within 90 days of receipt. - * - * Limitations on Rights to Redistribute This Code - * - * Unicode, Inc. hereby grants the right to freely use the information - * supplied in this file in the creation of products supporting the - * Unicode Standard, and to make copies of this file in any form - * for internal or external distribution as long as this notice - * remains attached. - */ - -/* --------------------------------------------------------------------- - - Conversions between UTF32, UTF-16, and UTF-8. Header file. - - Several funtions are included here, forming a complete set of - conversions between the three formats. UTF-7 is not included - here, but is handled in a separate source file. - - Each of these routines takes pointers to input buffers and output - buffers. The input buffers are const. - - Each routine converts the text between *sourceStart and sourceEnd, - putting the result into the buffer between *targetStart and - targetEnd. Note: the end pointers are *after* the last item: e.g. - * (sourceEnd - 1) is the last item. - - The return result indicates whether the conversion was successful, - and if not, whether the problem was in the source or target buffers. - (Only the first encountered problem is indicated.) - - After the conversion, *sourceStart and *targetStart are both - updated to point to the end of last text successfully converted in - the respective buffers. - - Input parameters: - sourceStart - pointer to a pointer to the source buffer. - The contents of this are modified on return so that - it points at the next thing to be converted. - targetStart - similarly, pointer to pointer to the target buffer. - sourceEnd, targetEnd - respectively pointers to the ends of the - two buffers, for overflow checking only. - - These conversion functions take a ConversionFlags argument. When this - flag is set to strict, both irregular sequences and isolated surrogates - will cause an error. When the flag is set to lenient, both irregular - sequences and isolated surrogates are converted. - - Whether the flag is strict or lenient, all illegal sequences will cause - an error return. This includes sequences such as: , , - or in UTF-8, and values above 0x10FFFF in UTF-32. Conformant code - must check for illegal sequences. - - When the flag is set to lenient, characters over 0x10FFFF are converted - to the replacement character; otherwise (when the flag is set to strict) - they constitute an error. - - Output parameters: - The value "sourceIllegal" is returned from some routines if the input - sequence is malformed. When "sourceIllegal" is returned, the source - value will point to the illegal value that caused the problem. E.g., - in UTF-8 when a sequence is malformed, it points to the start of the - malformed sequence. - - Author: Mark E. Davis, 1994. - Rev History: Rick McGowan, fixes & updates May 2001. - Fixes & updates, Sept 2001. - ------------------------------------------------------------------------- */ - -/* --------------------------------------------------------------------- - The following 4 definitions are compiler-specific. - The C standard does not guarantee that wchar_t has at least - 16 bits, so wchar_t is no less portable than unsigned short! - All should be unsigned values to avoid sign extension during - bit mask & shift operations. ------------------------------------------------------------------------- */ - -typedef unsigned long UTF32; /* at least 32 bits */ -typedef unsigned short UTF16; /* at least 16 bits */ -typedef unsigned char UTF8; /* typically 8 bits */ -typedef unsigned char Boolean; /* 0 or 1 */ - -/* Some fundamental constants */ -#define UNI_REPLACEMENT_CHAR (UTF32)0x0000FFFD -#define UNI_MAX_BMP (UTF32)0x0000FFFF -#define UNI_MAX_UTF16 (UTF32)0x0010FFFF -#define UNI_MAX_UTF32 (UTF32)0x7FFFFFFF -#define UNI_MAX_LEGAL_UTF32 (UTF32)0x0010FFFF - -typedef enum { - conversionOK, /* conversion successful */ - sourceExhausted, /* partial character in source, but hit end */ - targetExhausted, /* insuff. room in target for conversion */ - sourceIllegal /* source sequence is illegal/malformed */ -} ConversionResult; - -typedef enum { - strictConversion = 0, - lenientConversion -} ConversionFlags; - -/* This is for C++ and does no harm in C */ -#ifdef __cplusplus -extern "C" { -#endif - -ConversionResult ConvertUTF8toUTF16 ( - const UTF8** sourceStart, const UTF8* sourceEnd, - UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags); - -ConversionResult ConvertUTF16toUTF8 ( - const UTF16** sourceStart, const UTF16* sourceEnd, - UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags); - -ConversionResult ConvertUTF8toUTF32 ( - const UTF8** sourceStart, const UTF8* sourceEnd, - UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags); - -ConversionResult ConvertUTF32toUTF8 ( - const UTF32** sourceStart, const UTF32* sourceEnd, - UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags); - -ConversionResult ConvertUTF16toUTF32 ( - const UTF16** sourceStart, const UTF16* sourceEnd, - UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags); - -ConversionResult ConvertUTF32toUTF16 ( - const UTF32** sourceStart, const UTF32* sourceEnd, - UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags); - -Boolean isLegalUTF8Sequence (const UTF8 *source, const UTF8 *sourceEnd); - - -/* intended to work the same as g_utf8_validate */ -Boolean tr_utf8_validate (const char * str, size_t max_len, const char ** end); - - -#ifdef __cplusplus -} -#endif - -/* --------------------------------------------------------------------- */ diff --git a/libtransmission/utils.cc b/libtransmission/utils.cc index e60abc530..9ca99b3b4 100644 --- a/libtransmission/utils.cc +++ b/libtransmission/utils.cc @@ -51,6 +51,7 @@ #include #endif +#include #include #include @@ -58,7 +59,6 @@ #include "error.h" #include "error-types.h" #include "file.h" -#include "ConvertUTF.h" #include "log.h" #include "mime-types.h" #include "net.h" @@ -756,29 +756,43 @@ void tr_removeElementFromArray(void* array, size_t index_to_remove, size_t sizeo **** ***/ -static char* strip_non_utf8(char const* in, size_t inlen) +bool tr_utf8_validate(std::string_view sv, char const** good_end) { - evbuffer* const buf = evbuffer_new(); + auto const* begin = std::data(sv); + auto const* const end = begin + std::size(sv); + auto const* walk = begin; - char const* end = nullptr; - while (!tr_utf8_validate(in, inlen, &end)) + try + { + while (walk < end) + { + utf8::next(walk, end); + } + } + catch (utf8::exception&) { - int const good_len = end - in; - - evbuffer_add(buf, in, good_len); - inlen -= (good_len + 1); - in += (good_len + 1); - evbuffer_add(buf, "?", 1); } - evbuffer_add(buf, in, inlen); - return evbuffer_free_to_str(buf, nullptr); + if (good_end != nullptr) + { + *good_end = walk; + } + + return walk == end; } -static char* to_utf8(char const* in, size_t inlen) +static char* strip_non_utf8(std::string_view sv) +{ + char* ret = tr_new(char, std::size(sv) + 1); + auto const it = utf8::replace_invalid(std::data(sv), std::data(sv) + std::size(sv), ret, '?'); + *it = '\0'; + return ret; +} + +static char* to_utf8(std::string_view sv) { #ifdef HAVE_ICONV - size_t const buflen = inlen * 4 + 10; + size_t const buflen = std::size(sv) * 4 + 10; char* out = tr_new(char, buflen); auto constexpr Encodings = std::array{ "CURRENT", "ISO-8859-15" }; @@ -791,12 +805,12 @@ static char* to_utf8(char const* in, size_t inlen) } #ifdef ICONV_SECOND_ARGUMENT_IS_CONST - auto const* inbuf = in; + auto const* inbuf = std::data(sv); #else - auto* inbuf = const_cast(in); + auto* inbuf = const_cast(std::data(sv)); #endif char* outbuf = out; - size_t inbytesleft = inlen; + size_t inbytesleft = std::size(sv); size_t outbytesleft = buflen; auto const rv = iconv(cd, &inbuf, &inbytesleft, &outbuf, &outbytesleft); iconv_close(cd); @@ -812,30 +826,24 @@ static char* to_utf8(char const* in, size_t inlen) #endif - return strip_non_utf8(in, inlen); + return strip_non_utf8(sv); } -char* tr_utf8clean(std::string_view str) +char* tr_utf8clean(std::string_view sv) { - char* const ret = tr_utf8_validate(std::data(str), std::size(str), nullptr) ? tr_strndup(std::data(str), std::size(str)) : - to_utf8(std::data(str), std::size(str)); - TR_ASSERT(tr_utf8_validate(ret, strlen(ret), nullptr)); + char* const ret = tr_utf8_validate(sv, nullptr) ? tr_strvDup(sv) : to_utf8(sv); + TR_ASSERT(tr_utf8_validate(ret, nullptr)); return ret; } -static bool tr_strvUtf8Validate(std::string_view sv) -{ - return tr_utf8_validate(std::data(sv), std::size(sv), nullptr); -} - std::string tr_strvUtf8Clean(std::string_view sv) { - if (tr_strvUtf8Validate(sv)) + if (tr_utf8_validate(sv, nullptr)) { return std::string{ sv }; } - auto* const tmp = to_utf8(std::data(sv), std::size(sv)); + auto* const tmp = to_utf8(sv); auto ret = std::string{ tmp ? tmp : "" }; tr_free(tmp); return ret; diff --git a/libtransmission/utils.h b/libtransmission/utils.h index 2c762d276..56532cec0 100644 --- a/libtransmission/utils.h +++ b/libtransmission/utils.h @@ -132,6 +132,8 @@ void tr_wait_msec(long int delay_milliseconds); */ char* tr_utf8clean(std::string_view str) TR_GNUC_MALLOC; +bool tr_utf8_validate(std::string_view sv, char const** endptr); + #ifdef _WIN32 char* tr_win32_native_to_utf8(wchar_t const* text, int text_size); diff --git a/libtransmission/variant-json.cc b/libtransmission/variant-json.cc index befd5b978..c711b1463 100644 --- a/libtransmission/variant-json.cc +++ b/libtransmission/variant-json.cc @@ -14,18 +14,19 @@ #include #include +#include #include /* evbuffer_add() */ #define LIBTRANSMISSION_VARIANT_MODULE #include "transmission.h" -#include "ConvertUTF.h" + #include "jsonsl.h" #include "log.h" #include "tr-assert.h" #include "utils.h" -#include "variant.h" #include "variant-common.h" +#include "variant.h" using namespace std::literals; @@ -224,19 +225,17 @@ static std::string_view extract_escaped_string(char const* in, size_t in_len, st if (decode_hex_string(in, &val)) { - UTF32 str32_buf[2] = { val, 0 }; - UTF32 const* str32_walk = str32_buf; - UTF32 const* str32_end = str32_buf + 1; - UTF8 str8_buf[8] = { 0, 0, 0, 0, 0, 0, 0, 0 }; - UTF8* str8_walk = str8_buf; - UTF8* str8_end = str8_buf + 8; - - if (ConvertUTF32toUTF8(&str32_walk, str32_end, &str8_walk, str8_end, {}) == 0) + try { - evbuffer_add(buf, str8_buf, str8_walk - str8_buf); - unescaped = true; + auto buf8 = std::array{}; + auto const it = utf8::append(val, std::data(buf8)); + evbuffer_add(buf, std::data(buf8), it - std::data(buf8)); } - + catch (utf8::exception&) + { // invalid codepoint + evbuffer_add(buf, "?", 1); + } + unescaped = true; in += 6; break; } @@ -580,20 +579,20 @@ static void jsonStringFunc(tr_variant const* val, void* vdata) } else { - auto const* const begin = reinterpret_cast(std::data(sv)); - auto const* tmp = begin; - auto const* end = tmp + std::size(sv); - UTF32 buf[1] = { 0 }; - UTF32* u32 = buf; - ConversionResult result = ConvertUTF8toUTF32(&tmp, end, &u32, buf + 1, {}); - - if ((result == conversionOK || result == targetExhausted) && tmp != begin) + try { - outwalk += tr_snprintf(outwalk, outend - outwalk, "\\u%04x", (unsigned int)buf[0]); - sv.remove_prefix(tmp - begin - 1); + auto* begin8 = std::data(sv); + auto* end8 = begin8 + std::size(sv); + auto* walk8 = begin8; + auto const uch32 = utf8::next(walk8, end8); + outwalk += tr_snprintf(outwalk, outend - outwalk, "\\u%04x", uch32); + sv.remove_prefix(walk8 - begin8 - 1); + } + catch (utf8::exception&) + { + *outwalk++ = '?'; } } - break; } } diff --git a/tests/libtransmission/utils-test.cc b/tests/libtransmission/utils-test.cc index f593c8e1a..8873f627e 100644 --- a/tests/libtransmission/utils-test.cc +++ b/tests/libtransmission/utils-test.cc @@ -16,7 +16,6 @@ #include "transmission.h" -#include "ConvertUTF.h" // tr_utf8_validate() #include "crypto-utils.h" // tr_rand_int_weak() #include "platform.h" #include "ptrarray.h" @@ -182,26 +181,26 @@ TEST_F(UtilsTest, trUtf8clean) in = "\x92\xE0\xE3\xA4\xAD\xAE \xA1\xEB\xE2\xEC \x81\xAE\xA3\xAE\xAC"sv; out = makeString(tr_utf8clean(in)); EXPECT_TRUE(std::size(out) == 17 || std::size(out) == 33); - EXPECT_TRUE(tr_utf8_validate(out.c_str(), out.size(), nullptr)); + EXPECT_TRUE(tr_utf8_validate(out, nullptr)); // same string, but utf-8 clean in = "Трудно быть Богом"sv; out = makeString(tr_utf8clean(in)); EXPECT_NE(nullptr, out.data()); - EXPECT_TRUE(tr_utf8_validate(out.c_str(), out.size(), nullptr)); + EXPECT_TRUE(tr_utf8_validate(out, nullptr)); EXPECT_EQ(in, out); in = "\xF4\x00\x81\x82"sv; out = makeString(tr_utf8clean(in)); EXPECT_NE(nullptr, out.data()); EXPECT_TRUE(out.size() == 1 || out.size() == 2); - EXPECT_TRUE(tr_utf8_validate(out.c_str(), out.size(), nullptr)); + EXPECT_TRUE(tr_utf8_validate(out, nullptr)); in = "\xF4\x33\x81\x82"sv; out = makeString(tr_utf8clean(in)); EXPECT_NE(nullptr, out.data()); EXPECT_TRUE(out.size() == 4 || out.size() == 7); - EXPECT_TRUE(tr_utf8_validate(out.c_str(), out.size(), nullptr)); + EXPECT_TRUE(tr_utf8_validate(out, nullptr)); } TEST_F(UtilsTest, trStrvUtf8Clean) @@ -218,13 +217,13 @@ TEST_F(UtilsTest, trStrvUtf8Clean) in = "\x92\xE0\xE3\xA4\xAD\xAE \xA1\xEB\xE2\xEC \x81\xAE\xA3\xAE\xAC"sv; out = tr_strvUtf8Clean(in); EXPECT_TRUE(std::size(out) == 17 || std::size(out) == 33); - EXPECT_TRUE(tr_utf8_validate(out.c_str(), out.size(), nullptr)); + EXPECT_TRUE(tr_utf8_validate(out, nullptr)); // same string, but utf-8 clean in = "Трудно быть Богом"sv; out = tr_strvUtf8Clean(in); EXPECT_NE(nullptr, out.data()); - EXPECT_TRUE(tr_utf8_validate(out.c_str(), out.size(), nullptr)); + EXPECT_TRUE(tr_utf8_validate(out, nullptr)); EXPECT_EQ(in, out); // https://trac.transmissionbt.com/ticket/6064 @@ -236,13 +235,13 @@ TEST_F(UtilsTest, trStrvUtf8Clean) out = tr_strvUtf8Clean(in); EXPECT_NE(nullptr, out.data()); EXPECT_TRUE(out.size() == 1 || out.size() == 2); - EXPECT_TRUE(tr_utf8_validate(out.c_str(), out.size(), nullptr)); + EXPECT_TRUE(tr_utf8_validate(out, nullptr)); in = "\xF4\x33\x81\x82"sv; out = tr_strvUtf8Clean(in); EXPECT_NE(nullptr, out.data()); EXPECT_TRUE(out.size() == 4 || out.size() == 7); - EXPECT_TRUE(tr_utf8_validate(out.c_str(), out.size(), nullptr)); + EXPECT_TRUE(tr_utf8_validate(out, nullptr)); } TEST_F(UtilsTest, trParseNumberRange) diff --git a/third-party/utfcpp b/third-party/utfcpp new file mode 160000 index 000000000..b85efd66a --- /dev/null +++ b/third-party/utfcpp @@ -0,0 +1 @@ +Subproject commit b85efd66a76caccbe0c186b00cab34df1e4281fa