Skip to content

Commit 2569c6b

Browse files
authored
Implement a functional 'locale' character encoding (#252)
* Log the mbsrtowcs detected illegal byte sequence position
* Fix MbstowcsCharsetDecoder
1 parent 002bf04 commit 2569c6b

File tree

6 files changed

+254
-134
lines changed

6 files changed

+254
-134
lines changed

src/main/cpp/charsetdecoder.cpp

Lines changed: 66 additions & 67 deletions
Original file line number | Diff line number | Diff line change
@@ -14,11 +14,13 @@
1414
* See the License for the specific language governing permissions and
1515
* limitations under the License.
1616
*/
17+
#define NOMINMAX /* tell windows not to define min/max macros */
1718
#include <log4cxx/logstring.h>
1819
#include <log4cxx/helpers/charsetdecoder.h>
1920
#include <log4cxx/helpers/bytebuffer.h>
2021
#include <log4cxx/helpers/exception.h>
2122
#include <log4cxx/helpers/pool.h>
23+
#include <log4cxx/helpers/loglog.h>
2224
#include <apr_xlate.h>
2325
#if !defined(LOG4CXX)
2426
#define LOG4CXX 1
@@ -165,21 +167,14 @@ class MbstowcsCharsetDecoder : public CharsetDecoder
165167
{
166168
log4cxx_status_t stat = APR_SUCCESS;
167169
enum { BUFSIZE = 256 };
168-
wchar_t buf[BUFSIZE];
170+
wchar_t wbuf[BUFSIZE];
171+
char cbuf[BUFSIZE*4];
169172

170173
mbstate_t mbstate;
171174
memset(&mbstate, 0, sizeof(mbstate));
172175

173176
while (in.remaining() > 0)
174177
{
175-
size_t requested = in.remaining();
176-
177-
if (requested > BUFSIZE - 1)
178-
{
179-
requested = BUFSIZE - 1;
180-
}
181-
182-
memset(buf, 0, BUFSIZE * sizeof(wchar_t));
183178
const char* src = in.current();
184179

185180
if (*src == 0)
@@ -189,21 +184,31 @@ class MbstowcsCharsetDecoder : public CharsetDecoder
189184
}
190185
else
191186
{
192-
size_t converted = mbsrtowcs(buf,
187+
auto available = std::min(sizeof (cbuf) - 1, in.remaining());
188+
strncpy(cbuf, src, available);
189+
cbuf[available] = 0;
190+
src = cbuf;
191+
size_t wCharCount = mbsrtowcs(wbuf,
193192
&src,
194-
requested,
193+
BUFSIZE - 1,
195194
&mbstate);
195+
auto converted = src - cbuf;
196+
in.position(in.position() + converted);
196197

197-
if (converted == (size_t) -1)
198+
if (wCharCount == (size_t) -1) // Illegal byte sequence?
198199
{
199-
stat = APR_BADARG;
200-
in.position(src - in.data());
200+
LogString msg(LOG4CXX_STR("Illegal byte sequence at "));
201+
msg.append(std::to_wstring(in.position()));
202+
msg.append(LOG4CXX_STR(" of "));
203+
msg.append(std::to_wstring(in.limit()));
204+
LogLog::warn(msg);
205+
stat = APR_BADCH;
201206
break;
202207
}
203208
else
204209
{
205-
stat = append(out, buf);
206-
in.position(in.position() + requested);
210+
wbuf[wCharCount] = 0;
211+
stat = append(out, wbuf);
207212
}
208213
}
209214
}
@@ -418,73 +423,60 @@ class USASCIICharsetDecoder : public CharsetDecoder
418423
};
419424

420425
/**
421-
* Charset decoder that uses an embedded CharsetDecoder consistent
422-
* with current locale settings.
426+
* Charset decoder that uses current locale settings.
423427
*/
424428
class LocaleCharsetDecoder : public CharsetDecoder
425429
{
426430
public:
427-
LocaleCharsetDecoder() : pool(), decoder(), encoding()
428-
{
429-
}
430-
virtual ~LocaleCharsetDecoder()
431+
LocaleCharsetDecoder() : state()
431432
{
432433
}
433-
virtual log4cxx_status_t decode(ByteBuffer& in,
434-
LogString& out)
434+
log4cxx_status_t decode(ByteBuffer& in, LogString& out) override
435435
{
436+
log4cxx_status_t result = APR_SUCCESS;
436437
const char* p = in.current();
437438
size_t i = in.position();
439+
size_t remain = in.limit() - i;
438440
#if !LOG4CXX_CHARSET_EBCDIC
439-
440-
for (; i < in.limit() && ((unsigned int) *p) < 0x80; i++, p++)
441+
if (std::mbsinit(&this->state)) // ByteBuffer not partially decoded?
441442
{
442-
out.append(1, *p);
443+
// Copy single byte characters
444+
for (; 0 < remain && ((unsigned int) *p) < 0x80; --remain, ++i, p++)
445+
{
446+
out.append(1, *p);
447+
}
443448
}
444-
445-
in.position(i);
446449
#endif
447-
448-
if (i < in.limit())
450+
// Decode characters that may be represented by multiple bytes
451+
while (0 < remain)
449452
{
450-
Pool subpool;
451-
const char* enc = apr_os_locale_encoding(subpool.getAPRPool());
453+
wchar_t ch;
454+
size_t n = std::mbrtowc(&ch, p, remain, &this->state);
455+
if (0 == n) // NULL encountered?
452456
{
453-
std::unique_lock<std::mutex> lock(mutex);
454-
455-
if (enc == 0)
456-
{
457-
if (decoder == 0)
458-
{
459-
encoding = "C";
460-
decoder.reset( new USASCIICharsetDecoder() );
461-
}
462-
}
463-
else if (encoding != enc)
464-
{
465-
encoding = enc;
466-
467-
try
468-
{
469-
LOG4CXX_DECODE_CHAR(e, encoding);
470-
decoder = getDecoder(e);
471-
}
472-
catch (IllegalArgumentException&)
473-
{
474-
decoder.reset( new USASCIICharsetDecoder() );
475-
}
476-
}
457+
++i;
458+
break;
459+
}
460+
if (static_cast<std::size_t>(-1) == n) // decoding error?
461+
{
462+
result = APR_BADARG;
463+
break;
464+
}
465+
if (static_cast<std::size_t>(-2) == n) // incomplete sequence?
466+
{
467+
break;
477468
}
478-
return decoder->decode(in, out);
469+
Transcoder::encode(static_cast<unsigned int>(ch), out);
470+
remain -= n;
471+
i += n;
472+
p += n;
479473
}
480-
481-
return APR_SUCCESS;
474+
in.position(i);
475+
return result;
482476
}
477+
483478
private:
484-
Pool pool;
485-
std::mutex mutex;
486-
CharsetDecoderPtr decoder;
487-
std::string encoding;
479+
std::mbstate_t state;
488480
};
489481

490482

@@ -561,23 +553,30 @@ CharsetDecoderPtr CharsetDecoder::getISOLatinDecoder()
561553
CharsetDecoderPtr CharsetDecoder::getDecoder(const LogString& charset)
562554
{
563555
if (StringHelper::equalsIgnoreCase(charset, LOG4CXX_STR("UTF-8"), LOG4CXX_STR("utf-8")) ||
564-
StringHelper::equalsIgnoreCase(charset, LOG4CXX_STR("UTF8"), LOG4CXX_STR("utf8")))
556+
StringHelper::equalsIgnoreCase(charset, LOG4CXX_STR("UTF8"), LOG4CXX_STR("utf8")) ||
557+
StringHelper::equalsIgnoreCase(charset, LOG4CXX_STR("CP65001"), LOG4CXX_STR("cp65001")))
565558
{
566559
return std::make_shared<UTF8CharsetDecoder>();
567560
}
568561
else if (StringHelper::equalsIgnoreCase(charset, LOG4CXX_STR("C"), LOG4CXX_STR("c")) ||
569562
charset == LOG4CXX_STR("646") ||
570563
StringHelper::equalsIgnoreCase(charset, LOG4CXX_STR("US-ASCII"), LOG4CXX_STR("us-ascii")) ||
571564
StringHelper::equalsIgnoreCase(charset, LOG4CXX_STR("ISO646-US"), LOG4CXX_STR("iso646-US")) ||
572-
StringHelper::equalsIgnoreCase(charset, LOG4CXX_STR("ANSI_X3.4-1968"), LOG4CXX_STR("ansi_x3.4-1968")))
565+
StringHelper::equalsIgnoreCase(charset, LOG4CXX_STR("ANSI_X3.4-1968"), LOG4CXX_STR("ansi_x3.4-1968")) ||
566+
StringHelper::equalsIgnoreCase(charset, LOG4CXX_STR("CP20127"), LOG4CXX_STR("cp20127")))
573567
{
574568
return std::make_shared<USASCIICharsetDecoder>();
575569
}
576570
else if (StringHelper::equalsIgnoreCase(charset, LOG4CXX_STR("ISO-8859-1"), LOG4CXX_STR("iso-8859-1")) ||
577-
StringHelper::equalsIgnoreCase(charset, LOG4CXX_STR("ISO-LATIN-1"), LOG4CXX_STR("iso-latin-1")))
571+
StringHelper::equalsIgnoreCase(charset, LOG4CXX_STR("ISO-LATIN-1"), LOG4CXX_STR("iso-latin-1")) ||
572+
StringHelper::equalsIgnoreCase(charset, LOG4CXX_STR("CP1252"), LOG4CXX_STR("cp1252")))
578573
{
579574
return std::make_shared<ISOLatinCharsetDecoder>();
580575
}
576+
else if (StringHelper::equalsIgnoreCase(charset, LOG4CXX_STR("LOCALE"), LOG4CXX_STR("locale")))
577+
{
578+
return std::make_shared<LocaleCharsetDecoder>();
579+
}
581580

582581
#if APR_HAS_XLATE
583582
return std::make_shared<APRCharsetDecoder>(charset);

src/main/cpp/charsetencoder.cpp

Lines changed: 40 additions & 57 deletions
Original file line number | Diff line number | Diff line change
@@ -446,79 +446,54 @@ class UTF16LECharsetEncoder : public CharsetEncoder
446446
};
447447

448448
/**
449-
* Charset encoder that uses an embedded CharsetEncoder consistent
450-
* with current locale settings.
449+
* Charset encoder that uses current locale settings.
451450
*/
452451
class LocaleCharsetEncoder : public CharsetEncoder
453452
{
454453
public:
455-
LocaleCharsetEncoder() : pool(), encoder(), encoding()
454+
LocaleCharsetEncoder() : state()
456455
{
457456
}
458-
virtual ~LocaleCharsetEncoder()
459-
{
460-
}
461-
virtual log4cxx_status_t encode(const LogString& in,
462-
LogString::const_iterator& iter,
463-
ByteBuffer& out)
457+
log4cxx_status_t encode
458+
( const LogString& in
459+
, LogString::const_iterator& iter
460+
, ByteBuffer& out
461+
) override
464462
{
463+
log4cxx_status_t result = APR_SUCCESS;
465464
#if !LOG4CXX_CHARSET_EBCDIC
466465
char* current = out.current();
467466
size_t remain = out.remaining();
468-
469-
for (;
470-
iter != in.end() && ((unsigned int) *iter) < 0x80 && remain > 0;
471-
iter++, remain--, current++)
467+
if (std::mbsinit(&this->state)) // ByteBuffer not partially encoded?
472468
{
473-
*current = *iter;
469+
// Copy single byte characters
470+
for (;
471+
iter != in.end() && ((unsigned int) *iter) < 0x80 && 0 < remain;
472+
iter++, remain--, current++)
473+
{
474+
*current = *iter;
475+
}
474476
}
475-
476-
out.position(current - out.data());
477477
#endif
478-
479-
if (iter != in.end() && out.remaining() > 0)
478+
// Encode characters that may require multiple bytes
479+
while (iter != in.end() && MB_CUR_MAX <= remain)
480480
{
481-
Pool subpool;
482-
const char* enc = apr_os_locale_encoding(subpool.getAPRPool());
481+
auto ch = Transcoder::decode(in, iter);
482+
auto n = std::wcrtomb(current, ch, &this->state);
483+
if (static_cast<std::size_t>(-1) == n) // not a valid wide character?
483484
{
484-
std::unique_lock<std::mutex> lock(mutex);
485-
486-
if (enc == 0)
487-
{
488-
if (encoder == 0)
489-
{
490-
encoding = "C";
491-
encoder.reset( new USASCIICharsetEncoder() );
492-
}
493-
}
494-
else if (encoding != enc)
495-
{
496-
encoding = enc;
497-
LOG4CXX_DECODE_CHAR(ename, encoding);
498-
499-
try
500-
{
501-
encoder = CharsetEncoder::getEncoder(ename);
502-
}
503-
catch (IllegalArgumentException&)
504-
{
505-
encoder.reset( new USASCIICharsetEncoder() );
506-
}
507-
}
485+
result = APR_BADARG;
486+
break;
508487
}
509-
return encoder->encode(in, iter, out);
488+
remain -= n;
489+
current += n;
510490
}
511-
512-
return APR_SUCCESS;
491+
out.position(current - out.data());
492+
return result;
513493
}
514494

515495
private:
516-
LocaleCharsetEncoder(const LocaleCharsetEncoder&);
517-
LocaleCharsetEncoder& operator=(const LocaleCharsetEncoder&);
518-
Pool pool;
519-
std::mutex mutex;
520-
CharsetEncoderPtr encoder;
521-
std::string encoding;
496+
std::mbstate_t state;
522497
};
523498

524499

@@ -578,32 +553,40 @@ CharsetEncoderPtr CharsetEncoder::getUTF8Encoder()
578553

579554
CharsetEncoderPtr CharsetEncoder::getEncoder(const LogString& charset)
580555
{
581-
if (StringHelper::equalsIgnoreCase(charset, LOG4CXX_STR("UTF-8"), LOG4CXX_STR("utf-8")))
556+
if (StringHelper::equalsIgnoreCase(charset, LOG4CXX_STR("UTF-8"), LOG4CXX_STR("utf-8"))
557+
|| StringHelper::equalsIgnoreCase(charset, LOG4CXX_STR("CP65001"), LOG4CXX_STR("cp65001")))
582558
{
583559
return std::make_shared<UTF8CharsetEncoder>();
584560
}
585561
else if (StringHelper::equalsIgnoreCase(charset, LOG4CXX_STR("C"), LOG4CXX_STR("c")) ||
586562
charset == LOG4CXX_STR("646") ||
587563
StringHelper::equalsIgnoreCase(charset, LOG4CXX_STR("US-ASCII"), LOG4CXX_STR("us-ascii")) ||
588564
StringHelper::equalsIgnoreCase(charset, LOG4CXX_STR("ISO646-US"), LOG4CXX_STR("iso646-US")) ||
589-
StringHelper::equalsIgnoreCase(charset, LOG4CXX_STR("ANSI_X3.4-1968"), LOG4CXX_STR("ansi_x3.4-1968")))
565+
StringHelper::equalsIgnoreCase(charset, LOG4CXX_STR("ANSI_X3.4-1968"), LOG4CXX_STR("ansi_x3.4-1968")) ||
566+
StringHelper::equalsIgnoreCase(charset, LOG4CXX_STR("CP20127"), LOG4CXX_STR("cp20127")))
590567
{
591568
return std::make_shared<USASCIICharsetEncoder>();
592569
}
593570
else if (StringHelper::equalsIgnoreCase(charset, LOG4CXX_STR("ISO-8859-1"), LOG4CXX_STR("iso-8859-1")) ||
594-
StringHelper::equalsIgnoreCase(charset, LOG4CXX_STR("ISO-LATIN-1"), LOG4CXX_STR("iso-latin-1")))
571+
StringHelper::equalsIgnoreCase(charset, LOG4CXX_STR("ISO-LATIN-1"), LOG4CXX_STR("iso-latin-1")) ||
572+
StringHelper::equalsIgnoreCase(charset, LOG4CXX_STR("CP1252"), LOG4CXX_STR("cp1252")))
595573
{
596574
return std::make_shared<ISOLatinCharsetEncoder>();
597575
}
598576
else if (StringHelper::equalsIgnoreCase(charset, LOG4CXX_STR("UTF-16BE"), LOG4CXX_STR("utf-16be"))
599-
|| StringHelper::equalsIgnoreCase(charset, LOG4CXX_STR("UTF-16"), LOG4CXX_STR("utf-16")))
577+
|| StringHelper::equalsIgnoreCase(charset, LOG4CXX_STR("UTF-16"), LOG4CXX_STR("utf-16"))
578+
|| StringHelper::equalsIgnoreCase(charset, LOG4CXX_STR("CP1200"), LOG4CXX_STR("cp1200")))
600579
{
601580
return std::make_shared<UTF16BECharsetEncoder>();
602581
}
603582
else if (StringHelper::equalsIgnoreCase(charset, LOG4CXX_STR("UTF-16LE"), LOG4CXX_STR("utf-16le")))
604583
{
605584
return std::make_shared<UTF16LECharsetEncoder>();
606585
}
586+
else if (StringHelper::equalsIgnoreCase(charset, LOG4CXX_STR("LOCALE"), LOG4CXX_STR("locale")))
587+
{
588+
return std::make_shared<LocaleCharsetEncoder>();
589+
}
607590

608591
#if APR_HAS_XLATE
609592
return std::make_shared<APRCharsetEncoder>(charset);

src/main/cpp/exception.cpp

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -172,7 +172,7 @@ IOException& IOException::operator=(const IOException& src)
172172

173173
LogString IOException::formatMessage(log4cxx_status_t stat)
174174
{
175-
char err_buff[32];
175+
char err_buff[1024];
176176
LogString s(LOG4CXX_STR("IO Exception : status code = "));
177177
Pool p;
178178
StringHelper::toString(stat, p, s);

0 commit comments

Comments
 (0)