+ fixes #0002126: Fails to load .FCStd file which contains many MultiByte-Char strings

This commit is contained in:
wmayer 2015-09-20 12:29:05 +02:00
parent bd105711d4
commit 1dad42c2f6
2 changed files with 30 additions and 62 deletions

View File

@ -57,6 +57,8 @@ using namespace std;
/**
 * Constructs the input stream wrapper around the given std::istream and
 * remembers the Xerces memory manager passed in.
 *
 * The converter state kept by this object is prepared with two conversion
 * options: IgnoreHeader (per the Qt documentation, a Unicode byte-order mark
 * is ignored) and ConvertInvalidToNull (invalid input characters are output
 * as null characters).
 */
StdInputStream::StdInputStream( std::istream& Stream, XERCES_CPP_NAMESPACE_QUALIFIER MemoryManager* const manager )
  : stream(Stream)
  , fMemoryManager(manager)
{
    // Add both conversion options to the state's flags in a single step.
    state.flags |= QTextCodec::ConversionFlags(QTextCodec::IgnoreHeader) | QTextCodec::ConvertInvalidToNull;
}
@ -84,37 +86,19 @@ unsigned int StdInputStream::readBytes( XMLByte* const toFill, const unsigned i
stream.read((char *)toFill,maxToRead);
XMLSize_t len = stream.gcount();
// See http://de.wikipedia.org/wiki/UTF-8#Kodierung
for (XMLSize_t i=0; i<len; i++) {
XMLByte& b = toFill[i];
int seqlen = 0;
if ((b & 0x80) == 0) {
seqlen = 1;
}
else if ((b & 0xE0) == 0xC0) {
seqlen = 2;
if (b == 0xC0 || b == 0xC1)
b = '?'; // these both values are not allowed
}
else if ((b & 0xF0) == 0xE0) {
seqlen = 3;
}
else if ((b & 0xF8) == 0xF0) {
seqlen = 4;
}
else {
b = '?';
}
for(int j = 1; j < seqlen; ++j) {
i++;
XMLByte& c = toFill[i];
// range of second, third or fourth byte
if ((c & 0xC0) != 0x80) {
b = '?';
c = '?';
}
QTextCodec *codec = QTextCodec::codecForName("UTF-8");
const QString text = codec->toUnicode((char *)toFill, len, &state);
if (state.invalidChars > 0) {
// If invalid characters were found, encode the text back to UTF-8 and replace
// them with '?'.
// Qt replaces invalid characters with '\0' (see ConvertInvalidToNull), but
// Xerces does not accept this because it treats '\0' as a terminator. Thus,
// we have to go through the array and replace each '\0' with '?'.
XMLSize_t pos = 0;
QByteArray ba = codec->fromUnicode(text);
for (int i=0; i<ba.length(); i++, pos++) {
if (pos < len && ba[i] == '\0')
toFill[i] = '?';
}
}
@ -136,37 +120,19 @@ XMLSize_t StdInputStream::readBytes( XMLByte* const toFill, const XMLSize_t max
stream.read((char *)toFill,maxToRead);
XMLSize_t len = stream.gcount();
// See http://de.wikipedia.org/wiki/UTF-8#Kodierung
for (XMLSize_t i=0; i<len; i++) {
XMLByte& b = toFill[i];
int seqlen = 0;
if ((b & 0x80) == 0) {
seqlen = 1;
}
else if ((b & 0xE0) == 0xC0) {
seqlen = 2;
if (b == 0xC0 || b == 0xC1)
b = '?'; // these both values are not allowed
}
else if ((b & 0xF0) == 0xE0) {
seqlen = 3;
}
else if ((b & 0xF8) == 0xF0) {
seqlen = 4;
}
else {
b = '?';
}
for(int j = 1; j < seqlen; ++j) {
i++;
XMLByte& c = toFill[i];
// range of second, third or fourth byte
if ((c & 0xC0) != 0x80) {
b = '?';
c = '?';
}
QTextCodec *codec = QTextCodec::codecForName("UTF-8");
const QString text = codec->toUnicode((char *)toFill, len, &state);
if (state.invalidChars > 0) {
// If invalid characters were found, encode the text back to UTF-8 and replace
// them with '?'.
// Qt replaces invalid characters with '\0' (see ConvertInvalidToNull), but
// Xerces does not accept this because it treats '\0' as a terminator. Thus,
// we have to go through the array and replace each '\0' with '?'.
XMLSize_t pos = 0;
QByteArray ba = codec->fromUnicode(text);
for (int i=0; i<ba.length(); i++, pos++) {
if (pos < len && ba[i] == '\0')
toFill[i] = '?';
}
}

View File

@ -31,6 +31,7 @@
#include <xercesc/util/PlatformUtils.hpp>
#include <xercesc/util/XercesVersion.hpp>
#include <xercesc/sax/InputSource.hpp>
#include <QTextCodec>
XERCES_CPP_NAMESPACE_BEGIN
@ -75,6 +76,7 @@ private :
// -----------------------------------------------------------------------
std::istream &stream;
XERCES_CPP_NAMESPACE_QUALIFIER MemoryManager* const fMemoryManager;
QTextCodec::ConverterState state;
};