/*
 *
 *  Copyright (C) 2011-2016, OFFIS e.V.
 *  All rights reserved.  See COPYRIGHT file for details.
 *
 *  This software and supporting documentation were developed by
 *
 *    OFFIS e.V.
 *    R&D Division Health
 *    Escherweg 2
 *    D-26121 Oldenburg, Germany
 *
 *
 *  Module:  ofstd
 *
 *  Author:  Joerg Riesmeier
 *
 *  Purpose: Class for character encoding conversion (Header)
 *
 */


#ifndef OFCHRENC_H
#define OFCHRENC_H

#include "dcmtk/config/osconfig.h"

#include "dcmtk/ofstd/ofcond.h"
#include "dcmtk/ofstd/ofstring.h"


/*---------------------*
 *  class declaration  *
 *---------------------*/

/** A class for managing and converting between different character encodings.
 *  The implementation relies on the libiconv toolkit (if available).
 */
class DCMTK_OFSTD_EXPORT OFCharacterEncoding
{

    // allow the DICOM-specific character set class to access protected methods
    friend class DcmSpecificCharacterSet;

  public:

    /** constructor.  Initializes the member variables, which includes the
     *  current locale's character encoding.
     */
    OFCharacterEncoding();

    /** destructor
     */
    ~OFCharacterEncoding();

    /** clear the internal state.  This also closes the conversion descriptor
     *  if it was allocated before, so selectEncoding() has to be called again
     *  before a string can be converted to a new character encoding.
     */
    void clear();

    /** get mode specifying whether a character that cannot be represented in
     *  the destination character encoding is approximated through one or more
     *  characters that look similar to the original one
     *  @return current value of the mode.  OFTrue means that the mode is
     *    enabled, OFFalse means disabled.
     */
    OFBool getTransliterationMode() const;

    /** get mode specifying whether characters that cannot be represented in
     *  the destination character encoding will be silently discarded
     *  @return current value of the mode.  OFTrue means that the mode is
     *    enabled, OFFalse means disabled.
     */
    OFBool getDiscardIllegalSequenceMode() const;

    /** set mode specifying whether a character that cannot be represented in
     *  the destination character encoding is approximated through one or more
     *  characters that look similar to the original one.  By default, this
     *  mode is disabled.
     *  @param  mode  enable mode by OFTrue or disable it by OFFalse
     *  @return status, EC_Normal if successful, an error code otherwise
     */
    OFCondition setTransliterationMode(const OFBool mode);

    /** set mode specifying whether characters that cannot be represented in
     *  the destination character encoding will be silently discarded.  By
     *  default, this mode is disabled.
     *  @param  mode  enable mode by OFTrue or disable it by OFFalse
     *  @return status, EC_Normal if successful, an error code otherwise
     */
    OFCondition setDiscardIllegalSequenceMode(const OFBool mode);

    /** get the current locale's character encoding
     *  @return the current locale's character encoding
     */
    const OFString &getLocaleEncoding() const;

    /** updates the current locale's character encoding.  This is only needed
     *  if the locale setting changed during the lifetime of this object,
     *  because the current locale's character encoding is always determined
     *  in the constructor.  If possible the canonical encoding names listed
     *  in "config.charset" (see libiconv toolkit) are used.
     *  @return status, EC_Normal if successful, an error code otherwise
     */
    OFCondition updateLocaleEncoding();

    /** select source and destination character encoding for subsequent
     *  conversion(s).  The encoding names can be found in the documentation
     *  of the libiconv toolkit.  Typical names are "ASCII", "ISO-8859-1" and
     *  "UTF-8".  An empty string denotes the locale dependent character
     *  encoding (see getLocaleEncoding()).
     *  @param  fromEncoding  name of the source character encoding
     *  @param  toEncoding    name of the destination character encoding
     *  @return status, EC_Normal if successful, an error code otherwise
     */
    OFCondition selectEncoding(const OFString &fromEncoding,
                               const OFString &toEncoding);

    /** convert the given string between the selected character encodings.
     *  That means selectEncoding() has to be called prior to this method.
     *  @param  fromString  input string to be converted (using the source
     *                      character encoding)
     *  @param  toString    reference to variable where the converted string
     *                      (using the destination character encoding) is
     *                      stored (or appended, see parameter 'clearMode')
     *  @param  clearMode   flag indicating whether to clear the variable
     *                      'toString' before appending the converted string
     *  @return status, EC_Normal if successful, an error code otherwise
     */
    OFCondition convertString(const OFString &fromString,
                              OFString &toString,
                              const OFBool clearMode = OFTrue);

    /** convert the given string between the selected character encodings.
     *  That means selectEncoding() has to be called prior to this method.
     *  Since the length of the input string has to be specified explicitly,
     *  the string can contain more than one NULL byte.
     *  @param  fromString  input string to be converted (using the source
     *                      character encoding).  A NULL pointer is regarded
     *                      as an empty string.
     *  @param  fromLength  length of the input string (number of bytes without
     *                      the trailing NULL byte)
     *  @param  toString    reference to variable where the converted string
     *                      (using the destination character encoding) is
     *                      stored (or appended, see parameter 'clearMode')
     *  @param  clearMode   flag indicating whether to clear the variable
     *                      'toString' before appending the converted string
     *  @return status, EC_Normal if successful, an error code otherwise
     */
    OFCondition convertString(const char *fromString,
                              const size_t fromLength,
                              OFString &toString,
                              const OFBool clearMode = OFTrue);

#ifdef HAVE_WINDOWS_H

    /** @name code page definitions.
     *  Short list of common code page identifiers used for the conversion to
     *  and from Windows-specific wide character encoding (UTF-16).
     *  For further code pages, please refer to the MSDN documentation on
     *  "Code Page Identifiers".
     */
    //@{

    /// system default Windows ANSI code page.  See Windows function GetACP().
    static const unsigned int CPC_ANSI;
    /// current system OEM code page.  See Windows function GetOEMCP().
    static const unsigned int CPC_OEM;
    /// code page for US-ASCII (7-bit)
    static const unsigned int CPC_ASCII;
    /// code page for ISO 8859-1 (Latin-1)
    static const unsigned int CPC_Latin1;
    /// code page for UTF-8
    static const unsigned int CPC_UTF8;

    //@}

    // --- static Windows-specific functions ---

    /** convert the given string between Windows-specific wide character
     *  encoding (UTF-16) and the specified code page.  In contrast to
     *  convertString(), no special character encoding library is needed,
     *  but on the other hand it only works on Windows systems.
     *  Please note that no conversion flags are specified for the internal
     *  call to the WideCharToMultiByte() function.
     *  Since the length of the input string has to be specified explicitly,
     *  the string can contain more than one NULL character.
     *  @param  fromString  input string to be converted (using the UTF-16
     *                      character encoding).  A NULL pointer is regarded
     *                      as an empty string.
     *  @param  fromLength  length of the input string (number of characters
     *                      without the trailing NULL character)
     *  @param  toString    reference to variable where the converted string
     *                      (using the character encoding specified by
     *                      'codePage') is stored (or appended, see parameter
     *                      'clearMode')
     *  @param  codePage    identifier of the code page to be used for the
     *                      conversion (default: UTF-8)
     *  @param  clearMode   flag indicating whether to clear the variable
     *                      'toString' before appending the converted string
     *  @return status, EC_Normal if successful, an error code otherwise
     */
    static OFCondition convertFromWideCharString(const wchar_t *fromString,
                                                 const size_t fromLength,
                                                 OFString &toString,
                                                 const unsigned int codePage = CPC_UTF8,
                                                 const OFBool clearMode = OFTrue);

    /** convert the given string between the specified code page and the
     *  Windows-specific wide character encoding (UTF-16).  In contrast to
     *  convertString(), no special character encoding library is needed, but
     *  on the other hand it only works on Windows systems.
     *  Please note that no conversion flags are specified for the internal
     *  call to the MultiByteToWideChar() function.
     *  @param  fromString  input string to be converted (using character
     *                      encoding specified by 'codePage')
     *  @param  toString    reference to variable in which the pointer to the
     *                      converted string (using the UTF-16 character
     *                      encoding) is stored.  Might only be NULL if memory
     *                      is exhausted.  Please note that the buffer is
     *                      created with new[] and has to be deleted by the
     *                      caller.
     *  @param  toLength    number of converted characters, i.e.\ length of
     *                      'toString'
     *  @param  codePage    identifier of the code page to be used for the
     *                      conversion (default: UTF-8)
     *  @return status, EC_Normal if successful, an error code otherwise
     */
    static OFCondition convertToWideCharString(const OFString &fromString,
                                               wchar_t *&toString,
                                               size_t &toLength,
                                               const unsigned int codePage = CPC_UTF8);

    /** convert the given string between the specified code page and the
     *  Windows-specific wide character encoding (UTF-16).  In contrast to
     *  convertString(), no special character encoding library is needed, but
     *  on the other hand it only works on Windows systems.
     *  Please note that no conversion flags are specified for the internal
     *  call to the MultiByteToWideChar() function.
     *  Since the length of the input string has to be specified explicitly,
     *  the string can contain more than one NULL byte.
     *  @param  fromString  input string to be converted (using the character
     *                      encoding specified by 'codePage').  A NULL pointer
     *                      is regarded as an empty string.
     *  @param  fromLength  length of the input string (number of bytes
     *                      without the trailing NULL byte)
     *  @param  toString    reference to variable in which the pointer to the
     *                      converted string (using the UTF-16 character
     *                      encoding) is stored.  Might only be NULL if memory
     *                      is exhausted.  Please note that the buffer is
     *                      created with new[] and has to be deleted by the
     *                      caller.
     *  @param  toLength    number of converted characters, i.e.\ length of
     *                      'toString'
     *  @param  codePage    identifier of the code page to be used for the
     *                      conversion (default: UTF-8)
     *  @return status, EC_Normal if successful, an error code otherwise
     */
    static OFCondition convertToWideCharString(const char *fromString,
                                               const size_t fromLength,
                                               wchar_t *&toString,
                                               size_t &toLength,
                                               const unsigned int codePage = CPC_UTF8);

#endif  // HAVE_WINDOWS_H

    // --- static helper functions ---

    /** check whether the underlying character encoding library is available.
     *  If the library is not available, no conversion between different
     *  character encodings will be possible (apart from the Windows-specific
     *  wide character conversion functions).
     *  @return OFTrue if the character encoding library is available, OFFalse
     *    otherwise
     */
    static OFBool isLibraryAvailable();

    /** get version information of the underlying character encoding library.
     *  Typical output format: "LIBICONV, Version 1.14".  If the library is not
     *  available the output is: "<no character encoding library available>"
     *  @return name and version number of the character encoding library
     */
    static OFString getLibraryVersionString();

    /** count characters in given UTF-8 string and return the resulting number
     *  of so-called "code points".  Please note that invalid UTF-8 encodings
     *  are not handled properly.  ASCII strings (7-bit) are also supported,
     *  although OFString::length() is probably much faster.
     *  @param  utf8String  valid character string with UTF-8 encoding
     *  @return number of characters (code points) in given UTF-8 string
     */
    static size_t countCharactersInUTF8String(const OFString &utf8String);


  protected:

    /// type of the conversion descriptor (used by libiconv)
    typedef void* T_Descriptor;

    /** allocate conversion descriptor for the given source and destination
     *  character encoding.  Please make sure that the descriptor is
     *  deallocated with closeDescriptor() when not needed any longer.
     *  @param  descriptor    reference to variable where the newly allocated
     *                        conversion descriptor is stored
     *  @param  fromEncoding  name of the source character encoding
     *  @param  toEncoding    name of the destination character encoding
     *  @return status, EC_Normal if successful, an error code otherwise
     */
    OFCondition openDescriptor(T_Descriptor &descriptor,
                               const OFString &fromEncoding,
                               const OFString &toEncoding);

    /** deallocate the given conversion descriptor that was previously
     *  allocated with openDescriptor().  Please do not pass arbitrary values
     *  to this method, since this will result in a segmentation fault.
     *  @param  descriptor  conversion descriptor to be closed.  After the
     *                      descriptor has been deallocated, 'descriptor' is
     *                      set to an invalid value - see isDescriptorValid().
     *  @return status, EC_Normal if successful, an error code otherwise.  In
     *    case an invalid descriptor is passed, it is not regarded as an error.
     */
    OFCondition closeDescriptor(T_Descriptor &descriptor);

    /** check whether the given conversion descriptor is valid, i.e.\ has been
     *  allocated by a previous call to openDescriptor()
     *  @param  descriptor  conversion descriptor to be checked
     *  @return OFTrue if the conversion descriptor is valid, OFFalse otherwise
     */
    OFBool isDescriptorValid(const T_Descriptor descriptor);

    /** convert the given string between the specified character encodings.
     *  Since the length of the input string has to be specified explicitly,
     *  the string can contain more than one NULL byte.
     *  @param  descriptor  previously allocated conversion descriptor to be
     *                      used for the conversion of the character encodings
     *  @param  fromString  input string to be converted (using the source
     *                      character encoding).  A NULL pointer is regarded
     *                      as an empty string.
     *  @param  fromLength  length of the input string (number of bytes without
     *                      the trailing NULL byte)
     *  @param  toString    reference to variable where the converted string
     *                      (using the destination character encoding) is
     *                      stored (or appended, see parameter 'clearMode')
     *  @param  clearMode   flag indicating whether to clear the variable
     *                      'toString' before appending the converted string
     *  @return status, EC_Normal if successful, an error code otherwise
     */
    OFCondition convertString(T_Descriptor descriptor,
                              const char *fromString,
                              const size_t fromLength,
                              OFString &toString,
                              const OFBool clearMode = OFTrue);


  private:

    // private undefined copy constructor
    OFCharacterEncoding(const OFCharacterEncoding &);

    // private undefined assignment operator
    OFCharacterEncoding &operator=(const OFCharacterEncoding &);

    // --- static helper functions ---

    /** create an error condition based on the current value of "errno" and the
     *  given parameters.  The function OFStandard::strerror() is used to map
     *  the numerical value of the error to a textual description.
     *  @param  status   reference to variable where the condition is stored
     *  @param  message  message text that is used as a prefix to strerror()
     *  @param  code     unique status code of the error condition
     */
    static void createErrnoCondition(OFCondition &status,
                                     OFString message,
                                     const unsigned short code);

#ifdef HAVE_WINDOWS_H

    /** create an error condition based on the return value of "GetLastError()"
     *  and the given parameters.  The Windows function FormatMessage() is used
     *  to map the numerical value of the error to a textual description.
     *  @param  status   reference to variable where the condition is stored
     *  @param  message  message text that is used as a prefix to the error
     *  @param  code     unique status code of the error condition
     */
    static void createGetLastErrorCondition(OFCondition &status,
                                            OFString message,
                                            const unsigned short code);

#endif  // HAVE_WINDOWS_H

    /// current locale's character encoding
    OFString LocaleEncoding;

    /// conversion descriptor used by libiconv
    T_Descriptor ConversionDescriptor;

    /// transliteration mode (default: disabled)
    OFBool TransliterationMode;

    /// discard illegal sequence mode (default: disabled)
    OFBool DiscardIllegalSequenceMode;
};


#endif  // OFCHRENC_H