/*
 *
 *  Copyright (C) 2011-2016, OFFIS e.V.
 *  All rights reserved.  See COPYRIGHT file for details.
 *
 *  This software and supporting documentation were developed by
 *
 *    OFFIS e.V.
 *    R&D Division Health
 *    Escherweg 2
 *    D-26121 Oldenburg, Germany
 *
 *
 *  Module:  ofstd
 *
 *  Author:  Joerg Riesmeier
 *
 *  Purpose: Class for character encoding conversion (Header)
 *
 */


#ifndef OFCHRENC_H
#define OFCHRENC_H

#include "dcmtk/config/osconfig.h"

#include "dcmtk/ofstd/ofcond.h"
#include "dcmtk/ofstd/ofstring.h"


/*---------------------*
 *  class declaration  *
 *---------------------*/

/** A class for managing and converting between different character encodings.
 *  The implementation relies on the libiconv toolkit (if available), see
 *  isLibraryAvailable().  In order to convert a string, selectEncoding() has
 *  to be called first, followed by one or more calls to convertString().
 *  Instances of this class cannot be copied, since both the copy constructor
 *  and the assignment operator are declared private and left undefined.
 *  If HAVE_WINDOWS_H is defined, there are also static functions for the
 *  conversion to and from the Windows-specific wide character encoding
 *  (UTF-16), which do not need a special character encoding library.
 */
class DCMTK_OFSTD_EXPORT OFCharacterEncoding
{

  // allow the DICOM-specific character set class to access protected methods
  friend class DcmSpecificCharacterSet;

  public:

    /** constructor. Initializes the member variables, which includes the
     *  current locale's character encoding.  By default, both the
     *  transliteration mode and the discard illegal sequence mode are
     *  disabled.
     */
    OFCharacterEncoding();

    /** destructor
     */
    ~OFCharacterEncoding();

    /** clear the internal state.  This also closes the conversion descriptor
     *  if it was allocated before, so selectEncoding() has to be called again
     *  before a string can be converted to a new character encoding.
     */
    void clear();

    /** get mode specifying whether a character that cannot be represented in
     *  the destination character encoding is approximated through one or more
     *  characters that look similar to the original one.
     *  See setTransliterationMode() for changing this mode.
     *  @return current value of the mode.  OFTrue means that the mode is
     *    enabled, OFFalse means disabled.
     */
    OFBool getTransliterationMode() const;

    /** get mode specifying whether characters that cannot be represented in
     *  the destination character encoding will be silently discarded.
     *  See setDiscardIllegalSequenceMode() for changing this mode.
     *  @return current value of the mode.  OFTrue means that the mode is
     *    enabled, OFFalse means disabled.
     */
    OFBool getDiscardIllegalSequenceMode() const;

    /** set mode specifying whether a character that cannot be represented in
     *  the destination character encoding is approximated through one or more
     *  characters that look similar to the original one.  By default, this
     *  mode is disabled.
     *  @param  mode  enable mode by OFTrue or disable it by OFFalse
     *  @return status, EC_Normal if successful, an error code otherwise
     */
    OFCondition setTransliterationMode(const OFBool mode);

    /** set mode specifying whether characters that cannot be represented in
     *  the destination character encoding will be silently discarded.  By
     *  default, this mode is disabled.
     *  @param  mode  enable mode by OFTrue or disable it by OFFalse
     *  @return status, EC_Normal if successful, an error code otherwise
     */
    OFCondition setDiscardIllegalSequenceMode(const OFBool mode);

    /** get the current locale's character encoding.  The value is determined
     *  in the constructor and can be refreshed by updateLocaleEncoding().
     *  @return the current locale's character encoding
     */
    const OFString &getLocaleEncoding() const;

    /** updates the current locale's character encoding.  This is only needed
     *  if the locale setting changed during the lifetime of this object,
     *  because the current locale's character encoding is always determined
     *  in the constructor.  If possible the canonical encoding names listed
     *  in "config.charset" (see libiconv toolkit) are used.
     *  @return status, EC_Normal if successful, an error code otherwise
     */
    OFCondition updateLocaleEncoding();

    /** select source and destination character encoding for subsequent
     *  conversion(s).  The encoding names can be found in the documentation
     *  of the libiconv toolkit.  Typical names are "ASCII", "ISO-8859-1" and
     *  "UTF-8".  An empty string denotes the locale dependent character
     *  encoding (see getLocaleEncoding()).
     *  @param  fromEncoding  name of the source character encoding
     *  @param  toEncoding    name of the destination character encoding
     *  @return status, EC_Normal if successful, an error code otherwise
     */
    OFCondition selectEncoding(const OFString &fromEncoding,
                               const OFString &toEncoding);

    /** convert the given string between the selected character encodings.
     *  That means selectEncoding() has to be called prior to this method.
     *  @param  fromString  input string to be converted (using the source
     *                      character encoding)
     *  @param  toString    reference to variable where the converted string
     *                      (using the destination character encoding) is
     *                      stored (or appended, see parameter 'clearMode')
     *  @param  clearMode   flag indicating whether to clear the variable
     *                      'toString' before appending the converted string
     *                      (default: OFTrue, i.e.\ the variable is cleared)
     *  @return status, EC_Normal if successful, an error code otherwise
     */
    OFCondition convertString(const OFString &fromString,
                              OFString &toString,
                              const OFBool clearMode = OFTrue);

    /** convert the given string between the selected character encodings.
     *  That means selectEncoding() has to be called prior to this method.
     *  Since the length of the input string has to be specified explicitly,
     *  the string can contain more than one NULL byte.
     *  @param  fromString  input string to be converted (using the source
     *                      character encoding).  A NULL pointer is regarded
     *                      as an empty string.
     *  @param  fromLength  length of the input string (number of bytes without
     *                      the trailing NULL byte)
     *  @param  toString    reference to variable where the converted string
     *                      (using the destination character encoding) is
     *                      stored (or appended, see parameter 'clearMode')
     *  @param  clearMode   flag indicating whether to clear the variable
     *                      'toString' before appending the converted string
     *                      (default: OFTrue, i.e.\ the variable is cleared)
     *  @return status, EC_Normal if successful, an error code otherwise
     */
    OFCondition convertString(const char *fromString,
                              const size_t fromLength,
                              OFString &toString,
                              const OFBool clearMode = OFTrue);

#ifdef HAVE_WINDOWS_H

    /** @name code page definitions.
     *  Short list of common code page identifiers used for the conversion to
     *  and from Windows-specific wide character encoding (UTF-16).
     *  For further code pages, please refer to the MSDN documentation on
     *  "Code Page Identifiers".
     */
    //@{

    /// system default Windows ANSI code page.  See Windows function GetACP().
    static const unsigned int CPC_ANSI;
    /// current system OEM code page.  See Windows function GetOEMCP().
    static const unsigned int CPC_OEM;
    /// code page for US-ASCII (7-bit)
    static const unsigned int CPC_ASCII;
    /// code page for ISO 8859-1 (Latin-1)
    static const unsigned int CPC_Latin1;
    /// code page for UTF-8
    static const unsigned int CPC_UTF8;

    //@}

    // --- static Windows-specific functions ---

    /** convert the given string between Windows-specific wide character
     *  encoding (UTF-16) and the specified code page.  In contrast to
     *  convertString(), no special character encoding library is needed,
     *  but on the other hand it only works on Windows systems.
     *  Please note that no conversion flags are specified for the internal
     *  call to the WideCharToMultiByte() function.
     *  Since the length of the input string has to be specified explicitly,
     *  the string can contain more than one NULL character.
     *  @param  fromString  input string to be converted (using the UTF-16
     *                      character encoding).  A NULL pointer is regarded
     *                      as an empty string.
     *  @param  fromLength  length of the input string (number of characters
     *                      without the trailing NULL character)
     *  @param  toString    reference to variable where the converted string
     *                      (using the character encoding specified by
     *                      'codePage') is stored (or appended, see parameter
     *                      'clearMode')
     *  @param  codePage    identifier of the code page to be used for the
     *                      conversion (default: UTF-8)
     *  @param  clearMode   flag indicating whether to clear the variable
     *                      'toString' before appending the converted string
     *                      (default: OFTrue, i.e.\ the variable is cleared)
     *  @return status, EC_Normal if successful, an error code otherwise
     */
    static OFCondition convertFromWideCharString(const wchar_t *fromString,
                                                 const size_t fromLength,
                                                 OFString &toString,
                                                 const unsigned int codePage = CPC_UTF8,
                                                 const OFBool clearMode = OFTrue);

    /** convert the given string between the specified code page and the
     *  Windows-specific wide character encoding (UTF-16).  In contrast to
     *  convertString(), no special character encoding library is needed, but
     *  on the other hand it only works on Windows systems.
     *  Please note that no conversion flags are specified for the internal
     *  call to the MultiByteToWideChar() function.
     *  @param  fromString  input string to be converted (using the character
     *                      encoding specified by 'codePage')
     *  @param  toString    reference to variable in which the pointer to the
     *                      converted string (using the UTF-16 character
     *                      encoding) is stored.  Might only be NULL if memory
     *                      is exhausted.  Please note that the buffer is
     *                      created with new[] and has to be deleted by the
     *                      caller (using delete[]).
     *  @param  toLength    number of converted characters, i.e.\ length of
     *                      'toString'
     *  @param  codePage    identifier of the code page to be used for the
     *                      conversion (default: UTF-8)
     *  @return status, EC_Normal if successful, an error code otherwise
     */
    static OFCondition convertToWideCharString(const OFString &fromString,
                                               wchar_t *&toString,
                                               size_t &toLength,
                                               const unsigned int codePage = CPC_UTF8);

    /** convert the given string between the specified code page and the
     *  Windows-specific wide character encoding (UTF-16).  In contrast to
     *  convertString(), no special character encoding library is needed, but
     *  on the other hand it only works on Windows systems.
     *  Please note that no conversion flags are specified for the internal
     *  call to the MultiByteToWideChar() function.
     *  Since the length of the input string has to be specified explicitly,
     *  the string can contain more than one NULL byte.
     *  @param  fromString  input string to be converted (using the character
     *                      encoding specified by 'codePage').  A NULL pointer
     *                      is regarded as an empty string.
     *  @param  fromLength  length of the input string (number of bytes
     *                      without the trailing NULL byte)
     *  @param  toString    reference to variable in which the pointer to the
     *                      converted string (using the UTF-16 character
     *                      encoding) is stored.  Might only be NULL if memory
     *                      is exhausted.  Please note that the buffer is
     *                      created with new[] and has to be deleted by the
     *                      caller (using delete[]).
     *  @param  toLength    number of converted characters, i.e.\ length of
     *                      'toString'
     *  @param  codePage    identifier of the code page to be used for the
     *                      conversion (default: UTF-8)
     *  @return status, EC_Normal if successful, an error code otherwise
     */
    static OFCondition convertToWideCharString(const char *fromString,
                                               const size_t fromLength,
                                               wchar_t *&toString,
                                               size_t &toLength,
                                               const unsigned int codePage = CPC_UTF8);

#endif  // HAVE_WINDOWS_H

    // --- static helper functions ---

    /** check whether the underlying character encoding library is available.
     *  If the library is not available, no conversion between different
     *  character encodings will be possible (apart from the Windows-specific
     *  wide character conversion functions).
     *  @return OFTrue if the character encoding library is available, OFFalse
     *    otherwise
     */
    static OFBool isLibraryAvailable();

    /** get version information of the underlying character encoding library.
     *  Typical output format: "LIBICONV, Version 1.14".  If the library is not
     *  available the output is: "<no character encoding library available>"
     *  @return name and version number of the character encoding library
     */
    static OFString getLibraryVersionString();

    /** count characters in given UTF-8 string and return the resulting number
     *  of so-called "code points".  Please note that invalid UTF-8 encodings
     *  are not handled properly.  ASCII strings (7-bit) are also supported,
     *  although OFString::length() is probably much faster.
     *  @param  utf8String  valid character string with UTF-8 encoding
     *  @return number of characters (code points) in given UTF-8 string
     */
    static size_t countCharactersInUTF8String(const OFString &utf8String);


  protected:

    /// type of the conversion descriptor (used by libiconv).  Declared as an
    /// untyped pointer, so this header does not refer to any libiconv type.
    typedef void* T_Descriptor;

    /** allocate conversion descriptor for the given source and destination
     *  character encoding.  Please make sure that the descriptor is
     *  deallocated with closeDescriptor() when not needed any longer.
     *  @param  descriptor    reference to variable where the newly allocated
     *                        conversion descriptor is stored
     *  @param  fromEncoding  name of the source character encoding
     *  @param  toEncoding    name of the destination character encoding
     *  @return status, EC_Normal if successful, an error code otherwise
     */
    OFCondition openDescriptor(T_Descriptor &descriptor,
                               const OFString &fromEncoding,
                               const OFString &toEncoding);

    /** deallocate the given conversion descriptor that was previously
     *  allocated with openDescriptor().  Please do not pass arbitrary values
     *  to this method, since this will result in a segmentation fault.
     *  @param  descriptor  conversion descriptor to be closed.  After the
     *                      descriptor has been deallocated, 'descriptor' is
     *                      set to an invalid value - see isDescriptorValid().
     *  @return status, EC_Normal if successful, an error code otherwise.  In
     *    case an invalid descriptor is passed, it is not regarded as an error.
     */
    OFCondition closeDescriptor(T_Descriptor &descriptor);

    /** check whether the given conversion descriptor is valid, i.e.\ has been
     *  allocated by a previous call to openDescriptor() and has not yet been
     *  invalidated by closeDescriptor()
     *  @param  descriptor  conversion descriptor to be checked
     *  @return OFTrue if the conversion descriptor is valid, OFFalse otherwise
     */
    OFBool isDescriptorValid(const T_Descriptor descriptor);

    /** convert the given string between the specified character encodings.
     *  In contrast to the public convertString() methods, the conversion
     *  descriptor to be used is passed explicitly as a parameter.
     *  Since the length of the input string has to be specified explicitly,
     *  the string can contain more than one NULL byte.
     *  @param  descriptor  previously allocated conversion descriptor to be
     *                      used for the conversion of the character encodings
     *                      (see openDescriptor())
     *  @param  fromString  input string to be converted (using the source
     *                      character encoding).  A NULL pointer is regarded
     *                      as an empty string.
     *  @param  fromLength  length of the input string (number of bytes without
     *                      the trailing NULL byte)
     *  @param  toString    reference to variable where the converted string
     *                      (using the destination character encoding) is
     *                      stored (or appended, see parameter 'clearMode')
     *  @param  clearMode   flag indicating whether to clear the variable
     *                      'toString' before appending the converted string
     *                      (default: OFTrue, i.e.\ the variable is cleared)
     *  @return status, EC_Normal if successful, an error code otherwise
     */
    OFCondition convertString(T_Descriptor descriptor,
                              const char *fromString,
                              const size_t fromLength,
                              OFString &toString,
                              const OFBool clearMode = OFTrue);


  private:

    // private undefined copy constructor (prevents copying of instances)
    OFCharacterEncoding(const OFCharacterEncoding &);

    // private undefined assignment operator (prevents assignment of instances)
    OFCharacterEncoding &operator=(const OFCharacterEncoding &);

    // --- static helper functions ---

    /** create an error condition based on the current value of "errno" and the
     *  given parameters.  The function OFStandard::strerror() is used to map
     *  the numerical value of the error to a textual description.
     *  @param  status   reference to variable where the condition is stored
     *  @param  message  message text that is used as a prefix to strerror()
     *  @param  code     unique status code of the error condition
     */
    static void createErrnoCondition(OFCondition &status,
                                     OFString message,
                                     const unsigned short code);

#ifdef HAVE_WINDOWS_H

    /** create an error condition based on the return value of "GetLastError()"
     *  and the given parameters.  The Windows function FormatMessage() is used
     *  to map the numerical value of the error to a textual description.
     *  @param  status   reference to variable where the condition is stored
     *  @param  message  message text that is used as a prefix to the error
     *  @param  code     unique status code of the error condition
     */
    static void createGetLastErrorCondition(OFCondition &status,
                                            OFString message,
                                            const unsigned short code);

#endif  // HAVE_WINDOWS_H

    /// current locale's character encoding (see getLocaleEncoding() and
    /// updateLocaleEncoding())
    OFString LocaleEncoding;

    /// conversion descriptor used by libiconv (see selectEncoding() and
    /// clear())
    T_Descriptor ConversionDescriptor;

    /// transliteration mode (default: disabled)
    OFBool TransliterationMode;

    /// discard illegal sequence mode (default: disabled)
    OFBool DiscardIllegalSequenceMode;
};


#endif