/*
 *
 *  Copyright (C) 2011-2015, OFFIS e.V.
 *  All rights reserved.  See COPYRIGHT file for details.
 *
 *  This software and supporting documentation were developed by
 *
 *    OFFIS e.V.
 *    R&D Division Health
 *    Escherweg 2
 *    D-26121 Oldenburg, Germany
 *
 *
 *  Module:  dcmdata
 *
 *  Author:  Joerg Riesmeier
 *
 *  Purpose: Class for supporting the Specific Character Set attribute
 *
 */


#ifndef DCSPCHRS_H
#define DCSPCHRS_H

#include "dcmtk/config/osconfig.h"

#include "dcmtk/ofstd/ofchrenc.h"
#include "dcmtk/ofstd/ofmap.h"
#include "dcmtk/dcmdata/dcdefine.h"


// forward declaration
class DcmItem;


/** A class for managing and converting between different DICOM character sets.
 *  The conversion relies on the OFCharacterEncoding class, which again relies
 *  on the libiconv toolkit (if available).
 *  @note Please note that a current limitation is that only a single value is
 *    allowed for the destination character set (i.e. no code extensions).  Of
 *    course, for the source character set, also multiple values are supported.
 */
class DCMTK_DCMDATA_EXPORT DcmSpecificCharacterSet
{

  public:

    /** constructor. Initializes the member variables.
     */
    DcmSpecificCharacterSet();

    /** destructor
     */
    ~DcmSpecificCharacterSet();

    /** clear the internal state.  This also forgets about the currently
     *  selected character sets, so selectCharacterSet() has to be called again
     *  before a string can be converted with convertString().
     */
    void clear();

    /** get currently selected source DICOM character set(s).  Please note that
     *  the returned string can contain multiple values (defined terms separated
     *  by a backslash) if code extension techniques are used.  Furthermore,
     *  the returned string is always normalized, i.e. leading and trailing
     *  spaces have been removed.
     *  @return currently selected source DICOM character set(s) or an empty
     *    string if none is selected (identical to ASCII, which is the default)
     */
    const OFString &getSourceCharacterSet() const;

    /** get currently selected destination DICOM character set.  Please note
     *  that the returned string, which contains a defined term, is always
     *  normalized, i.e. leading and trailing spaces have been removed.
     *  @return currently selected destination DICOM character set or an empty
     *    string if none is selected (identical to ASCII, which is the default)
     */
    const OFString &getDestinationCharacterSet() const;

    /** get currently selected destination encoding, i.e. the name of the
     *  character set as used by libiconv for the conversion.  If code
     *  extension techniques are used to switch between different character
     *  encodings, the main/default encoding is returned.
     *  @return currently selected destination encoding or an empty string if
     *    none is selected
     */
    const OFString &getDestinationEncoding() const;

    /** get mode specifying whether a character that cannot be represented in
     *  the destination character encoding is approximated through one or more
     *  characters that look similar to the original one.  See
     *  selectCharacterSet().
     *  @return current value of the mode.  OFTrue means that the mode is
     *    enabled, OFFalse means disabled.
     */
    OFBool getTransliterationMode() const;

    /** get mode specifying whether characters that cannot be represented in
     *  the destination character encoding will be silently discarded
     *  @return current value of the mode.  OFTrue means that the mode is
     *    enabled, OFFalse means disabled.
     */
    OFBool getDiscardIllegalSequenceMode() const;

    /** select DICOM character sets for the input and output string, between
     *  which subsequent calls of convertString() convert.  The defined terms
     *  for a particular character set can be found in the DICOM standard, e.g.
     *  "ISO_IR 100" for ISO 8859-1 (Latin 1) or "ISO_IR 192" for Unicode in
     *  UTF-8.  An empty string denotes the default character repertoire, which
     *  is ASCII (7-bit).  If multiple values are given for 'fromCharset'
     *  (separated by a backslash) code extension techniques are used and
     *  escape sequences may be encountered in the source string to switch
     *  between the specified character sets.
     *  @param  fromCharset     name of the source character set(s) used for the
     *                          input string as given in the DICOM attribute
     *                          Specific Character Set (0008,0005).  Leading and
     *                          trailing spaces are removed automatically (if
     *                          present).
     *  @param  toCharset       name of the destination character set used for
     *                          the output string.  Only a single value is
     *                          permitted (no code extensions).  Leading and
     *                          trailing spaces are removed automatically (if
     *                          present).  The default value is "ISO_IR 192"
     *                          (Unicode in UTF-8).
     *  @param  transliterate   mode specifying whether a character that cannot
     *                          be represented in the destination character
     *                          encoding is approximated through one or more
     *                          characters that look similar to the original
     *                          one.  By default, this mode is disabled.
     *  @param  discardIllegal  mode specifying whether characters that cannot
     *                          be represented in the destination character
     *                          encoding will be silently discarded.  By
     *                          default, this mode is disabled.
     *  @return status, EC_Normal if successful, an error code otherwise
     */
    OFCondition selectCharacterSet(const OFString &fromCharset,
                                   const OFString &toCharset = "ISO_IR 192",
                                   const OFBool transliterate = OFFalse,
                                   const OFBool discardIllegal = OFFalse);

    /** select DICOM character sets for the input and output string, between
     *  which subsequent calls of convertString() convert.  The source
     *  character set is determined from the DICOM element Specific Character
     *  Set (0008,0005) stored in the given dataset/item.  The defined terms
     *  for the destination character set can be found in the DICOM standard,
     *  e.g. "ISO_IR 100" for ISO 8859-1 (Latin 1) or "ISO_IR 192" for Unicode
     *  in UTF-8.  An empty string denotes the default character repertoire,
     *  which is ASCII (7-bit).  If multiple values are found in the Specific
     *  Character Set element of the given 'dataset' (separated by a backslash)
     *  code extension techniques are used and escape sequences may be
     *  encountered in the source string to switch between the specified
     *  character sets.
     *  @param  dataset         DICOM dataset or item from which the source
     *                          character set should be retrieved.  If the data
     *                          element Specific Character Set (0008,0005) is
     *                          empty or missing, the default character set
     *                          (i.e. ASCII) is used.
     *  @param  toCharset       name of the destination character set used for
     *                          the output string.  Only a single value is
     *                          permitted (no code extensions).  Leading and
     *                          trailing spaces are removed automatically (if
     *                          present).  The default value is "ISO_IR 192"
     *                          (Unicode in UTF-8).
     *  @param  transliterate   mode specifying whether a character that cannot
     *                          be represented in the destination character
     *                          encoding is approximated through one or more
     *                          characters that look similar to the original
     *                          one.  By default, this mode is disabled.
     *  @param  discardIllegal  mode specifying whether characters that cannot
     *                          be represented in the destination character
     *                          encoding will be silently discarded.  By
     *                          default, this mode is disabled.
     *  @return status, EC_Normal if successful, an error code otherwise
     */
    OFCondition selectCharacterSet(DcmItem &dataset,
                                   const OFString &toCharset = "ISO_IR 192",
                                   const OFBool transliterate = OFFalse,
                                   const OFBool discardIllegal = OFFalse);

    /** convert the given string from the selected source character set(s) to
     *  the selected destination character set.  That means selectCharacterSet()
     *  has to be called prior to this method.
     *  @param  fromString  input string to be converted (using the currently
     *                      selected source character set)
     *  @param  toString    reference to variable where the converted string
     *                      (using the currently selected destination character
     *                      set) is stored
     *  @param  delimiters  optional string of characters that are regarded as
     *                      delimiters, i.e.\ when found the character set is
     *                      switched back to the default.  CR, LF and FF are
     *                      always regarded as delimiters (see DICOM PS 3.5).
     *  @return status, EC_Normal if successful, an error code otherwise
     */
    OFCondition convertString(const OFString &fromString,
                              OFString &toString,
                              const OFString &delimiters = "");

    /** convert the given string from the selected source character set(s) to
     *  the selected destination character set. That means selectCharacterSet()
     *  has to be called prior to this method.  Since the length of the input
     *  string has to be specified explicitly, the string can contain more than
     *  one NULL byte.
     *  @param  fromString  input string to be converted (using the currently
     *                      selected character set)
     *  @param  fromLength  length of the input string (number of bytes without
     *                      the trailing NULL byte)
     *  @param  toString    reference to variable where the converted string
     *                      (using the currently selected destination character
     *                      set) is stored
     *  @param  delimiters  optional string of characters that are regarded as
     *                      delimiters, i.e.\ when found the character set is
     *                      switched back to the default.  CR, LF and FF are
     *                      always regarded as delimiters (see DICOM PS 3.5).
     *  @return status, EC_Normal if successful, an error code otherwise
     */
    OFCondition convertString(const char *fromString,
                              const size_t fromLength,
                              OFString &toString,
                              const OFString &delimiters = "");

    // --- static helper functions ---

    /** check whether the underlying character set conversion library is
     *  available.  If the library is not available, no conversion between
     *  different character sets will be possible.
     *  @return OFTrue if the character set conversion library is available,
     *    OFFalse otherwise
     */
    static OFBool isConversionLibraryAvailable();

    /** count characters in given UTF-8 string and return the resulting number
     *  of so-called "code points".  Please note that invalid UTF-8 encodings
     *  are not handled properly.  ASCII strings (7-bit) are also supported,
     *  although OFString::length() is probably much faster.
     *  @param  utf8String  valid character string with UTF-8 encoding
     *  @return number of characters (code points) in given UTF-8 string
     */
    static size_t countCharactersInUTF8String(const OFString &utf8String);


  protected:

    /// type definition of a map storing the identifier (key) of a character
    /// set and the associated conversion descriptor
    typedef OFMap<OFString, OFCharacterEncoding::T_Descriptor> T_DescriptorMap;

    /** determine the destination character encoding (as used by libiconv) from
     *  the given DICOM defined term (specific character set), and set the
     *  member variables accordingly.
     *  @param  toCharset  name of the destination character set used for the
     *                     output string
     *  @return status, EC_Normal if successful, an error code otherwise
     */
    OFCondition determineDestinationEncoding(const OFString &toCharset);

    /** select a particular DICOM character set without code extensions for
     *  subsequent conversions.  The corresponding DICOM defined term for the
     *  source character set is determined from the member variable
     *  'SourceCharacterSet'.
     *  @return status, EC_Normal if successful, an error code otherwise
     */
    OFCondition selectCharacterSetWithoutCodeExtensions();

    /** select a particular DICOM character set with code extensions for
     *  subsequent conversions.  The corresponding DICOM defined terms for the
     *  source character set are determined from the member variable
     *  'SourceCharacterSet'.
     *  @param  sourceVM  value multiplicity of the member variable
     *                    'SourceCharacterSet'.  Usually, this value has
     *                    already been determined by the calling method.
     *  @return status, EC_Normal if successful, an error code otherwise
     */
    OFCondition selectCharacterSetWithCodeExtensions(const unsigned long sourceVM);

    /** close any currently open character set conversion descriptor(s).
     *  Afterwards, no conversion descriptor is selected, pretty much like
     *  after the initialization with the constructor.
     */
    void closeConversionDescriptors();

    /** check whether the given string contains at least one escape character
     *  (ESC), because it is used for code extension techniques like ISO 2022
     *  @param  strValue   input string to be checked for any escape character
     *  @param  strLength  length of the input string
     *  @return OFTrue if an escape character has been found, OFFalse otherwise
     */
    OFBool checkForEscapeCharacter(const char *strValue,
                                   const size_t strLength) const;

    /** convert given string to octal format, i.e.\ all non-ASCII and control
     *  characters are converted to their octal representation.  The total
     *  length of the string is always limited to a particular maximum (see
     *  implementation).  If the converted string would be longer, it is
     *  cropped and "..." is appended to indicate this cropping.
     *  @param  strValue   input string to be converted and possibly cropped
     *  @param  strLength  length of the input string
     *  @return resulting string in octal format
     */
    OFString convertToLengthLimitedOctalString(const char *strValue,
                                               const size_t strLength) const;


  private:

    // private undefined copy constructor
    DcmSpecificCharacterSet(const DcmSpecificCharacterSet &);

    // private undefined assignment operator
    DcmSpecificCharacterSet &operator=(const DcmSpecificCharacterSet &);

    /// selected source character set(s) based on one or more DICOM defined terms
    OFString SourceCharacterSet;

    /// selected destination character set based on a single DICOM defined term
    OFString DestinationCharacterSet;

    /// selected destination encoding based on names supported by the libiconv toolkit
    OFString DestinationEncoding;

    /// character encoding converter
    OFCharacterEncoding EncodingConverter;

    /// map of character set conversion descriptors
    /// (only used if multiple character sets are needed)
    T_DescriptorMap ConversionDescriptors;
};


#endif