// Protocol Buffers - Google's data interchange format
// Copyright 2008 Google Inc.  All rights reserved.
//
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file or at
// https://developers.google.com/open-source/licenses/bsd

// Internal JSON tokenization utilities; not public API.
#ifndef GOOGLE_PROTOBUF_JSON_INTERNAL_LEXER_H__
#define GOOGLE_PROTOBUF_JSON_INTERNAL_LEXER_H__

#include <array>
#include <cfloat>
#include <cmath>
#include <cstdint>
#include <iostream>
#include <limits>
#include <ostream>
#include <string>
#include <utility>

#include "absl/status/status.h"
#include "absl/status/statusor.h"
#include "absl/strings/match.h"
#include "absl/strings/str_format.h"
#include "absl/strings/string_view.h"
#include "google/protobuf/descriptor.h"
#include "google/protobuf/io/zero_copy_stream.h"
#include "google/protobuf/json/internal/message_path.h"
#include "google/protobuf/json/internal/zero_copy_buffered_stream.h"
#include "google/protobuf/stubs/status_macros.h"


// Must be included last.
#include "google/protobuf/port_def.inc"

namespace google {
namespace protobuf {
namespace json_internal {
// Parser configuration consumed by the JSON lexer.
//
// This mirrors JsonParseOptions from json_util.h. It is declared a second time
// here so that :json_lexer does not need to depend on :json_util.
struct ParseOptions {
  // Default value of `recursion_depth`.
  static constexpr size_t kDefaultDepth = 100;

  bool ignore_unknown_fields{false};
  bool case_insensitive_enum_parsing{false};

  // How many more times the parser may recurse before it gives up, so that
  // pathological input cannot nest without bound.
  int recursion_depth{static_cast<int>(kDefaultDepth)};

  // The original parser used by json_util2 accepted a number of non-standard
  // extensions; setting this flag turns them back on.
  //
  // The extensions are deliberately left undocumented beyond what the unit
  // tests exercise, because this setting is intended to be removed eventually.
  // See b/234868512.
  bool allow_legacy_syntax{false};
};

// Identifies a point in the JSON input so that errors can carry context.
struct JsonLocation {
  // Placeholder standing in for an absl type that has not been released yet.
  // It carries no data; current() simply returns an empty value.
  struct SourceLocation {
    static SourceLocation current() { return SourceLocation{}; }
  };

  // `line` and `col` are stored zero-indexed.
  size_t offset{0};
  size_t line{0};
  size_t col{0};
  // Optional message path attached to this location; null when absent.
  const MessagePath* path{nullptr};

  // Builds an absl::InvalidArgumentError that includes line/column
  // information.
  absl::Status Invalid(absl::string_view message,
                       SourceLocation sl = SourceLocation::current()) const;
};

// Bundles a value of type `T` together with a JsonLocation.
template <class T>
struct LocationWith {
  T value;           // The payload.
  JsonLocation loc;  // The location associated with `value`.
};

// Tokenizer for JSON text read from an io::ZeroCopyInputStream.
//
// The lexer keeps track of its current JsonLocation so that errors produced
// through Invalid() carry line/column information, and it owns a private copy
// of the ParseOptions whose `recursion_depth` is used as a nesting budget.
class JsonLexer {
 public:
  // A kind of token that PeekKind() can detect.
  enum Kind {
    kObj,
    kArr,
    kStr,
    kNum,
    kTrue,
    kFalse,
    kNull,
  };

  using SourceLocation = JsonLocation::SourceLocation;

  // Creates a lexer reading from `stream`. `options` is copied. `start` is the
  // initial location; its `path` field is overwritten with `path`.
  //
  // `path` may be null (the default), in which case path() must not be called,
  // because it dereferences the stored pointer unconditionally.
  JsonLexer(io::ZeroCopyInputStream* stream, const ParseOptions& options,
            MessagePath* path = nullptr, JsonLocation start = {})
      : stream_(stream), options_(options), json_loc_(start), path_(path) {
    json_loc_.path = path_;
  }

  // Returns this lexer's copy of the options. Its `recursion_depth` reflects
  // the budget remaining at the current nesting level (see Push()/Pop()).
  const ParseOptions& options() const { return options_; }

  // Returns the message path passed at construction. Requires that a non-null
  // `path` was supplied.
  const MessagePath& path() const { return *path_; }
  MessagePath& path() { return *path_; }

  // Creates an absl::InvalidArgumentError with line/column information taken
  // from the lexer's current location.
  absl::Status Invalid(absl::string_view message,
                       SourceLocation sl = SourceLocation::current()) {
    return json_loc_.Invalid(message, sl);
  }

  // Expects the next bytes to be parsed (after consuming whitespace) to be
  // exactly `literal`. If they are, consumes them; otherwise returns an error.
  absl::Status Expect(absl::string_view literal,
                      SourceLocation sl = SourceLocation::current()) {
    RETURN_IF_ERROR(SkipToToken());
    // The result is kept in a named local so that it stays alive while the
    // unread bytes are inspected below.
    auto buffering = stream_.BufferAtLeast(literal.size());
    RETURN_IF_ERROR(buffering.status());

    if (!absl::StartsWith(stream_.Unread(), literal)) {
      return Invalid(
          absl::StrFormat("unexpected character: '%c'; expected '%s'",
                          stream_.PeekChar(), literal),
          sl);
    }

    return Advance(literal.size());
  }

  // Like Expect(), but returns a boolean. This makes it clear that the
  // lookahead is fallible. On a match the literal is consumed; otherwise
  // nothing beyond leading whitespace is consumed.
  bool Peek(absl::string_view literal) {
    // Suppress the error; this can only fail on EOF in which case we would
    // return false regardless.
    (void)SkipToToken();
    // A buffering failure is deliberately not checked: the StartsWith() test
    // below is what decides the result.
    auto ignored = stream_.BufferAtLeast(literal.size());
    if (!absl::StartsWith(stream_.Unread(), literal)) {
      return false;
    }

    // We just ensured we had enough buffered so we can suppress this error.
    (void)Advance(literal.size());
    return true;
  }

  // Like Peek(string), but returns true if and only if a token of the given
  // kind can be lexed next. Returns false on EOF, just like Peek(string).
  bool Peek(Kind needle) {
    auto kind = PeekKind();
    return kind.ok() && *kind == needle;
  }

  // Consumes all whitespace and other ignored characters until the next
  // token.
  //
  // This function returns an error on EOF, so PeekChar() can be safely
  // called if it returns ok.
  absl::Status SkipToToken();

  // Returns which kind of value token (i.e., something that can occur after
  // a `:`) is next up to be parsed.
  absl::StatusOr<Kind> PeekKind();

  // Parses a JSON number.
  absl::StatusOr<LocationWith<double>> ParseNumber();

  // Parses a number as a string, without turning it into an integer.
  absl::StatusOr<LocationWith<MaybeOwnedString>> ParseRawNumber();

  // Parses a UTF-8 string. If the contents of the string happen to actually be
  // UTF-8, it will return a zero-copy view; otherwise it will allocate.
  absl::StatusOr<LocationWith<MaybeOwnedString>> ParseUtf8();

  // Walks over an array, calling `f` each time an element is reached.
  //
  // `f` should have type `() -> absl::Status`.
  template <typename F>
  absl::Status VisitArray(F f);

  // Walks over an object, calling `f` just after parsing each `:`.
  //
  // `f` is invoked with the parsed key, an lvalue of type
  // `LocationWith<MaybeOwnedString>`, and must return `absl::Status`.
  template <typename F>
  absl::Status VisitObject(F f);

  // Parses a single value and discards it.
  absl::Status SkipValue();

  // Forwards of functions from ZeroCopyBufferedStream.

  bool AtEof() {
    // Ignore whitespace for the purposes of finding the EOF. This will return
    // an error if we hit EOF, so we discard it.
    (void)SkipToToken();
    return stream_.AtEof();
  }

  // Takes `len` bytes from the stream, paired with the location at which the
  // taken bytes begin.
  absl::StatusOr<LocationWith<MaybeOwnedString>> Take(size_t len) {
    JsonLocation loc = json_loc_;
    auto taken = stream_.Take(len);
    RETURN_IF_ERROR(taken.status());
    return LocationWith<MaybeOwnedString>{*std::move(taken), loc};
  }

  // Forwards `p` to the stream's TakeWhile() and pairs the result with the
  // location at which the taken bytes begin.
  template <typename Pred>
  absl::StatusOr<LocationWith<MaybeOwnedString>> TakeWhile(Pred p) {
    JsonLocation loc = json_loc_;
    auto taken = stream_.TakeWhile(std::move(p));
    RETURN_IF_ERROR(taken.status());
    return LocationWith<MaybeOwnedString>{*std::move(taken), loc};
  }

  // Begins a mark on the underlying stream, paired with the current location.
  LocationWith<Mark> BeginMark() { return {stream_.BeginMark(), json_loc_}; }

 private:
  friend BufferingGuard;
  friend Mark;
  friend MaybeOwnedString;

  // Spends one level of the recursion budget, or fails if none is left.
  absl::Status Push() {
    if (options_.recursion_depth == 0) {
      return Invalid("JSON content was too deeply nested");
    }
    --options_.recursion_depth;
    return absl::OkStatus();
  }

  // Returns one level of the recursion budget previously spent by Push().
  void Pop() { ++options_.recursion_depth; }

  // Parses the next four bytes as a 16-bit hex numeral.
  absl::StatusOr<uint16_t> ParseU16HexCodepoint();

  // Parses a Unicode escape (\uXXXX); this may be a surrogate pair, so it may
  // consume the character that follows. Both are encoded as utf8 into
  // `out_utf8`; returns the number of bytes written.
  absl::StatusOr<size_t> ParseUnicodeEscape(char out_utf8[4]);

  // Parses an alphanumeric "identifier", for use with the non-standard
  // "unquoted keys" extension.
  absl::StatusOr<LocationWith<MaybeOwnedString>> ParseBareWord();

  // Advances the stream by `bytes` and moves the tracked offset and column
  // forward by the same amount. The line counter is not touched here.
  absl::Status Advance(size_t bytes) {
    RETURN_IF_ERROR(stream_.Advance(bytes));
    // `offset` and `col` are size_t, so add the byte count directly. Routing
    // it through `int` would corrupt the counters for counts above INT_MAX.
    json_loc_.offset += bytes;
    json_loc_.col += bytes;
    return absl::OkStatus();
  }

  ZeroCopyBufferedStream stream_;

  ParseOptions options_;
  JsonLocation json_loc_;
  MessagePath* path_;
};

// Walks a JSON array: consumes `[`, invokes `f` once per element, and consumes
// the closing `]`. One level of the recursion budget is held while inside the
// array and given back on successful completion.
//
// A trailing comma before `]` is only tolerated when
// `options_.allow_legacy_syntax` is set.
template <typename F>
absl::Status JsonLexer::VisitArray(F f) {
  RETURN_IF_ERROR(Expect("["));
  RETURN_IF_ERROR(Push());

  // Tracks whether the most recent element was followed by a comma.
  bool saw_comma = false;
  bool closed = Peek("]");
  while (!closed) {
    RETURN_IF_ERROR(f());
    saw_comma = Peek(",");
    closed = Peek("]");
    // Another element would have to follow, so a separator is mandatory.
    if (!closed && !saw_comma) {
      return Invalid("expected ','");
    }
  }

  const bool trailing_comma_ok = options_.allow_legacy_syntax;
  if (saw_comma && !trailing_comma_ok) {
    return Invalid("expected ']'");
  }

  Pop();
  return absl::OkStatus();
}

// Walks a JSON object: consumes `{`, then for every member parses the key and
// the `:` that follows it, and invokes `f` with the key.
//
// `f` is called as `f(key)` where `key` is an lvalue of type
// `LocationWith<MaybeOwnedString>`; it must return `absl::Status`.
//
// Keys must be quoted unless `options_.allow_legacy_syntax` is set, in which
// case a bare word is also accepted. The same flag controls whether a trailing
// comma before `}` is tolerated.
template <typename F>
absl::Status JsonLexer::VisitObject(F f) {
  RETURN_IF_ERROR(Expect("{"));
  RETURN_IF_ERROR(Push());

  // Tracks whether the most recent member was followed by a comma.
  bool saw_comma = false;
  bool closed = Peek("}");
  while (!closed) {
    RETURN_IF_ERROR(SkipToToken());

    const bool quoted =
        stream_.PeekChar() == '"' || stream_.PeekChar() == '\'';
    if (!quoted && !options_.allow_legacy_syntax) {
      return Invalid("expected '\"'");
    }
    absl::StatusOr<LocationWith<MaybeOwnedString>> key =
        quoted ? ParseUtf8() : ParseBareWord();

    RETURN_IF_ERROR(key.status());
    RETURN_IF_ERROR(Expect(":"));
    RETURN_IF_ERROR(f(*key));

    saw_comma = Peek(",");
    closed = Peek("}");
    // Another member would have to follow, so a separator is mandatory.
    if (!closed && !saw_comma) {
      return Invalid("expected ','");
    }
  }

  // The recursion budget is given back before the trailing-comma check.
  Pop();

  const bool trailing_comma_ok = options_.allow_legacy_syntax;
  if (saw_comma && !trailing_comma_ok) {
    return Invalid("expected '}'");
  }

  return absl::OkStatus();
}
}  // namespace json_internal
}  // namespace protobuf
}  // namespace google

#include "google/protobuf/port_undef.inc"
#endif  // GOOGLE_PROTOBUF_JSON_INTERNAL_LEXER_H__