/*****************************************************************************\
*                                                                             *
*  Name   : dos2unix                                                          *
*  Author : Chris Koeritz                                                     *
*                                                                             *
*  Purpose:                                                                   *
*                                                                             *
*    Takes text as input and replaces any line endings with the unix EOL.     *
*                                                                             *
*******************************************************************************
* Copyright (c) 2007-$now By Author.  This program is free software; you can  *
* redistribute it and/or modify it under the terms of the GNU General Public  *
* License as published by the Free Software Foundation; either version 2 of   *
* the License or (at your option) any later version.  This is online at:      *
*     http://www.fsf.org/copyleft/gpl.html                                    *
* Please send any updates to: fred@gruntose.com                               *
\*****************************************************************************/

#include <basis/istring.h>
#include <basis/set.cpp>
#include <opsystem/application_shell.h>
#include <opsystem/byte_filer.h>
#include <opsystem/command_line.h>
#include <opsystem/filename.h>
#include <data_struct/static_memory_gremlin.h>
#include <textual/parser_bits.h>

#include <stdio.h>

const int MAX_BUFFER = 4096;
  // the largest chunk of text we handle at once; this is the size limit
  // handed to the file reader each time read_chunk() asks for more input.

////////////////////////////////////////////////////////////////////////////

// the application object for the dos2unix tool.  it pulls text in from files
// (or standard input), strips carriage returns out of the line endings and
// sends the result to standard output.
class dos2unix_app : public application_shell
{
public:
  dos2unix_app()
  : application_shell(static_class_name())
  {}

  IMPLEMENT_CLASS_NAME("dos2unix");

  virtual int execute();
    // runs the program: handles the help flags, then converts each file
    // named on the command line, or standard input if no files were named.
    // the value returned is always zero.

  int print_instructions();
    // logs the usage information for the program.  always returns -3.

  void read_chunk(byte_filer &f, istring &buffer, istring &pushed_out);
    // pulls the next piece of text from "f" into "buffer".  carriage returns
    // found at the very end of what was read are not left in "buffer"; they
    // are appended to "pushed_out" instead.  if that leaves "buffer" empty,
    // the "pushed_out" contents are moved into "buffer".

  void chew_input(istring &to_chew);
    // converts "to_chew" in place so that its line endings are just line
    // feeds.  the string should not end with \r, since that could be the
    // first half of a sequence that continues in the next chunk; an error
    // is logged if it does.

  void write_chunk(istring &buffer, istring &pushed_out);
    // prints "buffer" on standard output (if it has any contents), then
    // loads "buffer" with the "pushed_out" text and empties "pushed_out".
};

////////////////////////////////////////////////////////////////////////////

int dos2unix_app::print_instructions()
{
  istring name = filename(__argv[0]).basename().raw();
  log(isprintf("%s usage:", name.s()));
  log("");
  log("\
This program consumes the DOS CRLF (Carriage Return / Line Feed) and replaces\n\
them with the Unix line ending (just Line Feed).  In hex, this means that\n\
characters with values 0d0a will be replaced with 0a.  As a convenience and\n\
to make this operation idempotent, single line feeds will still be replaced\n\
with single line feeds.  Malformed line feed sequences (such as 0d0d0a) will\n\
be replaced with a single line feed also.\n\
Any filenames on the command line are processed and sent to standard output.\n\
The following options are available:\n\
   --help or -?\tShow this help information.\n\
");
  return -3;
}

void dos2unix_app::chew_input(istring &to_chew)
{
  // converts the line endings in "to_chew", in place, to plain line feeds.
  //
  // strategy...
  //   collapse R*N into N
  //   collapse N into N
  // key: R = \r and N = \n.
  //
  // in practice this means every end-of-line character that is not a line
  // feed gets removed.  note that a carriage return with no line feed after
  // it is simply dropped; no line feed is inserted in its place.

  // an empty chunk has no last character to inspect and nothing to convert,
  // so bail out before indexing into it.
  if (!to_chew.length()) return;

  // a trailing \r might be the first half of a CRLF that continues in the
  // next chunk; the caller is supposed to hold those back.
  if (to_chew[to_chew.end()] == '\r') {
    log("error-- a string ending in \\r has been passed for consumption.");
  }

  // iterate backwards through the chunk of text we were given, so that
  // zapping a character never shifts the positions we have yet to visit.
  for (int i = to_chew.end(); i >= 0; i--) {
    const char current = to_chew[i];
    // line feeds are exactly what we want to keep.
    if (current == '\n') continue;
    // any other eol character has to be a CR, unless the definition of eol
    // changed.  remove it.
    if (parser_bits::is_eol(current)) to_chew.zap(i, i);
  }
}

void dos2unix_app::read_chunk(byte_filer &f, istring &buffer,
    istring &pushed_out)
{
  // asks "f" for the next chunk of input, passing MAX_BUFFER as the size
  // limit, and makes sure "buffer" does not end in a carriage return.
  // NOTE(review): whether byte_filer::read() replaces or appends to the
  // existing "buffer" contents is not visible from here -- confirm.
  f.read(buffer, MAX_BUFFER);

  // we do not allow our consume method to see a lonely carriage return;
  // we will make sure that's not how the buffer ends, if we can.  the length
  // check keeps us from peeking at the last character of an empty buffer,
  // which happens when nothing was read or when everything read was CRs.
  while (buffer.length() && (buffer[buffer.end()] == '\r')) {
    pushed_out += '\r';
    buffer.zap(buffer.end(), buffer.end());
  }

  if (!buffer.length()) {
    // the crazy thing was empty, or it was all CRs!
    // at this point, we'll just go with the backup buffer, which might
    // also be empty.  but there's not much to lose even if so.
    buffer = pushed_out;
    pushed_out.reset();
  }
}

void dos2unix_app::write_chunk(istring &buffer, istring &pushed_out)
{
  // send the converted text to standard output; an empty buffer is skipped.
  if (buffer.length() != 0) {
    fputs(buffer.s(), stdout);
  }

  // whatever was postponed now becomes the primary buffer, and the
  // postponed text is emptied out.
  buffer = pushed_out;
  pushed_out.reset();
}

int dos2unix_app::execute()
{
  command_line cmds(__argc, __argv);  // parse the command line up.

  // look for help commands.
  int junk_index = 0;
  if (cmds.find("help", junk_index, false)
      || cmds.find('h', junk_index, false)
      || cmds.find("?", junk_index, false)
      || cmds.find('?', junk_index, false) ) {
    print_instructions();
    return 0;
  }

  // gather extra input files.
  string_set input_files;
  for (int i = 0; i < cmds.entries(); i++) {
    const command_parameter &curr = cmds.get(i);
    if (curr.type() == command_parameter::VALUE) {
//log(istring("adding input file:") + curr.text());
      input_files += curr.text();
    }
  }

  istring accumulator;  // we will fill this up with data from the file.
  istring pushed;  // any stuff we decided to postpone will be dropped here.

  // iterate across the files and process each of them chunkwise.
  for (int q = 0; q < input_files.length(); q++) {
    byte_filer current(input_files[q], "rb");
    if (!current.good()) continue;
    while (!current.eof()) {
      read_chunk(current, accumulator, pushed);
      chew_input(accumulator);
      write_chunk(accumulator, pushed);
    }
  }

  // now get from standard input if there weren't any files specified.
  if (!input_files.length()) {
    byte_filer s_in(false, stdin);
    while (!s_in.eof()) {
      read_chunk(s_in, accumulator, pushed);
      chew_input(accumulator);
      write_chunk(accumulator, pushed);
    }
  }

  return 0;
}

////////////////////////////////////////////////////////////////////////////

// NOTE(review): presumably this macro expands to the program's entry point,
// which creates a dos2unix_app and runs it -- confirm against the definition
// of HOOPLE_MAIN.  the second macro argument is intentionally left empty.
HOOPLE_MAIN(dos2unix_app, )

#ifdef __BUILD_STATIC_APPLICATION__
  // static dependencies found by buildor_gen_deps.sh:
  #include <basis/array.cpp>
  #include <basis/byte_array.cpp>
  #include <basis/callstack_tracker.cpp>
  #include <basis/chaos.cpp>
  #include <basis/convert_utf.cpp>
  #include <basis/definitions.cpp>
  #include <basis/earth_time.cpp>
  #include <basis/guards.cpp>
  #include <basis/istring.cpp>
  #include <basis/log_base.cpp>
  #include <basis/memory_checker.cpp>
  #include <basis/mutex.cpp>
  #include <basis/object_base.cpp>
  #include <basis/outcome.cpp>
  #include <basis/packable.cpp>
  #include <basis/portable.cpp>
  #include <basis/sequence.cpp>
  #include <basis/set.cpp>
  #include <basis/utility.cpp>
  #include <basis/version_record.cpp>
  #include <data_struct/amorph.cpp>
  #include <data_struct/bit_vector.cpp>
  #include <data_struct/byte_hasher.cpp>
  #include <data_struct/configurator.cpp>
  #include <data_struct/hash_table.cpp>
  #include <data_struct/pointer_hash.cpp>
  #include <data_struct/stack.cpp>
  #include <data_struct/static_memory_gremlin.cpp>
  #include <data_struct/string_hash.cpp>
  #include <data_struct/string_hasher.cpp>
  #include <data_struct/string_table.cpp>
  #include <data_struct/symbol_table.cpp>
  #include <data_struct/table_configurator.cpp>
  #include <loggers/console_logger.cpp>
  #include <loggers/file_logger.cpp>
  #include <loggers/locked_logger.cpp>
  #include <loggers/null_logger.cpp>
  #include <loggers/program_wide_logger.cpp>
  #include <opsystem/application_base.cpp>
  #include <opsystem/application_shell.cpp>
  #include <opsystem/byte_filer.cpp>
  #include <opsystem/command_line.cpp>
  #include <opsystem/critical_events.cpp>
  #include <opsystem/directory.cpp>
  #include <opsystem/filename.cpp>
  #include <opsystem/ini_config.cpp>
  #include <opsystem/ini_parser.cpp>
  #include <opsystem/path_configuration.cpp>
  #include <opsystem/rendezvous.cpp>
  #include <textual/byte_format.cpp>
  #include <textual/parser_bits.cpp>
  #include <textual/string_manipulation.cpp>
  #include <textual/tokenizer.cpp>
#endif // __BUILD_STATIC_APPLICATION__

