/*
 *  Name: dos2unix
 *  Documentation:
 *    Remove cr ('\x0d') characters from a file.
 *
 *  The dos2unix package is distributed under FreeBSD style license.
 *  See also http://www.freebsd.org/copyright/freebsd-license.html
 *  --------
 *
 *  Copyright (C) 2009-2016 Erwin Waterlander
 *  Copyright (C) 1998 Christian Wurll
 *  Copyright (C) 1998 Bernd Johannes Wuebben
 *  Copyright (C) 1994-1995 Benjamin Lin.
 *  All rights reserved.
 *
 *  Redistribution and use in source and binary forms, with or without
 *  modification, are permitted provided that the following conditions
 *  are met:
 *  1. Redistributions of source code must retain the above copyright
 *     notice, this list of conditions and the following disclaimer.
 *  2. Redistributions in binary form must reproduce the above copyright
 *     notice in the documentation and/or other materials provided with
 *     the distribution.
 *
 *  THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY
 *  EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 *  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 *  PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR BE LIABLE
 *  FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 *  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT
 *  OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
 *  BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
 *  WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
 *  OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN
 *  IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 *  == 1.0 == 1989.10.04 == John Birchfield (jb@koko.csustan.edu)
 *  == 1.1 == 1994.12.20 == Benjamin Lin (blin@socs.uts.edu.au)
 *     Cleaned up for Borland C/C++ 4.02
 *  == 1.2 == 1995.03.16 == Benjamin Lin (blin@socs.uts.edu.au)
 *     Modified to more conform to UNIX style.
 *  == 2.0 == 1995.03.19 == Benjamin Lin (blin@socs.uts.edu.au)
 *     Rewritten from scratch.
 *  == 2.1 == 1995.03.29 == Benjamin Lin (blin@socs.uts.edu.au)
 *     Conversion to SunOS charset implemented.
 *  == 2.2 == 1995.03.30 == Benjamin Lin (blin@socs.uts.edu.au)
 *     Fixed a bug in 2.1 where in new-file mode, if outfile already exists
 *     conversion can not be completed properly.
 *
 * Added Mac text file translation, i.e. \r to \n conversion
 * Bernd Johannes Wuebben, wuebben@kde.org
 * Wed Feb  4 19:12:58 EST 1998
 *
 * Added extra newline if ^M occurs
 * Christian Wurll, wurll@ira.uka.de
 * Thu Nov 19 1998
 *
 *  See ChangeLog.txt for complete version history.
 *
 */


/* #define DEBUG 1 */
#define __DOS2UNIX_C

#include "common.h"
#include "dos2unix.h"
# if (defined(_WIN32) && !defined(__CYGWIN__))
#include <windows.h>
#endif
#ifdef D2U_UNICODE
#if !defined(__MSDOS__) && !defined(_WIN32) && !defined(__OS2__)  /* Unix, Cygwin */
# include <langinfo.h>
#endif
#endif

/* Print the copyright banner to stdout, then hand over to PrintBSDLicense()
 * for the remainder of the license output.
 */
void PrintLicense(void)
{
  /* Value substituted for the %d in the first banner line (end of the year range). */
  const int last_copyright_year = 2016;

  D2U_ANSI_FPRINTF(stdout,_("\
Copyright (C) 2009-%d Erwin Waterlander\n\
Copyright (C) 1998      Christian Wurll (Version 3.1)\n\
Copyright (C) 1998      Bernd Johannes Wuebben (Version 3.0)\n\
Copyright (C) 1994-1995 Benjamin Lin\n\
All rights reserved.\n\n"), last_copyright_year);

  PrintBSDLicense();
}

#ifdef D2U_UNICODE
/* Wide-character handling of a CR (0x0d) met in dos2unix mode.
 *
 * The character following the CR is peeked (read, then pushed back):
 *   CR followed by LF      => the CR is dropped and *converted is incremented;
 *                             an extra LF is written when ipFlag->NewLine is set
 *   CR followed by non-LF  => Mac line break, the CR is written unchanged
 *   CR followed by EOF     => the CR is written unchanged
 *
 * Returns CurChar on success. On a read or write failure the matching
 * d2u_*_error() reporter is called and WEOF is returned.
 */
wint_t StripDelimiterW(FILE* ipInF, FILE* ipOutF, CFlag *ipFlag, wint_t CurChar, unsigned int *converted, const char *progname)
{
  /* Look one character ahead; the caller passes in the CR it just read. */
  wint_t peeked = d2u_getwc(ipInF, ipFlag->bomtype);

  if (peeked == WEOF) {
    /* Nothing could be read: either a real read error or end of input. */
    if (ferror(ipInF)) {
        d2u_getc_error(ipFlag,progname);
        return WEOF;
    }
    /* End of input: a trailing CR is kept as the last Mac line delimiter. */
    if ((CurChar == 0x0d) && (d2u_putwc(CurChar, ipOutF, ipFlag, progname) == WEOF)) {
        d2u_putwc_error(ipFlag,progname);
        return WEOF;
    }
    return CurChar;
  }

  /* Push the peeked character back so the caller's loop reads it next. */
  if (d2u_ungetwc(peeked, ipInF, ipFlag->bomtype) == WEOF) {
      d2u_getc_error(ipFlag,progname);
      return WEOF;
  }

  if (peeked == 0x0a) {
    /* DOS line break: skip the CR, the LF is emitted by the caller. */
    (*converted)++;
    if (ipFlag->NewLine && (d2u_putwc(0x0a, ipOutF, ipFlag, progname) == WEOF)) {
        d2u_putwc_error(ipFlag,progname);
        return WEOF;
    }
    return CurChar;
  }

  /* Lone CR (Mac line break): dos2unix mode leaves it untouched. */
  if (d2u_putwc(CurChar, ipOutF, ipFlag, progname) == WEOF) {
      d2u_putwc_error(ipFlag,progname);
      return WEOF;
  }
  return CurChar;
}
#endif

/* Handle a CR (0x0d) met in dos2unix mode, deciding by the character after it:
 *
 *   CUR        NEXT
 *   0xd(CR)    0xa(LF)  => put nothing, or an extra LF if option -l was used
 *   0xd(CR)  ! 0xa(LF)  => put CR (Mac line break is left untouched)
 *   0xd(CR)    EOF      => put CR
 *
 * The next character is only peeked: it is pushed back with ungetc() so the
 * caller reads it again. *converted is incremented for every CR-LF pair.
 *
 * Returns CurChar on success. On a read or write failure the matching
 * d2u_*_error() reporter is called and EOF is returned.
 */
int StripDelimiter(FILE* ipInF, FILE* ipOutF, CFlag *ipFlag, int CurChar, unsigned int *converted, const char *progname)
{
  /* Look one byte ahead; the caller passes in the CR it just read. */
  int peeked = fgetc(ipInF);

  if (peeked == EOF) {
    /* Nothing could be read: either a real read error or end of input. */
    if (ferror(ipInF)) {
        d2u_getc_error(ipFlag,progname);
        return EOF;
    }
    /* End of input: a trailing CR is kept as the last Mac line delimiter. */
    if ((CurChar == '\x0d') && (fputc( CurChar, ipOutF ) == EOF)) {
        d2u_putc_error(ipFlag,progname);
        return EOF;
    }
    return CurChar;
  }

  /* Push the peeked byte back so the caller's loop reads it next. */
  if (ungetc( peeked, ipInF ) == EOF) {
      d2u_getc_error(ipFlag,progname);
      return EOF;
  }

  if (peeked == '\x0a') {
    /* DOS line break: skip the CR, the LF is emitted by the caller. */
    (*converted)++;
    if (ipFlag->NewLine && (fputc('\x0a', ipOutF) == EOF)) {
        d2u_putc_error(ipFlag,progname);
        return EOF;
    }
    return CurChar;
  }

  /* Lone CR (Mac line break): dos2unix mode leaves it untouched. */
  if (fputc( CurChar, ipOutF ) == EOF) {
      d2u_putc_error(ipFlag,progname);
      return EOF;
  }
  return CurChar;
}

/* Wide-character conversion of stream ipInF to Unix format text, written to
 * stream ipOutF. Characters are read with d2u_getwc() using ipFlag->bomtype.
 *
 * Unless ipFlag->Force is set, a character below 32 other than LF, CR, TAB
 * or form feed stops the conversion and sets BINARY_FILE in ipFlag->status.
 * ipFlag->status is cleared on entry; ipFlag->line_nr is updated on exit when
 * UNICODE_CONVERSION_ERROR is present in ipFlag->status.
 *
 * RetVal: 0  if success
 *         -1  otherwise
 */
#ifdef D2U_UNICODE
int ConvertDosToUnixW(FILE* ipInF, FILE* ipOutF, CFlag *ipFlag, const char *progname)
{
    int result = 0;
    wint_t wc;
    wint_t wc_peek;
    unsigned int cur_line = 1;   /* 1-based number of the line being read */
    unsigned int converted = 0;  /* number of line breaks actually changed */

    ipFlag->status = 0;

    /* CR-LF -> LF */
    /* LF    -> LF, in case the input file is a Unix text file */
    /* CR    -> CR, in dos2unix mode (don't modify Mac file) */
    /* CR    -> LF, in Mac mode */
    /* 0x0a = Newline/Line Feed (LF) */
    /* 0x0d = Carriage Return (CR) */

    switch (ipFlag->FromToMode)
    {
      case FROMTO_DOS2UNIX: /* dos2unix */
        for (;;) {
          wc = d2u_getwc(ipInF, ipFlag->bomtype);
          if (wc == WEOF)
            break;

          /* Refuse control characters that do not belong in a text file. */
          if ((ipFlag->Force == 0) && (wc < 32) &&
              !((wc == 0x0a) || (wc == 0x0d) || (wc == 0x09) || (wc == 0x0c))) {
            result = -1;
            ipFlag->status |= BINARY_FILE ;
            if (ipFlag->verbose) {
              if ((ipFlag->stdio_mode) && (!ipFlag->error)) ipFlag->error = 1;
              D2U_UTF8_FPRINTF(stderr, "%s: ", progname);
              D2U_UTF8_FPRINTF(stderr, _("Binary symbol 0x00%02X found at line %u\n"),wc, cur_line);
            }
            break;
          }

          if (wc == 0x0d) {
            /* The helper decides whether this CR is dropped or kept. */
            if (StripDelimiterW( ipInF, ipOutF, ipFlag, wc, &converted, progname) == WEOF) {
              result = -1;
              break;
            }
            continue;
          }

          if (wc == 0x0a) /* Count all DOS and Unix line breaks */
            ++cur_line;
          if (d2u_putwc(wc, ipOutF, ipFlag, progname) == WEOF) {
            result = -1;
            d2u_putwc_error(ipFlag,progname);
            break;
          }
        }
        /* Tell a read error apart from a regular end of input. */
        if ((wc == WEOF) && ferror(ipInF)) {
          result = -1;
          d2u_getc_error(ipFlag,progname);
        }
        break;

      case FROMTO_MAC2UNIX: /* mac2unix */
        for (;;) {
          wc = d2u_getwc(ipInF, ipFlag->bomtype);
          if (wc == WEOF)
            break;

          /* Refuse control characters that do not belong in a text file. */
          if ((ipFlag->Force == 0) && (wc < 32) &&
              !((wc == 0x0a) || (wc == 0x0d) || (wc == 0x09) || (wc == 0x0c))) {
            result = -1;
            ipFlag->status |= BINARY_FILE ;
            if (ipFlag->verbose) {
              if ((ipFlag->stdio_mode) && (!ipFlag->error)) ipFlag->error = 1;
              D2U_UTF8_FPRINTF(stderr, "%s: ", progname);
              D2U_UTF8_FPRINTF(stderr, _("Binary symbol 0x00%02X found at line %u\n"), wc, cur_line);
            }
            break;
          }

          if (wc != 0x0d) {
            if (wc == 0x0a) /* Count all DOS and Unix line breaks */
              ++cur_line;
            if (d2u_putwc(wc, ipOutF, ipFlag, progname) == WEOF) {
              result = -1;
              d2u_putwc_error(ipFlag,progname);
              break;
            }
            continue;
          }

          /* wc is a CR: peek at what follows it. */
          wc_peek = d2u_getwc(ipInF, ipFlag->bomtype);
          if (wc_peek != WEOF) {
            if (d2u_ungetwc( wc_peek, ipInF, ipFlag->bomtype) == WEOF) {  /* put back peek char */
              d2u_getc_error(ipFlag,progname);
              result = -1;
              break;
            }
            /* Don't touch this delimiter if it's a CR,LF pair. */
            if (wc_peek == 0x0a) {
              if (d2u_putwc(0x0d, ipOutF, ipFlag, progname) == WEOF) { /* put CR, part of DOS CR-LF */
                d2u_putwc_error(ipFlag,progname);
                result = -1;
                break;
              }
              continue;
            }
          }

          /* MAC line end (CR). Put LF */
          if (d2u_putwc(0x0a, ipOutF, ipFlag, progname) == WEOF) {
            result = -1;
            d2u_putwc_error(ipFlag,progname);
            break;
          }
          converted++;
          cur_line++; /* Count all Mac line breaks */
          if (ipFlag->NewLine) {  /* add additional LF? */
            if (d2u_putwc(0x0a, ipOutF, ipFlag, progname) == WEOF) {
              result = -1;
              d2u_putwc_error(ipFlag,progname);
              break;
            }
          }
        }
        /* Tell a read error apart from a regular end of input. */
        if ((wc == WEOF) && ferror(ipInF)) {
          result = -1;
          d2u_getc_error(ipFlag,progname);
        }
        break;

      default: /* unknown FromToMode */
      ;
#if DEBUG
      D2U_UTF8_FPRINTF(stderr, "%s: ", progname);
      D2U_UTF8_FPRINTF(stderr, _("program error, invalid conversion mode %d\n"),ipFlag->FromToMode);
      exit(1);
#endif
    }

    /* Hand the reached line number back when a Unicode conversion error was flagged. */
    if (ipFlag->status & UNICODE_CONVERSION_ERROR)
        ipFlag->line_nr = cur_line;

    if ((result == 0) && (ipFlag->verbose > 1)) {
      D2U_UTF8_FPRINTF(stderr, "%s: ", progname);
      D2U_UTF8_FPRINTF(stderr, _("Converted %u out of %u line breaks.\n"), converted, cur_line -1);
    }
    return result;
}
#endif

/* Byte-oriented conversion of stream ipInF to Unix format text, written to
 * stream ipOutF.
 *
 * Every byte that is not a CR is written through a translation table picked
 * from ipFlag->ConvMode; the plain ASCII table is forced when ipFlag->bomtype
 * is greater than 0. An unknown ConvMode sets WRONG_CODEPAGE in
 * ipFlag->status and fails immediately.
 *
 * Unless ipFlag->Force is set, a byte below 32 other than LF, CR, TAB or
 * form feed stops the conversion and sets BINARY_FILE in ipFlag->status.
 *
 * RetVal: 0  if success
 *         -1  otherwise
 */
int ConvertDosToUnix(FILE* ipInF, FILE* ipOutF, CFlag *ipFlag, const char *progname)
{
    int result = 0;
    int ch;
    int ch_peek;
    int *table;
    unsigned int cur_line = 1;   /* 1-based number of the line being read */
    unsigned int converted = 0;  /* number of line breaks actually changed */

    ipFlag->status = 0;

    /* Pick the character translation table for the requested mode. */
    switch (ipFlag->ConvMode) {
      case CONVMODE_ASCII: /* ascii */
      case CONVMODE_UTF16LE: /* Assume UTF-16LE, bomtype = FILE_UTF8 or GB18030 */
      case CONVMODE_UTF16BE: /* Assume UTF-16BE, bomtype = FILE_UTF8 or GB18030 */
        table = D2UAsciiTable;
        break;
      case CONVMODE_7BIT: /* 7bit */
        table = D2U7BitTable;
        break;
      case CONVMODE_437: /* iso */
        table = D2UIso437Table;
        break;
      case CONVMODE_850: /* iso */
        table = D2UIso850Table;
        break;
      case CONVMODE_860: /* iso */
        table = D2UIso860Table;
        break;
      case CONVMODE_863: /* iso */
        table = D2UIso863Table;
        break;
      case CONVMODE_865: /* iso */
        table = D2UIso865Table;
        break;
      case CONVMODE_1252: /* iso */
        table = D2UIso1252Table;
        break;
      default: /* unknown convmode */
        ipFlag->status |= WRONG_CODEPAGE ;
        return(-1);
    }

    /* Turn off ISO and 7-bit conversion for Unicode text files */
    if (ipFlag->bomtype > 0)
      table = D2UAsciiTable;

    /* Report the code page for every mode that is not ascii or 7bit. */
    if ((ipFlag->ConvMode > CONVMODE_7BIT) && (ipFlag->verbose)) {
       D2U_UTF8_FPRINTF(stderr, "%s: ", progname);
       D2U_UTF8_FPRINTF(stderr, _("using code page %d.\n"), ipFlag->ConvMode);
    }

    /* CR-LF -> LF */
    /* LF    -> LF, in case the input file is a Unix text file */
    /* CR    -> CR, in dos2unix mode (don't modify Mac file) */
    /* CR    -> LF, in Mac mode */
    /* \x0a = Newline/Line Feed (LF) */
    /* \x0d = Carriage Return (CR) */

    switch (ipFlag->FromToMode) {
      case FROMTO_DOS2UNIX: /* dos2unix */
        for (;;) {
          ch = fgetc(ipInF);
          if (ch == EOF)
            break;

          /* Refuse control characters that do not belong in a text file. */
          if ((ipFlag->Force == 0) && (ch < 32) &&
              !((ch == '\x0a') || (ch == '\x0d') || (ch == '\x09') || (ch == '\x0c'))) {
            result = -1;
            ipFlag->status |= BINARY_FILE ;
            if (ipFlag->verbose) {
              if ((ipFlag->stdio_mode) && (!ipFlag->error)) ipFlag->error = 1;
              D2U_UTF8_FPRINTF(stderr, "%s: ", progname);
              D2U_UTF8_FPRINTF(stderr, _("Binary symbol 0x%02X found at line %u\n"),ch, cur_line);
            }
            break;
          }

          if (ch == '\x0d') {
            /* The helper decides whether this CR is dropped or kept. */
            if (StripDelimiter( ipInF, ipOutF, ipFlag, ch, &converted, progname) == EOF) {
              result = -1;
              break;
            }
            continue;
          }

          if (ch == '\x0a') /* Count all DOS and Unix line breaks */
            ++cur_line;
          if (fputc(table[ch], ipOutF) == EOF) {
            result = -1;
            d2u_putc_error(ipFlag,progname);
            break;
          }
        }
        /* Tell a read error apart from a regular end of input. */
        if ((ch == EOF) && ferror(ipInF)) {
          result = -1;
          d2u_getc_error(ipFlag,progname);
        }
        break;

      case FROMTO_MAC2UNIX: /* mac2unix */
        for (;;) {
          ch = fgetc(ipInF);
          if (ch == EOF)
            break;

          /* Refuse control characters that do not belong in a text file. */
          if ((ipFlag->Force == 0) && (ch < 32) &&
              !((ch == '\x0a') || (ch == '\x0d') || (ch == '\x09') || (ch == '\x0c'))) {
            result = -1;
            ipFlag->status |= BINARY_FILE ;
            if (ipFlag->verbose) {
              if ((ipFlag->stdio_mode) && (!ipFlag->error)) ipFlag->error = 1;
              D2U_UTF8_FPRINTF(stderr, "%s: ", progname);
              D2U_UTF8_FPRINTF(stderr, _("Binary symbol 0x%02X found at line %u\n"),ch, cur_line);
            }
            break;
          }

          if (ch != '\x0d') {
            if (ch == '\x0a') /* Count all DOS and Unix line breaks */
              ++cur_line;
            if (fputc(table[ch], ipOutF) == EOF) {
              result = -1;
              d2u_putc_error(ipFlag,progname);
              break;
            }
            continue;
          }

          /* ch is a CR: peek at what follows it. */
          ch_peek = fgetc(ipInF);
          if (ch_peek != EOF) {
            if (ungetc( ch_peek, ipInF ) == EOF) {  /* put back peek char */
              d2u_getc_error(ipFlag,progname);
              result = -1;
              break;
            }
            /* Don't touch this delimiter if it's a CR,LF pair. */
            if (ch_peek == '\x0a') {
              if (fputc('\x0d', ipOutF) == EOF) { /* put CR, part of DOS CR-LF */
                result = -1;
                d2u_putc_error(ipFlag,progname);
                break;
              }
              continue;
            }
          }

          /* MAC line end (CR). Put LF */
          if (fputc('\x0a', ipOutF) == EOF) {
            result = -1;
            d2u_putc_error(ipFlag,progname);
            break;
          }
          converted++;
          cur_line++; /* Count all Mac line breaks */
          if (ipFlag->NewLine) {  /* add additional LF? */
            if (fputc('\x0a', ipOutF) == EOF) {
              result = -1;
              d2u_putc_error(ipFlag,progname);
              break;
            }
          }
        }
        /* Tell a read error apart from a regular end of input. */
        if ((ch == EOF) && ferror(ipInF)) {
          result = -1;
          d2u_getc_error(ipFlag,progname);
        }
        break;

      default: /* unknown FromToMode */
      ;
#if DEBUG
      D2U_UTF8_FPRINTF(stderr, "%s: ", progname);
      D2U_UTF8_FPRINTF(stderr, _("program error, invalid conversion mode %d\n"),ipFlag->FromToMode);
      exit(1);
#endif
    }

    if ((result == 0) && (ipFlag->verbose > 1)) {
      D2U_UTF8_FPRINTF(stderr, "%s: ", progname);
      D2U_UTF8_FPRINTF(stderr, _("Converted %u out of %u line breaks.\n"),converted, cur_line -1);
    }
    return result;
}


/* Program entry point.
 *
 * Sets up the locale and message catalogue (when built with ENABLE_NLS),
 * allocates the CFlag option structure, selects mac2unix mode when the
 * program was invoked under the name "mac2unix" or "mac2unix.exe"
 * (case-insensitive), and passes control to parse_options() together with
 * the conversion callbacks defined in this file.
 *
 * Returns the value of parse_options(), or a non-zero error number when one
 * of the initial allocations fails.
 */
int main (int argc, char *argv[])
{
  /* variable declarations */
  char progname[9];
  CFlag *pFlag;
  char *ptr;
  char localedir[1024];
# ifdef __MINGW64__
  /* NOTE(review): declared as a local here; confirm whether the runtime
     expects a file-scope definition for this to take effect. */
  int _dowildcard = -1; /* enable wildcard expansion for Win64 */
# endif
  int  argc_new;
  char **argv_new;
#ifdef D2U_UNIFILE
  wchar_t **wargv;
  char ***argv_glob;
# endif

  progname[8] = '\0';
  strcpy(progname,"dos2unix");

#ifdef ENABLE_NLS
   /* The environment may override the compiled-in message catalogue directory. */
   ptr = getenv("DOS2UNIX_LOCALEDIR");
   if (ptr == NULL)
      d2u_strncpy(localedir,LOCALEDIR,sizeof(localedir));
   else {
      if (strlen(ptr) < sizeof(localedir))
         d2u_strncpy(localedir,ptr,sizeof(localedir));
      else {
         /* Too long for the buffer: report it and fall back to the default. */
         D2U_UTF8_FPRINTF(stderr,"%s: ",progname);
         D2U_ANSI_FPRINTF(stderr, "%s", _("error: Value of environment variable DOS2UNIX_LOCALEDIR is too long.\n"));
         d2u_strncpy(localedir,LOCALEDIR,sizeof(localedir));
      }
   }
#endif

#if defined(ENABLE_NLS) || (defined(D2U_UNICODE) && !defined(__MSDOS__) && !defined(_WIN32) && !defined(__OS2__))
/* setlocale() is also needed for nl_langinfo() */
#if (defined(_WIN32) && !defined(__CYGWIN__))
/* When the locale is set to "" on Windows all East-Asian multi-byte ANSI encoded text is printed
   wrongly when you use standard printf(). Also UTF-8 code is printed wrongly. See also test/setlocale.c.
   When we set the locale to "C" gettext still translates the messages on Windows. On Unix this would disable
   gettext. */
   setlocale (LC_ALL, "C");
#else
   setlocale (LC_ALL, "");
#endif
#endif

#ifdef ENABLE_NLS
   bindtextdomain (PACKAGE, localedir);
   textdomain (PACKAGE);
#endif


  /* variable initialisations */
  pFlag = (CFlag*)malloc(sizeof(CFlag));
  if (pFlag == NULL) {
    /* Capture errno before any output call gets a chance to overwrite it. */
    int saved_errno = errno;
    D2U_UTF8_FPRINTF(stderr, "dos2unix:");
    D2U_ANSI_FPRINTF(stderr, " %s\n", strerror(saved_errno));
    /* Never report success for a failed allocation, even if errno was left at 0. */
    return (saved_errno != 0) ? saved_errno : 1;
  }
  pFlag->FromToMode = FROMTO_DOS2UNIX;  /* default dos2unix */
  pFlag->keep_bom = 0;

  /* Find the base name of the executable: the part after the last '/', or
     after the last '\\' when no '/' is present. argv[0] may be missing when
     the program is started with an empty argument vector, so it is checked
     before use. */
  ptr = NULL;
  if ((argc > 0) && (argv[0] != NULL)) {
    if ( ((ptr=strrchr(argv[0],'/')) == NULL) && ((ptr=strrchr(argv[0],'\\')) == NULL) )
      ptr = argv[0];
    else
      ptr++;
  }

  /* Invoked as mac2unix: switch the conversion mode and the reported name. */
  if ((ptr != NULL) &&
      ((strcmpi("mac2unix", ptr) == 0) || (strcmpi("mac2unix.exe", ptr) == 0))) {
    pFlag->FromToMode = FROMTO_MAC2UNIX;
    strcpy(progname,"mac2unix");
  }

#ifdef D2U_UNIFILE
  /* Get arguments in wide Unicode format in the Windows Command Prompt */

  /* This does not support wildcard expansion (globbing) */
  wargv = CommandLineToArgvW(GetCommandLineW(), &argc);

  /* Single slot that glob_warg() fills with the new argument vector. */
  argv_glob = (char ***)malloc(sizeof(char **));
  if (argv_glob == NULL) {
    /* Capture errno before output and free() get a chance to overwrite it. */
    int saved_errno = errno;
    D2U_UTF8_FPRINTF(stderr, "%s:", progname);
    D2U_ANSI_FPRINTF(stderr, " %s\n", strerror(saved_errno));
    free(pFlag);
    return (saved_errno != 0) ? saved_errno : 1;
  }
  /* Glob the arguments and convert them to UTF-8 */
  argc_new = glob_warg(argc, wargv, argv_glob, pFlag, progname);
  argv_new = *argv_glob;
#else
  argc_new = argc;
  argv_new = argv;
#endif

#ifdef D2U_UNICODE
  return parse_options(argc_new, argv_new, pFlag, localedir, progname, PrintLicense, ConvertDosToUnix, ConvertDosToUnixW);
#else
  return parse_options(argc_new, argv_new, pFlag, localedir, progname, PrintLicense, ConvertDosToUnix);
#endif
}