
/*
 *  tokenize.c
 *  $Id: tokenize.c,v 2.2 1999/07/07 19:30:52 bkorb Exp $
 *  This routine will break a string down into a series of tokens.
 *  Quoted strings are treated reasonably and backslashes contained
 *  within them will suppress their meaning (e.g. not terminate
 *  the string.)
 */

/*
 *  Tokenize is free software.
 *  You may redistribute it and/or modify it under the terms of the
 *  GNU General Public License, as published by the Free Software
 *  Foundation; either version 2, or (at your option) any later version.
 *
 *  Tokenize is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with Automated Options.  See the file "COPYING".  If not,
 *  write to:  The Free Software Foundation, Inc.,
 *             59 Temple Place - Suite 330,
 *             Boston,  MA  02111-1307, USA.
 *
 * As a special exception, Bruce Korb gives permission for additional
 * uses of the text contained in his release of Tokenize.
 *
 * The exception is that, if you link the Tokenize library with other
 * files to produce an executable, this does not by itself cause the
 * resulting executable to be covered by the GNU General Public License.
 * Your use of that executable is in no way restricted on account of
 * linking the Tokenize library code into it.
 *
 * This exception does not however invalidate any other reasons why
 * the executable file might be covered by the GNU General Public License.
 *
 * This exception applies only to the code released by Bruce Korb under
 * the name Tokenize.  If you copy code from other sources under the
 * General Public License into a copy of Tokenize, as the General Public
 * License permits, the exception does not apply to the code that you add
 * in this way.  To avoid misleading anyone as to the status of such
 * modified files, you must delete this exception notice from them.
 *
 * If you write modifications of your own for Tokenize, it is your choice
 * whether to permit this exception to apply to your modifications.
 * If you do not wish that, delete this exception notice.
 */

#include <compat/compat.h>

#include "autogen.h"

/*
 *  The separator characters in effect until a caller installs a
 *  different set: space, tab, newline, form feed, carriage return,
 *  vertical tab, backspace and alert.
 */
#define DEFAULT_SPLIT       " \t\n\f\r\v\b\a"
/*
 *  The working character set used by tokenize().  The first two
 *  entries are always the double and single quote characters; the
 *  separator characters proper start at index 2.  tokenize() uses
 *  "zSplit+2" when skipping separators ahead of a token and the whole
 *  string (quotes included) when looking for the end of a token.
 *  Only the portion starting at index 2 is ever rewritten.
 */
static char zSplit[ 257 ] = "\"'" DEFAULT_SPLIT;

/*
 *  Scan over quoted text, compacting it in place.  The opening quote,
 *  the closing quote and the backslashes that introduce a recognized
 *  escape are removed, so the cooked text ends up starting at the
 *  address where the opening quote used to be.
 *
 *  The quote character is whatever character "pzQte" points at when
 *  this procedure is called.
 *
 *  Backslash handling depends on the quote character:
 *    - single quote:  a backslash is dropped only when it precedes a
 *                     backslash, a single quote or a '#'; otherwise
 *                     it is kept as an ordinary character.
 *    - anything else: the text after the backslash is handed to
 *                     doEscapeChar(), which stores the replacement
 *                     character and reports how many input characters
 *                     it consumed.
 *
 *  The cooked text is always NUL terminated.
 *
 *  RETURNS:  the address of the character following the closing quote,
 *            or the address of the input's terminating NUL when the
 *            closing quote is missing.
 */
    static char*
spanQuote( char* pzQte )
{
    char  q = *pzQte;          /*  Save the quote character type */
    char* p = pzQte++;         /*  Destination pointer           */

    while (*pzQte != q) {
        switch (*p++ = *pzQte++) {
        case NUL:
            /*
             *  Unterminated quote.  The NUL has already been copied to
             *  the destination, so the cooked text is terminated.
             */
            return pzQte-1;      /* Return address of terminating NUL */

        case '\\':
            if (q != '\'') {
                /*
                 *  "p-1" is where the backslash was just copied;
                 *  doEscapeChar() overwrites it with the translation.
                 */
                unsigned int ct = doEscapeChar( pzQte, p-1 );
                /*
                 *  IF the advance is zero,
                 *  THEN we either have an escaped new-line, which is
                 *       to be ignored, or the backslash is the last
                 *       character of the string.
                 *  ELSE advance the quote scanning pointer by ct
                 */
                if (ct == 0) {
                    p--;     /* move destination back one character */

                    /*
                     *  Skip over the new-line character, but never step
                     *  past the terminating NUL:  leave it for the next
                     *  iteration so the "case NUL" arm terminates the
                     *  result and returns.
                     */
                    if (*pzQte != NUL)
                        pzQte++;
                } else
                    pzQte += ct;

            } else {
                /*
                 *  Single quoted text:  only these three characters
                 *  may be escaped.  The escaped character replaces the
                 *  backslash that was just copied.
                 */
                switch (*pzQte) {
                case '\\':
                case '\'':
                case '#':
                    p[-1] = *pzQte++;
                }
            }
            break;

        default:
            ;
        }
    }

    *p = NUL;
    return pzQte+1; /* Return addr of char after the terminating quote */
}


/*
 *  Tokenize a string in place, like "sh".  Viz., quoted strings are
 *  taken as part of a single token, and quoted text that abuts
 *  unquoted text is glued to it.
 *
 *  pzStr       the text to split.  It is modified:  tokens are NUL
 *              terminated and quoted text is compacted by spanQuote().
 *  tokCt       the maximum number of tokens to store.
 *  papzTokens  receives a pointer to the start of each token found.
 *
 *  Special calls:
 *    - pzStr is NULL:  the separator set is reset to DEFAULT_SPLIT.
 *    - tokCt is zero and papzTokens is NULL:  pzStr supplies a new
 *      separator set (truncated to fit zSplit).
 *    - tokCt is zero and papzTokens is not NULL:  nothing is done.
 *    All three return zero.
 *
 *  RETURNS:  the number of token pointers stored in papzTokens.  When
 *            tokCt tokens have been found, the remainder of the input
 *            is left unscanned.
 *
 *  NB:  In unquoted text, backslashes are *never* stripped out.
 *       Within quoted text they are processed by spanQuote(), which
 *       treats single quoted and double quoted text differently.
 *
 *  EXAMPLE:
 *      ABC" def '"GHI"' jkl ' X" as\dt "\t\n"
 *  RESULT:
 *      token 1:  ABC def 'GHI' jkl ' X
 *      token 2:  as\dt
 *      token 3:  the double quoted \t\n after escape processing.
 *                (An earlier note gave "tn" here; the actual result
 *                depends on doEscapeChar() -- TODO confirm.)
 */
    int
tokenize( char* pzStr, int tokCt, tpapz papzTokens )
{
    /*
     *  No input text?  Reset the split characters to the default.
     */
    if( pzStr == (char*)NULL ) {
        strcpy( zSplit+2, DEFAULT_SPLIT );
        return 0;
    }

    /*
     *  A token count of zero is a secret code that means we are changing
     *  what the separator characters are.  We give them lots of room.
     *  It is only memory.  :-)
     */
    if (tokCt == 0) {
        if (papzTokens == (tpapz)NULL) {
            /*
             *  Leave the two leading quote characters alone.  strncpy
             *  does not terminate on truncation, so do it explicitly.
             *  (STRSIZE is declared elsewhere; presumably it yields the
             *  index of the last array element -- TODO confirm.)
             */
            strncpy( zSplit+2, pzStr, sizeof( zSplit )-3 );
            zSplit[ STRSIZE( zSplit ) ] = NUL;
        }

        return 0;
    }

    {
        u_int  ctLeft = tokCt;

        do {
            /*
             *  Skip all the spanning characters (but not quotes).
             */
            char*  pzStart = pzStr + strspn( pzStr, zSplit+2 );
            char*  pzEnd;

            /*
             *  IF we reached the end of the input string, bail
             */
            if (*pzStart == NUL)
                break;

            /*
             *  found a token of some sort...
             */
            ctLeft--;
            *(papzTokens++) = pzStart;

            /*
             *  Find the first character following the token start that
             *  is a member of the separator set, or is a quote.
             */
            pzEnd = strpbrk( pzStart, zSplit );

            /*
             *  IF none, perforce, we are done
             */
            if (pzEnd == (char*)NULL)
                break;

            /*
             *  FOR as long as the separator character we find is a quote
             */
            while ((*pzEnd == '"') || (*pzEnd == '\'')) {
                /*
                 *  Terminate the quoted text and get a pointer to the
                 *  character following the terminating quote.
                 */
                char*  pzEndQuote = spanQuote( pzEnd );

                /*
                 *  Glue all the following text to the end of the cooked
                 *  quoted text.  Source and destination lie in the same
                 *  buffer and normally overlap, so this must be a
                 *  memmove:  strcpy is undefined for overlapping
                 *  objects.  The "+ 1" carries the terminating NUL.
                 */
                pzEnd += strlen( pzEnd );
                memmove( pzEnd, pzEndQuote, strlen( pzEndQuote ) + 1 );

                /*
                 *  Look again for the token separator and bail if done
                 */
                pzEnd = strpbrk( pzEnd, zSplit );
                if (pzEnd == (char*)NULL)
                    return tokCt - ctLeft;
            }

            /*
             *  NUL terminate the token
             */
            *(pzEnd++) = NUL;

            pzStr = pzEnd;
        }  while (ctLeft > 0);

        return tokCt - ctLeft;
    }
}
/* end of tokenize.c */
