/** \file
|
* \brief The ANTLR3 C filestream is used when the source character stream
|
* is a filesystem based input set and all the characters in the filestream
|
* can be loaded at once into memory and away the lexer goes.
|
*
|
* A number of initializers are provided in order that various character
|
* sets can be supported from input files. The ANTLR3 C runtime expects
|
* to deal with UTF32 characters only (the reasons for this are to
|
* do with the simplification of C code when using this form of Unicode
|
* encoding, though this is not a panacea). More information can be
|
* found on this by consulting:
|
* - http://www.unicode.org/versions/Unicode4.0.0/ch02.pdf#G11178
|
* Where a well grounded discussion of the encoding formats available
|
* may be found.
|
*
|
*/
|
|
// [The "BSD licence"]
|
// Copyright (c) 2005-2009 Jim Idle, Temporal Wave LLC
|
// http://www.temporal-wave.com
|
// http://www.linkedin.com/in/jimidle
|
//
|
// All rights reserved.
|
//
|
// Redistribution and use in source and binary forms, with or without
|
// modification, are permitted provided that the following conditions
|
// are met:
|
// 1. Redistributions of source code must retain the above copyright
|
// notice, this list of conditions and the following disclaimer.
|
// 2. Redistributions in binary form must reproduce the above copyright
|
// notice, this list of conditions and the following disclaimer in the
|
// documentation and/or other materials provided with the distribution.
|
// 3. The name of the author may not be used to endorse or promote products
|
// derived from this software without specific prior written permission.
|
//
|
// THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
|
// IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
|
// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
|
// IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
|
// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
|
// NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
|
// THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
#include <antlr3.h>
|
|
// Forward declarations of the file-local (static) helpers that are defined
// further down in this file but called before their definitions.
//
// setupInputStream         - installs the encoding specific API on a stream
// antlr3CreateFileStream   - builds a stream from the contents of a file
// antlr3CreateStringStream - builds a stream over caller supplied data
//
static void setupInputStream (pANTLR3_INPUT_STREAM input);
|
static pANTLR3_INPUT_STREAM antlr3CreateFileStream (pANTLR3_UINT8 fileName);
|
static pANTLR3_INPUT_STREAM antlr3CreateStringStream (pANTLR3_UINT8 data);
|
|
ANTLR3_API pANTLR3_INPUT_STREAM
|
antlr3FileStreamNew(pANTLR3_UINT8 fileName, ANTLR3_UINT32 encoding)
|
{
|
pANTLR3_INPUT_STREAM input;
|
|
// First order of business is to read the file into some buffer space
|
// as just straight 8 bit bytes. Then we will work out the encoding and
|
// byte order and adjust the API functions that are installed for the
|
// default 8Bit stream accordingly.
|
//
|
input = antlr3CreateFileStream(fileName);
|
if (input == NULL)
|
{
|
return NULL;
|
}
|
|
// We have the data in memory now so we can deal with it according to
|
// the encoding scheme we were given by the user.
|
//
|
input->encoding = encoding;
|
|
// Now we need to work out the endian type and install any
|
// API functions that differ from 8Bit
|
//
|
setupInputStream(input);
|
|
// Now we can set up the file name
|
//
|
input->istream->streamName = input->strFactory->newStr8(input->strFactory, fileName);
|
input->fileName = input->istream->streamName;
|
|
return input;
|
}
|
|
|
ANTLR3_API pANTLR3_INPUT_STREAM
|
antlr3StringStreamNew(pANTLR3_UINT8 data, ANTLR3_UINT32 encoding, ANTLR3_UINT32 size, pANTLR3_UINT8 name)
|
{
|
pANTLR3_INPUT_STREAM input;
|
|
// First order of business is to set up the stream and install the data pointer.
|
// Then we will work out the encoding and byte order and adjust the API functions that are installed for the
|
// default 8Bit stream accordingly.
|
//
|
input = antlr3CreateStringStream(data);
|
if (input == NULL)
|
{
|
return NULL;
|
}
|
|
// Size (in bytes) of the given 'string'
|
//
|
input->sizeBuf = size;
|
|
// We have the data in memory now so we can deal with it according to
|
// the encoding scheme we were given by the user.
|
//
|
input->encoding = encoding;
|
|
// Now we need to work out the endian type and install any
|
// API functions that differ from 8Bit
|
//
|
setupInputStream(input);
|
|
// Now we can set up the file name
|
//
|
input->istream->streamName = input->strFactory->newStr8(input->strFactory, name);
|
input->fileName = input->istream->streamName;
|
|
return input;
|
}
|
|
|
/// Determine endianess of the input stream and install the
|
/// API required for the encoding in that format.
|
///
|
static void
|
setupInputStream(pANTLR3_INPUT_STREAM input)
|
{
|
ANTLR3_BOOLEAN isBigEndian;
|
|
// Used to determine the endianness of the machine we are currently
|
// running on.
|
//
|
ANTLR3_UINT16 bomTest = 0xFEFF;
|
|
// What endianess is the machine we are running on? If the incoming
|
// encoding endianess is the same as this machine's natural byte order
|
// then we can use more efficient API calls.
|
//
|
if (*((pANTLR3_UINT8)(&bomTest)) == 0xFE)
|
{
|
isBigEndian = ANTLR3_TRUE;
|
}
|
else
|
{
|
isBigEndian = ANTLR3_FALSE;
|
}
|
|
// What encoding did the user tell us {s}he thought it was? I am going
|
// to get sick of the questions on antlr-interest, I know I am.
|
//
|
switch (input->encoding)
|
{
|
case ANTLR3_ENC_UTF8:
|
|
// See if there is a BOM at the start of this UTF-8 sequence
|
// and just eat it if there is. Windows .TXT files have this for instance
|
// as it identifies UTF-8 even though it is of no consequence for byte order
|
// as UTF-8 does not have a byte order.
|
//
|
if ( (ANTLR3_UINT8)(*((pANTLR3_UINT8)input->nextChar)) == 0xEF
|
&& (ANTLR3_UINT8)(*((pANTLR3_UINT8)input->nextChar+1)) == 0xBB
|
&& (ANTLR3_UINT8)(*((pANTLR3_UINT8)input->nextChar+2)) == 0xBF
|
)
|
{
|
// The UTF8 BOM is present so skip it
|
//
|
input->nextChar = (void *)((pANTLR3_UINT8)input->nextChar + 3);
|
}
|
|
// Install the UTF8 input routines
|
//
|
antlr3UTF8SetupStream(input);
|
break;
|
|
case ANTLR3_ENC_UTF16:
|
|
// See if there is a BOM at the start of the input. If not then
|
// we assume that the byte order is the natural order of this
|
// machine (or it is really UCS2). If there is a BOM we determine if the encoding
|
// is the same as the natural order of this machine.
|
//
|
if ( (ANTLR3_UINT8)(*((pANTLR3_UINT8)input->nextChar)) == 0xFE
|
&& (ANTLR3_UINT8)(*((pANTLR3_UINT8)input->nextChar+1)) == 0xFF
|
)
|
{
|
// BOM Present, indicates Big Endian
|
//
|
input->nextChar = (void *)((pANTLR3_UINT8)input->nextChar + 2);
|
|
antlr3UTF16SetupStream(input, isBigEndian, ANTLR3_TRUE);
|
}
|
else if ( (ANTLR3_UINT8)(*((pANTLR3_UINT8)input->nextChar)) == 0xFF
|
&& (ANTLR3_UINT8)(*((pANTLR3_UINT8)input->nextChar+1)) == 0xFE
|
)
|
{
|
// BOM present, indicates Little Endian
|
//
|
input->nextChar = (void *)((pANTLR3_UINT8)input->nextChar + 2);
|
|
antlr3UTF16SetupStream(input, isBigEndian, ANTLR3_FALSE);
|
}
|
else
|
{
|
// No BOM present, assume local computer byte order
|
//
|
antlr3UTF16SetupStream(input, isBigEndian, isBigEndian);
|
}
|
break;
|
|
case ANTLR3_ENC_UTF32:
|
|
// See if there is a BOM at the start of the input. If not then
|
// we assume that the byte order is the natural order of this
|
// machine. If there is we determine if the encoding
|
// is the same as the natural order of this machine.
|
//
|
if ( (ANTLR3_UINT8)(*((pANTLR3_UINT8)input->nextChar)) == 0x00
|
&& (ANTLR3_UINT8)(*((pANTLR3_UINT8)input->nextChar+1)) == 0x00
|
&& (ANTLR3_UINT8)(*((pANTLR3_UINT8)input->nextChar+2)) == 0xFE
|
&& (ANTLR3_UINT8)(*((pANTLR3_UINT8)input->nextChar+3)) == 0xFF
|
)
|
{
|
// BOM Present, indicates Big Endian
|
//
|
input->nextChar = (void *)((pANTLR3_UINT8)input->nextChar + 4);
|
|
antlr3UTF32SetupStream(input, isBigEndian, ANTLR3_TRUE);
|
}
|
else if ( (ANTLR3_UINT8)(*((pANTLR3_UINT8)input->nextChar)) == 0xFF
|
&& (ANTLR3_UINT8)(*((pANTLR3_UINT8)input->nextChar+1)) == 0xFE
|
&& (ANTLR3_UINT8)(*((pANTLR3_UINT8)input->nextChar+1)) == 0x00
|
&& (ANTLR3_UINT8)(*((pANTLR3_UINT8)input->nextChar+1)) == 0x00
|
)
|
{
|
// BOM present, indicates Little Endian
|
//
|
input->nextChar = (void *)((pANTLR3_UINT8)input->nextChar + 4);
|
|
antlr3UTF32SetupStream(input, isBigEndian, ANTLR3_FALSE);
|
}
|
else
|
{
|
// No BOM present, assume local computer byte order
|
//
|
antlr3UTF32SetupStream(input, isBigEndian, isBigEndian);
|
}
|
break;
|
|
case ANTLR3_ENC_UTF16BE:
|
|
// Encoding is definately Big Endian with no BOM
|
//
|
antlr3UTF16SetupStream(input, isBigEndian, ANTLR3_TRUE);
|
break;
|
|
case ANTLR3_ENC_UTF16LE:
|
|
// Encoding is definately Little Endian with no BOM
|
//
|
antlr3UTF16SetupStream(input, isBigEndian, ANTLR3_FALSE);
|
break;
|
|
case ANTLR3_ENC_UTF32BE:
|
|
// Encoding is definately Big Endian with no BOM
|
//
|
antlr3UTF32SetupStream(input, isBigEndian, ANTLR3_TRUE);
|
break;
|
|
case ANTLR3_ENC_UTF32LE:
|
|
// Encoding is definately Little Endian with no BOM
|
//
|
antlr3UTF32SetupStream(input, isBigEndian, ANTLR3_FALSE);
|
break;
|
|
case ANTLR3_ENC_EBCDIC:
|
|
// EBCDIC is basically the same as ASCII but with an on the
|
// fly translation to ASCII
|
//
|
antlr3EBCDICSetupStream(input);
|
break;
|
|
case ANTLR3_ENC_8BIT:
|
default:
|
|
// Standard 8bit/ASCII
|
//
|
antlr38BitSetupStream(input);
|
break;
|
}
|
}
|
|
/** \brief Use the contents of an operating system file as the input
|
* for an input stream.
|
*
|
* \param fileName Name of operating system file to read.
|
* \return
|
* - Pointer to new input stream context upon success
|
* - One of the ANTLR3_ERR_ defines on error.
|
*/
|
static pANTLR3_INPUT_STREAM
|
antlr3CreateFileStream(pANTLR3_UINT8 fileName)
|
{
|
// Pointer to the input stream we are going to create
|
//
|
pANTLR3_INPUT_STREAM input;
|
ANTLR3_UINT32 status;
|
|
if (fileName == NULL)
|
{
|
return NULL;
|
}
|
|
// Allocate memory for the input stream structure
|
//
|
input = (pANTLR3_INPUT_STREAM)
|
ANTLR3_CALLOC(1, sizeof(ANTLR3_INPUT_STREAM));
|
|
if (input == NULL)
|
{
|
return NULL;
|
}
|
|
// Structure was allocated correctly, now we can read the file.
|
//
|
status = antlr3read8Bit(input, fileName);
|
|
// Call the common 8 bit input stream handler
|
// initialization.
|
//
|
antlr3GenericSetupStream(input);
|
|
// However if the file was not there or something then we
|
// need to close. Have to wait until here as we cannot call
|
// close until the API is installed of course.
|
//
|
if (status != ANTLR3_SUCCESS)
|
{
|
input->close(input);
|
return NULL;
|
}
|
|
return input;
|
}
|
|
ANTLR3_API ANTLR3_UINT32
|
antlr3read8Bit(pANTLR3_INPUT_STREAM input, pANTLR3_UINT8 fileName)
|
{
|
ANTLR3_FDSC infile;
|
ANTLR3_UINT32 fSize;
|
|
/* Open the OS file in read binary mode
|
*/
|
infile = antlr3Fopen(fileName, "rb");
|
|
/* Check that it was there
|
*/
|
if (infile == NULL)
|
{
|
return (ANTLR3_UINT32)ANTLR3_ERR_NOFILE;
|
}
|
|
/* It was there, so we can read the bytes now
|
*/
|
fSize = antlr3Fsize(fileName); /* Size of input file */
|
|
/* Allocate buffer for this input set
|
*/
|
input->data = ANTLR3_MALLOC((size_t)fSize);
|
input->sizeBuf = fSize;
|
|
if (input->data == NULL)
|
{
|
return (ANTLR3_UINT32)ANTLR3_ERR_NOMEM;
|
}
|
|
input->isAllocated = ANTLR3_TRUE;
|
|
/* Now we read the file. Characters are not converted to
|
* the internal ANTLR encoding until they are read from the buffer
|
*/
|
antlr3Fread(infile, fSize, input->data);
|
|
/* And close the file handle
|
*/
|
antlr3Fclose(infile);
|
|
return ANTLR3_SUCCESS;
|
}
|
|
/** \brief Open an operating system file and return the descriptor
|
* We just use the common open() and related functions here.
|
* Later we might find better ways on systems
|
* such as Windows and OpenVMS for instance. But the idea is to read the
|
* while file at once anyway, so it may be irrelevant.
|
*/
|
ANTLR3_API ANTLR3_FDSC
|
antlr3Fopen(pANTLR3_UINT8 filename, const char * mode)
|
{
|
return (ANTLR3_FDSC)fopen((const char *)filename, mode);
|
}
|
|
/** \brief Close an operating system file and free any handles
|
* etc.
|
*/
|
ANTLR3_API void
|
antlr3Fclose(ANTLR3_FDSC fd)
|
{
|
fclose(fd);
|
}
|
ANTLR3_API ANTLR3_UINT32
|
antlr3Fsize(pANTLR3_UINT8 fileName)
|
{
|
struct _stat statbuf;
|
|
_stat((const char *)fileName, &statbuf);
|
|
return (ANTLR3_UINT32)statbuf.st_size;
|
}
|
|
ANTLR3_API ANTLR3_UINT32
|
antlr3Fread(ANTLR3_FDSC fdsc, ANTLR3_UINT32 count, void * data)
|
{
|
return (ANTLR3_UINT32)fread(data, (size_t)count, 1, fdsc);
|
}
|
|
|
/** \brief Use the supplied 'string' as input to the stream
|
*
|
* \param data Pointer to the input data
|
* \return
|
* - Pointer to new input stream context upon success
|
* - NULL defines on error.
|
*/
|
static pANTLR3_INPUT_STREAM
|
antlr3CreateStringStream(pANTLR3_UINT8 data)
|
{
|
// Pointer to the input stream we are going to create
|
//
|
pANTLR3_INPUT_STREAM input;
|
|
if (data == NULL)
|
{
|
return NULL;
|
}
|
|
// Allocate memory for the input stream structure
|
//
|
input = (pANTLR3_INPUT_STREAM)
|
ANTLR3_CALLOC(1, sizeof(ANTLR3_INPUT_STREAM));
|
|
if (input == NULL)
|
{
|
return NULL;
|
}
|
|
// Structure was allocated correctly, now we can install the pointer
|
//
|
input->data = data;
|
input->isAllocated = ANTLR3_FALSE;
|
|
// Call the common 8 bit input stream handler
|
// initialization.
|
//
|
antlr3GenericSetupStream(input);
|
|
return input;
|
}
|