commit 3a588bb82a1b9248f166c0c3f814ce65008a59d0
parent f25090d64f44c1349a200acff7648ff1676a64ff
Author: Andres Navarro <canavarro82@gmail.com>
Date: Thu, 17 Feb 2011 17:08:38 -0300
(almost complete) Tokenizer added. Constructors for pair, string and boolean. Some more macros in kobject.h.
Diffstat:
12 files changed, 893 insertions(+), 73 deletions(-)
diff --git a/src/Makefile b/src/Makefile
@@ -7,7 +7,7 @@ MYCFLAGS=
MYLDFLAGS=
MYLIBS=
-CORE_O=
+CORE_O= kobject.o ktoken.o kpair.o kstring.o ksymbol.o
KRN_T= klisp
KRN_O= klisp.o
@@ -23,7 +23,7 @@ all: $(ALL_T)
o: $(ALL_O)
$(KRN_T): $(ALL_O)
- $(CC) -o $@ $(MYLDFLAGS) $(KRN_O) $(LIBS)
+ $(CC) -o $@ $(MYLDFLAGS) $(ALL_O) $(LIBS)
clean:
$(RM) $(ALL_T) $(ALL_O)
@@ -31,4 +31,10 @@ clean:
# list targets that do not create files (but not all makes understand .PHONY)
.PHONY: all default o clean
-klisp.o: klisp.c klisp.h kobject.h
+klisp.o: klisp.c klisp.h kobject.h ktoken.h
+kobject.o: kobject.c kobject.h
+ktoken.o: ktoken.c ktoken.h kobject.h kpair.h kstring.h ksymbol.h
+kpair.o: kpair.c kpair.h kobject.h
+kstring.o: kstring.c kstring.h kobject.h
+# XXX: kpair.h because of use of list as symbol table
+ksymbol.o: ksymbol.c ksymbol.h kobject.h kpair.h
diff --git a/src/klisp.c b/src/klisp.c
@@ -7,44 +7,28 @@
#include <stdio.h>
#include "kobject.h"
+#include "ktoken.h"
int main(int argc, char *argv[])
{
- printf("Tests\n");
-
- printf("\nVariables: \n");
- printf("nil: %d\n", ttisnil(knil));
- printf("ignore: %d\n", ttisignore(kignore));
- printf("inert: %d\n", ttisinert(kinert));
- printf("eof: %d\n", ttiseof(keof));
- printf("true: %d\n", ttisboolean(ktrue));
- printf("false: %d\n", ttisboolean(kfalse));
-
-
- printf("\nConstants: \n");
-
- printf("nil: %d\n", ttisnil(KNIL));
- printf("ignore: %d\n", ttisignore(KIGNORE));
- printf("inert: %d\n", ttisinert(KINERT));
- printf("eof: %d\n", ttiseof(KEOF));
- printf("true: %d\n", ttisboolean(KTRUE));
- printf("false: %d\n", ttisboolean(KFALSE));
-
- printf("int: %d\n",
- ttisfixint(((TValue) {.tv = {.t = K_TAG_FIXINT, .v = { .i = 3}}})));
- printf("double: %d\n", ttisdouble((TValue){.d = 1.0}));
-
- printf("\nSwitch: \n");
-
- printf("nil: %d\n", ttype(KNIL));
- printf("ignore: %d\n", ttype(KIGNORE));
- printf("inert: %d\n", ttype(KINERT));
- printf("eof: %d\n", ttype(KEOF));
- printf("true: %d\n", ttype(KTRUE));
- printf("false: %d\n", ttype(KFALSE));
- printf("int: %d\n",
- ttype(((TValue) {.tv = {.t = K_TAG_FIXINT, .v = { .i = 3}}})));
- printf("double: %d\n", ttype((TValue){.d = 1.0}));
+ /*
+ ** Simple tokenizer loop
+ */
+ printf("Tokenizer Type Test\n");
+
+ ktok_file = stdin;
+ ktok_init();
+
+ TValue tok = KNIL;
+
+ while(!ttiseof(tok)) {
+ tok = ktok_read_token();
+ if (ttisnil(tok)) {
+ /* there was an error */
+ break;
+ }
+ printf("\nToken Type: %s\n", ttname(tok));
+ }
return 0;
}
diff --git a/src/kobject.c b/src/kobject.c
@@ -0,0 +1,46 @@
+/*
+** kobject.h
+** Type definitions for Kernel Objects
+** See Copyright Notice in klisp.h
+*/
+
+#include "kobject.h"
+
+/*
+** The global const variables
+*/
+const TValue knil = KNIL_;
+const TValue kignore = KIGNORE_;
+const TValue kinert = KINERT_;
+const TValue keof = KEOF_;
+const TValue ktrue = KTRUE_;
+const TValue kfalse = KFALSE_;
+const TValue kepinf = KEPINF_;
+const TValue keminf = KEMINF_;
+
+/*
+** The name strings for all TValue types
+*/
+char *ktv_names[] = {
+ [K_TFIXINT] = "fixint",
+ [K_TBIGINT] = "bigint",
+ [K_TFIXRAT] = "fixrat",
+ [K_TBIGRAT] = "bigrat",
+ [K_TEINF] = "einf",
+ [K_TDOUBLE] = "double",
+ [K_TBDOUBLE] = "bdouble",
+ [K_TIINF] = "iinf",
+ [K_TRWNPN] = "rwnpn",
+ [K_TCOMPLEX] = "complex",
+
+ [K_TNIL] = "nil",
+ [K_TIGNORE] = "ignore",
+ [K_TINERT] = "inert",
+ [K_TEOF] = "eof",
+ [K_TBOOLEAN] = "boolean",
+ [K_TCHAR] = "char",
+
+ [K_TPAIR] = "pair",
+ [K_TSTRING] = "string",
+ [K_TSYMBOL] = "symbol"
+};
diff --git a/src/kobject.h b/src/kobject.h
@@ -6,7 +6,7 @@
/*
** SOURCE NOTE: While the tagging system comes from Mozilla TraceMonkey,
-** o code from TraceMonkey was used.
+** no code from TraceMonkey was used.
** The general structure, names and comments of this file follow the
** scheme of Lua.
*/
@@ -52,16 +52,14 @@ typedef struct __attribute__ ((__packed__)) GCheader {
/*
** Tags: Types & Flags
-*/
-
-/*
-** Tagged values in 64 bits (for 32 bit systems)
-** NaN boxing: Values are encoded as double precision NaNs
-** There is one canonical NaN that is used through the interpreter
-** and all remaining NaNs are used to encode the rest of the types
-** (other than double)
-** Canonical NaN: (0)(111 1111 1111) 1000 0000 0000 0000 0000 32(0)
-** Infinities: s(111 1111 1111) 0000 0000 0000 0000 0000 32(0)
+**
+** Nan Boxing: Tagged values in 64 bits (for 32 bit systems)
+** All Values except doubles are encoded as double precision NaNs
+** There is one canonical NaN(?maybe none?) that is used through the
+** interpreter and all remaining NaNs are used to encode the rest of
+** the types (other than double)
+** Canonical NaN(?): (0)(111 1111 1111) 1000 0000 0000 0000 0000 32(0)
+** Infinities(?): s(111 1111 1111) 0000 0000 0000 0000 0000 32(0)
** Tagged values: (0)(111 1111 1111) 1111 tttt tttt tttt tttt 32(v)
** So all tags start with 0x7fff which leaves us 16 bits for the
** tag proper.
@@ -83,6 +81,7 @@ typedef struct __attribute__ ((__packed__)) GCheader {
#define K_TAG_BASE_TYPE(t) ((t) & K_TAG_BASE_TYPE_MASK)
/*
+** RATIONALE:
** Number types are first and ordered to allow easy switch statements
** in arithmetic operators. The ones marked with (?) are still in
** consideration for separate type tags.
@@ -119,8 +118,15 @@ typedef struct __attribute__ ((__packed__)) GCheader {
#define K_MAKE_VTAG(t) (K_TAG_TAGGED | t)
-/* TODO: For now we will only use fixints */
+/*
+** TODO:
+**
+** - decide if inexact infinities and reals with no
+** primary values are included in K_TDOUBLE
+** - For now we will only use fixints and exact infinities
+*/
#define K_TAG_FIXINT K_MAKE_VTAG(K_TFIXINT)
+#define K_TAG_EINF K_MAKE_VTAG(K_TEINF)
#define K_TAG_NIL K_MAKE_VTAG(K_TNIL)
#define K_TAG_IGNORE K_MAKE_VTAG(K_TIGNORE)
@@ -137,17 +143,11 @@ typedef struct __attribute__ ((__packed__)) GCheader {
** Macros to test types
*/
-/*
-** This is intended for use in switch statements
-** TODO: decide if inexact infinities and reals with no
-** primary values are included in K_TDOUBLE
-*/
+/* NOTE: This is intended for use in switch statements */
#define ttype(o) ({ TValue o_ = o; \
ttisdouble(o_)? K_TDOUBLE : ttype_(o_); })
-/*
-** This is intended for internal use below. DON'T USE OUTSIDE THIS FILE
-*/
+/* This is intended for internal use below. DON'T USE OUTSIDE THIS FILE */
#define ttag(o) ((o).tv.t)
#define ttype_(o) (K_TAG_TYPE(ttag(o)))
#define tflag_(o) (K_TAG_FLAG(ttag(o)))
@@ -175,7 +175,7 @@ typedef struct __attribute__ ((__packed__)) GCheader {
typedef union {
bool b;
int32_t i;
- unsigned char ch;
+ char ch;
GCObject *gc;
void *p;
/* ... */
@@ -206,21 +206,30 @@ typedef struct __attribute__ ((__packed__)) {
TValue cdr;
} Pair;
+/* XXX: Symbol should probably contain a String instead of a char buf */
typedef struct __attribute__ ((__packed__)) {
CommonHeader;
- unsigned char b[]; // buffer
+ uint32_t size;
+ char b[];
} Symbol;
+/*
+** RATIONALE:
+**
+** Storing size allows embedded '\0's.
+** Note, however, that there are actually size + 1 bytes allocated
+** and that b[size] = '\0'. This is useful for printing strings
+**
+*/
typedef struct __attribute__ ((__packed__)) {
CommonHeader;
- uint32_t size; // to allow embedded '\0'
- unsigned char b[]; // buffer
+ uint32_t size;
+ char b[]; // buffer
} String;
/*
** Union of all Kernel heap-allocated values
*/
-
/* LUA NOTE: In Lua the corresponding union is in lstate.h */
union GCObject {
GCheader gch;
@@ -239,22 +248,66 @@ union GCObject {
#define KEOF_ {.tv = {.t = K_TAG_EOF, .v = { .i = 0 }}}
#define KTRUE_ {.tv = {.t = K_TAG_BOOLEAN, .v = { .b = true }}}
#define KFALSE_ {.tv = {.t = K_TAG_BOOLEAN, .v = { .b = false }}}
+#define KEPINF_ {.tv = {.t = K_TAG_EINF, .v = { .i = 1 }}}
+#define KEMINF_ {.tv = {.t = K_TAG_EINF, .v = { .i = -1 }}}
+/* RATIONALE: the ones above can be used in initializers */
#define KNIL ((TValue) KNIL_)
#define KINERT ((TValue) KINERT_)
#define KIGNORE ((TValue) KIGNORE_)
#define KEOF ((TValue) KEOF_)
#define KTRUE ((TValue) KTRUE_)
#define KFALSE ((TValue) KFALSE_)
-
-/*
-** The same constants as global const variables
-*/
-const TValue knil = KNIL_;
-const TValue kignore = KIGNORE_;
-const TValue kinert = KINERT_;
-const TValue keof = KEOF_;
-const TValue ktrue = KTRUE_;
-const TValue kfalse = KFALSE_;
+#define KEPINF ((TValue) KEPINF_)
+#define KEMINF ((TValue) KEMINF_)
+
+/* The same constants as global const variables */
+const TValue knil;
+const TValue kignore;
+const TValue kinert;
+const TValue keof;
+const TValue ktrue;
+const TValue kfalse;
+const TValue kepinf;
+const TValue keminf;
+
+/* Macros to create TValues of non-heap allocated types (for initializers) */
+#define ch2tv_(ch_) {.tv = {.t = K_TAG_CHAR, .v = { .ch = ch_ }}}
+#define i2tv_(i_) {.tv = {.t = K_TAG_FIXINT, .v = { .i = i_ }}}
+#define b2tv_(b_) {.tv = {.t = K_TAG_BOOLEAN, .v = { .b = b_ }}}
+
+/* Macros to create TValues of non-heap allocated types */
+#define ch2tv(ch_) ((TValue) ch2tv_(ch_))
+#define i2tv(i_) ((TValue) i2tv_(i_))
+#define b2tv(b_) ((TValue) b2tv_(b_))
+
+/* Macros to convert a GCObject * into a tagged value */
+/* TODO: add assertions */
+/* LUA NOTE: the corresponding defines are in lstate.h */
+#define gc2tv(t_, o_) ((TValue) {.tv = {.t = t_, \
+ .v = { .gc = obj2gco(o_)}}})
+#define gc2pair(o_) (gc2tv(K_TAG_PAIR, o_))
+#define gc2str(o_) (gc2tv(K_TAG_STRING, o_))
+#define gc2sym(o_) (gc2tv(K_TAG_SYMBOL, o_))
+
+/* Macro to convert a TValue into a specific heap allocated object */
+#define tv2pair(v_) ((Pair *) gcvalue(v_))
+#define tv2str(v_) ((String *) gcvalue(v_))
+#define tv2sym(v_) ((Symbol *) gcvalue(v_))
+
+/* Macro to convert any Kernel object into a GCObject */
+#define obj2gco(v_) ((GCObject *) (v_))
+
+/* Macros to access innertv values */
+/* TODO: add assertions */
+#define ivalue(o_) ((o_).tv.v.i)
+#define bvalue(o_) ((o_).tv.v.b)
+#define chvalue(o_) ((o_).tv.v.ch)
+#define gcvalue(o_) ((o_).tv.v.gc)
+
+/* Macro to obtain a string describing the type of a TValue */#
+#define ttname(tv_) (ktv_names[ttype(tv_)])
+
+extern char *ktv_names[];
#endif
diff --git a/src/kpair.c b/src/kpair.c
@@ -0,0 +1,27 @@
+/*
+** kpair.c
+** Kernel Pairs
+** See Copyright Notice in klisp.h
+*/
+
+/* XXX: for malloc */
+#include <stdlib.h>
+/* TODO: use a generalized alloc function */
+
+#include "kpair.h"
+#include "kobject.h"
+
+/* TODO: Out of memory errors */
+/* XXX: for now all pairs are mutable */
+TValue kcons(TValue car, TValue cdr)
+{
+ Pair *new_pair = malloc(sizeof(Pair));
+
+ new_pair->next = NULL;
+ new_pair->gct = 0;
+ new_pair->tt = K_TPAIR;
+ new_pair->car = car;
+ new_pair->cdr = cdr;
+
+ return gc2pair(new_pair);
+}
diff --git a/src/kpair.h b/src/kpair.h
@@ -0,0 +1,25 @@
+/*
+** kpair.h
+** Kernel Pairs
+** See Copyright Notice in klisp.h
+*/
+
+#ifndef kpair_h
+#define kpair_h
+
+#include "kobject.h"
+
+/* TODO: add type assertions */
+/* TODO: add more kc[ad]*r combinations */
+#define kcar(p_) (((Pair *)(p_.tv.v.gc))->car)
+#define kcdr(p_) (((Pair *)(p_.tv.v.gc))->cdr)
+
+#define kset_car(p_, v_) (kcar(p_) = v_)
+#define kset_cdr(p_, v_) (kcdr(p_) = v_)
+
+#define kdummy_cons (kcons(KNIL, KNIL))
+
+/* XXX: for now all pairs are mutable */
+TValue kcons(TValue, TValue);
+
+#endif
diff --git a/src/kstring.c b/src/kstring.c
@@ -0,0 +1,32 @@
+/*
+** kstring.c
+** Kernel Strings
+** See Copyright Notice in klisp.h
+*/
+
+/* XXX: for malloc */
+#include <stdlib.h>
+/* TODO: use a generalized alloc function */
+
+#include <string.h>
+
+#include "kstring.h"
+#include "kobject.h"
+
+/* TODO: Out of memory errors */
+/* XXX: for now all strings are mutable */
+TValue kstring_new(const char *buf, uint32_t size)
+{
+ String *new_str = malloc(sizeof(String) + size + 1);
+
+ new_str->next = NULL;
+ new_str->gct = 0;
+ new_str->tt = K_TSTRING;
+ new_str->size = size;
+ /* NOTE: there can be embedded '\0's in a string */
+ memcpy(new_str->b, buf, size);
+ /* NOTE: they still end with a '\0' for convenience in printing */
+ new_str->b[size] = '\0';
+
+ return gc2str(new_str);
+}
diff --git a/src/kstring.h b/src/kstring.h
@@ -0,0 +1,15 @@
+/*
+** kstring.h
+** Kernel Strings
+** See Copyright Notice in klisp.h
+*/
+
+#ifndef kstring_h
+#define kstring_h
+
+#include "kobject.h"
+
+/* XXX: for now all strings are mutable */
+TValue kstring_new(const char *, uint32_t);
+
+#endif
diff --git a/src/ksymbol.c b/src/ksymbol.c
@@ -0,0 +1,51 @@
+/*
+** ksymbol.c
+** Kernel Symbols
+** See Copyright Notice in klisp.h
+*/
+
+/* XXX: for malloc */
+#include <stdlib.h>
+/* TODO: use a generalized alloc function */
+
+#include <string.h>
+
+#include "ksymbol.h"
+#include "kobject.h"
+#include "kpair.h"
+
+/* TODO: replace the list with a hashtable */
+/* TODO: move to global state */
+TValue ksymbol_table = KNIL_;
+
+/* TODO: Out of memory errors */
+TValue ksymbol_new(const char *buf)
+{
+ /* First look for it in the symbol table */
+ TValue tbl = ksymbol_table;
+ while (!ttisnil(tbl)) {
+ TValue first = kcar(tbl);
+ /* NOTE: there are no embedded '\0's in symbols */
+ if (strcmp(buf, tv2sym(first)->b) == 0)
+ return first;
+ else
+ tbl = kcdr(tbl);
+ }
+
+ /* Didn't find it, alloc new and save in symbol table */
+ /* NOTE: there are no embedded '\0's in symbols */
+ int32_t size = strlen(buf);
+ Symbol *new_sym = malloc(sizeof(Symbol) + size + 1);
+
+ new_sym->next = NULL;
+ new_sym->gct = 0;
+ new_sym->tt = K_TSYMBOL;
+ new_sym->size = size;
+ memcpy(new_sym->b, buf, size);
+ new_sym->b[size] = '\0';
+
+ TValue new_symv = gc2sym(new_sym);
+ tbl = kcons(new_symv, ksymbol_table);
+ ksymbol_table = tbl;
+ return new_symv;
+}
diff --git a/src/ksymbol.h b/src/ksymbol.h
@@ -0,0 +1,19 @@
+/*
+** ksymbol.h
+** Kernel Symbols
+** See Copyright Notice in klisp.h
+*/
+
+#ifndef ksymbol_h
+#define ksymbol_h
+
+#include "kobject.h"
+
+/* TODO: replace the list with a hashtable */
+/* TODO: move to global state */
+TValue ksymbol_table;
+
+/* XXX: for now all symbols are interned */
+TValue ksymbol_new(const char *);
+
+#endif
diff --git a/src/ktoken.c b/src/ktoken.c
@@ -0,0 +1,541 @@
+/*
+** ktoken.c
+** Tokenizer for the Kernel Programming Language
+** See Copyright Notice in klisp.h
+*/
+
+
+/*
+** TODO:
+**
+** - Support other number types besides fixints and exact infinities
+** - Support for complete number syntax (exactness, radix, etc)
+** - Support for unicode (strings, char and symbols).
+** - Error handling
+** - Source code tracking
+**
+*/
+#include <stdio.h>
+/* XXX for malloc */
+#include <stdlib.h>
+/* TODO: use a generalized alloc function */
+
+#include <string.h>
+#include <ctype.h>
+#include <stdint.h>
+#include <stdbool.h>
+
+#include "ktoken.h"
+#include "kobject.h"
+#include "kpair.h"
+#include "kstring.h"
+#include "ksymbol.h"
+
+/*
+** Char sets for fast ASCII char classification
+*/
+
+/* Each bit correspond to a char in the 0-255 range */
+typedef uint32_t kcharset[8];
+
+/*
+** Char set function/macro interface
+*/
+void kcharset_empty(kcharset);
+void kcharset_fill(kcharset, char *);
+void kcharset_union(kcharset, kcharset);
+#define kcharset_contains(kch_, ch_) \
+ ({ unsigned char ch__ = (unsigned char) ch_; \
+ kch_[KCHS_OCTANT(ch__)] & KCHS_BIT(ch__); })
+
+
+/*
+** Char set contains macro interface
+*/
+#define KCHS_OCTANT(ch) (ch >> 5)
+#define KCHS_BIT(ch) (1 << (ch & 0x1f))
+
+void kcharset_empty(kcharset chs)
+{
+ for (int i = 0; i < 8; i++) {
+ chs[i] = 0;
+ }
+}
+
+void kcharset_fill(kcharset chs, char *chars_)
+{
+ unsigned char *chars = (unsigned char *) chars_;
+ unsigned char ch;
+
+ kcharset_empty(chs);
+
+ while ((ch = *chars++)) {
+ chs[KCHS_OCTANT(ch)] |= KCHS_BIT(ch);
+ }
+}
+
+void kcharset_union(kcharset chs, kcharset chs2)
+{
+ for (int i = 0; i < 8; i++) {
+ chs[i] |= chs2[i];
+ }
+}
+
+/*
+** Character sets for classification
+*/
+kcharset ktok_alphabetic, ktok_numeric, ktok_whitespace;
+kcharset ktok_delimiter, ktok_extended, ktok_subsequent;
+
+#define ktok_is_alphabetic(chi_) kcharset_contains(ktok_alphabetic, chi_)
+/* TODO: add is_digit, that takes the base as parameter */
+#define ktok_is_numeric(chi_) kcharset_contains(ktok_numeric, chi_)
+/* TODO: add hex digits */
+#define ktok_digit_value(ch_) (ch_ - '0')
+#define ktok_is_whitespace(chi_) kcharset_contains(ktok_whitespace, chi_)
+#define ktok_is_delimiter(chi_) (chi == EOF || \
+ kcharset_contains(ktok_delimiter, chi_))
+#define ktok_is_subsequent(chi_) kcharset_contains(ktok_subsequent, chi_)
+
+/*
+** Special Tokens
+*/
+
+/*
+** RATIONALE:
+**
+** Because a pair is not a token, they can be used to represent special tokens
+** instead of creating an otherwise useless special token type
+** lparen, rparen and dot are represented as a pair with the corresponding
+** char in the car and nil in the cdr.
+** srfi-38 tokens are represented with a boolean in the indicating if it's a
+** defining token and the number in the cdr
+**
+*/
+TValue ktok_lparen, ktok_rparen, ktok_dot;
+
+/* TODO: move this to the global state */
+char *ktok_buffer;
+/* WORKAROUND: for stdin line buffering & reading of EOF */
+bool ktok_seen_eof;
+uint32_t ktok_buffer_size;
+#define KTOK_BUFFER_INITIAL_SIZE 1024
+
+void ktok_init()
+{
+ /* WORKAROUND: for stdin line buffering & reading of EOF */
+ ktok_seen_eof = false;
+ /* string buffer */
+ /* XXX: for now use a fixed size */
+ ktok_buffer_size = KTOK_BUFFER_INITIAL_SIZE;
+ /* TODO: Out of memory errors */
+ ktok_buffer = malloc(KTOK_BUFFER_INITIAL_SIZE);
+
+ /* Character sets */
+ kcharset_fill(ktok_alphabetic, "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
+ "abcdefghijklmnopqrstuvwxyz");
+ kcharset_fill(ktok_numeric, "0123456789");
+ kcharset_fill(ktok_whitespace, " \t\v\r\n\f");
+
+ kcharset_fill(ktok_delimiter, "()\";");
+ kcharset_union(ktok_delimiter, ktok_whitespace);
+
+ kcharset_fill(ktok_extended, "!$%&*+-./:<=>?@^_~");
+
+ kcharset_empty(ktok_subsequent);
+ kcharset_union(ktok_subsequent, ktok_alphabetic);
+ kcharset_union(ktok_subsequent, ktok_numeric);
+ kcharset_union(ktok_subsequent, ktok_extended);
+
+ /* Special Tokens */
+ ktok_lparen = kcons(ch2tv('('), KNIL);
+ ktok_rparen = kcons(ch2tv(')'), KNIL);
+ ktok_dot = kcons(ch2tv('.'), KNIL);
+}
+
+/*
+** Underlying stream interface
+*/
+int ktok_getc();
+int ktok_peekc();
+
+int ktok_getc() {
+ /* TODO: add location tracking */
+ /* WORKAROUND: for stdin line buffering & reading of EOF */
+ if (ktok_seen_eof) {
+ return EOF;
+ } else {
+ int chi = getc(ktok_file);
+ ktok_seen_eof = (chi == EOF);
+ return chi;
+ }
+}
+
+int ktok_peekc() {
+ int chi = ktok_getc();
+ if (chi != EOF)
+ ungetc(chi, ktok_file);
+ return chi;
+}
+
+/*
+** Error management
+*/
+TValue ktok_error(char *str)
+{
+ /* TODO: Decide on error handling mechanism for reader (& tokenizer) */
+ printf("TOK ERROR: %s\n", str);
+ return KNIL;
+}
+
+
+/*
+** ktok_read_token() helpers
+*/
+void ktok_ignore_whitespace_and_comments();
+bool ktok_check_delimiter();
+TValue ktok_read_string();
+TValue ktok_read_special();
+TValue ktok_read_number(bool);
+TValue ktok_read_maybe_signed_numeric();
+TValue ktok_read_identifier();
+int ktok_read_until_delimiter();
+
+/*
+** Main tokenizer function
+*/
+TValue ktok_read_token ()
+{
+ ktok_ignore_whitespace_and_comments();
+ /*
+ ** NOTE: We jumped over all whitespace
+ ** so either the next token starts here or eof was reached,
+ ** in any case we save the location of the port
+ */
+
+ /* TODO: add location tracking */
+ /* SCHEME VERSION (kport-save-loc! kport) */
+
+ int chi = ktok_peekc();
+
+ switch(chi) {
+ case EOF:
+ ktok_getc();
+ return KEOF;
+ case '(':
+ ktok_getc();
+ return ktok_lparen;
+ case ')':
+ ktok_getc();
+ return ktok_rparen;
+ case '.':
+ ktok_getc();
+ if (ktok_check_delimiter())
+ return ktok_dot;
+ else
+ return ktok_error("no delimiter found after dot");
+ case '"':
+ return ktok_read_string();
+ case '#':
+ return ktok_read_special();
+ case '0': case '1': case '2': case '3': case '4':
+ case '5': case '6': case '7': case '8': case '9':
+ return ktok_read_number(true); /* positive number */
+ case '+': case '-':
+ return ktok_read_maybe_signed_numeric();
+ case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': case 'G':
+ case 'H': case 'I': case 'J': case 'K': case 'L': case 'M': case 'N':
+ case 'O': case 'P': case 'Q': case 'R': case 'S': case 'T': case 'U':
+ case 'V': case 'W': case 'X': case 'Y': case 'Z':
+ case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': case 'g':
+ case 'h': case 'i': case 'j': case 'k': case 'l': case 'm': case 'n':
+ case 'o': case 'p': case 'q': case 'r': case 's': case 't': case 'u':
+ case 'v': case 'w': case 'x': case 'y': case 'z':
+ case '!': case '$': case '%': case '&': case '*': case '/': case ':':
+ case '<': case '=': case '>': case '?': case '@': case '^': case '_':
+ case '~':
+ /*
+ ** NOTE: the cases for '+', '-', '.' and numbers were already
+ ** considered so identifier-subsequent is used instead of
+ ** identifier-first-char (in the cases above)
+ */
+ return ktok_read_identifier();
+ default:
+ ktok_getc();
+ return ktok_error("unrecognized token starting char");
+ }
+}
+
+/*
+** Comments and Whitespace
+*/
+void ktok_ignore_comment()
+{
+ int chi;
+ do {
+ chi = ktok_getc();
+ } while (chi != EOF && chi != '\n');
+}
+
+void ktok_ignore_whitespace_and_comments()
+{
+ /* NOTE: if it's not a whitespace or comment do nothing (even on eof) */
+ bool end = false;
+ while(!end) {
+ int chi = ktok_peekc();
+
+ if (chi == EOF) {
+ end = true;
+ } else {
+ char ch = (char) chi;
+ if (ktok_is_whitespace(ch)) {
+ ktok_getc();
+ } else if (ch == ';') {
+ ktok_ignore_comment(); /* NOTE: this also reads again the ';' */
+ } else {
+ end = true;
+ }
+ }
+ }
+}
+
+/*
+** Delimiter checking
+*/
+bool ktok_check_delimiter()
+{
+ int chi = ktok_peekc();
+ return (ktok_is_delimiter(chi));
+}
+
+/*
+** Returns the number of bytes read
+*/
+int ktok_read_until_delimiter()
+{
+ int i = 0;
+
+ while (!ktok_check_delimiter()) {
+ /* NOTE: can't be eof, because eof is a delimiter */
+ char ch = (char) ktok_getc();
+ ktok_buffer[i++] = ch;
+
+ if (i + 1 == ktok_buffer_size) {
+ /* TODO: allow buffer to grow */
+ break;
+ }
+ }
+ ktok_buffer[i] = '\0';
+ return i;
+}
+
+/*
+** Numbers
+** XXX: for now, only fixints in base 10
+*/
+TValue ktok_read_number(bool is_pos)
+{
+ int32_t res = 0;
+
+ while(!ktok_check_delimiter()) {
+ /* NOTE: can't be eof because it's a delimiter */
+ char ch = (char) ktok_getc();
+ if (!ktok_is_numeric(ch))
+ return ktok_error("Not a digit found in number");
+ res = res * 10 + ktok_digit_value(ch);
+ }
+
+ if (!is_pos)
+ res = -res;
+ return i2tv(res);
+}
+
+TValue ktok_read_maybe_signed_numeric()
+{
+ /* NOTE: can't be eof, it's either '+' or '-' */
+ char ch = (char) ktok_getc();
+ if (ktok_check_delimiter()) {
+ ktok_buffer[0] = ch;
+ ktok_buffer[1] = '\0';
+ return ksymbol_new(ktok_buffer);
+ } else {
+ return ktok_read_number(ch == '+');
+ }
+}
+
+/*
+** Strings
+*/
+TValue ktok_read_string()
+{
+ /* discard opening quote */
+ ktok_getc();
+
+ bool done = false;
+ int i = 0;
+
+ while(!done) {
+ int chi = ktok_getc();
+ char ch = (char) chi;
+
+ if (chi == EOF)
+ return ktok_error("EOF found while reading a string");
+ if (ch == '"') {
+ ktok_buffer[i] = '\0';
+ done = true;
+ } else {
+ if (ch == '\\') {
+ chi = ktok_getc();
+
+ if (chi == EOF)
+ return ktok_error("EOF found while reading a string");
+
+ ch = (char) chi;
+
+ if (ch != '\\' && ch != '"') {
+ return ktok_error("Invalid char after '\\' "
+ "while reading a string");
+ }
+ }
+ ktok_buffer[i++] = ch;
+
+ if (i + 1 == ktok_buffer_size)
+ /* TODO: allow buffer to grow */
+ return ktok_error("Implementation restriction: "
+ "String too long");
+ }
+ }
+ return kstring_new(ktok_buffer, i);
+}
+
+/*
+** Special constants (starting with "#")
+** (Special number syntax, char constants, #ignore, #inert, srfi-38 tokens)
+*/
+TValue ktok_read_special()
+{
+ /* discard the '#' */
+ ktok_getc();
+
+ int chi = ktok_getc();
+ char ch = (char) chi;
+
+ if (chi == EOF)
+ return ktok_error("EOF found while reading a '#' constant");
+
+ switch(ch) {
+ case 'i':
+ /* ignore or inert */
+ /* XXX: could also be an inexact number */
+ ktok_read_until_delimiter();
+ /* NOTE: can use strcmp even in the presence of '\0's */
+ if (strcmp(ktok_buffer, "gnore") == 0)
+ return KIGNORE;
+ else if (strcmp(ktok_buffer, "nert") == 0)
+ return KINERT;
+ else
+ return ktok_error("unexpected char in # constant");
+ case 'e':
+ /* an exact infinity */
+ /* XXX: could also be an exact number */
+ if (ktok_read_until_delimiter()) {
+ /* NOTE: can use strcmp even in the presence of '\0's */
+ if (strcmp(ktok_buffer, "+infinity") == 0)
+ return KEPINF;
+ else if (strcmp(ktok_buffer, "-infinity") == 0)
+ return KEMINF;
+ else
+ return ktok_error("unexpected char in # constant");
+ } else
+ return ktok_error("unexpected error in # constant");
+ case 't':
+ case 'f':
+ /* boolean constant */
+ if (ktok_check_delimiter())
+ return b2tv(ch == 't');
+ else
+ return ktok_error("unexpected char in # constant");
+ case '\\':
+ /* char constant */
+ /*
+ ** RATIONALE: in the scheme spec (R5RS) it says that only alphabetic
+ ** char constants need a delimiter to disambiguate the cases with
+ ** character names. It would be more consistent if all characters
+ ** needed a delimiter (and is probably implied by the yet incomplete
+ ** Kernel report (R-1RK))
+ ** For now we follow the scheme report
+ */
+ chi = ktok_getc();
+ ch = (char) chi;
+
+ if (chi == EOF)
+ return ktok_error("EOF found while reading a char constant");
+
+ if (!ktok_is_alphabetic(ch) || ktok_check_delimiter())
+ return ch2tv(ch);
+
+ ktok_read_until_delimiter();
+ char *p = ktok_buffer;
+ while (*p) {
+ *p = tolower(*p);
+ p++;
+ }
+ ch = tolower(ch);
+ /* NOTE: can use strcmp even in the presence of '\0's */
+ if (ch == 's' && strcmp(ktok_buffer, "pace") == 0)
+ return ch2tv(' ');
+ else if (ch == 'n' && strcmp(ktok_buffer, "ewline") == 0)
+ return ch2tv('\n');
+ else
+ return ktok_error("Unrecognized character name");
+ case '0': case '1': case '2': case '3': case '4':
+ case '5': case '6': case '7': case '8': case '9': {
+ /* srfi-38 type token (can be either a def or ref) */
+ /* TODO: allow bigints */
+ int32_t res = 0;
+ while(ch != '#' && ch != '=') {
+ if (!ktok_is_numeric(ch))
+ return ktok_error("Invalid char found in srfi-38 token");
+
+ res = res * 10 + ktok_digit_value(ch);
+
+ chi = ktok_getc();
+ ch = (char) chi;
+
+ if (chi == EOF)
+ return ktok_error("EOF found while reading a srfi-38 token");
+ }
+ return kcons(b2tv(ch == '='), i2tv(res));
+ }
+ /* TODO: add real with no primary value and undefined */
+ default:
+ return ktok_error("unexpected char in # constant");
+ }
+}
+
+/*
+** Identifiers
+*/
+TValue ktok_read_identifier()
+{
+ int i = 0;
+
+ while (!ktok_check_delimiter()) {
+ /* NOTE: can't be eof, because eof is a delimiter */
+ char ch = (char) ktok_getc();
+
+ /* NOTE: is_subsequent of '\0' is false, so no embedded '\0' */
+ if (ktok_is_subsequent(ch))
+ ktok_buffer[i++] = ch;
+ else
+ return ktok_error("Invalid char in identifier");
+
+ /* TODO: allow buffer to grow */
+ if (i + 1 == ktok_buffer_size) {
+ break;
+ }
+ }
+ ktok_buffer[i] = '\0';
+ return ksymbol_new(ktok_buffer);
+}
+
diff --git a/src/ktoken.h b/src/ktoken.h
@@ -0,0 +1,21 @@
+/*
+** ktoken.h
+** Tokenizer for the Kernel Programming Language
+** See Copyright Notice in klisp.h
+*/
+
+#ifndef ktoken_h
+#define ktoken_h
+
+#include "kobject.h"
+
+/*
+** Tokenizer interface
+*/
+void ktok_init();
+TValue ktok_read_token();
+
+/* TODO: move this to the global state */
+FILE *ktok_file;
+
+#endif