klisp

an open source interpreter for the Kernel Programming Language.
git clone http://git.hanabi.in/repos/klisp.git
Log | Files | Refs | README

commit 3a588bb82a1b9248f166c0c3f814ce65008a59d0
parent f25090d64f44c1349a200acff7648ff1676a64ff
Author: Andres Navarro <canavarro82@gmail.com>
Date:   Thu, 17 Feb 2011 17:08:38 -0300

(almost complete) Tokenizer added. Constructors for pair, string and boolean. Some more macros in kobject.h.

Diffstat:
Msrc/Makefile | 12+++++++++---
Msrc/klisp.c | 54+++++++++++++++++++-----------------------------------
Asrc/kobject.c | 46++++++++++++++++++++++++++++++++++++++++++++++
Msrc/kobject.h | 123++++++++++++++++++++++++++++++++++++++++++++++++++++++++-----------------------
Asrc/kpair.c | 27+++++++++++++++++++++++++++
Asrc/kpair.h | 25+++++++++++++++++++++++++
Asrc/kstring.c | 32++++++++++++++++++++++++++++++++
Asrc/kstring.h | 15+++++++++++++++
Asrc/ksymbol.c | 51+++++++++++++++++++++++++++++++++++++++++++++++++++
Asrc/ksymbol.h | 19+++++++++++++++++++
Asrc/ktoken.c | 541+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Asrc/ktoken.h | 21+++++++++++++++++++++
12 files changed, 893 insertions(+), 73 deletions(-)

diff --git a/src/Makefile b/src/Makefile @@ -7,7 +7,7 @@ MYCFLAGS= MYLDFLAGS= MYLIBS= -CORE_O= +CORE_O= kobject.o ktoken.o kpair.o kstring.o ksymbol.o KRN_T= klisp KRN_O= klisp.o @@ -23,7 +23,7 @@ all: $(ALL_T) o: $(ALL_O) $(KRN_T): $(ALL_O) - $(CC) -o $@ $(MYLDFLAGS) $(KRN_O) $(LIBS) + $(CC) -o $@ $(MYLDFLAGS) $(ALL_O) $(LIBS) clean: $(RM) $(ALL_T) $(ALL_O) @@ -31,4 +31,10 @@ clean: # list targets that do not create files (but not all makes understand .PHONY) .PHONY: all default o clean -klisp.o: klisp.c klisp.h kobject.h +klisp.o: klisp.c klisp.h kobject.h ktoken.h +kobject.o: kobject.c kobject.h +ktoken.o: ktoken.c ktoken.h kobject.h kpair.h kstring.h ksymbol.h +kpair.o: kpair.c kpair.h kobject.h +kstring.o: kstring.c kstring.h kobject.h +# XXX: kpair.h because of use of list as symbol table +ksymbol.o: ksymbol.c ksymbol.h kobject.h kpair.h diff --git a/src/klisp.c b/src/klisp.c @@ -7,44 +7,28 @@ #include <stdio.h> #include "kobject.h" +#include "ktoken.h" int main(int argc, char *argv[]) { - printf("Tests\n"); - - printf("\nVariables: \n"); - printf("nil: %d\n", ttisnil(knil)); - printf("ignore: %d\n", ttisignore(kignore)); - printf("inert: %d\n", ttisinert(kinert)); - printf("eof: %d\n", ttiseof(keof)); - printf("true: %d\n", ttisboolean(ktrue)); - printf("false: %d\n", ttisboolean(kfalse)); - - - printf("\nConstants: \n"); - - printf("nil: %d\n", ttisnil(KNIL)); - printf("ignore: %d\n", ttisignore(KIGNORE)); - printf("inert: %d\n", ttisinert(KINERT)); - printf("eof: %d\n", ttiseof(KEOF)); - printf("true: %d\n", ttisboolean(KTRUE)); - printf("false: %d\n", ttisboolean(KFALSE)); - - printf("int: %d\n", - ttisfixint(((TValue) {.tv = {.t = K_TAG_FIXINT, .v = { .i = 3}}}))); - printf("double: %d\n", ttisdouble((TValue){.d = 1.0})); - - printf("\nSwitch: \n"); - - printf("nil: %d\n", ttype(KNIL)); - printf("ignore: %d\n", ttype(KIGNORE)); - printf("inert: %d\n", ttype(KINERT)); - printf("eof: %d\n", ttype(KEOF)); - printf("true: %d\n", ttype(KTRUE)); - printf("false: %d\n", ttype(KFALSE)); - printf("int: %d\n", - ttype(((TValue) {.tv = {.t = K_TAG_FIXINT, .v = { .i = 3}}}))); - printf("double: %d\n", ttype((TValue){.d = 1.0})); + /* + ** Simple tokenizer loop + */ + printf("Tokenizer Type Test\n"); + + ktok_file = stdin; + ktok_init(); + + TValue tok = KNIL; + + while(!ttiseof(tok)) { + tok = ktok_read_token(); + if (ttisnil(tok)) { + /* there was an error */ + break; + } + printf("\nToken Type: %s\n", ttname(tok)); + } return 0; } diff --git a/src/kobject.c b/src/kobject.c @@ -0,0 +1,46 @@ +/* +** kobject.h +** Type definitions for Kernel Objects +** See Copyright Notice in klisp.h +*/ + +#include "kobject.h" + +/* +** The global const variables +*/ +const TValue knil = KNIL_; +const TValue kignore = KIGNORE_; +const TValue kinert = KINERT_; +const TValue keof = KEOF_; +const TValue ktrue = KTRUE_; +const TValue kfalse = KFALSE_; +const TValue kepinf = KEPINF_; +const TValue keminf = KEMINF_; + +/* +** The name strings for all TValue types +*/ +char *ktv_names[] = { + [K_TFIXINT] = "fixint", + [K_TBIGINT] = "bigint", + [K_TFIXRAT] = "fixrat", + [K_TBIGRAT] = "bigrat", + [K_TEINF] = "einf", + [K_TDOUBLE] = "double", + [K_TBDOUBLE] = "bdouble", + [K_TIINF] = "iinf", + [K_TRWNPN] = "rwnpn", + [K_TCOMPLEX] = "complex", + + [K_TNIL] = "nil", + [K_TIGNORE] = "ignore", + [K_TINERT] = "inert", + [K_TEOF] = "eof", + [K_TBOOLEAN] = "boolean", + [K_TCHAR] = "char", + + [K_TPAIR] = "pair", + [K_TSTRING] = "string", + [K_TSYMBOL] = "symbol" +}; diff --git a/src/kobject.h b/src/kobject.h @@ -6,7 +6,7 @@ /* ** SOURCE NOTE: While the tagging system comes from Mozilla TraceMonkey, -** o code from TraceMonkey was used. +** no code from TraceMonkey was used. ** The general structure, names and comments of this file follow the ** scheme of Lua. */ @@ -52,16 +52,14 @@ typedef struct __attribute__ ((__packed__)) GCheader { /* ** Tags: Types & Flags -*/ - -/* -** Tagged values in 64 bits (for 32 bit systems) -** NaN boxing: Values are encoded as double precision NaNs -** There is one canonical NaN that is used through the interpreter -** and all remaining NaNs are used to encode the rest of the types -** (other than double) -** Canonical NaN: (0)(111 1111 1111) 1000 0000 0000 0000 0000 32(0) -** Infinities: s(111 1111 1111) 0000 0000 0000 0000 0000 32(0) +** +** Nan Boxing: Tagged values in 64 bits (for 32 bit systems) +** All Values except doubles are encoded as double precision NaNs +** There is one canonical NaN(?maybe none?) that is used through the +** interpreter and all remaining NaNs are used to encode the rest of +** the types (other than double) +** Canonical NaN(?): (0)(111 1111 1111) 1000 0000 0000 0000 0000 32(0) +** Infinities(?): s(111 1111 1111) 0000 0000 0000 0000 0000 32(0) ** Tagged values: (0)(111 1111 1111) 1111 tttt tttt tttt tttt 32(v) ** So all tags start with 0x7fff which leaves us 16 bits for the ** tag proper. @@ -83,6 +81,7 @@ typedef struct __attribute__ ((__packed__)) GCheader { #define K_TAG_BASE_TYPE(t) ((t) & K_TAG_BASE_TYPE_MASK) /* +** RATIONALE: ** Number types are first and ordered to allow easy switch statements ** in arithmetic operators. The ones marked with (?) are still in ** consideration for separate type tags. @@ -119,8 +118,15 @@ typedef struct __attribute__ ((__packed__)) GCheader { #define K_MAKE_VTAG(t) (K_TAG_TAGGED | t) -/* TODO: For now we will only use fixints */ +/* +** TODO: +** +** - decide if inexact infinities and reals with no +** primary values are included in K_TDOUBLE +** - For now we will only use fixints and exact infinities +*/ #define K_TAG_FIXINT K_MAKE_VTAG(K_TFIXINT) +#define K_TAG_EINF K_MAKE_VTAG(K_TEINF) #define K_TAG_NIL K_MAKE_VTAG(K_TNIL) #define K_TAG_IGNORE K_MAKE_VTAG(K_TIGNORE) @@ -137,17 +143,11 @@ typedef struct __attribute__ ((__packed__)) GCheader { ** Macros to test types */ -/* -** This is intended for use in switch statements -** TODO: decide if inexact infinities and reals with no -** primary values are included in K_TDOUBLE -*/ +/* NOTE: This is intended for use in switch statements */ #define ttype(o) ({ TValue o_ = o; \ ttisdouble(o_)? K_TDOUBLE : ttype_(o_); }) -/* -** This is intended for internal use below. DON'T USE OUTSIDE THIS FILE -*/ +/* This is intended for internal use below. DON'T USE OUTSIDE THIS FILE */ #define ttag(o) ((o).tv.t) #define ttype_(o) (K_TAG_TYPE(ttag(o))) #define tflag_(o) (K_TAG_FLAG(ttag(o))) @@ -175,7 +175,7 @@ typedef struct __attribute__ ((__packed__)) GCheader { typedef union { bool b; int32_t i; - unsigned char ch; + char ch; GCObject *gc; void *p; /* ... */ @@ -206,21 +206,30 @@ typedef struct __attribute__ ((__packed__)) { TValue cdr; } Pair; +/* XXX: Symbol should probably contain a String instead of a char buf */ typedef struct __attribute__ ((__packed__)) { CommonHeader; - unsigned char b[]; // buffer + uint32_t size; + char b[]; } Symbol; +/* +** RATIONALE: +** +** Storing size allows embedded '\0's. +** Note, however, that there are actually size + 1 bytes allocated +** and that b[size] = '\0'. This is useful for printing strings +** +*/ typedef struct __attribute__ ((__packed__)) { CommonHeader; - uint32_t size; // to allow embedded '\0' - unsigned char b[]; // buffer + uint32_t size; + char b[]; // buffer } String; /* ** Union of all Kernel heap-allocated values */ - /* LUA NOTE: In Lua the corresponding union is in lstate.h */ union GCObject { GCheader gch; @@ -239,22 +248,66 @@ union GCObject { #define KEOF_ {.tv = {.t = K_TAG_EOF, .v = { .i = 0 }}} #define KTRUE_ {.tv = {.t = K_TAG_BOOLEAN, .v = { .b = true }}} #define KFALSE_ {.tv = {.t = K_TAG_BOOLEAN, .v = { .b = false }}} +#define KEPINF_ {.tv = {.t = K_TAG_EINF, .v = { .i = 1 }}} +#define KEMINF_ {.tv = {.t = K_TAG_EINF, .v = { .i = -1 }}} +/* RATIONALE: the ones above can be used in initializers */ #define KNIL ((TValue) KNIL_) #define KINERT ((TValue) KINERT_) #define KIGNORE ((TValue) KIGNORE_) #define KEOF ((TValue) KEOF_) #define KTRUE ((TValue) KTRUE_) #define KFALSE ((TValue) KFALSE_) - -/* -** The same constants as global const variables -*/ -const TValue knil = KNIL_; -const TValue kignore = KIGNORE_; -const TValue kinert = KINERT_; -const TValue keof = KEOF_; -const TValue ktrue = KTRUE_; -const TValue kfalse = KFALSE_; +#define KEPINF ((TValue) KEPINF_) +#define KEMINF ((TValue) KEMINF_) + +/* The same constants as global const variables */ +const TValue knil; +const TValue kignore; +const TValue kinert; +const TValue keof; +const TValue ktrue; +const TValue kfalse; +const TValue kepinf; +const TValue keminf; + +/* Macros to create TValues of non-heap allocated types (for initializers) */ +#define ch2tv_(ch_) {.tv = {.t = K_TAG_CHAR, .v = { .ch = ch_ }}} +#define i2tv_(i_) {.tv = {.t = K_TAG_FIXINT, .v = { .i = i_ }}} +#define b2tv_(b_) {.tv = {.t = K_TAG_BOOLEAN, .v = { .b = b_ }}} + +/* Macros to create TValues of non-heap allocated types */ +#define ch2tv(ch_) ((TValue) ch2tv_(ch_)) +#define i2tv(i_) ((TValue) i2tv_(i_)) +#define b2tv(b_) ((TValue) b2tv_(b_)) + +/* Macros to convert a GCObject * into a tagged value */ +/* TODO: add assertions */ +/* LUA NOTE: the corresponding defines are in lstate.h */ +#define gc2tv(t_, o_) ((TValue) {.tv = {.t = t_, \ + .v = { .gc = obj2gco(o_)}}}) +#define gc2pair(o_) (gc2tv(K_TAG_PAIR, o_)) +#define gc2str(o_) (gc2tv(K_TAG_STRING, o_)) +#define gc2sym(o_) (gc2tv(K_TAG_SYMBOL, o_)) + +/* Macro to convert a TValue into a specific heap allocated object */ +#define tv2pair(v_) ((Pair *) gcvalue(v_)) +#define tv2str(v_) ((String *) gcvalue(v_)) +#define tv2sym(v_) ((Symbol *) gcvalue(v_)) + +/* Macro to convert any Kernel object into a GCObject */ +#define obj2gco(v_) ((GCObject *) (v_)) + +/* Macros to access innertv values */ +/* TODO: add assertions */ +#define ivalue(o_) ((o_).tv.v.i) +#define bvalue(o_) ((o_).tv.v.b) +#define chvalue(o_) ((o_).tv.v.ch) +#define gcvalue(o_) ((o_).tv.v.gc) + +/* Macro to obtain a string describing the type of a TValue */# +#define ttname(tv_) (ktv_names[ttype(tv_)]) + +extern char *ktv_names[]; #endif diff --git a/src/kpair.c b/src/kpair.c @@ -0,0 +1,27 @@ +/* +** kpair.c +** Kernel Pairs +** See Copyright Notice in klisp.h +*/ + +/* XXX: for malloc */ +#include <stdlib.h> +/* TODO: use a generalized alloc function */ + +#include "kpair.h" +#include "kobject.h" + +/* TODO: Out of memory errors */ +/* XXX: for now all pairs are mutable */ +TValue kcons(TValue car, TValue cdr) +{ + Pair *new_pair = malloc(sizeof(Pair)); + + new_pair->next = NULL; + new_pair->gct = 0; + new_pair->tt = K_TPAIR; + new_pair->car = car; + new_pair->cdr = cdr; + + return gc2pair(new_pair); +} diff --git a/src/kpair.h b/src/kpair.h @@ -0,0 +1,25 @@ +/* +** kpair.h +** Kernel Pairs +** See Copyright Notice in klisp.h +*/ + +#ifndef kpair_h +#define kpair_h + +#include "kobject.h" + +/* TODO: add type assertions */ +/* TODO: add more kc[ad]*r combinations */ +#define kcar(p_) (((Pair *)(p_.tv.v.gc))->car) +#define kcdr(p_) (((Pair *)(p_.tv.v.gc))->cdr) + +#define kset_car(p_, v_) (kcar(p_) = v_) +#define kset_cdr(p_, v_) (kcdr(p_) = v_) + +#define kdummy_cons (kcons(KNIL, KNIL)) + +/* XXX: for now all pairs are mutable */ +TValue kcons(TValue, TValue); + +#endif diff --git a/src/kstring.c b/src/kstring.c @@ -0,0 +1,32 @@ +/* +** kstring.c +** Kernel Strings +** See Copyright Notice in klisp.h +*/ + +/* XXX: for malloc */ +#include <stdlib.h> +/* TODO: use a generalized alloc function */ + +#include <string.h> + +#include "kstring.h" +#include "kobject.h" + +/* TODO: Out of memory errors */ +/* XXX: for now all strings are mutable */ +TValue kstring_new(const char *buf, uint32_t size) +{ + String *new_str = malloc(sizeof(String) + size + 1); + + new_str->next = NULL; + new_str->gct = 0; + new_str->tt = K_TSTRING; + new_str->size = size; + /* NOTE: there can be embedded '\0's in a string */ + memcpy(new_str->b, buf, size); + /* NOTE: they still end with a '\0' for convenience in printing */ + new_str->b[size] = '\0'; + + return gc2str(new_str); +} diff --git a/src/kstring.h b/src/kstring.h @@ -0,0 +1,15 @@ +/* +** kstring.h +** Kernel Strings +** See Copyright Notice in klisp.h +*/ + +#ifndef kstring_h +#define kstring_h + +#include "kobject.h" + +/* XXX: for now all strings are mutable */ +TValue kstring_new(const char *, uint32_t); + +#endif diff --git a/src/ksymbol.c b/src/ksymbol.c @@ -0,0 +1,51 @@ +/* +** ksymbol.c +** Kernel Symbols +** See Copyright Notice in klisp.h +*/ + +/* XXX: for malloc */ +#include <stdlib.h> +/* TODO: use a generalized alloc function */ + +#include <string.h> + +#include "ksymbol.h" +#include "kobject.h" +#include "kpair.h" + +/* TODO: replace the list with a hashtable */ +/* TODO: move to global state */ +TValue ksymbol_table = KNIL_; + +/* TODO: Out of memory errors */ +TValue ksymbol_new(const char *buf) +{ + /* First look for it in the symbol table */ + TValue tbl = ksymbol_table; + while (!ttisnil(tbl)) { + TValue first = kcar(tbl); + /* NOTE: there are no embedded '\0's in symbols */ + if (strcmp(buf, tv2sym(first)->b) == 0) + return first; + else + tbl = kcdr(tbl); + } + + /* Didn't find it, alloc new and save in symbol table */ + /* NOTE: there are no embedded '\0's in symbols */ + int32_t size = strlen(buf); + Symbol *new_sym = malloc(sizeof(Symbol) + size + 1); + + new_sym->next = NULL; + new_sym->gct = 0; + new_sym->tt = K_TSYMBOL; + new_sym->size = size; + memcpy(new_sym->b, buf, size); + new_sym->b[size] = '\0'; + + TValue new_symv = gc2sym(new_sym); + tbl = kcons(new_symv, ksymbol_table); + ksymbol_table = tbl; + return new_symv; +} diff --git a/src/ksymbol.h b/src/ksymbol.h @@ -0,0 +1,19 @@ +/* +** ksymbol.h +** Kernel Symbols +** See Copyright Notice in klisp.h +*/ + +#ifndef ksymbol_h +#define ksymbol_h + +#include "kobject.h" + +/* TODO: replace the list with a hashtable */ +/* TODO: move to global state */ +TValue ksymbol_table; + +/* XXX: for now all symbols are interned */ +TValue ksymbol_new(const char *); + +#endif diff --git a/src/ktoken.c b/src/ktoken.c @@ -0,0 +1,541 @@ +/* +** ktoken.c +** Tokenizer for the Kernel Programming Language +** See Copyright Notice in klisp.h +*/ + + +/* +** TODO: +** +** - Support other number types besides fixints and exact infinities +** - Support for complete number syntax (exactness, radix, etc) +** - Support for unicode (strings, char and symbols). +** - Error handling +** - Source code tracking +** +*/ +#include <stdio.h> +/* XXX for malloc */ +#include <stdlib.h> +/* TODO: use a generalized alloc function */ + +#include <string.h> +#include <ctype.h> +#include <stdint.h> +#include <stdbool.h> + +#include "ktoken.h" +#include "kobject.h" +#include "kpair.h" +#include "kstring.h" +#include "ksymbol.h" + +/* +** Char sets for fast ASCII char classification +*/ + +/* Each bit correspond to a char in the 0-255 range */ +typedef uint32_t kcharset[8]; + +/* +** Char set function/macro interface +*/ +void kcharset_empty(kcharset); +void kcharset_fill(kcharset, char *); +void kcharset_union(kcharset, kcharset); +#define kcharset_contains(kch_, ch_) \ + ({ unsigned char ch__ = (unsigned char) ch_; \ + kch_[KCHS_OCTANT(ch__)] & KCHS_BIT(ch__); }) + + +/* +** Char set contains macro interface +*/ +#define KCHS_OCTANT(ch) (ch >> 5) +#define KCHS_BIT(ch) (1 << (ch & 0x1f)) + +void kcharset_empty(kcharset chs) +{ + for (int i = 0; i < 8; i++) { + chs[i] = 0; + } +} + +void kcharset_fill(kcharset chs, char *chars_) +{ + unsigned char *chars = (unsigned char *) chars_; + unsigned char ch; + + kcharset_empty(chs); + + while ((ch = *chars++)) { + chs[KCHS_OCTANT(ch)] |= KCHS_BIT(ch); + } +} + +void kcharset_union(kcharset chs, kcharset chs2) +{ + for (int i = 0; i < 8; i++) { + chs[i] |= chs2[i]; + } +} + +/* +** Character sets for classification +*/ +kcharset ktok_alphabetic, ktok_numeric, ktok_whitespace; +kcharset ktok_delimiter, ktok_extended, ktok_subsequent; + +#define ktok_is_alphabetic(chi_) kcharset_contains(ktok_alphabetic, chi_) +/* TODO: add is_digit, that takes the base as parameter */ +#define ktok_is_numeric(chi_) kcharset_contains(ktok_numeric, chi_) +/* TODO: add hex digits */ +#define ktok_digit_value(ch_) (ch_ - '0') +#define ktok_is_whitespace(chi_) kcharset_contains(ktok_whitespace, chi_) +#define ktok_is_delimiter(chi_) (chi == EOF || \ + kcharset_contains(ktok_delimiter, chi_)) +#define ktok_is_subsequent(chi_) kcharset_contains(ktok_subsequent, chi_) + +/* +** Special Tokens +*/ + +/* +** RATIONALE: +** +** Because a pair is not a token, they can be used to represent special tokens +** instead of creating an otherwise useless special token type +** lparen, rparen and dot are represented as a pair with the corresponding +** char in the car and nil in the cdr. +** srfi-38 tokens are represented with a boolean in the indicating if it's a +** defining token and the number in the cdr +** +*/ +TValue ktok_lparen, ktok_rparen, ktok_dot; + +/* TODO: move this to the global state */ +char *ktok_buffer; +/* WORKAROUND: for stdin line buffering & reading of EOF */ +bool ktok_seen_eof; +uint32_t ktok_buffer_size; +#define KTOK_BUFFER_INITIAL_SIZE 1024 + +void ktok_init() +{ + /* WORKAROUND: for stdin line buffering & reading of EOF */ + ktok_seen_eof = false; + /* string buffer */ + /* XXX: for now use a fixed size */ + ktok_buffer_size = KTOK_BUFFER_INITIAL_SIZE; + /* TODO: Out of memory errors */ + ktok_buffer = malloc(KTOK_BUFFER_INITIAL_SIZE); + + /* Character sets */ + kcharset_fill(ktok_alphabetic, "ABCDEFGHIJKLMNOPQRSTUVWXYZ" + "abcdefghijklmnopqrstuvwxyz"); + kcharset_fill(ktok_numeric, "0123456789"); + kcharset_fill(ktok_whitespace, " \t\v\r\n\f"); + + kcharset_fill(ktok_delimiter, "()\";"); + kcharset_union(ktok_delimiter, ktok_whitespace); + + kcharset_fill(ktok_extended, "!$%&*+-./:<=>?@^_~"); + + kcharset_empty(ktok_subsequent); + kcharset_union(ktok_subsequent, ktok_alphabetic); + kcharset_union(ktok_subsequent, ktok_numeric); + kcharset_union(ktok_subsequent, ktok_extended); + + /* Special Tokens */ + ktok_lparen = kcons(ch2tv('('), KNIL); + ktok_rparen = kcons(ch2tv(')'), KNIL); + ktok_dot = kcons(ch2tv('.'), KNIL); +} + +/* +** Underlying stream interface +*/ +int ktok_getc(); +int ktok_peekc(); + +int ktok_getc() { + /* TODO: add location tracking */ + /* WORKAROUND: for stdin line buffering & reading of EOF */ + if (ktok_seen_eof) { + return EOF; + } else { + int chi = getc(ktok_file); + ktok_seen_eof = (chi == EOF); + return chi; + } +} + +int ktok_peekc() { + int chi = ktok_getc(); + if (chi != EOF) + ungetc(chi, ktok_file); + return chi; +} + +/* +** Error management +*/ +TValue ktok_error(char *str) +{ + /* TODO: Decide on error handling mechanism for reader (& tokenizer) */ + printf("TOK ERROR: %s\n", str); + return KNIL; +} + + +/* +** ktok_read_token() helpers +*/ +void ktok_ignore_whitespace_and_comments(); +bool ktok_check_delimiter(); +TValue ktok_read_string(); +TValue ktok_read_special(); +TValue ktok_read_number(bool); +TValue ktok_read_maybe_signed_numeric(); +TValue ktok_read_identifier(); +int ktok_read_until_delimiter(); + +/* +** Main tokenizer function +*/ +TValue ktok_read_token () +{ + ktok_ignore_whitespace_and_comments(); + /* + ** NOTE: We jumped over all whitespace + ** so either the next token starts here or eof was reached, + ** in any case we save the location of the port + */ + + /* TODO: add location tracking */ + /* SCHEME VERSION (kport-save-loc! kport) */ + + int chi = ktok_peekc(); + + switch(chi) { + case EOF: + ktok_getc(); + return KEOF; + case '(': + ktok_getc(); + return ktok_lparen; + case ')': + ktok_getc(); + return ktok_rparen; + case '.': + ktok_getc(); + if (ktok_check_delimiter()) + return ktok_dot; + else + return ktok_error("no delimiter found after dot"); + case '"': + return ktok_read_string(); + case '#': + return ktok_read_special(); + case '0': case '1': case '2': case '3': case '4': + case '5': case '6': case '7': case '8': case '9': + return ktok_read_number(true); /* positive number */ + case '+': case '-': + return ktok_read_maybe_signed_numeric(); + case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': case 'G': + case 'H': case 'I': case 'J': case 'K': case 'L': case 'M': case 'N': + case 'O': case 'P': case 'Q': case 'R': case 'S': case 'T': case 'U': + case 'V': case 'W': case 'X': case 'Y': case 'Z': + case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': case 'g': + case 'h': case 'i': case 'j': case 'k': case 'l': case 'm': case 'n': + case 'o': case 'p': case 'q': case 'r': case 's': case 't': case 'u': + case 'v': case 'w': case 'x': case 'y': case 'z': + case '!': case '$': case '%': case '&': case '*': case '/': case ':': + case '<': case '=': case '>': case '?': case '@': case '^': case '_': + case '~': + /* + ** NOTE: the cases for '+', '-', '.' and numbers were already + ** considered so identifier-subsequent is used instead of + ** identifier-first-char (in the cases above) + */ + return ktok_read_identifier(); + default: + ktok_getc(); + return ktok_error("unrecognized token starting char"); + } +} + +/* +** Comments and Whitespace +*/ +void ktok_ignore_comment() +{ + int chi; + do { + chi = ktok_getc(); + } while (chi != EOF && chi != '\n'); +} + +void ktok_ignore_whitespace_and_comments() +{ + /* NOTE: if it's not a whitespace or comment do nothing (even on eof) */ + bool end = false; + while(!end) { + int chi = ktok_peekc(); + + if (chi == EOF) { + end = true; + } else { + char ch = (char) chi; + if (ktok_is_whitespace(ch)) { + ktok_getc(); + } else if (ch == ';') { + ktok_ignore_comment(); /* NOTE: this also reads again the ';' */ + } else { + end = true; + } + } + } +} + +/* +** Delimiter checking +*/ +bool ktok_check_delimiter() +{ + int chi = ktok_peekc(); + return (ktok_is_delimiter(chi)); +} + +/* +** Returns the number of bytes read +*/ +int ktok_read_until_delimiter() +{ + int i = 0; + + while (!ktok_check_delimiter()) { + /* NOTE: can't be eof, because eof is a delimiter */ + char ch = (char) ktok_getc(); + ktok_buffer[i++] = ch; + + if (i + 1 == ktok_buffer_size) { + /* TODO: allow buffer to grow */ + break; + } + } + ktok_buffer[i] = '\0'; + return i; +} + +/* +** Numbers +** XXX: for now, only fixints in base 10 +*/ +TValue ktok_read_number(bool is_pos) +{ + int32_t res = 0; + + while(!ktok_check_delimiter()) { + /* NOTE: can't be eof because it's a delimiter */ + char ch = (char) ktok_getc(); + if (!ktok_is_numeric(ch)) + return ktok_error("Not a digit found in number"); + res = res * 10 + ktok_digit_value(ch); + } + + if (!is_pos) + res = -res; + return i2tv(res); +} + +TValue ktok_read_maybe_signed_numeric() +{ + /* NOTE: can't be eof, it's either '+' or '-' */ + char ch = (char) ktok_getc(); + if (ktok_check_delimiter()) { + ktok_buffer[0] = ch; + ktok_buffer[1] = '\0'; + return ksymbol_new(ktok_buffer); + } else { + return ktok_read_number(ch == '+'); + } +} + +/* +** Strings +*/ +TValue ktok_read_string() +{ + /* discard opening quote */ + ktok_getc(); + + bool done = false; + int i = 0; + + while(!done) { + int chi = ktok_getc(); + char ch = (char) chi; + + if (chi == EOF) + return ktok_error("EOF found while reading a string"); + if (ch == '"') { + ktok_buffer[i] = '\0'; + done = true; + } else { + if (ch == '\\') { + chi = ktok_getc(); + + if (chi == EOF) + return ktok_error("EOF found while reading a string"); + + ch = (char) chi; + + if (ch != '\\' && ch != '"') { + return ktok_error("Invalid char after '\\' " + "while reading a string"); + } + } + ktok_buffer[i++] = ch; + + if (i + 1 == ktok_buffer_size) + /* TODO: allow buffer to grow */ + return ktok_error("Implementation restriction: " + "String too long"); + } + } + return kstring_new(ktok_buffer, i); +} + +/* +** Special constants (starting with "#") +** (Special number syntax, char constants, #ignore, #inert, srfi-38 tokens) +*/ +TValue ktok_read_special() +{ + /* discard the '#' */ + ktok_getc(); + + int chi = ktok_getc(); + char ch = (char) chi; + + if (chi == EOF) + return ktok_error("EOF found while reading a '#' constant"); + + switch(ch) { + case 'i': + /* ignore or inert */ + /* XXX: could also be an inexact number */ + ktok_read_until_delimiter(); + /* NOTE: can use strcmp even in the presence of '\0's */ + if (strcmp(ktok_buffer, "gnore") == 0) + return KIGNORE; + else if (strcmp(ktok_buffer, "nert") == 0) + return KINERT; + else + return ktok_error("unexpected char in # constant"); + case 'e': + /* an exact infinity */ + /* XXX: could also be an exact number */ + if (ktok_read_until_delimiter()) { + /* NOTE: can use strcmp even in the presence of '\0's */ + if (strcmp(ktok_buffer, "+infinity") == 0) + return KEPINF; + else if (strcmp(ktok_buffer, "-infinity") == 0) + return KEMINF; + else + return ktok_error("unexpected char in # constant"); + } else + return ktok_error("unexpected error in # constant"); + case 't': + case 'f': + /* boolean constant */ + if (ktok_check_delimiter()) + return b2tv(ch == 't'); + else + return ktok_error("unexpected char in # constant"); + case '\\': + /* char constant */ + /* + ** RATIONALE: in the scheme spec (R5RS) it says that only alphabetic + ** char constants need a delimiter to disambiguate the cases with + ** character names. It would be more consistent if all characters + ** needed a delimiter (and is probably implied by the yet incomplete + ** Kernel report (R-1RK)) + ** For now we follow the scheme report + */ + chi = ktok_getc(); + ch = (char) chi; + + if (chi == EOF) + return ktok_error("EOF found while reading a char constant"); + + if (!ktok_is_alphabetic(ch) || ktok_check_delimiter()) + return ch2tv(ch); + + ktok_read_until_delimiter(); + char *p = ktok_buffer; + while (*p) { + *p = tolower(*p); + p++; + } + ch = tolower(ch); + /* NOTE: can use strcmp even in the presence of '\0's */ + if (ch == 's' && strcmp(ktok_buffer, "pace") == 0) + return ch2tv(' '); + else if (ch == 'n' && strcmp(ktok_buffer, "ewline") == 0) + return ch2tv('\n'); + else + return ktok_error("Unrecognized character name"); + case '0': case '1': case '2': case '3': case '4': + case '5': case '6': case '7': case '8': case '9': { + /* srfi-38 type token (can be either a def or ref) */ + /* TODO: allow bigints */ + int32_t res = 0; + while(ch != '#' && ch != '=') { + if (!ktok_is_numeric(ch)) + return ktok_error("Invalid char found in srfi-38 token"); + + res = res * 10 + ktok_digit_value(ch); + + chi = ktok_getc(); + ch = (char) chi; + + if (chi == EOF) + return ktok_error("EOF found while reading a srfi-38 token"); + } + return kcons(b2tv(ch == '='), i2tv(res)); + } + /* TODO: add real with no primary value and undefined */ + default: + return ktok_error("unexpected char in # constant"); + } +} + +/* +** Identifiers +*/ +TValue ktok_read_identifier() +{ + int i = 0; + + while (!ktok_check_delimiter()) { + /* NOTE: can't be eof, because eof is a delimiter */ + char ch = (char) ktok_getc(); + + /* NOTE: is_subsequent of '\0' is false, so no embedded '\0' */ + if (ktok_is_subsequent(ch)) + ktok_buffer[i++] = ch; + else + return ktok_error("Invalid char in identifier"); + + /* TODO: allow buffer to grow */ + if (i + 1 == ktok_buffer_size) { + break; + } + } + ktok_buffer[i] = '\0'; + return ksymbol_new(ktok_buffer); +} + diff --git a/src/ktoken.h b/src/ktoken.h @@ -0,0 +1,21 @@ +/* +** ktoken.h +** Tokenizer for the Kernel Programming Language +** See Copyright Notice in klisp.h +*/ + +#ifndef ktoken_h +#define ktoken_h + +#include "kobject.h" + +/* +** Tokenizer interface +*/ +void ktok_init(); +TValue ktok_read_token(); + +/* TODO: move this to the global state */ +FILE *ktok_file; + +#endif