Skip to content

Commit

Permalink
Jtv/lex (#14)
Browse files Browse the repository at this point in the history
Ported the lexer from Nim; added the start of structure for calling the compile pipeline. Moved over the test files.
  • Loading branch information
viega authored Apr 16, 2024
1 parent c16f75c commit 0b7e648
Show file tree
Hide file tree
Showing 57 changed files with 2,442 additions and 244 deletions.
4 changes: 4 additions & 0 deletions include/con4m.h
Original file line number Diff line number Diff line change
Expand Up @@ -72,3 +72,7 @@ typedef void *object_t;

// Yes we use cryptographic hashes internally for type IDing.
#include "crypto/sha.h"

// The front end.
#include "con4m/frontend/compile.h"
#include "con4m/frontend/lex.h" // Lexical Tokens
56 changes: 53 additions & 3 deletions include/con4m/codepoint.h
Original file line number Diff line number Diff line change
Expand Up @@ -17,12 +17,62 @@ c4m_codepoint_is_space(c4m_codepoint_t cp)
}

switch (c4m_codepoint_category(cp)) {
case CP_CATEGORY_ZS:
case UTF8PROC_CATEGORY_ZS:
return true;
case CP_CATEGORY_ZL:
case CP_CATEGORY_ZP:
case UTF8PROC_CATEGORY_ZL:
case UTF8PROC_CATEGORY_ZP:
return true;
default:
return false;
}
}

// Reports whether `cp` is accepted as the first codepoint of a con4m
// identifier.
//
// Accepted: any codepoint whose Unicode general category is a letter
// category (Lu, Ll, Lt, Lm, Lo) or letter-number (Nl), plus the three
// ASCII characters '_', '?' and '$'.
static inline bool
c4m_codepoint_is_c4m_id_start(c4m_codepoint_t cp)
{
    bool category_ok;

    switch (utf8proc_category(cp)) {
    case UTF8PROC_CATEGORY_LU:
    case UTF8PROC_CATEGORY_LL:
    case UTF8PROC_CATEGORY_LT:
    case UTF8PROC_CATEGORY_LM:
    case UTF8PROC_CATEGORY_LO:
    case UTF8PROC_CATEGORY_NL:
        category_ok = true;
        break;
    default:
        category_ok = false;
        break;
    }

    // The punctuation allowed here is not covered by the categories
    // above, so it is checked by value.
    return category_ok || cp == '_' || cp == '?' || cp == '$';
}

// Reports whether `cp` is accepted after the first codepoint of a
// con4m identifier.
//
// Accepted: everything in the letter categories (Lu, Ll, Lt, Lm, Lo)
// and letter-number (Nl), and additionally decimal digits (Nd),
// nonspacing and spacing marks (Mn, Mc) and connector punctuation
// (Pc), plus the three ASCII characters '_', '?' and '$'.
static inline bool
c4m_codepoint_is_c4m_id_continue(c4m_codepoint_t cp)
{
    bool category_ok;

    switch (utf8proc_category(cp)) {
    case UTF8PROC_CATEGORY_LU:
    case UTF8PROC_CATEGORY_LL:
    case UTF8PROC_CATEGORY_LT:
    case UTF8PROC_CATEGORY_LM:
    case UTF8PROC_CATEGORY_LO:
    case UTF8PROC_CATEGORY_NL:
    case UTF8PROC_CATEGORY_ND:
    case UTF8PROC_CATEGORY_MN:
    case UTF8PROC_CATEGORY_MC:
    case UTF8PROC_CATEGORY_PC:
        category_ok = true;
        break;
    default:
        category_ok = false;
        break;
    }

    // Explicit by-value check for the punctuation con4m allows inside
    // identifiers, independent of how it is categorized.
    return category_ok || cp == '_' || cp == '?' || cp == '$';
}
2 changes: 2 additions & 0 deletions include/con4m/conststr.h
Original file line number Diff line number Diff line change
Expand Up @@ -15,3 +15,5 @@ extern c4m_utf8_t *c4m_get_lbrace_const();
// Accessors that each return a c4m_utf8_t * for one fixed string.
// NOTE(review): judging by the names these hand back shared constants
// for "}", ":", a colon variant without a trailing space, "/" and ".";
// the definitions are not visible here -- confirm before relying on
// the exact contents or on whether the result may be modified.
extern c4m_utf8_t *c4m_get_rbrace_const();
extern c4m_utf8_t *c4m_get_colon_const();
extern c4m_utf8_t *c4m_get_colon_no_space_const();
extern c4m_utf8_t *c4m_get_slash_const();
extern c4m_utf8_t *c4m_get_period_const();
1 change: 1 addition & 0 deletions include/con4m/datatypes.h
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
#include "con4m/datatypes/tuples.h"
#include "con4m/datatypes/callbacks.h"
#include "con4m/datatypes/streams.h"
#include "con4m/datatypes/frontend.h"

// Pointer to a function that takes an object plus a to_str_use_t and
// returns a c4m_str_t *. NOTE(review): presumably the per-type "repr"
// hook, with to_str_use_t selecting the flavour of string wanted --
// confirm.
typedef c4m_str_t *(*c4m_repr_fn)(c4m_obj_t, to_str_use_t);
typedef void (*c4m_marshal_fn)(c4m_obj_t,
Expand Down
148 changes: 148 additions & 0 deletions include/con4m/datatypes/frontend.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,148 @@
#pragma once
#include "con4m.h"

// Every kind of token; this is the type of the `kind` field in
// c4m_token_t.
//
// No enumerator has an explicit value, so numbering starts at 0 and
// follows declaration order. Inserting or reordering entries changes
// the numeric value of everything after the edit.
typedef enum {
c4m_tt_space,
c4m_tt_semi,
c4m_tt_newline,
c4m_tt_line_comment,
c4m_tt_lock_attr,
c4m_tt_plus,
c4m_tt_minus,
c4m_tt_mul,
c4m_tt_long_comment,
c4m_tt_div,
c4m_tt_mod,
c4m_tt_lte,
c4m_tt_lt,
c4m_tt_gte,
c4m_tt_gt,
c4m_tt_neq,
c4m_tt_not,
c4m_tt_colon,
c4m_tt_assign,
c4m_tt_cmp,
c4m_tt_comma,
c4m_tt_period,
c4m_tt_lbrace,
c4m_tt_rbrace,
c4m_tt_lbracket,
c4m_tt_rbracket,
c4m_tt_lparen,
c4m_tt_rparen,
c4m_tt_and,
c4m_tt_or,
c4m_tt_int_lit,
c4m_tt_hex_lit,
c4m_tt_float_lit,
c4m_tt_string_lit,
c4m_tt_char_lit,
c4m_tt_true,
c4m_tt_false,
c4m_tt_nil,
c4m_tt_if,
c4m_tt_elif,
c4m_tt_else,
c4m_tt_for,
c4m_tt_from,
c4m_tt_to,
c4m_tt_break,
c4m_tt_continue,
c4m_tt_return,
c4m_tt_enum,
c4m_tt_identifier,
c4m_tt_func,
c4m_tt_var,
c4m_tt_global,
c4m_tt_const,
c4m_tt_unquoted_lit,
c4m_tt_backtick,
c4m_tt_arrow,
c4m_tt_object,
c4m_tt_while,
c4m_tt_in,
c4m_tt_bit_and,
c4m_tt_bit_or,
c4m_tt_bit_xor,
c4m_tt_shl,
c4m_tt_shr,
c4m_tt_typeof,
c4m_tt_switch,
c4m_tt_case,
c4m_tt_plus_eq,
c4m_tt_minus_eq,
c4m_tt_mul_eq,
c4m_tt_div_eq,
c4m_tt_mod_eq,
c4m_tt_bit_and_eq,
c4m_tt_bit_or_eq,
c4m_tt_bit_xor_eq,
c4m_tt_shl_eq,
c4m_tt_shr_eq,
// NOTE(review): the last three look like synthetic kinds (start of
// file, end of file, lexing failure) rather than source text --
// confirm against the lexer.
c4m_tt_sof,
c4m_tt_eof,
c4m_tt_lex_error
} c4m_token_kind_t;

// Error codes; this is the type of the `code` field in
// c4m_compile_error. Apart from c4m_err_open_file, every code declared
// here carries a `lex_` prefix.
//
// Values are implicit (0, 1, 2, ...) in declaration order.
typedef enum {
c4m_err_open_file,
c4m_err_lex_stray_cr,
c4m_err_lex_eof_in_comment,
c4m_err_lex_invalid_char,
c4m_err_lex_eof_in_str_lit,
c4m_err_lex_nl_in_str_lit,
c4m_err_lex_eof_in_char_lit,
c4m_err_lex_nl_in_char_lit,
c4m_err_lex_extra_in_char_lit,
c4m_err_lex_esc_in_esc,
c4m_err_lex_invalid_float_lit,
c4m_err_lex_float_oflow,
c4m_err_lex_float_uflow,
c4m_err_lex_int_oflow,
// NOTE(review): presumably a sentinel equal to the number of real
// codes, so new codes should go above it -- confirm.
c4m_err_last,
} c4m_compile_error_t;

// A single token.
typedef struct {
// NOTE(review): start_ptr/end_ptr presumably delimit the token's text
// inside the module's codepoint buffer; whether end_ptr is inclusive
// is not visible here -- confirm in the lexer.
c4m_codepoint_t *start_ptr;
c4m_codepoint_t *end_ptr;
c4m_utf32_t *literal_modifier;
void *literal_value; // Once parsed.
c4m_token_kind_t kind;
int token_id;
// NOTE(review): whether line_no / line_offset are 0- or 1-based is
// not established by this header.
int line_no;
int line_offset;
uint8_t adjustment; // For keeping track of quoting.
} c4m_token_t;

// One compile error: an error code plus auxiliary data (a token
// pointer and a message string).
typedef struct {
c4m_compile_error_t code;
// These will probably turn into a tagged union or transparent
// pointer with a phase indicator, so we can design the aux data
// appropriate per-phase.
c4m_token_t *current_token;
c4m_str_t *exception_message;
} c4m_compile_error;

// Compilation state for a single module: its identity and location,
// its raw source, and the token and error lists.
typedef struct {
// The module_id is calculated by combining the package name and the
// module name, then hashing it with SHA256. We use Unix style paths
// but this is not necessarily derived from the URI path.
//
// Note that packages (and our combining of it and the module) use
// dotted syntax like with most PLs. When we combine for the hash,
// we add a dot in there.
//
// c4m_new_compile_ctx will add __default__ as the package if none
// is provided. The URI fields are optional (via API you can just
// pass raw source as long as you give at least a module name).
//
// NOTE(review): SHA-256 produces 256 bits but module_id holds 128;
// presumably the digest is truncated -- confirm where it is computed.

__int128_t module_id;
c4m_str_t *scheme; // http, https or file; if NULL, then file.
c4m_str_t *authority; // http/s only.
c4m_str_t *path; // Path component in the URI.
c4m_str_t *package; // Package name.
c4m_str_t *module; // Module name.
c4m_utf32_t *raw; // raw contents read when we do the lex pass.
c4m_xlist_t *tokens; // an xlist of c4m_token_t objects.
c4m_xlist_t *errors; // an xlist of c4m_compile_error objects.
} c4m_file_compile_ctx;
14 changes: 7 additions & 7 deletions include/con4m/datatypes/grids.h
Original file line number Diff line number Diff line change
Expand Up @@ -173,13 +173,13 @@ typedef struct {
} c4m_renderable_t;

struct c4m_grid_t {
c4m_renderable_t *self;
c4m_renderable_t **cells; // A 2d array of renderable_objects, by ref
uint16_t num_cols;
uint16_t num_rows;
uint16_t spare_rows;
c4m_render_style_t **col_props;
c4m_render_style_t **row_props;
c4m_renderable_t *self;
c4m_renderable_t **cells; // A 2d array of renderable_objects, by ref
uint16_t num_cols;
uint16_t num_rows;
uint16_t spare_rows;
c4m_dict_t *col_props; // dict of int:c4m_render_style_t **
c4m_dict_t *row_props;

// Per-render info, which includes any padding added to perform
// alignment of the grid within the dimensions we're given.
Expand Down
9 changes: 9 additions & 0 deletions include/con4m/frontend/compile.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
#pragma once
#include "con4m.h"

// Variadic constructor for a compile context; the first argument is
// the module name. Normally reached through the c4m_new_compile_ctx()
// macro below, which passes its trailing arguments through KFUNC().
c4m_file_compile_ctx *_c4m_new_compile_ctx(c4m_str_t *module_name, ...);
// NOTE(review): presumably checks the module/package/URI fields of the
// context and returns false when they are unusable -- confirm.
bool c4m_validate_module_info(c4m_file_compile_ctx *);
// NOTE(review): presumably opens the module's source and returns a
// stream over it; the failure behaviour is not visible here -- confirm.
c4m_stream_t *c4m_load_code(c4m_file_compile_ctx *);

// Convenience wrapper: `m` is the module name, and any further
// arguments are wrapped by KFUNC() before being forwarded.
#define c4m_new_compile_ctx(m, ...) \
_c4m_new_compile_ctx(m, KFUNC(__VA_ARGS__))
5 changes: 5 additions & 0 deletions include/con4m/frontend/lex.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
#pragma once
#include "con4m.h"

// Lexer entry point: takes a compile context and an input stream.
// NOTE(review): presumably fills the context's `tokens` / `errors`
// lists and returns true on success -- confirm in the implementation.
bool c4m_lex(c4m_file_compile_ctx *, c4m_stream_t *);
// Returns a grid built from a compile context. NOTE(review): presumably
// a printable table of the context's tokens -- confirm.
c4m_grid_t *c4m_format_tokens(c4m_file_compile_ctx *);
53 changes: 23 additions & 30 deletions include/con4m/grid.h
Original file line number Diff line number Diff line change
Expand Up @@ -26,12 +26,20 @@ c4m_get_td_tag(c4m_grid_t *g)
}
void c4m_grid_set_all_contents(c4m_grid_t *, flexarray_t *);

// Grid construction / rendering entry points.
//
// Several functions below are declared twice, once with `extern` and
// once without. For file-scope function declarations `extern` is the
// default, so the two spellings declare the same thing.
// NOTE(review): this listing appears to interleave the pre-change and
// post-change lines of a diff; only one set should survive in the
// real header.
extern c4m_grid_t *c4m_grid_flow(uint64_t items, ...);
c4m_utf32_t *c4m_grid_to_str(c4m_grid_t *, to_str_use_t);
extern c4m_grid_t *_c4m_ordered_list(flexarray_t *, ...);
extern c4m_grid_t *_c4m_unordered_list(flexarray_t *, ...);
extern c4m_grid_t *_c4m_grid_tree(c4m_tree_node_t *, ...);
c4m_xlist_t *_c4m_grid_render(c4m_grid_t *, ...);
extern c4m_grid_t *c4m_grid_flow(uint64_t items, ...);
extern c4m_utf32_t *c4m_grid_to_str(c4m_grid_t *, to_str_use_t);
extern c4m_grid_t *_c4m_ordered_list(flexarray_t *, ...);
extern c4m_grid_t *_c4m_unordered_list(flexarray_t *, ...);
extern c4m_grid_t *_c4m_grid_tree(c4m_tree_node_t *, ...);
extern c4m_xlist_t *_c4m_grid_render(c4m_grid_t *, ...);
// Per-column / per-row style setters, each taking the grid and an
// int index. The *_props forms take a c4m_render_style_t *; the
// *_style forms take a char * instead.
extern void c4m_set_column_props(c4m_grid_t *,
int,
c4m_render_style_t *);
// NOTE(review): `c4m_row_column_props` breaks the set_column_* /
// set_row_* naming used by its neighbours, and the inline helper with
// this signature further down the header is named c4m_set_row_props.
// This may be a typo -- confirm which symbol the implementation
// actually defines.
extern void c4m_row_column_props(c4m_grid_t *,
int,
c4m_render_style_t *);
extern void c4m_set_column_style(c4m_grid_t *, int, char *);
extern void c4m_set_row_style(c4m_grid_t *, int, char *);

// Keyword-argument front ends: trailing arguments go through KFUNC()
// before reaching the underscore-prefixed function.
#define c4m_grid_render(g, ...) _c4m_grid_render(g, KFUNC(__VA_ARGS__))
#define c4m_ordered_list(l, ...) _c4m_ordered_list(l, KFUNC(__VA_ARGS__))
Expand All @@ -52,30 +60,6 @@ c4m_to_str_renderable(c4m_str_t *s, char *tag)
c4m_kw("obj", c4m_ka(s), "tag", c4m_ka(tag)));
}

// Looks up the cell style registered under `tag` via
// c4m_lookup_cell_style() and stores it as the style for column `col`.
// `col` is used as a direct index into grid->col_props with no bounds
// check in this function.
static inline void
c4m_set_column_style(c4m_grid_t *grid, int col, char *tag)
{
grid->col_props[col] = c4m_lookup_cell_style(tag);
}

// Looks up the cell style registered under `tag` via
// c4m_lookup_cell_style() and stores it as the style for row `row`.
// `row` is used as a direct index into grid->row_props with no bounds
// check in this function.
static inline void
c4m_set_row_style(c4m_grid_t *grid, int row, char *tag)
{
grid->row_props[row] = c4m_lookup_cell_style(tag);
}

// Stores the caller-supplied style pointer `s` as the style for column
// `col`. The pointer is stored as-is (no copy), and `col` is not
// range-checked in this function.
static inline void
c4m_set_column_props(c4m_grid_t *grid, int col, c4m_render_style_t *s)
{
grid->col_props[col] = s;
}

// Stores the caller-supplied style pointer `s` as the style for row
// `row`. The pointer is stored as-is (no copy), and `row` is not
// range-checked in this function.
static inline void
c4m_set_row_props(c4m_grid_t *grid, int row, c4m_render_style_t *s)
{
grid->row_props[row] = s;
}

static inline c4m_style_t
c4m_grid_blend_color(c4m_style_t style1, c4m_style_t style2)
{
Expand Down Expand Up @@ -183,3 +167,12 @@ c4m_grid_stripe_rows(c4m_grid_t *grid)
{
grid->stripe = 1;
}

#ifdef C4M_USE_INTERNAL_API

// Allocates and returns a new, empty xlist whose element type spec is
// utf32 (c4m_tspec_xlist over c4m_tspec_utf32).
// NOTE(review): going by the name, each such list is meant to hold the
// cells of one table row -- confirm with callers.
//
// Takes no arguments. The parameter list is spelled `(void)` so the
// definition provides a real prototype; before C23 an empty `()` leaves
// the arguments unchecked at call sites.
static inline c4m_xlist_t *
c4m_new_table_row(void)
{
    return c4m_new(c4m_tspec_xlist(c4m_tspec_utf32()));
}
#endif
21 changes: 11 additions & 10 deletions include/con4m/stream.h
Original file line number Diff line number Diff line change
@@ -1,15 +1,16 @@
#pragma once
#include "con4m.h"

// Stream API declarations.
//
// NOTE(review): most functions appear twice below, with and without
// `extern` (equivalent for function declarations). The exception is
// c4m_stream_raw_read, which is shown returning both `c4m_obj_t` and
// `c4m_obj_t *`; those two declarations conflict if both are live.
// This listing appears to interleave old and new lines of a diff --
// confirm which return type the implementation uses.
c4m_obj_t c4m_stream_raw_read(c4m_stream_t *, int64_t, char *);
size_t c4m_stream_raw_write(c4m_stream_t *, int64_t, char *);
void _c4m_stream_write_object(c4m_stream_t *, c4m_obj_t, bool);
bool c4m_stream_at_eof(c4m_stream_t *);
int64_t c4m_stream_get_location(c4m_stream_t *);
void c4m_stream_set_location(c4m_stream_t *, int64_t);
void c4m_stream_close(c4m_stream_t *);
void c4m_stream_flush(c4m_stream_t *);
void _c4m_print(c4m_obj_t, ...);
extern c4m_obj_t *c4m_stream_raw_read(c4m_stream_t *, int64_t, char *);
extern size_t c4m_stream_raw_write(c4m_stream_t *, int64_t, char *);
extern void _c4m_stream_write_object(c4m_stream_t *, c4m_obj_t, bool);
extern bool c4m_stream_at_eof(c4m_stream_t *);
extern int64_t c4m_stream_get_location(c4m_stream_t *);
extern void c4m_stream_set_location(c4m_stream_t *, int64_t);
extern void c4m_stream_close(c4m_stream_t *);
extern void c4m_stream_flush(c4m_stream_t *);
extern void _c4m_print(c4m_obj_t, ...);
extern c4m_obj_t *c4m_stream_read_all(c4m_stream_t *);

// Front end for _c4m_stream_write_object() with an optional third
// argument. NOTE(review): the IF(ISEMPTY(...)) construct presumably
// supplies `false` when no third argument is given and otherwise
// forwards the caller's value -- confirm against the IF / ISEMPTY
// macro definitions.
#define c4m_stream_write_object(s, o, ...) \
_c4m_stream_write_object(s, o, IF(ISEMPTY(__VA_ARGS__))(false) __VA_ARGS__)
Expand Down Expand Up @@ -108,7 +109,7 @@ buffer_iostream(c4m_buf_t *buf)
}

static inline c4m_stream_t *
file_instream(c4m_str_t *filename, c4m_builtin_t output_type)
c4m_file_instream(c4m_str_t *filename, c4m_builtin_t output_type)
{
return c4m_new(c4m_tspec_stream(),
c4m_kw("filename",
Expand Down
Loading

0 comments on commit 0b7e648

Please sign in to comment.