diff --git a/include/con4m.h b/include/con4m.h index e917cf5c..3b51ffdc 100644 --- a/include/con4m.h +++ b/include/con4m.h @@ -72,3 +72,7 @@ typedef void *object_t; // Yes we use cryptographic hashes internally for type IDing. #include "crypto/sha.h" + +// The front end. +#include "con4m/frontend/compile.h" +#include "con4m/frontend/lex.h" // Lexical Tokens diff --git a/include/con4m/codepoint.h b/include/con4m/codepoint.h index ddaaf4e7..00bd7aaf 100644 --- a/include/con4m/codepoint.h +++ b/include/con4m/codepoint.h @@ -17,12 +17,62 @@ c4m_codepoint_is_space(c4m_codepoint_t cp) } switch (c4m_codepoint_category(cp)) { - case CP_CATEGORY_ZS: + case UTF8PROC_CATEGORY_ZS: return true; - case CP_CATEGORY_ZL: - case CP_CATEGORY_ZP: + case UTF8PROC_CATEGORY_ZL: + case UTF8PROC_CATEGORY_ZP: return true; default: return false; } } + +static inline bool +c4m_codepoint_is_c4m_id_start(c4m_codepoint_t cp) +{ + switch (utf8proc_category(cp)) { + case UTF8PROC_CATEGORY_LU: + case UTF8PROC_CATEGORY_LL: + case UTF8PROC_CATEGORY_LT: + case UTF8PROC_CATEGORY_LM: + case UTF8PROC_CATEGORY_LO: + case UTF8PROC_CATEGORY_NL: + return true; + default: + switch (cp) { + case '_': + case '?': + case '$': + return true; + default: + return false; + } + } +} + +static inline bool +c4m_codepoint_is_c4m_id_continue(c4m_codepoint_t cp) +{ + switch (utf8proc_category(cp)) { + case UTF8PROC_CATEGORY_LU: + case UTF8PROC_CATEGORY_LL: + case UTF8PROC_CATEGORY_LT: + case UTF8PROC_CATEGORY_LM: + case UTF8PROC_CATEGORY_LO: + case UTF8PROC_CATEGORY_NL: + case UTF8PROC_CATEGORY_ND: + case UTF8PROC_CATEGORY_MN: + case UTF8PROC_CATEGORY_MC: + case UTF8PROC_CATEGORY_PC: + return true; + default: + switch (cp) { + case '_': + case '?': + case '$': + return true; + default: + return false; + } + } +} diff --git a/include/con4m/conststr.h b/include/con4m/conststr.h index ab22ba7b..34719927 100644 --- a/include/con4m/conststr.h +++ b/include/con4m/conststr.h @@ -15,3 +15,5 @@ extern c4m_utf8_t 
*c4m_get_lbrace_const(); extern c4m_utf8_t *c4m_get_rbrace_const(); extern c4m_utf8_t *c4m_get_colon_const(); extern c4m_utf8_t *c4m_get_colon_no_space_const(); +extern c4m_utf8_t *c4m_get_slash_const(); +extern c4m_utf8_t *c4m_get_period_const(); diff --git a/include/con4m/datatypes.h b/include/con4m/datatypes.h index f777d02e..04aec34f 100644 --- a/include/con4m/datatypes.h +++ b/include/con4m/datatypes.h @@ -20,6 +20,7 @@ #include "con4m/datatypes/tuples.h" #include "con4m/datatypes/callbacks.h" #include "con4m/datatypes/streams.h" +#include "con4m/datatypes/frontend.h" typedef c4m_str_t *(*c4m_repr_fn)(c4m_obj_t, to_str_use_t); typedef void (*c4m_marshal_fn)(c4m_obj_t, diff --git a/include/con4m/datatypes/frontend.h b/include/con4m/datatypes/frontend.h new file mode 100644 index 00000000..27b5e052 --- /dev/null +++ b/include/con4m/datatypes/frontend.h @@ -0,0 +1,148 @@ +#pragma once +#include "con4m.h" + +typedef enum { + c4m_tt_space, + c4m_tt_semi, + c4m_tt_newline, + c4m_tt_line_comment, + c4m_tt_lock_attr, + c4m_tt_plus, + c4m_tt_minus, + c4m_tt_mul, + c4m_tt_long_comment, + c4m_tt_div, + c4m_tt_mod, + c4m_tt_lte, + c4m_tt_lt, + c4m_tt_gte, + c4m_tt_gt, + c4m_tt_neq, + c4m_tt_not, + c4m_tt_colon, + c4m_tt_assign, + c4m_tt_cmp, + c4m_tt_comma, + c4m_tt_period, + c4m_tt_lbrace, + c4m_tt_rbrace, + c4m_tt_lbracket, + c4m_tt_rbracket, + c4m_tt_lparen, + c4m_tt_rparen, + c4m_tt_and, + c4m_tt_or, + c4m_tt_int_lit, + c4m_tt_hex_lit, + c4m_tt_float_lit, + c4m_tt_string_lit, + c4m_tt_char_lit, + c4m_tt_true, + c4m_tt_false, + c4m_tt_nil, + c4m_tt_if, + c4m_tt_elif, + c4m_tt_else, + c4m_tt_for, + c4m_tt_from, + c4m_tt_to, + c4m_tt_break, + c4m_tt_continue, + c4m_tt_return, + c4m_tt_enum, + c4m_tt_identifier, + c4m_tt_func, + c4m_tt_var, + c4m_tt_global, + c4m_tt_const, + c4m_tt_unquoted_lit, + c4m_tt_backtick, + c4m_tt_arrow, + c4m_tt_object, + c4m_tt_while, + c4m_tt_in, + c4m_tt_bit_and, + c4m_tt_bit_or, + c4m_tt_bit_xor, + c4m_tt_shl, + c4m_tt_shr, + c4m_tt_typeof, 
+ c4m_tt_switch, + c4m_tt_case, + c4m_tt_plus_eq, + c4m_tt_minus_eq, + c4m_tt_mul_eq, + c4m_tt_div_eq, + c4m_tt_mod_eq, + c4m_tt_bit_and_eq, + c4m_tt_bit_or_eq, + c4m_tt_bit_xor_eq, + c4m_tt_shl_eq, + c4m_tt_shr_eq, + c4m_tt_sof, + c4m_tt_eof, + c4m_tt_lex_error +} c4m_token_kind_t; + +typedef enum { + c4m_err_open_file, + c4m_err_lex_stray_cr, + c4m_err_lex_eof_in_comment, + c4m_err_lex_invalid_char, + c4m_err_lex_eof_in_str_lit, + c4m_err_lex_nl_in_str_lit, + c4m_err_lex_eof_in_char_lit, + c4m_err_lex_nl_in_char_lit, + c4m_err_lex_extra_in_char_lit, + c4m_err_lex_esc_in_esc, + c4m_err_lex_invalid_float_lit, + c4m_err_lex_float_oflow, + c4m_err_lex_float_uflow, + c4m_err_lex_int_oflow, + c4m_err_last, +} c4m_compile_error_t; + +typedef struct { + c4m_codepoint_t *start_ptr; + c4m_codepoint_t *end_ptr; + c4m_utf32_t *literal_modifier; + void *literal_value; // Once parsed. + c4m_token_kind_t kind; + int token_id; + int line_no; + int line_offset; + uint8_t adjustment; // For keeping track of quoting. +} c4m_token_t; + +typedef struct { + c4m_compile_error_t code; + // These will probably turn into a tagged union or transparent + // pointer with a phase indicator, so we can design the aux data + // appropriate per-phase. + c4m_token_t *current_token; + c4m_str_t *exception_message; +} c4m_compile_error; + +typedef struct { + // The module_id is calculated by combining the package name and the + // module name, then hashing it with SHA256. We use Unix style paths + // but this is not necessarily derived from the URI path. + // + // Note that packages (and our combining of it and the module) use + // dotted syntax like with most PLs. When we combine for the hash, + // we add a dot in there. + // + // c4m_new_compile_ctx will add __default__ as the package if none + // is provided. The URI fields are optional (via API you can just + // pass raw source as long as you give at least a module name). 
+
+    __int128_t module_id;
+    c4m_str_t *scheme; // http, https or file; if NULL, then file.
+    c4m_str_t *authority; // http/s only.
+    c4m_str_t *path; // Path component in the URI.
+    c4m_str_t *package; // Package name.
+    c4m_str_t *module; // Module name.
+    c4m_utf32_t *raw; // raw contents read when we do the lex pass.
+    c4m_xlist_t *tokens; // an xlist of c4m_token_t objects;
+    c4m_xlist_t *errors; // an xlist of c4m_compile_errors
+} c4m_file_compile_ctx;
diff --git a/include/con4m/datatypes/grids.h b/include/con4m/datatypes/grids.h
index 903cc679..5e383f8a 100644
--- a/include/con4m/datatypes/grids.h
+++ b/include/con4m/datatypes/grids.h
@@ -173,13 +173,13 @@ typedef struct {
 } c4m_renderable_t;
 
 struct c4m_grid_t {
-    c4m_renderable_t *self;
-    c4m_renderable_t **cells; // A 2d array of renderable_objects, by ref
-    uint16_t num_cols;
-    uint16_t num_rows;
-    uint16_t spare_rows;
-    c4m_render_style_t **col_props;
-    c4m_render_style_t **row_props;
+    c4m_renderable_t *self;
+    c4m_renderable_t **cells; // A 2d array of renderable_objects, by ref
+    uint16_t num_cols;
+    uint16_t num_rows;
+    uint16_t spare_rows;
+    c4m_dict_t *col_props; // dict of int:c4m_render_style_t **
+    c4m_dict_t *row_props;
 
     // Per-render info, which includes any adding added to perform
     // alignment of the grid within the dimensions we're given.
diff --git a/include/con4m/frontend/compile.h b/include/con4m/frontend/compile.h
new file mode 100644
index 00000000..4ced7725
--- /dev/null
+++ b/include/con4m/frontend/compile.h
@@ -0,0 +1,9 @@
+#pragma once
+#include "con4m.h"
+
+c4m_file_compile_ctx *_c4m_new_compile_ctx(c4m_str_t *module_name, ...);
+bool c4m_validate_module_info(c4m_file_compile_ctx *);
+c4m_stream_t *c4m_load_code(c4m_file_compile_ctx *);
+
+#define c4m_new_compile_ctx(m, ...)
\ + _c4m_new_compile_ctx(m, KFUNC(__VA_ARGS__)) diff --git a/include/con4m/frontend/lex.h b/include/con4m/frontend/lex.h new file mode 100644 index 00000000..2eae9928 --- /dev/null +++ b/include/con4m/frontend/lex.h @@ -0,0 +1,5 @@ +#pragma once +#include "con4m.h" + +bool c4m_lex(c4m_file_compile_ctx *, c4m_stream_t *); +c4m_grid_t *c4m_format_tokens(c4m_file_compile_ctx *); diff --git a/include/con4m/grid.h b/include/con4m/grid.h index 51f97a54..3ba83507 100644 --- a/include/con4m/grid.h +++ b/include/con4m/grid.h @@ -26,12 +26,20 @@ c4m_get_td_tag(c4m_grid_t *g) } void c4m_grid_set_all_contents(c4m_grid_t *, flexarray_t *); -extern c4m_grid_t *c4m_grid_flow(uint64_t items, ...); -c4m_utf32_t *c4m_grid_to_str(c4m_grid_t *, to_str_use_t); -extern c4m_grid_t *_c4m_ordered_list(flexarray_t *, ...); -extern c4m_grid_t *_c4m_unordered_list(flexarray_t *, ...); -extern c4m_grid_t *_c4m_grid_tree(c4m_tree_node_t *, ...); -c4m_xlist_t *_c4m_grid_render(c4m_grid_t *, ...); +extern c4m_grid_t *c4m_grid_flow(uint64_t items, ...); +extern c4m_utf32_t *c4m_grid_to_str(c4m_grid_t *, to_str_use_t); +extern c4m_grid_t *_c4m_ordered_list(flexarray_t *, ...); +extern c4m_grid_t *_c4m_unordered_list(flexarray_t *, ...); +extern c4m_grid_t *_c4m_grid_tree(c4m_tree_node_t *, ...); +extern c4m_xlist_t *_c4m_grid_render(c4m_grid_t *, ...); +extern void c4m_set_column_props(c4m_grid_t *, + int, + c4m_render_style_t *); +extern void c4m_row_column_props(c4m_grid_t *, + int, + c4m_render_style_t *); +extern void c4m_set_column_style(c4m_grid_t *, int, char *); +extern void c4m_set_row_style(c4m_grid_t *, int, char *); #define c4m_grid_render(g, ...) _c4m_grid_render(g, KFUNC(__VA_ARGS__)) #define c4m_ordered_list(l, ...) 
_c4m_ordered_list(l, KFUNC(__VA_ARGS__)) @@ -52,30 +60,6 @@ c4m_to_str_renderable(c4m_str_t *s, char *tag) c4m_kw("obj", c4m_ka(s), "tag", c4m_ka(tag))); } -static inline void -c4m_set_column_style(c4m_grid_t *grid, int col, char *tag) -{ - grid->col_props[col] = c4m_lookup_cell_style(tag); -} - -static inline void -c4m_set_row_style(c4m_grid_t *grid, int row, char *tag) -{ - grid->row_props[row] = c4m_lookup_cell_style(tag); -} - -static inline void -c4m_set_column_props(c4m_grid_t *grid, int col, c4m_render_style_t *s) -{ - grid->col_props[col] = s; -} - -static inline void -c4m_set_row_props(c4m_grid_t *grid, int row, c4m_render_style_t *s) -{ - grid->row_props[row] = s; -} - static inline c4m_style_t c4m_grid_blend_color(c4m_style_t style1, c4m_style_t style2) { @@ -183,3 +167,12 @@ c4m_grid_stripe_rows(c4m_grid_t *grid) { grid->stripe = 1; } + +#ifdef C4M_USE_INTERNAL_API + +static inline c4m_xlist_t * +c4m_new_table_row() +{ + return c4m_new(c4m_tspec_xlist(c4m_tspec_utf32())); +} +#endif diff --git a/include/con4m/stream.h b/include/con4m/stream.h index dbb35b98..c64c0245 100644 --- a/include/con4m/stream.h +++ b/include/con4m/stream.h @@ -1,15 +1,16 @@ #pragma once #include "con4m.h" -c4m_obj_t c4m_stream_raw_read(c4m_stream_t *, int64_t, char *); -size_t c4m_stream_raw_write(c4m_stream_t *, int64_t, char *); -void _c4m_stream_write_object(c4m_stream_t *, c4m_obj_t, bool); -bool c4m_stream_at_eof(c4m_stream_t *); -int64_t c4m_stream_get_location(c4m_stream_t *); -void c4m_stream_set_location(c4m_stream_t *, int64_t); -void c4m_stream_close(c4m_stream_t *); -void c4m_stream_flush(c4m_stream_t *); -void _c4m_print(c4m_obj_t, ...); +extern c4m_obj_t *c4m_stream_raw_read(c4m_stream_t *, int64_t, char *); +extern size_t c4m_stream_raw_write(c4m_stream_t *, int64_t, char *); +extern void _c4m_stream_write_object(c4m_stream_t *, c4m_obj_t, bool); +extern bool c4m_stream_at_eof(c4m_stream_t *); +extern int64_t c4m_stream_get_location(c4m_stream_t *); +extern void 
c4m_stream_set_location(c4m_stream_t *, int64_t); +extern void c4m_stream_close(c4m_stream_t *); +extern void c4m_stream_flush(c4m_stream_t *); +extern void _c4m_print(c4m_obj_t, ...); +extern c4m_obj_t *c4m_stream_read_all(c4m_stream_t *); #define c4m_stream_write_object(s, o, ...) \ _c4m_stream_write_object(s, o, IF(ISEMPTY(__VA_ARGS__))(false) __VA_ARGS__) @@ -108,7 +109,7 @@ buffer_iostream(c4m_buf_t *buf) } static inline c4m_stream_t * -file_instream(c4m_str_t *filename, c4m_builtin_t output_type) +c4m_file_instream(c4m_str_t *filename, c4m_builtin_t output_type) { return c4m_new(c4m_tspec_stream(), c4m_kw("filename", diff --git a/include/vendor/utf8proc.h b/include/vendor/utf8proc.h index af5f353d..f25cd51c 100644 --- a/include/vendor/utf8proc.h +++ b/include/vendor/utf8proc.h @@ -3,36 +3,36 @@ #include "con4m/base.h" typedef enum { - CP_CATEGORY_CN = 0, /**< Other, not assigned */ - CP_CATEGORY_LU = 1, /**< Letter, uppercase */ - CP_CATEGORY_LL = 2, /**< Letter, lowercase */ - CP_CATEGORY_LT = 3, /**< Letter, titlecase */ - CP_CATEGORY_LM = 4, /**< Letter, modifier */ - CP_CATEGORY_LO = 5, /**< Letter, other */ - CP_CATEGORY_MN = 6, /**< Mark, nonspacing */ - CP_CATEGORY_MC = 7, /**< Mark, spacing combining */ - CP_CATEGORY_ME = 8, /**< Mark, enclosing */ - CP_CATEGORY_ND = 9, /**< Number, decimal digit */ - CP_CATEGORY_NL = 10, /**< Number, letter */ - CP_CATEGORY_NO = 11, /**< Number, other */ - CP_CATEGORY_PC = 12, /**< Punctuation, connector */ - CP_CATEGORY_PD = 13, /**< Punctuation, dash */ - CP_CATEGORY_PS = 14, /**< Punctuation, open */ - CP_CATEGORY_PE = 15, /**< Punctuation, close */ - CP_CATEGORY_PI = 16, /**< Punctuation, initial quote */ - CP_CATEGORY_PF = 17, /**< Punctuation, final quote */ - CP_CATEGORY_PO = 18, /**< Punctuation, other */ - CP_CATEGORY_SM = 19, /**< Symbol, math */ - CP_CATEGORY_SC = 20, /**< Symbol, currency */ - CP_CATEGORY_SK = 21, /**< Symbol, modifier */ - CP_CATEGORY_SO = 22, /**< Symbol, other */ - CP_CATEGORY_ZS = 23, 
/**< Separator, space */ - CP_CATEGORY_ZL = 24, /**< Separator, line */ - CP_CATEGORY_ZP = 25, /**< Separator, paragraph */ - CP_CATEGORY_CC = 26, /**< Other, control */ - CP_CATEGORY_CF = 27, /**< Other, format */ - CP_CATEGORY_CS = 28, /**< Other, surrogate */ - CP_CATEGORY_CO = 29, /**< Other, private use */ + UTF8PROC_CATEGORY_CN = 0, /**< Other, not assigned */ + UTF8PROC_CATEGORY_LU = 1, /**< Letter, uppercase */ + UTF8PROC_CATEGORY_LL = 2, /**< Letter, lowercase */ + UTF8PROC_CATEGORY_LT = 3, /**< Letter, titlecase */ + UTF8PROC_CATEGORY_LM = 4, /**< Letter, modifier */ + UTF8PROC_CATEGORY_LO = 5, /**< Letter, other */ + UTF8PROC_CATEGORY_MN = 6, /**< Mark, nonspacing */ + UTF8PROC_CATEGORY_MC = 7, /**< Mark, spacing combining */ + UTF8PROC_CATEGORY_ME = 8, /**< Mark, enclosing */ + UTF8PROC_CATEGORY_ND = 9, /**< Number, decimal digit */ + UTF8PROC_CATEGORY_NL = 10, /**< Number, letter */ + UTF8PROC_CATEGORY_NO = 11, /**< Number, other */ + UTF8PROC_CATEGORY_PC = 12, /**< Punctuation, connector */ + UTF8PROC_CATEGORY_PD = 13, /**< Punctuation, dash */ + UTF8PROC_CATEGORY_PS = 14, /**< Punctuation, open */ + UTF8PROC_CATEGORY_PE = 15, /**< Punctuation, close */ + UTF8PROC_CATEGORY_PI = 16, /**< Punctuation, initial quote */ + UTF8PROC_CATEGORY_PF = 17, /**< Punctuation, final quote */ + UTF8PROC_CATEGORY_PO = 18, /**< Punctuation, other */ + UTF8PROC_CATEGORY_SM = 19, /**< Symbol, math */ + UTF8PROC_CATEGORY_SC = 20, /**< Symbol, currency */ + UTF8PROC_CATEGORY_SK = 21, /**< Symbol, modifier */ + UTF8PROC_CATEGORY_SO = 22, /**< Symbol, other */ + UTF8PROC_CATEGORY_ZS = 23, /**< Separator, space */ + UTF8PROC_CATEGORY_ZL = 24, /**< Separator, line */ + UTF8PROC_CATEGORY_ZP = 25, /**< Separator, paragraph */ + UTF8PROC_CATEGORY_CC = 26, /**< Other, control */ + UTF8PROC_CATEGORY_CF = 27, /**< Other, format */ + UTF8PROC_CATEGORY_CS = 28, /**< Other, surrogate */ + UTF8PROC_CATEGORY_CO = 29, /**< Other, private use */ } cp_category_t; typedef enum { @@ -42,99 
+42,97 @@ typedef enum { } lbreak_kind_t; typedef enum { - /** The given UTF-8 input is NULL terminated. */ - UTF8PROC_NULLTERM = (1<<0), - - /** Unicode Versioning Stability has to be respected. */ - UTF8PROC_STABLE = (1<<1), - - /** Compatibility decomposition (i.e. formatting information is lost). */ - UTF8PROC_COMPAT = (1<<2), - - /** Return a result with composed characters. */ - UTF8PROC_COMPOSE = (1<<3), - - /** Return a result with decomposed characters. */ - UTF8PROC_DECOMPOSE = (1<<4), - - /** Strip "default ignorable characters" such as SOFT-HYPHEN or - * ZERO-WIDTH-SPACE. */ - UTF8PROC_IGNORE = (1<<5), - - /** Return an error, if the input contains unassigned codepoints. */ - UTF8PROC_REJECTNA = (1<<6), - - /** - * Indicating that NLF-sequences (LF, CRLF, CR, NEL) are representing a - * line break, and should be converted to the codepoint for line - * separation (LS). - */ - UTF8PROC_NLF2LS = (1<<7), - - /** - * Indicating that NLF-sequences are representing a paragraph break, and - * should be converted to the codepoint for paragraph separation - * (PS). - */ - UTF8PROC_NLF2PS = (1<<8), - - /** Indicating that the meaning of NLF-sequences is unknown. */ - UTF8PROC_NLF2LF = (UTF8PROC_NLF2LS | UTF8PROC_NLF2PS), - - /** Strips and/or convers control characters. - * - * NLF-sequences are transformed into space, except if one of the - * NLF2LS/PS/LF options is given. HorizontalTab (HT) and FormFeed (FF) - * are treated as a NLF-sequence in this case. All other control - * characters are simply removed. - */ - UTF8PROC_STRIPCC = (1<<9), - - /** - * Performs unicode case folding, to be able to do a case-insensitive - * string comparison. - */ - UTF8PROC_CASEFOLD = (1<<10), - - /** - * Inserts 0xFF bytes at the beginning of each sequence which is - * representing a single grapheme cluster (see UAX#29). - */ - UTF8PROC_CHARBOUND = (1<<11), - - /** Lumps certain characters together. - * - * E.g. HYPHEN U+2010 and MINUS U+2212 to ASCII "-". 
See lump.md for details. - * - * If NLF2LF is set, this includes a transformation of paragraph and - * line separators to ASCII line-feed (LF). - */ - UTF8PROC_LUMP = (1<<12), - - /** Strips all character markings. - * - * This includes non-spacing, spacing and enclosing (i.e. accents). - * @note This option works only with @ref UTF8PROC_COMPOSE or - * @ref UTF8PROC_DECOMPOSE - */ - UTF8PROC_STRIPMARK = (1<<13), - - /** - * Strip unassigned codepoints. - */ - UTF8PROC_STRIPNA = (1<<14), + /** The given UTF-8 input is NULL terminated. */ + UTF8PROC_NULLTERM = (1 << 0), + + /** Unicode Versioning Stability has to be respected. */ + UTF8PROC_STABLE = (1 << 1), + + /** Compatibility decomposition (i.e. formatting information is lost). */ + UTF8PROC_COMPAT = (1 << 2), + + /** Return a result with composed characters. */ + UTF8PROC_COMPOSE = (1 << 3), + + /** Return a result with decomposed characters. */ + UTF8PROC_DECOMPOSE = (1 << 4), + + /** Strip "default ignorable characters" such as SOFT-HYPHEN or + * ZERO-WIDTH-SPACE. */ + UTF8PROC_IGNORE = (1 << 5), + + /** Return an error, if the input contains unassigned codepoints. */ + UTF8PROC_REJECTNA = (1 << 6), + + /** + * Indicating that NLF-sequences (LF, CRLF, CR, NEL) are representing a + * line break, and should be converted to the codepoint for line + * separation (LS). + */ + UTF8PROC_NLF2LS = (1 << 7), + + /** + * Indicating that NLF-sequences are representing a paragraph break, and + * should be converted to the codepoint for paragraph separation + * (PS). + */ + UTF8PROC_NLF2PS = (1 << 8), + + /** Indicating that the meaning of NLF-sequences is unknown. */ + UTF8PROC_NLF2LF = (UTF8PROC_NLF2LS | UTF8PROC_NLF2PS), + + /** Strips and/or convers control characters. + * + * NLF-sequences are transformed into space, except if one of the + * NLF2LS/PS/LF options is given. HorizontalTab (HT) and FormFeed (FF) + * are treated as a NLF-sequence in this case. All other control + * characters are simply removed. 
+ */ + UTF8PROC_STRIPCC = (1 << 9), + + /** + * Performs unicode case folding, to be able to do a case-insensitive + * string comparison. + */ + UTF8PROC_CASEFOLD = (1 << 10), + + /** + * Inserts 0xFF bytes at the beginning of each sequence which is + * representing a single grapheme cluster (see UAX#29). + */ + UTF8PROC_CHARBOUND = (1 << 11), + + /** Lumps certain characters together. + * + * E.g. HYPHEN U+2010 and MINUS U+2212 to ASCII "-". See lump.md for details. + * + * If NLF2LF is set, this includes a transformation of paragraph and + * line separators to ASCII line-feed (LF). + */ + UTF8PROC_LUMP = (1 << 12), + + /** Strips all character markings. + * + * This includes non-spacing, spacing and enclosing (i.e. accents). + * @note This option works only with @ref UTF8PROC_COMPOSE or + * @ref UTF8PROC_DECOMPOSE + */ + UTF8PROC_STRIPMARK = (1 << 13), + + /** + * Strip unassigned codepoints. + */ + UTF8PROC_STRIPNA = (1 << 14), } utf8proc_option_t; // From libutf8proc -extern int utf8proc_iterate(const uint8_t *str, ssize_t len, int32_t *cp); -extern bool utf8proc_codepoint_valid(int32_t cp); -extern int utf8proc_encode_char(int32_t cp, uint8_t *dst); +extern int utf8proc_iterate(const uint8_t *str, ssize_t len, int32_t *cp); +extern bool utf8proc_codepoint_valid(int32_t cp); +extern int utf8proc_encode_char(int32_t cp, uint8_t *dst); extern cp_category_t utf8proc_category(int32_t cp); -extern int utf8proc_charwidth(int32_t cp); -extern int32_t utf8proc_tolower(int32_t cp); -extern int32_t utf8proc_toupper(int32_t cp); -extern int utf8proc_charwidth(int32_t cp); -extern bool utf8proc_grapheme_break_stateful(int32_t cp1, int32_t cp2, - int32_t *state); -extern int32_t utf8proc_map(const uint8_t *str, int32_t len, uint8_t **out, - utf8proc_option_t options); +extern int utf8proc_charwidth(int32_t cp); +extern int32_t utf8proc_tolower(int32_t cp); +extern int32_t utf8proc_toupper(int32_t cp); +extern int utf8proc_charwidth(int32_t cp); +extern bool 
utf8proc_grapheme_break_stateful(int32_t cp1, int32_t cp2, int32_t *state);
+extern int32_t utf8proc_map(const uint8_t *str, int32_t len, uint8_t **out, utf8proc_option_t options);
diff --git a/meson.build b/meson.build
index 2241d260..930b002d 100644
--- a/meson.build
+++ b/meson.build
@@ -86,6 +86,8 @@ c4m_src = ['src/con4m/style.c',
            'src/con4m/literals.c',
            'src/con4m/init.c',
            'src/con4m/crypto/sha.c',
+           'src/con4m/frontend/compile.c',
+           'src/con4m/frontend/lex.c',
           ]
 
 hat_primary = ['src/hatrack/support/hatrack_common.c',
diff --git a/src/con4m/ansi.c b/src/con4m/ansi.c
index 1e4410ac..65adf06d 100644
--- a/src/con4m/ansi.c
+++ b/src/con4m/ansi.c
@@ -11,11 +11,11 @@ ignore_for_printing(c4m_codepoint_t cp)
     // control rendering as intended.
 
     switch (cp) {
-    case CP_CATEGORY_CN:
-    case CP_CATEGORY_CC:
-    case CP_CATEGORY_CF:
-    case CP_CATEGORY_CS:
-    case CP_CATEGORY_CO:
+    case UTF8PROC_CATEGORY_CN:
+    case UTF8PROC_CATEGORY_CC:
+    case UTF8PROC_CATEGORY_CF:
+    case UTF8PROC_CATEGORY_CS:
+    case UTF8PROC_CATEGORY_CO:
         if (cp == '\n') {
             return false;
         }
diff --git a/src/con4m/breaks.c b/src/con4m/breaks.c
index f82d3e33..fd496de0 100644
--- a/src/con4m/breaks.c
+++ b/src/con4m/breaks.c
@@ -91,8 +91,8 @@ internal_is_line_break(int32_t cp)
     }
 
     switch (utf8proc_category(cp)) {
-    case CP_CATEGORY_ZL:
-    case CP_CATEGORY_ZP:
+    case UTF8PROC_CATEGORY_ZL:
+    case UTF8PROC_CATEGORY_ZP:
         return true;
     default:
         return false;
diff --git a/src/con4m/conststr.c b/src/con4m/conststr.c
index 12d0f44a..2f5c532a 100644
--- a/src/con4m/conststr.c
+++ b/src/con4m/conststr.c
@@ -14,7 +14,9 @@ enum {
     RBRACE_IX = 10,
     COLON_IX = 11,
     COLON_NSP = 12,
-    PUNC_MAX = 13
+    SLASH_IX = 13,
+    PERIOD_IX = 14,
+    PUNC_MAX = 15
 };
 
 static c4m_str_t *type_punct[PUNC_MAX] = {
@@ -41,6 +43,8 @@ init_punctuation()
         type_punct[COLON_IX] = c4m_new(c4m_tspec_utf8(),
                                        c4m_kw("cstring", c4m_ka(" : ")));
         type_punct[COLON_NSP] = c4m_utf8_repeat(':', 1);
+        type_punct[SLASH_IX] = c4m_utf8_repeat('/', 1);
+        type_punct[PERIOD_IX] = c4m_utf8_repeat('.', 1); // Slot read by c4m_get_period_const().
     }
c4m_gc_register_root(&type_punct[0], PUNC_MAX); } @@ -135,3 +138,17 @@ c4m_get_colon_no_space_const() init_punctuation(); return type_punct[COLON_NSP]; } + +c4m_utf8_t * +c4m_get_slash_const() +{ + init_punctuation(); + return type_punct[SLASH_IX]; +} + +c4m_utf8_t * +c4m_get_period_const() +{ + init_punctuation(); + return type_punct[PERIOD_IX]; +} diff --git a/src/con4m/frontend/compile.c b/src/con4m/frontend/compile.c new file mode 100644 index 00000000..f1420122 --- /dev/null +++ b/src/con4m/frontend/compile.c @@ -0,0 +1,159 @@ +#include "con4m.h" + +static c4m_str_t * +module_name_from_path(c4m_str_t *path) +{ + c4m_xlist_t *parts = c4m_str_xsplit(path, c4m_get_slash_const()); + int l = c4m_xlist_len(parts); + c4m_str_t *candidate = c4m_xlist_get(parts, l - 1, NULL); + + l = c4m_str_find(candidate, c4m_get_period_const()); + + if (l == -1) { + return candidate; + } + + return c4m_str_slice(candidate, 0, l); +} + +c4m_file_compile_ctx * +_c4m_new_compile_ctx(c4m_str_t *module_name, ...) 
+{
+    c4m_file_compile_ctx *result;
+    c4m_str_t *scheme = NULL;
+    c4m_str_t *authority = NULL;
+    c4m_str_t *path = NULL;
+    c4m_str_t *package = NULL;
+
+    c4m_karg_only_init(module_name);
+    c4m_kw_ptr("uri_scheme", scheme);
+    c4m_kw_ptr("uri_authority", authority);
+    c4m_kw_ptr("uri_path", path);
+    c4m_kw_ptr("package", package);
+
+    if (package == NULL) {
+        package = c4m_new(c4m_tspec_utf8(),
+                          c4m_kw("cstring", c4m_ka("__default__")));
+    }
+
+    if (module_name == NULL && path != NULL) {
+        module_name = module_name_from_path(path);
+    }
+
+    result = c4m_gc_alloc(c4m_file_compile_ctx);
+    result->errors = c4m_new(c4m_tspec_xlist(c4m_tspec_ref()));
+    result->scheme = scheme;
+    result->authority = authority;
+    result->path = path;
+    result->package = package;
+    result->module = module_name;
+
+    if (!c4m_validate_module_info(result)) {
+        C4M_CRAISE(
+            "Invalid module spec; the packages and the module name "
+            "must all be valid identifiers; package parts must be "
+            "separated by dots.");
+    }
+
+    return result;
+}
+
+bool
+c4m_validate_module_info(c4m_file_compile_ctx *ctx)
+{
+    c4m_codepoint_t cp;
+
+    if (ctx->package == NULL || ctx->module == NULL) {
+        return false;
+    }
+
+    int plen = c4m_str_codepoint_len(ctx->package);
+    int mlen = c4m_str_codepoint_len(ctx->module);
+    bool dot_ok = true; // We start at char 1.
+
+    if (plen == 0 || mlen == 0) {
+        return false;
+    }
+
+    cp = c4m_index(ctx->package, 0);
+    if (!c4m_codepoint_is_c4m_id_start(cp)) {
+        return false;
+    }
+
+    cp = c4m_index(ctx->module, 0);
+    if (!c4m_codepoint_is_c4m_id_start(cp)) {
+        return false;
+    }
+
+    for (int i = 1; i < plen; i++) {
+        cp = c4m_index(ctx->package, i);
+        // Right after a dot (dot_ok == false) a new package part begins, so it must open with an id-start codepoint.
+        if (dot_ok ? c4m_codepoint_is_c4m_id_continue(cp) : c4m_codepoint_is_c4m_id_start(cp)) {
+            dot_ok = true;
+            continue;
+        }
+
+        if (cp != '.' || !dot_ok) {
+            return false;
+        }
+
+        // dot_ok being true is really only keeping track of whether
+        // the previous character was a dot; however, the final
+        // character of the package name cannot be a dot.
+ if (i + 1 == plen) { + return false; + } + + dot_ok = false; + } + + for (int i = 1; i < mlen; i++) { + cp = c4m_index(ctx->module, i); + + if (!c4m_codepoint_is_c4m_id_continue(cp)) { + return false; + } + } + + return true; +} + +// If this fails due to the source not being found or some other IO +// error, it will return NULL and add an error to the file compile +// ctx. +// +// However, if you call it wrong, at the API level, it raises an +// exception. +// +// Currently, this is only handling files on the local file system; need +// to add an API for easier http/https access. +c4m_stream_t * +c4m_load_code(c4m_file_compile_ctx *ctx) +{ + c4m_stream_t *result; + + if (ctx->scheme != NULL) { + C4M_CRAISE("Non-file URI schemes are currently unimplemented."); + } + + if (!ctx->path) { + C4M_CRAISE("Do not call with a null path."); + } + + C4M_TRY + { + result = c4m_file_instream(ctx->path, C4M_T_UTF8); + } + C4M_EXCEPT + { + c4m_compile_error *err = c4m_gc_alloc(c4m_compile_error); + err->code = c4m_err_open_file; + err->exception_message = c4m_exception_get_message(C4M_X_CUR()); + + c4m_xlist_append(ctx->errors, err); + result = NULL; + } + C4M_TRY_END; + + return result; +} diff --git a/src/con4m/frontend/lex.c b/src/con4m/frontend/lex.c new file mode 100644 index 00000000..ea30058d --- /dev/null +++ b/src/con4m/frontend/lex.c @@ -0,0 +1,1178 @@ +#define C4M_USE_INTERNAL_API +#include + +typedef struct { + char *tt_name; + bool show_contents; +} internal_tt_info_t; + +static internal_tt_info_t tt_info[] = { + {"space", false}, + {";", false}, + {"newline", false}, + {"comment", true}, + {"~", false}, + {"+", false}, + {"-", false}, + {"*", false}, + {"comment", true}, + {"/", false}, + {"%", false}, + {"<=", false}, + {"<", false}, + {">=", false}, + {">", false}, + {"!=", false}, + {"!", false}, + {":", false}, + {"=", false}, + {"==", false}, + {",", false}, + {".", false}, + {"{", false}, + {"}", false}, + {"[", false}, + {"]", false}, + {"(", false}, + 
{")", false}, + {"and", false}, + {"or", false}, + {"int", true}, + {"hex", true}, + {"float", true}, + {"string", true}, + {"char", true}, + {"true", false}, + {"false", false}, + {"nil", false}, + {"if", false}, + {"elif", false}, + {"else", false}, + {"for", false}, + {"from", false}, + {"to", false}, + {"break", false}, + {"continue", false}, + {"return", false}, + {"enum", false}, + {"identifier", true}, + {"func", false}, + {"var", false}, + {"global", false}, + {"const", false}, + {":= literal", true}, + {"`", false}, + {"->", false}, + {"object", false}, + {"while", false}, + {"in", false}, + {"&", false}, + {"|", false}, + {"^", false}, + {"<<", false}, + {">>", false}, + {"typeof", false}, + {"switch", false}, + {"case", false}, + {"+=", false}, + {"-=", false}, + {"*=", false}, + {"/=", false}, + {"%=", false}, + {"&=", false}, + {"|=", false}, + {"^=", false}, + {"<<=", false}, + {">>=", false}, + {"start", false}, + {"eof", false}, + {"error", false}, +}; + +typedef struct { + c4m_file_compile_ctx *ctx; + c4m_codepoint_t *start; + c4m_codepoint_t *end; + c4m_codepoint_t *pos; + c4m_codepoint_t *line_start; + c4m_token_t *last_token; + size_t token_id; + size_t line_no; + size_t cur_tok_line_no; + size_t cur_tok_offset; +} lex_state_t; + +// These helpers definitely require us to keep names consistent internally. +// +// They just remove clutter in calling stuff and emphasize the variability: +// - TOK adds a token to the output stream of the given kind; +// - LITERAL_TOK is the same, except the system looks to see if there is +// - a lit modifier at the end; if there is, it copies it into the token. +// - LEX_ERROR adds an error to the broader context object, and longjumps. 
+#define TOK(kind) output_token(state, kind) +#define LITERAL_TOK(kind) \ + output_token(state, kind); \ + handle_lit_mod(state) +#define LEX_ERROR(code) \ + fill_lex_error(state, code); \ + printf("Raising exception: " #code); \ + C4M_CRAISE("Exception:" #code "\n") + +static const __uint128_t max_intval = (__uint128_t)0xffffffffffffffffULL; + +static inline c4m_codepoint_t +next(lex_state_t *state) +{ + if (state->pos >= state->end) { + return 0; + } + return *state->pos++; +} + +static inline void +unput(lex_state_t *state) +{ + if (state->pos && state->pos < state->end) { + --state->pos; + } +} + +static inline void +advance(lex_state_t *state) +{ + state->pos++; +} + +static inline c4m_codepoint_t +peek(lex_state_t *state) +{ + if (state->pos + 1 >= state->end) { + return 0; + } + return *(state->pos); +} + +static inline void +at_new_line(lex_state_t *state) +{ + state->line_no++; + state->line_start = state->pos; +} + +static inline void +output_token(lex_state_t *state, c4m_token_kind_t kind) +{ + c4m_token_t *tok = c4m_gc_alloc(c4m_token_t); + tok->kind = kind; + tok->start_ptr = state->start; + tok->end_ptr = state->pos; + tok->token_id = ++state->token_id; + tok->line_no = state->cur_tok_line_no; + tok->line_offset = state->cur_tok_offset; + state->last_token = tok; + + c4m_xlist_append(state->ctx->tokens, tok); +} + +static inline void +skip_optional_newline(lex_state_t *state) +{ + c4m_codepoint_t *start = state->pos; + + while (true) { + switch (peek(state)) { + case ' ': + case '\t': + advance(state); + continue; + case '\n': + advance(state); + at_new_line(state); + // We only allow one newline after tokens. So don't keep + // running the same loop; we're done when this one finds + // a non-space character. + while (true) { + switch (peek(state)) { + case ' ': + case '\t': + advance(state); + continue; + default: + goto possible_ws_token; + } + } + // Explicitly fall through here out of the nested switch + // since we're done. 
+ default: +possible_ws_token: + if (state->pos != start) { + TOK(c4m_tt_space); + } + return; + } + } +} + +// If the token just emitted is followed by a literal modifier (a ' +// and then identifier characters), consume it and attach its text to +// that token. Does nothing when the next codepoint is not a '. +static inline void +handle_lit_mod(lex_state_t *state) +{ + if (peek(state) != '\'') { + return; + } + advance(state); + + c4m_codepoint_t *lm_start = state->pos; + + while (c4m_codepoint_is_c4m_id_continue(peek(state))) { + advance(state); + } + + size_t n = (size_t)(state->pos - lm_start); + c4m_token_t *tok = state->last_token; + // Pass the source codepoints along with the length, as every other + // utf32 construction in this file does; with only a length the + // modifier text itself is never copied into the token. + tok->literal_modifier = c4m_new(c4m_tspec_utf32(), + c4m_kw("length", + c4m_ka(n), + "codepoints", + c4m_ka(lm_start))); + state->start = state->pos; +} + +static inline void +fill_lex_error(lex_state_t *state, c4m_compile_error_t code) + +{ + c4m_token_t *tok = c4m_gc_alloc(c4m_token_t); + tok->kind = c4m_tt_lex_error; + tok->start_ptr = state->start; + tok->end_ptr = state->pos; + tok->line_no = state->line_no; + tok->line_offset = state->start - state->line_start; + + c4m_compile_error *err = c4m_gc_alloc(c4m_compile_error); + err->code = code; + err->current_token = tok; + + c4m_xlist_append(state->ctx->errors, err); +} + +static inline void +scan_unquoted_literal(lex_state_t *state) +{ + // For now, this just scans to the end of the line, and returns a + // token of type c4m_tt_unquoted_lit. When it comes time to + // re-implement the litmod stuff and we add literal parsers for + // all the builtins, this can generate the proper token up-front. + while (true) { + switch (next(state)) { + case '\n': + at_new_line(state); + // fallthrough. + case 0: + LITERAL_TOK(c4m_tt_unquoted_lit); + return; + } + } +} + +static void +scan_int_or_float_literal(lex_state_t *state) +{ + // This one probably does make more sense to fully parse here. + // There is an issue: + // + // We're using u32 as our internal repr for what we're parsing. + // But the easiest way to deal w/ floats is to call strtod(), + // which expects UTF8 (well, ASCII really).
We don't want to + // reconvert (or keep around) the whole remainder of the file, so + // we just scan forward looking at absolutely every character than + // can possibly be in a valid float (including E/e, but not NaN / + // infinity; those will have to be handled as keywords). We + // convert that bit back to UTF-8. + // + // If did we see a starting character that indicates a float, we + // know it might be a float, so we keep a record of where the + // first such character is; then we call strtod(); if strtod() + // tells us it found a valid parse where the ending point is + // farther than the first float indicator, then we're + // done; we just need to set the proper token end point. + // + // Otherwise, we re-parse as an int, and we can just do that + // manually into a __uint128_t (getting the float parse precisely + // right is not something I relish, even though it can be done + // faster than w/ strtod). + // + // One final note: we already passed the first character before we + // got here. But state->start does point to the beginning, so we + // use that when we need to reconstruct the string. + + c4m_codepoint_t *start = state->start; + int ix = 1; // First index we need to check. + int float_ix = 0; // 0 means not a float. + + while (true) { + switch (start[ix]) { + case '0': + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': + ix++; + continue; + case '.': + if (float_ix) { + // Already had a dot or something like that. 
+ break; + } + float_ix = ix++; + continue; + case 'e': + case 'E': + if (!float_ix) { + float_ix = ix; + } + ix++; + continue; + case '+': + case '-': + ix++; + continue; + default: + break; + } + break; + } + + c4m_utf32_t *u32 = c4m_new(c4m_tspec_utf32(), + c4m_kw("length", + c4m_ka(ix), + "codepoints", + c4m_ka(start))); + c4m_utf8_t *u8 = c4m_to_utf8(u32); + + if (float_ix) { + char *endp = NULL; + // strtod() only sets errno on failure; clear it first so that a + // stale ERANGE from earlier code is not reported as a float + // overflow / underflow in this literal. + errno = 0; + double value = strtod((char *)u8->data, &endp); + + if (endp == (char *)u8->data || !endp) { + // I don't think this one should ever happen here. + LEX_ERROR(c4m_err_lex_invalid_float_lit); + } + + if (errno == ERANGE) { + if (value == HUGE_VAL) { + LEX_ERROR(c4m_err_lex_float_oflow); + } + LEX_ERROR(c4m_err_lex_float_uflow); + } + + // Subtract pointers of the same type (endp is a char *). + int float_strlen = (int)(endp - (char *)u8->data); + if (float_strlen > float_ix) { + state->pos = state->start + float_strlen; + LITERAL_TOK(c4m_tt_float_lit); + state->last_token->literal_value = (void *)*(uint64_t *)&value; + return; + } + } + + // Either we saw no evidence of a float or the float parse + // didn't get to any of that evidence, so voila, it's an int token.
+ + __int128_t val = 0; + int i = 0; + size_t slen = c4m_str_byte_len(u8); + char *p = (char *)u8->data; + + for (; i < (int64_t)slen; i++) { + char c = *p++; + + switch (c) { + case '0': + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': + val *= 10; + val += c - '0'; + if (val > (uint64_t)max_intval) { + LEX_ERROR(c4m_err_lex_int_oflow); + } + continue; + default: + goto finished_int; + } + } +finished_int: { + uint64_t n = (uint64_t)val; + state->pos = state->start + i; + LITERAL_TOK(c4m_tt_int_lit); + state->last_token->literal_value = (void *)n; + return; +} +} + +// Consumes hex digits from the cursor onward and emits a hex literal +// token. The caller must already have consumed the "0x" / "0X" prefix. +static inline void +scan_hex_literal(lex_state_t *state) +{ + while (true) { + switch (peek(state)) { + case '0': + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': + case 'a': + case 'b': + case 'c': + case 'd': + case 'e': + case 'f': + case 'A': + case 'B': + case 'C': + case 'D': + case 'E': + case 'F': + advance(state); + continue; + default: + LITERAL_TOK(c4m_tt_hex_lit); + return; + } + } +} + +// This only gets called if we already passed a leading 0. So we +// inspect the first char; if it's an 'x' or 'X', we go the hex +// route. Otherwise, we go the int route, which promotes to float +// depending on what it sees. + +static inline void +scan_int_float_or_hex_literal(lex_state_t *state) +{ + switch (peek(state)) { + case 'x': + case 'X': + // Step over the 'x' itself; scan_hex_literal() only accepts + // hex digits and would otherwise stop immediately, emitting a + // hex token for just the leading "0". + advance(state); + scan_hex_literal(state); + return; + default: + scan_int_or_float_literal(state); + return; + } +} + +static inline void +scan_tristring(lex_state_t *state) +{ + // Here, we already got 3 quotes. We now need to: + // 1. Keep track of line numbers when we see newlines. + // 2. Skip any backtick'd things. + // 3. Count consecutive quotes. + // 4. Error when we get to EOF.
+ + int quote_count = 0; + + while (true) { + switch (next(state)) { + case 0: + LEX_ERROR(c4m_err_lex_eof_in_str_lit); + case '\n': + at_new_line(state); + break; + case '\\': + advance(state); + break; + case '"': + if (++quote_count == 3) { + LITERAL_TOK(c4m_tt_string_lit); + state->last_token->adjustment = 3; + return; + } + continue; // breaking would reset quote count. + default: + break; + } + quote_count = 0; + } +} + +static void +scan_string_literal(lex_state_t *state) +{ + // This function only finds the end of the string and keeps track + // of line numbers; it does not otherwise attempt to handle any + // parsing of the string itself. + // + // That could either be done after we've seen if there's a lit mod, + // or wait until the parser or ir generator need the data; + // + // My choice is to do it as late as possible, because we could + // then allow people to register litmods and then use them in the + // same source file (or a dependent source file) if done properly. + + // Here, we know we already passed a single quote. We must first + // determine if we're looking at a tristring. + if (peek(state) == '"') { + advance(state); + if (peek(state) != '"') { + // empty string. + goto finish_single_quote; + } + advance(state); + scan_tristring(state); + return; + } + + while (true) { + c4m_codepoint_t c = next(state); + + switch (c) { + case 0: + LEX_ERROR(c4m_err_lex_eof_in_str_lit); + case '\n': + case '\r': + LEX_ERROR(c4m_err_lex_nl_in_str_lit); + case '\\': + // Skip absolutely anything that comes next, + // including a newline. + advance(state); + continue; + case '"': +finish_single_quote: + LITERAL_TOK(c4m_tt_string_lit); + state->last_token->adjustment = 1; + return; + default: + continue; + } + } +} + +// Char literals can be: +// 1. a single character +// 2. \x, \X, \u, \U .. They're all the same. We scan till ' or some +// error condition (which includes another \). 
+// We don't check the value at this point; default char type will +// error if it's outside the range of valid unicode. We don't even +// check for it being valid hex; we just scan it. +// w/ \u and \U I'll probably accept an optional + after the U since +// officially that's what the unicode consortium does. +// 3. \ followed by any single character. +// -1. If we get a newline or null, it's an error. +// Also, if we get anything after it other than a ', it's an error. +// +// Note specifically that we do NOT turn this into a real char literal +// here. We wait till needed, so we can apply literal modifiers. +static void +scan_char_literal(lex_state_t *state) +{ + switch (next(state)) { + case 0: + LEX_ERROR(c4m_err_lex_eof_in_char_lit); + case '\r': + case '\n': + LEX_ERROR(c4m_err_lex_nl_in_char_lit); + case '\'': + return; + case '\\': + switch (next(state)) { + case 'x': + case 'X': + case 'u': + case 'U': + while (true) { + switch (next(state)) { + case 0: + LEX_ERROR(c4m_err_lex_eof_in_char_lit); + case '\r': + case '\n': + LEX_ERROR(c4m_err_lex_nl_in_char_lit); + case '\\': + LEX_ERROR(c4m_err_lex_esc_in_esc); + case '\'': + goto finish_up; + } + } + default: + break; + } + default: + break; + } + if (next(state) != '\'') { + LEX_ERROR(c4m_err_lex_extra_in_char_lit); + } + +finish_up: + LITERAL_TOK(c4m_tt_char_lit); + state->last_token->adjustment = 1; + return; +} + +static c4m_dict_t *keywords = NULL; + +static inline void +add_keyword(char *keyword, c4m_token_kind_t kind) +{ + c4m_utf8_t *s = c4m_new(c4m_tspec_utf8(), + c4m_kw("cstring", c4m_ka(keyword))); + hatrack_dict_add(keywords, s, (void *)(int64_t)kind); +} + +static inline void +init_keywords() +{ + if (keywords != NULL) { + return; + } + + keywords = c4m_new(c4m_tspec_dict(c4m_tspec_utf32(), c4m_tspec_i64())); + + add_keyword("True", c4m_tt_true); + add_keyword("true", c4m_tt_true); + add_keyword("False", c4m_tt_false); + add_keyword("false", c4m_tt_false); + add_keyword("nil", c4m_tt_nil); + 
add_keyword("in", c4m_tt_in); + add_keyword("var", c4m_tt_var); + add_keyword("global", c4m_tt_global); + add_keyword("const", c4m_tt_const); + add_keyword("is", c4m_tt_cmp); + add_keyword("and", c4m_tt_and); + add_keyword("or", c4m_tt_or); + add_keyword("not", c4m_tt_not); + add_keyword("if", c4m_tt_if); + add_keyword("elif", c4m_tt_elif); + add_keyword("else", c4m_tt_else); + add_keyword("case", c4m_tt_case); + add_keyword("for", c4m_tt_for); + add_keyword("while", c4m_tt_while); + add_keyword("from", c4m_tt_from); + add_keyword("to", c4m_tt_to); + add_keyword("break", c4m_tt_break); + add_keyword("continue", c4m_tt_continue); + add_keyword("return", c4m_tt_return); + add_keyword("enum", c4m_tt_enum); + add_keyword("func", c4m_tt_func); + add_keyword("object", c4m_tt_object); + add_keyword("typeof", c4m_tt_typeof); + add_keyword("switch", c4m_tt_switch); + add_keyword("infinity", c4m_tt_float_lit); + add_keyword("NaN", c4m_tt_float_lit); + + c4m_gc_register_root(&keywords, 1); +} + +// Scans the rest of an identifier whose first codepoint lex() has +// already consumed, then emits either the matching keyword token or +// an identifier token. +static void +scan_id_or_keyword(lex_state_t *state) +{ + init_keywords(); + + // Look at each codepoint before consuming it, so the codepoint that + // ends the identifier is never swallowed. Consuming first and then + // calling unput() loses that codepoint when it is the last one in + // the input, because unput() does not move once pos == end. + while (state->pos < state->end + && c4m_codepoint_is_c4m_id_continue(*state->pos)) { + advance(state); + } + + bool found = false; + int64_t length = (int64_t)(state->pos - state->start); + + if (length == 0) { + return; + } + + c4m_utf32_t *as_u32 = c4m_new( + c4m_tspec_utf32(), + c4m_kw("codepoints", + c4m_ka(state->start), + "length", + c4m_ka(length))); + + c4m_token_kind_t r = (c4m_token_kind_t)(int64_t)hatrack_dict_get( + keywords, + c4m_to_utf8(as_u32), + &found); + + if (!found) { + TOK(c4m_tt_identifier); + return; + } + + switch (r) { + case c4m_tt_true: + case c4m_tt_false: + LITERAL_TOK(r); + return; + case c4m_tt_float_lit: { + c4m_utf32_t *u32 = c4m_new( + c4m_tspec_utf32(), + c4m_kw("length", + c4m_ka((int64_t)(state->pos - state->start)), + "codepoints", + c4m_ka(state->start))); + + c4m_utf8_t *u8 = c4m_to_utf8(u32); + double
value = strtod((char *)u8->data, NULL); + + LITERAL_TOK(r); + state->last_token->literal_value = *(void **)&value; + return; + } + default: + TOK(r); + return; + } +} + +static void +lex(lex_state_t *state) +{ + while (true) { + c4m_codepoint_t c; + c4m_codepoint_t tmp; + + // When we need to escape from nested loops after + // recognizing a token, it's sometimes easier to short + // circuit here w/ a goto than to break out of all those + // loops just to 'continue'. +lex_next_token: + state->start = state->pos; + state->cur_tok_line_no = state->line_no; + state->cur_tok_offset = state->start - state->line_start; + c = next(state); + + switch (c) { + case 0: + TOK(c4m_tt_eof); + return; + case ' ': + case '\t': + while (true) { + switch (peek(state)) { + case ' ': + case '\t': + advance(state); + continue; + default: + goto lex_next_token; + } + } + TOK(c4m_tt_space); + continue; + case '\r': + tmp = next(state); + if (tmp != '\n') { + LEX_ERROR(c4m_err_lex_stray_cr); + } + // Fallthrough if no exception got raised. + case '\n': + TOK(c4m_tt_newline); + at_new_line(state); + continue; + case '#': + // Line comments go to EOF or new line, and we include the + // newline in the token. + // Double-slash comments work in con4m too; if we see that, + // the lexer jumps back up here once it advances past the + // second slash. 
+line_comment: + while (true) { + switch (next(state)) { + case '\n': + at_new_line(state); + TOK(c4m_tt_line_comment); + goto lex_next_token; + case 0: // EOF + // Still emit the comment token, then restart the main + // loop: next() returns 0 again at end of input, so the + // usual EOF token is produced instead of being dropped. + TOK(c4m_tt_line_comment); + goto lex_next_token; + default: + continue; + } + } + case '~': + TOK(c4m_tt_lock_attr); + continue; + case '`': + TOK(c4m_tt_backtick); + continue; + case '+': + if (peek(state) == '=') { + advance(state); + TOK(c4m_tt_plus_eq); + } + else { + TOK(c4m_tt_plus); + } + skip_optional_newline(state); + continue; + case '-': + switch (peek(state)) { + case '=': + advance(state); + TOK(c4m_tt_minus_eq); + break; + case '>': + advance(state); + TOK(c4m_tt_arrow); + // Without this break, "->" fell through and also + // emitted a spurious minus token. + break; + default: + TOK(c4m_tt_minus); + break; + } + skip_optional_newline(state); + continue; + case '*': + if (peek(state) == '=') { + advance(state); + TOK(c4m_tt_mul_eq); + } + else { + TOK(c4m_tt_mul); + } + skip_optional_newline(state); + continue; + case '/': + switch (peek(state)) { + case '=': + advance(state); + TOK(c4m_tt_div_eq); + skip_optional_newline(state); + break; + case '/': + advance(state); + goto line_comment; + case '*': + advance(state); + while (true) { + switch (next(state)) { + case '\n': + at_new_line(state); + continue; + case '*': + if (peek(state) == '/') { + advance(state); + TOK(c4m_tt_long_comment); + goto lex_next_token; + } + continue; + case 0: + LEX_ERROR(c4m_err_lex_eof_in_comment); + default: + continue; + } + } + default: + TOK(c4m_tt_div); + skip_optional_newline(state); + break; + } + continue; + case '%': + if (peek(state) == '=') { + advance(state); + TOK(c4m_tt_mod_eq); + } + else { + TOK(c4m_tt_mod); + } + skip_optional_newline(state); + continue; + case '<': + switch (peek(state)) { + case '=': + advance(state); + TOK(c4m_tt_lte); + break; + case '<': + advance(state); + if (peek(state) == '=') { + advance(state); + TOK(c4m_tt_shl_eq); + } + else { + TOK(c4m_tt_shl); + } + break; + default: + TOK(c4m_tt_lt); + break; + } + skip_optional_newline(state); + continue; + case '>': + switch (peek(state)) { + case '=': +
advance(state); + TOK(c4m_tt_gte); + break; + case '>': + advance(state); + if (peek(state) == '=') { + advance(state); + TOK(c4m_tt_shr_eq); + } + else { + TOK(c4m_tt_shr); + } + break; + default: + TOK(c4m_tt_gt); + break; + } + skip_optional_newline(state); + continue; + case '!': + if (peek(state) == '=') { + advance(state); + TOK(c4m_tt_neq); + } + else { + TOK(c4m_tt_not); + } + skip_optional_newline(state); + continue; + case ';': + TOK(c4m_tt_semi); + continue; + case ':': + if (peek(state) == '=') { + advance(state); + TOK(c4m_tt_assign); + state->start = state->pos; + state->cur_tok_line_no = state->line_no; + state->cur_tok_offset = state->start - state->line_start; + scan_unquoted_literal(state); + } + else { + TOK(c4m_tt_colon); + } + continue; + case '=': + if (peek(state) == '=') { + advance(state); + TOK(c4m_tt_cmp); + } + else { + TOK(c4m_tt_assign); + } + skip_optional_newline(state); + continue; + case ',': + TOK(c4m_tt_comma); + skip_optional_newline(state); + continue; + case '.': + TOK(c4m_tt_period); + skip_optional_newline(state); + continue; + case '{': + TOK(c4m_tt_lbrace); + skip_optional_newline(state); + continue; + case '}': + LITERAL_TOK(c4m_tt_rbrace); + continue; + case '[': + TOK(c4m_tt_lbracket); + skip_optional_newline(state); + continue; + case ']': + LITERAL_TOK(c4m_tt_rbracket); + continue; + case '(': + TOK(c4m_tt_lparen); + skip_optional_newline(state); + continue; + case ')': + LITERAL_TOK(c4m_tt_rparen); + continue; + case '&': + if (peek(state) == '=') { + advance(state); + TOK(c4m_tt_bit_and_eq); + } + else { + TOK(c4m_tt_bit_and); + } + skip_optional_newline(state); + continue; + case '|': + if (peek(state) == '=') { + advance(state); + TOK(c4m_tt_bit_or_eq); + } + else { + TOK(c4m_tt_bit_or); + } + skip_optional_newline(state); + continue; + case '^': + if (peek(state) == '=') { + advance(state); + TOK(c4m_tt_bit_xor_eq); + } + else { + TOK(c4m_tt_bit_xor); + } + skip_optional_newline(state); + continue; + case '0': + 
scan_int_float_or_hex_literal(state); + continue; + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': + scan_int_or_float_literal(state); + continue; + case '\'': + scan_char_literal(state); + continue; + case '"': + scan_string_literal(state); + continue; + default: + if (!c4m_codepoint_is_c4m_id_start(c)) { + LEX_ERROR(c4m_err_lex_invalid_char); + } + scan_id_or_keyword(state); + continue; + } + } +} + +bool +c4m_lex(c4m_file_compile_ctx *ctx, c4m_stream_t *stream) +{ + int outkind; + outkind = stream->flags & (C4M_F_STREAM_UTF8_OUT | C4M_F_STREAM_UTF32_OUT); + + c4m_obj_t *raw = c4m_stream_read_all(stream); + c4m_utf32_t *utf32; + lex_state_t lex_info = { + .token_id = 0, + .line_no = 1, + .line_start = 0, + .ctx = ctx, + }; + + if (raw == NULL) { + return false; + } + + switch (outkind) { + case C4M_F_STREAM_UTF32_OUT: + utf32 = (c4m_str_t *)raw; + + if (c4m_str_codepoint_len(utf32) == 0) { + return false; + } + break; + case C4M_F_STREAM_UTF8_OUT: + if (c4m_str_codepoint_len((c4m_utf8_t *)raw) == 0) { + return false; + } + + utf32 = c4m_to_utf32((c4m_utf8_t *)raw); + break; + default: + // A buffer object, which we assume is utf8. 
+ if (c4m_buffer_len((c4m_buf_t *)raw) == 0) { + return false; + } + utf32 = c4m_to_utf32(c4m_buf_to_utf8_string((c4m_buf_t *)raw)); + break; + } + + int len = c4m_str_codepoint_len(utf32); + ctx->raw = utf32; + ctx->tokens = c4m_new(c4m_tspec_xlist(c4m_tspec_ref())); + lex_info.start = (c4m_codepoint_t *)utf32->data; + lex_info.pos = (c4m_codepoint_t *)utf32->data; + lex_info.line_start = (c4m_codepoint_t *)utf32->data; + lex_info.end = &((c4m_codepoint_t *)(utf32->data))[len]; + + bool error = false; + + C4M_TRY + { + lex(&lex_info); + } + C4M_EXCEPT + { + error = true; + } + C4M_TRY_END; + + return !error; +} + +// Start out with any focus on color or other highlighting; just get +// them into a default table for now aimed at debugging, and we'll add +// a facility for styling later. +c4m_grid_t * +c4m_format_tokens(c4m_file_compile_ctx *ctx) +{ + c4m_grid_t *grid = c4m_new(c4m_tspec_grid(), + c4m_kw("start_cols", + c4m_ka(5), + "header_rows", + c4m_ka(1), + "stripe", + c4m_ka(true))); + + c4m_xlist_t *row = c4m_new_table_row(); + int64_t len = c4m_xlist_len(ctx->tokens); + + c4m_xlist_append(row, c4m_rich_lit("Seq #")); + c4m_xlist_append(row, c4m_rich_lit("Type")); + c4m_xlist_append(row, c4m_rich_lit("Line #")); + c4m_xlist_append(row, c4m_rich_lit("Column #")); + c4m_xlist_append(row, c4m_rich_lit("Value")); + c4m_grid_add_row(grid, row); + + for (int64_t i = 0; i < len; i++) { + c4m_token_t *tok = c4m_xlist_get(ctx->tokens, i, NULL); + int info_ix = (int)tok->kind; + + row = c4m_new_table_row(); + c4m_xlist_append(row, c4m_str_from_int(i + 1)); + c4m_xlist_append(row, c4m_rich_lit(tt_info[info_ix].tt_name)); + c4m_xlist_append(row, c4m_str_from_int(tok->line_no)); + c4m_xlist_append(row, c4m_str_from_int(tok->line_offset)); + + if (tt_info[info_ix].show_contents) { + c4m_xlist_append( + row, + c4m_new(c4m_tspec_utf32(), + c4m_kw("length", + c4m_ka((int64_t)(tok->end_ptr - tok->start_ptr)), + "codepoints", + c4m_ka(tok->start_ptr)))); + } + else { + 
c4m_xlist_append(row, c4m_rich_lit(" ")); + } + + c4m_grid_add_row(grid, row); + } + + return grid; +} diff --git a/src/con4m/grid.c b/src/con4m/grid.c index 19dc9569..91a814e3 100644 --- a/src/con4m/grid.c +++ b/src/con4m/grid.c @@ -188,13 +188,6 @@ c4m_expand_columns(c4m_grid_t *grid, uint64_t num) } } - c4m_render_style_t **col_props = c4m_gc_array_alloc(c4m_render_style_t *, - new_cols); - - for (int i = 0; i < grid->num_cols; i++) { - col_props[i] = grid->col_props[i]; - } - // This needs a lock. grid->cells = cells; grid->num_cols = new_cols; @@ -217,13 +210,6 @@ c4m_grid_expand_rows(c4m_grid_t *grid, uint64_t num) cells[i] = grid->cells[i]; } - c4m_render_style_t **row_props = c4m_gc_array_alloc(c4m_render_style_t *, - new_rows); - - for (int i = 0; i < grid->num_rows; i++) { - row_props[i] = grid->row_props[i]; - } - grid->cells = cells; grid->num_rows = new_rows; } @@ -285,7 +271,8 @@ c4m_grid_add_row(c4m_grid_t *grid, c4m_obj_t container) for (int i = 0; i < grid->num_cols; i++) { c4m_obj_t x = c4m_xlist_get((c4m_xlist_t *)container, i, NULL); if (x == NULL) { - x = (c4m_obj_t)c4m_new(c4m_tspec_utf8(), c4m_kw("cstring", c4m_ka(" "))); + x = (c4m_obj_t)c4m_new(c4m_tspec_utf8(), + c4m_kw("cstring", c4m_ka(" "))); } c4m_grid_set_cell_contents(grid, grid->row_cursor, i, x); } @@ -376,17 +363,8 @@ grid_init(c4m_grid_t *grid, va_list args) c4m_ka(grid))); grid->self = self; - grid->col_props = c4m_gc_array_alloc(c4m_render_style_t *, grid->num_cols); - grid->row_props = c4m_gc_array_alloc(c4m_render_style_t *, - grid->num_rows + spare_rows); - - for (int i = 0; i < min(header_rows, start_rows); i++) { - c4m_set_row_style(grid, i, "th"); - } - - for (int i = 0; i < min(header_cols, start_cols); i++) { - c4m_set_column_style(grid, i, "th"); - } + grid->col_props = NULL; + grid->row_props = NULL; grid->header_rows = header_rows; grid->header_cols = header_cols; @@ -395,32 +373,42 @@ grid_init(c4m_grid_t *grid, va_list args) static inline c4m_render_style_t * 
get_row_props(c4m_grid_t *grid, int row) { - if (!grid->row_props[row]) { - if (grid->stripe) { - if (row % 2) { - return c4m_lookup_cell_style("tr.even"); - } - else { - return c4m_lookup_cell_style("tr.odd"); - } + c4m_render_style_t *result; + + if (grid->row_props != NULL) { + result = hatrack_dict_get(grid->row_props, (void *)(int64_t)row, NULL); + if (result != NULL) { + return result; } + } - return c4m_lookup_cell_style("tr"); + if (grid->stripe) { + if (row % 2) { + return c4m_lookup_cell_style("tr.even"); + } + else { + return c4m_lookup_cell_style("tr.odd"); + } } else { - return grid->row_props[row]; + return c4m_lookup_cell_style("tr"); } } static inline c4m_render_style_t * get_col_props(c4m_grid_t *grid, int col) { - if (!grid->col_props[col]) { - return c4m_lookup_cell_style("td"); - } - else { - return grid->col_props[col]; + c4m_render_style_t *result; + + if (grid->col_props != NULL) { + result = hatrack_dict_get(grid->col_props, (void *)(int64_t)col, NULL); + + if (result != NULL) { + return result; + } } + + return c4m_lookup_cell_style("td"); } // Contents currently must be a list[list[c4m_obj_t]]. Supply @@ -1726,7 +1714,7 @@ _c4m_ordered_list(flexarray_t *items, ...) width += bp->left_pad + bp->right_pad; bp->dims.units = width; - res->col_props[0] = bp; + c4m_set_column_props(res, 0, bp); c4m_set_column_style(res, 1, item_style); for (int i = 0; i < n; i++) { @@ -1774,7 +1762,7 @@ _c4m_unordered_list(flexarray_t *items, ...) 
c4m_render_style_t *bp = c4m_lookup_cell_style(bullet_style); bp->dims.units += bp->left_pad + bp->right_pad; - res->col_props[0] = bp; + c4m_set_column_props(res, 0, bp); c4m_set_column_style(res, 1, item_style); for (int i = 0; i < n; i++) { @@ -1885,13 +1873,8 @@ c4m_grid_marshal(c4m_grid_t *grid, c4m_marshal_cstring(grid->td_tag_name, s); c4m_marshal_cstring(grid->th_tag_name, s); - for (int i = 0; i < grid->num_cols; i++) { - c4m_sub_marshal(grid->col_props[i], s, memos, mid); - } - - for (int i = 0; i < grid->num_rows; i++) { - c4m_sub_marshal(grid->row_props[i], s, memos, mid); - } + c4m_sub_marshal(grid->col_props, s, memos, mid); + c4m_sub_marshal(grid->row_props, s, memos, mid); for (int i = 0; i < num_cells; i++) { c4m_sub_marshal((c4m_renderable_t *)grid->cells[i], s, memos, mid); @@ -1915,23 +1898,14 @@ c4m_grid_unmarshal(c4m_grid_t *grid, c4m_stream_t *s, c4m_dict_t *memos) grid->stripe = c4m_unmarshal_i8(s); grid->td_tag_name = c4m_unmarshal_cstring(s); grid->th_tag_name = c4m_unmarshal_cstring(s); + grid->col_props = c4m_sub_unmarshal(s, memos); + grid->row_props = c4m_sub_unmarshal(s, memos); size_t num_cells = (grid->num_rows + grid->spare_rows) * grid->num_cols; grid->cells = c4m_gc_array_alloc(c4m_renderable_t *, num_cells); - grid->col_props = c4m_gc_array_alloc(c4m_render_style_t *, grid->num_cols); - grid->row_props = c4m_gc_array_alloc(c4m_render_style_t *, - grid->num_rows + grid->spare_rows); num_cells = grid->num_rows * grid->num_cols; - for (int i = 0; i < grid->num_cols; i++) { - grid->col_props[i] = c4m_sub_unmarshal(s, memos); - } - - for (int i = 0; i < grid->num_rows; i++) { - grid->row_props[i] = c4m_sub_unmarshal(s, memos); - } - for (size_t i = 0; i < num_cells; i++) { grid->cells[i] = c4m_sub_unmarshal(s, memos); } @@ -2093,6 +2067,52 @@ build_tree_output(c4m_tree_node_t *node, tree_fmt_t *info) info->padstr = prev_pad; } +void +c4m_set_column_props(c4m_grid_t *grid, int col, c4m_render_style_t *s) +{ + if (grid->col_props == 
NULL) { + grid->col_props = c4m_new(c4m_tspec_dict(c4m_tspec_int(), + c4m_tspec_ref())); + } + + hatrack_dict_put(grid->col_props, (void *)(int64_t)col, s); +} + +void +c4m_set_row_props(c4m_grid_t *grid, int row, c4m_render_style_t *s) +{ + if (grid->row_props == NULL) { + grid->row_props = c4m_new(c4m_tspec_dict(c4m_tspec_int(), + c4m_tspec_ref())); + } + + hatrack_dict_put(grid->row_props, (void *)(int64_t)row, s); +} + +void +c4m_set_column_style(c4m_grid_t *grid, int col, char *tag) +{ + c4m_render_style_t *style = c4m_lookup_cell_style(tag); + + if (!style) { + C4M_CRAISE("Style not found."); + } + + c4m_set_column_props(grid, col, style); +} + +void +c4m_set_row_style(c4m_grid_t *grid, int row, char *tag) +{ + c4m_render_style_t *style = c4m_lookup_cell_style(tag); + + if (!style) { + C4M_CRAISE("Style not found."); + } + + c4m_set_row_props(grid, row, style); +} + // This currently expects a tree[utf8] or tree[utf32]. Eventually // maybe would make it handle anything via it's repr. 
However, it // should also be restructured to be a single renderable item itself, diff --git a/src/con4m/numbers.c b/src/con4m/numbers.c index 1f990dd3..96ef8b65 100644 --- a/src/con4m/numbers.c +++ b/src/con4m/numbers.c @@ -450,13 +450,16 @@ u64_parse(char *s, } c4m_obj_t -f64_parse(char *s, c4m_lit_syntax_t st, char *litmod, c4m_lit_error_code_t *code) +f64_parse(char *s, + c4m_lit_syntax_t st, + char *litmod, + c4m_lit_error_code_t *code) { char *end; double *lit = c4m_new(c4m_tspec_f64()); double d = strtod(s, &end); - if (end == s || *end) { + if (end == s || !*end) { *code = LE_InvalidChar; return NULL; } diff --git a/src/con4m/object.c b/src/con4m/object.c index 39a8b285..58dc3657 100644 --- a/src/con4m/object.c +++ b/src/con4m/object.c @@ -348,6 +348,16 @@ const c4m_dt_info_t c4m_base_type_info[C4M_NUM_BUILTIN_DTS] = { .dt_kind = C4M_DT_KIND_func, }, { + // The idea from the library level behind refs is that they + // will always be pointers, but perhaps not even to one of our + // heaps. + // + // We need to take this into account if we need to dereference + // something here. Currently, this is only used for holding + // non-objects internally. + // + // Once we add proper references to the language, we might split + // out such internal references, IDK. .name = "ref", .alloc_len = sizeof(void *), .ptr_info = GC_SCAN_ALL, @@ -356,6 +366,10 @@ const c4m_dt_info_t c4m_base_type_info[C4M_NUM_BUILTIN_DTS] = { .hash_fn = HATRACK_DICT_KEY_TYPE_OBJ_PTR, }, { + // This is meant for runtime sum types. It's lightly used + // internally, and we may want to do something more + // sophisticated when deciding how to support this in the + // language proper. 
.name = "mixed", .typeid = C4M_T_GENERIC, .alloc_len = sizeof(c4m_mixed_t), diff --git a/src/con4m/streams.c b/src/con4m/streams.c index 8c55bbfc..c1221a70 100644 --- a/src/con4m/streams.c +++ b/src/con4m/streams.c @@ -334,7 +334,7 @@ c4m_stream_bytes_to_output(int64_t flags, char *buf, int64_t len) // marshal, so we don't have to go through an object to read out // things like ints that we plan on returning. -c4m_obj_t +c4m_obj_t * c4m_stream_raw_read(c4m_stream_t *stream, int64_t len, char *buf) { // If a buffer is provided, return the length and write into @@ -385,6 +385,54 @@ c4m_stream_raw_read(c4m_stream_t *stream, int64_t len, char *buf) } } +c4m_obj_t * +c4m_stream_read_all(c4m_stream_t *stream) +{ + c4m_xlist_t *l; + int outkind; + + outkind = stream->flags & (C4M_F_STREAM_UTF8_OUT | C4M_F_STREAM_UTF32_OUT); + + switch (outkind) { + case C4M_F_STREAM_UTF8_OUT: + l = c4m_new(c4m_tspec_xlist(c4m_tspec_utf8())); + break; + case C4M_F_STREAM_UTF32_OUT: + l = c4m_new(c4m_tspec_xlist(c4m_tspec_utf32())); + break; + default: + // Buffers. 
+ l = c4m_new(c4m_tspec_xlist(c4m_tspec_buffer())); + break; + } + while (true) { + c4m_obj_t *one = c4m_stream_raw_read(stream, PIPE_BUF, NULL); + if (outkind) { + if (c4m_str_codepoint_len((c4m_str_t *)one) == 0) { + break; + } + } + else { + if (c4m_buffer_len((c4m_buf_t *)one) == 0) { + break; + } + } + c4m_xlist_append(l, one); + } + if (outkind) { + c4m_str_t *s = c4m_str_join(l, c4m_empty_string()); + + if (outkind == C4M_F_STREAM_UTF8_OUT) { + return (c4m_obj_t *)c4m_to_utf8(s); + } + else { + return (c4m_obj_t *)c4m_to_utf32(s); + } + } + else { + return (c4m_obj_t *)c4m_buffer_join(l, NULL); + } +} size_t c4m_stream_raw_write(c4m_stream_t *stream, int64_t len, char *buf) { diff --git a/src/con4m/string.c b/src/con4m/string.c index 769efca1..c1d070cd 100644 --- a/src/con4m/string.c +++ b/src/con4m/string.c @@ -420,6 +420,8 @@ c4m_to_utf8(const c4m_utf32_t *inp) outloc += l; } + res->byte_len = (int32_t)(outloc - (uint8_t *)res->data); + c4m_copy_style_info(inp, res); return res; diff --git a/src/tests/test.c b/src/tests/test.c index c4f0c413..80e1b77c 100644 --- a/src/tests/test.c +++ b/src/tests/test.c @@ -577,6 +577,22 @@ c4m_rich_lit_test() c4m_print(test, test, c4m_kw("no_color", c4m_ka(true), "sep", c4m_ka('&'))); } +void +test_lex() +{ + c4m_str_t *fname = c4m_rich_lit("../tests/modparam.c4m"); + c4m_str_t *mname = c4m_rich_lit("test1"); + c4m_file_compile_ctx *ctx; + + ctx = c4m_new_compile_ctx( + mname, + c4m_kw("uri_path", c4m_ka(fname))); + + c4m_stream_t *stream = c4m_load_code(ctx); + c4m_lex(ctx, stream); + c4m_print(c4m_format_tokens(ctx)); +} + int main(int argc, char **argv, char **envp) { @@ -614,6 +630,8 @@ main(int argc, char **argv, char **envp) c4m_rich_lit_test(); c4m_print(c4m_box_u32((int32_t)-1)); c4m_print(c4m_box_i32((int32_t)-1)); + + test_lex(); C4M_STATIC_ASCII_STR(local_test, "Goodbye!"); // c4m_ansi_render(local_test, sout); c4m_print((c4m_obj_t *)local_test); diff --git a/tests/abort.c4m b/tests/abort.c4m new file mode 
100644 index 00000000..e6a99cfe --- /dev/null +++ b/tests/abort.c4m @@ -0,0 +1,4 @@ +x = 1 +x += 1 +x += 1 +abort() \ No newline at end of file diff --git a/tests/assert.c4m b/tests/assert.c4m new file mode 100644 index 00000000..bc9cb2ce --- /dev/null +++ b/tests/assert.c4m @@ -0,0 +1,3 @@ +assert 100 > 88 +assert 100 < 88 +assert true != false diff --git a/tests/assignops.c4m b/tests/assignops.c4m new file mode 100644 index 00000000..8c46abb2 --- /dev/null +++ b/tests/assignops.c4m @@ -0,0 +1,12 @@ +x = 2 +x += 2 +x -= 2 +assert x == 2 + +for i from 0 to 10 { + x += 2 + assert $len == 10 +} + +assert x == 22 + diff --git a/tests/attrs.c4m b/tests/attrs.c4m new file mode 100644 index 00000000..95c2b226 --- /dev/null +++ b/tests/attrs.c4m @@ -0,0 +1,8 @@ +foo.bar = "hello" +assert foo.bar == "hello" +foo.bar = "goodbye" +assert foo.bar == "goodbye" +boo.hoo = [0,1,2,3] +assert boo.hoo[2] == 2 +boo.hoo[2] = 4 +assert boo.hoo[2] == 4 diff --git a/tests/breaks.c4m b/tests/breaks.c4m new file mode 100644 index 00000000..0b7d1c17 --- /dev/null +++ b/tests/breaks.c4m @@ -0,0 +1,8 @@ +x = 12 +x = x + +/* Some +Long +Comment */ +y = x + +print(y) \ No newline at end of file diff --git a/tests/builtins.c4m b/tests/builtins.c4m new file mode 100644 index 00000000..476ad34f --- /dev/null +++ b/tests/builtins.c4m @@ -0,0 +1,6 @@ +x = "Hello, world!"'h1 + +print(x) +print(repr(x)) +print(osname()) +print(arch()) \ No newline at end of file diff --git a/tests/cb.c4m b/tests/cb.c4m new file mode 100644 index 00000000..4083409b --- /dev/null +++ b/tests/cb.c4m @@ -0,0 +1,10 @@ +func somethingcool(x: int) { + return true +} + +x = func somethingcool(int) -> bool +y = func print(`x) -> void + +assert repr(x) == "func somethingcool(int) -> bool" +assert repr(y) == "func con4m_print(`x) -> void" +assert x(2) == true diff --git a/tests/crash.c4m b/tests/crash.c4m new file mode 100644 index 00000000..a9949e12 --- /dev/null +++ b/tests/crash.c4m @@ -0,0 +1,5 @@ +func crasher(x) { + 
return crasher(x + 1) +} + +crasher(1) \ No newline at end of file diff --git a/tests/dict.c4m b/tests/dict.c4m new file mode 100644 index 00000000..ecf5d357 --- /dev/null +++ b/tests/dict.c4m @@ -0,0 +1,9 @@ +x = { "foo" : "Bar" } +x["bar"] = "Foo" +print(x["bar"]) +print(x["foo"]) +print(x["boz"]) + + + + diff --git a/tests/docstrings.c4m b/tests/docstrings.c4m new file mode 100644 index 00000000..dc19062e --- /dev/null +++ b/tests/docstrings.c4m @@ -0,0 +1,35 @@ +""" +This is my module. +Here is one of its two doc strings. +""" +"Here's it's other doc." + +extern callecho(a: ptr) -> cvoid { + "This has some docs too." + + local: print(x: string) -> void + pure: false +} + +extern echoanint(a: cint) -> cvoid { + local: print(x: int) -> void + pure: false +} + +func fib(x) { + "Also doc'd." + "And double doc'd." + + switch x { + case 0: + return 0 + case 1: + x = x - x + 1 + return x + else: + return fib(x - 2) + fib(x - 1) + } +} + +x = fib(10) +assert x == 55 diff --git a/tests/extern2.c4m b/tests/extern2.c4m new file mode 100644 index 00000000..2b5972d3 --- /dev/null +++ b/tests/extern2.c4m @@ -0,0 +1,9 @@ +extern callecho(a: ptr) -> ptr { + local: print(string) -> void + pure: false +} + + extern splitwrap(a: cstring, cstring) -> ptr { + local: split(x: string, y: string) -> list[string] + pure: true +} diff --git a/tests/extern_syntax.c4m b/tests/extern_syntax.c4m new file mode 100644 index 00000000..6194caf3 --- /dev/null +++ b/tests/extern_syntax.c4m @@ -0,0 +1,32 @@ +extern callecho(a: ptr) -> ptr { + local: print(x: string) -> void + pure: false +} + +extern echoanint(a: cint) -> cvoid { + local: print(x: int) -> void + pure: false +} + +extern exit(status: cint) -> cvoid { + local: exit(x: int) -> void + pure: false +} + +extern abort() -> cvoid { + local: abort() -> void + pure: false +} + + extern splitwrap(a: cstring, cstring) -> ptr { + local: split(x: string, y: string) -> list[string] + pure: true +} + +extern callecho2(a: ptr, b: ptr) -> ptr { + 
local: print(x: string) -> void + pure: false + holds: a + allocs: b, return +} + diff --git a/tests/fib.c4m b/tests/fib.c4m new file mode 100644 index 00000000..868b7064 --- /dev/null +++ b/tests/fib.c4m @@ -0,0 +1,9 @@ +func n(m) { + if m <= 1 { + return 1 + } + + return n(m - 1) + n(m - 2) +} + +assert n(18) == 4181 \ No newline at end of file diff --git a/tests/labels.c4m b/tests/labels.c4m new file mode 100644 index 00000000..b4903217 --- /dev/null +++ b/tests/labels.c4m @@ -0,0 +1,19 @@ +outer: + while true { + inner: + while true { + break outer + } + } + +outer: +while true { + inner: + while true { + break outer + } +} + +outer: while true { + break +} \ No newline at end of file diff --git a/tests/list.c4m b/tests/list.c4m new file mode 100644 index 00000000..b5639e69 --- /dev/null +++ b/tests/list.c4m @@ -0,0 +1,35 @@ +# Tests basic functionality of lists built into the language: +# 1. copying on assignment +# 2. container iteration +# 3. indexing +# 4. assignment to an index +# 5. slicing +# 6. 
assigning slices + +x = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10] +y = x + +for item in y { + assert item == ($i + 1) + x[$i] = 10 +} + +for item in x { + assert item == 10 + assert y[$i] == $i + 1 +} + +sum = 0 + +for item in x[1:-1] { + sum += item +} + +assert sum == 80 +x[1:-1] = [5] +sum = 0 + +for item in x { + sum += item +} +assert sum == 25 \ No newline at end of file diff --git a/tests/lock.c4m b/tests/lock.c4m new file mode 100644 index 00000000..bcefcb1e --- /dev/null +++ b/tests/lock.c4m @@ -0,0 +1,3 @@ +~foo.bar = "hello" +foo.bar = "goodbye" + diff --git a/tests/lock2.c4m b/tests/lock2.c4m new file mode 100644 index 00000000..ed41f436 --- /dev/null +++ b/tests/lock2.c4m @@ -0,0 +1,6 @@ +foo.bar = "hello" +assert foo.bar == "hello" +~foo.bar +foo.bar = "goodbye" +assert foo.bar == "goodbye" + diff --git a/tests/modparam.c4m b/tests/modparam.c4m new file mode 100644 index 00000000..b618ea18 --- /dev/null +++ b/tests/modparam.c4m @@ -0,0 +1,30 @@ +func value_provider() { + return 101 +} + +func example_checker(x) { + result = "" + + if (x % 2) != 0 { + result = "Parameter value must be even." + } +} + +parameter var example1 { + "This should be some documentation." + "Also this." + default: 100 + validator: func example_checker(int) -> string +} + +parameter var example2 { + "This should be some documentation." + "Also this." + initialize: func value_provider() + validator: func example_checker(int) -> string +} + +# Neither of these should happen. 
+print(example1) +print(example2) +assert false \ No newline at end of file diff --git a/tests/olitmod.c4m b/tests/olitmod.c4m new file mode 100644 index 00000000..d3eed5c6 --- /dev/null +++ b/tests/olitmod.c4m @@ -0,0 +1,6 @@ +shouldwork = "2 gb"'sz + +var shouldnt: duration + +shouldnt = "2 gb"'sz + diff --git a/tests/olits.c4m b/tests/olits.c4m new file mode 100644 index 00000000..95673864 --- /dev/null +++ b/tests/olits.c4m @@ -0,0 +1,2 @@ +way1 = "2 gb"'sz +qwoi := 2 gb diff --git a/tests/params.c4m b/tests/params.c4m new file mode 100644 index 00000000..e6c27b2d --- /dev/null +++ b/tests/params.c4m @@ -0,0 +1,39 @@ +# This will obviously infer `int`, but I eventually want to warn on +# this situation, which is why I've written it this way. +# +# Also, currently, we don't accept the type removal here; we *could* +# infer list | dict | tuple, but that complicates the code generation +# a bit. +# +# But I'm not sure that's particularly necessary. Once there is enough +# API, being able to leverage info about the calls made will generally +# be able to help us make correct inferences. + +func l_test(n: list[`t]) { + n[2] = 100 +} + +func d_test(d: dict[`t, `v]) { + d[1] = 4 +} + +func t_test(t: tuple[`x, `y]) { + t[0] = 1 +} + + +# Testing to make sure that containers have reference semantics. 
+ +x = [1, 2, 3, 4, 5, 6, 7] + +l_test(x) + +assert x[2] == 100 + +z = (2, 3) +t_test(z) +assert z[0] == 1 + +y = { 1: 2, 3 : 4 } +d_test(y) +assert y[1] == 4 \ No newline at end of file diff --git a/tests/resume1.c4m b/tests/resume1.c4m new file mode 100644 index 00000000..83f56839 --- /dev/null +++ b/tests/resume1.c4m @@ -0,0 +1,17 @@ +confspec { + singleton example { + field intval { + type: int + default: 0 + + } + } + + root { + allow: example + } +} + +print(example.intval) +example.intval += 1 +print(example.intval) \ No newline at end of file diff --git a/tests/rich.c4m b/tests/rich.c4m new file mode 100644 index 00000000..f44371fa --- /dev/null +++ b/tests/rich.c4m @@ -0,0 +1,29 @@ +func crappylen(x: list[`t]) { + for i in x { + return $len + } +} + +r = "Hello, world!"'h2 + +r.print() + +n = [["Name", "Country", "City"], + ["John"'em, "USA", "NYC"], + ["Brandon"'em, "USA", "NYC"], + ["Liming"'em, "USA", "NYC"], + ["Miroslav"'em, "USA", "Lawn Guy Land"], + ["Rich"'em, "USA", "Providence, RI"], + ["Matt"'em, "USA", "Orange"], + ["Mark"'em, "UK", "Brighton"], + ["Hugo"'em, "UK", "Brighton"], + ["James"'em, "UK", "Brighton"], + ["Max"'em, "UK", "Brighton"], + ["Theo"'em, "Greece", "Athens"], + ["Thomas"'em, "Greece", "Athens"], + ["James II"'em, "Germany", "Berlin"]] + +assert crappylen(n) == len(n) + +print("Hello, world!"'h1 + "Here's " + "emphasis"'em + " for my table: ") +print(n.table()) diff --git a/tests/richmarshal.c4m b/tests/richmarshal.c4m new file mode 100644 index 00000000..3f218dd4 --- /dev/null +++ b/tests/richmarshal.c4m @@ -0,0 +1,3 @@ +r = "Hello, world!"'h2 + "Yay!"'h4 + +r.print() \ No newline at end of file diff --git a/tests/sections.c4m b/tests/sections.c4m new file mode 100644 index 00000000..9bba7620 --- /dev/null +++ b/tests/sections.c4m @@ -0,0 +1,24 @@ +## all of these should work +hello {} +hello world {} +hello "world" {} +hello { + world : 2 +} + +hello { + world { + + } +} + +# These no longer work, as it is too easy to do this 
on accident. +# But the error probably should be improved. + +hello world +hello "world" + +# This should not work; should give a use-before-def. +# But for now this is disabled. +foo + diff --git a/tests/sigoverlap.c4m b/tests/sigoverlap.c4m new file mode 100644 index 00000000..557abae1 --- /dev/null +++ b/tests/sigoverlap.c4m @@ -0,0 +1,24 @@ +func ex2(x: `t) { + typeof x { + case int, i32: + print("hi") + case dict[`t, int]: + print("Int value") + case dict[string, string]: + print("Word.") + } +} + +func ex2(lmno: `t) { + typeof lmno { + case int, i32: + print("hi") + case dict[`t, int]: + print("Int value") + case dict[string, string]: + print("Word.") + else { + print("foo") + } + } +} diff --git a/tests/spec1.c4m b/tests/spec1.c4m new file mode 100644 index 00000000..a962295b --- /dev/null +++ b/tests/spec1.c4m @@ -0,0 +1,44 @@ +func somethingcool(x: int) { + return true +} + +confspec { + + singleton test { + user_def_ok: true + #validator: func somethingcool(int) -> bool + + field audit_id { + type: int + default: 176 + range: 0, 100 + #validator: func somethingcool(int) -> bool + } + + } + + named test2 { + field audit_location { + type: string + default: "test 1" + #validator: func somethingcool(int) -> bool + } + } + + root { + allow: test + require: test2 + + field log_level { + type: string + require: true + default: "info" + choices: ["trace", "info", "warn", "error", "fatal"] + } + } +} + +assert test.audit_id == 176 +assert log_level == "info" +log_level = "warn" +assert log_level == "warn" \ No newline at end of file diff --git a/tests/spec2.c4m b/tests/spec2.c4m new file mode 100644 index 00000000..1fb80ea5 --- /dev/null +++ b/tests/spec2.c4m @@ -0,0 +1,38 @@ +confspec { + + singleton test { + user_def_ok: true + + field max { + type: int + default: 0xffffffffffffffff + lock: true + } + + field audit_id { + type: string + default: "foo" + range: "please", "fail" + } + } + + named test2 { + field audit_location { + type: string + default: "test 1" 
+ } + } + + root { + allow: test + allow: test2 + + field log_level { + type: string + require: true + default: "info" + choices: ["trace", "info", "warn", "error", "fatal"] + } + } +} + diff --git a/tests/str.c4m b/tests/str.c4m new file mode 100644 index 00000000..772ab74a --- /dev/null +++ b/tests/str.c4m @@ -0,0 +1,7 @@ +s = ["this", "is", "a", "test"] + +l = join(s, " ") +assert join(s, " ") == "this is a test" +assert upper(l) == "THIS IS A TEST" +assert split(join(s, " "), " ") == s +assert pad("foo", 10) == "foo " \ No newline at end of file diff --git a/tests/tup.c4m b/tests/tup.c4m new file mode 100644 index 00000000..4ff95886 --- /dev/null +++ b/tests/tup.c4m @@ -0,0 +1,25 @@ +x = (1, "foo", 3) +#assert x[0] == 1 +#assert x[1] == "foo" +#assert x[2] == 3 + +y = x +y[1] = "blah" + +assert x[0] == 1 +assert x[1] == "foo" +assert x[2] == 3 + +x[0] = 4 + +assert y[0] == 1 +assert y[1] == "blah" +assert y[2] == 3 + +x[1] = y[1] + +(a, b, c) = x + +assert a == 4 +assert b == "blah" +assert c == 3 \ No newline at end of file diff --git a/tests/use1.c4m b/tests/use1.c4m new file mode 100644 index 00000000..c24c93de --- /dev/null +++ b/tests/use1.c4m @@ -0,0 +1,3 @@ +use use2 from "." + +assert(fact(4) == 24) diff --git a/tests/use2.c4m b/tests/use2.c4m new file mode 100644 index 00000000..7d87c631 --- /dev/null +++ b/tests/use2.c4m @@ -0,0 +1,7 @@ +func fact(x) { + if x < 2 { + return 1; + } + + return x * fact(x - 1); +} \ No newline at end of file diff --git a/tests/usemissing.c4m b/tests/usemissing.c4m new file mode 100644 index 00000000..73cbc6bc --- /dev/null +++ b/tests/usemissing.c4m @@ -0,0 +1 @@ +use missing \ No newline at end of file diff --git a/tests/valueof.c4m b/tests/valueof.c4m new file mode 100644 index 00000000..1904b934 --- /dev/null +++ b/tests/valueof.c4m @@ -0,0 +1,14 @@ +func fib(x) { + switch x { + case 0: + return 0 + case 1: + return 1 + else: + return fib(x - 2) + fib(x - 1) + } +} + +x = fib(10) +assert x == 55 +assert repr(x) == "55"