From ac357d9ef23a07551dc6f5b06bf23f9b165c2b35 Mon Sep 17 00:00:00 2001 From: Tom Wilkie Date: Tue, 10 May 2011 13:49:14 +0100 Subject: [PATCH] libcastle from acunutils.hg:d724e196c846 --- Makefile | 28 ++ castle.h | 420 +++++++++++++++++++ castle_convenience.c | 725 ++++++++++++++++++++++++++++++++ castle_front.c | 641 ++++++++++++++++++++++++++++ castle_ioctl.c | 169 ++++++++ castle_print.c | 139 ++++++ castle_private.h | 52 +++ castle_public.h | 588 ++++++++++++++++++++++++++ castle_utils.c | 262 ++++++++++++ list.h | 979 +++++++++++++++++++++++++++++++++++++++++++ ring.h | 343 +++++++++++++++ versions | 78 ++++ 12 files changed, 4424 insertions(+) create mode 100644 Makefile create mode 100644 castle.h create mode 100644 castle_convenience.c create mode 100644 castle_front.c create mode 100644 castle_ioctl.c create mode 100644 castle_print.c create mode 100644 castle_private.h create mode 100644 castle_public.h create mode 100644 castle_utils.c create mode 100644 list.h create mode 100644 ring.h create mode 100644 versions diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..4aeec99 --- /dev/null +++ b/Makefile @@ -0,0 +1,28 @@ +CFLAGS=-fPIC -DPIC -std=gnu99 -ggdb -Wmissing-prototypes -Wmissing-declarations -Wstrict-prototypes -Wall -Wextra -Wshadow -Werror -O2 +LIB_DESTDIR=$(DESTDIR)/usr/lib64 +INC_DESTDIR=$(DESTDIR)/usr/include/castle + +SONAME=libcastle.so.1 + +all: $(SONAME) + +%.o: %.c *.h + gcc -pthread -c -o $@ $< $(CFLAGS) + +$(SONAME): castle_front.o castle_ioctl.o castle_convenience.o castle_print.o castle_utils.o + gcc -pthread -shared -Wl,-Bsymbolic -Wl,-soname,$(SONAME) -Wl,--warn-common -Wl,--fatal-warnings -Wl,--version-script=versions -o $@ $^ $(CFLAGS) + +install: $(SONAME) + mkdir -p $(LIB_DESTDIR) + install $(SONAME) $(LIB_DESTDIR) + ln -sf $(SONAME) $(LIB_DESTDIR)/libcastle.so + if [ -z "$(DONT_RUN_LDCONFIG)" ]; then \ + ldconfig; \ + fi + + mkdir -p $(INC_DESTDIR) + install castle_public.h $(INC_DESTDIR) + install castle.h $(INC_DESTDIR) + +clean: + rm -rf *.o *.so* diff --git a/castle.h b/castle.h new file mode 100644 index 0000000..7c09dd5 --- /dev/null +++ b/castle.h @@ -0,0 +1,420 @@ +#include +#include + +#include "castle_public.h" + +#ifndef __CASTLE_FRONT_H__ +#define __CASTLE_FRONT_H__ + +#define CASTLE_NODE "/dev/castle-fs/control" + +#ifdef __GNUC_STDC_INLINE__ +#define ONLY_INLINE extern __inline__ __attribute__((__gnu_inline__)) +#else +#define ONLY_INLINE extern __inline__ +#endif + +//#define TRACE +#ifdef TRACE +extern volatile unsigned long ops_counter; +extern volatile unsigned long selects_counter; +extern volatile unsigned long ioctls_counter; +#endif + +/* deref the char buffer from a castle_buffer* */ +#define CASTLE_BUF_GET(x) ((x)?(*(char**)(x)):(char*)NULL) + +struct castle_front_connection; + +struct s_castle_buffer +{ + /* buf must be first member for CASTLE_BUF_GET macro */ + char* buf; + size_t buflen; +}; + +/* Type names are all over the place - create a consistent set of typedefs for the public API */ +typedef struct castle_front_connection castle_connection; +typedef struct s_castle_buffer castle_buffer; +typedef c_vl_okey_t castle_key; +typedef c_vl_key_t castle_key_part; +typedef castle_request_t castle_request; +typedef castle_response_t castle_response; +typedef castle_interface_token_t castle_token; +typedef collection_id_t castle_collection; +typedef slave_uuid_t castle_slave_uuid; +typedef version_t castle_version; +typedef c_env_var_t castle_env_var_id; + +typedef void (*castle_callback) (castle_connection 
*connection, + castle_response *response, void *userdata); + +struct castle_blocking_call +{ + int completed; + int err; + uint64_t length; + castle_token token; +}; + +int castle_shared_buffer_create (castle_connection *conn, + char **buffer, + unsigned long size) __attribute__((warn_unused_result)); +int castle_shared_buffer_destroy (castle_connection *conn, + char *buffer, + unsigned long size); +int castle_shared_buffer_allocate (castle_connection *conn, + castle_buffer **buffer_out, unsigned long size) __attribute__((warn_unused_result)); +int castle_shared_buffer_release (castle_connection *conn, castle_buffer* buffer); +int castle_connect (castle_connection **conn) __attribute__((warn_unused_result)); +void castle_disconnect (castle_connection *conn); +void castle_free (castle_connection *conn); +void castle_request_send (castle_connection *conn, + castle_request *req, + castle_callback *callbacks, + void **userdatas, + int reqs_count); +int castle_request_do_blocking (castle_connection *conn, + castle_request *req, + struct castle_blocking_call *blocking_call); +int castle_request_do_blocking_multi(castle_connection *conn, + castle_request *req, + struct castle_blocking_call *blocking_call, + int count); + +int castle_print_key(FILE *f, castle_key *key); +int castle_print_request(FILE *f, castle_request *req, int print_values); +int castle_print_response(FILE *f, castle_response *resp, int print_values); + +extern void castle_replace_prepare(castle_request *req, castle_collection collection, castle_key *key, uint32_t key_len, char *value, uint32_t value_len) __attribute__((always_inline)); +ONLY_INLINE void castle_replace_prepare(castle_request *req, castle_collection collection, castle_key *key, uint32_t key_len, char *value, uint32_t value_len) { + req->tag = CASTLE_RING_REPLACE; + req->replace.collection_id = collection; + req->replace.key_ptr = key; + req->replace.key_len = key_len; + req->replace.value_ptr = value; + req->replace.value_len = value_len; +} + +extern void castle_remove_prepare(castle_request *req, castle_collection collection, castle_key *key, uint32_t key_len) __attribute__((always_inline)); +ONLY_INLINE void castle_remove_prepare(castle_request *req, castle_collection collection, castle_key *key, uint32_t key_len) { + req->tag = CASTLE_RING_REMOVE; + req->replace.collection_id = collection; + req->replace.key_ptr = key; + req->replace.key_len = key_len; +} + +extern void castle_get_prepare(castle_request *req, castle_collection collection, castle_key *key, uint32_t key_len, char *buffer, uint32_t buffer_len) __attribute__((always_inline)); +ONLY_INLINE void castle_get_prepare(castle_request *req, castle_collection collection, castle_key *key, uint32_t key_len, char *buffer, uint32_t buffer_len) { + req->tag = CASTLE_RING_GET; + req->get.collection_id = collection; + req->get.key_ptr = key; + req->get.key_len = key_len; + req->get.value_ptr = buffer; + req->get.value_len = buffer_len; +} + +extern void castle_iter_start_prepare(castle_request *req, castle_collection collection, castle_key *start_key, uint32_t start_key_len, castle_key *end_key, uint32_t end_key_len, uint64_t flags) __attribute__((always_inline)); +ONLY_INLINE void castle_iter_start_prepare(castle_request *req, castle_collection collection, castle_key *start_key, uint32_t start_key_len, castle_key *end_key, uint32_t end_key_len, uint64_t flags) { + req->tag = CASTLE_RING_ITER_START; + req->iter_start.collection_id = collection; + req->iter_start.start_key_ptr = start_key; + 
req->iter_start.start_key_len = start_key_len; + req->iter_start.end_key_ptr = end_key; + req->iter_start.end_key_len = end_key_len; + req->iter_start.flags = flags; +} + +extern void castle_iter_next_prepare(castle_request *req, castle_token token, char *buffer, uint32_t buffer_len) __attribute__((always_inline)); +ONLY_INLINE void castle_iter_next_prepare(castle_request *req, castle_token token, char *buffer, uint32_t buffer_len) { + req->tag = CASTLE_RING_ITER_NEXT; + req->iter_next.token = token; + req->iter_next.buffer_ptr = buffer; + req->iter_next.buffer_len = buffer_len; +} + +extern void castle_iter_finish_prepare(castle_request *req, castle_token token) __attribute__((always_inline)); +ONLY_INLINE void castle_iter_finish_prepare(castle_request *req, castle_token token) { + req->tag = CASTLE_RING_ITER_FINISH; + req->iter_finish.token = token; +} + +extern void castle_big_put_prepare(castle_request *req, castle_collection collection, castle_key *key, uint32_t key_len, uint64_t value_len) __attribute__((always_inline)); +ONLY_INLINE void castle_big_put_prepare(castle_request *req, castle_collection collection, castle_key *key, uint32_t key_len, uint64_t value_len) { + req->tag = CASTLE_RING_BIG_PUT; + req->big_put.collection_id = collection; + req->big_put.key_ptr = key; + req->big_put.key_len = key_len; + req->big_put.value_len = value_len; +} + +extern void castle_put_chunk_prepare(castle_request *req, castle_token token, char *buffer, uint32_t buffer_len) __attribute__((always_inline)); +ONLY_INLINE void castle_put_chunk_prepare(castle_request *req, castle_token token, char *buffer, uint32_t buffer_len) { + req->tag = CASTLE_RING_PUT_CHUNK; + req->put_chunk.token = token; + req->put_chunk.buffer_ptr = buffer; + req->put_chunk.buffer_len = buffer_len; +} + +extern void castle_big_get_prepare(castle_request *req, castle_collection collection, castle_key *key, uint32_t key_len) __attribute__((always_inline)); +ONLY_INLINE void castle_big_get_prepare(castle_request *req, castle_collection collection, castle_key *key, uint32_t key_len) { + req->tag = CASTLE_RING_BIG_GET; + req->big_get.collection_id = collection; + req->big_get.key_ptr = key; + req->big_get.key_len = key_len; +} + +extern void castle_get_chunk_prepare(castle_request *req, castle_token token, char *buffer, uint32_t buffer_len) __attribute__((always_inline)); +ONLY_INLINE void castle_get_chunk_prepare(castle_request *req, castle_token token, char *buffer, uint32_t buffer_len) { + req->tag = CASTLE_RING_GET_CHUNK; + req->get_chunk.token = token; + req->get_chunk.buffer_ptr = buffer; + req->get_chunk.buffer_len = buffer_len; +} + +/* Assembles a castle_key at the given location, where buf_len is the + * number of bytes allocated, and dims/key_lens/keys are the + * parameters of the key. Returns zero on success, or the number of + * bytes needed to build the key if it won't fit in buf_len bytes. + * + * When invoked with key == NULL, buf_len == 0, or keys == NULL, only + * returns the necessary size + * + * When invoked with key_lens == NULL, uses strlen() to compute the lengths + */ +uint32_t castle_build_key(castle_key *key, size_t buf_len, int dims, const int *key_lens, const uint8_t * const*keys); +/* Variation on the theme, always returns the number of bytes needed. Success is when that value is <= buf_len. 
Mostly just here because java-castle wants it */ +uint32_t castle_build_key_len(castle_key *key, size_t buf_len, int dims, const int *key_lens, const uint8_t * const*keys); +/* Returns the number of bytes needed for a key with these parameters */ +uint32_t castle_key_bytes_needed(int dims, const int *key_lens, const uint8_t * const*keys) __attribute__((pure)); + +/* Convenience functions - some of these incur copies */ + +/* Call as castle_alloca_key("foo") or castle_alloca_key("foo", "bar"). Likely to compile down to nothing. Only really useful when calling the convenience functions */ +#define castle_alloca_key(...) ({ \ + const char *ks[] = { __VA_ARGS__ }; \ + uint32_t nr_dims = sizeof(ks) / sizeof(const char *); \ + castle_key *okey = alloca(sizeof(*okey) + sizeof(okey->dims[0]) * nr_dims); \ + int i; \ + okey->nr_dims = nr_dims; \ + for (i = 0; i < nr_dims; i++) { \ + size_t len = strlen(ks[i]); \ + castle_key_part *key = alloca(sizeof(*key) + len); \ + memcpy(key->key, ks[i], len); \ + key->length = len; \ + okey->dims[i] = key; \ + } \ + okey; }) + +castle_key *castle_malloc_key(int dims, const int *key_lens, const uint8_t * const*keys) __attribute__((malloc)); + +extern uint32_t castle_key_dims(const castle_key *key) __attribute__((always_inline)); +ONLY_INLINE uint32_t castle_key_dims(const castle_key *key) { + return key->nr_dims; +} + +extern uint32_t castle_key_elem_len(const castle_key *key, int elem) __attribute__((always_inline)); +ONLY_INLINE uint32_t castle_key_elem_len(const castle_key *key, int elem) { + return key->dims[elem]->length; +} + +extern const uint8_t *castle_key_elem_data(const castle_key *key, int elem) __attribute__((always_inline)); +ONLY_INLINE const uint8_t *castle_key_elem_data(const castle_key *key, int elem) { + return key->dims[elem]->key; +} + +int castle_get (castle_connection *conn, + castle_collection collection, + castle_key *key, + char **value_out, uint32_t *value_len_out) __attribute__((warn_unused_result)); +int castle_replace (castle_connection *conn, + castle_collection collection, + castle_key *key, + char *val, uint32_t val_len); +int castle_remove (castle_connection *conn, + castle_collection collection, + castle_key *key); +int castle_iter_start (castle_connection *conn, + castle_collection collection, + castle_key *start_key, + castle_key *end_key, + castle_token *token_out) __attribute__((warn_unused_result)); +int castle_iter_next (castle_connection *conn, + castle_token token, + struct castle_key_value_list **kvs, + uint32_t buf_size) __attribute__((warn_unused_result)); +int castle_iter_finish (castle_connection *conn, + castle_token token); +int castle_getslice (castle_connection *conn, + castle_collection collection, + castle_key *start_key, + castle_key *end_key, + struct castle_key_value_list **kvs_out, + uint32_t limit) __attribute__((warn_unused_result)); +void castle_kvs_free (struct castle_key_value_list *kvs_out); +int castle_big_put (castle_connection *conn, + castle_collection collection, + castle_key *key, + uint64_t val_length, + castle_token *token_out); +int castle_put_chunk (castle_connection *conn, + castle_token token, + char *value, uint32_t value_len); +int castle_big_get (castle_connection *conn, + castle_collection collection, + castle_key *key, + castle_token *token_out, uint64_t *value_len_out) __attribute__((warn_unused_result)); +int castle_get_chunk (castle_connection *conn, + castle_token token, + char **value_out, uint32_t *value_len_out) __attribute__((warn_unused_result)); + +/* Control functions - 
ioctls */ + +#define C_TYPE_uint32 uint32_t +#define C_TYPE_uint64 uint64_t +#define C_TYPE_slave_uuid castle_slave_uuid +#define C_TYPE_version castle_version +#define C_TYPE_size size_t +#define C_TYPE_string const char * +#define C_TYPE_collection_id castle_collection +#define C_TYPE_env_var castle_env_var_id +#define C_TYPE_int int +#define C_TYPE_int32 int32_t + +#define CASTLE_IOCTL_0IN_0OUT(_id, _name) \ + int castle_##_id (castle_connection *conn); + +#define CASTLE_IOCTL_1IN_0OUT(_id, _name, _arg_1_t, _arg_1) \ + int castle_##_id (castle_connection *conn, C_TYPE_##_arg_1_t _arg_1); + +#define CASTLE_IOCTL_1IN_1OUT(_id, _name, _arg_1_t, _arg_1, _ret_1_t, _ret) \ + int castle_##_id (castle_connection *conn, C_TYPE_##_arg_1_t _arg_1, \ + C_TYPE_##_ret_1_t * _ret); + +#define CASTLE_IOCTL_2IN_0OUT(_id, _name, _arg_1_t, _arg_1, _arg_2_t, _arg_2) \ + int castle_##_id (castle_connection *conn, \ + C_TYPE_##_arg_1_t _arg_1, C_TYPE_##_arg_2_t _arg_2); \ + +#define CASTLE_IOCTL_3IN_1OUT(_id, _name, \ + _arg_1_t, _arg_1, _arg_2_t, _arg_2, _arg_3_t, _arg_3, \ + _ret_1_t, _ret) \ + int castle_##_id (castle_connection *conn, \ + C_TYPE_##_arg_1_t _arg_1, C_TYPE_##_arg_2_t _arg_2, C_TYPE_##_arg_3_t _arg_3, \ + C_TYPE_##_ret_1_t * _ret); + + +#define CASTLE_IOCTLS \ + CASTLE_IOCTL_1IN_1OUT( \ + claim, \ + CASTLE_CTRL_CLAIM, \ + uint32, dev, slave_uuid, id) \ + CASTLE_IOCTL_1IN_0OUT( \ + release, \ + CASTLE_CTRL_RELEASE, \ + slave_uuid, id) \ + CASTLE_IOCTL_1IN_1OUT( \ + attach, \ + CASTLE_CTRL_ATTACH, \ + version, version, uint32, dev) \ + CASTLE_IOCTL_1IN_0OUT( \ + detach, \ + CASTLE_CTRL_DETACH, \ + uint32, dev) \ + CASTLE_IOCTL_1IN_1OUT( \ + snapshot, \ + CASTLE_CTRL_SNAPSHOT, \ + uint32, dev, version, version) \ + CASTLE_IOCTL_3IN_1OUT( \ + collection_attach, \ + CASTLE_CTRL_COLLECTION_ATTACH, \ + version, version, string, name, size, name_length, collection_id, collection) \ + CASTLE_IOCTL_1IN_0OUT( \ + collection_detach, \ + CASTLE_CTRL_COLLECTION_DETACH, \ + collection_id, collection) \ + CASTLE_IOCTL_1IN_1OUT( \ + collection_snapshot, \ + CASTLE_CTRL_COLLECTION_SNAPSHOT, \ + collection_id, collection, version, version) \ + CASTLE_IOCTL_1IN_1OUT( \ + create, \ + CASTLE_CTRL_CREATE, \ + uint64, size, version, id) \ + CASTLE_IOCTL_1IN_1OUT( \ + clone, \ + CASTLE_CTRL_CLONE, \ + version, version, version, clone) \ + CASTLE_IOCTL_2IN_0OUT( \ + destroy, \ + CASTLE_CTRL_DESTROY, \ + version, version, int32, flag) \ + CASTLE_IOCTL_0IN_0OUT( \ + init, \ + CASTLE_CTRL_INIT) \ + CASTLE_IOCTL_2IN_0OUT( \ + fault, \ + CASTLE_CTRL_FAULT, \ + uint32, fault_id, uint32, fault_arg) \ + CASTLE_IOCTL_2IN_0OUT( \ + slave_evacuate, \ + CASTLE_CTRL_SLAVE_EVACUATE, \ + slave_uuid, id, uint32, force) \ + CASTLE_IOCTL_1IN_0OUT( \ + slave_scan, \ + CASTLE_CTRL_SLAVE_SCAN, \ + uint32, id) \ + CASTLE_IOCTL_1IN_0OUT( \ + thread_priority, \ + CASTLE_CTRL_THREAD_PRIORITY, \ + uint32, nice_value) + +#define PRIVATE_CASTLE_IOCTLS \ + CASTLE_IOCTL_3IN_1OUT( \ + environment_set, \ + CASTLE_CTRL_ENVIRONMENT_SET, \ + env_var, var_id, string, var_str, size, var_len, int, ret) \ + CASTLE_IOCTL_2IN_0OUT( \ + trace_setup, \ + CASTLE_CTRL_TRACE_SETUP, \ + string, dir_str, size, dir_len) \ + CASTLE_IOCTL_0IN_0OUT( \ + trace_start, \ + CASTLE_CTRL_TRACE_START) \ + CASTLE_IOCTL_0IN_0OUT( \ + trace_stop, \ + CASTLE_CTRL_TRACE_STOP) \ + CASTLE_IOCTL_0IN_0OUT( \ + trace_teardown, \ + CASTLE_CTRL_TRACE_TEARDOWN) \ + +CASTLE_IOCTLS +PRIVATE_CASTLE_IOCTLS + +#undef CASTLE_IOCTL_0IN_0OUT +#undef CASTLE_IOCTL_1IN_0OUT +#undef 
CASTLE_IOCTL_1IN_1OUT +#undef CASTLE_IOCTL_2IN_0OUT +#undef CASTLE_IOCTL_3IN_1OUT + +uint32_t castle_device_to_devno(const char *filename); +const char *castle_devno_to_device(uint32_t devno); + +/* Convenience methods which don't use the hated device number */ +int castle_claim_dev(castle_connection *conn, const char *filename, castle_slave_uuid *id_out); +int castle_attach_dev(castle_connection *conn, castle_version version, const char **filename_out) __attribute__((warn_unused_result)); +int castle_detach_dev(castle_connection *conn, const char *filename); +int castle_snapshot_dev(castle_connection *conn, const char *filename, castle_version *version_out); + +uint32_t castle_max_buffer_size(void); + +/* Shared buffer pool */ +typedef struct s_castle_shared_pool castle_shared_pool; +int castle_shared_pool_create(castle_connection* conn, size_t nsizes, size_t* sizes, size_t* quantities, castle_shared_pool** pool_out); +int castle_shared_pool_destroy(castle_shared_pool* pool); +int castle_shared_pool_lease(castle_shared_pool* pool, castle_buffer** buffer_out, unsigned long size); +int castle_shared_pool_release(castle_shared_pool* pool, castle_buffer* buffer, unsigned long size); + +/* Collection utils */ +int castle_collection_find(const char* name, castle_collection* collection_out); + +#endif /* __CASTLE_FRONT_H__ */ diff --git a/castle_convenience.c b/castle_convenience.c new file mode 100644 index 0000000..22b12ef --- /dev/null +++ b/castle_convenience.c @@ -0,0 +1,725 @@ +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "castle.h" + +uint32_t +castle_build_key_len(castle_key *key, size_t buf_len, int dims, const int *key_lens, const uint8_t * const*keys) { + int *lens = (int *)key_lens; + + if (!key_lens && dims) { + if (!keys) + abort(); + lens = alloca(dims * sizeof(lens[0])); + for (int i = 0; i < dims; i++) + lens[i] = strlen((const char *)keys); + } + + uint32_t needed = sizeof(castle_key) + sizeof(key->dims[0]) * dims + sizeof(*key->dims[0]) * dims; + for (int i = 0; i < dims; i++) + needed += lens[i]; + + if (!key || buf_len == 0 || !keys || buf_len < needed) + return needed; + + key->nr_dims = dims; + char *ptr = (char *)key + sizeof(*key) + sizeof(key->dims[0]) * dims; + for (int i = 0; i < dims; i++) { + key->dims[i] = (c_vl_key_t *)ptr; + key->dims[i]->length = lens[i]; + memcpy(key->dims[i]->key, keys[i], lens[i]); + ptr += sizeof(*key->dims[i]) + lens[i]; + } + + assert(ptr - (char *)key == (int64_t)needed); + + return needed; +} + +uint32_t +castle_build_key(castle_key *key, size_t buf_len, int dims, const int *key_lens, const uint8_t * const*keys) { + uint32_t needed = castle_build_key_len(key, buf_len, dims, key_lens, keys); + if (needed <= buf_len) + return 0; + else + return needed; +} + +uint32_t +castle_key_bytes_needed(int dims, const int *key_lens, const uint8_t * const*keys) { + return castle_build_key(NULL, 0, dims, key_lens, keys); +} + +castle_key * +castle_malloc_key(int dims, const int *key_lens, const uint8_t * const*keys) { + uint32_t len = castle_key_bytes_needed(dims, key_lens, keys); + castle_key *key = malloc(len); + if (!key) + return NULL; + if (0 != castle_build_key(key, len, dims, key_lens, keys)) + abort(); + return key; +} + +static int make_key_buffer(castle_connection *conn, castle_key *key, uint32_t extra_space, char **key_buf_out, uint32_t *key_len_out) { + int dims = key->nr_dims; + int lens[dims]; + uint8_t *keys[dims]; + char *key_buf; + uint32_t key_len; + int err; 
+ + for (int i = 0; i < dims; i++) { + lens[i] = key->dims[i]->length; + keys[i] = key->dims[i]->key; + } + + key_len = castle_key_bytes_needed(dims, lens, NULL); + + err = castle_shared_buffer_create(conn, &key_buf, key_len + extra_space); + if (err) + return err; + + { + int r = castle_build_key((castle_key *)key_buf, key_len, dims, lens, (const uint8_t *const *)keys); + if (r != 0) + /* impossible */ + abort(); + } + + *key_buf_out = key_buf; + *key_len_out = key_len; + return 0; +} + +static int make_2key_buffer(castle_connection *conn, castle_key *key1, castle_key *key2, char **key_buf_out, uint32_t *key1_len_out, uint32_t *key2_len_out) { + int dims1 = key1->nr_dims; + int dims2 = key2->nr_dims; + int lens1[dims1]; + int lens2[dims2]; + uint8_t *keys1[dims1]; + uint8_t *keys2[dims2]; + char *key_buf; + uint32_t key1_len; + uint32_t key2_len; + int err; + + for (int i = 0; i < dims1; i++) { + lens1[i] = key1->dims[i]->length; + keys1[i] = key1->dims[i]->key; + } + + for (int i = 0; i < dims2; i++) { + lens2[i] = key2->dims[i]->length; + keys2[i] = key2->dims[i]->key; + } + + key1_len = castle_key_bytes_needed(dims1, lens1, NULL); + key2_len = castle_key_bytes_needed(dims2, lens2, NULL); + + err = castle_shared_buffer_create(conn, &key_buf, key1_len + key2_len); + if (err) + return err; + + { + int r = castle_build_key((castle_key *)key_buf, key1_len, dims1, lens1, (const uint8_t *const *)keys1); + if (r != 0) + /* impossible */ + abort(); + } + + { + int r = castle_build_key((castle_key *)(key_buf + key1_len), key2_len, dims2, lens2, (const uint8_t *const *)keys2); + if (r != 0) + /* impossible */ + abort(); + } + + *key_buf_out = key_buf; + *key1_len_out = key1_len; + *key2_len_out = key2_len; + return 0; +} + +/* These two functions are for copying keys out of shared buffers; keys supplied by the user are not contiguous */ + +/* + * Assumes key is contiguous in memory + */ +static int copy_key(c_vl_okey_t *key, void *buf, uint32_t key_len) +{ + c_vl_okey_t *new_key = buf; + unsigned int i; + + memcpy(buf, key, key_len); + + if ((new_key->nr_dims * sizeof(c_vl_key_t *)) > (key_len - sizeof(c_vl_okey_t))) + { + return -EINVAL; + } + + for (i=0; i < new_key->nr_dims; i++) + new_key->dims[i] = (void *) (((unsigned long) new_key->dims[i]) - + ((unsigned long) key) + ((unsigned long) buf)); + + return 0; +} + +#define max(_a, _b) ((_a) > (_b) ? 
(_a) : (_b)) + +/* + * Assumes key is contiguous in memory + */ +static uint32_t get_key_len(c_vl_okey_t *key) +{ + + uint32_t i; + unsigned long end = 0; + + for (i=0; i < key->nr_dims; i++) + end = max(end, ((unsigned long) key->dims[i]) + sizeof(c_vl_key_t) + key->dims[i]->length); + + return end - (unsigned long) key; +} + +int castle_get(castle_connection *conn, + collection_id_t collection, + castle_key *key, + char **value_out, uint32_t *value_len_out) +{ + struct castle_blocking_call call; + castle_request_t req; + char *key_buf, *val_buf; + int err = 0; + uint32_t key_len; + uint32_t val_len = PAGE_SIZE; + char *value; + + err = make_key_buffer(conn, key, 0, &key_buf, &key_len); + if (err) goto err0; + + err = castle_shared_buffer_create(conn, &val_buf, val_len); + if (err) goto err1; + + castle_get_prepare(&req, collection, (castle_key *) key_buf, + key_len, val_buf, val_len); + + err = castle_request_do_blocking(conn, &req, &call); + if (err) goto err2; + + if (call.length > val_len) + { + castle_interface_token_t token; + uint64_t val_len_64; + uint32_t remaining, buf_len; + char *buf; + + err = castle_big_get(conn, collection, key, &token, &val_len_64); + if (err) goto err2; + + /* We can't assign val_len_64 to value_len_out unless val_len_64 fits */ + if (val_len_64 > UINT32_MAX) { + err = -EFBIG; + goto err1; + } + + value = malloc(val_len_64); + if (!value) + { + err = -ENOMEM; + goto err2; + } + + remaining = val_len_64; + + while (remaining > 0) + { + err = castle_get_chunk(conn, token, &buf, &buf_len); + if (err) + { + free(value); + goto err2; + } + + memcpy(value + (val_len_64 - remaining), buf, buf_len); + free(buf); + remaining -= buf_len; + } + + *value_len_out = val_len_64; + *value_out = value; + } + else + { + assert(call.length <= UINT32_MAX); + value = malloc(call.length); + if (!value) + { + err = -ENOMEM; + goto err2; + } + + memcpy(value, val_buf, call.length); + + *value_len_out = call.length; + *value_out = value; + } + +err2: castle_shared_buffer_destroy(conn, val_buf, val_len); +err1: castle_shared_buffer_destroy(conn, key_buf, key_len); +err0: return err; +} + +int castle_replace(castle_connection *conn, + collection_id_t collection, + castle_key *key, + char *val, uint32_t val_len) +{ + struct castle_blocking_call call; + castle_request_t req; + char *buf; + uint32_t key_len; + int err = 0; + + err = make_key_buffer(conn, key, val_len, &buf, &key_len); + if (err) goto err0; + + memcpy(buf + key_len, val, val_len); + + castle_replace_prepare(&req, collection, (castle_key *) buf, + key_len, buf + key_len, val_len); + + err = castle_request_do_blocking(conn, &req, &call); + if (err) goto err1; + +err1: castle_shared_buffer_destroy(conn, buf, key_len + val_len); +err0: return err; +} + +int castle_remove(castle_connection *conn, + collection_id_t collection, + castle_key *key) +{ + struct castle_blocking_call call; + castle_request_t req; + char *key_buf; + uint32_t key_len; + int err = 0; + + err = make_key_buffer(conn, key, 0, &key_buf, &key_len); + if (err) goto err0; + + castle_remove_prepare(&req, collection, + (castle_key *) key_buf, key_len); + + err = castle_request_do_blocking(conn, &req, &call); + if (err) goto err1; + +err1: castle_shared_buffer_destroy(conn, key_buf, key_len); +err0: return err; +} + +int castle_iter_start(castle_connection *conn, + collection_id_t collection, + castle_key *start_key, + castle_key *end_key, + castle_interface_token_t *token_out) +{ + struct castle_blocking_call call; + castle_request_t req; + char *key_buf; + 
uint32_t start_key_len; + uint32_t end_key_len; + int err = 0; + + *token_out = 0; + + err = make_2key_buffer(conn, start_key, end_key, &key_buf, &start_key_len, &end_key_len); + if (err) goto err0; + + castle_iter_start_prepare(&req, collection, + (castle_key *) key_buf, start_key_len, + (castle_key *) ((unsigned long)key_buf + (unsigned long)start_key_len), end_key_len, + CASTLE_RING_ITER_FLAG_NONE); + + err = castle_request_do_blocking(conn, &req, &call); + if (err) goto err1; + + *token_out = call.token; + +err1: castle_shared_buffer_destroy(conn, key_buf, start_key_len + end_key_len); +err0: return err; +} + +void castle_kvs_free(struct castle_key_value_list *kvs) +{ + while (kvs) + { + struct castle_key_value_list *next = kvs->next; + + if (kvs->key) free(kvs->key); + if (kvs->val->val) free(kvs->val->val); + if (kvs->val) free(kvs->val); + free(kvs); + + kvs = next; + } +} + +int castle_iter_next(castle_connection *conn, + castle_interface_token_t token, + struct castle_key_value_list **kvs, + uint32_t buf_size) +{ + struct castle_blocking_call call; + castle_request_t req; + struct castle_key_value_list *head = NULL, *tail = NULL, *copy = NULL, *curr = NULL; + char *buf; + int err = 0; + + *kvs = NULL; + + err = castle_shared_buffer_create(conn, &buf, buf_size); + if (err) goto err0; + + castle_iter_next_prepare(&req, token, buf, buf_size); + + err = castle_request_do_blocking(conn, &req, &call); + if (err) goto err1; + + curr = (struct castle_key_value_list *)buf; + + // NULL first key means no entries + if (curr->key == NULL) + { + head = NULL; + } + else + { + while (curr != NULL) + { + unsigned long key_len = get_key_len(curr->key); + + copy = calloc(1, sizeof(*copy)); + if (!copy) + { + err = -ENOMEM; + goto err2; + } + + copy->key = malloc(key_len); + if (!copy->key) + { + err = -ENOMEM; + goto err2; + } + err = copy_key(curr->key, copy->key, key_len); + if (err) goto err2; + + copy->val = malloc(sizeof(*(copy->val))); + if (!copy->val) + { + err = -ENOMEM; + goto err2; + } + memcpy(copy->val, curr->val, sizeof(*(copy->val))); + + if (curr->val->type & CVT_TYPE_INLINE) + { + copy->val->val = malloc(copy->val->length); + if (!copy->val->val) + { + err = -ENOMEM; + goto err2; + } + memcpy(copy->val->val, curr->val->val, copy->val->length); + } + else + { + char *val; + uint32_t val_len; + err = castle_get(conn, curr->val->collection_id, curr->key, &val, &val_len); + if (err) + goto err2; + copy->val->length = val_len; + copy->val->val = (uint8_t *)val; + + /* pretend it is inline since the value is now 'inline' */ + copy->val->type = CVT_TYPE_INLINE; + } + + if (!head) + head = copy; + else + tail->next = copy; + + tail = copy; + curr = curr->next; + } + } + + *kvs = head; + + castle_shared_buffer_destroy(conn, buf, buf_size); + + return 0; + +err2: castle_kvs_free(head); +err1: castle_shared_buffer_destroy(conn, buf, buf_size); +err0: return err; +} + +int castle_iter_finish(castle_connection *conn, + castle_token token) +{ + struct castle_blocking_call call; + castle_request_t req; + int err = 0; + + castle_iter_finish_prepare(&req, token); + + err = castle_request_do_blocking(conn, &req, &call); + + return err; +} + +// 'limit' means the maximum number of values to retrieve. 0 means unlimited. 
+int castle_getslice(castle_connection *conn, + collection_id_t collection, + castle_key *start_key, + castle_key *end_key, + struct castle_key_value_list **kvs_out, + uint32_t limit) +{ + castle_token token; + int ret; + uint32_t count = 0; + struct castle_key_value_list *head = NULL, *tail = NULL, *curr = NULL; + + ret = castle_iter_start(conn, collection, start_key, + end_key, &token); + if (ret) goto err0; + + while (!ret && (limit == 0 || count < limit)) + { + ret = castle_iter_next(conn, token, &curr, PAGE_SIZE); + if (ret) goto err1; + + if (!curr) + break; + + if (!head) + head = curr; + else + tail->next = curr; + + while (curr) + { + count++; + if (count == limit) + { + if (curr->next) + castle_kvs_free(curr->next); + curr->next = NULL; + break; + } + tail = curr; // tail will be one behind curr + curr = curr->next; + } + } + + ret = castle_iter_finish(conn, token); + if (ret) + goto err1; + + *kvs_out = head; + + return ret; + +err1: + if (head) + castle_kvs_free(head); +err0: + return ret; +} + +int castle_big_put (castle_connection *conn, + collection_id_t collection, + castle_key *key, + uint64_t val_length, + castle_interface_token_t *token_out) +{ + struct castle_blocking_call call; + castle_request_t req; + char *key_buf; + uint32_t key_len; + int err = 0; + + *token_out = 0; + + err = make_key_buffer(conn, key, 0, &key_buf, &key_len); + if (err) goto err0; + + castle_big_put_prepare(&req, collection, + (castle_key *) key_buf, key_len, val_length); + + err = castle_request_do_blocking(conn, &req, &call); + if (err) goto err1; + + *token_out = call.token; + +err1: castle_shared_buffer_destroy(conn, key_buf, key_len); +err0: return err; +} + +int castle_put_chunk (castle_connection *conn, + castle_interface_token_t token, + char *value, uint32_t value_len) +{ + struct castle_blocking_call call; + castle_request_t req; + char *buf; + int err = 0; + + err = castle_shared_buffer_create(conn, &buf, value_len); + if (err) goto err0; + + memcpy(buf, value, value_len); + + castle_put_chunk_prepare(&req, token, buf, value_len); + + err = castle_request_do_blocking(conn, &req, &call); + if (err) goto err1; + + err1: castle_shared_buffer_destroy(conn, buf, value_len); + err0: return err; +} + +int castle_big_get (castle_connection *conn, + collection_id_t collection, + castle_key *key, + castle_interface_token_t *token_out, uint64_t *value_len_out) +{ + struct castle_blocking_call call; + castle_request_t req; + char *key_buf; + uint32_t key_len; + int err = 0; + + *token_out = 0; + + err = make_key_buffer(conn, key, 0, &key_buf, &key_len); + if (err) goto err0; + + castle_big_get_prepare(&req, collection, + (castle_key *) key_buf, key_len); + + err = castle_request_do_blocking(conn, &req, &call); + if (err) goto err1; + + *token_out = call.token; + *value_len_out = call.length; + +err1: castle_shared_buffer_destroy(conn, key_buf, key_len); +err0: return err; +} + +#define VALUE_LEN (1024 * 1024) +int castle_get_chunk (castle_connection *conn, + castle_interface_token_t token, + char **value_out, uint32_t *value_len_out) +{ + struct castle_blocking_call call; + castle_request_t req; + char *buf; + char *value; + int err = 0; + + *value_out = NULL; + + err = castle_shared_buffer_create(conn, &buf, VALUE_LEN); + if (err) goto err0; + + castle_get_chunk_prepare(&req, token, buf, VALUE_LEN); + + err = castle_request_do_blocking(conn, &req, &call); + if (err) goto err1; + + value = malloc(VALUE_LEN); + memcpy(value, buf, VALUE_LEN); + + *value_out = value; + *value_len_out = call.length; + 
+ err1: castle_shared_buffer_destroy(conn, buf, VALUE_LEN); + err0: return err; +} + +uint32_t +castle_device_to_devno(const char *filename) { + struct stat st; + if (0 != stat(filename, &st)) + return 0; + + return st.st_rdev; +} + +static char **devnames = NULL; +static int devname_count = 0; + +static void +alloc_devnames_to(int minor) { + if (devname_count > minor) + return; + + int old_devname_count = devname_count; + devname_count = minor + 1; + devnames = realloc(devnames, devname_count * sizeof(devnames[0])); + for (int i = old_devname_count; i < devname_count; i++) { + if (-1 == asprintf(&devnames[i], "/dev/castle-fs/castle-fs-%d", i)) + abort(); + } +} + +const char * +castle_devno_to_device(uint32_t devno) { + int minor = minor(devno); + /* This is a bit wrong, but it'll do for now. castle-fs gets some + arbitrary major assigned, and then names its devices based on the + minor. We can find the path from that */ + + alloc_devnames_to(minor); + return devnames[minor]; +} + +int +castle_claim_dev(castle_connection *conn, const char *filename, castle_slave_uuid *id_out) { + return castle_claim(conn, castle_device_to_devno(filename), id_out); +} + +int +castle_attach_dev(castle_connection *conn, castle_version version, const char **filename_out) { + uint32_t devno; + int ret = castle_attach(conn, version, &devno); + if (ret == 0) + *filename_out = castle_devno_to_device(devno); + return ret; +} + +int +castle_detach_dev(castle_connection *conn, const char *filename) { + return castle_detach(conn, castle_device_to_devno(filename)); +} + +int +castle_snapshot_dev(castle_connection *conn, const char *filename, castle_version *version_out) { + return castle_snapshot(conn, castle_device_to_devno(filename), version_out); +} diff --git a/castle_front.c b/castle_front.c new file mode 100644 index 0000000..c72ff8c --- /dev/null +++ b/castle_front.c @@ -0,0 +1,641 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "castle_public.h" +#include "castle.h" + +#include "castle_private.h" + +//#define DEBUG +#ifndef DEBUG +#define debug(_f, ...) ((void)0) +#else +#define debug(_f, _a...) (printf(_f, ##_a)) +#endif + +#define atomic_inc(x) ({int z __attribute__((unused)); z = __sync_fetch_and_add(x, 1); }) +#define atomic_dec(x) ({int z __attribute__((unused)); z = __sync_fetch_and_sub(x, 1); }) + +static pthread_mutex_t blocking_call_mutex = PTHREAD_MUTEX_INITIALIZER; +static pthread_cond_t blocking_call_cond = PTHREAD_COND_INITIALIZER; + +static void *castle_response_thread(void *data) +{ + castle_connection *conn = data; + castle_response_t *resp; + RING_IDX i, rp; + fd_set readfds; + int ret, more_to_do; + int max_fd = conn->select_pipe[0] > conn->fd ? conn->select_pipe[0] : conn->fd; + + while (!conn->response_thread_exit) + { + debug("pre-select %d\n", conn->fd); + + /* select destroys readfds, so must create it every time */ + FD_ZERO(&readfds); + FD_SET(conn->fd, &readfds); + FD_SET(conn->select_pipe[0], &readfds); + + ret = select(max_fd + 1, &readfds, (fd_set *)NULL, (fd_set *)NULL, NULL); + if (ret <= 0) + { + debug("select returned %d\n", ret); + continue; + } + + if (conn->response_thread_exit) + break; + +#ifdef TRACE + selects_counter++; +#endif + + debug("post-select\n"); + + do { + /* rsp_prod is written from the kernel, but in a strictly + ordered way and it fits inside a cache line. 
Reading it + at any point is safe */ + rp = conn->front_ring.sring->rsp_prod; + + /* This memory barrier is copied from Xen, which runs on + powerpc, which has weak memory ordering. We only run on + amd64, which has strong memory ordering: in particular, + reads are never reordered with respect to other reads. We + suspect that this is nothing more than a waste of a few + hundred cycles. Revisit this if we have performance + issues here - perhaps it can be safely removed */ + xen_rmb(); + + /* rsp_cons is safe for concurrency; only read or written from this thread */ + for (i = conn->front_ring.rsp_cons; i != rp; i++) { + resp = RING_GET_RESPONSE(&conn->front_ring, i); + + if (__builtin_expect(conn->debug_log != NULL, 0)) { + flockfile(conn->debug_log); + castle_print_response(conn->debug_log, resp, conn->debug_values); + fprintf(conn->debug_log, "\n"); + fflush(conn->debug_log); + funlockfile(conn->debug_log); + } + + if(conn->callbacks[resp->call_id].callback) + conn->callbacks[resp->call_id].callback(conn, resp, conn->callbacks[resp->call_id].data); + + if (conn->callbacks[resp->call_id].token) { + unsigned int x = conn->callbacks[resp->call_id].token % CASTLE_STATEFUL_OPS; + assert(x < CASTLE_STATEFUL_OPS); + assert(conn->outstanding_stateful_requests[x] > 0); + int new = __sync_sub_and_fetch(&conn->outstanding_stateful_requests[x], 1); + if (new == 0) + atomic_inc(&conn->front_ring.reserved); + } + + pthread_mutex_lock(&conn->free_mutex); + list_add(&conn->callbacks[resp->call_id].list, &conn->free_callbacks); + pthread_mutex_unlock(&conn->free_mutex); + + debug("Got response %d\n", resp->call_id); + +#ifdef TRACE + ops_counter++; +#endif + } + + conn->front_ring.rsp_cons = i; + assert(conn->front_ring.reserved <= RING_FREE_REQUESTS(&conn->front_ring)); + + RING_FINAL_CHECK_FOR_RESPONSES(&conn->front_ring, more_to_do); + + pthread_mutex_lock(&conn->ring_mutex); + pthread_cond_broadcast(&conn->ring_cond); + pthread_mutex_unlock(&conn->ring_mutex); + + } while (more_to_do); + } + + debug("castle_response_thread exiting...\n"); + + pthread_mutex_lock(&conn->ring_mutex); + conn->response_thread_running = 0; + pthread_cond_broadcast(&conn->ring_cond); + pthread_mutex_unlock(&conn->ring_mutex); + + return NULL; +} + +int castle_shared_buffer_create(castle_connection *conn, + char **buffer_out, unsigned long size) +{ + void *buffer; + buffer = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, conn->fd, 0); + if (buffer == MAP_FAILED) + { + debug("Failed to map page %d\n", errno); + return -errno; + } + + // TODO keep track of buffers to free up + + *buffer_out = buffer; + + return 0; +} + +int castle_shared_buffer_destroy(castle_connection *conn __attribute__((unused)), + char *buffer, unsigned long size) +{ + int ret = munmap(buffer, size); + + if (ret == -1) + return -errno; + + return 0; +} + +int castle_shared_buffer_allocate(castle_connection *conn, + castle_buffer **buffer_out, unsigned long size) +{ + castle_buffer* buffer = calloc(1, sizeof(*buffer)); + if(!buffer) + return -ENOMEM; + + int rc = 0; + if((rc = castle_shared_buffer_create(conn, &buffer->buf, size))) + { + debug("Failed to create shared buffer: %d\n", rc); + free(buffer); + return rc; + } + + buffer->buflen = size; + *buffer_out = buffer; + return 0; +} + +int castle_shared_buffer_release(castle_connection *conn, castle_buffer* buffer) +{ + int rc = castle_shared_buffer_destroy(conn, buffer->buf, buffer->buflen); + if(!rc) + free(buffer); + return rc; +} + +static int set_non_blocking(int fd) +{ + int flags; + + if 
(-1 == (flags = fcntl(fd, F_GETFL, 0))) + flags = 0; + + return fcntl(fd, F_SETFL, flags | O_NONBLOCK); +} + +int castle_connect(castle_connection **conn_out) +{ + int err; + castle_connection *conn = calloc(1, sizeof(*conn)); + + *conn_out = NULL; + + if (!conn) + { + debug("Failed to malloc\n"); + err = -ENOMEM; + goto err0; + } + + conn->fd = open(CASTLE_NODE, O_RDWR); + if (conn->fd == -1) + { + debug("Failed to open %s, errno=%d (\"%s\")\n", + CASTLE_NODE, errno, strerror(errno)); + err = -errno; + goto err1; + } + debug("Got fd %d\n", conn->fd); + + { + int version = castle_protocol_version(conn); + if (version != CASTLE_PROTOCOL_VERSION) { + debug("Protocol version mismatch (kernel %d, libcastle %d)\n", version, CASTLE_PROTOCOL_VERSION); + err = -ENOPROTOOPT; + goto err2; + } + } + + conn->shared_ring = mmap(NULL, CASTLE_RING_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED, conn->fd, 0); + if (conn->shared_ring == MAP_FAILED) + { + debug("Failed to map page errno=%d (\"%s\")\n", + errno, strerror(errno)); + err = -errno; + goto err2; + } + debug("Got shared ring at address %p\n", conn->shared_ring); + + FRONT_RING_INIT(&conn->front_ring, conn->shared_ring, CASTLE_RING_SIZE, CASTLE_STATEFUL_OPS); + + conn->callbacks = malloc(sizeof(struct castle_front_callback) * RING_SIZE(&conn->front_ring)); + if (!conn->callbacks) + { + debug("Failed to malloc callbacks!"); + err = -ENOMEM; + goto err3; + } + + INIT_LIST_HEAD(&conn->free_callbacks); + for (unsigned int i=0; ifront_ring); i++) + list_add(&conn->callbacks[i].list, &conn->free_callbacks); + + err = pthread_mutex_init(&conn->free_mutex, NULL); + if (err) + { + debug("Failed to create mutex, err=%d\n", err); + err = -err; + goto err4; + } + + err = pthread_mutex_init(&conn->ring_mutex, NULL); + if (err) + { + debug("Failed to create mutex, err=%d\n", err); + err = -err; + goto err5; + } + debug("Initialised mutex\n"); + + err = pthread_cond_init(&conn->ring_cond, NULL); + if (err) + { + debug("Failed to create condition, err=%d\n", err); + err = -err; + goto err6; + } + debug("Initialise condition\n"); + + if (pipe(conn->select_pipe) == -1) + { + debug("Failed to create pipe to unblock select, errno=%d (\"%s\")", + errno, strerror(errno)); + err = -errno; + goto err7; + } + + if (set_non_blocking(conn->select_pipe[0]) == -1) + { + debug("Failed to set non-block on fd %d, errno=%d (\"%s\")", + conn->select_pipe[0], errno, strerror(errno)); + err = -errno; + goto err8; + } + + if (set_non_blocking(conn->select_pipe[1]) == -1) + { + debug("Failed to set non-block on fd %d, errno=%d (\"%s\")", + conn->select_pipe[1], errno, strerror(errno)); + err = -errno; + goto err8; + } + + { + const char *debug_env = getenv("CASTLE_DEBUG"); + if (debug_env) { + const char *debug_file = getenv("CASTLE_DEBUG_FILE"); + const char *debug_fd = getenv("CASTLE_DEBUG_FD"); + if (debug_file) + conn->debug_log = fopen(debug_file, "a"); + else if (debug_fd) + conn->debug_log = fdopen(atoi(debug_fd), "a"); + else { + int err_fd = dup(2); + if (-1 != err_fd) + conn->debug_log = fdopen(err_fd, "a"); + } + if (getenv("CASTLE_DEBUG_VALUES")) + conn->debug_values = 1; + else + conn->debug_values = 0; + } + else + conn->debug_log = NULL; + } + + conn->response_thread_running = 1; + conn->response_thread_exit = 0; + err = pthread_create(&conn->response_thread, NULL, castle_response_thread, conn); + if (err) + { + debug("Failed to create response thread, err=%d\n", err); + err = -err; + goto err9; + } + debug("Response thread started\n"); + + *conn_out = conn; + + return 0; + 
+err9: fclose(conn->debug_log); +err8: close(conn->select_pipe[0]); close(conn->select_pipe[1]); +err7: pthread_cond_destroy(&conn->ring_cond); +err6: pthread_mutex_destroy(&conn->ring_mutex); +err5: pthread_mutex_destroy(&conn->free_mutex); +err4: free(conn->callbacks); +err3: munmap(conn->shared_ring, CASTLE_RING_SIZE); +err2: close(conn->fd); +err1: free(conn); +err0: return err; +} + +void castle_disconnect(castle_connection *conn) +{ + ssize_t write_ret; + + if (!conn) + return; + + if (conn->fd == -1) + return; + + /* It doesn't matter that this flag is not protected by the lock + * as long as the response thread eventually notices, and by + * writing to the pipe the select will now never block, so it + * should wake it up and notice eventually */ + conn->response_thread_exit = 1; + write_ret = write(conn->select_pipe[1], "\0", 1); + if (write_ret < 0) + printf("write failed in castle_front_disconnect, error %d.\n", errno); + + /* Wait for the response thread to go away */ + pthread_mutex_lock(&conn->ring_mutex); + while(conn->response_thread_running) + pthread_cond_wait(&conn->ring_cond, &conn->ring_mutex); + pthread_mutex_unlock(&conn->ring_mutex); + + // TODO: free buffers / wait for them to be free'd? + + pthread_mutex_lock(&conn->ring_mutex); + munmap(conn->shared_ring, CASTLE_RING_SIZE); + close(conn->fd); + conn->fd = -1; + pthread_mutex_unlock(&conn->ring_mutex); + + pthread_mutex_lock(&blocking_call_mutex); + pthread_cond_broadcast(&blocking_call_cond); + pthread_mutex_unlock(&blocking_call_mutex); + + close(conn->select_pipe[0]); close(conn->select_pipe[1]); +} + +void castle_free(castle_connection *conn) +{ + if (!conn) + return; + + if (conn->fd >= 0) + castle_disconnect(conn); + + pthread_cond_destroy(&conn->ring_cond); + pthread_mutex_destroy(&conn->ring_mutex); + pthread_mutex_destroy(&conn->free_mutex); + free(conn->callbacks); + free(conn); +} + +static castle_interface_token_t +get_request_token(castle_request_t *req) { + switch (req->tag) { + case CASTLE_RING_ITER_NEXT: + return req->iter_next.token; + case CASTLE_RING_ITER_FINISH: + return req->iter_finish.token; + case CASTLE_RING_PUT_CHUNK: + return req->put_chunk.token; + case CASTLE_RING_GET_CHUNK: + return req->get_chunk.token; + default: + return 0; + } +} + +static bool +ring_full_for(castle_connection *conn, castle_request_t *req) { + castle_interface_token_t token = get_request_token(req); + + if (token) { + unsigned int x = token % CASTLE_STATEFUL_OPS; + assert(x < CASTLE_STATEFUL_OPS); + if (conn->outstanding_stateful_requests[x] == 0) + return false; + } + + int space = RING_FREE_REQUESTS(&conn->front_ring); + int reserved = conn->front_ring.reserved; + /* space < reserved when we've bumped the reserve count for a new + reponse but haven't updated the ring yet */ + return space <= reserved; +} + +void castle_request_send(castle_connection *conn, + castle_request_t *req, castle_callback *callbacks, + void **datas, int reqs_count) +{ + // TODO check return codes? + int notify, i=0, call_id; + struct castle_front_callback *callback; + + /* This mutex is currently being abused for two distinct purposes, + creating false scheduling hazards: it is both the condition + variable mutex for ring_cond, which is used for signalling + between the response thread and this function, and it is also + used to protect the value req_prod_pvt from simultaneous + executions of this function. 
+ + TODO: break it apart into two mutexes + + TODO+1: change req_prod_pvt to be a lock-free atomic + compare-and-set mechanism instead of using mutexes, so that + multiple threads can write to the ring without context switches + */ + pthread_mutex_lock(&conn->ring_mutex); + + while (i < reqs_count) + { + if (conn->fd < 0) + break; + + /* RING_FULL is based on nr_ents (safe), rsp_cons (written only + by the response thread and always within a cache line, hence + safe), and req_prod_pvt (currently a concurrency hazard due + to lack of atomic compare-and-set logic) */ + + while (ring_full_for(conn, &req[i])) + pthread_cond_wait(&conn->ring_cond, &conn->ring_mutex); + + /* Another RING_FULL hazard on req_prod_pvt */ + while (i < reqs_count && !ring_full_for(conn, &req[i])) + { + pthread_mutex_lock(&conn->free_mutex); + assert(!list_empty(&conn->free_callbacks)); + callback = list_entry(conn->free_callbacks.next, struct castle_front_callback, list); + list_del(&callback->list); + pthread_mutex_unlock(&conn->free_mutex); + + call_id = callback - conn->callbacks; + req[i].call_id = call_id; + + callback->callback = callbacks ? callbacks[i] : NULL; + callback->data = datas ? datas[i] : NULL; + callback->token = get_request_token(&req[i]); + + if (__builtin_expect(conn->debug_log != NULL, 0)) { + flockfile(conn->debug_log); + castle_print_request(conn->debug_log, &req[i], conn->debug_values); + fprintf(conn->debug_log, "\n"); + fflush(conn->debug_log); + funlockfile(conn->debug_log); + } + + if (callback->token) { + unsigned int x = callback->token % CASTLE_STATEFUL_OPS; + assert(x < CASTLE_STATEFUL_OPS); + int old = __sync_fetch_and_add(&conn->outstanding_stateful_requests[x], 1); + if (old == 0) { + assert(conn->front_ring.reserved > 0); + atomic_dec(&conn->front_ring.reserved); + } + } + + /* More req_prod_pvt hazards */ + castle_request_t *ring_req = RING_GET_REQUEST(&conn->front_ring, conn->front_ring.req_prod_pvt); + debug("Putting request %d at position %d\n", call_id, conn->front_ring.req_prod_pvt); + conn->front_ring.req_prod_pvt++; + + memcpy(ring_req, req + i, sizeof(*ring_req)); + + i++; + } + + /* This uses req_prod (safe due to strict ordering guarantees) and req_prod_pvt (hazard) */ + RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&conn->front_ring, notify); + + debug("notify=%d\n", notify); + + if (notify) + { +#ifdef TRACE + ioctls_counter++; +#endif + ioctl(conn->fd, CASTLE_IOCTL_POKE_RING); + } + } + + pthread_mutex_unlock(&conn->ring_mutex); +} + +static void castle_blocking_callback(castle_connection *conn __attribute__((unused)), + castle_response_t *resp, void *data) +{ + struct castle_blocking_call *call = data; + + call->err = resp->err; + call->token = resp->token; + call->length = resp->length; + + pthread_mutex_lock(&blocking_call_mutex); + call->completed = 1; + pthread_cond_broadcast(&blocking_call_cond); + pthread_mutex_unlock(&blocking_call_mutex); +} + +int castle_request_do_blocking(castle_connection *conn, + castle_request_t *req, + struct castle_blocking_call *blocking_call) +{ + /* + * Warning variables these will be on stack but used elsewhere, only safe as + * this function sleeps until they are finished with (see castle_blocking_callback) + */ + void *blocking_calls = blocking_call; + castle_callback callback = &castle_blocking_callback; + + blocking_call->completed = 0; + + castle_request_send(conn, req, &callback, &blocking_calls, 1); + + pthread_mutex_lock(&blocking_call_mutex); + while (conn->fd >= 0 && !blocking_call->completed) + 
pthread_cond_wait(&blocking_call_cond, &blocking_call_mutex); + pthread_mutex_unlock(&blocking_call_mutex); + + if (conn->fd < 0 && !blocking_call->completed) { + blocking_call->completed = 1; + blocking_call->err = EUNATCH; + } + + return blocking_call->err; +} + +int castle_request_do_blocking_multi(castle_connection *conn, + castle_request_t *req, + struct castle_blocking_call *blocking_call, + int count) +{ + int i; + void **blocking_calls; + castle_callback *callbacks; + + blocking_calls = malloc(sizeof(struct castle_blocking_call *) * count); + callbacks = malloc(sizeof(callbacks[0]) * count); + + for (i = 0; i < count; i++) + { + blocking_call[i].completed = 0; + blocking_calls[i] = &blocking_call[i]; + callbacks[i] = castle_blocking_callback; + } + + castle_request_send(conn, req, callbacks, blocking_calls, count); + + pthread_mutex_lock(&blocking_call_mutex); + for (i = 0; i < count; i++) + { + while (conn->fd >= 0 && !blocking_call[i].completed) + pthread_cond_wait(&blocking_call_cond, &blocking_call_mutex); + } + pthread_mutex_unlock(&blocking_call_mutex); + + free(blocking_calls); + free(callbacks); + + for (i = 0; i < count; i++) + if (conn->fd < 0 && !blocking_call[i].completed) { + blocking_call[i].completed = 1; + blocking_call[i].err = EUNATCH; + } + + for (i = 0; i < count; i++) + if (blocking_call[i].err) + return blocking_call[i].err; + + return 0; +} + +uint32_t +castle_max_buffer_size(void) { + return 1048576; +} diff --git a/castle_ioctl.c b/castle_ioctl.c new file mode 100644 index 0000000..6daf2f1 --- /dev/null +++ b/castle_ioctl.c @@ -0,0 +1,169 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "castle.h" +#include "castle_public.h" + +#include "castle_private.h" + +int castle_protocol_version(struct castle_front_connection *conn) { + struct castle_control_ioctl ctl; + int ret; + ctl.cmd = CASTLE_CTRL_PROTOCOL_VERSION; + + ret = ioctl(conn->fd, CASTLE_CTRL_PROTOCOL_VERSION_IOCTL, &ctl); + if (__builtin_expect(conn->debug_log != NULL, 0)) { + fprintf(conn->debug_log, "protocol_version() = %d, %d\n", ret, ctl.protocol_version.version); + fflush(conn->debug_log); + } + if (ret) + return -1; + + return ctl.protocol_version.version; +} + +#define C_PRINTF_uint32 "%u" +#define C_PRINTF_uint64 "%lu" +#define C_PRINTF_slave_uuid "%u" +#define C_PRINTF_version "%u" +#define C_PRINTF_size "%zu" +#define C_PRINTF_string "%s" +#define C_PRINTF_collection_id "%u" +#define C_PRINTF_env_var "%u" +#define C_PRINTF_int "%d" +#define C_PRINTF_int32 "%d" + +#define CASTLE_IOCTL_0IN_0OUT(_id, _name) \ +int castle_##_id (struct castle_front_connection *conn) \ +{ \ + struct castle_control_ioctl ctl; \ + int ret; \ + ctl.cmd = _name; \ + \ + ret = ioctl(conn->fd, _name##_IOCTL, &ctl); \ + if (__builtin_expect(conn->debug_log != NULL, 0)) { \ + fprintf(conn->debug_log, #_id "() = %d\n", ret); \ + fflush(conn->debug_log); \ + } \ + if (ret) \ + return errno; \ + \ + return ctl._id.ret; \ +} + +#define CASTLE_IOCTL_1IN_0OUT(_id, _name, _arg_1_t, _arg_1) \ +int castle_##_id (struct castle_front_connection *conn, \ + C_TYPE_##_arg_1_t _arg_1) \ +{ \ + struct castle_control_ioctl ctl; \ + int ret; \ + ctl.cmd = _name; \ + ctl._id._arg_1 = _arg_1; \ + \ + ret = ioctl(conn->fd, _name##_IOCTL, &ctl); \ + if (__builtin_expect(conn->debug_log != NULL, 0)) { \ + fprintf(conn->debug_log, \ + #_id "(" #_arg_1 " = " C_PRINTF_##_arg_1_t ") = %d\n", \ + _arg_1, ret); \ + fflush(conn->debug_log); \ + } \ + if 
(ret) \ + return errno; \ + \ + return ctl._id.ret; \ +} + +#define CASTLE_IOCTL_1IN_1OUT(_id, _name, _arg_1_t, _arg_1, _ret_1_t, _ret) \ +int castle_##_id (struct castle_front_connection *conn, \ + C_TYPE_##_arg_1_t _arg_1, \ + C_TYPE_##_ret_1_t * _ret##_out) \ +{ \ + struct castle_control_ioctl ctl; \ + int ret; \ + ctl.cmd = _name; \ + ctl._id._arg_1 = _arg_1; \ + \ + ret = ioctl(conn->fd, _name##_IOCTL, &ctl); \ + if (__builtin_expect(conn->debug_log != NULL, 0)) { \ + fprintf(conn->debug_log, #_id "(" #_arg_1 " = " C_PRINTF_##_arg_1_t \ + ", " #_ret " = " C_PRINTF_##_ret_1_t ") = %d\n", _arg_1, ctl._id.ret, ret); \ + fflush(conn->debug_log); \ + } \ + if (ret) \ + return errno; \ + \ + * _ret##_out = ctl._id._ret; \ + \ + return ctl._id.ret; \ +} \ + +#define CASTLE_IOCTL_2IN_0OUT(_id, _name, _arg_1_t, _arg_1, _arg_2_t, _arg_2) \ +int castle_##_id (struct castle_front_connection *conn, \ + C_TYPE_##_arg_1_t _arg_1, C_TYPE_##_arg_2_t _arg_2) \ +{ \ + struct castle_control_ioctl ctl; \ + int ret; \ + ctl.cmd = _name; \ + ctl._id._arg_1 = _arg_1; \ + ctl._id._arg_2 = _arg_2; \ + \ + ret = ioctl(conn->fd, _name##_IOCTL, &ctl); \ + if (__builtin_expect(conn->debug_log != NULL, 0)) { \ + fprintf(conn->debug_log, \ + #_id "(" #_arg_1 " = " C_PRINTF_##_arg_1_t \ + ", " #_arg_2 " = " C_PRINTF_##_arg_2_t ") = %d\n", \ + _arg_1, _arg_2, ret); \ + fflush(conn->debug_log); \ + } \ + if (ret) \ + return errno; \ + \ + return ctl._id.ret; \ +} + +#define CASTLE_IOCTL_3IN_1OUT(_id, _name, _arg_1_t, _arg_1, _arg_2_t, _arg_2, \ + _arg_3_t, _arg_3, _ret_1_t, _ret) \ +int castle_##_id (struct castle_front_connection *conn, \ + C_TYPE_##_arg_1_t _arg_1, \ + C_TYPE_##_arg_2_t _arg_2, \ + C_TYPE_##_arg_3_t _arg_3, \ + C_TYPE_##_ret_1_t * _ret##_out) \ +{ \ + struct castle_control_ioctl ctl; \ + int ret; \ + \ + ctl.cmd = _name; \ + ctl._id._arg_1 = _arg_1; \ + ctl._id._arg_2 = _arg_2; \ + ctl._id._arg_3 = _arg_3; \ + \ + ret = ioctl(conn->fd, _name##_IOCTL, &ctl); \ + if (__builtin_expect(conn->debug_log != NULL, 0)) { \ + fprintf(conn->debug_log, #_id "(" #_arg_1 " = " C_PRINTF_##_arg_1_t \ + ", " #_arg_2 " = " C_PRINTF_##_arg_2_t \ + ", " #_arg_3 " = " C_PRINTF_##_arg_3_t \ + ", " #_ret " = " C_PRINTF_##_ret_1_t ") = %d\n", \ + _arg_1, _arg_2, _arg_3, ctl._id.ret, ret); \ + fflush(conn->debug_log); \ + } \ + if (ret) \ + return errno; \ + \ + * _ret##_out = ctl._id._ret; \ + \ + return ctl._id.ret; \ +} + +CASTLE_IOCTLS +PRIVATE_CASTLE_IOCTLS diff --git a/castle_print.c b/castle_print.c new file mode 100644 index 0000000..60da06c --- /dev/null +++ b/castle_print.c @@ -0,0 +1,139 @@ +#include +#include +#include + +#include "castle.h" + +/* Because it's awkward to keep track of the proper result when making + multiple stdio calls, these two handle it - assuming that 'len' + accumulates the length for return, -1 is returned on error, and... 
*/ + +/* ...you're calling a function which returns length on success */ +#define call_stdio_len(exp) ({ int r = (exp); if (r < 0) return -1; len += r; r; }) +/* ...you're calling a function which returns the character written on success */ +#define call_stdio_char(exp) ({ int r = (exp); if (r < 0) return -1; len += 1; r; }) + +static int +print_escaped(FILE *f, const char *str, unsigned int str_len) { + int len = 0; + for (unsigned int i = 0; i < str_len; i++) { + char c = str[i]; + if (isprint(c) && c != ',' && c != '(' && c != ')') { + call_stdio_char(fputc(c, f)); + } + else { + call_stdio_len(fprintf(f, "\\x%02hhx", (uint8_t)c)); + } + } + return len; +} + +int +castle_print_key(FILE *f, castle_key *key) { + int len = 0; + call_stdio_len(fprintf(f, "(")); + for (unsigned int i = 0; i < castle_key_dims(key); i++) { + if (i > 0) + call_stdio_char(fputc(',', f)); + + const uint8_t *elem = castle_key_elem_data(key, i); + uint32_t elem_len = castle_key_elem_len(key, i); + if (elem_len == 0) + call_stdio_len(fprintf(f, "(invalid zero-length element)")); + else + call_stdio_len(print_escaped(f, (const char *)elem, elem_len)); + } + call_stdio_char(fputc(')', f)); + return len; +} + +static const char *command_names[] = { + [CASTLE_RING_REPLACE] = "replace", + [CASTLE_RING_BIG_PUT] = "big_put", + [CASTLE_RING_PUT_CHUNK] = "put_chunk", + [CASTLE_RING_GET] = "get", + [CASTLE_RING_BIG_GET] = "big_get", + [CASTLE_RING_GET_CHUNK] = "get_chunk", + [CASTLE_RING_ITER_START] = "iter_start", + [CASTLE_RING_ITER_NEXT] = "iter_next", + [CASTLE_RING_ITER_FINISH] = "iter_finish", + [CASTLE_RING_REMOVE] = "remove", +}; + +int +castle_print_request(FILE *f, castle_request *req, int print_values) { + int len = 0; + call_stdio_len(fprintf(f, "%s(call_id=%u, ", command_names[req->tag], req->call_id)); + switch (req->tag) { + case CASTLE_RING_REPLACE: + { + char key_buf[req->replace.key_len]; + memcpy(key_buf, req->replace.key_ptr, req->replace.key_len); + call_stdio_len(fprintf(f, "collection=%u, key=", req->replace.collection_id)); + call_stdio_len(castle_print_key(f, req->replace.key_ptr)); + if (print_values) { + call_stdio_len(fprintf(f, ", value=")); + call_stdio_len(print_escaped(f, req->replace.value_ptr, req->replace.value_len)); + } + break; + } + case CASTLE_RING_BIG_PUT: + call_stdio_len(fprintf(f, "collection=%u, key=", req->big_put.collection_id)); + call_stdio_len(castle_print_key(f, req->big_put.key_ptr)); + call_stdio_len(fprintf(f, ", len=%llu", (long long unsigned)req->big_put.value_len)); + break; + case CASTLE_RING_PUT_CHUNK: + call_stdio_len(fprintf(f, "token=%u", req->put_chunk.token)); + if (print_values) { + call_stdio_len(fprintf(f, ", data=")); + call_stdio_len(print_escaped(f, req->put_chunk.buffer_ptr, req->put_chunk.buffer_len)); + } + break; + case CASTLE_RING_GET: + call_stdio_len(fprintf(f, "collection=%u, key=", req->get.collection_id)); + call_stdio_len(castle_print_key(f, req->get.key_ptr)); + call_stdio_len(fprintf(f, ", buffer=%p, buffer_len=%u", req->get.value_ptr, req->get.value_len)); + break; + case CASTLE_RING_BIG_GET: + call_stdio_len(fprintf(f, "collection=%u, key=", req->big_get.collection_id)); + call_stdio_len(castle_print_key(f, req->big_get.key_ptr)); + break; + case CASTLE_RING_GET_CHUNK: + call_stdio_len(fprintf(f, "token=%u, buffer=%p, buffer_len=%u", req->get_chunk.token, req->get_chunk.buffer_ptr, req->get_chunk.buffer_len)); + break; + case CASTLE_RING_ITER_START: + call_stdio_len(fprintf(f, "collection=%u, start_key=", req->iter_start.collection_id)); + 
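            /*
             * Illustrative example (request values invented, not from the patch):
             * given the format strings in this case, the escaping rules in
             * print_escaped() and the "%s(call_id=%u, " prefix printed above, an
             * iter_start request over two-part keys with
             * CASTLE_RING_ITER_FLAG_NO_VALUES set would come out roughly as:
             *
             *   iter_start(call_id=3, collection=7, start_key=(foo,bar), end_key=(foo,\xff), flags=no_values)
             */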
call_stdio_len(castle_print_key(f, req->iter_start.start_key_ptr)); + call_stdio_len(fprintf(f, ", end_key=")); + call_stdio_len(castle_print_key(f, req->iter_start.end_key_ptr)); + call_stdio_len(fprintf(f, ", flags=")); + if (req->iter_start.flags & ~CASTLE_RING_ITER_FLAG_NO_VALUES) + call_stdio_len(fprintf(f, "error(%llx)", (long long unsigned)req->iter_start.flags)); + else if (req->iter_start.flags & CASTLE_RING_ITER_FLAG_NO_VALUES) + call_stdio_len(fprintf(f, "no_values")); + else + call_stdio_len(fprintf(f, "none")); + break; + case CASTLE_RING_ITER_NEXT: + call_stdio_len(fprintf(f, "token=%u, buffer=%p, buffer_len=%u", req->iter_next.token, req->iter_next.buffer_ptr, req->iter_next.buffer_len)); + break; + case CASTLE_RING_ITER_FINISH: + call_stdio_len(fprintf(f, "token=%u", req->iter_next.token)); + break; + case CASTLE_RING_REMOVE: + call_stdio_len(fprintf(f, "collection=%u, key=", req->remove.collection_id)); + call_stdio_len(castle_print_key(f, req->remove.key_ptr)); + break; + default: + call_stdio_len(fprintf(f, "unknown(%x)", req->tag)); + break; + } + call_stdio_char(fputc(')', f)); + return len; +} + +/* TODO: implement print_values */ +int +castle_print_response(FILE *f, castle_response *resp, int print_values __attribute__((unused))) { + return fprintf(f, "response(call_id=%u, err=%u, length=%llu, token=%u)", resp->call_id, resp->err, (long long unsigned)resp->length, resp->token); +} diff --git a/castle_private.h b/castle_private.h new file mode 100644 index 0000000..99a914d --- /dev/null +++ b/castle_private.h @@ -0,0 +1,52 @@ +#ifndef __CASTLE_PRIVATE_H__ +#define __CASTLE_PRIVATE_H__ + +#include +#include + +#include "ring.h" +#include "list.h" + +#include "castle_public.h" + +DEFINE_RING_TYPES(castle, castle_request_t, castle_response_t); + +int castle_protocol_version(struct castle_front_connection *conn); + +struct castle_front_callback +{ + struct list_head list; + castle_callback callback; + void *data; + castle_interface_token_t token; +}; + +struct castle_front_connection +{ + int fd; /* tests rely on this being the first field */ + castle_sring_t *shared_ring; + castle_front_ring_t front_ring; + int next_call_id; + /* pointer to array of callback pointers, corresponding to requests on ring */ + + pthread_mutex_t free_mutex; + struct castle_front_callback *callbacks; + struct list_head free_callbacks; + + int outstanding_stateful_requests[CASTLE_STATEFUL_OPS]; + + pthread_t response_thread; + int response_thread_exit; + int response_thread_running; + + pthread_mutex_t ring_mutex; + pthread_cond_t ring_cond; + + /* pipe fds to wake up select in the response thread */ + int select_pipe[2]; + + FILE * debug_log; + int debug_values; +} PACKED; + +#endif /* __CASTLE_PRIVATE_H__ */ diff --git a/castle_public.h b/castle_public.h new file mode 100644 index 0000000..3f63241 --- /dev/null +++ b/castle_public.h @@ -0,0 +1,588 @@ +#ifndef __CASTLE_PUBLIC_H__ +#define __CASTLE_PUBLIC_H__ + +#include +#include +#ifndef __KERNEL__ +#include +#endif + +#define CASTLE_PROTOCOL_VERSION 7 + +#define PACKED __attribute__((packed)) + +#ifndef __KERNEL__ +#define PAGE_SIZE 4096 +#define PAGE_SHIFT 12 +/* These must be the same as castle.h in fs.hg */ +enum { + CVT_TYPE_INLINE = 0x10, + CVT_TYPE_ONDISK = 0x20, + CVT_TYPE_INVALID = 0x30, +}; +#endif + +typedef enum { + NO_FAULT, /* 0 */ + MERGE_FAULT, /* 1 */ + EXTENT_FAULT, /* 2 */ + FREESPACE_FAULT, /* 3 */ + REPLACE_FAULT, /* 4 */ + GET_FAULT, /* 5 */ + BIG_PUT_FAULT, /* 6 */ + BIG_GET_FAULT, /* 7 */ + CHECKPOINT_FAULT, /* 8 */ + 
CLAIM_FAULT, /* 9 */ + FS_INIT_FAULT, /*10 */ + FS_RESTORE_FAULT, /*11 */ + FINI_FAULT, /*12 */ + SLAVE_OOS_ERR, /*13 */ + REBUILD_FAULT1, /*14 Fault between extent remaps*/ + REBUILD_FAULT2, /*15 Fault in mid extent remap*/ +} c_fault_t; + +typedef enum { + BUILD_ID = 0, + LAST_ENV_VAR_ID, +} c_env_var_t; + +/** + * Trace providers. + */ +typedef enum { + TRACE_CACHE, /**< Cache events */ + TRACE_DA, /**< DA events */ + TRACE_DA_MERGE, /**< Merge events */ + TRACE_DA_MERGE_UNIT, /**< Merge unit events */ +} c_trc_prov_t; + +/** + * Event types. + */ +typedef enum { + TRACE_VALUE, /**< Value being reported */ + TRACE_MARK, /**< Event has occurred */ + TRACE_START, /**< Event has started */ + TRACE_END, /**< Event has ended */ +} c_trc_type_t; + +/** + * Cache trace variables. + */ +typedef enum { + TRACE_CACHE_CHECKPOINT_ID, /**< Checkpoint running. */ + TRACE_CACHE_DIRTY_PGS_ID, /**< Number of c2ps on the dirtylist. */ + TRACE_CACHE_CLEAN_PGS_ID, /**< Number of c2ps on the cleanlist. */ + TRACE_CACHE_FREE_PGS_ID, /**< Number of c2ps on the freelist. */ + TRACE_CACHE_RESERVE_PGS_ID, /**< Number of c2ps on the reserve freelist. */ + TRACE_CACHE_CLEAN_BLKS_ID, /**< Number of c2bs on the cleanlist. */ + TRACE_CACHE_FREE_BLKS_ID, /**< Number of c2bs on the freelist. */ + TRACE_CACHE_RESERVE_BLKS_ID, /**< Number of c2bs on the reserve freelist. */ + TRACE_CACHE_SOFTPIN_BLKS_ID, /**< Number of softpin c2bs in the cache. */ + TRACE_CACHE_BLOCK_VICTIMS_ID, /**< Number of c2bs evicted from the cache. */ + TRACE_CACHE_SOFTPIN_VICTIMS_ID, /**< Number of softpinned c2bs evicted from the cache. */ + TRACE_CACHE_READS_ID, /**< Number of reads this tick. */ + TRACE_CACHE_WRITES_ID, /**< Number of writes this tick. */ +} c_trc_cache_var_t; + +/** + * DA trace variables. + */ +typedef enum { + TRACE_DA_INSERTS_DISABLED_ID, /**< Whether inserts are enabled or not. */ + TRACE_DA_MERGE_ID, /**< Merge */ + TRACE_DA_MERGE_MODLIST_ITER_INIT_ID, /**< Modlist iter init */ + TRACE_DA_MERGE_UNIT_ID, /**< Merge unit */ + TRACE_DA_MERGE_UNIT_C2B_SYNC_WAIT_BT_NS_ID, + TRACE_DA_MERGE_UNIT_C2B_SYNC_WAIT_DATA_NS_ID, + TRACE_DA_MERGE_UNIT_GET_C2B_NS_ID, + TRACE_DA_MERGE_UNIT_MOBJ_COPY_NS_ID, +} c_trc_da_var_t; + +#define MERGE_START_FLAG (1U<<0) +#define MERGE_END_FLAG (1U<<1) + +/* Bump the magic version byte (LSB) when c_trc_evt_t changes. */ +#define CASTLE_TRACE_MAGIC 0xCAE5E10D +typedef struct castle_trace_event { + uint32_t magic; + struct timeval timestamp; + int cpu; /**< CPU ID that allocated structure. */ + c_trc_prov_t provider; /**< Event provider */ + c_trc_type_t type; /**< Event type */ + int var; /**< Event variable */ + uint64_t v1; + uint64_t v2; + uint64_t v3; + uint64_t v4; + uint64_t v5; +} c_trc_evt_t; + +typedef uint32_t transfer_id_t; +typedef uint32_t slave_uuid_t; +typedef uint32_t collection_id_t; +typedef uint32_t version_t; /**< Version ID type, unique across all Doubling Arrays. 
*/ +#define INVAL_VERSION ((version_t)-1) +#define VERSION_INVAL(_v) ((_v) == INVAL_VERSION) + +/* And our IOCTL code is: */ +#define CASTLE_CTRL_IOCTL_TYPE (0xCA) + +/* Subtypes for CASTLE_CTRL_ used for IOCTLs */ +#define CASTLE_CTRL_CLAIM 1 +#define CASTLE_CTRL_RELEASE 2 +#define CASTLE_CTRL_ATTACH 3 +#define CASTLE_CTRL_DETACH 4 +#define CASTLE_CTRL_CREATE 5 +#define CASTLE_CTRL_CLONE 6 +#define CASTLE_CTRL_SNAPSHOT 7 +#define CASTLE_CTRL_INIT 8 +#define CASTLE_CTRL_TRANSFER_CREATE 11 +#define CASTLE_CTRL_TRANSFER_DESTROY 12 +#define CASTLE_CTRL_COLLECTION_ATTACH 13 +#define CASTLE_CTRL_COLLECTION_DETACH 14 +#define CASTLE_CTRL_COLLECTION_SNAPSHOT 15 +#define CASTLE_CTRL_RESERVE_FOR_TRANSFER 16 +#define CASTLE_CTRL_VALID_STATS 17 +#define CASTLE_CTRL_INVALID_STATS 18 +#define CASTLE_CTRL_SET_TARGET 19 +#define CASTLE_CTRL_DESTROY 20 +#define CASTLE_CTRL_PROTOCOL_VERSION 21 +#define CASTLE_CTRL_FAULT 22 +#define CASTLE_CTRL_ENVIRONMENT_SET 23 +#define CASTLE_CTRL_TRACE_SETUP 24 +#define CASTLE_CTRL_TRACE_START 25 +#define CASTLE_CTRL_TRACE_STOP 26 +#define CASTLE_CTRL_TRACE_TEARDOWN 27 +#define CASTLE_CTRL_SLAVE_EVACUATE 28 +#define CASTLE_CTRL_THREAD_PRIORITY 29 +#define CASTLE_CTRL_SLAVE_SCAN 30 + +typedef struct castle_control_cmd_claim { + uint32_t dev; /* IN */ + int ret; /* OUT */ + slave_uuid_t id; /* OUT */ +} cctrl_cmd_claim_t; + +typedef struct castle_control_cmd_release { + slave_uuid_t id; /* IN */ + int ret; /* OUT */ +} cctrl_cmd_release_t; + +typedef struct castle_control_cmd_attach { + version_t version; /* IN */ + int ret; /* OUT */ + uint32_t dev; /* OUT */ +} cctrl_cmd_attach_t; + +typedef struct castle_control_cmd_detach { + uint32_t dev; /* IN */ + int ret; /* OUT */ +} cctrl_cmd_detach_t; + +typedef struct castle_control_cmd_snapshot { + uint32_t dev; /* IN */ + int ret; /* OUT */ + version_t version; /* OUT */ +} cctrl_cmd_snapshot_t; + +typedef struct castle_control_cmd_collection_attach { + version_t version; /* IN */ + const char *name; /* IN */ + size_t name_length; /* IN */ + int ret; /* OUT */ + collection_id_t collection; /* OUT */ +} cctrl_cmd_collection_attach_t; + +typedef struct castle_control_cmd_collection_detach { + collection_id_t collection; /* IN */ + int ret; /* OUT */ +} cctrl_cmd_collection_detach_t; + +typedef struct castle_control_cmd_collection_snapshot { + collection_id_t collection; /* IN */ + int ret; /* OUT */ + version_t version; /* OUT */ +} cctrl_cmd_collection_snapshot_t; + +typedef struct castle_control_cmd_create { + uint64_t size; /* IN */ + int ret; /* OUT */ + version_t id; /* OUT */ +} cctrl_cmd_create_t; + +enum { + CASTLE_DESTROY_TREE = 0, + CASTLE_DESTROY_VERSION = 1, +}; +typedef struct castle_control_cmd_destroy { + version_t version; /* IN */ + int flag; /* IN */ + int ret; /* OUT */ +} cctrl_cmd_destroy_t; + +typedef struct castle_control_cmd_clone { + version_t version; /* IN */ + int ret; /* OUT */ + version_t clone; /* OUT */ +} cctrl_cmd_clone_t; + +typedef struct castle_control_cmd_init { + int ret; /* OUT */ +} cctrl_cmd_init_t; + +typedef struct castle_control_cmd_transfer_create { + version_t version; /* IN */ + uint32_t direction; /* IN */ + int ret; /* OUT */ + transfer_id_t id; /* OUT */ +} cctrl_cmd_transfer_create_t; + +typedef struct castle_control_cmd_transfer_destroy { + transfer_id_t id; /* IN */ + int ret; /* OUT */ +} cctrl_cmd_transfer_destroy_t; + +typedef struct castle_control_cmd_protocol_version { + int ret; /* OUT */ + uint32_t version; /* OUT */ +} cctrl_cmd_protocol_version_t; + +typedef struct 
castle_control_cmd_environment_set { + c_env_var_t var_id; /* IN */ + const char *var_str; /* IN */ + size_t var_len; /* IN */ + int ret; /* OUT */ +} cctrl_cmd_environment_set_t; + +typedef struct castle_control_cmd_fault { + c_fault_t fault_id; /* IN */ + uint32_t fault_arg; /* IN */ + int ret; /* OUT */ +} cctrl_cmd_fault_t; + +typedef struct castle_control_cmd_trace_setup { + const char *dir_str; /* IN */ + size_t dir_len; /* IN */ + int ret; /* OUT */ +} cctrl_cmd_trace_setup_t; + +typedef struct castle_control_cmd_trace_start { + int ret; /* OUT */ +} cctrl_cmd_trace_start_t; + +typedef struct castle_control_cmd_trace_stop { + int ret; /* OUT */ +} cctrl_cmd_trace_stop_t; + +typedef struct castle_control_cmd_trace_teardown { + int ret; /* OUT */ +} cctrl_cmd_trace_teardown_t; + +typedef struct castle_control_slave_evacuate { + slave_uuid_t id; /* IN */ + uint32_t force; /* IN */ + int ret; /* OUT */ +} PACKED cctrl_cmd_slave_evacuate_t; + +typedef struct castle_control_slave_scan { + slave_uuid_t id; /* IN */ + int ret; /* OUT */ +} PACKED cctrl_cmd_slave_scan_t; + +typedef struct castle_control_cmd_thread_priority { + int nice_value; /* IN */ + int ret; /* OUT */ +} cctrl_cmd_thread_priority_t; + +typedef struct castle_control_ioctl { + uint16_t cmd; + union { + cctrl_cmd_claim_t claim; + cctrl_cmd_release_t release; + cctrl_cmd_init_t init; + + cctrl_cmd_attach_t attach; + cctrl_cmd_detach_t detach; + cctrl_cmd_snapshot_t snapshot; + + cctrl_cmd_collection_attach_t collection_attach; + cctrl_cmd_collection_detach_t collection_detach; + cctrl_cmd_collection_snapshot_t collection_snapshot; + + cctrl_cmd_create_t create; + cctrl_cmd_destroy_t destroy; + cctrl_cmd_clone_t clone; + + cctrl_cmd_transfer_create_t transfer_create; + cctrl_cmd_transfer_destroy_t transfer_destroy; + + cctrl_cmd_protocol_version_t protocol_version; + cctrl_cmd_environment_set_t environment_set; + + cctrl_cmd_fault_t fault; + + cctrl_cmd_trace_setup_t trace_setup; + cctrl_cmd_trace_start_t trace_start; + cctrl_cmd_trace_stop_t trace_stop; + cctrl_cmd_trace_teardown_t trace_teardown; + + cctrl_cmd_slave_evacuate_t slave_evacuate; + cctrl_cmd_slave_scan_t slave_scan; + + cctrl_cmd_thread_priority_t thread_priority; + }; +} cctrl_ioctl_t; + +/* IOCTL definitions. 
*/ +enum { + CASTLE_CTRL_CLAIM_IOCTL = + _IOWR(CASTLE_CTRL_IOCTL_TYPE, CASTLE_CTRL_CLAIM, cctrl_ioctl_t), + CASTLE_CTRL_RELEASE_IOCTL = + _IOWR(CASTLE_CTRL_IOCTL_TYPE, CASTLE_CTRL_RELEASE, cctrl_ioctl_t), + CASTLE_CTRL_ATTACH_IOCTL = + _IOWR(CASTLE_CTRL_IOCTL_TYPE, CASTLE_CTRL_ATTACH, cctrl_ioctl_t), + CASTLE_CTRL_DETACH_IOCTL = + _IOWR(CASTLE_CTRL_IOCTL_TYPE, CASTLE_CTRL_DETACH, cctrl_ioctl_t), + CASTLE_CTRL_CREATE_IOCTL = + _IOWR(CASTLE_CTRL_IOCTL_TYPE, CASTLE_CTRL_CREATE, cctrl_ioctl_t), + CASTLE_CTRL_CLONE_IOCTL = + _IOWR(CASTLE_CTRL_IOCTL_TYPE, CASTLE_CTRL_CLONE, cctrl_ioctl_t), + CASTLE_CTRL_SNAPSHOT_IOCTL = + _IOWR(CASTLE_CTRL_IOCTL_TYPE, CASTLE_CTRL_SNAPSHOT, cctrl_ioctl_t), + CASTLE_CTRL_INIT_IOCTL = + _IOWR(CASTLE_CTRL_IOCTL_TYPE, CASTLE_CTRL_INIT, cctrl_ioctl_t), + CASTLE_CTRL_TRANSFER_CREATE_IOCTL = + _IOWR(CASTLE_CTRL_IOCTL_TYPE, CASTLE_CTRL_TRANSFER_CREATE, cctrl_ioctl_t), + CASTLE_CTRL_TRANSFER_DESTROY_IOCTL = + _IOWR(CASTLE_CTRL_IOCTL_TYPE, CASTLE_CTRL_TRANSFER_DESTROY, cctrl_ioctl_t), + CASTLE_CTRL_COLLECTION_ATTACH_IOCTL = + _IOWR(CASTLE_CTRL_IOCTL_TYPE, CASTLE_CTRL_COLLECTION_ATTACH, cctrl_ioctl_t), + CASTLE_CTRL_COLLECTION_DETACH_IOCTL = + _IOWR(CASTLE_CTRL_IOCTL_TYPE, CASTLE_CTRL_COLLECTION_DETACH, cctrl_ioctl_t), + CASTLE_CTRL_COLLECTION_SNAPSHOT_IOCTL = + _IOWR(CASTLE_CTRL_IOCTL_TYPE, CASTLE_CTRL_COLLECTION_SNAPSHOT, cctrl_ioctl_t), + CASTLE_CTRL_RESERVE_FOR_TRANSFER_IOCTL = + _IOWR(CASTLE_CTRL_IOCTL_TYPE, CASTLE_CTRL_RESERVE_FOR_TRANSFER, cctrl_ioctl_t), + CASTLE_CTRL_VALID_STATS_IOCTL = + _IOWR(CASTLE_CTRL_IOCTL_TYPE, CASTLE_CTRL_VALID_STATS, cctrl_ioctl_t), + CASTLE_CTRL_INVALID_STATS_IOCTL = + _IOWR(CASTLE_CTRL_IOCTL_TYPE, CASTLE_CTRL_INVALID_STATS, cctrl_ioctl_t), + CASTLE_CTRL_SET_TARGET_IOCTL = + _IOWR(CASTLE_CTRL_IOCTL_TYPE, CASTLE_CTRL_SET_TARGET, cctrl_ioctl_t), + CASTLE_CTRL_DESTROY_IOCTL = + _IOWR(CASTLE_CTRL_IOCTL_TYPE, CASTLE_CTRL_DESTROY, cctrl_ioctl_t), + CASTLE_CTRL_PROTOCOL_VERSION_IOCTL = + _IOWR(CASTLE_CTRL_IOCTL_TYPE, CASTLE_CTRL_PROTOCOL_VERSION, cctrl_ioctl_t), + CASTLE_CTRL_ENVIRONMENT_SET_IOCTL = + _IOWR(CASTLE_CTRL_IOCTL_TYPE, CASTLE_CTRL_ENVIRONMENT_SET, cctrl_ioctl_t), + CASTLE_CTRL_FAULT_IOCTL = + _IOWR(CASTLE_CTRL_IOCTL_TYPE, CASTLE_CTRL_FAULT, cctrl_ioctl_t), + CASTLE_CTRL_TRACE_SETUP_IOCTL = + _IOWR(CASTLE_CTRL_IOCTL_TYPE, CASTLE_CTRL_TRACE_SETUP, cctrl_ioctl_t), + CASTLE_CTRL_TRACE_START_IOCTL = + _IOWR(CASTLE_CTRL_IOCTL_TYPE, CASTLE_CTRL_TRACE_START, cctrl_ioctl_t), + CASTLE_CTRL_TRACE_STOP_IOCTL = + _IOWR(CASTLE_CTRL_IOCTL_TYPE, CASTLE_CTRL_TRACE_STOP, cctrl_ioctl_t), + CASTLE_CTRL_TRACE_TEARDOWN_IOCTL = + _IOWR(CASTLE_CTRL_IOCTL_TYPE, CASTLE_CTRL_TRACE_TEARDOWN, cctrl_ioctl_t), + CASTLE_CTRL_SLAVE_EVACUATE_IOCTL = + _IOWR(CASTLE_CTRL_IOCTL_TYPE, CASTLE_CTRL_SLAVE_EVACUATE, cctrl_ioctl_t), + CASTLE_CTRL_THREAD_PRIORITY_IOCTL = + _IOWR(CASTLE_CTRL_IOCTL_TYPE, CASTLE_CTRL_THREAD_PRIORITY, cctrl_ioctl_t), + CASTLE_CTRL_SLAVE_SCAN_IOCTL = + _IOWR(CASTLE_CTRL_IOCTL_TYPE, CASTLE_CTRL_SLAVE_SCAN, cctrl_ioctl_t), +}; + +/* + * Variable length key, for example used by the btree + */ + +typedef struct castle_var_length_key { + uint32_t length; + uint8_t key[]; +} PACKED c_vl_key_t; + +typedef struct castle_var_length_object_key { + uint32_t nr_dims; + c_vl_key_t *dims[]; +} PACKED c_vl_okey_t; + +#define CASTLE_RING_PAGES (16) /**< 64 requests/page. */ +#define CASTLE_RING_SIZE (CASTLE_RING_PAGES << PAGE_SHIFT) /**< Must be ^2 or things break. 
*/ + +#define CASTLE_STATEFUL_OPS 512 + +#define CASTLE_IOCTL_POKE_RING 2 +#define CASTLE_IOCTL_WAIT 3 + +#define CASTLE_RING_REPLACE 1 +#define CASTLE_RING_BIG_PUT 2 +#define CASTLE_RING_PUT_CHUNK 3 +#define CASTLE_RING_GET 4 +#define CASTLE_RING_BIG_GET 5 +#define CASTLE_RING_GET_CHUNK 6 +#define CASTLE_RING_ITER_START 7 +#define CASTLE_RING_ITER_NEXT 8 +#define CASTLE_RING_ITER_FINISH 9 +#define CASTLE_RING_ITER_SKIP 10 +#define CASTLE_RING_REMOVE 11 + +typedef uint32_t castle_interface_token_t; + +typedef struct castle_request_replace { + collection_id_t collection_id; + c_vl_okey_t *key_ptr; + uint32_t key_len; + void *value_ptr; + uint32_t value_len; +} castle_request_replace_t; + +typedef struct castle_request_remove { + collection_id_t collection_id; + c_vl_okey_t *key_ptr; + uint32_t key_len; +} castle_request_remove_t; + +typedef struct castle_request_get { + collection_id_t collection_id; + c_vl_okey_t *key_ptr; + uint32_t key_len; + void *value_ptr; /* where to put the result */ + uint32_t value_len; +} castle_request_get_t; + +typedef struct castle_request_iter_start { + collection_id_t collection_id; + c_vl_okey_t *start_key_ptr; + uint32_t start_key_len; + c_vl_okey_t *end_key_ptr; + uint32_t end_key_len; + uint64_t flags; +} castle_request_iter_start_t; + +#define CASTLE_RING_ITER_FLAG_NONE 0x0 +#define CASTLE_RING_ITER_FLAG_NO_VALUES 0x1 + +typedef struct castle_request_iter_next { + castle_interface_token_t token; + void *buffer_ptr; + uint32_t buffer_len; +} castle_request_iter_next_t; + +typedef struct castle_request_iter_finish { + castle_interface_token_t token; +} castle_request_iter_finish_t; + +typedef struct castle_request_big_get { + collection_id_t collection_id; + c_vl_okey_t *key_ptr; + uint32_t key_len; +} castle_request_big_get_t; + +typedef struct castle_request_get_chunk { + castle_interface_token_t token; + void *buffer_ptr; + uint32_t buffer_len; +} castle_request_get_chunk_t; + +typedef struct castle_request_big_put { + collection_id_t collection_id; + c_vl_okey_t *key_ptr; + uint32_t key_len; + uint64_t value_len; +} castle_request_big_put_t; + +typedef struct castle_request_put_chunk { + castle_interface_token_t token; + void *buffer_ptr; + uint32_t buffer_len; +} castle_request_put_chunk_t; + +typedef struct castle_request { + uint32_t call_id; + uint32_t tag; + union { + castle_request_replace_t replace; + castle_request_remove_t remove; + castle_request_get_t get; + + castle_request_big_get_t big_get; + castle_request_get_chunk_t get_chunk; + castle_request_big_put_t big_put; + castle_request_put_chunk_t put_chunk; + + castle_request_iter_start_t iter_start; + castle_request_iter_next_t iter_next; + castle_request_iter_finish_t iter_finish; + }; +} castle_request_t; + +typedef struct castle_response { + uint32_t call_id; + uint32_t err; + uint64_t length; + castle_interface_token_t token; +} castle_response_t; + +struct castle_iter_val { + uint64_t length; + uint8_t type; + union { + uint8_t *val; + collection_id_t collection_id; + }; +}; + +struct castle_key_value_list { + struct castle_key_value_list *next; + c_vl_okey_t *key; + struct castle_iter_val *val; +}; + + +#define CASTLE_SLAVE_MAGIC1 (0x02061985) +#define CASTLE_SLAVE_MAGIC2 (0x16071983) +#define CASTLE_SLAVE_MAGIC3 (0x16061981) +#define CASTLE_SLAVE_VERSION (13) + +#define CASTLE_SLAVE_TARGET (0x00000001) +#define CASTLE_SLAVE_SPINNING (0x00000002) +#define CASTLE_SLAVE_NEWDEV (0x00000004) +#define CASTLE_SLAVE_SSD (0x00000008) + +struct castle_slave_superblock_public { + /* align: 8 
*/ + /* offset: 0 */ uint32_t magic1; + /* 4 */ uint32_t magic2; + /* 8 */ uint32_t magic3; + /* 12 */ uint32_t version; /* Super chunk format version */ + /* 16 */ uint32_t uuid; + /* 20 */ uint32_t used; + /* 24 */ uint64_t size; /* In 4K blocks. */ + /* 32 */ uint32_t flags; + /* 36 */ uint32_t checksum; + /* 40 */ uint8_t _unused[88]; + /* 128 */ +} PACKED; + +#define CASTLE_FS_MAGIC1 (0x19731121) +#define CASTLE_FS_MAGIC2 (0x19880624) +#define CASTLE_FS_MAGIC3 (0x19821120) +#define CASTLE_FS_VERSION (1) + +struct castle_fs_superblock_public { + /* align: 4 */ + /* offset: 0 */ uint32_t magic1; + /* 4 */ uint32_t magic2; + /* 8 */ uint32_t magic3; + /* 12 */ uint32_t uuid; + /* 16 */ uint32_t version; /* Super chunk format version */ + /* 20 */ uint32_t salt; + /* 24 */ uint32_t peper; + /* 28 */ uint32_t checksum; + /* 32 */ uint8_t _unused[96]; + /* 128 */ +} PACKED; + +#endif /* __CASTLE_PUBLIC_H__ */ diff --git a/castle_utils.c b/castle_utils.c new file mode 100644 index 0000000..e120194 --- /dev/null +++ b/castle_utils.c @@ -0,0 +1,262 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "castle.h" +#include "castle_private.h" + +/* get next castle_buffer* in free-list */ +#define BUF_NEXT(x) (*(castle_buffer**)((x)->buf)) +/* min size of pooled buffer */ +#define MIN_SIZE sizeof(castle_buffer*) + +#define max(a, b) ((a)>(b)?(a):(b)) + +typedef struct pool_node +{ + size_t size; + castle_buffer* head; +} pool_node; + +struct s_castle_shared_pool +{ + pool_node* free; + size_t nsizes; + + pthread_mutex_t* lock; + pthread_cond_t* sig; + castle_connection* conn; +}; + +static int node_cmp(const void* a, const void* b) +{ + pool_node* l = (pool_node*)a, *r = (pool_node*)b; + return l->size < r->size ? -1 : l->size > r->size ? 
1 : 0; +} + +static pool_node* find_size_locked(castle_shared_pool* pool, size_t size, bool nonempty) +{ + size_t first = 0; + int last = pool->nsizes - 1; + if(pool->free[last].size < size) + return NULL; + + /* binary search for the least upper bound which contains the requested size */ + while(last >= (signed)first) + { + size_t test = first + (last-first)/2; + if(pool->free[test].size > size) + last = test - 1; + else if(pool->free[test].size < size) + first = test + 1; + else + { + first = test; + break; + } + } + /* increase size until we find a non-empty free-list */ + if(nonempty) + while(first < pool->nsizes && !pool->free[first].head) ++first; + + if(first < pool->nsizes) + return &pool->free[first]; + + /* all sufficiently large buffers are in use */ + return NULL; +} + +int castle_shared_pool_lease(castle_shared_pool* pool, castle_buffer** buffer, unsigned long size) +{ + if(!pool || !buffer || *buffer || size > pool->free[pool->nsizes-1].size) + return -EINVAL; + + pool_node* node = NULL; + + pthread_mutex_lock(pool->lock); + while(!(node = find_size_locked(pool, size, true))) + pthread_cond_wait(pool->sig, pool->lock); + + castle_buffer* head = node->head; + node->head = BUF_NEXT(head); + BUF_NEXT(head) = NULL; + + pthread_mutex_unlock(pool->lock); + *buffer = head; + return 0; +} + +int castle_shared_pool_release(castle_shared_pool* pool, castle_buffer* buffer, __attribute__((unused)) unsigned long size) +{ + if(!pool || !buffer) + return -EINVAL; + + pthread_mutex_lock(pool->lock); + + pool_node* node = find_size_locked(pool, buffer->buflen, false); + BUF_NEXT(buffer) = node->head; + node->head = buffer; + + pthread_cond_signal(pool->sig); + pthread_mutex_unlock(pool->lock); + return 0; +} + +int castle_shared_pool_create(castle_connection* conn, size_t nsizes, size_t* sizes, size_t* quantities, castle_shared_pool** pool_out) +{ + if(!conn || !nsizes || !sizes || !quantities || !pool_out || *pool_out) + return -EINVAL; + + castle_shared_pool* pool = (castle_shared_pool*)calloc(1, sizeof(*pool)); + pool->lock = (pthread_mutex_t*)calloc(1, sizeof(*pool->lock)); + pool->sig = (pthread_cond_t*)calloc(1, sizeof(*pool->sig)); + + pthread_mutex_init(pool->lock, NULL); + pthread_cond_init(pool->sig, NULL); + pool->conn = conn; + + pool->free = (pool_node*)calloc(nsizes, sizeof(*pool->free)); + pool->nsizes = nsizes; + + for(size_t i = 0; i < nsizes; ++i) + { + size_t size = max(sizes[i], MIN_SIZE); + pool->free[i].size = size; + + for(size_t n = 0; n < quantities[i]; ++n) + { + castle_buffer* node = NULL; + int ret = castle_shared_buffer_allocate(conn, &node, size); + if (ret) + { + castle_shared_pool_destroy(pool); + return -ENOMEM; + } + BUF_NEXT(node) = pool->free[i].head; + pool->free[i].head = node; + } + } + + qsort(pool->free, nsizes, sizeof(pool_node), node_cmp); + + *pool_out = pool; + return 0; +} + +int castle_shared_pool_destroy(castle_shared_pool* pool) +{ + if(!pool) + return 0; + + for(size_t i = 0; i < pool->nsizes; ++i) + { + while(pool->free[i].head) + { + castle_buffer* head = pool->free[i].head; + pool->free[i].head = BUF_NEXT(head); + castle_shared_buffer_release(pool->conn, head); + } + } + + free(pool->free); + + pthread_cond_destroy(pool->sig); + free(pool->sig); + + pthread_mutex_destroy(pool->lock); + free(pool->lock); + + free(pool); + return 0; +} + +static int dir_exists(const char* path) +{ + struct stat attr; + if(stat(path, &attr)) + return 0; + return S_ISDIR(attr.st_mode); +} + +static long filesize(FILE* file) +{ + if(!file) + return -EINVAL; + 
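    /*
     * Record the current offset, seek to the end to learn the length, then
     * restore the original position so the caller's stream state is unchanged.
     * For the sysfs attribute files read by the caller below, the value is
     * best treated as an upper bound on how many bytes fread() will return,
     * not an exact count.
     */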
long cur = ftell(file); + fseek(file, 0, SEEK_END); + long size = ftell(file); + fseek(file, cur, SEEK_SET); + return size; +} + +static int castle_collection_name_get(const char* coll_path, char** name) +{ + if(!name || *name) + return -EINVAL; + int ret = 0; + FILE* file = fopen(coll_path, "r"); + if(!file) + return -errno; + long size = filesize(file); + *name = (char*)calloc(1, size + 1); + if(!*name) + { + ret = -ENOMEM; + goto out1; + } + char* p = *name; + while(!feof(file) && p < *name + size) + p += fread(p, *name + size - p, 1, file); + + p = *name; + while(*p != '\n' && *p++); + *p = '\0'; + +out1: fclose(file); + return ret; +} + +static const char* collections_path = "/sys/fs/castle-fs/collections"; + +int castle_collection_find(const char* name, castle_collection* coll) +{ + if(!name || !*name || !coll) + return -EINVAL; + int ret = 0; + DIR* dir = opendir(collections_path); + if(!dir) + { + ret = -errno; + goto out1; + } + struct dirent* entry; + char* cur_name = NULL; + while((entry = readdir(dir))) + { + char path[PATH_MAX] = {0}; + snprintf(path, PATH_MAX, "%s/%s", collections_path, entry->d_name); + if(dir_exists(path)) + { + snprintf(path, PATH_MAX, "%s/%s/name", collections_path, entry->d_name); + castle_collection_name_get(path, &cur_name); + if(cur_name && 0==strcmp(cur_name, name)) + { + *coll = strtol(entry->d_name, NULL, 16); + goto out2; + } + free(cur_name); cur_name = NULL; + } + } + + ret = -ENOENT; + +out2: free(cur_name); +out1: closedir(dir); + return ret; +} diff --git a/list.h b/list.h new file mode 100644 index 0000000..bd7b6f1 --- /dev/null +++ b/list.h @@ -0,0 +1,979 @@ +#ifndef _LINUX_LIST_H +#define _LINUX_LIST_H + +/* + * Simple doubly linked list implementation. + * + * Some of the internal functions ("__xxx") are useful when + * manipulating whole lists rather than single entries, as + * sometimes we already know the next/prev entries and we can + * generate better code by using them directly rather than + * using the generic single-entry routines. + */ + +struct list_head { + struct list_head *next, *prev; +}; + +#define LIST_HEAD_INIT(name) { &(name), &(name) } + +#define LIST_HEAD(name) \ + struct list_head name = LIST_HEAD_INIT(name) + +static inline void INIT_LIST_HEAD(struct list_head *list) +{ + list->next = list; + list->prev = list; +} + +/* + * Insert a new entry between two known consecutive entries. + * + * This is only for internal list manipulation where we know + * the prev/next entries already! + */ +#ifndef CONFIG_DEBUG_LIST +static inline void __list_add(struct list_head *new, + struct list_head *prev, + struct list_head *next) +{ + next->prev = new; + new->next = next; + xen_wmb(); + new->prev = prev; + prev->next = new; +} +#else +extern void __list_add(struct list_head *new, + struct list_head *prev, + struct list_head *next); +#endif + +/** + * list_add - add a new entry + * @new: new entry to be added + * @head: list head to add it after + * + * Insert a new entry after the specified head. + * This is good for implementing stacks. + */ +#ifndef CONFIG_DEBUG_LIST +static inline void list_add(struct list_head *new, struct list_head *head) +{ + __list_add(new, head, head->next); +} +#else +extern void list_add(struct list_head *new, struct list_head *head); +#endif + + +/** + * list_add_tail - add a new entry + * @new: new entry to be added + * @head: list head to add it before + * + * Insert a new entry before the specified head. + * This is useful for implementing queues. 
+ */ +static inline void list_add_tail(struct list_head *new, struct list_head *head) +{ + __list_add(new, head->prev, head); +} + +/* + * Insert a new entry between two known consecutive entries. + * + * This is only for internal list manipulation where we know + * the prev/next entries already! + */ +static inline void __list_add_rcu(struct list_head * new, + struct list_head * prev, struct list_head * next) +{ + new->next = next; + new->prev = prev; + next->prev = new; + prev->next = new; +} + +/** + * list_add_rcu - add a new entry to rcu-protected list + * @new: new entry to be added + * @head: list head to add it after + * + * Insert a new entry after the specified head. + * This is good for implementing stacks. + * + * The caller must take whatever precautions are necessary + * (such as holding appropriate locks) to avoid racing + * with another list-mutation primitive, such as list_add_rcu() + * or list_del_rcu(), running on this same list. + * However, it is perfectly legal to run concurrently with + * the _rcu list-traversal primitives, such as + * list_for_each_entry_rcu(). + */ +static inline void list_add_rcu(struct list_head *new, struct list_head *head) +{ + __list_add_rcu(new, head, head->next); +} + +/** + * list_add_tail_rcu - add a new entry to rcu-protected list + * @new: new entry to be added + * @head: list head to add it before + * + * Insert a new entry before the specified head. + * This is useful for implementing queues. + * + * The caller must take whatever precautions are necessary + * (such as holding appropriate locks) to avoid racing + * with another list-mutation primitive, such as list_add_tail_rcu() + * or list_del_rcu(), running on this same list. + * However, it is perfectly legal to run concurrently with + * the _rcu list-traversal primitives, such as + * list_for_each_entry_rcu(). + */ +static inline void list_add_tail_rcu(struct list_head *new, + struct list_head *head) +{ + __list_add_rcu(new, head->prev, head); +} + +/* + * Delete a list entry by making the prev/next entries + * point to each other. + * + * This is only for internal list manipulation where we know + * the prev/next entries already! + */ +static inline void __list_del(struct list_head * prev, struct list_head * next) +{ + next->prev = prev; + prev->next = next; +} + +/** + * list_del - deletes entry from list. + * @entry: the element to delete from the list. + * Note: list_empty on entry does not return true after this, the entry is + * in an undefined state. + */ +#ifndef CONFIG_DEBUG_LIST +static inline void list_del(struct list_head *entry) +{ + __list_del(entry->prev, entry->next); + entry->next = NULL; + entry->prev = NULL; +} +#else +extern void list_del(struct list_head *entry); +#endif + +/** + * list_del_rcu - deletes entry from list without re-initialization + * @entry: the element to delete from the list. + * + * Note: list_empty on entry does not return true after this, + * the entry is in an undefined state. It is useful for RCU based + * lockfree traversal. + * + * In particular, it means that we can not poison the forward + * pointers that may still be used for walking the list. + * + * The caller must take whatever precautions are necessary + * (such as holding appropriate locks) to avoid racing + * with another list-mutation primitive, such as list_del_rcu() + * or list_add_rcu(), running on this same list. + * However, it is perfectly legal to run concurrently with + * the _rcu list-traversal primitives, such as + * list_for_each_entry_rcu(). 
+ * + * Note that the caller is not permitted to immediately free + * the newly deleted entry. Instead, either synchronize_rcu() + * or call_rcu() must be used to defer freeing until an RCU + * grace period has elapsed. + */ +static inline void list_del_rcu(struct list_head *entry) +{ + __list_del(entry->prev, entry->next); + entry->prev = NULL; +} + +/** + * list_replace - replace old entry by new one + * @old : the element to be replaced + * @new : the new element to insert + * Note: if 'old' was empty, it will be overwritten. + */ +static inline void list_replace(struct list_head *old, + struct list_head *new) +{ + new->next = old->next; + new->next->prev = new; + new->prev = old->prev; + new->prev->next = new; +} + +static inline void list_replace_init(struct list_head *old, + struct list_head *new) +{ + list_replace(old, new); + INIT_LIST_HEAD(old); +} + +/* + * list_replace_rcu - replace old entry by new one + * @old : the element to be replaced + * @new : the new element to insert + * + * The old entry will be replaced with the new entry atomically. + * Note: 'old' should not be empty. + */ +static inline void list_replace_rcu(struct list_head *old, + struct list_head *new) +{ + new->next = old->next; + new->prev = old->prev; + new->next->prev = new; + new->prev->next = new; + old->prev = NULL; +} + +/** + * list_del_init - deletes entry from list and reinitialize it. + * @entry: the element to delete from the list. + */ +static inline void list_del_init(struct list_head *entry) +{ + __list_del(entry->prev, entry->next); + INIT_LIST_HEAD(entry); +} + +/** + * list_move - delete from one list and add as another's head + * @list: the entry to move + * @head: the head that will precede our entry + */ +static inline void list_move(struct list_head *list, struct list_head *head) +{ + __list_del(list->prev, list->next); + list_add(list, head); +} + +/** + * list_move_tail - delete from one list and add as another's tail + * @list: the entry to move + * @head: the head that will follow our entry + */ +static inline void list_move_tail(struct list_head *list, + struct list_head *head) +{ + __list_del(list->prev, list->next); + list_add_tail(list, head); +} + +/** + * list_is_last - tests whether @list is the last entry in list @head + * @list: the entry to test + * @head: the head of the list + */ +static inline int list_is_last(const struct list_head *list, + const struct list_head *head) +{ + return list->next == head; +} + +/** + * list_empty - tests whether a list is empty + * @head: the list to test. + */ +static inline int list_empty(const struct list_head *head) +{ + return head->next == head; +} + +/** + * list_empty_careful - tests whether a list is empty and not being modified + * @head: the list to test + * + * Description: + * tests whether a list is empty _and_ checks that no other CPU might be + * in the process of modifying either member (next or prev) + * + * NOTE: using list_empty_careful() without synchronization + * can only be safe if the only activity that can happen + * to the list entry is list_del_init(). Eg. it cannot be used + * if another CPU could re-list_add() it. + */ +static inline int list_empty_careful(const struct list_head *head) +{ + struct list_head *next = head->next; + return (next == head) && (next == head->prev); +} + +/** + * list_is_singular - tests whether a list has just one entry. + * @head: the list to test. 
+ */ +static inline int list_is_singular(const struct list_head *head) +{ + return !list_empty(head) && (head->next == head->prev); +} + +static inline void __list_splice(struct list_head *list, + struct list_head *head) +{ + struct list_head *first = list->next; + struct list_head *last = list->prev; + struct list_head *at = head->next; + + first->prev = head; + head->next = first; + + last->next = at; + at->prev = last; +} + +/** + * list_splice - join two lists + * @list: the new list to add. + * @head: the place to add it in the first list. + */ +static inline void list_splice(struct list_head *list, struct list_head *head) +{ + if (!list_empty(list)) + __list_splice(list, head); +} + +/** + * list_splice_init - join two lists and reinitialise the emptied list. + * @list: the new list to add. + * @head: the place to add it in the first list. + * + * The list at @list is reinitialised + */ +static inline void list_splice_init(struct list_head *list, + struct list_head *head) +{ + if (!list_empty(list)) { + __list_splice(list, head); + INIT_LIST_HEAD(list); + } +} + +/** + * list_splice_init_rcu - splice an RCU-protected list into an existing list. + * @list: the RCU-protected list to splice + * @head: the place in the list to splice the first list into + * @sync: function to sync: synchronize_rcu(), synchronize_sched(), ... + * + * @head can be RCU-read traversed concurrently with this function. + * + * Note that this function blocks. + * + * Important note: the caller must take whatever action is necessary to + * prevent any other updates to @head. In principle, it is possible + * to modify the list as soon as sync() begins execution. + * If this sort of thing becomes necessary, an alternative version + * based on call_rcu() could be created. But only if -really- + * needed -- there is no shortage of RCU API members. + */ +static inline void list_splice_init_rcu(struct list_head *list, + struct list_head *head, + void (*sync_func)(void)) +{ + struct list_head *first = list->next; + struct list_head *last = list->prev; + struct list_head *at = head->next; + + if (list_empty(head)) + return; + + /* "first" and "last" tracking list, so initialize it. */ + + INIT_LIST_HEAD(list); + + /* + * At this point, the list body still points to the source list. + * Wait for any readers to finish using the list before splicing + * the list body into the new list. Any new readers will see + * an empty list. + */ + + sync_func(); + + /* + * Readers are finished with the source list, so perform splice. + * The order is important if the new list is global and accessible + * to concurrent RCU readers. Note that RCU readers are not + * permitted to traverse the prev pointers without excluding + * this function. + */ + + last->next = at; + + head->next = first; + first->prev = head; + at->prev = last; +} + +#define offsetof(TYPE, MEMBER) ((size_t) &((TYPE *)0)->MEMBER) +/** + * Casts a member of a structure out to the containing structure + * @param ptr the pointer to the member. + * @param type the type of the container struct this is embedded in. + * @param member the name of the member within the struct. + * + */ +#define container_of(ptr, type, member) ({ \ + const typeof( ((type *)0)->member ) *__mptr = (ptr); \ + (type *)( (char *)__mptr - offsetof(type,member) );}) + +/** + * list_entry - get the struct for this entry + * @ptr: the &struct list_head pointer. + * @type: the type of the struct this is embedded in. + * @member: the name of the list_struct within the struct. 
+ */
+#define list_entry(ptr, type, member) \
+        container_of(ptr, type, member)
+
+/**
+ * list_first_entry - get the first element from a list
+ * @ptr: the list head to take the element from.
+ * @type: the type of the struct this is embedded in.
+ * @member: the name of the list_struct within the struct.
+ *
+ * Note, that list is expected to be not empty.
+ */
+#define list_first_entry(ptr, type, member) \
+        list_entry((ptr)->next, type, member)
+
+/**
+ * list_for_each - iterate over a list
+ * @pos: the &struct list_head to use as a loop cursor.
+ * @head: the head for your list.
+ */
+#define list_for_each(pos, head) \
+        for (pos = (head)->next; pos != (head); \
+                pos = pos->next)
+
+/**
+ * __list_for_each - iterate over a list
+ * @pos: the &struct list_head to use as a loop cursor.
+ * @head: the head for your list.
+ *
+ * This variant differs from list_for_each() in that it's the
+ * simplest possible list iteration code, no prefetching is done.
+ * Use this for code that knows the list to be very short (empty
+ * or 1 entry) most of the time.
+ */
+#define __list_for_each(pos, head) \
+        for (pos = (head)->next; pos != (head); pos = pos->next)
+
+/**
+ * list_for_each_prev - iterate over a list backwards
+ * @pos: the &struct list_head to use as a loop cursor.
+ * @head: the head for your list.
+ */
+#define list_for_each_prev(pos, head) \
+        for (pos = (head)->prev; pos != (head); \
+                pos = pos->prev)
+
+/**
+ * list_for_each_safe - iterate over a list safe against removal of list entry
+ * @pos: the &struct list_head to use as a loop cursor.
+ * @n: another &struct list_head to use as temporary storage
+ * @head: the head for your list.
+ */
+#define list_for_each_safe(pos, n, head) \
+        for (pos = (head)->next, n = pos->next; pos != (head); \
+                pos = n, n = pos->next)
+
+/**
+ * list_for_each_entry - iterate over list of given type
+ * @pos: the type * to use as a loop cursor.
+ * @head: the head for your list.
+ * @member: the name of the list_struct within the struct.
+ */
+#define list_for_each_entry(pos, head, member) \
+        for (pos = list_entry((head)->next, typeof(*pos), member); \
+                &pos->member != (head); \
+                pos = list_entry(pos->member.next, typeof(*pos), member))
+
+/**
+ * list_for_each_entry_reverse - iterate backwards over list of given type.
+ * @pos: the type * to use as a loop cursor.
+ * @head: the head for your list.
+ * @member: the name of the list_struct within the struct.
+ */
+#define list_for_each_entry_reverse(pos, head, member) \
+        for (pos = list_entry((head)->prev, typeof(*pos), member); \
+                &pos->member != (head); \
+                pos = list_entry(pos->member.prev, typeof(*pos), member))
+
+/**
+ * list_prepare_entry - prepare a pos entry for use in list_for_each_entry_continue
+ * @pos: the type * to use as a start point
+ * @head: the head of the list
+ * @member: the name of the list_struct within the struct.
+ *
+ * Prepares a pos entry for use as a start point in list_for_each_entry_continue.
+ */
+#define list_prepare_entry(pos, head, member) \
+        ((pos) ? : list_entry(head, typeof(*pos), member))
+
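/*
 * Illustrative sketch, not part of the original header: how an element type
 * embedding a struct list_head is typically queued and walked with the macros
 * above.  The struct and function names here are hypothetical; libcastle's own
 * use of this pattern is struct castle_front_callback in castle_private.h.
 */
#if 0
struct example_item {
    struct list_head list;
    int id;
};

static void example_usage(void)
{
    LIST_HEAD(items);                      /* empty list: head points at itself */
    struct example_item a = { .id = 1 }, b = { .id = 2 };
    struct example_item *pos, *tmp;

    list_add_tail(&a.list, &items);        /* FIFO order: a first, then b */
    list_add_tail(&b.list, &items);

    /* The _safe variant allows list_del() on the current entry mid-walk. */
    list_for_each_entry_safe(pos, tmp, &items, list) {
        if (pos->id == 1)
            list_del(&pos->list);
    }
}
#endif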
+/**
+ * list_for_each_entry_continue - continue iteration over list of given type
+ * @pos: the type * to use as a loop cursor.
+ * @head: the head for your list.
+ * @member: the name of the list_struct within the struct.
+ *
+ * Continue to iterate over list of given type, continuing after
+ * the current position.
+ */
+#define list_for_each_entry_continue(pos, head, member) \
+        for (pos = list_entry(pos->member.next, typeof(*pos), member); \
+                &pos->member != (head); \
+                pos = list_entry(pos->member.next, typeof(*pos), member))
+
+/**
+ * list_for_each_entry_continue_reverse - iterate backwards from the given point
+ * @pos: the type * to use as a loop cursor.
+ * @head: the head for your list.
+ * @member: the name of the list_struct within the struct.
+ *
+ * Start to iterate over list of given type backwards, continuing after
+ * the current position.
+ */
+#define list_for_each_entry_continue_reverse(pos, head, member) \
+        for (pos = list_entry(pos->member.prev, typeof(*pos), member); \
+                &pos->member != (head); \
+                pos = list_entry(pos->member.prev, typeof(*pos), member))
+
+/**
+ * list_for_each_entry_from - iterate over list of given type from the current point
+ * @pos: the type * to use as a loop cursor.
+ * @head: the head for your list.
+ * @member: the name of the list_struct within the struct.
+ *
+ * Iterate over list of given type, continuing from current position.
+ */
+#define list_for_each_entry_from(pos, head, member) \
+        for (; &pos->member != (head); \
+                pos = list_entry(pos->member.next, typeof(*pos), member))
+
+/**
+ * list_for_each_entry_safe - iterate over list of given type safe against removal of list entry
+ * @pos: the type * to use as a loop cursor.
+ * @n: another type * to use as temporary storage
+ * @head: the head for your list.
+ * @member: the name of the list_struct within the struct.
+ */
+#define list_for_each_entry_safe(pos, n, head, member) \
+        for (pos = list_entry((head)->next, typeof(*pos), member), \
+                n = list_entry(pos->member.next, typeof(*pos), member); \
+                &pos->member != (head); \
+                pos = n, n = list_entry(n->member.next, typeof(*n), member))
+
+/**
+ * list_for_each_entry_safe_continue
+ * @pos: the type * to use as a loop cursor.
+ * @n: another type * to use as temporary storage
+ * @head: the head for your list.
+ * @member: the name of the list_struct within the struct.
+ *
+ * Iterate over list of given type, continuing after current point,
+ * safe against removal of list entry.
+ */
+#define list_for_each_entry_safe_continue(pos, n, head, member) \
+        for (pos = list_entry(pos->member.next, typeof(*pos), member), \
+                n = list_entry(pos->member.next, typeof(*pos), member); \
+                &pos->member != (head); \
+                pos = n, n = list_entry(n->member.next, typeof(*n), member))
+
+/**
+ * list_for_each_entry_safe_from
+ * @pos: the type * to use as a loop cursor.
+ * @n: another type * to use as temporary storage
+ * @head: the head for your list.
+ * @member: the name of the list_struct within the struct.
+ *
+ * Iterate over list of given type from current point, safe against
+ * removal of list entry.
+ */
+#define list_for_each_entry_safe_from(pos, n, head, member) \
+        for (n = list_entry(pos->member.next, typeof(*pos), member); \
+                &pos->member != (head); \
+                pos = n, n = list_entry(n->member.next, typeof(*n), member))
+
+/**
+ * list_for_each_entry_safe_reverse
+ * @pos: the type * to use as a loop cursor.
+ * @n: another type * to use as temporary storage
+ * @head: the head for your list.
+ * @member: the name of the list_struct within the struct.
+ *
+ * Iterate backwards over list of given type, safe against removal
+ * of list entry.
+ */ +#define list_for_each_entry_safe_reverse(pos, n, head, member) \ + for (pos = list_entry((head)->prev, typeof(*pos), member), \ + n = list_entry(pos->member.prev, typeof(*pos), member); \ + &pos->member != (head); \ + pos = n, n = list_entry(n->member.prev, typeof(*n), member)) + +/** + * list_for_each_rcu - iterate over an rcu-protected list + * @pos: the &struct list_head to use as a loop cursor. + * @head: the head for your list. + * + * This list-traversal primitive may safely run concurrently with + * the _rcu list-mutation primitives such as list_add_rcu() + * as long as the traversal is guarded by rcu_read_lock(). + */ +#define list_for_each_rcu(pos, head) \ + for (pos = (head)->next; \ + pos = pos->next) + +#define __list_for_each_rcu(pos, head) \ + for (pos = (head)->next; \ + rcu_dereference(pos) != (head); \ + pos = pos->next) + +/** + * list_for_each_safe_rcu + * @pos: the &struct list_head to use as a loop cursor. + * @n: another &struct list_head to use as temporary storage + * @head: the head for your list. + * + * Iterate over an rcu-protected list, safe against removal of list entry. + * + * This list-traversal primitive may safely run concurrently with + * the _rcu list-mutation primitives such as list_add_rcu() + * as long as the traversal is guarded by rcu_read_lock(). + */ +#define list_for_each_safe_rcu(pos, n, head) \ + for (pos = (head)->next; \ + n = rcu_dereference(pos)->next, pos != (head); \ + pos = n) + +/** + * list_for_each_entry_rcu - iterate over rcu list of given type + * @pos: the type * to use as a loop cursor. + * @head: the head for your list. + * @member: the name of the list_struct within the struct. + * + * This list-traversal primitive may safely run concurrently with + * the _rcu list-mutation primitives such as list_add_rcu() + * as long as the traversal is guarded by rcu_read_lock(). + */ +#define list_for_each_entry_rcu(pos, head, member) \ + for (pos = list_entry((head)->next, typeof(*pos), member); \ + &pos->member != (head); \ + pos = list_entry(pos->member.next, typeof(*pos), member)) + + +/** + * list_for_each_continue_rcu + * @pos: the &struct list_head to use as a loop cursor. + * @head: the head for your list. + * + * Iterate over an rcu-protected list, continuing after current point. + * + * This list-traversal primitive may safely run concurrently with + * the _rcu list-mutation primitives such as list_add_rcu() + * as long as the traversal is guarded by rcu_read_lock(). + */ +#define list_for_each_continue_rcu(pos, head) \ + for ((pos) = (pos)->next; \ + (pos) != (head); \ + (pos) = (pos)->next) + +/* + * Double linked lists with a single pointer list head. + * Mostly useful for hash tables where the two pointer list head is + * too wasteful. + * You lose the ability to access the tail in O(1). 
+ */ + +struct hlist_head { + struct hlist_node *first; +}; + +struct hlist_node { + struct hlist_node *next, **pprev; +}; + +#define HLIST_HEAD_INIT { .first = NULL } +#define HLIST_HEAD(name) struct hlist_head name = { .first = NULL } +#define INIT_HLIST_HEAD(ptr) ((ptr)->first = NULL) +static inline void INIT_HLIST_NODE(struct hlist_node *h) +{ + h->next = NULL; + h->pprev = NULL; +} + +static inline int hlist_unhashed(const struct hlist_node *h) +{ + return !h->pprev; +} + +static inline int hlist_empty(const struct hlist_head *h) +{ + return !h->first; +} + +static inline void __hlist_del(struct hlist_node *n) +{ + struct hlist_node *next = n->next; + struct hlist_node **pprev = n->pprev; + *pprev = next; + if (next) + next->pprev = pprev; +} + +static inline void hlist_del(struct hlist_node *n) +{ + __hlist_del(n); + n->next = NULL; + n->pprev = NULL; +} + +/** + * hlist_del_rcu - deletes entry from hash list without re-initialization + * @n: the element to delete from the hash list. + * + * Note: list_unhashed() on entry does not return true after this, + * the entry is in an undefined state. It is useful for RCU based + * lockfree traversal. + * + * In particular, it means that we can not poison the forward + * pointers that may still be used for walking the hash list. + * + * The caller must take whatever precautions are necessary + * (such as holding appropriate locks) to avoid racing + * with another list-mutation primitive, such as hlist_add_head_rcu() + * or hlist_del_rcu(), running on this same list. + * However, it is perfectly legal to run concurrently with + * the _rcu list-traversal primitives, such as + * hlist_for_each_entry(). + */ +static inline void hlist_del_rcu(struct hlist_node *n) +{ + __hlist_del(n); + n->pprev = NULL; +} + +static inline void hlist_del_init(struct hlist_node *n) +{ + if (!hlist_unhashed(n)) { + __hlist_del(n); + INIT_HLIST_NODE(n); + } +} + +static inline void hlist_add_head(struct hlist_node *n, struct hlist_head *h) +{ + struct hlist_node *first = h->first; + n->next = first; + if (first) + first->pprev = &n->next; + h->first = n; + n->pprev = &h->first; +} + + +/** + * hlist_add_head_rcu + * @n: the element to add to the hash list. + * @h: the list to add to. + * + * Description: + * Adds the specified element to the specified hlist, + * while permitting racing traversals. + * + * The caller must take whatever precautions are necessary + * (such as holding appropriate locks) to avoid racing + * with another list-mutation primitive, such as hlist_add_head_rcu() + * or hlist_del_rcu(), running on this same list. + * However, it is perfectly legal to run concurrently with + * the _rcu list-traversal primitives, such as + * hlist_for_each_entry_rcu(), used to prevent memory-consistency + * problems on Alpha CPUs. Regardless of the type of CPU, the + * list-traversal primitive must be guarded by rcu_read_lock(). 
+ */ +static inline void hlist_add_head_rcu(struct hlist_node *n, + struct hlist_head *h) +{ + struct hlist_node *first = h->first; + n->next = first; + n->pprev = &h->first; + + if (first) + first->pprev = &n->next; + h->first = n; +} + +/* next must be != NULL */ +static inline void hlist_add_before(struct hlist_node *n, + struct hlist_node *next) +{ + n->pprev = next->pprev; + n->next = next; + next->pprev = &n->next; + *(n->pprev) = n; +} + +static inline void hlist_add_after(struct hlist_node *n, + struct hlist_node *next) +{ + next->next = n->next; + n->next = next; + next->pprev = &n->next; + + if(next->next) + next->next->pprev = &next->next; +} + +/** + * hlist_add_before_rcu + * @n: the new element to add to the hash list. + * @next: the existing element to add the new element before. + * + * Description: + * Adds the specified element to the specified hlist + * before the specified node while permitting racing traversals. + * + * The caller must take whatever precautions are necessary + * (such as holding appropriate locks) to avoid racing + * with another list-mutation primitive, such as hlist_add_head_rcu() + * or hlist_del_rcu(), running on this same list. + * However, it is perfectly legal to run concurrently with + * the _rcu list-traversal primitives, such as + * hlist_for_each_entry_rcu(), used to prevent memory-consistency + * problems on Alpha CPUs. + */ +static inline void hlist_add_before_rcu(struct hlist_node *n, + struct hlist_node *next) +{ + n->pprev = next->pprev; + n->next = next; + + next->pprev = &n->next; + *(n->pprev) = n; +} + +/** + * hlist_add_after_rcu + * @prev: the existing element to add the new element after. + * @n: the new element to add to the hash list. + * + * Description: + * Adds the specified element to the specified hlist + * after the specified node while permitting racing traversals. + * + * The caller must take whatever precautions are necessary + * (such as holding appropriate locks) to avoid racing + * with another list-mutation primitive, such as hlist_add_head_rcu() + * or hlist_del_rcu(), running on this same list. + * However, it is perfectly legal to run concurrently with + * the _rcu list-traversal primitives, such as + * hlist_for_each_entry_rcu(), used to prevent memory-consistency + * problems on Alpha CPUs. + */ +static inline void hlist_add_after_rcu(struct hlist_node *prev, + struct hlist_node *n) +{ + n->next = prev->next; + n->pprev = &prev->next; + + prev->next = n; + if (n->next) + n->next->pprev = &n->next; +} + +#define hlist_entry(ptr, type, member) container_of(ptr,type,member) + +#define hlist_for_each(pos, head) \ + for (pos = (head)->first; pos; \ + pos = pos->next) + +#define hlist_for_each_safe(pos, n, head) \ + for (pos = (head)->first; pos && ({ n = pos->next; 1; }); \ + pos = n) + +/** + * hlist_for_each_entry - iterate over list of given type + * @tpos: the type * to use as a loop cursor. + * @pos: the &struct hlist_node to use as a loop cursor. + * @head: the head for your list. + * @member: the name of the hlist_node within the struct. + */ +#define hlist_for_each_entry(tpos, pos, head, member) \ + for (pos = (head)->first; \ + pos && \ + ({ tpos = hlist_entry(pos, typeof(*tpos), member); 1;}); \ + pos = pos->next) + +/** + * hlist_for_each_entry_continue - iterate over a hlist continuing after current point + * @tpos: the type * to use as a loop cursor. + * @pos: the &struct hlist_node to use as a loop cursor. + * @member: the name of the hlist_node within the struct. 
+ */ +#define hlist_for_each_entry_continue(tpos, pos, member) \ + for (pos = (pos)->next; \ + pos && \ + ({ tpos = hlist_entry(pos, typeof(*tpos), member); 1;}); \ + pos = pos->next) + +/** + * hlist_for_each_entry_from - iterate over a hlist continuing from current point + * @tpos: the type * to use as a loop cursor. + * @pos: the &struct hlist_node to use as a loop cursor. + * @member: the name of the hlist_node within the struct. + */ +#define hlist_for_each_entry_from(tpos, pos, member) \ + for (; pos && \ + ({ tpos = hlist_entry(pos, typeof(*tpos), member); 1;}); \ + pos = pos->next) + +/** + * hlist_for_each_entry_safe - iterate over list of given type safe against removal of list entry + * @tpos: the type * to use as a loop cursor. + * @pos: the &struct hlist_node to use as a loop cursor. + * @n: another &struct hlist_node to use as temporary storage + * @head: the head for your list. + * @member: the name of the hlist_node within the struct. + */ +#define hlist_for_each_entry_safe(tpos, pos, n, head, member) \ + for (pos = (head)->first; \ + pos && ({ n = pos->next; 1; }) && \ + ({ tpos = hlist_entry(pos, typeof(*tpos), member); 1;}); \ + pos = n) + +/** + * hlist_for_each_entry_rcu - iterate over rcu list of given type + * @tpos: the type * to use as a loop cursor. + * @pos: the &struct hlist_node to use as a loop cursor. + * @head: the head for your list. + * @member: the name of the hlist_node within the struct. + * + * This list-traversal primitive may safely run concurrently with + * the _rcu list-mutation primitives such as hlist_add_head_rcu() + * as long as the traversal is guarded by rcu_read_lock(). + */ +#define hlist_for_each_entry_rcu(tpos, pos, head, member) \ + for (pos = (head)->first; \ + rcu_dereference(pos) && \ + ({ tpos = hlist_entry(pos, typeof(*tpos), member); 1;}); \ + pos = pos->next) + +#endif diff --git a/ring.h b/ring.h new file mode 100644 index 0000000..a668f8a --- /dev/null +++ b/ring.h @@ -0,0 +1,343 @@ +/****************************************************************************** + * ring.h + * + * Shared producer-consumer ring macros. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + * + * Tim Deegan and Andrew Warfield November 2004. 
+ */ + +#ifndef __XEN_PUBLIC_IO_RING_H__ +#define __XEN_PUBLIC_IO_RING_H__ + +//#include "../xen-compat.h" + +//#if __XEN_INTERFACE_VERSION__ < 0x00030208 +#ifdef KERNEL +#define xen_mb() mb() +#define xen_rmb() rmb() +#define xen_wmb() wmb() +#else +#if defined(__i386__) +#define xen_mb() asm volatile ( "lock; addl $0,0(%%esp)" : : : "memory" ) +#define xen_rmb() asm volatile ( "lock; addl $0,0(%%esp)" : : : "memory" ) +#define xen_wmb() asm volatile ( "" : : : "memory") +#elif defined(__x86_64__) +#define xen_mb() asm volatile ( "mfence" : : : "memory") +#define xen_rmb() asm volatile ( "lfence" : : : "memory") +#define xen_wmb() asm volatile ( "" : : : "memory") +#elif defined(__ia64__) +#define xen_mb() asm volatile ("mf" ::: "memory") +#define xen_rmb() asm volatile ("mf" ::: "memory") +#define xen_wmb() asm volatile ("mf" ::: "memory") +#else +#error "Define barriers" +#endif +#endif +//#endif + +typedef unsigned int RING_IDX; + +/* Round a 32-bit unsigned constant down to the nearest power of two. */ +#define __RD2(_x) (((_x) & 0x00000002) ? 0x2 : ((_x) & 0x1)) +#define __RD4(_x) (((_x) & 0x0000000c) ? __RD2((_x)>>2)<<2 : __RD2(_x)) +#define __RD8(_x) (((_x) & 0x000000f0) ? __RD4((_x)>>4)<<4 : __RD4(_x)) +#define __RD16(_x) (((_x) & 0x0000ff00) ? __RD8((_x)>>8)<<8 : __RD8(_x)) +#define __RD32(_x) (((_x) & 0xffff0000) ? __RD16((_x)>>16)<<16 : __RD16(_x)) + +/* + * Calculate size of a shared ring, given the total available space for the + * ring and indexes (_sz), and the name tag of the request/response structure. + * A ring contains as many entries as will fit, rounded down to the nearest + * power of two (so we can mask with (size-1) to loop around). + */ +#define __CONST_RING_SIZE(_s, _sz) \ + (__RD32(((_sz) - offsetof(struct _s##_sring, ring)) / \ + sizeof(((struct _s##_sring *)0)->ring[0]))) +/* + * The same for passing in an actual pointer instead of a name tag. + */ +#define __RING_SIZE(_s, _sz) \ + (__RD32(((_sz) - (long)(_s)->ring + (long)(_s)) / sizeof((_s)->ring[0]))) + +/* + * Macros to make the correct C datatypes for a new kind of ring. + * + * To make a new ring datatype, you need to have two message structures, + * let's say request_t, and response_t already defined. + * + * In a header where you want the ring datatype declared, you then do: + * + * DEFINE_RING_TYPES(mytag, request_t, response_t); + * + * These expand out to give you a set of types, as you can see below. + * The most important of these are: + * + * mytag_sring_t - The shared ring. + * mytag_front_ring_t - The 'front' half of the ring. + * mytag_back_ring_t - The 'back' half of the ring. + * + * To initialize a ring in your code you need to know the location and size + * of the shared memory area (PAGE_SIZE, for instance). 
To initialise + * the front half: + * + * mytag_front_ring_t front_ring; + * SHARED_RING_INIT((mytag_sring_t *)shared_page); + * FRONT_RING_INIT(&front_ring, (mytag_sring_t *)shared_page, PAGE_SIZE); + * + * Initializing the back follows similarly (note that only the front + * initializes the shared ring): + * + * mytag_back_ring_t back_ring; + * BACK_RING_INIT(&back_ring, (mytag_sring_t *)shared_page, PAGE_SIZE); + */ + +#define DEFINE_RING_TYPES(__name, __req_t, __rsp_t) \ + \ +/* Shared ring entry */ \ +union __name##_sring_entry { \ + __req_t req; \ + __rsp_t rsp; \ +}; \ + \ +/* Shared ring page */ \ +struct __name##_sring { \ + RING_IDX req_prod, req_event; \ + RING_IDX rsp_prod, rsp_event; \ + union { \ + struct { \ + uint8_t smartpoll_active; \ + } netif; \ + struct { \ + uint8_t msg; \ + } tapif_user; \ + uint8_t pvt_pad[4]; \ + } private; \ + uint8_t __pad[44]; \ + union __name##_sring_entry ring[1]; /* variable-length */ \ +}; \ + \ +/* "Front" end's private variables */ \ +struct __name##_front_ring { \ + RING_IDX req_prod_pvt; \ + RING_IDX rsp_cons; \ + unsigned int nr_ents; \ + unsigned int reserved; \ + struct __name##_sring *sring; \ +}; \ + \ +/* "Back" end's private variables */ \ +struct __name##_back_ring { \ + RING_IDX rsp_prod_pvt; \ + RING_IDX req_cons; \ + unsigned int nr_ents; \ + struct __name##_sring *sring; \ +}; \ + \ +/* Syntactic sugar */ \ +typedef struct __name##_sring __name##_sring_t; \ +typedef struct __name##_front_ring __name##_front_ring_t; \ +typedef struct __name##_back_ring __name##_back_ring_t + +/* + * Macros for manipulating rings. + * + * FRONT_RING_whatever works on the "front end" of a ring: here + * requests are pushed on to the ring and responses taken off it. + * + * BACK_RING_whatever works on the "back end" of a ring: here + * requests are taken off the ring and responses put on. + * + * N.B. these macros do NO INTERLOCKS OR FLOW CONTROL. + * This is OK in 1-for-1 request-response situations where the + * requestor (front end) never has more than RING_SIZE()-1 + * outstanding requests. + */ + +/* Initialising empty rings */ +#define SHARED_RING_INIT(_s) do { \ + (_s)->req_prod = (_s)->rsp_prod = 0; \ + (_s)->req_event = (_s)->rsp_event = 1; \ + (void)memset((_s)->private.pvt_pad, 0, sizeof((_s)->private.pvt_pad)); \ + (void)memset((_s)->__pad, 0, sizeof((_s)->__pad)); \ +} while(0) + +#define FRONT_RING_INIT(_r, _s, __size, __reserved) do { \ + (_r)->req_prod_pvt = 0; \ + (_r)->rsp_cons = 0; \ + (_r)->nr_ents = __RING_SIZE(_s, __size); \ + (_r)->reserved = __reserved; \ + (_r)->sring = (_s); \ +} while (0) + +#define BACK_RING_INIT(_r, _s, __size) do { \ + (_r)->rsp_prod_pvt = 0; \ + (_r)->req_cons = 0; \ + (_r)->nr_ents = __RING_SIZE(_s, __size); \ + (_r)->sring = (_s); \ +} while (0) + +/* Initialize to existing shared indexes -- for recovery */ +#define FRONT_RING_ATTACH(_r, _s, __size) do { \ + (_r)->sring = (_s); \ + (_r)->req_prod_pvt = (_s)->req_prod; \ + (_r)->rsp_cons = (_s)->rsp_prod; \ + (_r)->nr_ents = __RING_SIZE(_s, __size); \ +} while (0) + +#define BACK_RING_ATTACH(_r, _s, __size) do { \ + (_r)->sring = (_s); \ + (_r)->rsp_prod_pvt = (_s)->rsp_prod; \ + (_r)->req_cons = (_s)->req_prod; \ + (_r)->nr_ents = __RING_SIZE(_s, __size); \ +} while (0) + +/* How big is this ring? */ +#define RING_SIZE(_r) \ + ((_r)->nr_ents) + +/* Number of free requests (for use on front side only). 
*/ +#define RING_FREE_REQUESTS(_r) \ + (RING_SIZE(_r) - ((_r)->req_prod_pvt - (_r)->rsp_cons)) + +/* Test if there is an empty slot available on the front ring. + * (This is only meaningful from the front. ) + */ +#define RING_FULL(_r) \ + (RING_FREE_REQUESTS(_r) == 0) + +/* Test if there are outstanding messages to be processed on a ring. */ +#define RING_HAS_UNCONSUMED_RESPONSES(_r) \ + ((_r)->sring->rsp_prod - (_r)->rsp_cons) + +#ifdef __GNUC__ +#define RING_HAS_UNCONSUMED_REQUESTS(_r) ({ \ + unsigned int req = (_r)->sring->req_prod - (_r)->req_cons; \ + unsigned int rsp = RING_SIZE(_r) - \ + ((_r)->req_cons - (_r)->rsp_prod_pvt); \ + req < rsp ? req : rsp; \ +}) +#else +/* Same as above, but without the nice GCC ({ ... }) syntax. */ +#define RING_HAS_UNCONSUMED_REQUESTS(_r) \ + ((((_r)->sring->req_prod - (_r)->req_cons) < \ + (RING_SIZE(_r) - ((_r)->req_cons - (_r)->rsp_prod_pvt))) ? \ + ((_r)->sring->req_prod - (_r)->req_cons) : \ + (RING_SIZE(_r) - ((_r)->req_cons - (_r)->rsp_prod_pvt))) +#endif + +/* Direct access to individual ring elements, by index. */ +#define RING_GET_REQUEST(_r, _idx) \ + (&((_r)->sring->ring[((_idx) & (RING_SIZE(_r) - 1))].req)) + +#define RING_GET_RESPONSE(_r, _idx) \ + (&((_r)->sring->ring[((_idx) & (RING_SIZE(_r) - 1))].rsp)) + +/* Loop termination condition: Would the specified index overflow the ring? */ +#define RING_REQUEST_CONS_OVERFLOW(_r, _cons) \ + (((_cons) - (_r)->rsp_prod_pvt) >= RING_SIZE(_r)) + +#define RING_PUSH_REQUESTS(_r) do { \ + xen_wmb(); /* back sees requests /before/ updated producer index */ \ + (_r)->sring->req_prod = (_r)->req_prod_pvt; \ +} while (0) + +#define RING_PUSH_RESPONSES(_r) do { \ + xen_wmb(); /* front sees resps /before/ updated producer index */ \ + (_r)->sring->rsp_prod = (_r)->rsp_prod_pvt; \ +} while (0) + +/* + * Notification hold-off (req_event and rsp_event): + * + * When queueing requests or responses on a shared ring, it may not always be + * necessary to notify the remote end. For example, if requests are in flight + * in a backend, the front may be able to queue further requests without + * notifying the back (if the back checks for new requests when it queues + * responses). + * + * When enqueuing requests or responses: + * + * Use RING_PUSH_{REQUESTS,RESPONSES}_AND_CHECK_NOTIFY(). The second argument + * is a boolean return value. True indicates that the receiver requires an + * asynchronous notification. + * + * After dequeuing requests or responses (before sleeping the connection): + * + * Use RING_FINAL_CHECK_FOR_REQUESTS() or RING_FINAL_CHECK_FOR_RESPONSES(). + * The second argument is a boolean return value. True indicates that there + * are pending messages on the ring (i.e., the connection should not be put + * to sleep). + * + * These macros will set the req_event/rsp_event field to trigger a + * notification on the very next message that is enqueued. If you want to + * create batches of work (i.e., only receive a notification after several + * messages have been enqueued) then you will need to create a customised + * version of the FINAL_CHECK macro in your own code, which sets the event + * field appropriately. 
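+ *
+ * Illustrative producer-side sketch (not part of the original comment);
+ * notify_back() stands in for whatever wakeup mechanism the transport
+ * provides, and "front" is a mytag_front_ring_t as described earlier:
+ *
+ *    int notify;
+ *    mytag_request_t *req = RING_GET_REQUEST(&front, front.req_prod_pvt);
+ *    ... fill in *req ...
+ *    front.req_prod_pvt++;
+ *    RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&front, notify);
+ *    if (notify)
+ *        notify_back();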
+ */ + +#define RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(_r, _notify) do { \ + RING_IDX __old = (_r)->sring->req_prod; \ + RING_IDX __new = (_r)->req_prod_pvt; \ + xen_wmb(); /* back sees requests /before/ updated producer index */ \ + (_r)->sring->req_prod = __new; \ + xen_mb(); /* back sees new requests /before/ we check req_event */ \ + (_notify) = ((RING_IDX)(__new - (_r)->sring->req_event) < \ + (RING_IDX)(__new - __old)); \ +} while (0) + +#define RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(_r, _notify) do { \ + RING_IDX __old = (_r)->sring->rsp_prod; \ + RING_IDX __new = (_r)->rsp_prod_pvt; \ + xen_wmb(); /* front sees resps /before/ updated producer index */ \ + (_r)->sring->rsp_prod = __new; \ + xen_mb(); /* front sees new resps /before/ we check rsp_event */ \ + (_notify) = ((RING_IDX)(__new - (_r)->sring->rsp_event) < \ + (RING_IDX)(__new - __old)); \ +} while (0) + +#define RING_FINAL_CHECK_FOR_REQUESTS(_r, _work_to_do) do { \ + (_work_to_do) = RING_HAS_UNCONSUMED_REQUESTS(_r); \ + if (_work_to_do) break; \ + (_r)->sring->req_event = (_r)->req_cons + 1; \ + xen_mb(); \ + (_work_to_do) = RING_HAS_UNCONSUMED_REQUESTS(_r); \ +} while (0) + +#define RING_FINAL_CHECK_FOR_RESPONSES(_r, _work_to_do) do { \ + (_work_to_do) = RING_HAS_UNCONSUMED_RESPONSES(_r); \ + if (_work_to_do) break; \ + (_r)->sring->rsp_event = (_r)->rsp_cons + 1; \ + xen_mb(); \ + (_work_to_do) = RING_HAS_UNCONSUMED_RESPONSES(_r); \ +} while (0) + +#endif /* __XEN_PUBLIC_IO_RING_H__ */ + +/* + * Local variables: + * mode: C + * c-set-style: "BSD" + * c-basic-offset: 4 + * tab-width: 4 + * indent-tabs-mode: nil + * End: + */ diff --git a/versions b/versions new file mode 100644 index 0000000..7f19116 --- /dev/null +++ b/versions @@ -0,0 +1,78 @@ +CASTLE_1 { + global: + /* Data path */ + castle_connect; + castle_disconnect; + castle_free; + castle_request_do_blocking; + castle_request_do_blocking_multi; + castle_request_send; + castle_shared_buffer_create; + castle_shared_buffer_destroy; + castle_shared_buffer_allocate; + castle_shared_buffer_release; + castle_get; + castle_replace; + castle_remove; + castle_iter_start; + castle_iter_next; + castle_iter_finish; + castle_kvs_free; + castle_getslice; + castle_big_put; + castle_put_chunk; + castle_big_get; + castle_get_chunk; + + castle_build_key; + castle_build_key_len; + castle_key_bytes_needed; + castle_malloc_key; + + castle_print_key; + castle_print_request; + castle_print_response; + + /* Control path */ + castle_attach; + castle_attach_dev; + castle_claim; + castle_claim_dev; + castle_clone; + castle_collection_attach; + castle_collection_detach; + castle_collection_snapshot; + castle_environment_set; + castle_trace_setup; + castle_trace_start; + castle_trace_stop; + castle_trace_teardown; + castle_fault; + castle_slave_evacuate; + castle_slave_scan; + castle_thread_priority; + castle_create; + castle_detach; + castle_detach_dev; + castle_init; + castle_release; + castle_snapshot; + castle_snapshot_dev; + castle_destroy; + + castle_device_to_devno; + castle_devno_to_device; + + castle_max_buffer_size; + + /* Shared buffer pool */ + castle_shared_pool_create; + castle_shared_pool_destroy; + castle_shared_pool_lease; + castle_shared_pool_release; + + /* Collection utils */ + castle_collection_find; + + local: *; +};
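
For reference, a minimal sketch (illustrative only, not taken from the castle sources) of how the ring.h macros above fit together on the front, request-producing side. mytag_request_t, mytag_response_t, shared_page, page_size and the empty notify branch are placeholders; note that this tree's FRONT_RING_INIT takes a fourth "reserved" argument, unlike the three-argument call shown in the header's own comment.

#include <stddef.h>
#include <stdint.h>
#include <string.h>

#include "ring.h"

/* Placeholder request/response types, for illustration only. */
typedef struct { uint64_t id; } mytag_request_t;
typedef struct { uint64_t id; int err; } mytag_response_t;

DEFINE_RING_TYPES(mytag, mytag_request_t, mytag_response_t);

/* Initialise the shared page and front ring, then queue one request. */
void mytag_front_example(void *shared_page, size_t page_size)
{
    mytag_sring_t *sring = shared_page;
    mytag_front_ring_t front;
    mytag_request_t *req;
    int notify;

    SHARED_RING_INIT(sring);
    /* Four arguments: the last one fills the front ring's "reserved" field. */
    FRONT_RING_INIT(&front, sring, page_size, 0);

    if (RING_FULL(&front))
        return;             /* a real caller would wait for free slots */

    req = RING_GET_REQUEST(&front, front.req_prod_pvt);
    req->id = 1;
    front.req_prod_pvt++;

    RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&front, notify);
    if (notify) {
        /* Placeholder: wake the back end via the transport's mechanism. */
    }
}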