diff --git a/mdbx/mdbx.c b/mdbx/mdbx.c index 01303e0..778e05b 100644 --- a/mdbx/mdbx.c +++ b/mdbx/mdbx.c @@ -1,38 +1,25 @@ -/* - * Copyright 2015-2024 Leonid Yuriev - * and other libmdbx authors: please see AUTHORS file. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted only as authorized by the OpenLDAP - * Public License. - * - * A copy of this license is available in the file LICENSE in the - * top-level directory of the distribution or, alternatively, at - * . */ +/// \copyright SPDX-License-Identifier: Apache-2.0 +/// \author Леонид Юрьев aka Leonid Yuriev \date 2015-2024 + +#define xMDBX_ALLOY 1 /* alloyed build */ + +#define MDBX_BUILD_SOURCERY 8085906fc8a5c6fa4bb6dedd8b442229a8049b16efd634c70a16e27bdb70bde4_v0_13_0_61_g45b204f5 + + +#define LIBMDBX_INTERNALS +#define MDBX_DEPRECATED -#define xMDBX_ALLOY 1 -#define MDBX_BUILD_SOURCERY 91ff5b5423830ee44fca4b70dcb298f233338a17a3185c44df67ec16d3034af3_v0_13_0_38_gf1975363 #ifdef MDBX_CONFIG_H #include MDBX_CONFIG_H #endif -#define LIBMDBX_INTERNALS -#ifdef xMDBX_TOOLS -#define MDBX_DEPRECATED -#endif /* xMDBX_TOOLS */ -#ifdef xMDBX_ALLOY -/* Amalgamated build */ -#define MDBX_INTERNAL_FUNC static -#define MDBX_INTERNAL_VAR_PROTO static -#define MDBX_INTERNAL_VAR_INSTA static -#else -/* Non-amalgamated build */ -#define MDBX_INTERNAL_FUNC -#define MDBX_INTERNAL_VAR_PROTO extern -#define MDBX_INTERNAL_VAR_INSTA -#endif /* xMDBX_ALLOY */ + +/* Undefine the NDEBUG if debugging is enforced by MDBX_DEBUG */ +#if (defined(MDBX_DEBUG) && MDBX_DEBUG > 0) || \ + (defined(MDBX_FORCE_ASSERTIONS) && MDBX_FORCE_ASSERTIONS) +#undef NDEBUG +#endif /*----------------------------------------------------------------------------*/ @@ -53,11 +40,60 @@ #if !defined(_FILE_OFFSET_BITS) && !defined(__ANDROID_API__) && \ !defined(ANDROID) #define _FILE_OFFSET_BITS 64 -#endif +#endif /* _FILE_OFFSET_BITS */ -#ifdef __APPLE__ +#if defined(__APPLE__) && 
!defined(_DARWIN_C_SOURCE) #define _DARWIN_C_SOURCE -#endif +#endif /* _DARWIN_C_SOURCE */ + +#if (defined(__MINGW__) || defined(__MINGW32__) || defined(__MINGW64__)) && \ + !defined(__USE_MINGW_ANSI_STDIO) +#define __USE_MINGW_ANSI_STDIO 1 +#endif /* MinGW */ + +#if defined(_WIN32) || defined(_WIN64) || defined(_WINDOWS) + +#ifndef _WIN32_WINNT +#define _WIN32_WINNT 0x0601 /* Windows 7 */ +#endif /* _WIN32_WINNT */ + +#if !defined(_CRT_SECURE_NO_WARNINGS) +#define _CRT_SECURE_NO_WARNINGS +#endif /* _CRT_SECURE_NO_WARNINGS */ +#if !defined(UNICODE) +#define UNICODE +#endif /* UNICODE */ + +#if !defined(_NO_CRT_STDIO_INLINE) && MDBX_BUILD_SHARED_LIBRARY && \ + !defined(xMDBX_TOOLS) && MDBX_WITHOUT_MSVC_CRT +#define _NO_CRT_STDIO_INLINE +#endif /* _NO_CRT_STDIO_INLINE */ + +#elif !defined(_POSIX_C_SOURCE) +#define _POSIX_C_SOURCE 200809L +#endif /* Windows */ + +#ifdef __cplusplus + +#ifndef NOMINMAX +#define NOMINMAX +#endif /* NOMINMAX */ + +/* Workaround for modern libstdc++ with CLANG < 4.x */ +#if defined(__SIZEOF_INT128__) && !defined(__GLIBCXX_TYPE_INT_N_0) && \ + defined(__clang__) && __clang_major__ < 4 +#define __GLIBCXX_BITSIZE_INT_N_0 128 +#define __GLIBCXX_TYPE_INT_N_0 __int128 +#endif /* Workaround for modern libstdc++ with CLANG < 4.x */ + +#ifdef _MSC_VER +/* Workaround for MSVC' header `extern "C"` vs `std::` redefinition bug */ +#if defined(__SANITIZE_ADDRESS__) && !defined(_DISABLE_VECTOR_ANNOTATION) +#define _DISABLE_VECTOR_ANNOTATION +#endif /* _DISABLE_VECTOR_ANNOTATION */ +#endif /* _MSC_VER */ + +#endif /* __cplusplus */ #ifdef _MSC_VER #if _MSC_FULL_VER < 190024234 @@ -82,9 +118,6 @@ #error \ "At least \"Microsoft C/C++ Compiler\" version 19.00.24234 (Visual Studio 2015 Update 3) is required." #endif -#ifndef _CRT_SECURE_NO_WARNINGS -#define _CRT_SECURE_NO_WARNINGS -#endif /* _CRT_SECURE_NO_WARNINGS */ #if _MSC_VER > 1800 #pragma warning(disable : 4464) /* relative include path contains '..' 
*/ #endif @@ -93,8 +126,8 @@ #endif #if _MSC_VER > 1914 #pragma warning( \ - disable : 5105) /* winbase.h(9531): warning C5105: macro expansion \ - producing 'defined' has undefined behavior */ + disable : 5105) /* winbase.h(9531): warning C5105: macro expansion \ + producing 'defined' has undefined behavior */ #endif #if _MSC_VER < 1920 /* avoid "error C2219: syntax error: type qualifier must be after '*'" */ @@ -127,37 +160,13 @@ #pragma warning(disable : 4204) /* nonstandard extension used: non-constant \ aggregate initializer */ #pragma warning( \ - disable : 4505) /* unreferenced local function has been removed */ -#endif /* _MSC_VER (warnings) */ + disable : 4505) /* unreferenced local function has been removed */ +#endif /* _MSC_VER (warnings) */ #if defined(__GNUC__) && __GNUC__ < 9 #pragma GCC diagnostic ignored "-Wattributes" #endif /* GCC < 9 */ -#if (defined(__MINGW__) || defined(__MINGW32__) || defined(__MINGW64__)) && \ - !defined(__USE_MINGW_ANSI_STDIO) -#define __USE_MINGW_ANSI_STDIO 1 -#endif /* MinGW */ - -#if (defined(_WIN32) || defined(_WIN64)) && !defined(UNICODE) -#define UNICODE -#endif /* UNICODE */ - -#include "mdbx.h" -/* - * Copyright 2015-2024 Leonid Yuriev - * and other libmdbx authors: please see AUTHORS file. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted only as authorized by the OpenLDAP - * Public License. - * - * A copy of this license is available in the file LICENSE in the - * top-level directory of the distribution or, alternatively, at - * . - */ - /*----------------------------------------------------------------------------*/ /* Microsoft compiler generates a lot of warning for self includes... */ @@ -173,20 +182,9 @@ * not guaranteed. 
Specify /EHsc */ #endif /* _MSC_VER (warnings) */ -#if defined(_WIN32) || defined(_WIN64) -#if !defined(_CRT_SECURE_NO_WARNINGS) -#define _CRT_SECURE_NO_WARNINGS -#endif /* _CRT_SECURE_NO_WARNINGS */ -#if !defined(_NO_CRT_STDIO_INLINE) && MDBX_BUILD_SHARED_LIBRARY && \ - !defined(xMDBX_TOOLS) && MDBX_WITHOUT_MSVC_CRT -#define _NO_CRT_STDIO_INLINE -#endif -#elif !defined(_POSIX_C_SOURCE) -#define _POSIX_C_SOURCE 200809L -#endif /* Windows */ - /*----------------------------------------------------------------------------*/ /* basic C99 includes */ + #include #include #include @@ -200,21 +198,6 @@ #include #include -#if (-6 & 5) || CHAR_BIT != 8 || UINT_MAX < 0xffffffff || ULONG_MAX % 0xFFFF -#error \ - "Sanity checking failed: Two's complement, reasonably sized integer types" -#endif - -#ifndef SSIZE_MAX -#define SSIZE_MAX INTPTR_MAX -#endif - -#if UINTPTR_MAX > 0xffffFFFFul || ULONG_MAX > 0xffffFFFFul || defined(_WIN64) -#define MDBX_WORDBITS 64 -#else -#define MDBX_WORDBITS 32 -#endif /* MDBX_WORDBITS */ - /*----------------------------------------------------------------------------*/ /* feature testing */ @@ -226,6 +209,14 @@ #define __has_include(x) (0) #endif +#ifndef __has_attribute +#define __has_attribute(x) (0) +#endif + +#ifndef __has_cpp_attribute +#define __has_cpp_attribute(x) 0 +#endif + #ifndef __has_feature #define __has_feature(x) (0) #endif @@ -234,6 +225,10 @@ #define __has_extension(x) (0) #endif +#ifndef __has_builtin +#define __has_builtin(x) (0) +#endif + #if __has_feature(thread_sanitizer) #define __SANITIZE_THREAD__ 1 #endif @@ -269,6 +264,47 @@ #endif #endif /* __GLIBC_PREREQ */ +/*----------------------------------------------------------------------------*/ +/* pre-requirements */ + +#if (-6 & 5) || CHAR_BIT != 8 || UINT_MAX < 0xffffffff || ULONG_MAX % 0xFFFF +#error \ + "Sanity checking failed: Two's complement, reasonably sized integer types" +#endif + +#ifndef SSIZE_MAX +#define SSIZE_MAX INTPTR_MAX +#endif + +#if defined(__GNUC__) && 
!__GNUC_PREREQ(4, 2) +/* Actually libmdbx was not tested with compilers older than GCC 4.2. + * But you could ignore this warning at your own risk. + * In such case please don't rise up an issues related ONLY to old compilers. + */ +#warning "libmdbx required GCC >= 4.2" +#endif + +#if defined(__clang__) && !__CLANG_PREREQ(3, 8) +/* Actually libmdbx was not tested with CLANG older than 3.8. + * But you could ignore this warning at your own risk. + * In such case please don't rise up an issues related ONLY to old compilers. + */ +#warning "libmdbx required CLANG >= 3.8" +#endif + +#if defined(__GLIBC__) && !__GLIBC_PREREQ(2, 12) +/* Actually libmdbx was not tested with something older than glibc 2.12. + * But you could ignore this warning at your own risk. + * In such case please don't rise up an issues related ONLY to old systems. + */ +#warning "libmdbx was only tested with GLIBC >= 2.12." +#endif + +#ifdef __SANITIZE_THREAD__ +#warning \ + "libmdbx don't compatible with ThreadSanitizer, you will get a lot of false-positive issues." +#endif /* __SANITIZE_THREAD__ */ + /*----------------------------------------------------------------------------*/ /* C11' alignas() */ @@ -385,12 +421,14 @@ __extern_C key_t ftok(const char *, int); #ifndef WIN32_LEAN_AND_MEAN #define WIN32_LEAN_AND_MEAN #endif /* WIN32_LEAN_AND_MEAN */ -#include -#include #include #include #include +/* После подгрузки windows.h, чтобы избежать проблем со сборкой MINGW и т.п. 
*/ +#include +#include + #else /*----------------------------------------------------------------------*/ #include @@ -647,10 +685,11 @@ __extern_C key_t ftok(const char *, int); #ifndef container_of #define container_of(ptr, type, member) \ - ((type *)((char *)(ptr)-offsetof(type, member))) + ((type *)((char *)(ptr) - offsetof(type, member))) #endif /* container_of */ /*----------------------------------------------------------------------------*/ +/* useful attributes */ #ifndef __always_inline #if defined(__GNUC__) || __has_attribute(__always_inline__) @@ -658,7 +697,7 @@ __extern_C key_t ftok(const char *, int); #elif defined(_MSC_VER) #define __always_inline __forceinline #else -#define __always_inline +#define __always_inline __inline #endif #endif /* __always_inline */ @@ -784,16 +823,6 @@ __extern_C key_t ftok(const char *, int); #endif #endif /* __anonymous_struct_extension__ */ -#ifndef expect_with_probability -#if defined(__builtin_expect_with_probability) || \ - __has_builtin(__builtin_expect_with_probability) || __GNUC_PREREQ(9, 0) -#define expect_with_probability(expr, value, prob) \ - __builtin_expect_with_probability(expr, value, prob) -#else -#define expect_with_probability(expr, value, prob) (expr) -#endif -#endif /* expect_with_probability */ - #ifndef MDBX_WEAK_IMPORT_ATTRIBUTE #ifdef WEAK_IMPORT_ATTRIBUTE #define MDBX_WEAK_IMPORT_ATTRIBUTE WEAK_IMPORT_ATTRIBUTE @@ -807,6 +836,32 @@ __extern_C key_t ftok(const char *, int); #endif #endif /* MDBX_WEAK_IMPORT_ATTRIBUTE */ +#if !defined(__thread) && (defined(_MSC_VER) || defined(__DMC__)) +#define __thread __declspec(thread) +#endif /* __thread */ + +#ifndef MDBX_EXCLUDE_FOR_GPROF +#ifdef ENABLE_GPROF +#define MDBX_EXCLUDE_FOR_GPROF \ + __attribute__((__no_instrument_function__, \ + __no_profile_instrument_function__)) +#else +#define MDBX_EXCLUDE_FOR_GPROF +#endif /* ENABLE_GPROF */ +#endif /* MDBX_EXCLUDE_FOR_GPROF */ + 
+/*----------------------------------------------------------------------------*/ + +#ifndef expect_with_probability +#if defined(__builtin_expect_with_probability) || \ + __has_builtin(__builtin_expect_with_probability) || __GNUC_PREREQ(9, 0) +#define expect_with_probability(expr, value, prob) \ + __builtin_expect_with_probability(expr, value, prob) +#else +#define expect_with_probability(expr, value, prob) (expr) +#endif +#endif /* expect_with_probability */ + #ifndef MDBX_GOOFY_MSVC_STATIC_ANALYZER #ifdef _PREFAST_ #define MDBX_GOOFY_MSVC_STATIC_ANALYZER 1 @@ -829,7 +884,17 @@ __extern_C key_t ftok(const char *, int); #define MDBX_SUPPRESS_GOOFY_MSVC_ANALYZER(warn_id) #endif /* MDBX_GOOFY_MSVC_STATIC_ANALYZER */ +#ifndef FLEXIBLE_ARRAY_MEMBERS +#if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L) || \ + (!defined(__cplusplus) && defined(_MSC_VER)) +#define FLEXIBLE_ARRAY_MEMBERS 1 +#else +#define FLEXIBLE_ARRAY_MEMBERS 0 +#endif +#endif /* FLEXIBLE_ARRAY_MEMBERS */ + /*----------------------------------------------------------------------------*/ +/* Valgrind and Address Sanitizer */ #if defined(ENABLE_MEMCHECK) #include @@ -911,42 +976,37 @@ template char (&__ArraySizeHelper(T (&array)[N]))[N]; #define STATIC_ASSERT(expr) STATIC_ASSERT_MSG(expr, #expr) #endif -#ifndef __Wpedantic_format_voidptr -MDBX_MAYBE_UNUSED MDBX_PURE_FUNCTION static __inline const void * -__Wpedantic_format_voidptr(const void *ptr) { - return ptr; -} -#define __Wpedantic_format_voidptr(ARG) __Wpedantic_format_voidptr(ARG) -#endif /* __Wpedantic_format_voidptr */ +/*----------------------------------------------------------------------------*/ -#if defined(__GNUC__) && !__GNUC_PREREQ(4, 2) -/* Actually libmdbx was not tested with compilers older than GCC 4.2. - * But you could ignore this warning at your own risk. - * In such case please don't rise up an issues related ONLY to old compilers. 
- */ -#warning "libmdbx required GCC >= 4.2" -#endif +#if defined(_MSC_VER) && _MSC_VER >= 1900 +/* LY: MSVC 2015/2017/2019 has buggy/inconsistent PRIuPTR/PRIxPTR macros + * for internal format-args checker. */ +#undef PRIuPTR +#undef PRIiPTR +#undef PRIdPTR +#undef PRIxPTR +#define PRIuPTR "Iu" +#define PRIiPTR "Ii" +#define PRIdPTR "Id" +#define PRIxPTR "Ix" +#define PRIuSIZE "zu" +#define PRIiSIZE "zi" +#define PRIdSIZE "zd" +#define PRIxSIZE "zx" +#endif /* fix PRI*PTR for _MSC_VER */ -#if defined(__clang__) && !__CLANG_PREREQ(3, 8) -/* Actually libmdbx was not tested with CLANG older than 3.8. - * But you could ignore this warning at your own risk. - * In such case please don't rise up an issues related ONLY to old compilers. - */ -#warning "libmdbx required CLANG >= 3.8" -#endif +#ifndef PRIuSIZE +#define PRIuSIZE PRIuPTR +#define PRIiSIZE PRIiPTR +#define PRIdSIZE PRIdPTR +#define PRIxSIZE PRIxPTR +#endif /* PRI*SIZE macros for MSVC */ -#if defined(__GLIBC__) && !__GLIBC_PREREQ(2, 12) -/* Actually libmdbx was not tested with something older than glibc 2.12. - * But you could ignore this warning at your own risk. - * In such case please don't rise up an issues related ONLY to old systems. - */ -#warning "libmdbx was only tested with GLIBC >= 2.12." +#ifdef _MSC_VER +#pragma warning(pop) #endif -#ifdef __SANITIZE_THREAD__ -#warning \ - "libmdbx don't compatible with ThreadSanitizer, you will get a lot of false-positive issues." 
-#endif /* __SANITIZE_THREAD__ */ +/*----------------------------------------------------------------------------*/ #if __has_warning("-Wnested-anon-types") #if defined(__clang__) @@ -983,66 +1043,23 @@ __Wpedantic_format_voidptr(const void *ptr) { #endif #endif /* -Walignment-reduction-ignored */ -#ifndef MDBX_EXCLUDE_FOR_GPROF -#ifdef ENABLE_GPROF -#define MDBX_EXCLUDE_FOR_GPROF \ - __attribute__((__no_instrument_function__, \ - __no_profile_instrument_function__)) +#ifdef xMDBX_ALLOY +/* Amalgamated build */ +#define MDBX_INTERNAL static #else -#define MDBX_EXCLUDE_FOR_GPROF -#endif /* ENABLE_GPROF */ -#endif /* MDBX_EXCLUDE_FOR_GPROF */ +/* Non-amalgamated build */ +#define MDBX_INTERNAL +#endif /* xMDBX_ALLOY */ -#ifdef __cplusplus -extern "C" { -#endif - -/* https://en.wikipedia.org/wiki/Operating_system_abstraction_layer */ +#include "mdbx.h" -/* - * Copyright 2015-2024 Leonid Yuriev - * and other libmdbx authors: please see AUTHORS file. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted only as authorized by the OpenLDAP - * Public License. - * - * A copy of this license is available in the file LICENSE in the - * top-level directory of the distribution or, alternatively, at - * . 
- */ +/*----------------------------------------------------------------------------*/ +/* Basic constants and types */ +typedef struct iov_ctx iov_ctx_t; +/// -/*----------------------------------------------------------------------------*/ -/* C11 Atomics */ -#if defined(__cplusplus) && !defined(__STDC_NO_ATOMICS__) && __has_include() -#include -#define MDBX_HAVE_C11ATOMICS -#elif !defined(__cplusplus) && \ - (__STDC_VERSION__ >= 201112L || __has_extension(c_atomic)) && \ - !defined(__STDC_NO_ATOMICS__) && \ - (__GNUC_PREREQ(4, 9) || __CLANG_PREREQ(3, 8) || \ - !(defined(__GNUC__) || defined(__clang__))) -#include -#define MDBX_HAVE_C11ATOMICS -#elif defined(__GNUC__) || defined(__clang__) -#elif defined(_MSC_VER) -#pragma warning(disable : 4163) /* 'xyz': not available as an intrinsic */ -#pragma warning(disable : 4133) /* 'function': incompatible types - from \ - 'size_t' to 'LONGLONG' */ -#pragma warning(disable : 4244) /* 'return': conversion from 'LONGLONG' to \ - 'std::size_t', possible loss of data */ -#pragma warning(disable : 4267) /* 'function': conversion from 'size_t' to \ - 'long', possible loss of data */ -#pragma intrinsic(_InterlockedExchangeAdd, _InterlockedCompareExchange) -#pragma intrinsic(_InterlockedExchangeAdd64, _InterlockedCompareExchange64) -#elif defined(__APPLE__) -#include -#else -#error FIXME atomic-ops -#endif /*----------------------------------------------------------------------------*/ /* Memory/Compiler barriers, cache coherence */ @@ -1056,7 +1073,7 @@ extern "C" { #include #endif -MDBX_MAYBE_UNUSED static __inline void osal_compiler_barrier(void) { +MDBX_MAYBE_UNUSED static inline void osal_compiler_barrier(void) { #if defined(__clang__) || defined(__GNUC__) __asm__ __volatile__("" ::: "memory"); #elif defined(_MSC_VER) @@ -1076,7 +1093,7 @@ MDBX_MAYBE_UNUSED static __inline void osal_compiler_barrier(void) { #endif } -MDBX_MAYBE_UNUSED static __inline void osal_memory_barrier(void) { +MDBX_MAYBE_UNUSED static inline void 
osal_memory_barrier(void) { #ifdef MDBX_HAVE_C11ATOMICS atomic_thread_fence(memory_order_seq_cst); #elif defined(__ATOMIC_SEQ_CST) @@ -1116,7 +1133,7 @@ MDBX_MAYBE_UNUSED static __inline void osal_memory_barrier(void) { #define HAVE_SYS_TYPES_H typedef HANDLE osal_thread_t; typedef unsigned osal_thread_key_t; -#define MAP_FAILED NULL +#define MAP_FAILED nullptr #define HIGH_DWORD(v) ((DWORD)((sizeof(v) > 4) ? ((uint64_t)(v) >> 32) : 0)) #define THREAD_CALL WINAPI #define THREAD_RESULT DWORD @@ -1204,19 +1221,6 @@ typedef pthread_mutex_t osal_fastmutex_t; /*----------------------------------------------------------------------------*/ /* OS abstraction layer stuff */ -MDBX_INTERNAL_VAR_PROTO unsigned sys_pagesize; -MDBX_MAYBE_UNUSED MDBX_INTERNAL_VAR_PROTO unsigned sys_pagesize_ln2, - sys_allocation_granularity; - -/* Get the size of a memory page for the system. - * This is the basic size that the platform's memory manager uses, and is - * fundamental to the use of memory-mapped files. */ -MDBX_MAYBE_UNUSED MDBX_NOTHROW_CONST_FUNCTION static __inline size_t -osal_syspagesize(void) { - assert(sys_pagesize > 0 && (sys_pagesize & (sys_pagesize - 1)) == 0); - return sys_pagesize; -} - #if defined(_WIN32) || defined(_WIN64) typedef wchar_t pathchar_t; #define MDBX_PRIsPATH "ls" @@ -1228,7 +1232,7 @@ typedef char pathchar_t; typedef struct osal_mmap { union { void *base; - struct MDBX_lockinfo *lck; + struct shared_lck *lck; }; mdbx_filehandle_t fd; size_t limit; /* mapping length, but NOT a size of file nor DB */ @@ -1239,25 +1243,6 @@ typedef struct osal_mmap { #endif } osal_mmap_t; -typedef union bin128 { - __anonymous_struct_extension__ struct { - uint64_t x, y; - }; - __anonymous_struct_extension__ struct { - uint32_t a, b, c, d; - }; -} bin128_t; - -#if defined(_WIN32) || defined(_WIN64) -typedef union osal_srwlock { - __anonymous_struct_extension__ struct { - long volatile readerCount; - long volatile writerCount; - }; - RTL_SRWLOCK native; -} osal_srwlock_t; 
-#endif /* Windows */ - #ifndef MDBX_HAVE_PWRITEV #if defined(_WIN32) || defined(_WIN64) @@ -1340,32 +1325,30 @@ typedef struct osal_ioring { char *boundary; } osal_ioring_t; -#ifndef __cplusplus - /* Actually this is not ioring for now, but on the way. */ -MDBX_INTERNAL_FUNC int osal_ioring_create(osal_ioring_t * +MDBX_INTERNAL int osal_ioring_create(osal_ioring_t * #if defined(_WIN32) || defined(_WIN64) - , - bool enable_direct, - mdbx_filehandle_t overlapped_fd + , + bool enable_direct, + mdbx_filehandle_t overlapped_fd #endif /* Windows */ ); -MDBX_INTERNAL_FUNC int osal_ioring_resize(osal_ioring_t *, size_t items); -MDBX_INTERNAL_FUNC void osal_ioring_destroy(osal_ioring_t *); -MDBX_INTERNAL_FUNC void osal_ioring_reset(osal_ioring_t *); -MDBX_INTERNAL_FUNC int osal_ioring_add(osal_ioring_t *ctx, const size_t offset, - void *data, const size_t bytes); +MDBX_INTERNAL int osal_ioring_resize(osal_ioring_t *, size_t items); +MDBX_INTERNAL void osal_ioring_destroy(osal_ioring_t *); +MDBX_INTERNAL void osal_ioring_reset(osal_ioring_t *); +MDBX_INTERNAL int osal_ioring_add(osal_ioring_t *ctx, const size_t offset, + void *data, const size_t bytes); typedef struct osal_ioring_write_result { int err; unsigned wops; } osal_ioring_write_result_t; -MDBX_INTERNAL_FUNC osal_ioring_write_result_t +MDBX_INTERNAL osal_ioring_write_result_t osal_ioring_write(osal_ioring_t *ior, mdbx_filehandle_t fd); -typedef struct iov_ctx iov_ctx_t; -MDBX_INTERNAL_FUNC void osal_ioring_walk( - osal_ioring_t *ior, iov_ctx_t *ctx, - void (*callback)(iov_ctx_t *ctx, size_t offset, void *data, size_t bytes)); +MDBX_INTERNAL void osal_ioring_walk(osal_ioring_t *ior, iov_ctx_t *ctx, + void (*callback)(iov_ctx_t *ctx, + size_t offset, void *data, + size_t bytes)); MDBX_MAYBE_UNUSED static inline unsigned osal_ioring_left(const osal_ioring_t *ior) { @@ -1402,9 +1385,9 @@ osal_ioring_prepare(osal_ioring_t *ior, size_t items, size_t bytes) { #define osal_asprintf asprintf #define osal_vasprintf vasprintf 
#else -MDBX_MAYBE_UNUSED MDBX_INTERNAL_FUNC +MDBX_MAYBE_UNUSED MDBX_INTERNAL MDBX_PRINTF_ARGS(2, 3) int osal_asprintf(char **strp, const char *fmt, ...); -MDBX_INTERNAL_FUNC int osal_vasprintf(char **strp, const char *fmt, va_list ap); +MDBX_INTERNAL int osal_vasprintf(char **strp, const char *fmt, va_list ap); #endif #if !defined(MADV_DODUMP) && defined(MADV_CORE) @@ -1415,8 +1398,7 @@ MDBX_INTERNAL_FUNC int osal_vasprintf(char **strp, const char *fmt, va_list ap); #define MADV_DONTDUMP MADV_NOCORE #endif /* MADV_NOCORE -> MADV_DONTDUMP */ -MDBX_MAYBE_UNUSED MDBX_INTERNAL_FUNC void osal_jitter(bool tiny); -MDBX_MAYBE_UNUSED static __inline void jitter4testing(bool tiny); +MDBX_MAYBE_UNUSED MDBX_INTERNAL void osal_jitter(bool tiny); /* max bytes to write in one call */ #if defined(_WIN64) @@ -1466,19 +1448,13 @@ MDBX_MAYBE_UNUSED static __inline void jitter4testing(bool tiny); #endif /* OFF_T_MAX */ #endif /* MDBX_F_OFD_SETLK64, MDBX_F_OFD_SETLKW64, MDBX_F_OFD_GETLK64 */ -#endif - -#if defined(__linux__) || defined(__gnu_linux__) -MDBX_INTERNAL_VAR_PROTO uint32_t linux_kernel_version; -MDBX_INTERNAL_VAR_PROTO bool - mdbx_RunningOnWSL1 /* Windows Subsystem 1 for Linux */; -#endif /* Linux */ +#endif /* !Windows */ #ifndef osal_strdup LIBMDBX_API char *osal_strdup(const char *str); #endif -MDBX_MAYBE_UNUSED static __inline int osal_get_errno(void) { +MDBX_MAYBE_UNUSED static inline int osal_get_errno(void) { #if defined(_WIN32) || defined(_WIN64) DWORD rc = GetLastError(); #else @@ -1488,40 +1464,39 @@ MDBX_MAYBE_UNUSED static __inline int osal_get_errno(void) { } #ifndef osal_memalign_alloc -MDBX_INTERNAL_FUNC int osal_memalign_alloc(size_t alignment, size_t bytes, - void **result); +MDBX_INTERNAL int osal_memalign_alloc(size_t alignment, size_t bytes, + void **result); #endif #ifndef osal_memalign_free -MDBX_INTERNAL_FUNC void osal_memalign_free(void *ptr); -#endif - -MDBX_INTERNAL_FUNC int osal_condpair_init(osal_condpair_t *condpair); -MDBX_INTERNAL_FUNC int 
osal_condpair_lock(osal_condpair_t *condpair); -MDBX_INTERNAL_FUNC int osal_condpair_unlock(osal_condpair_t *condpair); -MDBX_INTERNAL_FUNC int osal_condpair_signal(osal_condpair_t *condpair, - bool part); -MDBX_INTERNAL_FUNC int osal_condpair_wait(osal_condpair_t *condpair, bool part); -MDBX_INTERNAL_FUNC int osal_condpair_destroy(osal_condpair_t *condpair); - -MDBX_INTERNAL_FUNC int osal_fastmutex_init(osal_fastmutex_t *fastmutex); -MDBX_INTERNAL_FUNC int osal_fastmutex_acquire(osal_fastmutex_t *fastmutex); -MDBX_INTERNAL_FUNC int osal_fastmutex_release(osal_fastmutex_t *fastmutex); -MDBX_INTERNAL_FUNC int osal_fastmutex_destroy(osal_fastmutex_t *fastmutex); - -MDBX_INTERNAL_FUNC int osal_pwritev(mdbx_filehandle_t fd, struct iovec *iov, - size_t sgvcnt, uint64_t offset); -MDBX_INTERNAL_FUNC int osal_pread(mdbx_filehandle_t fd, void *buf, size_t count, - uint64_t offset); -MDBX_INTERNAL_FUNC int osal_pwrite(mdbx_filehandle_t fd, const void *buf, - size_t count, uint64_t offset); -MDBX_INTERNAL_FUNC int osal_write(mdbx_filehandle_t fd, const void *buf, - size_t count); - -MDBX_INTERNAL_FUNC int +MDBX_INTERNAL void osal_memalign_free(void *ptr); +#endif + +MDBX_INTERNAL int osal_condpair_init(osal_condpair_t *condpair); +MDBX_INTERNAL int osal_condpair_lock(osal_condpair_t *condpair); +MDBX_INTERNAL int osal_condpair_unlock(osal_condpair_t *condpair); +MDBX_INTERNAL int osal_condpair_signal(osal_condpair_t *condpair, bool part); +MDBX_INTERNAL int osal_condpair_wait(osal_condpair_t *condpair, bool part); +MDBX_INTERNAL int osal_condpair_destroy(osal_condpair_t *condpair); + +MDBX_INTERNAL int osal_fastmutex_init(osal_fastmutex_t *fastmutex); +MDBX_INTERNAL int osal_fastmutex_acquire(osal_fastmutex_t *fastmutex); +MDBX_INTERNAL int osal_fastmutex_release(osal_fastmutex_t *fastmutex); +MDBX_INTERNAL int osal_fastmutex_destroy(osal_fastmutex_t *fastmutex); + +MDBX_INTERNAL int osal_pwritev(mdbx_filehandle_t fd, struct iovec *iov, + size_t sgvcnt, uint64_t offset); 
+MDBX_INTERNAL int osal_pread(mdbx_filehandle_t fd, void *buf, size_t count, + uint64_t offset); +MDBX_INTERNAL int osal_pwrite(mdbx_filehandle_t fd, const void *buf, + size_t count, uint64_t offset); +MDBX_INTERNAL int osal_write(mdbx_filehandle_t fd, const void *buf, + size_t count); + +MDBX_INTERNAL int osal_thread_create(osal_thread_t *thread, THREAD_RESULT(THREAD_CALL *start_routine)(void *), void *arg); -MDBX_INTERNAL_FUNC int osal_thread_join(osal_thread_t thread); +MDBX_INTERNAL int osal_thread_join(osal_thread_t thread); enum osal_syncmode_bits { MDBX_SYNC_NONE = 0, @@ -1531,11 +1506,11 @@ enum osal_syncmode_bits { MDBX_SYNC_IODQ = 8 }; -MDBX_INTERNAL_FUNC int osal_fsync(mdbx_filehandle_t fd, - const enum osal_syncmode_bits mode_bits); -MDBX_INTERNAL_FUNC int osal_ftruncate(mdbx_filehandle_t fd, uint64_t length); -MDBX_INTERNAL_FUNC int osal_fseek(mdbx_filehandle_t fd, uint64_t pos); -MDBX_INTERNAL_FUNC int osal_filesize(mdbx_filehandle_t fd, uint64_t *length); +MDBX_INTERNAL int osal_fsync(mdbx_filehandle_t fd, + const enum osal_syncmode_bits mode_bits); +MDBX_INTERNAL int osal_ftruncate(mdbx_filehandle_t fd, uint64_t length); +MDBX_INTERNAL int osal_fseek(mdbx_filehandle_t fd, uint64_t pos); +MDBX_INTERNAL int osal_filesize(mdbx_filehandle_t fd, uint64_t *length); enum osal_openfile_purpose { MDBX_OPEN_DXB_READ, @@ -1550,7 +1525,7 @@ enum osal_openfile_purpose { MDBX_OPEN_DELETE }; -MDBX_MAYBE_UNUSED static __inline bool osal_isdirsep(pathchar_t c) { +MDBX_MAYBE_UNUSED static inline bool osal_isdirsep(pathchar_t c) { return #if defined(_WIN32) || defined(_WIN64) c == '\\' || @@ -1558,50 +1533,45 @@ MDBX_MAYBE_UNUSED static __inline bool osal_isdirsep(pathchar_t c) { c == '/'; } -MDBX_INTERNAL_FUNC bool osal_pathequal(const pathchar_t *l, const pathchar_t *r, - size_t len); -MDBX_INTERNAL_FUNC pathchar_t *osal_fileext(const pathchar_t *pathname, - size_t len); -MDBX_INTERNAL_FUNC int osal_fileexists(const pathchar_t *pathname); -MDBX_INTERNAL_FUNC int 
osal_openfile(const enum osal_openfile_purpose purpose, - const MDBX_env *env, - const pathchar_t *pathname, - mdbx_filehandle_t *fd, - mdbx_mode_t unix_mode_bits); -MDBX_INTERNAL_FUNC int osal_closefile(mdbx_filehandle_t fd); -MDBX_INTERNAL_FUNC int osal_removefile(const pathchar_t *pathname); -MDBX_INTERNAL_FUNC int osal_removedirectory(const pathchar_t *pathname); -MDBX_INTERNAL_FUNC int osal_is_pipe(mdbx_filehandle_t fd); -MDBX_INTERNAL_FUNC int osal_lockfile(mdbx_filehandle_t fd, bool wait); +MDBX_INTERNAL bool osal_pathequal(const pathchar_t *l, const pathchar_t *r, + size_t len); +MDBX_INTERNAL pathchar_t *osal_fileext(const pathchar_t *pathname, size_t len); +MDBX_INTERNAL int osal_fileexists(const pathchar_t *pathname); +MDBX_INTERNAL int osal_openfile(const enum osal_openfile_purpose purpose, + const MDBX_env *env, const pathchar_t *pathname, + mdbx_filehandle_t *fd, + mdbx_mode_t unix_mode_bits); +MDBX_INTERNAL int osal_closefile(mdbx_filehandle_t fd); +MDBX_INTERNAL int osal_removefile(const pathchar_t *pathname); +MDBX_INTERNAL int osal_removedirectory(const pathchar_t *pathname); +MDBX_INTERNAL int osal_is_pipe(mdbx_filehandle_t fd); +MDBX_INTERNAL int osal_lockfile(mdbx_filehandle_t fd, bool wait); #define MMAP_OPTION_TRUNCATE 1 #define MMAP_OPTION_SEMAPHORE 2 -MDBX_INTERNAL_FUNC int osal_mmap(const int flags, osal_mmap_t *map, size_t size, - const size_t limit, const unsigned options); -MDBX_INTERNAL_FUNC int osal_munmap(osal_mmap_t *map); +MDBX_INTERNAL int osal_mmap(const int flags, osal_mmap_t *map, size_t size, + const size_t limit, const unsigned options); +MDBX_INTERNAL int osal_munmap(osal_mmap_t *map); #define MDBX_MRESIZE_MAY_MOVE 0x00000100 #define MDBX_MRESIZE_MAY_UNMAP 0x00000200 -MDBX_INTERNAL_FUNC int osal_mresize(const int flags, osal_mmap_t *map, - size_t size, size_t limit); +MDBX_INTERNAL int osal_mresize(const int flags, osal_mmap_t *map, size_t size, + size_t limit); #if defined(_WIN32) || defined(_WIN64) typedef struct { 
unsigned limit, count; HANDLE handles[31]; } mdbx_handle_array_t; -MDBX_INTERNAL_FUNC int +MDBX_INTERNAL int osal_suspend_threads_before_remap(MDBX_env *env, mdbx_handle_array_t **array); -MDBX_INTERNAL_FUNC int -osal_resume_threads_after_remap(mdbx_handle_array_t *array); +MDBX_INTERNAL int osal_resume_threads_after_remap(mdbx_handle_array_t *array); #endif /* Windows */ -MDBX_INTERNAL_FUNC int osal_msync(const osal_mmap_t *map, size_t offset, - size_t length, - enum osal_syncmode_bits mode_bits); -MDBX_INTERNAL_FUNC int osal_check_fs_rdonly(mdbx_filehandle_t handle, - const pathchar_t *pathname, - int err); -MDBX_INTERNAL_FUNC int osal_check_fs_incore(mdbx_filehandle_t handle); - -MDBX_MAYBE_UNUSED static __inline uint32_t osal_getpid(void) { +MDBX_INTERNAL int osal_msync(const osal_mmap_t *map, size_t offset, + size_t length, enum osal_syncmode_bits mode_bits); +MDBX_INTERNAL int osal_check_fs_rdonly(mdbx_filehandle_t handle, + const pathchar_t *pathname, int err); +MDBX_INTERNAL int osal_check_fs_incore(mdbx_filehandle_t handle); + +MDBX_MAYBE_UNUSED static inline uint32_t osal_getpid(void) { STATIC_ASSERT(sizeof(mdbx_pid_t) <= sizeof(uint32_t)); #if defined(_WIN32) || defined(_WIN64) return GetCurrentProcessId(); @@ -1611,7 +1581,7 @@ MDBX_MAYBE_UNUSED static __inline uint32_t osal_getpid(void) { #endif } -MDBX_MAYBE_UNUSED static __inline uintptr_t osal_thread_self(void) { +MDBX_MAYBE_UNUSED static inline uintptr_t osal_thread_self(void) { mdbx_tid_t thunk; STATIC_ASSERT(sizeof(uintptr_t) >= sizeof(thunk)); #if defined(_WIN32) || defined(_WIN64) @@ -1624,22 +1594,22 @@ MDBX_MAYBE_UNUSED static __inline uintptr_t osal_thread_self(void) { #if !defined(_WIN32) && !defined(_WIN64) #if defined(__ANDROID_API__) || defined(ANDROID) || defined(BIONIC) -MDBX_INTERNAL_FUNC int osal_check_tid4bionic(void); +MDBX_INTERNAL int osal_check_tid4bionic(void); #else -static __inline int osal_check_tid4bionic(void) { return 0; } +static inline int osal_check_tid4bionic(void) { 
return 0; } #endif /* __ANDROID_API__ || ANDROID) || BIONIC */ -MDBX_MAYBE_UNUSED static __inline int +MDBX_MAYBE_UNUSED static inline int osal_pthread_mutex_lock(pthread_mutex_t *mutex) { int err = osal_check_tid4bionic(); return unlikely(err) ? err : pthread_mutex_lock(mutex); } #endif /* !Windows */ -MDBX_INTERNAL_FUNC uint64_t osal_monotime(void); -MDBX_INTERNAL_FUNC uint64_t osal_cputime(size_t *optional_page_faults); -MDBX_INTERNAL_FUNC uint64_t osal_16dot16_to_monotime(uint32_t seconds_16dot16); -MDBX_INTERNAL_FUNC uint32_t osal_monotime_to_16dot16(uint64_t monotime); +MDBX_INTERNAL uint64_t osal_monotime(void); +MDBX_INTERNAL uint64_t osal_cputime(size_t *optional_page_faults); +MDBX_INTERNAL uint64_t osal_16dot16_to_monotime(uint32_t seconds_16dot16); +MDBX_INTERNAL uint32_t osal_monotime_to_16dot16(uint64_t monotime); MDBX_MAYBE_UNUSED static inline uint32_t osal_monotime_to_16dot16_noUnderflow(uint64_t monotime) { @@ -1647,249 +1617,18 @@ osal_monotime_to_16dot16_noUnderflow(uint64_t monotime) { return seconds_16dot16 ? seconds_16dot16 : /* fix underflow */ (monotime > 0); } -MDBX_INTERNAL_FUNC bin128_t osal_bootid(void); /*----------------------------------------------------------------------------*/ -/* lck stuff */ - -/// \brief Initialization of synchronization primitives linked with MDBX_env -/// instance both in LCK-file and within the current process. -/// \param -/// global_uniqueness_flag = true - denotes that there are no other processes -/// working with DB and LCK-file. Thus the function MUST initialize -/// shared synchronization objects in memory-mapped LCK-file. -/// global_uniqueness_flag = false - denotes that at least one process is -/// already working with DB and LCK-file, including the case when DB -/// has already been opened in the current process. Thus the function -/// MUST NOT initialize shared synchronization objects in memory-mapped -/// LCK-file that are already in use. -/// \return Error code or zero on success. 
-MDBX_INTERNAL_FUNC int osal_lck_init(MDBX_env *env, - MDBX_env *inprocess_neighbor, - int global_uniqueness_flag); - -/// \brief Disconnects from shared interprocess objects and destructs -/// synchronization objects linked with MDBX_env instance -/// within the current process. -/// \param -/// inprocess_neighbor = NULL - if the current process does not have other -/// instances of MDBX_env linked with the DB being closed. -/// Thus the function MUST check for other processes working with DB or -/// LCK-file, and keep or destroy shared synchronization objects in -/// memory-mapped LCK-file depending on the result. -/// inprocess_neighbor = not-NULL - pointer to another instance of MDBX_env -/// (anyone of there is several) working with DB or LCK-file within the -/// current process. Thus the function MUST NOT try to acquire exclusive -/// lock and/or try to destruct shared synchronization objects linked with -/// DB or LCK-file. Moreover, the implementation MUST ensure correct work -/// of other instances of MDBX_env within the current process, e.g. -/// restore POSIX-fcntl locks after the closing of file descriptors. -/// \return Error code (MDBX_PANIC) or zero on success. -MDBX_INTERNAL_FUNC int osal_lck_destroy(MDBX_env *env, - MDBX_env *inprocess_neighbor, - const uint32_t current_pid); - -/// \brief Connects to shared interprocess locking objects and tries to acquire -/// the maximum lock level (shared if exclusive is not available) -/// Depending on implementation or/and platform (Windows) this function may -/// acquire the non-OS super-level lock (e.g. for shared synchronization -/// objects initialization), which will be downgraded to OS-exclusive or -/// shared via explicit calling of osal_lck_downgrade(). -/// \return -/// MDBX_RESULT_TRUE (-1) - if an exclusive lock was acquired and thus -/// the current process is the first and only after the last use of DB. 
-/// MDBX_RESULT_FALSE (0) - if a shared lock was acquired and thus -/// DB has already been opened and now is used by other processes. -/// Otherwise (not 0 and not -1) - error code. -MDBX_INTERNAL_FUNC int osal_lck_seize(MDBX_env *env); - -/// \brief Downgrades the level of initially acquired lock to -/// operational level specified by argument. The reason for such downgrade: -/// - unblocking of other processes that are waiting for access, i.e. -/// if (env->me_flags & MDBX_EXCLUSIVE) != 0, then other processes -/// should be made aware that access is unavailable rather than -/// wait for it. -/// - freeing locks that interfere file operation (especially for Windows) -/// (env->me_flags & MDBX_EXCLUSIVE) == 0 - downgrade to shared lock. -/// (env->me_flags & MDBX_EXCLUSIVE) != 0 - downgrade to exclusive -/// operational lock. -/// \return Error code or zero on success -MDBX_INTERNAL_FUNC int osal_lck_downgrade(MDBX_env *env); -MDBX_MAYBE_UNUSED MDBX_INTERNAL_FUNC int osal_lck_upgrade(MDBX_env *env, - bool dont_wait); - -/// \brief Locks LCK-file or/and table of readers for (de)registering. -/// \return Error code or zero on success -MDBX_INTERNAL_FUNC int osal_rdt_lock(MDBX_env *env); - -/// \brief Unlocks LCK-file or/and table of readers after (de)registering. -MDBX_INTERNAL_FUNC void osal_rdt_unlock(MDBX_env *env); - -/// \brief Acquires write-transaction lock. -/// \return Error code or zero on success -MDBX_INTERNAL_FUNC int osal_txn_lock(MDBX_env *env, bool dont_wait); - -/// \brief Releases write-transaction lock.. -MDBX_INTERNAL_FUNC void osal_txn_unlock(MDBX_env *env); - -/// \brief Sets alive-flag of reader presence (indicative lock) for PID of -/// the current process. The function does no more than needed for -/// the correct working of osal_rpid_check() in other processes. 
-/// \return Error code or zero on success -MDBX_INTERNAL_FUNC int osal_rpid_set(MDBX_env *env); - -/// \brief Resets alive-flag of reader presence (indicative lock) -/// for PID of the current process. The function does no more than needed -/// for the correct working of osal_rpid_check() in other processes. -/// \return Error code or zero on success -MDBX_INTERNAL_FUNC int osal_rpid_clear(MDBX_env *env); - -/// \brief Checks for reading process status with the given pid with help of -/// alive-flag of presence (indicative lock) or using another way. -/// \return -/// MDBX_RESULT_TRUE (-1) - if the reader process with the given PID is alive -/// and working with DB (indicative lock is present). -/// MDBX_RESULT_FALSE (0) - if the reader process with the given PID is absent -/// or not working with DB (indicative lock is not present). -/// Otherwise (not 0 and not -1) - error code. -MDBX_INTERNAL_FUNC int osal_rpid_check(MDBX_env *env, uint32_t pid); - -#if defined(_WIN32) || defined(_WIN64) - -MDBX_INTERNAL_FUNC int osal_mb2w(const char *const src, wchar_t **const pdst); - -typedef void(WINAPI *osal_srwlock_t_function)(osal_srwlock_t *); -MDBX_INTERNAL_VAR_PROTO osal_srwlock_t_function osal_srwlock_Init, - osal_srwlock_AcquireShared, osal_srwlock_ReleaseShared, - osal_srwlock_AcquireExclusive, osal_srwlock_ReleaseExclusive; - -#if _WIN32_WINNT < 0x0600 /* prior to Windows Vista */ -typedef enum _FILE_INFO_BY_HANDLE_CLASS { - FileBasicInfo, - FileStandardInfo, - FileNameInfo, - FileRenameInfo, - FileDispositionInfo, - FileAllocationInfo, - FileEndOfFileInfo, - FileStreamInfo, - FileCompressionInfo, - FileAttributeTagInfo, - FileIdBothDirectoryInfo, - FileIdBothDirectoryRestartInfo, - FileIoPriorityHintInfo, - FileRemoteProtocolInfo, - MaximumFileInfoByHandleClass -} FILE_INFO_BY_HANDLE_CLASS, - *PFILE_INFO_BY_HANDLE_CLASS; - -typedef struct _FILE_END_OF_FILE_INFO { - LARGE_INTEGER EndOfFile; -} FILE_END_OF_FILE_INFO, *PFILE_END_OF_FILE_INFO; - -#define 
REMOTE_PROTOCOL_INFO_FLAG_LOOPBACK 0x00000001 -#define REMOTE_PROTOCOL_INFO_FLAG_OFFLINE 0x00000002 - -typedef struct _FILE_REMOTE_PROTOCOL_INFO { - USHORT StructureVersion; - USHORT StructureSize; - DWORD Protocol; - USHORT ProtocolMajorVersion; - USHORT ProtocolMinorVersion; - USHORT ProtocolRevision; - USHORT Reserved; - DWORD Flags; - struct { - DWORD Reserved[8]; - } GenericReserved; - struct { - DWORD Reserved[16]; - } ProtocolSpecificReserved; -} FILE_REMOTE_PROTOCOL_INFO, *PFILE_REMOTE_PROTOCOL_INFO; - -#endif /* _WIN32_WINNT < 0x0600 (prior to Windows Vista) */ - -typedef BOOL(WINAPI *MDBX_GetFileInformationByHandleEx)( - _In_ HANDLE hFile, _In_ FILE_INFO_BY_HANDLE_CLASS FileInformationClass, - _Out_ LPVOID lpFileInformation, _In_ DWORD dwBufferSize); -MDBX_INTERNAL_VAR_PROTO MDBX_GetFileInformationByHandleEx - mdbx_GetFileInformationByHandleEx; - -typedef BOOL(WINAPI *MDBX_GetVolumeInformationByHandleW)( - _In_ HANDLE hFile, _Out_opt_ LPWSTR lpVolumeNameBuffer, - _In_ DWORD nVolumeNameSize, _Out_opt_ LPDWORD lpVolumeSerialNumber, - _Out_opt_ LPDWORD lpMaximumComponentLength, - _Out_opt_ LPDWORD lpFileSystemFlags, - _Out_opt_ LPWSTR lpFileSystemNameBuffer, _In_ DWORD nFileSystemNameSize); -MDBX_INTERNAL_VAR_PROTO MDBX_GetVolumeInformationByHandleW - mdbx_GetVolumeInformationByHandleW; - -typedef DWORD(WINAPI *MDBX_GetFinalPathNameByHandleW)(_In_ HANDLE hFile, - _Out_ LPWSTR lpszFilePath, - _In_ DWORD cchFilePath, - _In_ DWORD dwFlags); -MDBX_INTERNAL_VAR_PROTO MDBX_GetFinalPathNameByHandleW - mdbx_GetFinalPathNameByHandleW; - -typedef BOOL(WINAPI *MDBX_SetFileInformationByHandle)( - _In_ HANDLE hFile, _In_ FILE_INFO_BY_HANDLE_CLASS FileInformationClass, - _Out_ LPVOID lpFileInformation, _In_ DWORD dwBufferSize); -MDBX_INTERNAL_VAR_PROTO MDBX_SetFileInformationByHandle - mdbx_SetFileInformationByHandle; - -typedef NTSTATUS(NTAPI *MDBX_NtFsControlFile)( - IN HANDLE FileHandle, IN OUT HANDLE Event, - IN OUT PVOID /* PIO_APC_ROUTINE */ ApcRoutine, IN OUT PVOID 
ApcContext, - OUT PIO_STATUS_BLOCK IoStatusBlock, IN ULONG FsControlCode, - IN OUT PVOID InputBuffer, IN ULONG InputBufferLength, - OUT OPTIONAL PVOID OutputBuffer, IN ULONG OutputBufferLength); -MDBX_INTERNAL_VAR_PROTO MDBX_NtFsControlFile mdbx_NtFsControlFile; - -typedef uint64_t(WINAPI *MDBX_GetTickCount64)(void); -MDBX_INTERNAL_VAR_PROTO MDBX_GetTickCount64 mdbx_GetTickCount64; - -#if !defined(_WIN32_WINNT_WIN8) || _WIN32_WINNT < _WIN32_WINNT_WIN8 -typedef struct _WIN32_MEMORY_RANGE_ENTRY { - PVOID VirtualAddress; - SIZE_T NumberOfBytes; -} WIN32_MEMORY_RANGE_ENTRY, *PWIN32_MEMORY_RANGE_ENTRY; -#endif /* Windows 8.x */ - -typedef BOOL(WINAPI *MDBX_PrefetchVirtualMemory)( - HANDLE hProcess, ULONG_PTR NumberOfEntries, - PWIN32_MEMORY_RANGE_ENTRY VirtualAddresses, ULONG Flags); -MDBX_INTERNAL_VAR_PROTO MDBX_PrefetchVirtualMemory mdbx_PrefetchVirtualMemory; - -typedef enum _SECTION_INHERIT { ViewShare = 1, ViewUnmap = 2 } SECTION_INHERIT; - -typedef NTSTATUS(NTAPI *MDBX_NtExtendSection)(IN HANDLE SectionHandle, - IN PLARGE_INTEGER NewSectionSize); -MDBX_INTERNAL_VAR_PROTO MDBX_NtExtendSection mdbx_NtExtendSection; - -static __inline bool mdbx_RunningUnderWine(void) { - return !mdbx_NtExtendSection; -} - -typedef LSTATUS(WINAPI *MDBX_RegGetValueA)(HKEY hkey, LPCSTR lpSubKey, - LPCSTR lpValue, DWORD dwFlags, - LPDWORD pdwType, PVOID pvData, - LPDWORD pcbData); -MDBX_INTERNAL_VAR_PROTO MDBX_RegGetValueA mdbx_RegGetValueA; - -NTSYSAPI ULONG RtlRandomEx(PULONG Seed); -typedef BOOL(WINAPI *MDBX_SetFileIoOverlappedRange)(HANDLE FileHandle, - PUCHAR OverlappedRangeStart, - ULONG Length); -MDBX_INTERNAL_VAR_PROTO MDBX_SetFileIoOverlappedRange - mdbx_SetFileIoOverlappedRange; +MDBX_INTERNAL void osal_ctor(void); +MDBX_INTERNAL void osal_dtor(void); +#if defined(_WIN32) || defined(_WIN64) +MDBX_INTERNAL int osal_mb2w(const char *const src, wchar_t **const pdst); #endif /* Windows */ -#endif /* !__cplusplus */ - 
/*----------------------------------------------------------------------------*/ -MDBX_MAYBE_UNUSED MDBX_NOTHROW_PURE_FUNCTION static __always_inline uint64_t +MDBX_MAYBE_UNUSED MDBX_NOTHROW_PURE_FUNCTION static inline uint64_t osal_bswap64(uint64_t v) { #if __GNUC_PREREQ(4, 4) || __CLANG_PREREQ(4, 0) || \ __has_builtin(__builtin_bswap64) @@ -1910,7 +1649,7 @@ osal_bswap64(uint64_t v) { #endif } -MDBX_MAYBE_UNUSED MDBX_NOTHROW_PURE_FUNCTION static __always_inline uint32_t +MDBX_MAYBE_UNUSED MDBX_NOTHROW_PURE_FUNCTION static inline uint32_t osal_bswap32(uint32_t v) { #if __GNUC_PREREQ(4, 4) || __CLANG_PREREQ(4, 0) || \ __has_builtin(__builtin_bswap32) @@ -1927,45 +1666,17 @@ osal_bswap32(uint32_t v) { #endif } -/*----------------------------------------------------------------------------*/ - -#if defined(_MSC_VER) && _MSC_VER >= 1900 -/* LY: MSVC 2015/2017/2019 has buggy/inconsistent PRIuPTR/PRIxPTR macros - * for internal format-args checker. */ -#undef PRIuPTR -#undef PRIiPTR -#undef PRIdPTR -#undef PRIxPTR -#define PRIuPTR "Iu" -#define PRIiPTR "Ii" -#define PRIdPTR "Id" -#define PRIxPTR "Ix" -#define PRIuSIZE "zu" -#define PRIiSIZE "zi" -#define PRIdSIZE "zd" -#define PRIxSIZE "zx" -#endif /* fix PRI*PTR for _MSC_VER */ - -#ifndef PRIuSIZE -#define PRIuSIZE PRIuPTR -#define PRIiSIZE PRIiPTR -#define PRIdSIZE PRIdPTR -#define PRIxSIZE PRIxPTR -#endif /* PRI*SIZE macros for MSVC */ - -#ifdef _MSC_VER -#pragma warning(pop) -#endif +#if UINTPTR_MAX > 0xffffFFFFul || ULONG_MAX > 0xffffFFFFul || defined(_WIN64) +#define MDBX_WORDBITS 64 +#else +#define MDBX_WORDBITS 32 +#endif /* MDBX_WORDBITS */ -#define mdbx_sourcery_anchor XCONCAT(mdbx_sourcery_, MDBX_BUILD_SOURCERY) -#if defined(xMDBX_TOOLS) -extern LIBMDBX_API const char *const mdbx_sourcery_anchor; -#endif /******************************************************************************* - ******************************************************************************* 
******************************************************************************* * + * BUILD TIME * * #### ##### ##### # #### # # #### * # # # # # # # # ## # # @@ -1977,6 +1688,8 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor; * */ + + /** \defgroup build_option Build options * The libmdbx build options. @{ */ @@ -2156,7 +1869,11 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor; /** Avoid dependence from MSVC CRT and use ntdll.dll instead. */ #ifndef MDBX_WITHOUT_MSVC_CRT +#if !defined(MDBX_BUILD_CXX) || !MDBX_BUILD_CXX #define MDBX_WITHOUT_MSVC_CRT 1 +#else +#define MDBX_WITHOUT_MSVC_CRT 0 +#endif #elif !(MDBX_WITHOUT_MSVC_CRT == 0 || MDBX_WITHOUT_MSVC_CRT == 1) #error MDBX_WITHOUT_MSVC_CRT must be defined as 0 or 1 #endif /* MDBX_WITHOUT_MSVC_CRT */ @@ -2463,6 +2180,13 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor; #endif #endif /* MDBX_CACHELINE_SIZE */ +/* Max length of iov-vector passed to writev() call, used for auxilary writes */ +#define MDBX_AUXILARY_IOV_MAX 64 +#if defined(IOV_MAX) && IOV_MAX < MDBX_AUXILARY_IOV_MAX +#undef MDBX_AUXILARY_IOV_MAX +#define MDBX_AUXILARY_IOV_MAX IOV_MAX +#endif /* MDBX_AUXILARY_IOV_MAX */ + /** @} end of build options */ /******************************************************************************* ******************************************************************************* @@ -2477,6 +2201,9 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor; #else #define MDBX_DEBUG 1 #endif +#endif +#if MDBX_DEBUG < 0 || MDBX_DEBUG > 2 +#error "The MDBX_DEBUG must be defined to 0, 1 or 2" #endif /* MDBX_DEBUG */ #else @@ -2496,179 +2223,63 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor; * Also enables \ref MDBX_DBG_AUDIT if `MDBX_DEBUG >= 2`. * * \ingroup build_option */ -#define MDBX_DEBUG 0...7 +#define MDBX_DEBUG 0...2 /** Disables using of GNU libc extensions. 
*/ #define MDBX_DISABLE_GNU_SOURCE 0 or 1 #endif /* DOXYGEN */ -/* Undefine the NDEBUG if debugging is enforced by MDBX_DEBUG */ -#if MDBX_DEBUG -#undef NDEBUG -#endif -#ifndef __cplusplus -/*----------------------------------------------------------------------------*/ -/* Debug and Logging stuff */ -#define MDBX_RUNTIME_FLAGS_INIT \ - ((MDBX_DEBUG) > 0) * MDBX_DBG_ASSERT + ((MDBX_DEBUG) > 1) * MDBX_DBG_AUDIT -union logger_union { - void *ptr; - MDBX_debug_func *fmt; - MDBX_debug_func_nofmt *nofmt; -}; +#ifndef MDBX_64BIT_ATOMIC +#error "The MDBX_64BIT_ATOMIC must be defined before" +#endif /* MDBX_64BIT_ATOMIC */ -MDBX_INTERNAL_VAR_PROTO struct mdbx_static { - uint8_t flags; - uint8_t loglevel; - union logger_union logger; - size_t logger_buffer_size; - char *logger_buffer; -} mdbx_static; +#ifndef MDBX_64BIT_CAS +#error "The MDBX_64BIT_CAS must be defined before" +#endif /* MDBX_64BIT_CAS */ -MDBX_MAYBE_UNUSED static __inline void jitter4testing(bool tiny) { -#if MDBX_DEBUG - if (MDBX_DBG_JITTER & mdbx_static.flags) - osal_jitter(tiny); -#else - (void)tiny; -#endif -} - -MDBX_INTERNAL_FUNC void MDBX_PRINTF_ARGS(4, 5) - debug_log(int level, const char *function, int line, const char *fmt, ...) 
- MDBX_PRINTF_ARGS(4, 5); -MDBX_INTERNAL_FUNC void debug_log_va(int level, const char *function, int line, - const char *fmt, va_list args); - -#if MDBX_DEBUG -#define LOG_ENABLED(msg) unlikely(msg <= mdbx_static.loglevel) -#define AUDIT_ENABLED() unlikely((mdbx_static.flags & MDBX_DBG_AUDIT)) -#else /* MDBX_DEBUG */ -#define LOG_ENABLED(msg) (msg < MDBX_LOG_VERBOSE && msg <= mdbx_static.loglevel) -#define AUDIT_ENABLED() (0) -#endif /* MDBX_DEBUG */ - -#if MDBX_FORCE_ASSERTIONS -#define ASSERT_ENABLED() (1) -#elif MDBX_DEBUG -#define ASSERT_ENABLED() likely((mdbx_static.flags & MDBX_DBG_ASSERT)) +#if defined(__cplusplus) && !defined(__STDC_NO_ATOMICS__) && __has_include() +#include +#define MDBX_HAVE_C11ATOMICS +#elif !defined(__cplusplus) && \ + (__STDC_VERSION__ >= 201112L || __has_extension(c_atomic)) && \ + !defined(__STDC_NO_ATOMICS__) && \ + (__GNUC_PREREQ(4, 9) || __CLANG_PREREQ(3, 8) || \ + !(defined(__GNUC__) || defined(__clang__))) +#include +#define MDBX_HAVE_C11ATOMICS +#elif defined(__GNUC__) || defined(__clang__) +#elif defined(_MSC_VER) +#pragma warning(disable : 4163) /* 'xyz': not available as an intrinsic */ +#pragma warning(disable : 4133) /* 'function': incompatible types - from \ + 'size_t' to 'LONGLONG' */ +#pragma warning(disable : 4244) /* 'return': conversion from 'LONGLONG' to \ + 'std::size_t', possible loss of data */ +#pragma warning(disable : 4267) /* 'function': conversion from 'size_t' to \ + 'long', possible loss of data */ +#pragma intrinsic(_InterlockedExchangeAdd, _InterlockedCompareExchange) +#pragma intrinsic(_InterlockedExchangeAdd64, _InterlockedCompareExchange64) +#elif defined(__APPLE__) +#include #else -#define ASSERT_ENABLED() (0) -#endif /* assertions */ - -#define DEBUG_EXTRA(fmt, ...) \ - do { \ - if (LOG_ENABLED(MDBX_LOG_EXTRA)) \ - debug_log(MDBX_LOG_EXTRA, __func__, __LINE__, fmt, __VA_ARGS__); \ - } while (0) - -#define DEBUG_EXTRA_PRINT(fmt, ...) 
\ - do { \ - if (LOG_ENABLED(MDBX_LOG_EXTRA)) \ - debug_log(MDBX_LOG_EXTRA, NULL, 0, fmt, __VA_ARGS__); \ - } while (0) - -#define TRACE(fmt, ...) \ - do { \ - if (LOG_ENABLED(MDBX_LOG_TRACE)) \ - debug_log(MDBX_LOG_TRACE, __func__, __LINE__, fmt "\n", __VA_ARGS__); \ - } while (0) - -#define DEBUG(fmt, ...) \ - do { \ - if (LOG_ENABLED(MDBX_LOG_DEBUG)) \ - debug_log(MDBX_LOG_DEBUG, __func__, __LINE__, fmt "\n", __VA_ARGS__); \ - } while (0) - -#define VERBOSE(fmt, ...) \ - do { \ - if (LOG_ENABLED(MDBX_LOG_VERBOSE)) \ - debug_log(MDBX_LOG_VERBOSE, __func__, __LINE__, fmt "\n", __VA_ARGS__); \ - } while (0) - -#define NOTICE(fmt, ...) \ - do { \ - if (LOG_ENABLED(MDBX_LOG_NOTICE)) \ - debug_log(MDBX_LOG_NOTICE, __func__, __LINE__, fmt "\n", __VA_ARGS__); \ - } while (0) - -#define WARNING(fmt, ...) \ - do { \ - if (LOG_ENABLED(MDBX_LOG_WARN)) \ - debug_log(MDBX_LOG_WARN, __func__, __LINE__, fmt "\n", __VA_ARGS__); \ - } while (0) - -#undef ERROR /* wingdi.h \ - Yeah, morons from M$ put such definition to the public header. */ - -#define ERROR(fmt, ...) \ - do { \ - if (LOG_ENABLED(MDBX_LOG_ERROR)) \ - debug_log(MDBX_LOG_ERROR, __func__, __LINE__, fmt "\n", __VA_ARGS__); \ - } while (0) - -#define FATAL(fmt, ...) 
\ - debug_log(MDBX_LOG_FATAL, __func__, __LINE__, fmt "\n", __VA_ARGS__); - -#if MDBX_DEBUG -#define ASSERT_FAIL(env, msg, func, line) mdbx_assert_fail(env, msg, func, line) -#else /* MDBX_DEBUG */ -MDBX_NORETURN __cold void assert_fail(const char *msg, const char *func, - unsigned line); -#define ASSERT_FAIL(env, msg, func, line) \ - do { \ - (void)(env); \ - assert_fail(msg, func, line); \ - } while (0) -#endif /* MDBX_DEBUG */ - -#define ENSURE_MSG(env, expr, msg) \ - do { \ - if (unlikely(!(expr))) \ - ASSERT_FAIL(env, msg, __func__, __LINE__); \ - } while (0) - -#define ENSURE(env, expr) ENSURE_MSG(env, expr, #expr) - -/* assert(3) variant in environment context */ -#define eASSERT(env, expr) \ - do { \ - if (ASSERT_ENABLED()) \ - ENSURE(env, expr); \ - } while (0) - -/* assert(3) variant in cursor context */ -#define cASSERT(mc, expr) eASSERT((mc)->mc_txn->mt_env, expr) - -/* assert(3) variant in transaction context */ -#define tASSERT(txn, expr) eASSERT((txn)->mt_env, expr) - -#ifndef xMDBX_TOOLS /* Avoid using internal eASSERT() */ -#undef assert -#define assert(expr) eASSERT(NULL, expr) +#error FIXME atomic-ops #endif -#endif /* __cplusplus */ - -/*----------------------------------------------------------------------------*/ -/* Atomics */ - -enum MDBX_memory_order { +typedef enum mdbx_memory_order { mo_Relaxed, mo_AcquireRelease /* , mo_SequentialConsistency */ -}; +} mdbx_memory_order_t; typedef union { volatile uint32_t weak; #ifdef MDBX_HAVE_C11ATOMICS volatile _Atomic uint32_t c11a; #endif /* MDBX_HAVE_C11ATOMICS */ -} MDBX_atomic_uint32_t; +} mdbx_atomic_uint32_t; typedef union { volatile uint64_t weak; @@ -2678,15 +2289,15 @@ typedef union { #if !defined(MDBX_HAVE_C11ATOMICS) || !MDBX_64BIT_CAS || !MDBX_64BIT_ATOMIC __anonymous_struct_extension__ struct { #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ - MDBX_atomic_uint32_t low, high; + mdbx_atomic_uint32_t low, high; #elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ - MDBX_atomic_uint32_t high, low; + 
mdbx_atomic_uint32_t high, low; #else #error "FIXME: Unsupported byte order" #endif /* __BYTE_ORDER__ */ }; #endif -} MDBX_atomic_uint64_t; +} mdbx_atomic_uint64_t; #ifdef MDBX_HAVE_C11ATOMICS @@ -2713,81 +2324,12 @@ typedef union { #endif /* MDBX_HAVE_C11ATOMICS */ -#ifndef __cplusplus - -#ifdef MDBX_HAVE_C11ATOMICS -#define osal_memory_fence(order, write) \ - atomic_thread_fence((write) ? mo_c11_store(order) : mo_c11_load(order)) -#else /* MDBX_HAVE_C11ATOMICS */ -#define osal_memory_fence(order, write) \ - do { \ - osal_compiler_barrier(); \ - if (write && order > (MDBX_CPU_WRITEBACK_INCOHERENT ? mo_Relaxed \ - : mo_AcquireRelease)) \ - osal_memory_barrier(); \ - } while (0) -#endif /* MDBX_HAVE_C11ATOMICS */ - -#if defined(MDBX_HAVE_C11ATOMICS) && defined(__LCC__) -#define atomic_store32(p, value, order) \ - ({ \ - const uint32_t value_to_store = (value); \ - atomic_store_explicit(MDBX_c11a_rw(uint32_t, p), value_to_store, \ - mo_c11_store(order)); \ - value_to_store; \ - }) -#define atomic_load32(p, order) \ - atomic_load_explicit(MDBX_c11a_ro(uint32_t, p), mo_c11_load(order)) -#define atomic_store64(p, value, order) \ - ({ \ - const uint64_t value_to_store = (value); \ - atomic_store_explicit(MDBX_c11a_rw(uint64_t, p), value_to_store, \ - mo_c11_store(order)); \ - value_to_store; \ - }) -#define atomic_load64(p, order) \ - atomic_load_explicit(MDBX_c11a_ro(uint64_t, p), mo_c11_load(order)) -#endif /* LCC && MDBX_HAVE_C11ATOMICS */ +#define SAFE64_INVALID_THRESHOLD UINT64_C(0xffffFFFF00000000) -#ifndef atomic_store32 -MDBX_MAYBE_UNUSED static __always_inline uint32_t -atomic_store32(MDBX_atomic_uint32_t *p, const uint32_t value, - enum MDBX_memory_order order) { - STATIC_ASSERT(sizeof(MDBX_atomic_uint32_t) == 4); -#ifdef MDBX_HAVE_C11ATOMICS - assert(atomic_is_lock_free(MDBX_c11a_rw(uint32_t, p))); - atomic_store_explicit(MDBX_c11a_rw(uint32_t, p), value, mo_c11_store(order)); -#else /* MDBX_HAVE_C11ATOMICS */ - if (order != mo_Relaxed) - 
osal_compiler_barrier(); - p->weak = value; - osal_memory_fence(order, true); -#endif /* MDBX_HAVE_C11ATOMICS */ - return value; -} -#endif /* atomic_store32 */ -#ifndef atomic_load32 -MDBX_MAYBE_UNUSED static __always_inline uint32_t atomic_load32( - const volatile MDBX_atomic_uint32_t *p, enum MDBX_memory_order order) { - STATIC_ASSERT(sizeof(MDBX_atomic_uint32_t) == 4); -#ifdef MDBX_HAVE_C11ATOMICS - assert(atomic_is_lock_free(MDBX_c11a_ro(uint32_t, p))); - return atomic_load_explicit(MDBX_c11a_ro(uint32_t, p), mo_c11_load(order)); -#else /* MDBX_HAVE_C11ATOMICS */ - osal_memory_fence(order, false); - const uint32_t value = p->weak; - if (order != mo_Relaxed) - osal_compiler_barrier(); - return value; -#endif /* MDBX_HAVE_C11ATOMICS */ -} -#endif /* atomic_load32 */ -#endif /* !__cplusplus */ -/*----------------------------------------------------------------------------*/ -/* Basic constants and types */ +#pragma pack(push, 4) /* A stamp that identifies a file as an MDBX file. * There's nothing special about this value other than that it is easily @@ -2796,8 +2338,12 @@ MDBX_MAYBE_UNUSED static __always_inline uint32_t atomic_load32( /* FROZEN: The version number for a database's datafile format. */ #define MDBX_DATA_VERSION 3 -/* The version number for a database's lockfile format. */ -#define MDBX_LOCK_VERSION 5 + +#define MDBX_DATA_MAGIC \ + ((MDBX_MAGIC << 8) + MDBX_PNL_ASCENDING * 64 + MDBX_DATA_VERSION) +#define MDBX_DATA_MAGIC_LEGACY_COMPAT \ + ((MDBX_MAGIC << 8) + MDBX_PNL_ASCENDING * 64 + 2) +#define MDBX_DATA_MAGIC_LEGACY_DEVEL ((MDBX_MAGIC << 8) + 255) /* handle for the DB used to track free pages. */ #define FREE_DBI 0 @@ -2814,204 +2360,304 @@ MDBX_MAYBE_UNUSED static __always_inline uint32_t atomic_load32( * MDBX uses 32 bit for page numbers. This limits database * size up to 2^44 bytes, in case of 4K pages. 
*/ typedef uint32_t pgno_t; -typedef MDBX_atomic_uint32_t atomic_pgno_t; +typedef mdbx_atomic_uint32_t atomic_pgno_t; #define PRIaPGNO PRIu32 #define MAX_PAGENO UINT32_C(0x7FFFffff) #define MIN_PAGENO NUM_METAS -#define SAFE64_INVALID_THRESHOLD UINT64_C(0xffffFFFF00000000) +/* An invalid page number. + * Mainly used to denote an empty tree. */ +#define P_INVALID (~(pgno_t)0) /* A transaction ID. */ typedef uint64_t txnid_t; -typedef MDBX_atomic_uint64_t atomic_txnid_t; +typedef mdbx_atomic_uint64_t atomic_txnid_t; #define PRIaTXN PRIi64 #define MIN_TXNID UINT64_C(1) #define MAX_TXNID (SAFE64_INVALID_THRESHOLD - 1) #define INITIAL_TXNID (MIN_TXNID + NUM_METAS - 1) #define INVALID_TXNID UINT64_MAX -/* LY: for testing non-atomic 64-bit txnid on 32-bit arches. - * #define xMDBX_TXNID_STEP (UINT32_MAX / 3) */ -#ifndef xMDBX_TXNID_STEP -#if MDBX_64BIT_CAS -#define xMDBX_TXNID_STEP 1u -#else -#define xMDBX_TXNID_STEP 2u -#endif -#endif /* xMDBX_TXNID_STEP */ -/* Used for offsets within a single page. - * Since memory pages are typically 4 or 8KB in size, 12-13 bits, - * this is plenty. */ +/* Used for offsets within a single page. */ typedef uint16_t indx_t; -#define MEGABYTE ((size_t)1 << 20) - -/*----------------------------------------------------------------------------*/ -/* Core structures for database and shared memory (i.e. format definition) */ -#pragma pack(push, 4) - -/* Information about a single database in the environment. 
*/ -typedef struct MDBX_db { - uint16_t md_flags; /* see mdbx_dbi_open */ - uint16_t md_depth; /* depth of this tree */ - uint32_t md_xsize; /* key-size for MDBX_DUPFIXED (LEAF2 pages) */ - pgno_t md_root; /* the root page of this tree */ - pgno_t md_branch_pages; /* number of internal pages */ - pgno_t md_leaf_pages; /* number of leaf pages */ - pgno_t md_overflow_pages; /* number of overflow pages */ - uint64_t md_seq; /* table sequence counter */ - uint64_t md_entries; /* number of data items */ - uint64_t md_mod_txnid; /* txnid of last committed modification */ -} MDBX_db; +typedef struct tree { + uint16_t flags; /* see mdbx_dbi_open */ + uint16_t height; /* height of this tree */ + uint32_t dupfix_size; /* key-size for MDBX_DUPFIXED (DUPFIX pages) */ + pgno_t root; /* the root page of this tree */ + pgno_t branch_pages; /* number of internal pages */ + pgno_t leaf_pages; /* number of leaf pages */ + pgno_t large_pages; /* number of large pages */ + uint64_t sequence; /* table sequence counter */ + uint64_t items; /* number of data items */ + uint64_t mod_txnid; /* txnid of last committed modification */ +} tree_t; /* database size-related parameters */ -typedef struct MDBX_geo { +typedef struct geo { uint16_t grow_pv; /* datafile growth step as a 16-bit packed (exponential quantized) value */ uint16_t shrink_pv; /* datafile shrink threshold as a 16-bit packed (exponential quantized) value */ pgno_t lower; /* minimal size of datafile in pages */ pgno_t upper; /* maximal size of datafile in pages */ - pgno_t now; /* current size of datafile in pages */ - pgno_t next; /* first unused page in the datafile, + union { + pgno_t now; /* current size of datafile in pages */ + pgno_t end_pgno; + }; + union { + pgno_t first_unallocated; /* first unused page in the datafile, but actually the file may be shorter. 
*/ -} MDBX_geo; + pgno_t next_pgno; + }; +} geo_t; + +typedef union bin128 { + __anonymous_struct_extension__ struct { + uint64_t x, y; + }; + __anonymous_struct_extension__ struct { + uint32_t a, b, c, d; + }; +} bin128_t; /* Meta page content. * A meta page is the start point for accessing a database snapshot. - * Pages 0-1 are meta pages. Transaction N writes meta page (N % 2). */ -typedef struct MDBX_meta { + * Pages 0-2 are meta pages. */ +typedef struct meta { /* Stamp identifying this as an MDBX file. * It must be set to MDBX_MAGIC with MDBX_DATA_VERSION. */ - uint32_t mm_magic_and_version[2]; + uint32_t magic_and_version[2]; - /* txnid that committed this page, the first of a two-phase-update pair */ + /* txnid that committed this meta, the first of a two-phase-update pair */ union { - MDBX_atomic_uint32_t mm_txnid_a[2]; + mdbx_atomic_uint32_t txnid_a[2]; uint64_t unsafe_txnid; }; - uint16_t mm_extra_flags; /* extra DB flags, zero (nothing) for now */ - uint8_t mm_validator_id; /* ID of checksum and page validation method, - * zero (nothing) for now */ - uint8_t mm_extra_pagehdr; /* extra bytes in the page header, - * zero (nothing) for now */ + uint16_t reserve16; /* extra flags, zero (nothing) for now */ + uint8_t validator_id; /* ID of checksum and page validation method, + * zero (nothing) for now */ + int8_t extra_pagehdr; /* extra bytes in the page header, + * zero (nothing) for now */ - MDBX_geo mm_geo; /* database size-related parameters */ + geo_t geometry; /* database size-related parameters */ - MDBX_db mm_dbs[CORE_DBS]; /* first is free space, 2nd is main db */ - /* The size of pages used in this DB */ -#define mm_psize mm_dbs[FREE_DBI].md_xsize - MDBX_canary mm_canary; + union { + struct { + tree_t gc, main; + } trees; + __anonymous_struct_extension__ struct { + uint16_t gc_flags; + uint16_t gc_height; + uint32_t pagesize; + }; + }; + + MDBX_canary canary; -#define MDBX_DATASIGN_NONE 0u -#define MDBX_DATASIGN_WEAK 1u -#define 
SIGN_IS_STEADY(sign) ((sign) > MDBX_DATASIGN_WEAK) -#define META_IS_STEADY(meta) \ - SIGN_IS_STEADY(unaligned_peek_u64_volatile(4, (meta)->mm_sign)) +#define DATASIGN_NONE 0u +#define DATASIGN_WEAK 1u +#define SIGN_IS_STEADY(sign) ((sign) > DATASIGN_WEAK) union { - uint32_t mm_sign[2]; + uint32_t sign[2]; uint64_t unsafe_sign; }; - /* txnid that committed this page, the second of a two-phase-update pair */ - MDBX_atomic_uint32_t mm_txnid_b[2]; + /* txnid that committed this meta, the second of a two-phase-update pair */ + mdbx_atomic_uint32_t txnid_b[2]; /* Number of non-meta pages which were put in GC after COW. May be 0 in case * DB was previously handled by libmdbx without corresponding feature. - * This value in couple with mr_snapshot_pages_retired allows fast estimation - * of "how much reader is restraining GC recycling". */ - uint32_t mm_pages_retired[2]; + * This value in couple with reader.snapshot_pages_retired allows fast + * estimation of "how much reader is restraining GC recycling". */ + uint32_t pages_retired[2]; /* The analogue /proc/sys/kernel/random/boot_id or similar to determine * whether the system was rebooted after the last use of the database files. * If there was no reboot, but there is no need to rollback to the last * steady sync point. Zeros mean that no relevant information is available * from the system. */ - bin128_t mm_bootid; - -} MDBX_meta; + bin128_t bootid; +} meta_t; #pragma pack(1) -/* Common header for all page types. The page type depends on mp_flags. 
+typedef enum page_type { + P_BRANCH = 0x01u /* branch page */, + P_LEAF = 0x02u /* leaf page */, + P_LARGE = 0x04u /* large/overflow page */, + P_META = 0x08u /* meta page */, + P_LEGACY_DIRTY = 0x10u /* legacy P_DIRTY flag prior to v0.10 958fd5b9 */, + P_BAD = P_LEGACY_DIRTY /* explicit flag for invalid/bad page */, + P_DUPFIX = 0x20u /* for MDBX_DUPFIXED records */, + P_SUBP = 0x40u /* for MDBX_DUPSORT sub-pages */, + P_SPILLED = 0x2000u /* spilled in parent txn */, + P_LOOSE = 0x4000u /* page was dirtied then freed, can be reused */, + P_FROZEN = 0x8000u /* used for retire page with known status */, + P_ILL_BITS = (uint16_t)~(P_BRANCH | P_LEAF | P_DUPFIX | P_LARGE | P_SPILLED), + + page_broken = 0, + page_large = P_LARGE, + page_branch = P_BRANCH, + page_leaf = P_LEAF, + page_dupfix_leaf = P_DUPFIX, + page_sub_leaf = P_SUBP | P_LEAF, + page_sub_dupfix_leaf = P_SUBP | P_DUPFIX, + page_sub_broken = P_SUBP, +} page_type_t; + +/* Common header for all page types. The page type depends on flags. * - * P_BRANCH and P_LEAF pages have unsorted 'MDBX_node's at the end, with - * sorted mp_ptrs[] entries referring to them. Exception: P_LEAF2 pages - * omit mp_ptrs and pack sorted MDBX_DUPFIXED values after the page header. + * P_BRANCH and P_LEAF pages have unsorted 'node_t's at the end, with + * sorted entries[] entries referring to them. Exception: P_DUPFIX pages + * omit entries and pack sorted MDBX_DUPFIXED values after the page header. * - * P_OVERFLOW records occupy one or more contiguous pages where only the - * first has a page header. They hold the real data of F_BIGDATA nodes. + * P_LARGE records occupy one or more contiguous pages where only the + * first has a page header. They hold the real data of N_BIGDATA nodes. * * P_SUBP sub-pages are small leaf "pages" with duplicate data. - * A node with flag F_DUPDATA but not F_SUBDATA contains a sub-page. + * A node with flag N_DUPDATA but not N_SUBDATA contains a sub-page. 
* (Duplicate data can also go in sub-databases, which use normal pages.) * - * P_META pages contain MDBX_meta, the start point of an MDBX snapshot. + * P_META pages contain meta_t, the start point of an MDBX snapshot. * - * Each non-metapage up to MDBX_meta.mm_last_pg is reachable exactly once + * Each non-metapage up to meta_t.mm_last_pg is reachable exactly once * in the snapshot: Either used by a database or listed in a GC record. */ -typedef struct MDBX_page { -#define IS_FROZEN(txn, p) ((p)->mp_txnid < (txn)->mt_txnid) -#define IS_SPILLED(txn, p) ((p)->mp_txnid == (txn)->mt_txnid) -#define IS_SHADOWED(txn, p) ((p)->mp_txnid > (txn)->mt_txnid) -#define IS_VALID(txn, p) ((p)->mp_txnid <= (txn)->mt_front) -#define IS_MODIFIABLE(txn, p) ((p)->mp_txnid == (txn)->mt_front) - uint64_t mp_txnid; /* txnid which created page, maybe zero in legacy DB */ - uint16_t mp_leaf2_ksize; /* key size if this is a LEAF2 page */ -#define P_BRANCH 0x01u /* branch page */ -#define P_LEAF 0x02u /* leaf page */ -#define P_OVERFLOW 0x04u /* overflow page */ -#define P_META 0x08u /* meta page */ -#define P_LEGACY_DIRTY 0x10u /* legacy P_DIRTY flag prior to v0.10 958fd5b9 */ -#define P_BAD P_LEGACY_DIRTY /* explicit flag for invalid/bad page */ -#define P_LEAF2 0x20u /* for MDBX_DUPFIXED records */ -#define P_SUBP 0x40u /* for MDBX_DUPSORT sub-pages */ -#define P_SPILLED 0x2000u /* spilled in parent txn */ -#define P_LOOSE 0x4000u /* page was dirtied then freed, can be reused */ -#define P_FROZEN 0x8000u /* used for retire page with known status */ -#define P_ILL_BITS \ - ((uint16_t) ~(P_BRANCH | P_LEAF | P_LEAF2 | P_OVERFLOW | P_SPILLED)) - uint16_t mp_flags; +typedef struct page { + uint64_t txnid; /* txnid which created page, maybe zero in legacy DB */ + uint16_t dupfix_ksize; /* key size if this is a DUPFIX page */ + uint16_t flags; union { - uint32_t mp_pages; /* number of overflow pages */ + uint32_t pages; /* number of overflow pages */ __anonymous_struct_extension__ struct { - 
indx_t mp_lower; /* lower bound of free space */ - indx_t mp_upper; /* upper bound of free space */ + indx_t lower; /* lower bound of free space */ + indx_t upper; /* upper bound of free space */ }; }; - pgno_t mp_pgno; /* page number */ - -#if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L) || \ - (!defined(__cplusplus) && defined(_MSC_VER)) - indx_t mp_ptrs[] /* dynamic size */; -#endif /* C99 */ -} MDBX_page; + pgno_t pgno; /* page number */ -#define PAGETYPE_WHOLE(p) ((uint8_t)(p)->mp_flags) - -/* Drop legacy P_DIRTY flag for sub-pages for compatilibity, - * for assertions only. */ -#define PAGETYPE_COMPAT(p) \ - (unlikely(PAGETYPE_WHOLE(p) & P_SUBP) \ - ? PAGETYPE_WHOLE(p) & ~(P_SUBP | P_LEGACY_DIRTY) \ - : PAGETYPE_WHOLE(p)) +#if FLEXIBLE_ARRAY_MEMBERS + indx_t entries[] /* dynamic size */; +#endif /* FLEXIBLE_ARRAY_MEMBERS */ +} page_t; /* Size of the page header, excluding dynamic data at the end */ -#define PAGEHDRSZ offsetof(MDBX_page, mp_ptrs) - -/* Pointer displacement without casting to char* to avoid pointer-aliasing */ -#define ptr_disp(ptr, disp) ((void *)(((intptr_t)(ptr)) + ((intptr_t)(disp)))) +#define PAGEHDRSZ 20u -/* Pointer distance as signed number of bytes */ -#define ptr_dist(more, less) (((intptr_t)(more)) - ((intptr_t)(less))) +/* Header for a single key/data pair within a page. + * Used in pages of type P_BRANCH and P_LEAF without P_DUPFIX. + * We guarantee 2-byte alignment for 'node_t's. + * + * Leaf node flags describe node contents. N_BIGDATA says the node's + * data part is the page number of an overflow page with actual data. + * N_DUPDATA and N_SUBDATA can be combined giving duplicate data in + * a sub-page/sub-database, and named databases (just N_SUBDATA). 
*/ +typedef struct node { +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + union { + uint32_t dsize; + uint32_t child_pgno; + }; + uint8_t flags; /* see node_flags */ + uint8_t extra; + uint16_t ksize; /* key size */ +#else + uint16_t ksize; /* key size */ + uint8_t extra; + uint8_t flags; /* see node_flags */ + union { + uint32_t child_pgno; + uint32_t dsize; + }; +#endif /* __BYTE_ORDER__ */ + +#if FLEXIBLE_ARRAY_MEMBERS + uint8_t payload[] /* key and data are appended here */; +#endif /* FLEXIBLE_ARRAY_MEMBERS */ +} node_t; + +/* Size of the node header, excluding dynamic data at the end */ +#define NODESIZE 8u -#define mp_next(mp) \ - (*(MDBX_page **)ptr_disp((mp)->mp_ptrs, sizeof(void *) - sizeof(uint32_t))) +typedef enum node_flags { + N_BIGDATA = 0x01 /* data put on large page */, + N_SUBDATA = 0x02 /* data is a sub-database */, + N_DUPDATA = 0x04 /* data has duplicates */ +} node_flags_t; #pragma pack(pop) -typedef struct profgc_stat { +MDBX_MAYBE_UNUSED MDBX_NOTHROW_PURE_FUNCTION static inline uint8_t +page_type(const page_t *mp) { + return mp->flags; +} + +MDBX_MAYBE_UNUSED MDBX_NOTHROW_PURE_FUNCTION static inline uint8_t +page_type_compat(const page_t *mp) { + /* Drop legacy P_DIRTY flag for sub-pages for compatilibity, + * for assertions only. */ + return unlikely(mp->flags & P_SUBP) ? 
mp->flags & ~(P_SUBP | P_LEGACY_DIRTY) + : mp->flags; +} + +MDBX_MAYBE_UNUSED MDBX_NOTHROW_PURE_FUNCTION static inline bool +is_leaf(const page_t *mp) { + return (mp->flags & P_LEAF) != 0; +} + +MDBX_MAYBE_UNUSED MDBX_NOTHROW_PURE_FUNCTION static inline bool +is_dupfix_leaf(const page_t *mp) { + return (mp->flags & P_DUPFIX) != 0; +} + +MDBX_MAYBE_UNUSED MDBX_NOTHROW_PURE_FUNCTION static inline bool +is_branch(const page_t *mp) { + return (mp->flags & P_BRANCH) != 0; +} + +MDBX_MAYBE_UNUSED MDBX_NOTHROW_PURE_FUNCTION static inline bool +is_largepage(const page_t *mp) { + return (mp->flags & P_LARGE) != 0; +} + +MDBX_MAYBE_UNUSED MDBX_NOTHROW_PURE_FUNCTION static inline bool +is_subpage(const page_t *mp) { + return (mp->flags & P_SUBP) != 0; +} + + + +/* The version number for a database's lockfile format. */ +#define MDBX_LOCK_VERSION 5 + +#if MDBX_LOCKING == MDBX_LOCKING_WIN32FILES + +#define MDBX_LCK_SIGN UINT32_C(0xF10C) +typedef void osal_ipclock_t; +#elif MDBX_LOCKING == MDBX_LOCKING_SYSV + +#define MDBX_LCK_SIGN UINT32_C(0xF18D) +typedef mdbx_pid_t osal_ipclock_t; + +#elif MDBX_LOCKING == MDBX_LOCKING_POSIX2001 || \ + MDBX_LOCKING == MDBX_LOCKING_POSIX2008 + +#define MDBX_LCK_SIGN UINT32_C(0x8017) +typedef pthread_mutex_t osal_ipclock_t; + +#elif MDBX_LOCKING == MDBX_LOCKING_POSIX1988 + +#define MDBX_LCK_SIGN UINT32_C(0xFC29) +typedef sem_t osal_ipclock_t; + +#else +#error "FIXME" +#endif /* MDBX_LOCKING */ + +/* Статистика профилирования работы GC */ +typedef struct gc_prof_stat { /* Монотонное время по "настенным часам" * затраченное на чтение и поиск внутри GC */ uint64_t rtime_monotonic; @@ -3027,42 +2673,42 @@ typedef struct profgc_stat { uint32_t spe_counter; /* page faults (hard page faults) */ uint32_t majflt; -} profgc_stat_t; - -/* Statistics of page operations overall of all (running, completed and aborted) - * transactions */ -typedef struct pgop_stat { - MDBX_atomic_uint64_t newly; /* Quantity of a new pages added */ - MDBX_atomic_uint64_t cow; 
/* Quantity of pages copied for update */ - MDBX_atomic_uint64_t clone; /* Quantity of parent's dirty pages clones +} gc_prof_stat_t; + +/* Statistics of pages operations for all transactions, + * including incomplete and aborted. */ +typedef struct pgops { + mdbx_atomic_uint64_t newly; /* Quantity of a new pages added */ + mdbx_atomic_uint64_t cow; /* Quantity of pages copied for update */ + mdbx_atomic_uint64_t clone; /* Quantity of parent's dirty pages clones for nested transactions */ - MDBX_atomic_uint64_t split; /* Page splits */ - MDBX_atomic_uint64_t merge; /* Page merges */ - MDBX_atomic_uint64_t spill; /* Quantity of spilled dirty pages */ - MDBX_atomic_uint64_t unspill; /* Quantity of unspilled/reloaded pages */ - MDBX_atomic_uint64_t + mdbx_atomic_uint64_t split; /* Page splits */ + mdbx_atomic_uint64_t merge; /* Page merges */ + mdbx_atomic_uint64_t spill; /* Quantity of spilled dirty pages */ + mdbx_atomic_uint64_t unspill; /* Quantity of unspilled/reloaded pages */ + mdbx_atomic_uint64_t wops; /* Number of explicit write operations (not a pages) to a disk */ - MDBX_atomic_uint64_t + mdbx_atomic_uint64_t msync; /* Number of explicit msync/flush-to-disk operations */ - MDBX_atomic_uint64_t + mdbx_atomic_uint64_t fsync; /* Number of explicit fsync/flush-to-disk operations */ - MDBX_atomic_uint64_t prefault; /* Number of prefault write operations */ - MDBX_atomic_uint64_t mincore; /* Number of mincore() calls */ + mdbx_atomic_uint64_t prefault; /* Number of prefault write operations */ + mdbx_atomic_uint64_t mincore; /* Number of mincore() calls */ - MDBX_atomic_uint32_t + mdbx_atomic_uint32_t incoherence; /* number of https://libmdbx.dqdkfa.ru/dead-github/issues/269 caught */ - MDBX_atomic_uint32_t reserved; + mdbx_atomic_uint32_t reserved; /* Статистика для профилирования GC. 
- * Логически эти данные может быть стоит вынести в другую структуру, + * Логически эти данные, возможно, стоит вынести в другую структуру, * но разница будет сугубо косметическая. */ struct { /* Затраты на поддержку данных пользователя */ - profgc_stat_t work; + gc_prof_stat_t work; /* Затраты на поддержку и обновления самой GC */ - profgc_stat_t self; + gc_prof_stat_t self; /* Итераций обновления GC, * больше 1 если были повторы/перезапуски */ uint32_t wloops; @@ -3077,33 +2723,6 @@ typedef struct pgop_stat { } gc_prof; } pgop_stat_t; -#if MDBX_LOCKING == MDBX_LOCKING_WIN32FILES -#define MDBX_CLOCK_SIGN UINT32_C(0xF10C) -typedef void osal_ipclock_t; -#elif MDBX_LOCKING == MDBX_LOCKING_SYSV - -#define MDBX_CLOCK_SIGN UINT32_C(0xF18D) -typedef mdbx_pid_t osal_ipclock_t; -#ifndef EOWNERDEAD -#define EOWNERDEAD MDBX_RESULT_TRUE -#endif - -#elif MDBX_LOCKING == MDBX_LOCKING_POSIX2001 || \ - MDBX_LOCKING == MDBX_LOCKING_POSIX2008 -#define MDBX_CLOCK_SIGN UINT32_C(0x8017) -typedef pthread_mutex_t osal_ipclock_t; -#elif MDBX_LOCKING == MDBX_LOCKING_POSIX1988 -#define MDBX_CLOCK_SIGN UINT32_C(0xFC29) -typedef sem_t osal_ipclock_t; -#else -#error "FIXME" -#endif /* MDBX_LOCKING */ - -#if MDBX_LOCKING > MDBX_LOCKING_SYSV && !defined(__cplusplus) -MDBX_INTERNAL_FUNC int osal_ipclock_stubinit(osal_ipclock_t *ipc); -MDBX_INTERNAL_FUNC int osal_ipclock_destroy(osal_ipclock_t *ipc); -#endif /* MDBX_LOCKING */ - /* Reader Lock Table * * Readers don't acquire any locks for their data access. Instead, they @@ -3144,14 +2763,14 @@ MDBX_INTERNAL_FUNC int osal_ipclock_destroy(osal_ipclock_t *ipc); * many old transactions together. */ /* The actual reader record, with cacheline padding. */ -typedef struct MDBX_reader { - /* Current Transaction ID when this transaction began, or (txnid_t)-1. +typedef struct reader_slot { + /* Current Transaction ID when this transaction began, or INVALID_TXNID. * Multiple readers that start at the same time will probably have the * same ID here. 
Again, it's not important to exclude them from * anything; all we need to know is which version of the DB they * started from so we can avoid overwriting any data used in that * particular version. */ - MDBX_atomic_uint64_t /* txnid_t */ mr_txnid; + atomic_txnid_t txnid; /* The information we store in a single slot of the reader table. * In addition to a transaction ID, we also record the process and @@ -3163,35 +2782,37 @@ typedef struct MDBX_reader { * opening the lock file. */ /* The thread ID of the thread owning this txn. */ - MDBX_atomic_uint64_t mr_tid; + mdbx_atomic_uint64_t tid; /* The process ID of the process owning this reader txn. */ - MDBX_atomic_uint32_t mr_pid; + mdbx_atomic_uint32_t pid; /* The number of pages used in the reader's MVCC snapshot, - * i.e. the value of meta->mm_geo.next and txn->mt_next_pgno */ - atomic_pgno_t mr_snapshot_pages_used; + * i.e. the value of meta->geometry.first_unallocated and + * txn->geo.first_unallocated */ + atomic_pgno_t snapshot_pages_used; /* Number of retired pages at the time this reader starts transaction. So, - * at any time the difference mm_pages_retired - mr_snapshot_pages_retired - * will give the number of pages which this reader restraining from reuse. */ - MDBX_atomic_uint64_t mr_snapshot_pages_retired; -} MDBX_reader; + * at any time the difference meta.pages_retired - + * reader.snapshot_pages_retired will give the number of pages which this + * reader restraining from reuse. */ + mdbx_atomic_uint64_t snapshot_pages_retired; +} reader_slot_t; /* The header for the reader table (a memory-mapped lock file). */ -typedef struct MDBX_lockinfo { +typedef struct shared_lck { /* Stamp identifying this as an MDBX file. * It must be set to MDBX_MAGIC with with MDBX_LOCK_VERSION. */ - uint64_t mti_magic_and_version; + uint64_t magic_and_version; /* Format of this lock file. Must be set to MDBX_LOCK_FORMAT. */ - uint32_t mti_os_and_format; + uint32_t os_and_format; /* Flags which environment was opened. 
*/ - MDBX_atomic_uint32_t mti_envmode; + mdbx_atomic_uint32_t envmode; /* Threshold of un-synced-with-disk pages for auto-sync feature, * zero means no-threshold, i.e. auto-sync is disabled. */ - atomic_pgno_t mti_autosync_threshold; + atomic_pgno_t autosync_threshold; /* Low 32-bit of txnid with which meta-pages was synced, * i.e. for sync-polling in the MDBX_NOMETASYNC mode. */ @@ -3199,681 +2820,292 @@ typedef struct MDBX_lockinfo { #define MDBX_NOMETASYNC_LAZY_FD (MDBX_NOMETASYNC_LAZY_UNK + UINT32_MAX / 8) #define MDBX_NOMETASYNC_LAZY_WRITEMAP \ (MDBX_NOMETASYNC_LAZY_UNK - UINT32_MAX / 8) - MDBX_atomic_uint32_t mti_meta_sync_txnid; + mdbx_atomic_uint32_t meta_sync_txnid; /* Period for timed auto-sync feature, i.e. at the every steady checkpoint - * the mti_unsynced_timeout sets to the current_time + mti_autosync_period. + * the mti_unsynced_timeout sets to the current_time + autosync_period. * The time value is represented in a suitable system-dependent form, for * example clock_gettime(CLOCK_BOOTTIME) or clock_gettime(CLOCK_MONOTONIC). * Zero means timed auto-sync is disabled. */ - MDBX_atomic_uint64_t mti_autosync_period; + mdbx_atomic_uint64_t autosync_period; /* Marker to distinguish uniqueness of DB/CLK. */ - MDBX_atomic_uint64_t mti_bait_uniqueness; + mdbx_atomic_uint64_t bait_uniqueness; /* Paired counter of processes that have mlock()ed part of mmapped DB. - * The (mti_mlcnt[0] - mti_mlcnt[1]) > 0 means at least one process + * The (mlcnt[0] - mlcnt[1]) > 0 means at least one process * lock at least one page, so therefore madvise() could return EINVAL. */ - MDBX_atomic_uint32_t mti_mlcnt[2]; + mdbx_atomic_uint32_t mlcnt[2]; MDBX_ALIGNAS(MDBX_CACHELINE_SIZE) /* cacheline ----------------------------*/ /* Statistics of costly ops of all (running, completed and aborted) * transactions */ - pgop_stat_t mti_pgop_stat; + pgop_stat_t pgops; MDBX_ALIGNAS(MDBX_CACHELINE_SIZE) /* cacheline ----------------------------*/ - /* Write transaction lock. 
*/ #if MDBX_LOCKING > 0 - osal_ipclock_t mti_wlock; + /* Write transaction lock. */ + osal_ipclock_t wrt_lock; #endif /* MDBX_LOCKING > 0 */ - atomic_txnid_t mti_oldest_reader; + atomic_txnid_t cached_oldest; /* Timestamp of entering an out-of-sync state. Value is represented in a * suitable system-dependent form, for example clock_gettime(CLOCK_BOOTTIME) * or clock_gettime(CLOCK_MONOTONIC). */ - MDBX_atomic_uint64_t mti_eoos_timestamp; + mdbx_atomic_uint64_t eoos_timestamp; /* Number un-synced-with-disk pages for auto-sync feature. */ - MDBX_atomic_uint64_t mti_unsynced_pages; + mdbx_atomic_uint64_t unsynced_pages; /* Timestamp of the last readers check. */ - MDBX_atomic_uint64_t mti_reader_check_timestamp; + mdbx_atomic_uint64_t readers_check_timestamp; /* Number of page which was discarded last time by madvise(DONTNEED). */ - atomic_pgno_t mti_discarded_tail; + atomic_pgno_t discarded_tail; /* Shared anchor for tracking readahead edge and enabled/disabled status. */ - pgno_t mti_readahead_anchor; + pgno_t readahead_anchor; /* Shared cache for mincore() results */ struct { pgno_t begin[4]; uint64_t mask[4]; - } mti_mincore_cache; + } mincore_cache; MDBX_ALIGNAS(MDBX_CACHELINE_SIZE) /* cacheline ----------------------------*/ - /* Readeaders registration lock. */ #if MDBX_LOCKING > 0 - osal_ipclock_t mti_rlock; + /* Readeaders table lock. */ + osal_ipclock_t rdt_lock; #endif /* MDBX_LOCKING > 0 */ /* The number of slots that have been used in the reader table. * This always records the maximum count, it is not decremented * when readers release their slots. 
*/ - MDBX_atomic_uint32_t mti_numreaders; - MDBX_atomic_uint32_t mti_readers_refresh_flag; + mdbx_atomic_uint32_t rdt_length; + mdbx_atomic_uint32_t rdt_refresh_flag; -#if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L) || \ - (!defined(__cplusplus) && defined(_MSC_VER)) +#if FLEXIBLE_ARRAY_MEMBERS MDBX_ALIGNAS(MDBX_CACHELINE_SIZE) /* cacheline ----------------------------*/ - MDBX_reader mti_readers[] /* dynamic size */; -#endif /* C99 */ -} MDBX_lockinfo; + reader_slot_t rdt[] /* dynamic size */; /* Lockfile format signature: version, features and field layout */ #define MDBX_LOCK_FORMAT \ - (MDBX_CLOCK_SIGN * 27733 + (unsigned)sizeof(MDBX_reader) * 13 + \ - (unsigned)offsetof(MDBX_reader, mr_snapshot_pages_used) * 251 + \ - (unsigned)offsetof(MDBX_lockinfo, mti_oldest_reader) * 83 + \ - (unsigned)offsetof(MDBX_lockinfo, mti_numreaders) * 37 + \ - (unsigned)offsetof(MDBX_lockinfo, mti_readers) * 29) - -#define MDBX_DATA_MAGIC \ - ((MDBX_MAGIC << 8) + MDBX_PNL_ASCENDING * 64 + MDBX_DATA_VERSION) - -#define MDBX_DATA_MAGIC_LEGACY_COMPAT \ - ((MDBX_MAGIC << 8) + MDBX_PNL_ASCENDING * 64 + 2) - -#define MDBX_DATA_MAGIC_LEGACY_DEVEL ((MDBX_MAGIC << 8) + 255) + (MDBX_LCK_SIGN * 27733 + (unsigned)sizeof(reader_slot_t) * 13 + \ + (unsigned)offsetof(reader_slot_t, snapshot_pages_used) * 251 + \ + (unsigned)offsetof(lck_t, cached_oldest) * 83 + \ + (unsigned)offsetof(lck_t, rdt_length) * 37 + \ + (unsigned)offsetof(lck_t, rdt) * 29) +#endif /* FLEXIBLE_ARRAY_MEMBERS */ +} lck_t; #define MDBX_LOCK_MAGIC ((MDBX_MAGIC << 8) + MDBX_LOCK_VERSION) -/* The maximum size of a database page. - * - * It is 64K, but value-PAGEHDRSZ must fit in MDBX_page.mp_upper. - * - * MDBX will use database pages < OS pages if needed. - * That causes more I/O in write transactions: The OS must - * know (read) the whole page before writing a partial page. - * - * Note that we don't currently support Huge pages. 
On Linux, - * regular data files cannot use Huge pages, and in general - * Huge pages aren't actually pageable. We rely on the OS - * demand-pager to read our data and page it out when memory - * pressure from other processes is high. So until OSs have - * actual paging support for Huge pages, they're not viable. */ -#define MAX_PAGESIZE MDBX_MAX_PAGESIZE -#define MIN_PAGESIZE MDBX_MIN_PAGESIZE - -#define MIN_MAPSIZE (MIN_PAGESIZE * MIN_PAGENO) +#define MDBX_READERS_LIMIT 32767 + +#define MIN_MAPSIZE (MDBX_MIN_PAGESIZE * MIN_PAGENO) #if defined(_WIN32) || defined(_WIN64) #define MAX_MAPSIZE32 UINT32_C(0x38000000) #else #define MAX_MAPSIZE32 UINT32_C(0x7f000000) #endif -#define MAX_MAPSIZE64 ((MAX_PAGENO + 1) * (uint64_t)MAX_PAGESIZE) +#define MAX_MAPSIZE64 ((MAX_PAGENO + 1) * (uint64_t)MDBX_MAX_PAGESIZE) #if MDBX_WORDBITS >= 64 #define MAX_MAPSIZE MAX_MAPSIZE64 -#define MDBX_PGL_LIMIT ((size_t)MAX_PAGENO) +#define PAGELIST_LIMIT ((size_t)MAX_PAGENO) #else #define MAX_MAPSIZE MAX_MAPSIZE32 -#define MDBX_PGL_LIMIT (MAX_MAPSIZE32 / MIN_PAGESIZE) +#define PAGELIST_LIMIT (MAX_MAPSIZE32 / MDBX_MIN_PAGESIZE) #endif /* MDBX_WORDBITS */ -#define MDBX_READERS_LIMIT 32767 -#define MDBX_RADIXSORT_THRESHOLD 142 #define MDBX_GOLD_RATIO_DBL 1.6180339887498948482 +#define MEGABYTE ((size_t)1 << 20) /*----------------------------------------------------------------------------*/ -/* An PNL is an Page Number List, a sorted array of IDs. - * The first element of the array is a counter for how many actual page-numbers - * are in the list. By default PNLs are sorted in descending order, this allow - * cut off a page with lowest pgno (at the tail) just truncating the list. The - * sort order of PNLs is controlled by the MDBX_PNL_ASCENDING build option. 
*/ -typedef pgno_t *MDBX_PNL; +union logger_union { + void *ptr; + MDBX_debug_func *fmt; + MDBX_debug_func_nofmt *nofmt; +}; -#if MDBX_PNL_ASCENDING -#define MDBX_PNL_ORDERED(first, last) ((first) < (last)) -#define MDBX_PNL_DISORDERED(first, last) ((first) >= (last)) -#else -#define MDBX_PNL_ORDERED(first, last) ((first) > (last)) -#define MDBX_PNL_DISORDERED(first, last) ((first) <= (last)) -#endif +struct libmdbx_globals { + bin128_t bootid; + unsigned sys_pagesize, sys_allocation_granularity; + uint8_t sys_pagesize_ln2; + uint8_t runtime_flags; + uint8_t loglevel; +#if defined(_WIN32) || defined(_WIN64) + bool running_under_Wine; +#elif defined(__linux__) || defined(__gnu_linux__) + bool running_on_WSL1 /* Windows Subsystem 1 for Linux */; + uint32_t linux_kernel_version; +#endif /* Linux */ + union logger_union logger; + osal_fastmutex_t debug_lock; + size_t logger_buffer_size; + char *logger_buffer; +}; -/* List of txnid, only for MDBX_txn.tw.lifo_reclaimed */ -typedef txnid_t *MDBX_TXL; +#ifdef __cplusplus +extern "C" { +#endif /* __cplusplus */ -/* An Dirty-Page list item is an pgno/pointer pair. */ -typedef struct MDBX_dp { - MDBX_page *ptr; - pgno_t pgno, npages; -} MDBX_dp; +extern struct libmdbx_globals globals; +#if defined(_WIN32) || defined(_WIN64) +extern struct libmdbx_imports imports; +#endif /* Windows */ -/* An DPL (dirty-page list) is a sorted array of MDBX_DPs. */ -typedef struct MDBX_dpl { - size_t sorted; - size_t length; - size_t pages_including_loose; /* number of pages, but not an entries. 
*/ - size_t detent; /* allocated size excluding the MDBX_DPL_RESERVE_GAP */ -#if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L) || \ - (!defined(__cplusplus) && defined(_MSC_VER)) - MDBX_dp items[] /* dynamic size with holes at zero and after the last */; -#endif -} MDBX_dpl; -/* PNL sizes */ -#define MDBX_PNL_GRANULATE_LOG2 10 -#define MDBX_PNL_GRANULATE (1 << MDBX_PNL_GRANULATE_LOG2) -#define MDBX_PNL_INITIAL \ - (MDBX_PNL_GRANULATE - 2 - MDBX_ASSUME_MALLOC_OVERHEAD / sizeof(pgno_t)) -#define MDBX_TXL_GRANULATE 32 -#define MDBX_TXL_INITIAL \ - (MDBX_TXL_GRANULATE - 2 - MDBX_ASSUME_MALLOC_OVERHEAD / sizeof(txnid_t)) -#define MDBX_TXL_MAX \ - ((1u << 26) - 2 - MDBX_ASSUME_MALLOC_OVERHEAD / sizeof(txnid_t)) -#define MDBX_PNL_ALLOCLEN(pl) ((pl)[-1]) -#define MDBX_PNL_GETSIZE(pl) ((size_t)((pl)[0])) -#define MDBX_PNL_SETSIZE(pl, size) \ +#ifndef __Wpedantic_format_voidptr +MDBX_MAYBE_UNUSED static inline const void * +__Wpedantic_format_voidptr(const void *ptr) { + return ptr; +} +#define __Wpedantic_format_voidptr(ARG) __Wpedantic_format_voidptr(ARG) +#endif /* __Wpedantic_format_voidptr */ + +MDBX_INTERNAL void MDBX_PRINTF_ARGS(4, 5) + debug_log(int level, const char *function, int line, const char *fmt, ...) + MDBX_PRINTF_ARGS(4, 5); +MDBX_INTERNAL void debug_log_va(int level, const char *function, int line, + const char *fmt, va_list args); + +#if MDBX_DEBUG +#define LOG_ENABLED(LVL) unlikely(LVL <= globals.loglevel) +#define AUDIT_ENABLED() \ + unlikely((globals.runtime_flags & (unsigned)MDBX_DBG_AUDIT)) +#else /* MDBX_DEBUG */ +#define LOG_ENABLED(LVL) (LVL < MDBX_LOG_VERBOSE && LVL <= globals.loglevel) +#define AUDIT_ENABLED() (0) +#endif /* LOG_ENABLED() & AUDIT_ENABLED() */ + +#if MDBX_FORCE_ASSERTIONS +#define ASSERT_ENABLED() (1) +#elif MDBX_DEBUG +#define ASSERT_ENABLED() \ + likely((globals.runtime_flags & (unsigned)MDBX_DBG_ASSERT)) +#else +#define ASSERT_ENABLED() (0) +#endif /* ASSERT_ENABLED() */ + +#define DEBUG_EXTRA(fmt, ...) 
\ do { \ - const size_t __size = size; \ - assert(__size < INT_MAX); \ - (pl)[0] = (pgno_t)__size; \ + if (LOG_ENABLED(MDBX_LOG_EXTRA)) \ + debug_log(MDBX_LOG_EXTRA, __func__, __LINE__, fmt, __VA_ARGS__); \ } while (0) -#define MDBX_PNL_FIRST(pl) ((pl)[1]) -#define MDBX_PNL_LAST(pl) ((pl)[MDBX_PNL_GETSIZE(pl)]) -#define MDBX_PNL_BEGIN(pl) (&(pl)[1]) -#define MDBX_PNL_END(pl) (&(pl)[MDBX_PNL_GETSIZE(pl) + 1]) -#if MDBX_PNL_ASCENDING -#define MDBX_PNL_EDGE(pl) ((pl) + 1) -#define MDBX_PNL_LEAST(pl) MDBX_PNL_FIRST(pl) -#define MDBX_PNL_MOST(pl) MDBX_PNL_LAST(pl) -#else -#define MDBX_PNL_EDGE(pl) ((pl) + MDBX_PNL_GETSIZE(pl)) -#define MDBX_PNL_LEAST(pl) MDBX_PNL_LAST(pl) -#define MDBX_PNL_MOST(pl) MDBX_PNL_FIRST(pl) -#endif +#define DEBUG_EXTRA_PRINT(fmt, ...) \ + do { \ + if (LOG_ENABLED(MDBX_LOG_EXTRA)) \ + debug_log(MDBX_LOG_EXTRA, nullptr, 0, fmt, __VA_ARGS__); \ + } while (0) -#define MDBX_PNL_SIZEOF(pl) ((MDBX_PNL_GETSIZE(pl) + 1) * sizeof(pgno_t)) -#define MDBX_PNL_IS_EMPTY(pl) (MDBX_PNL_GETSIZE(pl) == 0) +#define TRACE(fmt, ...) \ + do { \ + if (LOG_ENABLED(MDBX_LOG_TRACE)) \ + debug_log(MDBX_LOG_TRACE, __func__, __LINE__, fmt "\n", __VA_ARGS__); \ + } while (0) -/*----------------------------------------------------------------------------*/ -/* Internal structures */ +#define DEBUG(fmt, ...) \ + do { \ + if (LOG_ENABLED(MDBX_LOG_DEBUG)) \ + debug_log(MDBX_LOG_DEBUG, __func__, __LINE__, fmt "\n", __VA_ARGS__); \ + } while (0) -/* Auxiliary DB info. - * The information here is mostly static/read-only. There is - * only a single copy of this record in the environment. 
*/ -typedef struct MDBX_dbx { - MDBX_val md_name; /* name of the database */ - MDBX_cmp_func *md_cmp; /* function for comparing keys */ - MDBX_cmp_func *md_dcmp; /* function for comparing data items */ - size_t md_klen_min, md_klen_max; /* min/max key length for the database */ - size_t md_vlen_min, - md_vlen_max; /* min/max value/data length for the database */ -} MDBX_dbx; +#define VERBOSE(fmt, ...) \ + do { \ + if (LOG_ENABLED(MDBX_LOG_VERBOSE)) \ + debug_log(MDBX_LOG_VERBOSE, __func__, __LINE__, fmt "\n", __VA_ARGS__); \ + } while (0) -typedef struct troika { - uint8_t fsm, recent, prefer_steady, tail_and_flags; -#if MDBX_WORDBITS > 32 /* Workaround for false-positives from Valgrind */ - uint32_t unused_pad; -#endif -#define TROIKA_HAVE_STEADY(troika) ((troika)->fsm & 7u) -#define TROIKA_STRICT_VALID(troika) ((troika)->tail_and_flags & 64u) -#define TROIKA_VALID(troika) ((troika)->tail_and_flags & 128u) -#define TROIKA_TAIL(troika) ((troika)->tail_and_flags & 3u) - txnid_t txnid[NUM_METAS]; -} meta_troika_t; +#define NOTICE(fmt, ...) \ + do { \ + if (LOG_ENABLED(MDBX_LOG_NOTICE)) \ + debug_log(MDBX_LOG_NOTICE, __func__, __LINE__, fmt "\n", __VA_ARGS__); \ + } while (0) -/* A database transaction. - * Every operation requires a transaction handle. 
*/ -struct MDBX_txn { -#define MDBX_MT_SIGNATURE UINT32_C(0x93D53A31) - uint32_t mt_signature; - - /* Transaction Flags */ - /* mdbx_txn_begin() flags */ -#define MDBX_TXN_RO_BEGIN_FLAGS (MDBX_TXN_RDONLY | MDBX_TXN_RDONLY_PREPARE) -#define MDBX_TXN_RW_BEGIN_FLAGS \ - (MDBX_TXN_NOMETASYNC | MDBX_TXN_NOSYNC | MDBX_TXN_TRY) - /* Additional flag for sync_locked() */ -#define MDBX_SHRINK_ALLOWED UINT32_C(0x40000000) - -#define MDBX_TXN_DRAINED_GC 0x20 /* GC was depleted up to oldest reader */ - -#define TXN_FLAGS \ - (MDBX_TXN_FINISHED | MDBX_TXN_ERROR | MDBX_TXN_DIRTY | MDBX_TXN_SPILLS | \ - MDBX_TXN_HAS_CHILD | MDBX_TXN_INVALID | MDBX_TXN_DRAINED_GC) - -#if (TXN_FLAGS & (MDBX_TXN_RW_BEGIN_FLAGS | MDBX_TXN_RO_BEGIN_FLAGS)) || \ - ((MDBX_TXN_RW_BEGIN_FLAGS | MDBX_TXN_RO_BEGIN_FLAGS | TXN_FLAGS) & \ - MDBX_SHRINK_ALLOWED) -#error "Oops, some txn flags overlapped or wrong" -#endif - uint32_t mt_flags; - unsigned mt_numdbs; - size_t mt_owner; /* thread ID that owns this transaction */ - - MDBX_txn *mt_parent; /* parent of a nested txn */ - /* Nested txn under this txn, set together with flag MDBX_TXN_HAS_CHILD */ - MDBX_txn *mt_child; - MDBX_geo mt_geo; - /* next unallocated page */ -#define mt_next_pgno mt_geo.next - /* corresponding to the current size of datafile */ -#define mt_end_pgno mt_geo.now +#define WARNING(fmt, ...) \ + do { \ + if (LOG_ENABLED(MDBX_LOG_WARN)) \ + debug_log(MDBX_LOG_WARN, __func__, __LINE__, fmt "\n", __VA_ARGS__); \ + } while (0) - /* The ID of this transaction. IDs are integers incrementing from - * INITIAL_TXNID. Only committed write transactions increment the ID. If a - * transaction aborts, the ID may be re-used by the next writer. */ - txnid_t mt_txnid; - txnid_t mt_front; +#undef ERROR /* wingdi.h \ + Yeah, morons from M$ put such definition to the public header. */ - MDBX_env *mt_env; /* the DB environment */ - /* Array of MDBX_db records for each known DB */ - MDBX_db *mt_dbs; +#define ERROR(fmt, ...) 
\ + do { \ + if (LOG_ENABLED(MDBX_LOG_ERROR)) \ + debug_log(MDBX_LOG_ERROR, __func__, __LINE__, fmt "\n", __VA_ARGS__); \ + } while (0) -#if MDBX_ENABLE_DBI_SPARSE - unsigned *__restrict mt_dbi_sparse; -#endif /* MDBX_ENABLE_DBI_SPARSE */ +#define FATAL(fmt, ...) \ + debug_log(MDBX_LOG_FATAL, __func__, __LINE__, fmt "\n", __VA_ARGS__); - /* Non-shared DBI state flags inside transaction */ -#define DBI_DIRTY 0x01 /* DB was written in this txn */ -#define DBI_STALE 0x02 /* Named-DB record is older than txnID */ -#define DBI_FRESH 0x04 /* Named-DB handle opened in this txn */ -#define DBI_CREAT 0x08 /* Named-DB handle created in this txn */ -#define DBI_VALID 0x10 /* Handle is valid, see also DB_VALID */ -#define DBI_OLDEN 0x40 /* Handle was closed/reopened outside txn */ -#define DBI_LINDO 0x80 /* Lazy initialization done for DBI-slot */ - /* Array of non-shared txn's flags of DBI */ - uint8_t *__restrict mt_dbi_state; +#if MDBX_DEBUG +#define ASSERT_FAIL(env, msg, func, line) mdbx_assert_fail(env, msg, func, line) +#else /* MDBX_DEBUG */ +MDBX_NORETURN __cold void assert_fail(const char *msg, const char *func, + unsigned line); +#define ASSERT_FAIL(env, msg, func, line) \ + do { \ + (void)(env); \ + assert_fail(msg, func, line); \ + } while (0) +#endif /* MDBX_DEBUG */ - /* Array of sequence numbers for each DB handle. */ - uint32_t *__restrict mt_dbi_seqs; - MDBX_cursor **mt_cursors; +#define ENSURE_MSG(env, expr, msg) \ + do { \ + if (unlikely(!(expr))) \ + ASSERT_FAIL(env, msg, __func__, __LINE__); \ + } while (0) - MDBX_canary mt_canary; - void *mt_userctx; /* User-settable context */ +#define ENSURE(env, expr) ENSURE_MSG(env, expr, #expr) - union { - struct { - /* For read txns: This thread/txn's reader table slot, or NULL. 
*/ - MDBX_reader *reader; - } to; - struct { - meta_troika_t troika; - /* In write txns, array of cursors for each DB */ - MDBX_PNL __restrict relist; /* Reclaimed GC pages */ - txnid_t last_reclaimed; /* ID of last used record */ -#if MDBX_ENABLE_REFUND - pgno_t loose_refund_wl /* FIXME: describe */; -#endif /* MDBX_ENABLE_REFUND */ - /* a sequence to spilling dirty page with LRU policy */ - unsigned dirtylru; - /* dirtylist room: Dirty array size - dirty pages visible to this txn. - * Includes ancestor txns' dirty pages not hidden by other txns' - * dirty/spilled pages. Thus commit(nested txn) has room to merge - * dirtylist into mt_parent after freeing hidden mt_parent pages. */ - size_t dirtyroom; - /* For write txns: Modified pages. Sorted when not MDBX_WRITEMAP. */ - MDBX_dpl *__restrict dirtylist; - /* The list of reclaimed txns from GC */ - MDBX_TXL __restrict lifo_reclaimed; - /* The list of pages that became unused during this transaction. */ - MDBX_PNL __restrict retired_pages; - /* The list of loose pages that became unused and may be reused - * in this transaction, linked through `mp_next`. */ - MDBX_page *__restrict loose_pages; - /* Number of loose pages (tw.loose_pages) */ - size_t loose_count; - union { - struct { - size_t least_removed; - /* The sorted list of dirty pages we temporarily wrote to disk - * because the dirty list was full. page numbers in here are - * shifted left by 1, deleted slots have the LSB set. */ - MDBX_PNL __restrict list; - } spilled; - size_t writemap_dirty_npages; - size_t writemap_spilled_npages; - }; - uint64_t gc_time_acc; - } tw; - }; -}; - -#if MDBX_WORDBITS >= 64 -#define CURSOR_STACK 32 -#else -#define CURSOR_STACK 24 -#endif - -struct MDBX_xcursor; - -/* Cursors are used for all DB operations. - * A cursor holds a path of (page pointer, key index) from the DB - * root to a position in the DB, plus other state. MDBX_DUPSORT - * cursors include an xcursor to the current data item. 
Write txns - * track their cursors and keep them up to date when data moves. - * Exception: An xcursor's pointer to a P_SUBP page can be stale. - * (A node with F_DUPDATA but no F_SUBDATA contains a subpage). */ -struct MDBX_cursor { -#define MDBX_MC_LIVE UINT32_C(0xFE05D5B1) -#define MDBX_MC_READY4CLOSE UINT32_C(0x2817A047) -#define MDBX_MC_WAIT4EOT UINT32_C(0x90E297A7) - uint32_t mc_signature; - /* The database handle this cursor operates on */ - MDBX_dbi mc_dbi; - /* Next cursor on this DB in this txn */ - MDBX_cursor *mc_next; - /* Backup of the original cursor if this cursor is a shadow */ - MDBX_cursor *mc_backup; - /* Context used for databases with MDBX_DUPSORT, otherwise NULL */ - struct MDBX_xcursor *mc_xcursor; - /* The transaction that owns this cursor */ - MDBX_txn *mc_txn; - /* The database record for this cursor */ - MDBX_db *mc_db; - /* The database auxiliary record for this cursor */ - MDBX_dbx *mc_dbx; - /* The mt_dbi_state[] for this DBI */ - uint8_t *__restrict mc_dbi_state; - uint8_t mc_snum; /* number of pushed pages */ - uint8_t mc_top; /* index of top page, normally mc_snum-1 */ - - /* Cursor state flags. */ -#define C_INITIALIZED 0x01 /* cursor has been initialized and is valid */ -#define C_EOF 0x02 /* No more data */ -#define C_SUB 0x04 /* Cursor is a sub-cursor */ -#define C_DEL 0x08 /* last op was a cursor_del */ -#define C_UNTRACK 0x10 /* Un-track cursor when closing */ -#define C_GCU \ - 0x20 /* Происходит подготовка к обновлению GC, поэтому \ - * можно брать страницы из GC даже для FREE_DBI */ - uint8_t mc_flags; - - /* Cursor checking flags. 
*/ -#define CC_BRANCH 0x01 /* same as P_BRANCH for CHECK_LEAF_TYPE() */ -#define CC_LEAF 0x02 /* same as P_LEAF for CHECK_LEAF_TYPE() */ -#define CC_OVERFLOW 0x04 /* same as P_OVERFLOW for CHECK_LEAF_TYPE() */ -#define CC_UPDATING 0x08 /* update/rebalance pending */ -#define CC_SKIPORD 0x10 /* don't check keys ordering */ -#define CC_LEAF2 0x20 /* same as P_LEAF2 for CHECK_LEAF_TYPE() */ -#define CC_RETIRING 0x40 /* refs to child pages may be invalid */ -#define CC_PAGECHECK 0x80 /* perform page checking, see MDBX_VALIDATION */ - uint8_t mc_checking; - - MDBX_page *mc_pg[CURSOR_STACK]; /* stack of pushed pages */ - indx_t mc_ki[CURSOR_STACK]; /* stack of page indices */ -}; - -#define CHECK_LEAF_TYPE(mc, mp) \ - (((PAGETYPE_WHOLE(mp) ^ (mc)->mc_checking) & \ - (CC_BRANCH | CC_LEAF | CC_OVERFLOW | CC_LEAF2)) == 0) - -/* Context for sorted-dup records. - * We could have gone to a fully recursive design, with arbitrarily - * deep nesting of sub-databases. But for now we only handle these - * levels - main DB, optional sub-DB, sorted-duplicate DB. */ -typedef struct MDBX_xcursor { - /* A sub-cursor for traversing the Dup DB */ - MDBX_cursor mx_cursor; - /* The database record for this Dup DB */ - MDBX_db mx_db; - /* The auxiliary DB record for this Dup DB */ - MDBX_dbx mx_dbx; -} MDBX_xcursor; - -typedef struct MDBX_cursor_couple { - MDBX_cursor outer; - void *mc_userctx; /* User-settable context */ - MDBX_xcursor inner; -} MDBX_cursor_couple; - -struct mdbx_defer_free_item { - struct mdbx_defer_free_item *next; - uint64_t timestamp; -}; - -/* The database environment. */ -struct MDBX_env { - /* ----------------------------------------------------- mostly static part */ -#define MDBX_ME_SIGNATURE UINT32_C(0x9A899641) - MDBX_atomic_uint32_t me_signature; - /* Failed to update the meta page. Probably an I/O error. */ -#define MDBX_FATAL_ERROR UINT32_C(0x80000000) - /* Some fields are initialized. 
*/ -#define MDBX_ENV_ACTIVE UINT32_C(0x20000000) - /* me_txkey is set */ -#define MDBX_ENV_TXKEY UINT32_C(0x10000000) - /* Legacy MDBX_MAPASYNC (prior v0.9) */ -#define MDBX_DEPRECATED_MAPASYNC UINT32_C(0x100000) - /* Legacy MDBX_COALESCE (prior v0.12) */ -#define MDBX_DEPRECATED_COALESCE UINT32_C(0x2000000) -#define ENV_INTERNAL_FLAGS (MDBX_FATAL_ERROR | MDBX_ENV_ACTIVE | MDBX_ENV_TXKEY) - uint32_t me_flags; - unsigned me_psize; /* DB page size, initialized from me_os_psize */ - osal_mmap_t me_dxb_mmap; /* The main data file */ -#define me_map me_dxb_mmap.base -#define me_lazy_fd me_dxb_mmap.fd - mdbx_filehandle_t me_dsync_fd, me_fd4meta; -#if defined(_WIN32) || defined(_WIN64) -#define me_overlapped_fd me_ioring.overlapped_fd - HANDLE me_data_lock_event; -#endif /* Windows */ - osal_mmap_t me_lck_mmap; /* The lock file */ -#define me_lfd me_lck_mmap.fd - struct MDBX_lockinfo *me_lck; - - uint16_t me_leaf_nodemax; /* max size of a leaf-node */ - uint16_t me_branch_nodemax; /* max size of a branch-node */ - uint16_t me_subpage_limit; - uint16_t me_subpage_room_threshold; - uint16_t me_subpage_reserve_prereq; - uint16_t me_subpage_reserve_limit; - atomic_pgno_t me_mlocked_pgno; - uint8_t me_psize2log; /* log2 of DB page size */ - int8_t me_stuck_meta; /* recovery-only: target meta page or less that zero */ - uint16_t me_merge_threshold, - me_merge_threshold_gc; /* pages emptier than this are candidates for - merging */ - unsigned me_os_psize; /* OS page size, from osal_syspagesize() */ - unsigned me_maxreaders; /* size of the reader table */ - MDBX_dbi me_maxdbs; /* size of the DB table */ - uint32_t me_pid; /* process ID of this env */ - osal_thread_key_t me_txkey; /* thread-key for readers */ - struct { /* path to the DB files */ - pathchar_t *lck, *dxb, *specified; - void *buffer; - } me_pathname; - void *me_pbuf; /* scratch area for DUPSORT put() */ - MDBX_txn *me_txn0; /* preallocated write transaction */ - MDBX_dbx *me_dbxs; /* array of static DB info */ - 
uint16_t *__restrict me_db_flags; /* array of flags from MDBX_db.md_flags */ - MDBX_atomic_uint32_t *me_dbi_seqs; /* array of dbi sequence numbers */ - unsigned - me_maxgc_ov1page; /* Number of pgno_t fit in a single overflow page */ - unsigned me_maxgc_per_branch; - uint32_t me_live_reader; /* have liveness lock in reader table */ - void *me_userctx; /* User-settable context */ - MDBX_hsr_func *me_hsr_callback; /* Callback for kicking laggard readers */ - size_t me_madv_threshold; - - struct { - unsigned dp_reserve_limit; - unsigned rp_augment_limit; - unsigned dp_limit; - unsigned dp_initial; - uint64_t gc_time_limit; - uint8_t dp_loose_limit; - uint8_t spill_max_denominator; - uint8_t spill_min_denominator; - uint8_t spill_parent4child_denominator; - unsigned merge_threshold_16dot16_percent; -#if !(defined(_WIN32) || defined(_WIN64)) - unsigned writethrough_threshold; -#endif /* Windows */ - bool prefault_write; - bool prefer_waf_insteadof_balance; /* Strive to minimize WAF instead of - balancing pages fullment */ - union { - unsigned all; - /* tracks options with non-auto values but tuned by user */ - struct { - unsigned dp_limit : 1; - unsigned rp_augment_limit : 1; - unsigned prefault_write : 1; - } non_auto; - } flags; - } me_options; - - /* struct me_dbgeo used for accepting db-geo params from user for the new - * database creation, i.e. when mdbx_env_set_geometry() was called before - * mdbx_env_open(). 
*/ - struct { - size_t lower; /* minimal size of datafile */ - size_t upper; /* maximal size of datafile */ - size_t now; /* current size of datafile */ - size_t grow; /* step to grow datafile */ - size_t shrink; /* threshold to shrink datafile */ - } me_dbgeo; - -#if MDBX_LOCKING == MDBX_LOCKING_SYSV - union { - key_t key; - int semid; - } me_sysv_ipc; -#endif /* MDBX_LOCKING == MDBX_LOCKING_SYSV */ - bool me_incore; - bool me_prefault_write; - -#if MDBX_ENABLE_DBI_LOCKFREE - struct mdbx_defer_free_item *me_defer_free; -#endif /* MDBX_ENABLE_DBI_LOCKFREE */ - - /* --------------------------------------------------- mostly volatile part */ - - MDBX_txn *me_txn; /* current write transaction */ - osal_fastmutex_t me_dbi_lock; - unsigned me_numdbs; /* number of DBs opened */ +/* assert(3) variant in environment context */ +#define eASSERT(env, expr) \ + do { \ + if (ASSERT_ENABLED()) \ + ENSURE(env, expr); \ + } while (0) - unsigned me_dp_reserve_len; - MDBX_page *__restrict me_dp_reserve; /* list of malloc'ed blocks for re-use */ +/* assert(3) variant in cursor context */ +#define cASSERT(mc, expr) eASSERT((mc)->txn->env, expr) - /* PNL of pages that became unused in a write txn */ - MDBX_PNL __restrict me_retired_pages; - osal_ioring_t me_ioring; +/* assert(3) variant in transaction context */ +#define tASSERT(txn, expr) eASSERT((txn)->env, expr) -#if defined(_WIN32) || defined(_WIN64) - osal_srwlock_t me_remap_guard; - /* Workaround for LockFileEx and WriteFile multithread bug */ - CRITICAL_SECTION me_windowsbug_lock; - char *me_pathname_char; /* cache of multi-byte representation of pathname - to the DB files */ -#else - osal_fastmutex_t me_remap_guard; +#ifndef xMDBX_TOOLS /* Avoid using internal eASSERT() */ +#undef assert +#define assert(expr) eASSERT(nullptr, expr) #endif - /* -------------------------------------------------------------- debugging */ - +MDBX_MAYBE_UNUSED static inline void jitter4testing(bool tiny) { #if MDBX_DEBUG - MDBX_assert_func 
*me_assert_func; /* Callback for assertion failures */ -#endif -#ifdef ENABLE_MEMCHECK - int me_valgrind_handle; -#endif -#if defined(ENABLE_MEMCHECK) || defined(__SANITIZE_ADDRESS__) - pgno_t me_poison_edge; -#endif /* ENABLE_MEMCHECK || __SANITIZE_ADDRESS__ */ - -#ifndef xMDBX_DEBUG_SPILLING -#define xMDBX_DEBUG_SPILLING 0 -#endif -#if xMDBX_DEBUG_SPILLING == 2 - size_t debug_dirtied_est, debug_dirtied_act; -#endif /* xMDBX_DEBUG_SPILLING */ - - /* ------------------------------------------------- stub for lck-less mode */ - MDBX_atomic_uint64_t - x_lckless_stub[(sizeof(MDBX_lockinfo) + MDBX_CACHELINE_SIZE - 1) / - sizeof(MDBX_atomic_uint64_t)]; -}; - -#ifndef __cplusplus -/*----------------------------------------------------------------------------*/ -/* Cache coherence and mmap invalidation */ - -#if MDBX_CPU_WRITEBACK_INCOHERENT -#define osal_flush_incoherent_cpu_writeback() osal_memory_barrier() -#else -#define osal_flush_incoherent_cpu_writeback() osal_compiler_barrier() -#endif /* MDBX_CPU_WRITEBACK_INCOHERENT */ - -MDBX_MAYBE_UNUSED static __inline void -osal_flush_incoherent_mmap(const void *addr, size_t nbytes, - const intptr_t pagesize) { -#if MDBX_MMAP_INCOHERENT_FILE_WRITE - char *const begin = (char *)(-pagesize & (intptr_t)addr); - char *const end = - (char *)(-pagesize & (intptr_t)((char *)addr + nbytes + pagesize - 1)); - int err = msync(begin, end - begin, MS_SYNC | MS_INVALIDATE) ? errno : 0; - eASSERT(nullptr, err == 0); - (void)err; -#else - (void)pagesize; -#endif /* MDBX_MMAP_INCOHERENT_FILE_WRITE */ - -#if MDBX_MMAP_INCOHERENT_CPU_CACHE -#ifdef DCACHE - /* MIPS has cache coherency issues. - * Note: for any nbytes >= on-chip cache size, entire is flushed. 
*/ - cacheflush((void *)addr, nbytes, DCACHE); + if (globals.runtime_flags & (unsigned)MDBX_DBG_JITTER) + osal_jitter(tiny); #else -#error "Oops, cacheflush() not available" -#endif /* DCACHE */ -#endif /* MDBX_MMAP_INCOHERENT_CPU_CACHE */ - -#if !MDBX_MMAP_INCOHERENT_FILE_WRITE && !MDBX_MMAP_INCOHERENT_CPU_CACHE - (void)addr; - (void)nbytes; + (void)tiny; #endif } -/*----------------------------------------------------------------------------*/ -/* Internal prototypes */ - -MDBX_INTERNAL_FUNC int cleanup_dead_readers(MDBX_env *env, int rlocked, - int *dead); -MDBX_INTERNAL_FUNC void global_ctor(void); -MDBX_INTERNAL_FUNC void osal_ctor(void); -MDBX_INTERNAL_FUNC void global_dtor(void); -MDBX_INTERNAL_FUNC void osal_dtor(void); -MDBX_INTERNAL_FUNC void thread_dtor(void *ptr); - -#endif /* !__cplusplus */ - -#define MDBX_IS_ERROR(rc) \ - ((rc) != MDBX_RESULT_TRUE && (rc) != MDBX_RESULT_FALSE) - -/* Internal error codes, not exposed outside libmdbx */ -#define MDBX_NO_ROOT (MDBX_LAST_ADDED_ERRCODE + 10) - -/* Debugging output value of a cursor DBI: Negative in a sub-cursor. */ -#define DDBI(mc) \ - (((mc)->mc_flags & C_SUB) ? -(int)(mc)->mc_dbi : (int)(mc)->mc_dbi) +MDBX_MAYBE_UNUSED MDBX_INTERNAL void page_list(page_t *mp); +MDBX_INTERNAL const char *pagetype_caption(const uint8_t type, + char buf4unknown[16]); /* Key size which fits in a DKBUF (debug key buffer). */ -#define DKBUF_MAX 511 -#define DKBUF char _kbuf[DKBUF_MAX * 4 + 2] -#define DKEY(x) mdbx_dump_val(x, _kbuf, DKBUF_MAX * 2 + 1) -#define DVAL(x) mdbx_dump_val(x, _kbuf + DKBUF_MAX * 2 + 1, DKBUF_MAX * 2 + 1) +#define DKBUF_MAX 127 +#define DKBUF char dbg_kbuf[DKBUF_MAX * 4 + 2] +#define DKEY(x) mdbx_dump_val(x, dbg_kbuf, DKBUF_MAX * 2 + 1) +#define DVAL(x) \ + mdbx_dump_val(x, dbg_kbuf + DKBUF_MAX * 2 + 1, DKBUF_MAX * 2 + 1) #if MDBX_DEBUG #define DKBUF_DEBUG DKBUF @@ -3885,103 +3117,16 @@ MDBX_INTERNAL_FUNC void thread_dtor(void *ptr); #define DVAL_DEBUG(x) ("-") #endif -/* An invalid page number. 
- * Mainly used to denote an empty tree. */ -#define P_INVALID (~(pgno_t)0) + /* Test if the flags f are set in a flag word w. */ #define F_ISSET(w, f) (((w) & (f)) == (f)) /* Round n up to an even number. */ -#define EVEN(n) (((n) + 1UL) & -2L) /* sign-extending -2 to match n+1U */ - -/* Default size of memory map. - * This is certainly too small for any actual applications. Apps should - * always set the size explicitly using mdbx_env_set_geometry(). */ -#define DEFAULT_MAPSIZE MEGABYTE - -/* Number of slots in the reader table. - * This value was chosen somewhat arbitrarily. The 61 is a prime number, - * and such readers plus a couple mutexes fit into single 4KB page. - * Applications should set the table size using mdbx_env_set_maxreaders(). */ -#define DEFAULT_READERS 61 - -/* Test if a page is a leaf page */ -#define IS_LEAF(p) (((p)->mp_flags & P_LEAF) != 0) -/* Test if a page is a LEAF2 page */ -#define IS_LEAF2(p) unlikely(((p)->mp_flags & P_LEAF2) != 0) -/* Test if a page is a branch page */ -#define IS_BRANCH(p) (((p)->mp_flags & P_BRANCH) != 0) -/* Test if a page is an overflow page */ -#define IS_OVERFLOW(p) unlikely(((p)->mp_flags & P_OVERFLOW) != 0) -/* Test if a page is a sub page */ -#define IS_SUBP(p) (((p)->mp_flags & P_SUBP) != 0) - -/* Header for a single key/data pair within a page. - * Used in pages of type P_BRANCH and P_LEAF without P_LEAF2. - * We guarantee 2-byte alignment for 'MDBX_node's. - * - * Leaf node flags describe node contents. F_BIGDATA says the node's - * data part is the page number of an overflow page with actual data. - * F_DUPDATA and F_SUBDATA can be combined giving duplicate data in - * a sub-page/sub-database, and named databases (just F_SUBDATA). 
*/ -typedef struct MDBX_node { -#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ - union { - uint32_t mn_dsize; - uint32_t mn_pgno32; - }; - uint8_t mn_flags; /* see mdbx_node flags */ - uint8_t mn_extra; - uint16_t mn_ksize; /* key size */ -#else - uint16_t mn_ksize; /* key size */ - uint8_t mn_extra; - uint8_t mn_flags; /* see mdbx_node flags */ - union { - uint32_t mn_pgno32; - uint32_t mn_dsize; - }; -#endif /* __BYTE_ORDER__ */ - - /* mdbx_node Flags */ -#define F_BIGDATA 0x01 /* data put on overflow page */ -#define F_SUBDATA 0x02 /* data is a sub-database */ -#define F_DUPDATA 0x04 /* data has duplicates */ - - /* valid flags for mdbx_node_add() */ -#define NODE_ADD_FLAGS (F_DUPDATA | F_SUBDATA | MDBX_RESERVE | MDBX_APPEND) - -#if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L) || \ - (!defined(__cplusplus) && defined(_MSC_VER)) - uint8_t mn_data[] /* key and data are appended here */; -#endif /* C99 */ -} MDBX_node; - -#define DB_PERSISTENT_FLAGS \ - (MDBX_REVERSEKEY | MDBX_DUPSORT | MDBX_INTEGERKEY | MDBX_DUPFIXED | \ - MDBX_INTEGERDUP | MDBX_REVERSEDUP) +#define EVEN_CEIL(n) (((n) + 1UL) & -2L) /* sign-extending -2 to match n+1U */ -/* mdbx_dbi_open() flags */ -#define DB_USABLE_FLAGS (DB_PERSISTENT_FLAGS | MDBX_CREATE | MDBX_DB_ACCEDE) - -#define DB_VALID 0x8000u /* DB handle is valid, for me_db_flags */ -#define DB_POISON 0x7fffu /* update pending */ -#define DB_INTERNAL_FLAGS DB_VALID - -#if DB_INTERNAL_FLAGS & DB_USABLE_FLAGS -#error "Oops, some flags overlapped or wrong" -#endif -#if DB_PERSISTENT_FLAGS & ~DB_USABLE_FLAGS -#error "Oops, some flags overlapped or wrong" -#endif - -/* Max length of iov-vector passed to writev() call, used for auxilary writes */ -#define MDBX_AUXILARY_IOV_MAX 64 -#if defined(IOV_MAX) && IOV_MAX < MDBX_AUXILARY_IOV_MAX -#undef MDBX_AUXILARY_IOV_MAX -#define MDBX_AUXILARY_IOV_MAX IOV_MAX -#endif /* MDBX_AUXILARY_IOV_MAX */ +/* Round n down to an even number. 
*/ +#define EVEN_FLOOR(n) ((n) & ~(size_t)1) /* * / @@ -3992,12716 +3137,9599 @@ typedef struct MDBX_node { */ #define CMP2INT(a, b) (((a) != (b)) ? (((a) < (b)) ? -1 : 1) : 0) -MDBX_MAYBE_UNUSED MDBX_NOTHROW_CONST_FUNCTION static __inline pgno_t -int64pgno(int64_t i64) { - if (likely(i64 >= (int64_t)MIN_PAGENO && i64 <= (int64_t)MAX_PAGENO + 1)) - return (pgno_t)i64; - return (i64 < (int64_t)MIN_PAGENO) ? MIN_PAGENO : MAX_PAGENO; -} +/* Pointer displacement without casting to char* to avoid pointer-aliasing */ +#define ptr_disp(ptr, disp) ((void *)(((intptr_t)(ptr)) + ((intptr_t)(disp)))) -MDBX_MAYBE_UNUSED MDBX_NOTHROW_CONST_FUNCTION static __inline pgno_t -pgno_add(size_t base, size_t augend) { - assert(base <= MAX_PAGENO + 1 && augend < MAX_PAGENO); - return int64pgno((int64_t)base + (int64_t)augend); -} +/* Pointer distance as signed number of bytes */ +#define ptr_dist(more, less) (((intptr_t)(more)) - ((intptr_t)(less))) -MDBX_MAYBE_UNUSED MDBX_NOTHROW_CONST_FUNCTION static __inline pgno_t -pgno_sub(size_t base, size_t subtrahend) { - assert(base >= MIN_PAGENO && base <= MAX_PAGENO + 1 && - subtrahend < MAX_PAGENO); - return int64pgno((int64_t)base - (int64_t)subtrahend); +#define MDBX_ASAN_POISON_MEMORY_REGION(addr, size) \ + do { \ + TRACE("POISON_MEMORY_REGION(%p, %zu) at %u", (void *)(addr), \ + (size_t)(size), __LINE__); \ + ASAN_POISON_MEMORY_REGION(addr, size); \ + } while (0) + +#define MDBX_ASAN_UNPOISON_MEMORY_REGION(addr, size) \ + do { \ + TRACE("UNPOISON_MEMORY_REGION(%p, %zu) at %u", (void *)(addr), \ + (size_t)(size), __LINE__); \ + ASAN_UNPOISON_MEMORY_REGION(addr, size); \ + } while (0) + +MDBX_MAYBE_UNUSED MDBX_NOTHROW_CONST_FUNCTION static inline size_t +branchless_abs(intptr_t value) { + assert(value > INT_MIN); + const size_t expanded_sign = + (size_t)(value >> (sizeof(value) * CHAR_BIT - 1)); + return ((size_t)value + expanded_sign) ^ expanded_sign; } -MDBX_MAYBE_UNUSED MDBX_NOTHROW_CONST_FUNCTION static __always_inline bool 
+MDBX_MAYBE_UNUSED MDBX_NOTHROW_CONST_FUNCTION static inline bool is_powerof2(size_t x) { return (x & (x - 1)) == 0; } -MDBX_MAYBE_UNUSED MDBX_NOTHROW_CONST_FUNCTION static __always_inline size_t +MDBX_MAYBE_UNUSED MDBX_NOTHROW_CONST_FUNCTION static inline size_t floor_powerof2(size_t value, size_t granularity) { assert(is_powerof2(granularity)); return value & ~(granularity - 1); } -MDBX_MAYBE_UNUSED MDBX_NOTHROW_CONST_FUNCTION static __always_inline size_t +MDBX_MAYBE_UNUSED MDBX_NOTHROW_CONST_FUNCTION static inline size_t ceil_powerof2(size_t value, size_t granularity) { return floor_powerof2(value + granularity - 1, granularity); } -MDBX_MAYBE_UNUSED MDBX_NOTHROW_CONST_FUNCTION static unsigned -log2n_powerof2(size_t value_uintptr) { - assert(value_uintptr > 0 && value_uintptr < INT32_MAX && - is_powerof2(value_uintptr)); - assert((value_uintptr & -(intptr_t)value_uintptr) == value_uintptr); - const uint32_t value_uint32 = (uint32_t)value_uintptr; -#if __GNUC_PREREQ(4, 1) || __has_builtin(__builtin_ctz) - STATIC_ASSERT(sizeof(value_uint32) <= sizeof(unsigned)); - return __builtin_ctz(value_uint32); -#elif defined(_MSC_VER) - unsigned long index; - STATIC_ASSERT(sizeof(value_uint32) <= sizeof(long)); - _BitScanForward(&index, value_uint32); - return index; -#else - static const uint8_t debruijn_ctz32[32] = { - 0, 1, 28, 2, 29, 14, 24, 3, 30, 22, 20, 15, 25, 17, 4, 8, - 31, 27, 13, 23, 21, 19, 16, 7, 26, 12, 18, 6, 11, 5, 10, 9}; - return debruijn_ctz32[(uint32_t)(value_uint32 * 0x077CB531ul) >> 27]; -#endif -} +MDBX_MAYBE_UNUSED MDBX_NOTHROW_CONST_FUNCTION MDBX_INTERNAL unsigned +log2n_powerof2(size_t value_uintptr); -/* Only a subset of the mdbx_env flags can be changed - * at runtime. Changing other flags requires closing the - * environment and re-opening it with the new flags. 
*/ -#define ENV_CHANGEABLE_FLAGS \ - (MDBX_SAFE_NOSYNC | MDBX_NOMETASYNC | MDBX_DEPRECATED_MAPASYNC | \ - MDBX_NOMEMINIT | MDBX_DEPRECATED_COALESCE | MDBX_PAGEPERTURB | \ - MDBX_ACCEDE | MDBX_VALIDATION) -#define ENV_CHANGELESS_FLAGS \ - (MDBX_NOSUBDIR | MDBX_RDONLY | MDBX_WRITEMAP | MDBX_NOSTICKYTHREADS | \ - MDBX_NORDAHEAD | MDBX_LIFORECLAIM | MDBX_EXCLUSIVE) -#define ENV_USABLE_FLAGS (ENV_CHANGEABLE_FLAGS | ENV_CHANGELESS_FLAGS) +MDBX_NOTHROW_CONST_FUNCTION MDBX_INTERNAL uint64_t rrxmrrxmsx_0(uint64_t v); -#if !defined(__cplusplus) || CONSTEXPR_ENUM_FLAGS_OPERATIONS -MDBX_MAYBE_UNUSED static void static_checks(void) { - STATIC_ASSERT_MSG(INT16_MAX - CORE_DBS == MDBX_MAX_DBI, - "Oops, MDBX_MAX_DBI or CORE_DBS?"); - STATIC_ASSERT_MSG((unsigned)(MDBX_DB_ACCEDE | MDBX_CREATE) == - ((DB_USABLE_FLAGS | DB_INTERNAL_FLAGS) & - (ENV_USABLE_FLAGS | ENV_INTERNAL_FLAGS)), - "Oops, some flags overlapped or wrong"); - STATIC_ASSERT_MSG((ENV_INTERNAL_FLAGS & ENV_USABLE_FLAGS) == 0, - "Oops, some flags overlapped or wrong"); -} -#endif /* Disabled for MSVC 19.0 (VisualStudio 2015) */ +struct monotime_cache { + uint64_t value; + int expire_countdown; +}; -#ifdef __cplusplus +MDBX_MAYBE_UNUSED static inline uint64_t +monotime_since_cached(uint64_t begin_timestamp, struct monotime_cache *cache) { + if (cache->expire_countdown) + cache->expire_countdown -= 1; + else { + cache->value = osal_monotime(); + cache->expire_countdown = 42 / 3; + } + return cache->value - begin_timestamp; } + + + + +/* An PNL is an Page Number List, a sorted array of IDs. + * + * The first element of the array is a counter for how many actual page-numbers + * are in the list. By default PNLs are sorted in descending order, this allow + * cut off a page with lowest pgno (at the tail) just truncating the list. The + * sort order of PNLs is controlled by the MDBX_PNL_ASCENDING build option. 
*/ +typedef pgno_t *pnl_t; +typedef const pgno_t *const_pnl_t; + +#if MDBX_PNL_ASCENDING +#define MDBX_PNL_ORDERED(first, last) ((first) < (last)) +#define MDBX_PNL_DISORDERED(first, last) ((first) >= (last)) +#else +#define MDBX_PNL_ORDERED(first, last) ((first) > (last)) +#define MDBX_PNL_DISORDERED(first, last) ((first) <= (last)) #endif -#define MDBX_ASAN_POISON_MEMORY_REGION(addr, size) \ - do { \ - TRACE("POISON_MEMORY_REGION(%p, %zu) at %u", (void *)(addr), \ - (size_t)(size), __LINE__); \ - ASAN_POISON_MEMORY_REGION(addr, size); \ - } while (0) +#define MDBX_PNL_GRANULATE_LOG2 10 +#define MDBX_PNL_GRANULATE (1 << MDBX_PNL_GRANULATE_LOG2) +#define MDBX_PNL_INITIAL \ + (MDBX_PNL_GRANULATE - 2 - MDBX_ASSUME_MALLOC_OVERHEAD / sizeof(pgno_t)) -#define MDBX_ASAN_UNPOISON_MEMORY_REGION(addr, size) \ +#define MDBX_PNL_ALLOCLEN(pl) ((pl)[-1]) +#define MDBX_PNL_GETSIZE(pl) ((size_t)((pl)[0])) +#define MDBX_PNL_SETSIZE(pl, size) \ do { \ - TRACE("UNPOISON_MEMORY_REGION(%p, %zu) at %u", (void *)(addr), \ - (size_t)(size), __LINE__); \ - ASAN_UNPOISON_MEMORY_REGION(addr, size); \ + const size_t __size = size; \ + assert(__size < INT_MAX); \ + (pl)[0] = (pgno_t)__size; \ } while (0) +#define MDBX_PNL_FIRST(pl) ((pl)[1]) +#define MDBX_PNL_LAST(pl) ((pl)[MDBX_PNL_GETSIZE(pl)]) +#define MDBX_PNL_BEGIN(pl) (&(pl)[1]) +#define MDBX_PNL_END(pl) (&(pl)[MDBX_PNL_GETSIZE(pl) + 1]) -/******************************************************************************/ +#if MDBX_PNL_ASCENDING +#define MDBX_PNL_EDGE(pl) ((pl) + 1) +#define MDBX_PNL_LEAST(pl) MDBX_PNL_FIRST(pl) +#define MDBX_PNL_MOST(pl) MDBX_PNL_LAST(pl) +#else +#define MDBX_PNL_EDGE(pl) ((pl) + MDBX_PNL_GETSIZE(pl)) +#define MDBX_PNL_LEAST(pl) MDBX_PNL_LAST(pl) +#define MDBX_PNL_MOST(pl) MDBX_PNL_FIRST(pl) +#endif -/** \brief Page types for traverse the b-tree. 
- * \see mdbx_env_pgwalk() \see MDBX_pgvisitor_func */ -enum MDBX_page_type_t { - MDBX_page_broken, - MDBX_page_large, - MDBX_page_branch, - MDBX_page_leaf, - MDBX_page_dupfixed_leaf, - MDBX_subpage_leaf, - MDBX_subpage_dupfixed_leaf, - MDBX_subpage_broken, -}; -typedef enum MDBX_page_type_t MDBX_page_type_t; +#define MDBX_PNL_SIZEOF(pl) ((MDBX_PNL_GETSIZE(pl) + 1) * sizeof(pgno_t)) +#define MDBX_PNL_IS_EMPTY(pl) (MDBX_PNL_GETSIZE(pl) == 0) -typedef struct MDBX_walk_sdb { - MDBX_val name; - struct MDBX_db *internal, *nested; -} MDBX_walk_sdb_t; - -/** \brief Callback function for traverse the b-tree. \see mdbx_env_pgwalk() */ -typedef int -MDBX_pgvisitor_func(const size_t pgno, const unsigned number, void *const ctx, - const int deep, const MDBX_walk_sdb_t *subdb, - const size_t page_size, const MDBX_page_type_t page_type, - const MDBX_error_t err, const size_t nentries, - const size_t payload_bytes, const size_t header_bytes, - const size_t unused_bytes); -/* - * Copyright 2015-2024 Leonid Yuriev . - * and other libmdbx authors: please see AUTHORS file. - * All rights reserved. - * - * This code is derived from "LMDB engine" written by - * Howard Chu (Symas Corporation), which itself derived from btree.c - * written by Martin Hedenfalk. - * - * --- - * - * Portions Copyright 2011-2015 Howard Chu, Symas Corp. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted only as authorized by the OpenLDAP - * Public License. - * - * A copy of this license is available in the file LICENSE in the - * top-level directory of the distribution or, alternatively, at - * . - * - * --- - * - * Portions Copyright (c) 2009, 2010 Martin Hedenfalk - * - * Permission to use, copy, modify, and distribute this software for any - * purpose with or without fee is hereby granted, provided that the above - * copyright notice and this permission notice appear in all copies. 
- * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES - * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR - * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN - * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF - * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ +MDBX_MAYBE_UNUSED static inline size_t pnl_size2bytes(size_t size) { + assert(size > 0 && size <= PAGELIST_LIMIT); +#if MDBX_PNL_PREALLOC_FOR_RADIXSORT + size += size; +#endif /* MDBX_PNL_PREALLOC_FOR_RADIXSORT */ + STATIC_ASSERT(MDBX_ASSUME_MALLOC_OVERHEAD + + (PAGELIST_LIMIT * (MDBX_PNL_PREALLOC_FOR_RADIXSORT + 1) + + MDBX_PNL_GRANULATE + 3) * + sizeof(pgno_t) < + SIZE_MAX / 4 * 3); + size_t bytes = + ceil_powerof2(MDBX_ASSUME_MALLOC_OVERHEAD + sizeof(pgno_t) * (size + 3), + MDBX_PNL_GRANULATE * sizeof(pgno_t)) - + MDBX_ASSUME_MALLOC_OVERHEAD; + return bytes; +} -/*------------------------------------------------------------------------------ - * Internal inline functions */ +MDBX_MAYBE_UNUSED static inline pgno_t pnl_bytes2size(const size_t bytes) { + size_t size = bytes / sizeof(pgno_t); + assert(size > 3 && size <= PAGELIST_LIMIT + /* alignment gap */ 65536); + size -= 3; +#if MDBX_PNL_PREALLOC_FOR_RADIXSORT + size >>= 1; +#endif /* MDBX_PNL_PREALLOC_FOR_RADIXSORT */ + return (pgno_t)size; +} -MDBX_NOTHROW_CONST_FUNCTION static size_t branchless_abs(intptr_t value) { - assert(value > INT_MIN); - const size_t expanded_sign = - (size_t)(value >> (sizeof(value) * CHAR_BIT - 1)); - return ((size_t)value + expanded_sign) ^ expanded_sign; +MDBX_INTERNAL pnl_t pnl_alloc(size_t size); + +MDBX_INTERNAL void pnl_free(pnl_t pnl); + +MDBX_INTERNAL int pnl_reserve(pnl_t __restrict *__restrict ppnl, + const size_t wanna); + +MDBX_MAYBE_UNUSED static inline int 
__must_check_result +pnl_need(pnl_t __restrict *__restrict ppnl, size_t num) { + assert(MDBX_PNL_GETSIZE(*ppnl) <= PAGELIST_LIMIT && + MDBX_PNL_ALLOCLEN(*ppnl) >= MDBX_PNL_GETSIZE(*ppnl)); + assert(num <= PAGELIST_LIMIT); + const size_t wanna = MDBX_PNL_GETSIZE(*ppnl) + num; + return likely(MDBX_PNL_ALLOCLEN(*ppnl) >= wanna) ? MDBX_SUCCESS + : pnl_reserve(ppnl, wanna); } -/* Pack/Unpack 16-bit values for Grow step & Shrink threshold */ -MDBX_NOTHROW_CONST_FUNCTION static __inline pgno_t me2v(size_t m, size_t e) { - assert(m < 2048 && e < 8); - return (pgno_t)(32768 + ((m + 1) << (e + 8))); +MDBX_MAYBE_UNUSED static inline void +pnl_append_prereserved(__restrict pnl_t pnl, pgno_t pgno) { + assert(MDBX_PNL_GETSIZE(pnl) < MDBX_PNL_ALLOCLEN(pnl)); + if (AUDIT_ENABLED()) { + for (size_t i = MDBX_PNL_GETSIZE(pnl); i > 0; --i) + assert(pgno != pnl[i]); + } + *pnl += 1; + MDBX_PNL_LAST(pnl) = pgno; } -MDBX_NOTHROW_CONST_FUNCTION static __inline uint16_t v2me(size_t v, size_t e) { - assert(v > (e ? 
me2v(2047, e - 1) : 32768)); - assert(v <= me2v(2047, e)); - size_t m = (v - 32768 + ((size_t)1 << (e + 8)) - 1) >> (e + 8); - m -= m > 0; - assert(m < 2048 && e < 8); - // f e d c b a 9 8 7 6 5 4 3 2 1 0 - // 1 e e e m m m m m m m m m m m 1 - const uint16_t pv = (uint16_t)(0x8001 + (e << 12) + (m << 1)); - assert(pv != 65535); - return pv; +MDBX_INTERNAL void pnl_shrink(pnl_t __restrict *__restrict ppnl); + +MDBX_INTERNAL int __must_check_result spill_append_span(__restrict pnl_t *ppnl, + pgno_t pgno, size_t n); + +MDBX_INTERNAL int __must_check_result pnl_append_span(__restrict pnl_t *ppnl, + pgno_t pgno, size_t n); + +MDBX_INTERNAL int __must_check_result pnl_insert_span(__restrict pnl_t *ppnl, + pgno_t pgno, size_t n); + +MDBX_INTERNAL size_t pnl_search_nochk(const pnl_t pnl, pgno_t pgno); + +MDBX_INTERNAL void pnl_sort_nochk(pnl_t pnl); + +MDBX_INTERNAL bool pnl_check(const const_pnl_t pnl, const size_t limit); + +MDBX_MAYBE_UNUSED static inline bool pnl_check_allocated(const const_pnl_t pnl, + const size_t limit) { + return pnl == nullptr || (MDBX_PNL_ALLOCLEN(pnl) >= MDBX_PNL_GETSIZE(pnl) && + pnl_check(pnl, limit)); } -/* Convert 16-bit packed (exponential quantized) value to number of pages */ -MDBX_NOTHROW_CONST_FUNCTION static pgno_t pv2pages(uint16_t pv) { - if ((pv & 0x8001) != 0x8001) - return pv; - if (pv == 65535) - return 65536; - // f e d c b a 9 8 7 6 5 4 3 2 1 0 - // 1 e e e m m m m m m m m m m m 1 - return me2v((pv >> 1) & 2047, (pv >> 12) & 7); +MDBX_MAYBE_UNUSED static inline void pnl_sort(pnl_t pnl, size_t limit4check) { + pnl_sort_nochk(pnl); + assert(pnl_check(pnl, limit4check)); + (void)limit4check; } -/* Convert number of pages to 16-bit packed (exponential quantized) value */ -MDBX_NOTHROW_CONST_FUNCTION static uint16_t pages2pv(size_t pages) { - if (pages < 32769 || (pages < 65536 && (pages & 1) == 0)) - return (uint16_t)pages; - if (pages <= me2v(2047, 0)) - return v2me(pages, 0); - if (pages <= me2v(2047, 1)) - return v2me(pages, 1); 
- if (pages <= me2v(2047, 2)) - return v2me(pages, 2); - if (pages <= me2v(2047, 3)) - return v2me(pages, 3); - if (pages <= me2v(2047, 4)) - return v2me(pages, 4); - if (pages <= me2v(2047, 5)) - return v2me(pages, 5); - if (pages <= me2v(2047, 6)) - return v2me(pages, 6); - return (pages < me2v(2046, 7)) ? v2me(pages, 7) : 65533; +MDBX_MAYBE_UNUSED static inline size_t pnl_search(const pnl_t pnl, pgno_t pgno, + size_t limit) { + assert(pnl_check_allocated(pnl, limit)); + if (MDBX_HAVE_CMOV) { + /* cmov-ускоренный бинарный поиск может читать (но не использовать) один + * элемент за концом данных, этот элемент в пределах выделенного участка + * памяти, но не инициализирован. */ + VALGRIND_MAKE_MEM_DEFINED(MDBX_PNL_END(pnl), sizeof(pgno_t)); + } + assert(pgno < limit); + (void)limit; + size_t n = pnl_search_nochk(pnl, pgno); + if (MDBX_HAVE_CMOV) { + VALGRIND_MAKE_MEM_UNDEFINED(MDBX_PNL_END(pnl), sizeof(pgno_t)); + } + return n; } -/*------------------------------------------------------------------------------ - * Unaligned access */ +MDBX_INTERNAL size_t pnl_merge(pnl_t dst, const pnl_t src); -MDBX_MAYBE_UNUSED MDBX_NOTHROW_CONST_FUNCTION static __always_inline size_t -field_alignment(size_t alignment_baseline, size_t field_offset) { - size_t merge = alignment_baseline | (size_t)field_offset; - return merge & -(int)merge; +#ifdef __cplusplus } +#endif /* __cplusplus */ -/* read-thunk for UB-sanitizer */ -MDBX_NOTHROW_PURE_FUNCTION static __always_inline uint8_t -peek_u8(const uint8_t *const __restrict ptr) { - return *ptr; -} +#define mdbx_sourcery_anchor XCONCAT(mdbx_sourcery_, MDBX_BUILD_SOURCERY) +#if defined(xMDBX_TOOLS) +extern LIBMDBX_API const char *const mdbx_sourcery_anchor; +#endif -/* write-thunk for UB-sanitizer */ -static __always_inline void poke_u8(uint8_t *const __restrict ptr, - const uint8_t v) { - *ptr = v; +#define MDBX_IS_ERROR(rc) \ + ((rc) != MDBX_RESULT_TRUE && (rc) != MDBX_RESULT_FALSE) + 
+/*----------------------------------------------------------------------------*/ + +MDBX_MAYBE_UNUSED MDBX_NOTHROW_CONST_FUNCTION static inline pgno_t +int64pgno(int64_t i64) { + if (likely(i64 >= (int64_t)MIN_PAGENO && i64 <= (int64_t)MAX_PAGENO + 1)) + return (pgno_t)i64; + return (i64 < (int64_t)MIN_PAGENO) ? MIN_PAGENO : MAX_PAGENO; } -MDBX_NOTHROW_PURE_FUNCTION static __always_inline uint16_t -unaligned_peek_u16(const size_t expected_alignment, const void *const ptr) { - assert((uintptr_t)ptr % expected_alignment == 0); - if (MDBX_UNALIGNED_OK >= 2 || (expected_alignment % sizeof(uint16_t)) == 0) - return *(const uint16_t *)ptr; - else { -#if defined(__unaligned) || defined(_M_ARM) || defined(_M_ARM64) || \ - defined(_M_X64) || defined(_M_IA64) - return *(const __unaligned uint16_t *)ptr; -#else - uint16_t v; - memcpy(&v, ptr, sizeof(v)); - return v; -#endif /* _MSC_VER || __unaligned */ - } +MDBX_MAYBE_UNUSED MDBX_NOTHROW_CONST_FUNCTION static inline pgno_t +pgno_add(size_t base, size_t augend) { + assert(base <= MAX_PAGENO + 1 && augend < MAX_PAGENO); + return int64pgno((int64_t)base + (int64_t)augend); } -static __always_inline void unaligned_poke_u16(const size_t expected_alignment, - void *const __restrict ptr, - const uint16_t v) { - assert((uintptr_t)ptr % expected_alignment == 0); - if (MDBX_UNALIGNED_OK >= 2 || (expected_alignment % sizeof(v)) == 0) - *(uint16_t *)ptr = v; - else { -#if defined(__unaligned) || defined(_M_ARM) || defined(_M_ARM64) || \ - defined(_M_X64) || defined(_M_IA64) - *((uint16_t __unaligned *)ptr) = v; -#else - memcpy(ptr, &v, sizeof(v)); -#endif /* _MSC_VER || __unaligned */ - } +MDBX_MAYBE_UNUSED MDBX_NOTHROW_CONST_FUNCTION static inline pgno_t +pgno_sub(size_t base, size_t subtrahend) { + assert(base >= MIN_PAGENO && base <= MAX_PAGENO + 1 && + subtrahend < MAX_PAGENO); + return int64pgno((int64_t)base - (int64_t)subtrahend); } -MDBX_NOTHROW_PURE_FUNCTION static __always_inline uint32_t unaligned_peek_u32( - const size_t 
expected_alignment, const void *const __restrict ptr) { - assert((uintptr_t)ptr % expected_alignment == 0); - if (MDBX_UNALIGNED_OK >= 4 || (expected_alignment % sizeof(uint32_t)) == 0) - return *(const uint32_t *)ptr; - else if ((expected_alignment % sizeof(uint16_t)) == 0) { - const uint16_t lo = - ((const uint16_t *)ptr)[__BYTE_ORDER__ != __ORDER_LITTLE_ENDIAN__]; - const uint16_t hi = - ((const uint16_t *)ptr)[__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__]; - return lo | (uint32_t)hi << 16; - } else { -#if defined(__unaligned) || defined(_M_ARM) || defined(_M_ARM64) || \ - defined(_M_X64) || defined(_M_IA64) - return *(const __unaligned uint32_t *)ptr; -#else - uint32_t v; - memcpy(&v, ptr, sizeof(v)); - return v; -#endif /* _MSC_VER || __unaligned */ - } -} - -static __always_inline void unaligned_poke_u32(const size_t expected_alignment, - void *const __restrict ptr, - const uint32_t v) { - assert((uintptr_t)ptr % expected_alignment == 0); - if (MDBX_UNALIGNED_OK >= 4 || (expected_alignment % sizeof(v)) == 0) - *(uint32_t *)ptr = v; - else if ((expected_alignment % sizeof(uint16_t)) == 0) { - ((uint16_t *)ptr)[__BYTE_ORDER__ != __ORDER_LITTLE_ENDIAN__] = (uint16_t)v; - ((uint16_t *)ptr)[__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__] = - (uint16_t)(v >> 16); - } else { -#if defined(__unaligned) || defined(_M_ARM) || defined(_M_ARM64) || \ - defined(_M_X64) || defined(_M_IA64) - *((uint32_t __unaligned *)ptr) = v; -#else - memcpy(ptr, &v, sizeof(v)); -#endif /* _MSC_VER || __unaligned */ - } -} - -MDBX_NOTHROW_PURE_FUNCTION static __always_inline uint64_t unaligned_peek_u64( - const size_t expected_alignment, const void *const __restrict ptr) { - assert((uintptr_t)ptr % expected_alignment == 0); - if (MDBX_UNALIGNED_OK >= 8 || (expected_alignment % sizeof(uint64_t)) == 0) - return *(const uint64_t *)ptr; - else if ((expected_alignment % sizeof(uint32_t)) == 0) { - const uint32_t lo = - ((const uint32_t *)ptr)[__BYTE_ORDER__ != __ORDER_LITTLE_ENDIAN__]; - const uint32_t 
hi = - ((const uint32_t *)ptr)[__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__]; - return lo | (uint64_t)hi << 32; - } else { -#if defined(__unaligned) || defined(_M_ARM) || defined(_M_ARM64) || \ - defined(_M_X64) || defined(_M_IA64) - return *(const __unaligned uint64_t *)ptr; -#else - uint64_t v; - memcpy(&v, ptr, sizeof(v)); - return v; -#endif /* _MSC_VER || __unaligned */ - } -} - -static __always_inline uint64_t -unaligned_peek_u64_volatile(const size_t expected_alignment, - const volatile void *const __restrict ptr) { - assert((uintptr_t)ptr % expected_alignment == 0); - assert(expected_alignment % sizeof(uint32_t) == 0); - if (MDBX_UNALIGNED_OK >= 8 || (expected_alignment % sizeof(uint64_t)) == 0) - return *(const volatile uint64_t *)ptr; - else { -#if defined(__unaligned) || defined(_M_ARM) || defined(_M_ARM64) || \ - defined(_M_X64) || defined(_M_IA64) - return *(const volatile __unaligned uint64_t *)ptr; -#else - const uint32_t lo = ((const volatile uint32_t *) - ptr)[__BYTE_ORDER__ != __ORDER_LITTLE_ENDIAN__]; - const uint32_t hi = ((const volatile uint32_t *) - ptr)[__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__]; - return lo | (uint64_t)hi << 32; -#endif /* _MSC_VER || __unaligned */ - } -} -static __always_inline void unaligned_poke_u64(const size_t expected_alignment, - void *const __restrict ptr, - const uint64_t v) { - assert((uintptr_t)ptr % expected_alignment == 0); - if (MDBX_UNALIGNED_OK >= 8 || (expected_alignment % sizeof(v)) == 0) - *(uint64_t *)ptr = v; - else if ((expected_alignment % sizeof(uint32_t)) == 0) { - ((uint32_t *)ptr)[__BYTE_ORDER__ != __ORDER_LITTLE_ENDIAN__] = (uint32_t)v; - ((uint32_t *)ptr)[__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__] = - (uint32_t)(v >> 32); - } else { -#if defined(__unaligned) || defined(_M_ARM) || defined(_M_ARM64) || \ - defined(_M_X64) || defined(_M_IA64) - *((uint64_t __unaligned *)ptr) = v; -#else - memcpy(ptr, &v, sizeof(v)); -#endif /* _MSC_VER || __unaligned */ - } -} 
+/*----------------------------------------------------------------------------*/ -#define UNALIGNED_PEEK_8(ptr, struct, field) \ - peek_u8(ptr_disp(ptr, offsetof(struct, field))) -#define UNALIGNED_POKE_8(ptr, struct, field, value) \ - poke_u8(ptr_disp(ptr, offsetof(struct, field)), value) -#define UNALIGNED_PEEK_16(ptr, struct, field) \ - unaligned_peek_u16(1, ptr_disp(ptr, offsetof(struct, field))) -#define UNALIGNED_POKE_16(ptr, struct, field, value) \ - unaligned_poke_u16(1, ptr_disp(ptr, offsetof(struct, field)), value) +typedef struct dp dp_t; +typedef struct dpl dpl_t; +typedef struct kvx kvx_t; +typedef struct meta_ptr meta_ptr_t; +typedef struct inner_cursor subcur_t; +typedef struct cursor_couple cursor_couple_t; +typedef struct defer_free_item defer_free_item_t; -#define UNALIGNED_PEEK_32(ptr, struct, field) \ - unaligned_peek_u32(1, ptr_disp(ptr, offsetof(struct, field))) -#define UNALIGNED_POKE_32(ptr, struct, field, value) \ - unaligned_poke_u32(1, ptr_disp(ptr, offsetof(struct, field)), value) +typedef struct troika { + uint8_t fsm, recent, prefer_steady, tail_and_flags; +#if MDBX_WORDBITS > 32 /* Workaround for false-positives from Valgrind */ + uint32_t unused_pad; +#endif +#define TROIKA_HAVE_STEADY(troika) ((troika)->fsm & 7u) +#define TROIKA_STRICT_VALID(troika) ((troika)->tail_and_flags & 64u) +#define TROIKA_VALID(troika) ((troika)->tail_and_flags & 128u) +#define TROIKA_TAIL(troika) ((troika)->tail_and_flags & 3u) + txnid_t txnid[NUM_METAS]; +} troika_t; -#define UNALIGNED_PEEK_64(ptr, struct, field) \ - unaligned_peek_u64(1, ptr_disp(ptr, offsetof(struct, field))) -#define UNALIGNED_POKE_64(ptr, struct, field, value) \ - unaligned_poke_u64(1, ptr_disp(ptr, offsetof(struct, field)), value) +typedef struct page_get_result { + page_t *page; + int err; +} pgr_t; -/* Get the page number pointed to by a branch node */ -MDBX_NOTHROW_PURE_FUNCTION static __always_inline pgno_t -node_pgno(const MDBX_node *const __restrict node) { - pgno_t pgno = 
UNALIGNED_PEEK_32(node, MDBX_node, mn_pgno32); - if (sizeof(pgno) > 4) - pgno |= ((uint64_t)UNALIGNED_PEEK_8(node, MDBX_node, mn_extra)) << 32; - return pgno; -} +typedef struct node_search_result { + node_t *node; + bool exact; +} nsr_t; -/* Set the page number in a branch node */ -static __always_inline void node_set_pgno(MDBX_node *const __restrict node, - pgno_t pgno) { - assert(pgno >= MIN_PAGENO && pgno <= MAX_PAGENO); +typedef struct bind_reader_slot_result { + int err; + reader_slot_t *rslot; +} bsr_t; - UNALIGNED_POKE_32(node, MDBX_node, mn_pgno32, (uint32_t)pgno); - if (sizeof(pgno) > 4) - UNALIGNED_POKE_8(node, MDBX_node, mn_extra, - (uint8_t)((uint64_t)pgno >> 32)); -} -/* Get the size of the data in a leaf node */ -MDBX_NOTHROW_PURE_FUNCTION static __always_inline size_t -node_ds(const MDBX_node *const __restrict node) { - return UNALIGNED_PEEK_32(node, MDBX_node, mn_dsize); -} -/* Set the size of the data for a leaf node */ -static __always_inline void node_set_ds(MDBX_node *const __restrict node, - size_t size) { - assert(size < INT_MAX); - UNALIGNED_POKE_32(node, MDBX_node, mn_dsize, (uint32_t)size); -} -/* The size of a key in a node */ -MDBX_NOTHROW_PURE_FUNCTION static __always_inline size_t -node_ks(const MDBX_node *const __restrict node) { - return UNALIGNED_PEEK_16(node, MDBX_node, mn_ksize); -} +#ifndef __cplusplus -/* Set the size of the key for a leaf node */ -static __always_inline void node_set_ks(MDBX_node *const __restrict node, - size_t size) { - assert(size < INT16_MAX); - UNALIGNED_POKE_16(node, MDBX_node, mn_ksize, (uint16_t)size); -} +#ifdef MDBX_HAVE_C11ATOMICS +#define osal_memory_fence(order, write) \ + atomic_thread_fence((write) ? mo_c11_store(order) : mo_c11_load(order)) +#else /* MDBX_HAVE_C11ATOMICS */ +#define osal_memory_fence(order, write) \ + do { \ + osal_compiler_barrier(); \ + if (write && order > (MDBX_CPU_WRITEBACK_INCOHERENT ? 
mo_Relaxed \ + : mo_AcquireRelease)) \ + osal_memory_barrier(); \ + } while (0) +#endif /* MDBX_HAVE_C11ATOMICS */ -MDBX_NOTHROW_PURE_FUNCTION static __always_inline uint8_t -node_flags(const MDBX_node *const __restrict node) { - return UNALIGNED_PEEK_8(node, MDBX_node, mn_flags); -} +#if defined(MDBX_HAVE_C11ATOMICS) && defined(__LCC__) +#define atomic_store32(p, value, order) \ + ({ \ + const uint32_t value_to_store = (value); \ + atomic_store_explicit(MDBX_c11a_rw(uint32_t, p), value_to_store, \ + mo_c11_store(order)); \ + value_to_store; \ + }) +#define atomic_load32(p, order) \ + atomic_load_explicit(MDBX_c11a_ro(uint32_t, p), mo_c11_load(order)) +#define atomic_store64(p, value, order) \ + ({ \ + const uint64_t value_to_store = (value); \ + atomic_store_explicit(MDBX_c11a_rw(uint64_t, p), value_to_store, \ + mo_c11_store(order)); \ + value_to_store; \ + }) +#define atomic_load64(p, order) \ + atomic_load_explicit(MDBX_c11a_ro(uint64_t, p), mo_c11_load(order)) +#endif /* LCC && MDBX_HAVE_C11ATOMICS */ -static __always_inline void node_set_flags(MDBX_node *const __restrict node, - uint8_t flags) { - UNALIGNED_POKE_8(node, MDBX_node, mn_flags, flags); +#ifndef atomic_store32 +MDBX_MAYBE_UNUSED static __always_inline uint32_t +atomic_store32(mdbx_atomic_uint32_t *p, const uint32_t value, + enum mdbx_memory_order order) { + STATIC_ASSERT(sizeof(mdbx_atomic_uint32_t) == 4); +#ifdef MDBX_HAVE_C11ATOMICS + assert(atomic_is_lock_free(MDBX_c11a_rw(uint32_t, p))); + atomic_store_explicit(MDBX_c11a_rw(uint32_t, p), value, mo_c11_store(order)); +#else /* MDBX_HAVE_C11ATOMICS */ + if (order != mo_Relaxed) + osal_compiler_barrier(); + p->weak = value; + osal_memory_fence(order, true); +#endif /* MDBX_HAVE_C11ATOMICS */ + return value; } +#endif /* atomic_store32 */ -/* Size of the node header, excluding dynamic data at the end */ -#define NODESIZE offsetof(MDBX_node, mn_data) - -/* Address of the key for the node */ -MDBX_NOTHROW_PURE_FUNCTION static __always_inline void * 
-node_key(const MDBX_node *const __restrict node) { - return ptr_disp(node, NODESIZE); +#ifndef atomic_load32 +MDBX_MAYBE_UNUSED static __always_inline uint32_t atomic_load32( + const volatile mdbx_atomic_uint32_t *p, enum mdbx_memory_order order) { + STATIC_ASSERT(sizeof(mdbx_atomic_uint32_t) == 4); +#ifdef MDBX_HAVE_C11ATOMICS + assert(atomic_is_lock_free(MDBX_c11a_ro(uint32_t, p))); + return atomic_load_explicit(MDBX_c11a_ro(uint32_t, p), mo_c11_load(order)); +#else /* MDBX_HAVE_C11ATOMICS */ + osal_memory_fence(order, false); + const uint32_t value = p->weak; + if (order != mo_Relaxed) + osal_compiler_barrier(); + return value; +#endif /* MDBX_HAVE_C11ATOMICS */ } +#endif /* atomic_load32 */ -/* Address of the data for a node */ -MDBX_NOTHROW_PURE_FUNCTION static __always_inline void * -node_data(const MDBX_node *const __restrict node) { - return ptr_disp(node_key(node), node_ks(node)); -} +/*------------------------------------------------------------------------------ + * safe read/write volatile 64-bit fields on 32-bit architectures. */ -/* Size of a node in a leaf page with a given key and data. - * This is node header plus key plus data size. */ -MDBX_NOTHROW_CONST_FUNCTION static __always_inline size_t -node_size_len(const size_t key_len, const size_t value_len) { - return NODESIZE + EVEN(key_len + value_len); -} -MDBX_NOTHROW_PURE_FUNCTION static __always_inline size_t -node_size(const MDBX_val *key, const MDBX_val *value) { - return node_size_len(key ? key->iov_len : 0, value ? value->iov_len : 0); -} +/* LY: for testing non-atomic 64-bit txnid on 32-bit arches. 
+ * #define xMDBX_TXNID_STEP (UINT32_MAX / 3) */ +#ifndef xMDBX_TXNID_STEP +#if MDBX_64BIT_CAS +#define xMDBX_TXNID_STEP 1u +#else +#define xMDBX_TXNID_STEP 2u +#endif +#endif /* xMDBX_TXNID_STEP */ -MDBX_NOTHROW_PURE_FUNCTION static __always_inline pgno_t -peek_pgno(const void *const __restrict ptr) { - if (sizeof(pgno_t) == sizeof(uint32_t)) - return (pgno_t)unaligned_peek_u32(1, ptr); - else if (sizeof(pgno_t) == sizeof(uint64_t)) - return (pgno_t)unaligned_peek_u64(1, ptr); - else { - pgno_t pgno; - memcpy(&pgno, ptr, sizeof(pgno)); - return pgno; - } +#ifndef atomic_store64 +MDBX_MAYBE_UNUSED static __always_inline uint64_t +atomic_store64(mdbx_atomic_uint64_t *p, const uint64_t value, + enum mdbx_memory_order order) { + STATIC_ASSERT(sizeof(mdbx_atomic_uint64_t) == 8); +#if MDBX_64BIT_ATOMIC +#if __GNUC_PREREQ(11, 0) + STATIC_ASSERT(__alignof__(mdbx_atomic_uint64_t) >= sizeof(uint64_t)); +#endif /* GNU C >= 11 */ +#ifdef MDBX_HAVE_C11ATOMICS + assert(atomic_is_lock_free(MDBX_c11a_rw(uint64_t, p))); + atomic_store_explicit(MDBX_c11a_rw(uint64_t, p), value, mo_c11_store(order)); +#else /* MDBX_HAVE_C11ATOMICS */ + if (order != mo_Relaxed) + osal_compiler_barrier(); + p->weak = value; + osal_memory_fence(order, true); +#endif /* MDBX_HAVE_C11ATOMICS */ +#else /* !MDBX_64BIT_ATOMIC */ + osal_compiler_barrier(); + atomic_store32(&p->low, (uint32_t)value, mo_Relaxed); + jitter4testing(true); + atomic_store32(&p->high, (uint32_t)(value >> 32), order); + jitter4testing(true); +#endif /* !MDBX_64BIT_ATOMIC */ + return value; } +#endif /* atomic_store64 */ -static __always_inline void poke_pgno(void *const __restrict ptr, - const pgno_t pgno) { - if (sizeof(pgno) == sizeof(uint32_t)) - unaligned_poke_u32(1, ptr, pgno); - else if (sizeof(pgno) == sizeof(uint64_t)) - unaligned_poke_u64(1, ptr, pgno); - else - memcpy(ptr, &pgno, sizeof(pgno)); +#ifndef atomic_load64 +MDBX_MAYBE_UNUSED static +#if MDBX_64BIT_ATOMIC + __always_inline +#endif /* MDBX_64BIT_ATOMIC */ + 
uint64_t + atomic_load64(const volatile mdbx_atomic_uint64_t *p, + enum mdbx_memory_order order) { + STATIC_ASSERT(sizeof(mdbx_atomic_uint64_t) == 8); +#if MDBX_64BIT_ATOMIC +#ifdef MDBX_HAVE_C11ATOMICS + assert(atomic_is_lock_free(MDBX_c11a_ro(uint64_t, p))); + return atomic_load_explicit(MDBX_c11a_ro(uint64_t, p), mo_c11_load(order)); +#else /* MDBX_HAVE_C11ATOMICS */ + osal_memory_fence(order, false); + const uint64_t value = p->weak; + if (order != mo_Relaxed) + osal_compiler_barrier(); + return value; +#endif /* MDBX_HAVE_C11ATOMICS */ +#else /* !MDBX_64BIT_ATOMIC */ + osal_compiler_barrier(); + uint64_t value = (uint64_t)atomic_load32(&p->high, order) << 32; + jitter4testing(true); + value |= atomic_load32(&p->low, (order == mo_Relaxed) ? mo_Relaxed + : mo_AcquireRelease); + jitter4testing(true); + for (;;) { + osal_compiler_barrier(); + uint64_t again = (uint64_t)atomic_load32(&p->high, order) << 32; + jitter4testing(true); + again |= atomic_load32(&p->low, (order == mo_Relaxed) ? 
mo_Relaxed + : mo_AcquireRelease); + jitter4testing(true); + if (likely(value == again)) + return value; + value = again; + } +#endif /* !MDBX_64BIT_ATOMIC */ } +#endif /* atomic_load64 */ -MDBX_NOTHROW_PURE_FUNCTION static __always_inline pgno_t -node_largedata_pgno(const MDBX_node *const __restrict node) { - assert(node_flags(node) & F_BIGDATA); - return peek_pgno(node_data(node)); +MDBX_MAYBE_UNUSED static __always_inline void atomic_yield(void) { +#if defined(_WIN32) || defined(_WIN64) + YieldProcessor(); +#elif defined(__ia32__) || defined(__e2k__) + __builtin_ia32_pause(); +#elif defined(__ia64__) +#if defined(__HP_cc__) || defined(__HP_aCC__) + _Asm_hint(_HINT_PAUSE); +#else + __asm__ __volatile__("hint @pause"); +#endif +#elif defined(__aarch64__) || (defined(__ARM_ARCH) && __ARM_ARCH > 6) || \ + defined(__ARM_ARCH_6K__) +#ifdef __CC_ARM + __yield(); +#else + __asm__ __volatile__("yield"); +#endif +#elif (defined(__mips64) || defined(__mips64__)) && defined(__mips_isa_rev) && \ + __mips_isa_rev >= 2 + __asm__ __volatile__("pause"); +#elif defined(__mips) || defined(__mips__) || defined(__mips64) || \ + defined(__mips64__) || defined(_M_MRX000) || defined(_MIPS_) || \ + defined(__MWERKS__) || defined(__sgi) + __asm__ __volatile__(".word 0x00000140"); +#elif defined(__linux__) || defined(__gnu_linux__) || defined(_UNIX03_SOURCE) + sched_yield(); +#elif (defined(_GNU_SOURCE) && __GLIBC_PREREQ(2, 1)) || defined(_OPEN_THREADS) + pthread_yield(); +#endif } -/*------------------------------------------------------------------------------ - * Nodes, Keys & Values length limitation factors: - * - * BRANCH_NODE_MAX - * Branch-page must contain at least two nodes, within each a key and a child - * page number. But page can't be split if it contains less that 4 keys, - * i.e. a page should not overflow before adding the fourth key. Therefore, - * at least 3 branch-node should fit in the single branch-page. 
Further, the - * first node of a branch-page doesn't contain a key, i.e. the first node - * is always require space just for itself. Thus: - * PAGEROOM = pagesize - page_hdr_len; - * BRANCH_NODE_MAX = even_floor( - * (PAGEROOM - sizeof(indx_t) - NODESIZE) / (3 - 1) - sizeof(indx_t)); - * KEYLEN_MAX = BRANCH_NODE_MAX - node_hdr_len; - * - * LEAF_NODE_MAX - * Leaf-node must fit into single leaf-page, where a value could be placed on - * a large/overflow page. However, may require to insert a nearly page-sized - * node between two large nodes are already fill-up a page. In this case the - * page must be split to two if some pair of nodes fits on one page, or - * otherwise the page should be split to the THREE with a single node - * per each of ones. Such 1-into-3 page splitting is costly and complex since - * requires TWO insertion into the parent page, that could lead to split it - * and so on up to the root. Therefore double-splitting is avoided here and - * the maximum node size is half of a leaf page space: - * LEAF_NODE_MAX = even_floor(PAGEROOM / 2 - sizeof(indx_t)); - * DATALEN_NO_OVERFLOW = LEAF_NODE_MAX - NODESIZE - KEYLEN_MAX; - * - * - SubDatabase-node must fit into one leaf-page: - * SUBDB_NAME_MAX = LEAF_NODE_MAX - node_hdr_len - sizeof(MDBX_db); - * - * - Dupsort values itself are a keys in a dupsort-subdb and couldn't be longer - * than the KEYLEN_MAX. 
But dupsort node must not great than LEAF_NODE_MAX, - * since dupsort value couldn't be placed on a large/overflow page: - * DUPSORT_DATALEN_MAX = min(KEYLEN_MAX, - * max(DATALEN_NO_OVERFLOW, sizeof(MDBX_db)); - */ - -#define PAGEROOM(pagesize) ((pagesize)-PAGEHDRSZ) -#define EVEN_FLOOR(n) ((n) & ~(size_t)1) -#define BRANCH_NODE_MAX(pagesize) \ - (EVEN_FLOOR((PAGEROOM(pagesize) - sizeof(indx_t) - NODESIZE) / (3 - 1) - \ - sizeof(indx_t))) -#define LEAF_NODE_MAX(pagesize) \ - (EVEN_FLOOR(PAGEROOM(pagesize) / 2) - sizeof(indx_t)) -#define MAX_GC1OVPAGE(pagesize) (PAGEROOM(pagesize) / sizeof(pgno_t) - 1) - -static __inline size_t keysize_max(size_t pagesize, MDBX_db_flags_t flags) { - assert(pagesize >= MIN_PAGESIZE && pagesize <= MAX_PAGESIZE && - is_powerof2(pagesize)); - STATIC_ASSERT(BRANCH_NODE_MAX(MIN_PAGESIZE) - NODESIZE >= 8); - if (flags & MDBX_INTEGERKEY) - return 8 /* sizeof(uint64_t) */; - - const intptr_t max_branch_key = BRANCH_NODE_MAX(pagesize) - NODESIZE; - STATIC_ASSERT(LEAF_NODE_MAX(MIN_PAGESIZE) - NODESIZE - - /* sizeof(uint64) as a key */ 8 > - sizeof(MDBX_db)); - if (flags & - (MDBX_DUPSORT | MDBX_DUPFIXED | MDBX_REVERSEDUP | MDBX_INTEGERDUP)) { - const intptr_t max_dupsort_leaf_key = - LEAF_NODE_MAX(pagesize) - NODESIZE - sizeof(MDBX_db); - return (max_branch_key < max_dupsort_leaf_key) ? 
max_branch_key - : max_dupsort_leaf_key; - } - return max_branch_key; +#if MDBX_64BIT_CAS +MDBX_MAYBE_UNUSED static __always_inline bool +atomic_cas64(mdbx_atomic_uint64_t *p, uint64_t c, uint64_t v) { +#ifdef MDBX_HAVE_C11ATOMICS + STATIC_ASSERT(sizeof(long long) >= sizeof(uint64_t)); + assert(atomic_is_lock_free(MDBX_c11a_rw(uint64_t, p))); + return atomic_compare_exchange_strong(MDBX_c11a_rw(uint64_t, p), &c, v); +#elif defined(__GNUC__) || defined(__clang__) + return __sync_bool_compare_and_swap(&p->weak, c, v); +#elif defined(_MSC_VER) + return c == (uint64_t)_InterlockedCompareExchange64( + (volatile __int64 *)&p->weak, v, c); +#elif defined(__APPLE__) + return OSAtomicCompareAndSwap64Barrier(c, v, &p->weak); +#else +#error FIXME: Unsupported compiler +#endif } +#endif /* MDBX_64BIT_CAS */ -static __inline size_t keysize_min(MDBX_db_flags_t flags) { - return (flags & MDBX_INTEGERKEY) ? 4 /* sizeof(uint32_t) */ : 0; +MDBX_MAYBE_UNUSED static __always_inline bool +atomic_cas32(mdbx_atomic_uint32_t *p, uint32_t c, uint32_t v) { +#ifdef MDBX_HAVE_C11ATOMICS + STATIC_ASSERT(sizeof(int) >= sizeof(uint32_t)); + assert(atomic_is_lock_free(MDBX_c11a_rw(uint32_t, p))); + return atomic_compare_exchange_strong(MDBX_c11a_rw(uint32_t, p), &c, v); +#elif defined(__GNUC__) || defined(__clang__) + return __sync_bool_compare_and_swap(&p->weak, c, v); +#elif defined(_MSC_VER) + STATIC_ASSERT(sizeof(volatile long) == sizeof(volatile uint32_t)); + return c == + (uint32_t)_InterlockedCompareExchange((volatile long *)&p->weak, v, c); +#elif defined(__APPLE__) + return OSAtomicCompareAndSwap32Barrier(c, v, &p->weak); +#else +#error FIXME: Unsupported compiler +#endif } -static __inline size_t valsize_min(MDBX_db_flags_t flags) { - if (flags & MDBX_INTEGERDUP) - return 4 /* sizeof(uint32_t) */; - else if (flags & MDBX_DUPFIXED) - return sizeof(indx_t); - else - return 0; +MDBX_MAYBE_UNUSED static __always_inline uint32_t +atomic_add32(mdbx_atomic_uint32_t *p, uint32_t v) { +#ifdef 
MDBX_HAVE_C11ATOMICS + STATIC_ASSERT(sizeof(int) >= sizeof(uint32_t)); + assert(atomic_is_lock_free(MDBX_c11a_rw(uint32_t, p))); + return atomic_fetch_add(MDBX_c11a_rw(uint32_t, p), v); +#elif defined(__GNUC__) || defined(__clang__) + return __sync_fetch_and_add(&p->weak, v); +#elif defined(_MSC_VER) + STATIC_ASSERT(sizeof(volatile long) == sizeof(volatile uint32_t)); + return (uint32_t)_InterlockedExchangeAdd((volatile long *)&p->weak, v); +#elif defined(__APPLE__) + return OSAtomicAdd32Barrier(v, &p->weak); +#else +#error FIXME: Unsupported compiler +#endif } -static __inline size_t valsize_max(size_t pagesize, MDBX_db_flags_t flags) { - assert(pagesize >= MIN_PAGESIZE && pagesize <= MAX_PAGESIZE && - is_powerof2(pagesize)); - - if (flags & MDBX_INTEGERDUP) - return 8 /* sizeof(uint64_t) */; - - if (flags & (MDBX_DUPSORT | MDBX_DUPFIXED | MDBX_REVERSEDUP)) - return keysize_max(pagesize, 0); - - const unsigned page_ln2 = log2n_powerof2(pagesize); - const size_t hard = 0x7FF00000ul; - const size_t hard_pages = hard >> page_ln2; - STATIC_ASSERT(MDBX_PGL_LIMIT <= MAX_PAGENO); - const size_t pages_limit = MDBX_PGL_LIMIT / 4; - const size_t limit = - (hard_pages < pages_limit) ? hard : (pages_limit << page_ln2); - return (limit < MAX_MAPSIZE / 2) ? 
limit : MAX_MAPSIZE / 2; -} +#define atomic_sub32(p, v) atomic_add32(p, 0 - (v)) -__cold int mdbx_env_get_maxkeysize(const MDBX_env *env) { - return mdbx_env_get_maxkeysize_ex(env, MDBX_DUPSORT); +MDBX_MAYBE_UNUSED static __always_inline uint64_t +safe64_txnid_next(uint64_t txnid) { + txnid += xMDBX_TXNID_STEP; +#if !MDBX_64BIT_CAS + /* avoid overflow of low-part in safe64_reset() */ + txnid += (UINT32_MAX == (uint32_t)txnid); +#endif + return txnid; } -__cold int mdbx_env_get_maxkeysize_ex(const MDBX_env *env, - MDBX_db_flags_t flags) { - if (unlikely(!env || env->me_signature.weak != MDBX_ME_SIGNATURE)) - return -1; - - return (int)mdbx_limits_keysize_max((intptr_t)env->me_psize, flags); +/* Atomically make target value >= SAFE64_INVALID_THRESHOLD */ +MDBX_MAYBE_UNUSED static __always_inline void +safe64_reset(mdbx_atomic_uint64_t *p, bool single_writer) { + if (single_writer) { +#if MDBX_64BIT_ATOMIC && MDBX_WORDBITS >= 64 + atomic_store64(p, UINT64_MAX, mo_AcquireRelease); +#else + atomic_store32(&p->high, UINT32_MAX, mo_AcquireRelease); +#endif /* MDBX_64BIT_ATOMIC && MDBX_WORDBITS >= 64 */ + } else { +#if MDBX_64BIT_CAS && MDBX_64BIT_ATOMIC + /* atomically make value >= SAFE64_INVALID_THRESHOLD by 64-bit operation */ + atomic_store64(p, UINT64_MAX, mo_AcquireRelease); +#elif MDBX_64BIT_CAS + /* atomically make value >= SAFE64_INVALID_THRESHOLD by 32-bit operation */ + atomic_store32(&p->high, UINT32_MAX, mo_AcquireRelease); +#else + /* it is safe to increment low-part to avoid ABA, since xMDBX_TXNID_STEP > 1 + * and overflow was preserved in safe64_txnid_next() */ + STATIC_ASSERT(xMDBX_TXNID_STEP > 1); + atomic_add32(&p->low, 1) /* avoid ABA in safe64_reset_compare() */; + atomic_store32(&p->high, UINT32_MAX, mo_AcquireRelease); + atomic_add32(&p->low, 1) /* avoid ABA in safe64_reset_compare() */; +#endif /* MDBX_64BIT_CAS && MDBX_64BIT_ATOMIC */ + } + assert(p->weak >= SAFE64_INVALID_THRESHOLD); + jitter4testing(true); } -size_t mdbx_default_pagesize(void) { 
- size_t pagesize = osal_syspagesize(); - ENSURE(nullptr, is_powerof2(pagesize)); - pagesize = (pagesize >= MIN_PAGESIZE) ? pagesize : MIN_PAGESIZE; - pagesize = (pagesize <= MAX_PAGESIZE) ? pagesize : MAX_PAGESIZE; - return pagesize; +MDBX_MAYBE_UNUSED static __always_inline bool +safe64_reset_compare(mdbx_atomic_uint64_t *p, uint64_t compare) { + /* LY: This function is used to reset `txnid` from hsr-handler in case + * the asynchronously cancellation of read transaction. Therefore, + * there may be a collision between the cleanup performed here and + * asynchronous termination and restarting of the read transaction + * in another process/thread. In general we MUST NOT reset the `txnid` + * if a new transaction was started (i.e. if `txnid` was changed). */ +#if MDBX_64BIT_CAS + bool rc = atomic_cas64(p, compare, UINT64_MAX); +#else + /* LY: There is no gold ratio here since shared mutex is too costly, + * in such way we must acquire/release it for every update of txnid, + * i.e. twice for each read transaction). 
*/ + bool rc = false; + if (likely(atomic_load32(&p->low, mo_AcquireRelease) == (uint32_t)compare && + atomic_cas32(&p->high, (uint32_t)(compare >> 32), UINT32_MAX))) { + if (unlikely(atomic_load32(&p->low, mo_AcquireRelease) != + (uint32_t)compare)) + atomic_cas32(&p->high, UINT32_MAX, (uint32_t)(compare >> 32)); + else + rc = true; + } +#endif /* MDBX_64BIT_CAS */ + jitter4testing(true); + return rc; } -__cold intptr_t mdbx_limits_keysize_max(intptr_t pagesize, - MDBX_db_flags_t flags) { - if (pagesize < 1) - pagesize = (intptr_t)mdbx_default_pagesize(); - if (unlikely(pagesize < (intptr_t)MIN_PAGESIZE || - pagesize > (intptr_t)MAX_PAGESIZE || - !is_powerof2((size_t)pagesize))) - return -1; - - return keysize_max(pagesize, flags); -} - -__cold intptr_t mdbx_limits_keysize_min(MDBX_db_flags_t flags) { - return keysize_min(flags); -} - -__cold int mdbx_env_get_maxvalsize_ex(const MDBX_env *env, - MDBX_db_flags_t flags) { - if (unlikely(!env || env->me_signature.weak != MDBX_ME_SIGNATURE)) - return -1; - - return (int)mdbx_limits_valsize_max((intptr_t)env->me_psize, flags); +MDBX_MAYBE_UNUSED static __always_inline void +safe64_write(mdbx_atomic_uint64_t *p, const uint64_t v) { + assert(p->weak >= SAFE64_INVALID_THRESHOLD); +#if MDBX_64BIT_ATOMIC && MDBX_64BIT_CAS + atomic_store64(p, v, mo_AcquireRelease); +#else /* MDBX_64BIT_ATOMIC */ + osal_compiler_barrier(); + /* update low-part but still value >= SAFE64_INVALID_THRESHOLD */ + atomic_store32(&p->low, (uint32_t)v, mo_Relaxed); + assert(p->weak >= SAFE64_INVALID_THRESHOLD); + jitter4testing(true); + /* update high-part from SAFE64_INVALID_THRESHOLD to actual value */ + atomic_store32(&p->high, (uint32_t)(v >> 32), mo_AcquireRelease); +#endif /* MDBX_64BIT_ATOMIC */ + assert(p->weak == v); + jitter4testing(true); } -__cold intptr_t mdbx_limits_valsize_max(intptr_t pagesize, - MDBX_db_flags_t flags) { - if (pagesize < 1) - pagesize = (intptr_t)mdbx_default_pagesize(); - if (unlikely(pagesize < 
(intptr_t)MIN_PAGESIZE || - pagesize > (intptr_t)MAX_PAGESIZE || - !is_powerof2((size_t)pagesize))) - return -1; - - return valsize_max(pagesize, flags); +MDBX_MAYBE_UNUSED static __always_inline uint64_t +safe64_read(const mdbx_atomic_uint64_t *p) { + jitter4testing(true); + uint64_t v; + do + v = atomic_load64(p, mo_AcquireRelease); + while (!MDBX_64BIT_ATOMIC && unlikely(v != p->weak)); + return v; } -__cold intptr_t mdbx_limits_valsize_min(MDBX_db_flags_t flags) { - return valsize_min(flags); +#if 0 /* unused for now */ +MDBX_MAYBE_UNUSED static __always_inline bool safe64_is_valid(uint64_t v) { +#if MDBX_WORDBITS >= 64 + return v < SAFE64_INVALID_THRESHOLD; +#else + return (v >> 32) != UINT32_MAX; +#endif /* MDBX_WORDBITS */ } -__cold intptr_t mdbx_limits_pairsize4page_max(intptr_t pagesize, - MDBX_db_flags_t flags) { - if (pagesize < 1) - pagesize = (intptr_t)mdbx_default_pagesize(); - if (unlikely(pagesize < (intptr_t)MIN_PAGESIZE || - pagesize > (intptr_t)MAX_PAGESIZE || - !is_powerof2((size_t)pagesize))) - return -1; - - if (flags & - (MDBX_DUPSORT | MDBX_DUPFIXED | MDBX_INTEGERDUP | MDBX_REVERSEDUP)) - return BRANCH_NODE_MAX(pagesize) - NODESIZE; - - return LEAF_NODE_MAX(pagesize) - NODESIZE; +MDBX_MAYBE_UNUSED static __always_inline bool + safe64_is_valid_ptr(const mdbx_atomic_uint64_t *p) { +#if MDBX_64BIT_ATOMIC + return atomic_load64(p, mo_AcquireRelease) < SAFE64_INVALID_THRESHOLD; +#else + return atomic_load32(&p->high, mo_AcquireRelease) != UINT32_MAX; +#endif /* MDBX_64BIT_ATOMIC */ } +#endif /* unused for now */ -__cold int mdbx_env_get_pairsize4page_max(const MDBX_env *env, - MDBX_db_flags_t flags) { - if (unlikely(!env || env->me_signature.weak != MDBX_ME_SIGNATURE)) - return -1; - - return (int)mdbx_limits_pairsize4page_max((intptr_t)env->me_psize, flags); +/* non-atomic write with safety for reading a half-updated value */ +MDBX_MAYBE_UNUSED static __always_inline void +safe64_update(mdbx_atomic_uint64_t *p, const uint64_t v) { +#if 
MDBX_64BIT_ATOMIC + atomic_store64(p, v, mo_Relaxed); +#else + safe64_reset(p, true); + safe64_write(p, v); +#endif /* MDBX_64BIT_ATOMIC */ } -__cold intptr_t mdbx_limits_valsize4page_max(intptr_t pagesize, - MDBX_db_flags_t flags) { - if (pagesize < 1) - pagesize = (intptr_t)mdbx_default_pagesize(); - if (unlikely(pagesize < (intptr_t)MIN_PAGESIZE || - pagesize > (intptr_t)MAX_PAGESIZE || - !is_powerof2((size_t)pagesize))) - return -1; - - if (flags & - (MDBX_DUPSORT | MDBX_DUPFIXED | MDBX_INTEGERDUP | MDBX_REVERSEDUP)) - return valsize_max(pagesize, flags); - - return PAGEROOM(pagesize); +/* non-atomic increment with safety for reading a half-updated value */ +MDBX_MAYBE_UNUSED static +#if MDBX_64BIT_ATOMIC + __always_inline +#endif /* MDBX_64BIT_ATOMIC */ + void + safe64_inc(mdbx_atomic_uint64_t *p, const uint64_t v) { + assert(v > 0); + safe64_update(p, safe64_read(p) + v); } -__cold int mdbx_env_get_valsize4page_max(const MDBX_env *env, - MDBX_db_flags_t flags) { - if (unlikely(!env || env->me_signature.weak != MDBX_ME_SIGNATURE)) - return -1; - - return (int)mdbx_limits_valsize4page_max((intptr_t)env->me_psize, flags); -} +#endif /* !__cplusplus */ -/* Calculate the size of a leaf node. - * - * The size depends on the environment's page size; if a data item - * is too large it will be put onto an large/overflow page and the node - * size will only include the key and not the data. Sizes are always - * rounded up to an even number of bytes, to guarantee 2-byte alignment - * of the MDBX_node headers. */ -MDBX_NOTHROW_PURE_FUNCTION static __always_inline size_t -leaf_size(const MDBX_env *env, const MDBX_val *key, const MDBX_val *data) { - size_t node_bytes = node_size(key, data); - if (node_bytes > env->me_leaf_nodemax) { - /* put on large/overflow page */ - node_bytes = node_size_len(key->iov_len, 0) + sizeof(pgno_t); - } - return node_bytes + sizeof(indx_t); -} -/* Calculate the size of a branch node. 
- * - * The size should depend on the environment's page size but since - * we currently don't support spilling large keys onto large/overflow - * pages, it's simply the size of the MDBX_node header plus the - * size of the key. Sizes are always rounded up to an even number - * of bytes, to guarantee 2-byte alignment of the MDBX_node headers. - * - * [in] env The environment handle. - * [in] key The key for the node. - * - * Returns The number of bytes needed to store the node. */ -MDBX_NOTHROW_PURE_FUNCTION static __always_inline size_t -branch_size(const MDBX_env *env, const MDBX_val *key) { - /* Size of a node in a branch page with a given key. - * This is just the node header plus the key, there is no data. */ - size_t node_bytes = node_size(key, nullptr); - if (unlikely(node_bytes > env->me_branch_nodemax)) { - /* put on large/overflow page */ - /* not implemented */ - mdbx_panic("node_size(key) %zu > %u branch_nodemax", node_bytes, - env->me_branch_nodemax); - node_bytes = node_size(key, nullptr) + sizeof(pgno_t); - } +/* Internal prototypes */ - return node_bytes + sizeof(indx_t); +/* audit.c */ +MDBX_INTERNAL int audit_ex(MDBX_txn *txn, size_t retired_stored, + bool dont_filter_gc); + +/* mvcc-readers.c */ +MDBX_INTERNAL bsr_t mvcc_bind_slot(MDBX_env *env, const uintptr_t tid); +MDBX_MAYBE_UNUSED MDBX_INTERNAL pgno_t mvcc_largest_this(MDBX_env *env, + pgno_t largest); +MDBX_INTERNAL txnid_t mvcc_shapshot_oldest(MDBX_env *const env, + const txnid_t steady); +MDBX_INTERNAL pgno_t mvcc_snapshot_largest(const MDBX_env *env, + pgno_t last_used_page); +MDBX_INTERNAL txnid_t mvcc_kick_laggards(MDBX_env *env, + const txnid_t straggler); +MDBX_INTERNAL int mvcc_cleanup_dead(MDBX_env *env, int rlocked, int *dead); +MDBX_INTERNAL txnid_t mvcc_kick_laggards(MDBX_env *env, const txnid_t laggard); + +/* dxb.c */ +MDBX_INTERNAL int dxb_setup(MDBX_env *env, const int lck_rc, + const mdbx_mode_t mode_bits); +MDBX_INTERNAL int __must_check_result +dxb_read_header(MDBX_env 
*env, meta_t *meta, const int lck_exclusive, + const mdbx_mode_t mode_bits); +enum resize_mode { implicit_grow, impilict_shrink, explicit_resize }; +MDBX_INTERNAL int __must_check_result dxb_resize(MDBX_env *const env, + const pgno_t used_pgno, + const pgno_t size_pgno, + pgno_t limit_pgno, + const enum resize_mode mode); +MDBX_INTERNAL int dxb_set_readahead(const MDBX_env *env, const pgno_t edge, + const bool enable, const bool force_whole); +MDBX_INTERNAL int __must_check_result dxb_sync_locked(MDBX_env *env, + unsigned flags, + meta_t *const pending, + troika_t *const troika); +#if defined(ENABLE_MEMCHECK) || defined(__SANITIZE_ADDRESS__) +MDBX_INTERNAL void dxb_sanitize_tail(MDBX_env *env, MDBX_txn *txn); +#else +static inline void dxb_sanitize_tail(MDBX_env *env, MDBX_txn *txn) { + (void)env; + (void)txn; } +#endif /* ENABLE_MEMCHECK || __SANITIZE_ADDRESS__ */ -MDBX_NOTHROW_CONST_FUNCTION static __always_inline uint16_t -flags_db2sub(uint16_t db_flags) { - uint16_t sub_flags = db_flags & MDBX_DUPFIXED; - - /* MDBX_INTEGERDUP => MDBX_INTEGERKEY */ -#define SHIFT_INTEGERDUP_TO_INTEGERKEY 2 - STATIC_ASSERT((MDBX_INTEGERDUP >> SHIFT_INTEGERDUP_TO_INTEGERKEY) == - MDBX_INTEGERKEY); - sub_flags |= (db_flags & MDBX_INTEGERDUP) >> SHIFT_INTEGERDUP_TO_INTEGERKEY; +/* txn.c */ +MDBX_INTERNAL bool txn_refund(MDBX_txn *txn); +MDBX_INTERNAL txnid_t txn_snapshot_oldest(const MDBX_txn *const txn); +MDBX_INTERNAL int txn_abort(MDBX_txn *txn); +MDBX_INTERNAL int txn_renew(MDBX_txn *txn, unsigned flags); - /* MDBX_REVERSEDUP => MDBX_REVERSEKEY */ -#define SHIFT_REVERSEDUP_TO_REVERSEKEY 5 - STATIC_ASSERT((MDBX_REVERSEDUP >> SHIFT_REVERSEDUP_TO_REVERSEKEY) == - MDBX_REVERSEKEY); - sub_flags |= (db_flags & MDBX_REVERSEDUP) >> SHIFT_REVERSEDUP_TO_REVERSEKEY; +#define TXN_END_NAMES \ + {"committed", "empty-commit", "abort", "reset", \ + "reset-tmp", "fail-begin", "fail-beginchild"} +enum { + /* txn_end operation number, for logging */ + TXN_END_COMMITTED, + TXN_END_PURE_COMMIT, + 
TXN_END_ABORT, + TXN_END_RESET, + TXN_END_RESET_TMP, + TXN_END_FAIL_BEGIN, + TXN_END_FAIL_BEGINCHILD, - return sub_flags; -} + TXN_END_OPMASK = 0x0F /* mask for txn_end() operation number */, + TXN_END_UPDATE = 0x10 /* update env state (DBIs) */, + TXN_END_FREE = 0x20 /* free txn unless it is env.basal_txn */, + TXN_END_EOTDONE = 0x40 /* txn's cursors already closed */, + TXN_END_SLOT = 0x80 /* release any reader slot if NOSTICKYTHREADS */ +}; +MDBX_INTERNAL int txn_end(MDBX_txn *txn, const unsigned mode); +MDBX_INTERNAL int txn_write(MDBX_txn *txn, iov_ctx_t *ctx); + +/* env.c */ +MDBX_INTERNAL int env_open(MDBX_env *env, mdbx_mode_t mode); +MDBX_INTERNAL int env_info(const MDBX_env *env, const MDBX_txn *txn, + MDBX_envinfo *out, size_t bytes, troika_t *troika); +MDBX_INTERNAL int env_sync(MDBX_env *env, bool force, bool nonblock); +MDBX_INTERNAL int env_close(MDBX_env *env, bool resurrect_after_fork); +MDBX_INTERNAL bool env_txn0_owned(const MDBX_env *env); +MDBX_INTERNAL void env_options_init(MDBX_env *env); +MDBX_INTERNAL void env_options_adjust_defaults(MDBX_env *env); +MDBX_INTERNAL int __must_check_result env_page_auxbuffer(MDBX_env *env); +MDBX_INTERNAL unsigned env_setup_pagesize(MDBX_env *env, const size_t pagesize); + +/* tree.c */ +MDBX_INTERNAL int tree_drop(MDBX_cursor *mc, const bool may_have_subDBs); +MDBX_INTERNAL int __must_check_result tree_rebalance(MDBX_cursor *mc); +MDBX_INTERNAL int __must_check_result tree_propagate_key(MDBX_cursor *mc, + const MDBX_val *key); +MDBX_INTERNAL void recalculate_merge_thresholds(MDBX_env *env); +MDBX_INTERNAL void recalculate_subpage_thresholds(MDBX_env *env); + +/* subdb.c */ +MDBX_INTERNAL int __must_check_result sdb_fetch(MDBX_txn *txn, size_t dbi); +MDBX_INTERNAL int __must_check_result sdb_setup(const MDBX_env *env, + kvx_t *const kvx, + const tree_t *const db); + +/* coherency.c */ +MDBX_INTERNAL bool coherency_check_meta(const MDBX_env *env, + const volatile meta_t *meta, + bool report); +MDBX_INTERNAL 
int coherency_check_head(MDBX_txn *txn, const meta_ptr_t head, + uint64_t *timestamp); +MDBX_INTERNAL int coherency_check_written(const MDBX_env *env, + const txnid_t txnid, + const volatile meta_t *meta, + const intptr_t pgno, + uint64_t *timestamp); +MDBX_INTERNAL int coherency_timeout(uint64_t *timestamp, intptr_t pgno, + const MDBX_env *env); + + + +/* List of txnid */ +typedef txnid_t *txl_t; +typedef const txnid_t *const_txl_t; + +enum txl_rules { + txl_granulate = 32, + txl_initial = + txl_granulate - 2 - MDBX_ASSUME_MALLOC_OVERHEAD / sizeof(txnid_t), + txl_max = (1u << 26) - 2 - MDBX_ASSUME_MALLOC_OVERHEAD / sizeof(txnid_t) +}; -/*----------------------------------------------------------------------------*/ +MDBX_INTERNAL txl_t txl_alloc(void); -MDBX_NOTHROW_PURE_FUNCTION static __always_inline size_t -pgno2bytes(const MDBX_env *env, size_t pgno) { - eASSERT(env, (1u << env->me_psize2log) == env->me_psize); - return ((size_t)pgno) << env->me_psize2log; -} +MDBX_INTERNAL void txl_free(txl_t txl); -MDBX_NOTHROW_PURE_FUNCTION static __always_inline MDBX_page * -pgno2page(const MDBX_env *env, size_t pgno) { - return ptr_disp(env->me_map, pgno2bytes(env, pgno)); -} +MDBX_INTERNAL int __must_check_result txl_append(txl_t __restrict *ptxl, + txnid_t id); -MDBX_NOTHROW_PURE_FUNCTION static __always_inline pgno_t -bytes2pgno(const MDBX_env *env, size_t bytes) { - eASSERT(env, (env->me_psize >> env->me_psize2log) == 1); - return (pgno_t)(bytes >> env->me_psize2log); -} +MDBX_INTERNAL void txl_sort(txl_t txl); -MDBX_NOTHROW_PURE_FUNCTION static size_t -pgno_align2os_bytes(const MDBX_env *env, size_t pgno) { - return ceil_powerof2(pgno2bytes(env, pgno), env->me_os_psize); -} -MDBX_NOTHROW_PURE_FUNCTION static pgno_t pgno_align2os_pgno(const MDBX_env *env, - size_t pgno) { - return bytes2pgno(env, pgno_align2os_bytes(env, pgno)); -} +/*------------------------------------------------------------------------------ + * Unaligned access */ -MDBX_NOTHROW_PURE_FUNCTION 
static size_t -bytes_align2os_bytes(const MDBX_env *env, size_t bytes) { - return ceil_powerof2(ceil_powerof2(bytes, env->me_psize), env->me_os_psize); +MDBX_MAYBE_UNUSED MDBX_NOTHROW_CONST_FUNCTION static inline size_t +field_alignment(size_t alignment_baseline, size_t field_offset) { + size_t merge = alignment_baseline | (size_t)field_offset; + return merge & -(int)merge; } -/* Address of first usable data byte in a page, after the header */ -MDBX_NOTHROW_PURE_FUNCTION static __always_inline void * -page_data(const MDBX_page *mp) { - return ptr_disp(mp, PAGEHDRSZ); +/* read-thunk for UB-sanitizer */ +MDBX_NOTHROW_PURE_FUNCTION static inline uint8_t +peek_u8(const uint8_t *__restrict ptr) { + return *ptr; } -MDBX_NOTHROW_PURE_FUNCTION static __always_inline const MDBX_page * -data_page(const void *data) { - return container_of(data, MDBX_page, mp_ptrs); +/* write-thunk for UB-sanitizer */ +static inline void poke_u8(uint8_t *__restrict ptr, const uint8_t v) { + *ptr = v; } -MDBX_NOTHROW_PURE_FUNCTION static __always_inline MDBX_meta * -page_meta(MDBX_page *mp) { - return (MDBX_meta *)page_data(mp); +static inline void *bcopy_2(void *__restrict dst, const void *__restrict src) { + uint8_t *__restrict d = (uint8_t *)dst; + const uint8_t *__restrict s = (uint8_t *)src; + d[0] = s[0]; + d[1] = s[1]; + return d; +} + +static inline void *bcopy_4(void *const __restrict dst, + const void *const __restrict src) { + uint8_t *__restrict d = (uint8_t *)dst; + const uint8_t *__restrict s = (uint8_t *)src; + d[0] = s[0]; + d[1] = s[1]; + d[2] = s[2]; + d[3] = s[3]; + return d; +} + +static inline void *bcopy_8(void *const __restrict dst, + const void *const __restrict src) { + uint8_t *__restrict d = (uint8_t *)dst; + const uint8_t *__restrict s = (uint8_t *)src; + d[0] = s[0]; + d[1] = s[1]; + d[2] = s[2]; + d[3] = s[3]; + d[4] = s[4]; + d[5] = s[5]; + d[6] = s[6]; + d[7] = s[7]; + return d; +} + +MDBX_NOTHROW_PURE_FUNCTION static inline uint16_t +unaligned_peek_u16(const 
size_t expected_alignment, const void *const ptr) { + assert((uintptr_t)ptr % expected_alignment == 0); + if (MDBX_UNALIGNED_OK >= 2 || (expected_alignment % sizeof(uint16_t)) == 0) + return *(const uint16_t *)ptr; + else { +#if defined(__unaligned) || defined(_M_ARM) || defined(_M_ARM64) || \ + defined(_M_X64) || defined(_M_IA64) + return *(const __unaligned uint16_t *)ptr; +#else + uint16_t v; + bcopy_2((uint8_t *)&v, (const uint8_t *)ptr); + return v; +#endif /* _MSC_VER || __unaligned */ + } } -/* Number of nodes on a page */ -MDBX_NOTHROW_PURE_FUNCTION static __always_inline size_t -page_numkeys(const MDBX_page *mp) { - return mp->mp_lower >> 1; +static inline void unaligned_poke_u16(const size_t expected_alignment, + void *const __restrict ptr, + const uint16_t v) { + assert((uintptr_t)ptr % expected_alignment == 0); + if (MDBX_UNALIGNED_OK >= 2 || (expected_alignment % sizeof(v)) == 0) + *(uint16_t *)ptr = v; + else { +#if defined(__unaligned) || defined(_M_ARM) || defined(_M_ARM64) || \ + defined(_M_X64) || defined(_M_IA64) + *((uint16_t __unaligned *)ptr) = v; +#else + bcopy_2((uint8_t *)ptr, (const uint8_t *)&v); +#endif /* _MSC_VER || __unaligned */ + } } -/* The amount of space remaining in the page */ -MDBX_NOTHROW_PURE_FUNCTION static __always_inline size_t -page_room(const MDBX_page *mp) { - return mp->mp_upper - mp->mp_lower; +MDBX_NOTHROW_PURE_FUNCTION static inline uint32_t +unaligned_peek_u32(const size_t expected_alignment, + const void *const __restrict ptr) { + assert((uintptr_t)ptr % expected_alignment == 0); + if (MDBX_UNALIGNED_OK >= 4 || (expected_alignment % sizeof(uint32_t)) == 0) + return *(const uint32_t *)ptr; + else if ((expected_alignment % sizeof(uint16_t)) == 0) { + const uint16_t lo = + ((const uint16_t *)ptr)[__BYTE_ORDER__ != __ORDER_LITTLE_ENDIAN__]; + const uint16_t hi = + ((const uint16_t *)ptr)[__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__]; + return lo | (uint32_t)hi << 16; + } else { +#if defined(__unaligned) || 
defined(_M_ARM) || defined(_M_ARM64) || \ + defined(_M_X64) || defined(_M_IA64) + return *(const __unaligned uint32_t *)ptr; +#else + uint32_t v; + bcopy_4((uint8_t *)&v, (const uint8_t *)ptr); + return v; +#endif /* _MSC_VER || __unaligned */ + } } -/* Maximum free space in an empty page */ -MDBX_NOTHROW_PURE_FUNCTION static __always_inline size_t -page_space(const MDBX_env *env) { - STATIC_ASSERT(PAGEHDRSZ % 2 == 0); - return env->me_psize - PAGEHDRSZ; +static inline void unaligned_poke_u32(const size_t expected_alignment, + void *const __restrict ptr, + const uint32_t v) { + assert((uintptr_t)ptr % expected_alignment == 0); + if (MDBX_UNALIGNED_OK >= 4 || (expected_alignment % sizeof(v)) == 0) + *(uint32_t *)ptr = v; + else if ((expected_alignment % sizeof(uint16_t)) == 0) { + ((uint16_t *)ptr)[__BYTE_ORDER__ != __ORDER_LITTLE_ENDIAN__] = (uint16_t)v; + ((uint16_t *)ptr)[__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__] = + (uint16_t)(v >> 16); + } else { +#if defined(__unaligned) || defined(_M_ARM) || defined(_M_ARM64) || \ + defined(_M_X64) || defined(_M_IA64) + *((uint32_t __unaligned *)ptr) = v; +#else + bcopy_4((uint8_t *)ptr, (const uint8_t *)&v); +#endif /* _MSC_VER || __unaligned */ + } } -MDBX_NOTHROW_PURE_FUNCTION static __always_inline size_t -page_used(const MDBX_env *env, const MDBX_page *mp) { - return page_space(env) - page_room(mp); +MDBX_NOTHROW_PURE_FUNCTION static inline uint64_t +unaligned_peek_u64(const size_t expected_alignment, + const void *const __restrict ptr) { + assert((uintptr_t)ptr % expected_alignment == 0); + if (MDBX_UNALIGNED_OK >= 8 || (expected_alignment % sizeof(uint64_t)) == 0) + return *(const uint64_t *)ptr; + else if ((expected_alignment % sizeof(uint32_t)) == 0) { + const uint32_t lo = + ((const uint32_t *)ptr)[__BYTE_ORDER__ != __ORDER_LITTLE_ENDIAN__]; + const uint32_t hi = + ((const uint32_t *)ptr)[__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__]; + return lo | (uint64_t)hi << 32; + } else { +#if defined(__unaligned) || 
defined(_M_ARM) || defined(_M_ARM64) || \ + defined(_M_X64) || defined(_M_IA64) + return *(const __unaligned uint64_t *)ptr; +#else + uint64_t v; + bcopy_8((uint8_t *)&v, (const uint8_t *)ptr); + return v; +#endif /* _MSC_VER || __unaligned */ + } } -/* The percentage of space used in the page, in a percents. */ -MDBX_MAYBE_UNUSED MDBX_NOTHROW_PURE_FUNCTION static __inline double -page_fill(const MDBX_env *env, const MDBX_page *mp) { - return page_used(env, mp) * 100.0 / page_space(env); -} - -/* The number of large/overflow pages needed to store the given size. */ -MDBX_NOTHROW_PURE_FUNCTION static __always_inline pgno_t -number_of_ovpages(const MDBX_env *env, size_t bytes) { - return bytes2pgno(env, PAGEHDRSZ - 1 + bytes) + 1; +static inline uint64_t +unaligned_peek_u64_volatile(const size_t expected_alignment, + const volatile void *const __restrict ptr) { + assert((uintptr_t)ptr % expected_alignment == 0); + assert(expected_alignment % sizeof(uint32_t) == 0); + if (MDBX_UNALIGNED_OK >= 8 || (expected_alignment % sizeof(uint64_t)) == 0) + return *(const volatile uint64_t *)ptr; + else { +#if defined(__unaligned) || defined(_M_ARM) || defined(_M_ARM64) || \ + defined(_M_X64) || defined(_M_IA64) + return *(const volatile __unaligned uint64_t *)ptr; +#else + const uint32_t lo = ((const volatile uint32_t *) + ptr)[__BYTE_ORDER__ != __ORDER_LITTLE_ENDIAN__]; + const uint32_t hi = ((const volatile uint32_t *) + ptr)[__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__]; + return lo | (uint64_t)hi << 32; +#endif /* _MSC_VER || __unaligned */ + } } -__cold static const char *pagetype_caption(const uint8_t type, - char buf4unknown[16]) { - switch (type) { - case P_BRANCH: - return "branch"; - case P_LEAF: - return "leaf"; - case P_LEAF | P_SUBP: - return "subleaf"; - case P_LEAF | P_LEAF2: - return "dupfixed-leaf"; - case P_LEAF | P_LEAF2 | P_SUBP: - return "dupfixed-subleaf"; - case P_LEAF | P_LEAF2 | P_SUBP | P_LEGACY_DIRTY: - return "dupfixed-subleaf.legacy-dirty"; - case 
P_OVERFLOW: - return "large"; - default: - snprintf(buf4unknown, 16, "unknown_0x%x", type); - return buf4unknown; +static inline void unaligned_poke_u64(const size_t expected_alignment, + void *const __restrict ptr, + const uint64_t v) { + assert((uintptr_t)ptr % expected_alignment == 0); + if (MDBX_UNALIGNED_OK >= 8 || (expected_alignment % sizeof(v)) == 0) + *(uint64_t *)ptr = v; + else if ((expected_alignment % sizeof(uint32_t)) == 0) { + ((uint32_t *)ptr)[__BYTE_ORDER__ != __ORDER_LITTLE_ENDIAN__] = (uint32_t)v; + ((uint32_t *)ptr)[__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__] = + (uint32_t)(v >> 32); + } else { +#if defined(__unaligned) || defined(_M_ARM) || defined(_M_ARM64) || \ + defined(_M_X64) || defined(_M_IA64) + *((uint64_t __unaligned *)ptr) = v; +#else + bcopy_8((uint8_t *)ptr, (const uint8_t *)&v); +#endif /* _MSC_VER || __unaligned */ } } -__cold static int MDBX_PRINTF_ARGS(2, 3) - bad_page(const MDBX_page *mp, const char *fmt, ...) { - if (LOG_ENABLED(MDBX_LOG_ERROR)) { - static const MDBX_page *prev; - if (prev != mp) { - char buf4unknown[16]; - prev = mp; - debug_log(MDBX_LOG_ERROR, "badpage", 0, - "corrupted %s-page #%u, mod-txnid %" PRIaTXN "\n", - pagetype_caption(PAGETYPE_WHOLE(mp), buf4unknown), mp->mp_pgno, - mp->mp_txnid); - } +#define UNALIGNED_PEEK_8(ptr, struct, field) \ + peek_u8(ptr_disp(ptr, offsetof(struct, field))) +#define UNALIGNED_POKE_8(ptr, struct, field, value) \ + poke_u8(ptr_disp(ptr, offsetof(struct, field)), value) - va_list args; - va_start(args, fmt); - debug_log_va(MDBX_LOG_ERROR, "badpage", 0, fmt, args); - va_end(args); - } - return MDBX_CORRUPTED; -} +#define UNALIGNED_PEEK_16(ptr, struct, field) \ + unaligned_peek_u16(1, ptr_disp(ptr, offsetof(struct, field))) +#define UNALIGNED_POKE_16(ptr, struct, field, value) \ + unaligned_poke_u16(1, ptr_disp(ptr, offsetof(struct, field)), value) -__cold static void MDBX_PRINTF_ARGS(2, 3) - poor_page(const MDBX_page *mp, const char *fmt, ...) 
{ - if (LOG_ENABLED(MDBX_LOG_NOTICE)) { - static const MDBX_page *prev; - if (prev != mp) { - char buf4unknown[16]; - prev = mp; - debug_log(MDBX_LOG_NOTICE, "poorpage", 0, - "suboptimal %s-page #%u, mod-txnid %" PRIaTXN "\n", - pagetype_caption(PAGETYPE_WHOLE(mp), buf4unknown), mp->mp_pgno, - mp->mp_txnid); - } +#define UNALIGNED_PEEK_32(ptr, struct, field) \ + unaligned_peek_u32(1, ptr_disp(ptr, offsetof(struct, field))) +#define UNALIGNED_POKE_32(ptr, struct, field, value) \ + unaligned_poke_u32(1, ptr_disp(ptr, offsetof(struct, field)), value) - va_list args; - va_start(args, fmt); - debug_log_va(MDBX_LOG_NOTICE, "poorpage", 0, fmt, args); - va_end(args); +#define UNALIGNED_PEEK_64(ptr, struct, field) \ + unaligned_peek_u64(1, ptr_disp(ptr, offsetof(struct, field))) +#define UNALIGNED_POKE_64(ptr, struct, field, value) \ + unaligned_poke_u64(1, ptr_disp(ptr, offsetof(struct, field)), value) + +MDBX_NOTHROW_PURE_FUNCTION static inline pgno_t +peek_pgno(const void *const __restrict ptr) { + if (sizeof(pgno_t) == sizeof(uint32_t)) + return (pgno_t)unaligned_peek_u32(1, ptr); + else if (sizeof(pgno_t) == sizeof(uint64_t)) + return (pgno_t)unaligned_peek_u64(1, ptr); + else { + pgno_t pgno; + memcpy(&pgno, ptr, sizeof(pgno)); + return pgno; } } -/* Address of node i in page p */ -MDBX_NOTHROW_PURE_FUNCTION static __always_inline MDBX_node * -page_node(const MDBX_page *mp, size_t i) { - assert(PAGETYPE_COMPAT(mp) == P_LEAF || PAGETYPE_WHOLE(mp) == P_BRANCH); - assert(page_numkeys(mp) > i); - assert(mp->mp_ptrs[i] % 2 == 0); - return ptr_disp(mp, mp->mp_ptrs[i] + PAGEHDRSZ); +static inline void poke_pgno(void *const __restrict ptr, const pgno_t pgno) { + if (sizeof(pgno) == sizeof(uint32_t)) + unaligned_poke_u32(1, ptr, pgno); + else if (sizeof(pgno) == sizeof(uint64_t)) + unaligned_poke_u64(1, ptr, pgno); + else + memcpy(ptr, &pgno, sizeof(pgno)); } +#if defined(_WIN32) || defined(_WIN64) -/* The address of a key in a LEAF2 page. 
- * LEAF2 pages are used for MDBX_DUPFIXED sorted-duplicate sub-DBs. - * There are no node headers, keys are stored contiguously. */ -MDBX_NOTHROW_PURE_FUNCTION static __always_inline void * -page_leaf2key(const MDBX_page *mp, size_t i, size_t keysize) { - assert(PAGETYPE_COMPAT(mp) == (P_LEAF | P_LEAF2)); - assert(mp->mp_leaf2_ksize == keysize); - (void)keysize; - return ptr_disp(mp, PAGEHDRSZ + i * mp->mp_leaf2_ksize); -} -/* Set the node's key into keyptr. */ -static __always_inline void get_key(const MDBX_node *node, MDBX_val *keyptr) { - keyptr->iov_len = node_ks(node); - keyptr->iov_base = node_key(node); -} +typedef union osal_srwlock { + __anonymous_struct_extension__ struct { + long volatile readerCount; + long volatile writerCount; + }; + RTL_SRWLOCK native; +} osal_srwlock_t; -/* Set the node's key into keyptr, if requested. */ -static __always_inline void -get_key_optional(const MDBX_node *node, MDBX_val *keyptr /* __may_null */) { - if (keyptr) - get_key(node, keyptr); -} +typedef void(WINAPI *osal_srwlock_t_function)(osal_srwlock_t *); -/*------------------------------------------------------------------------------ - * safe read/write volatile 64-bit fields on 32-bit architectures. 
*/ +#if _WIN32_WINNT < 0x0600 /* prior to Windows Vista */ +typedef enum _FILE_INFO_BY_HANDLE_CLASS { + FileBasicInfo, + FileStandardInfo, + FileNameInfo, + FileRenameInfo, + FileDispositionInfo, + FileAllocationInfo, + FileEndOfFileInfo, + FileStreamInfo, + FileCompressionInfo, + FileAttributeTagInfo, + FileIdBothDirectoryInfo, + FileIdBothDirectoryRestartInfo, + FileIoPriorityHintInfo, + FileRemoteProtocolInfo, + MaximumFileInfoByHandleClass +} FILE_INFO_BY_HANDLE_CLASS, + *PFILE_INFO_BY_HANDLE_CLASS; -#ifndef atomic_store64 -MDBX_MAYBE_UNUSED static __always_inline uint64_t -atomic_store64(MDBX_atomic_uint64_t *p, const uint64_t value, - enum MDBX_memory_order order) { - STATIC_ASSERT(sizeof(MDBX_atomic_uint64_t) == 8); -#if MDBX_64BIT_ATOMIC -#if __GNUC_PREREQ(11, 0) - STATIC_ASSERT(__alignof__(MDBX_atomic_uint64_t) >= sizeof(uint64_t)); -#endif /* GNU C >= 11 */ -#ifdef MDBX_HAVE_C11ATOMICS - assert(atomic_is_lock_free(MDBX_c11a_rw(uint64_t, p))); - atomic_store_explicit(MDBX_c11a_rw(uint64_t, p), value, mo_c11_store(order)); -#else /* MDBX_HAVE_C11ATOMICS */ - if (order != mo_Relaxed) - osal_compiler_barrier(); - p->weak = value; - osal_memory_fence(order, true); -#endif /* MDBX_HAVE_C11ATOMICS */ -#else /* !MDBX_64BIT_ATOMIC */ - osal_compiler_barrier(); - atomic_store32(&p->low, (uint32_t)value, mo_Relaxed); - jitter4testing(true); - atomic_store32(&p->high, (uint32_t)(value >> 32), order); - jitter4testing(true); -#endif /* !MDBX_64BIT_ATOMIC */ - return value; -} -#endif /* atomic_store64 */ +typedef struct _FILE_END_OF_FILE_INFO { + LARGE_INTEGER EndOfFile; +} FILE_END_OF_FILE_INFO, *PFILE_END_OF_FILE_INFO; -#ifndef atomic_load64 -MDBX_MAYBE_UNUSED static -#if MDBX_64BIT_ATOMIC - __always_inline -#endif /* MDBX_64BIT_ATOMIC */ - uint64_t - atomic_load64(const volatile MDBX_atomic_uint64_t *p, - enum MDBX_memory_order order) { - STATIC_ASSERT(sizeof(MDBX_atomic_uint64_t) == 8); -#if MDBX_64BIT_ATOMIC -#ifdef MDBX_HAVE_C11ATOMICS - 
assert(atomic_is_lock_free(MDBX_c11a_ro(uint64_t, p))); - return atomic_load_explicit(MDBX_c11a_ro(uint64_t, p), mo_c11_load(order)); -#else /* MDBX_HAVE_C11ATOMICS */ - osal_memory_fence(order, false); - const uint64_t value = p->weak; - if (order != mo_Relaxed) - osal_compiler_barrier(); - return value; -#endif /* MDBX_HAVE_C11ATOMICS */ -#else /* !MDBX_64BIT_ATOMIC */ - osal_compiler_barrier(); - uint64_t value = (uint64_t)atomic_load32(&p->high, order) << 32; - jitter4testing(true); - value |= atomic_load32(&p->low, (order == mo_Relaxed) ? mo_Relaxed - : mo_AcquireRelease); - jitter4testing(true); - for (;;) { - osal_compiler_barrier(); - uint64_t again = (uint64_t)atomic_load32(&p->high, order) << 32; - jitter4testing(true); - again |= atomic_load32(&p->low, (order == mo_Relaxed) ? mo_Relaxed - : mo_AcquireRelease); - jitter4testing(true); - if (likely(value == again)) - return value; - value = again; - } -#endif /* !MDBX_64BIT_ATOMIC */ -} -#endif /* atomic_load64 */ +#define REMOTE_PROTOCOL_INFO_FLAG_LOOPBACK 0x00000001 +#define REMOTE_PROTOCOL_INFO_FLAG_OFFLINE 0x00000002 -static __always_inline void atomic_yield(void) { -#if defined(_WIN32) || defined(_WIN64) - YieldProcessor(); -#elif defined(__ia32__) || defined(__e2k__) - __builtin_ia32_pause(); -#elif defined(__ia64__) -#if defined(__HP_cc__) || defined(__HP_aCC__) - _Asm_hint(_HINT_PAUSE); -#else - __asm__ __volatile__("hint @pause"); -#endif -#elif defined(__aarch64__) || (defined(__ARM_ARCH) && __ARM_ARCH > 6) || \ - defined(__ARM_ARCH_6K__) -#ifdef __CC_ARM - __yield(); -#else - __asm__ __volatile__("yield"); -#endif -#elif (defined(__mips64) || defined(__mips64__)) && defined(__mips_isa_rev) && \ - __mips_isa_rev >= 2 - __asm__ __volatile__("pause"); -#elif defined(__mips) || defined(__mips__) || defined(__mips64) || \ - defined(__mips64__) || defined(_M_MRX000) || defined(_MIPS_) || \ - defined(__MWERKS__) || defined(__sgi) - __asm__ __volatile__(".word 0x00000140"); -#elif defined(__linux__) || 
defined(__gnu_linux__) || defined(_UNIX03_SOURCE) - sched_yield(); -#elif (defined(_GNU_SOURCE) && __GLIBC_PREREQ(2, 1)) || defined(_OPEN_THREADS) - pthread_yield(); -#endif -} +typedef struct _FILE_REMOTE_PROTOCOL_INFO { + USHORT StructureVersion; + USHORT StructureSize; + DWORD Protocol; + USHORT ProtocolMajorVersion; + USHORT ProtocolMinorVersion; + USHORT ProtocolRevision; + USHORT Reserved; + DWORD Flags; + struct { + DWORD Reserved[8]; + } GenericReserved; + struct { + DWORD Reserved[16]; + } ProtocolSpecificReserved; +} FILE_REMOTE_PROTOCOL_INFO, *PFILE_REMOTE_PROTOCOL_INFO; -#if MDBX_64BIT_CAS -static __always_inline bool atomic_cas64(MDBX_atomic_uint64_t *p, uint64_t c, - uint64_t v) { -#ifdef MDBX_HAVE_C11ATOMICS - STATIC_ASSERT(sizeof(long long) >= sizeof(uint64_t)); - assert(atomic_is_lock_free(MDBX_c11a_rw(uint64_t, p))); - return atomic_compare_exchange_strong(MDBX_c11a_rw(uint64_t, p), &c, v); -#elif defined(__GNUC__) || defined(__clang__) - return __sync_bool_compare_and_swap(&p->weak, c, v); -#elif defined(_MSC_VER) - return c == (uint64_t)_InterlockedCompareExchange64( - (volatile __int64 *)&p->weak, v, c); -#elif defined(__APPLE__) - return OSAtomicCompareAndSwap64Barrier(c, v, &p->weak); -#else -#error FIXME: Unsupported compiler -#endif -} -#endif /* MDBX_64BIT_CAS */ - -static __always_inline bool atomic_cas32(MDBX_atomic_uint32_t *p, uint32_t c, - uint32_t v) { -#ifdef MDBX_HAVE_C11ATOMICS - STATIC_ASSERT(sizeof(int) >= sizeof(uint32_t)); - assert(atomic_is_lock_free(MDBX_c11a_rw(uint32_t, p))); - return atomic_compare_exchange_strong(MDBX_c11a_rw(uint32_t, p), &c, v); -#elif defined(__GNUC__) || defined(__clang__) - return __sync_bool_compare_and_swap(&p->weak, c, v); -#elif defined(_MSC_VER) - STATIC_ASSERT(sizeof(volatile long) == sizeof(volatile uint32_t)); - return c == - (uint32_t)_InterlockedCompareExchange((volatile long *)&p->weak, v, c); -#elif defined(__APPLE__) - return OSAtomicCompareAndSwap32Barrier(c, v, &p->weak); -#else 
-#error FIXME: Unsupported compiler -#endif -} +#endif /* _WIN32_WINNT < 0x0600 (prior to Windows Vista) */ -static __always_inline uint32_t atomic_add32(MDBX_atomic_uint32_t *p, - uint32_t v) { -#ifdef MDBX_HAVE_C11ATOMICS - STATIC_ASSERT(sizeof(int) >= sizeof(uint32_t)); - assert(atomic_is_lock_free(MDBX_c11a_rw(uint32_t, p))); - return atomic_fetch_add(MDBX_c11a_rw(uint32_t, p), v); -#elif defined(__GNUC__) || defined(__clang__) - return __sync_fetch_and_add(&p->weak, v); -#elif defined(_MSC_VER) - STATIC_ASSERT(sizeof(volatile long) == sizeof(volatile uint32_t)); - return (uint32_t)_InterlockedExchangeAdd((volatile long *)&p->weak, v); -#elif defined(__APPLE__) - return OSAtomicAdd32Barrier(v, &p->weak); -#else -#error FIXME: Unsupported compiler -#endif -} +typedef BOOL(WINAPI *MDBX_GetFileInformationByHandleEx)( + _In_ HANDLE hFile, _In_ FILE_INFO_BY_HANDLE_CLASS FileInformationClass, + _Out_ LPVOID lpFileInformation, _In_ DWORD dwBufferSize); -#define atomic_sub32(p, v) atomic_add32(p, 0 - (v)) +typedef BOOL(WINAPI *MDBX_GetVolumeInformationByHandleW)( + _In_ HANDLE hFile, _Out_opt_ LPWSTR lpVolumeNameBuffer, + _In_ DWORD nVolumeNameSize, _Out_opt_ LPDWORD lpVolumeSerialNumber, + _Out_opt_ LPDWORD lpMaximumComponentLength, + _Out_opt_ LPDWORD lpFileSystemFlags, + _Out_opt_ LPWSTR lpFileSystemNameBuffer, _In_ DWORD nFileSystemNameSize); -static __always_inline uint64_t safe64_txnid_next(uint64_t txnid) { - txnid += xMDBX_TXNID_STEP; -#if !MDBX_64BIT_CAS - /* avoid overflow of low-part in safe64_reset() */ - txnid += (UINT32_MAX == (uint32_t)txnid); -#endif - return txnid; -} +typedef DWORD(WINAPI *MDBX_GetFinalPathNameByHandleW)(_In_ HANDLE hFile, + _Out_ LPWSTR lpszFilePath, + _In_ DWORD cchFilePath, + _In_ DWORD dwFlags); -/* Atomically make target value >= SAFE64_INVALID_THRESHOLD */ -static __always_inline void safe64_reset(MDBX_atomic_uint64_t *p, - bool single_writer) { - if (single_writer) { -#if MDBX_64BIT_ATOMIC && MDBX_WORDBITS >= 64 - 
atomic_store64(p, UINT64_MAX, mo_AcquireRelease); -#else - atomic_store32(&p->high, UINT32_MAX, mo_AcquireRelease); -#endif /* MDBX_64BIT_ATOMIC && MDBX_WORDBITS >= 64 */ - } else { -#if MDBX_64BIT_CAS && MDBX_64BIT_ATOMIC - /* atomically make value >= SAFE64_INVALID_THRESHOLD by 64-bit operation */ - atomic_store64(p, UINT64_MAX, mo_AcquireRelease); -#elif MDBX_64BIT_CAS - /* atomically make value >= SAFE64_INVALID_THRESHOLD by 32-bit operation */ - atomic_store32(&p->high, UINT32_MAX, mo_AcquireRelease); -#else - /* it is safe to increment low-part to avoid ABA, since xMDBX_TXNID_STEP > 1 - * and overflow was preserved in safe64_txnid_next() */ - STATIC_ASSERT(xMDBX_TXNID_STEP > 1); - atomic_add32(&p->low, 1) /* avoid ABA in safe64_reset_compare() */; - atomic_store32(&p->high, UINT32_MAX, mo_AcquireRelease); - atomic_add32(&p->low, 1) /* avoid ABA in safe64_reset_compare() */; -#endif /* MDBX_64BIT_CAS && MDBX_64BIT_ATOMIC */ - } - assert(p->weak >= SAFE64_INVALID_THRESHOLD); - jitter4testing(true); -} +typedef BOOL(WINAPI *MDBX_SetFileInformationByHandle)( + _In_ HANDLE hFile, _In_ FILE_INFO_BY_HANDLE_CLASS FileInformationClass, + _Out_ LPVOID lpFileInformation, _In_ DWORD dwBufferSize); -static __always_inline bool safe64_reset_compare(MDBX_atomic_uint64_t *p, - txnid_t compare) { - /* LY: This function is used to reset `mr_txnid` from hsr-handler in case - * the asynchronously cancellation of read transaction. Therefore, - * there may be a collision between the cleanup performed here and - * asynchronous termination and restarting of the read transaction - * in another process/thread. In general we MUST NOT reset the `mr_txnid` - * if a new transaction was started (i.e. if `mr_txnid` was changed). */ -#if MDBX_64BIT_CAS - bool rc = atomic_cas64(p, compare, UINT64_MAX); -#else - /* LY: There is no gold ratio here since shared mutex is too costly, - * in such way we must acquire/release it for every update of mr_txnid, - * i.e. twice for each read transaction). 
*/ - bool rc = false; - if (likely(atomic_load32(&p->low, mo_AcquireRelease) == (uint32_t)compare && - atomic_cas32(&p->high, (uint32_t)(compare >> 32), UINT32_MAX))) { - if (unlikely(atomic_load32(&p->low, mo_AcquireRelease) != - (uint32_t)compare)) - atomic_cas32(&p->high, UINT32_MAX, (uint32_t)(compare >> 32)); - else - rc = true; - } -#endif /* MDBX_64BIT_CAS */ - jitter4testing(true); - return rc; -} +typedef NTSTATUS(NTAPI *MDBX_NtFsControlFile)( + IN HANDLE FileHandle, IN OUT HANDLE Event, + IN OUT PVOID /* PIO_APC_ROUTINE */ ApcRoutine, IN OUT PVOID ApcContext, + OUT PIO_STATUS_BLOCK IoStatusBlock, IN ULONG FsControlCode, + IN OUT PVOID InputBuffer, IN ULONG InputBufferLength, + OUT OPTIONAL PVOID OutputBuffer, IN ULONG OutputBufferLength); -static __always_inline void safe64_write(MDBX_atomic_uint64_t *p, - const uint64_t v) { - assert(p->weak >= SAFE64_INVALID_THRESHOLD); -#if MDBX_64BIT_ATOMIC && MDBX_64BIT_CAS - atomic_store64(p, v, mo_AcquireRelease); -#else /* MDBX_64BIT_ATOMIC */ - osal_compiler_barrier(); - /* update low-part but still value >= SAFE64_INVALID_THRESHOLD */ - atomic_store32(&p->low, (uint32_t)v, mo_Relaxed); - assert(p->weak >= SAFE64_INVALID_THRESHOLD); - jitter4testing(true); - /* update high-part from SAFE64_INVALID_THRESHOLD to actual value */ - atomic_store32(&p->high, (uint32_t)(v >> 32), mo_AcquireRelease); -#endif /* MDBX_64BIT_ATOMIC */ - assert(p->weak == v); - jitter4testing(true); -} +typedef uint64_t(WINAPI *MDBX_GetTickCount64)(void); -static __always_inline uint64_t safe64_read(const MDBX_atomic_uint64_t *p) { - jitter4testing(true); - uint64_t v; - do - v = atomic_load64(p, mo_AcquireRelease); - while (!MDBX_64BIT_ATOMIC && unlikely(v != p->weak)); - return v; -} +#if !defined(_WIN32_WINNT_WIN8) || _WIN32_WINNT < _WIN32_WINNT_WIN8 +typedef struct _WIN32_MEMORY_RANGE_ENTRY { + PVOID VirtualAddress; + SIZE_T NumberOfBytes; +} WIN32_MEMORY_RANGE_ENTRY, *PWIN32_MEMORY_RANGE_ENTRY; +#endif /* Windows 8.x */ -#if 0 /* unused 
for now */ -MDBX_MAYBE_UNUSED static __always_inline bool safe64_is_valid(uint64_t v) { -#if MDBX_WORDBITS >= 64 - return v < SAFE64_INVALID_THRESHOLD; -#else - return (v >> 32) != UINT32_MAX; -#endif /* MDBX_WORDBITS */ -} +typedef BOOL(WINAPI *MDBX_PrefetchVirtualMemory)( + HANDLE hProcess, ULONG_PTR NumberOfEntries, + PWIN32_MEMORY_RANGE_ENTRY VirtualAddresses, ULONG Flags); -MDBX_MAYBE_UNUSED static __always_inline bool - safe64_is_valid_ptr(const MDBX_atomic_uint64_t *p) { -#if MDBX_64BIT_ATOMIC - return atomic_load64(p, mo_AcquireRelease) < SAFE64_INVALID_THRESHOLD; -#else - return atomic_load32(&p->high, mo_AcquireRelease) != UINT32_MAX; -#endif /* MDBX_64BIT_ATOMIC */ -} -#endif /* unused for now */ +typedef enum _SECTION_INHERIT { ViewShare = 1, ViewUnmap = 2 } SECTION_INHERIT; -/* non-atomic write with safety for reading a half-updated value */ -static __always_inline void safe64_update(MDBX_atomic_uint64_t *p, - const uint64_t v) { -#if MDBX_64BIT_ATOMIC - atomic_store64(p, v, mo_Relaxed); -#else - safe64_reset(p, true); - safe64_write(p, v); -#endif /* MDBX_64BIT_ATOMIC */ -} +typedef NTSTATUS(NTAPI *MDBX_NtExtendSection)(IN HANDLE SectionHandle, + IN PLARGE_INTEGER NewSectionSize); -/* non-atomic increment with safety for reading a half-updated value */ -MDBX_MAYBE_UNUSED static -#if MDBX_64BIT_ATOMIC - __always_inline -#endif /* MDBX_64BIT_ATOMIC */ - void - safe64_inc(MDBX_atomic_uint64_t *p, const uint64_t v) { - assert(v > 0); - safe64_update(p, safe64_read(p) + v); -} +typedef LSTATUS(WINAPI *MDBX_RegGetValueA)(HKEY hkey, LPCSTR lpSubKey, + LPCSTR lpValue, DWORD dwFlags, + LPDWORD pdwType, PVOID pvData, + LPDWORD pcbData); -/*----------------------------------------------------------------------------*/ -/* rthc (tls keys and destructors) */ +NTSYSAPI ULONG RtlRandomEx(PULONG Seed); -static int rthc_register(MDBX_env *const env); -static int rthc_remove(MDBX_env *const env); -static int rthc_uniq_check(const osal_mmap_t *pending, MDBX_env 
**found); +typedef BOOL(WINAPI *MDBX_SetFileIoOverlappedRange)(HANDLE FileHandle, + PUCHAR OverlappedRangeStart, + ULONG Length); -typedef struct rthc_entry_t { - MDBX_env *env; -} rthc_entry_t; +struct libmdbx_imports { + osal_srwlock_t_function srwl_Init; + osal_srwlock_t_function srwl_AcquireShared; + osal_srwlock_t_function srwl_ReleaseShared; + osal_srwlock_t_function srwl_AcquireExclusive; + osal_srwlock_t_function srwl_ReleaseExclusive; + MDBX_NtExtendSection NtExtendSection; + MDBX_GetFileInformationByHandleEx GetFileInformationByHandleEx; + MDBX_GetVolumeInformationByHandleW GetVolumeInformationByHandleW; + MDBX_GetFinalPathNameByHandleW GetFinalPathNameByHandleW; + MDBX_SetFileInformationByHandle SetFileInformationByHandle; + MDBX_NtFsControlFile NtFsControlFile; + MDBX_PrefetchVirtualMemory PrefetchVirtualMemory; + MDBX_GetTickCount64 GetTickCount64; + MDBX_RegGetValueA RegGetValueA; + MDBX_SetFileIoOverlappedRange SetFileIoOverlappedRange; +}; -#if MDBX_DEBUG -#define RTHC_INITIAL_LIMIT 1 -#else -#define RTHC_INITIAL_LIMIT 16 -#endif +MDBX_INTERNAL void windows_import(void); +#endif /* Windows */ -static bin128_t bootid; +enum signatures { + env_signature = INT32_C(0x1A899641), + txn_signature = INT32_C(0x13D53A31), + cur_signature_live = INT32_C(0x7E05D5B1), + cur_signature_ready4dispose = INT32_C(0x2817A047), + cur_signature_wait4eot = INT32_C(0x10E297A7) +}; -#if defined(_WIN32) || defined(_WIN64) -static CRITICAL_SECTION rthc_critical_section; -#else +/*----------------------------------------------------------------------------*/ -static pthread_mutex_t rthc_mutex = PTHREAD_MUTEX_INITIALIZER; -static pthread_cond_t rthc_cond = PTHREAD_COND_INITIALIZER; -static osal_thread_key_t rthc_key; -static MDBX_atomic_uint32_t rthc_pending; +/* An dirty-page list item is an pgno/pointer pair. 
*/ +struct dp { + page_t *ptr; + pgno_t pgno, npages; +}; -static __inline uint64_t rthc_signature(const void *addr, uint8_t kind) { - uint64_t salt = osal_thread_self() * UINT64_C(0xA2F0EEC059629A17) ^ - UINT64_C(0x01E07C6FDB596497) * (uintptr_t)(addr); -#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ - return salt << 8 | kind; -#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ - return (uint64_t)kind << 56 | salt >> 8; -#else -#error "FIXME: Unsupported byte order" -#endif /* __BYTE_ORDER__ */ -} +enum dpl_rules { + dpl_gap_edging = 2, + dpl_gap_mergesort = 16, + dpl_reserve_gap = dpl_gap_mergesort + dpl_gap_edging, + dpl_insertion_threshold = 42 +}; -#define MDBX_THREAD_RTHC_REGISTERED(addr) rthc_signature(addr, 0x0D) -#define MDBX_THREAD_RTHC_COUNTED(addr) rthc_signature(addr, 0xC0) -static __thread uint64_t rthc_thread_state -#if __has_attribute(tls_model) && \ - (defined(__PIC__) || defined(__pic__) || MDBX_BUILD_SHARED_LIBRARY) - __attribute__((tls_model("local-dynamic"))) -#endif - ; +/* An DPL (dirty-page list) is a lazy-sorted array of MDBX_DPs. */ +struct dpl { + size_t sorted; + size_t length; + /* number of pages, but not an entries. 
*/ + size_t pages_including_loose; + /* allocated size excluding the dpl_reserve_gap */ + size_t detent; + /* dynamic size with holes at zero and after the last */ + dp_t items[dpl_reserve_gap]; +}; -#if defined(__APPLE__) && defined(__SANITIZE_ADDRESS__) && \ - !defined(MDBX_ATTRIBUTE_NO_SANITIZE_ADDRESS) -/* Avoid ASAN-trap due the target TLS-variable feed by Darwin's tlv_free() */ -#define MDBX_ATTRIBUTE_NO_SANITIZE_ADDRESS \ - __attribute__((__no_sanitize_address__, __noinline__)) -#else -#define MDBX_ATTRIBUTE_NO_SANITIZE_ADDRESS __inline -#endif +/*----------------------------------------------------------------------------*/ +/* Internal structures */ -MDBX_ATTRIBUTE_NO_SANITIZE_ADDRESS static uint64_t rthc_read(const void *rthc) { - return *(volatile uint64_t *)rthc; -} +/* Comparing/ordering and length constraints */ +typedef struct clc { + MDBX_cmp_func *cmp; /* comparator */ + size_t lmin, lmax; /* min/max length constraints */ +} clc_t; -MDBX_ATTRIBUTE_NO_SANITIZE_ADDRESS static uint64_t -rthc_compare_and_clean(const void *rthc, const uint64_t signature) { -#if MDBX_64BIT_CAS - return atomic_cas64((MDBX_atomic_uint64_t *)rthc, signature, 0); -#elif __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ - return atomic_cas32((MDBX_atomic_uint32_t *)rthc, (uint32_t)signature, 0); -#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ - return atomic_cas32((MDBX_atomic_uint32_t *)rthc, (uint32_t)(signature >> 32), - 0); -#else -#error "FIXME: Unsupported byte order" -#endif -} +/* Вспомогательная информация о subDB. + * + * Совокупность потребностей: + * 1. Для транзакций и основного курсора нужны все поля. + * 2. Для вложенного dupsort-курсора нужен компаратор значений, который изнутри + * курсора будет выглядеть как компаратор ключей. Плюс заглушка компаратора + * значений, которая не должна использоваться в штатных ситуациях, но + * требуется хотя-бы для отслеживания таких обращений. + * 3. 
Использование компараторов для курсора и вложенного dupsort-курсора + * должно выглядеть одинаково. + * 4. Желательно минимизировать объём данных размещаемых внутри вложенного + * dupsort-курсора. + * 5. Желательно чтобы объем всей структуры был степенью двойки. + * + * Решение: + * - не храним в dupsort-курсоре ничего лишнего, а только tree; + * - в курсоры помещаем только указатель на clc_t, который будет указывать + * на соответствующее clc-поле в общей kvx-таблице привязанной к env; + * - компаратор размещаем в начале clc_t, в kvx_t сначала размещаем clc + * для ключей, потом для значений, а имя БД в конце kvx_t. + * - тогда в курсоре clc[0] будет содержать информацию для ключей, + * а clc[1] для значений, причем компаратор значений для dupsort-курсора + * будет попадать на MDBX_val с именем, что приведет к SIGSEGV при попытке + * использования такого компаратора. + * - размер kvx_t становится равным 8 словам. + * + * Трюки и прочая экономия на списках: + * - не храним dbi внутри курсора, вместо этого вычисляем его как разницу между + * dbi_state курсора и началом таблицы dbi_state в транзакции. Смысл тут в + * экономии кол-ва полей при инициализации курсора. Затрат это не создает, + * так как dbi требуется для последующего доступа к массивам в транзакции, + * т.е. при вычислении dbi разыменовывается тот-же указатель на txn + * и читается та же кэш-линия с указателями. 
*/ +typedef struct clc2 { + clc_t k; /* для ключей */ + clc_t v; /* для значений */ +} clc2_t; + +struct kvx { + clc2_t clc; + MDBX_val name; /* имя subDB */ +}; -static __inline int rthc_atexit(void (*dtor)(void *), void *obj, - void *dso_symbol) { -#ifndef MDBX_HAVE_CXA_THREAD_ATEXIT_IMPL -#if defined(LIBCXXABI_HAS_CXA_THREAD_ATEXIT_IMPL) || \ - defined(HAVE___CXA_THREAD_ATEXIT_IMPL) || __GLIBC_PREREQ(2, 18) || \ - defined(BIONIC) -#define MDBX_HAVE_CXA_THREAD_ATEXIT_IMPL 1 -#else -#define MDBX_HAVE_CXA_THREAD_ATEXIT_IMPL 0 -#endif -#endif /* MDBX_HAVE_CXA_THREAD_ATEXIT_IMPL */ +/* Non-shared DBI state flags inside transaction */ +enum dbi_state { + DBI_DIRTY = 0x01 /* DB was written in this txn */, + DBI_STALE = 0x02 /* Named-DB record is older than txnID */, + DBI_FRESH = 0x04 /* Named-DB handle opened in this txn */, + DBI_CREAT = 0x08 /* Named-DB handle created in this txn */, + DBI_VALID = 0x10 /* Handle is valid, see also DB_VALID */, + DBI_OLDEN = 0x40 /* Handle was closed/reopened outside txn */, + DBI_LINDO = 0x80 /* Lazy initialization done for DBI-slot */, +}; -#ifndef MDBX_HAVE_CXA_THREAD_ATEXIT -#if defined(LIBCXXABI_HAS_CXA_THREAD_ATEXIT) || \ - defined(HAVE___CXA_THREAD_ATEXIT) -#define MDBX_HAVE_CXA_THREAD_ATEXIT 1 -#elif !MDBX_HAVE_CXA_THREAD_ATEXIT_IMPL && \ - (defined(__linux__) || defined(__gnu_linux__)) -#define MDBX_HAVE_CXA_THREAD_ATEXIT 1 -#else -#define MDBX_HAVE_CXA_THREAD_ATEXIT 0 -#endif -#endif /* MDBX_HAVE_CXA_THREAD_ATEXIT */ +enum txn_flags { + txn_ro_begin_flags = MDBX_TXN_RDONLY | MDBX_TXN_RDONLY_PREPARE, + txn_rw_begin_flags = MDBX_TXN_NOMETASYNC | MDBX_TXN_NOSYNC | MDBX_TXN_TRY, + txn_shrink_allowed = UINT32_C(0x40000000), + txn_gc_drained = 0x20 /* GC was depleted up to oldest reader */, + txn_state_flags = MDBX_TXN_FINISHED | MDBX_TXN_ERROR | MDBX_TXN_DIRTY | + MDBX_TXN_SPILLS | MDBX_TXN_HAS_CHILD | MDBX_TXN_INVALID | + txn_gc_drained +}; - int rc = MDBX_ENOSYS; -#if MDBX_HAVE_CXA_THREAD_ATEXIT_IMPL && 
!MDBX_HAVE_CXA_THREAD_ATEXIT -#define __cxa_thread_atexit __cxa_thread_atexit_impl -#endif -#if MDBX_HAVE_CXA_THREAD_ATEXIT || defined(__cxa_thread_atexit) - extern int __cxa_thread_atexit(void (*dtor)(void *), void *obj, - void *dso_symbol) MDBX_WEAK_IMPORT_ATTRIBUTE; - if (&__cxa_thread_atexit) - rc = __cxa_thread_atexit(dtor, obj, dso_symbol); -#elif defined(__APPLE__) || defined(_DARWIN_C_SOURCE) - extern void _tlv_atexit(void (*termfunc)(void *objAddr), void *objAddr) - MDBX_WEAK_IMPORT_ATTRIBUTE; - if (&_tlv_atexit) { - (void)dso_symbol; - _tlv_atexit(dtor, obj); - rc = 0; - } -#else - (void)dtor; - (void)obj; - (void)dso_symbol; -#endif - return rc; -} +/* A database transaction. + * Every operation requires a transaction handle. */ +struct MDBX_txn { + int32_t signature; + uint32_t flags; /* Transaction Flags */ + size_t n_dbi; + size_t owner; /* thread ID that owns this transaction */ -__cold static void workaround_glibc_bug21031(void) { - /* Workaround for https://sourceware.org/bugzilla/show_bug.cgi?id=21031 - * - * Due race between pthread_key_delete() and __nptl_deallocate_tsd() - * The destructor(s) of thread-local-storage object(s) may be running - * in another thread(s) and be blocked or not finished yet. - * In such case we get a SEGFAULT after unload this library DSO. - * - * So just by yielding a few timeslices we give a chance - * to such destructor(s) for completion and avoids segfault. */ - sched_yield(); - sched_yield(); - sched_yield(); -} -#endif + MDBX_txn *parent; /* parent of a nested txn */ + MDBX_txn *nested; /* nested txn under this txn, + set together with MDBX_TXN_HAS_CHILD */ + geo_t geo; -static unsigned rthc_count, rthc_limit; -static rthc_entry_t *rthc_table; -static rthc_entry_t rthc_table_static[RTHC_INITIAL_LIMIT]; + /* The ID of this transaction. IDs are integers incrementing from + * INITIAL_TXNID. Only committed write transactions increment the ID. If a + * transaction aborts, the ID may be re-used by the next writer. 
*/ + txnid_t txnid, front_txnid; -static __inline void rthc_lock(void) { -#if defined(_WIN32) || defined(_WIN64) - EnterCriticalSection(&rthc_critical_section); -#else - ENSURE(nullptr, osal_pthread_mutex_lock(&rthc_mutex) == 0); -#endif -} + MDBX_env *env; /* the DB environment */ + tree_t *dbs; /* Array of tree_t records for each known DB */ -static __inline void rthc_unlock(void) { -#if defined(_WIN32) || defined(_WIN64) - LeaveCriticalSection(&rthc_critical_section); -#else - ENSURE(nullptr, pthread_mutex_unlock(&rthc_mutex) == 0); -#endif -} +#if MDBX_ENABLE_DBI_SPARSE + unsigned *__restrict dbi_sparse; +#endif /* MDBX_ENABLE_DBI_SPARSE */ -static __inline int thread_key_create(osal_thread_key_t *key) { - int rc; -#if defined(_WIN32) || defined(_WIN64) - *key = TlsAlloc(); - rc = (*key != TLS_OUT_OF_INDEXES) ? MDBX_SUCCESS : GetLastError(); -#else - rc = pthread_key_create(key, nullptr); -#endif - TRACE("&key = %p, value %" PRIuPTR ", rc %d", __Wpedantic_format_voidptr(key), - (uintptr_t)*key, rc); - return rc; -} + /* Array of non-shared txn's flags of DBI. + * Модификатор __restrict тут полезен и безопасен в текущем понимании, + * так как пересечение возможно только с dbi_state курсоров, + * и происходит по-чтению до последующего изменения/записи. */ + uint8_t *__restrict dbi_state; -static __inline void thread_key_delete(osal_thread_key_t key) { - TRACE("key = %" PRIuPTR, (uintptr_t)key); -#if defined(_WIN32) || defined(_WIN64) - ENSURE(nullptr, TlsFree(key)); -#else - ENSURE(nullptr, pthread_key_delete(key) == 0); - workaround_glibc_bug21031(); -#endif -} + /* Array of sequence numbers for each DB handle. */ + uint32_t *__restrict dbi_seqs; -static __inline void *thread_rthc_get(osal_thread_key_t key) { -#if defined(_WIN32) || defined(_WIN64) - return TlsGetValue(key); -#else - return pthread_getspecific(key); -#endif -} + /* Массив с головами односвязных списков отслеживания курсоров. 
*/ + MDBX_cursor **cursors; -static void thread_rthc_set(osal_thread_key_t key, const void *value) { -#if defined(_WIN32) || defined(_WIN64) - ENSURE(nullptr, TlsSetValue(key, (void *)value)); -#else - const uint64_t sign_registered = - MDBX_THREAD_RTHC_REGISTERED(&rthc_thread_state); - const uint64_t sign_counted = MDBX_THREAD_RTHC_COUNTED(&rthc_thread_state); - if (value && unlikely(rthc_thread_state != sign_registered && - rthc_thread_state != sign_counted)) { - rthc_thread_state = sign_registered; - TRACE("thread registered 0x%" PRIxPTR, osal_thread_self()); - if (rthc_atexit(thread_dtor, &rthc_thread_state, - (void *)&mdbx_version /* dso_anchor */)) { - ENSURE(nullptr, pthread_setspecific(rthc_key, &rthc_thread_state) == 0); - rthc_thread_state = sign_counted; - const unsigned count_before = atomic_add32(&rthc_pending, 1); - ENSURE(nullptr, count_before < INT_MAX); - NOTICE("fallback to pthreads' tsd, key %" PRIuPTR ", count %u", - (uintptr_t)rthc_key, count_before); - (void)count_before; - } - } - ENSURE(nullptr, pthread_setspecific(key, value) == 0); -#endif -} + /* "Канареечные" маркеры/счетчики */ + MDBX_canary canary; -/* dtor called for thread, i.e. 
for all mdbx's environment objects */ -__cold void thread_dtor(void *rthc) { - rthc_lock(); - const uint32_t current_pid = osal_getpid(); - TRACE(">> pid %d, thread 0x%" PRIxPTR ", rthc %p", current_pid, - osal_thread_self(), rthc); + /* User-settable context */ + void *userctx; - for (size_t i = 0; i < rthc_count; ++i) { - MDBX_env *const env = rthc_table[i].env; - if (env->me_pid != current_pid) - continue; - if (!(env->me_flags & MDBX_ENV_TXKEY)) - continue; - MDBX_reader *const reader = thread_rthc_get(env->me_txkey); - MDBX_reader *const begin = &env->me_lck_mmap.lck->mti_readers[0]; - MDBX_reader *const end = - &env->me_lck_mmap.lck->mti_readers[env->me_maxreaders]; - if (reader < begin || reader >= end) - continue; -#if !defined(_WIN32) && !defined(_WIN64) - if (pthread_setspecific(env->me_txkey, nullptr) != 0) { - TRACE("== thread 0x%" PRIxPTR - ", rthc %p: ignore race with tsd-key deletion", - osal_thread_self(), __Wpedantic_format_voidptr(reader)); - continue /* ignore race with tsd-key deletion by mdbx_env_close() */; - } -#endif + union { + struct { + /* For read txns: This thread/txn's reader table slot, or nullptr. */ + reader_slot_t *reader; + } to; + struct { + troika_t troika; + /* In write txns, array of cursors for each DB */ + pnl_t __restrict relist; /* Reclaimed GC pages */ + struct { + /* The list of reclaimed txns from GC */ + txl_t __restrict reclaimed; + txnid_t last_reclaimed; /* ID of last used record */ + uint64_t time_acc; + } gc; +#if MDBX_ENABLE_REFUND + pgno_t loose_refund_wl /* FIXME: describe */; +#endif /* MDBX_ENABLE_REFUND */ + /* a sequence to spilling dirty page with LRU policy */ + unsigned dirtylru; + /* dirtylist room: Dirty array size - dirty pages visible to this txn. + * Includes ancestor txns' dirty pages not hidden by other txns' + * dirty/spilled pages. Thus commit(nested txn) has room to merge + * dirtylist into parent after freeing hidden parent pages. */ + size_t dirtyroom; + /* For write txns: Modified pages. 
Sorted when not MDBX_WRITEMAP. */ + dpl_t *__restrict dirtylist; + /* The list of pages that became unused during this transaction. */ + pnl_t __restrict retired_pages; + /* The list of loose pages that became unused and may be reused + * in this transaction, linked through `page_next()`. */ + page_t *__restrict loose_pages; + /* Number of loose pages (tw.loose_pages) */ + size_t loose_count; + union { + struct { + size_t least_removed; + /* The sorted list of dirty pages we temporarily wrote to disk + * because the dirty list was full. page numbers in here are + * shifted left by 1, deleted slots have the LSB set. */ + pnl_t __restrict list; + } spilled; + size_t writemap_dirty_npages; + size_t writemap_spilled_npages; + }; + } tw; + }; +}; - TRACE("== thread 0x%" PRIxPTR - ", rthc %p, [%zi], %p ... %p (%+i), rtch-pid %i, " - "current-pid %i", - osal_thread_self(), __Wpedantic_format_voidptr(reader), i, - __Wpedantic_format_voidptr(begin), __Wpedantic_format_voidptr(end), - (int)(reader - begin), reader->mr_pid.weak, current_pid); - if (atomic_load32(&reader->mr_pid, mo_Relaxed) == current_pid) { - TRACE("==== thread 0x%" PRIxPTR ", rthc %p, cleanup", osal_thread_self(), - __Wpedantic_format_voidptr(reader)); - (void)atomic_cas32(&reader->mr_pid, current_pid, 0); - atomic_store32(&env->me_lck->mti_readers_refresh_flag, true, mo_Relaxed); - } - } +#define CURSOR_STACK_SIZE (16 + MDBX_WORDBITS / 4) -#if defined(_WIN32) || defined(_WIN64) - TRACE("<< thread 0x%" PRIxPTR ", rthc %p", osal_thread_self(), rthc); - rthc_unlock(); +struct MDBX_cursor { + int32_t signature; + union { + /* Тут некоторые трюки/заморочки с тем чтобы во всех основных сценариях + * проверять состояние курсора одной простой операцией сравнения, + * и при этом ни на каплю не усложнять код итерации стека курсора. 
+ * + * Поэтому решение такое: + * - поля flags и top сделаны знаковыми, а их отрицательные значения + * используются для обозначения не-установленного/не-инициализированного + * состояния курсора; + * - для инвалидации/сброса курсора достаточно записать отрицательное + * значение в объединенное поле top_and_flags; + * - все проверки состояния сводятся к сравнению одного из полей + * flags/snum/snum_and_flags, которые в зависимости от сценария, + * трактуются либо как знаковые, либо как безнаковые. */ + __anonymous_struct_extension__ struct { +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + int8_t flags; + /* индекс вершины стека, меньше нуля для не-инициализированного курсора */ + int8_t top; #else - const uint64_t sign_registered = MDBX_THREAD_RTHC_REGISTERED(rthc); - const uint64_t sign_counted = MDBX_THREAD_RTHC_COUNTED(rthc); - const uint64_t state = rthc_read(rthc); - if (state == sign_registered && - rthc_compare_and_clean(rthc, sign_registered)) { - TRACE("== thread 0x%" PRIxPTR - ", rthc %p, pid %d, self-status %s (0x%08" PRIx64 ")", - osal_thread_self(), rthc, osal_getpid(), "registered", state); - } else if (state == sign_counted && - rthc_compare_and_clean(rthc, sign_counted)) { - TRACE("== thread 0x%" PRIxPTR - ", rthc %p, pid %d, self-status %s (0x%08" PRIx64 ")", - osal_thread_self(), rthc, osal_getpid(), "counted", state); - ENSURE(nullptr, atomic_sub32(&rthc_pending, 1) > 0); - } else { - WARNING("thread 0x%" PRIxPTR - ", rthc %p, pid %d, self-status %s (0x%08" PRIx64 ")", - osal_thread_self(), rthc, osal_getpid(), "wrong", state); - } - - if (atomic_load32(&rthc_pending, mo_AcquireRelease) == 0) { - TRACE("== thread 0x%" PRIxPTR ", rthc %p, pid %d, wake", osal_thread_self(), - rthc, osal_getpid()); - ENSURE(nullptr, pthread_cond_broadcast(&rthc_cond) == 0); - } - - TRACE("<< thread 0x%" PRIxPTR ", rthc %p", osal_thread_self(), rthc); - /* Allow tail call optimization, i.e. 
gcc should generate the jmp instruction - * instead of a call for pthread_mutex_unlock() and therefore CPU could not - * return to current DSO's code section, which may be unloaded immediately - * after the mutex got released. */ - pthread_mutex_unlock(&rthc_mutex); + int8_t top; + int8_t flags; #endif -} + }; + int16_t top_and_flags; + }; + /* флаги проверки, в том числе биты для проверки типа листовых страниц. */ + uint8_t checking; + + /* Указывает на txn->dbi_state[] для DBI этого курсора. + * Модификатор __restrict тут полезен и безопасен в текущем понимании, + * так как пересечение возможно только с dbi_state транзакции, + * и происходит по-чтению до последующего изменения/записи. */ + uint8_t *__restrict dbi_state; + /* Связь списка отслеживания курсоров в транзакции */ + MDBX_txn *txn; + /* Указывает на tree->dbs[] для DBI этого курсора. */ + tree_t *tree; + /* Указывает на env->kvs[] для DBI этого курсора. */ + clc2_t *clc; + subcur_t *__restrict subcur; + page_t *pg[CURSOR_STACK_SIZE]; /* stack of pushed pages */ + indx_t ki[CURSOR_STACK_SIZE]; /* stack of page indices */ + MDBX_cursor *next; + /* Состояние на момент старта вложенной транзакции */ + MDBX_cursor *backup; +}; -MDBX_INTERNAL_VAR_INSTA struct mdbx_static mdbx_static = { - MDBX_RUNTIME_FLAGS_INIT, MDBX_LOG_FATAL, {nullptr}, 0, nullptr}; -static osal_fastmutex_t debug_lock; +struct inner_cursor { + MDBX_cursor cursor; + tree_t nested_tree; +}; -MDBX_EXCLUDE_FOR_GPROF -__cold void global_dtor(void) { - const uint32_t current_pid = osal_getpid(); - TRACE(">> pid %d", current_pid); +struct cursor_couple { + MDBX_cursor outer; + void *userctx; /* User-settable context */ + subcur_t inner; +}; - rthc_lock(); -#if !defined(_WIN32) && !defined(_WIN64) - uint64_t *rthc = pthread_getspecific(rthc_key); - TRACE("== thread 0x%" PRIxPTR ", rthc %p, pid %d, self-status 0x%08" PRIx64 - ", left %d", - osal_thread_self(), __Wpedantic_format_voidptr(rthc), current_pid, - rthc ? 
rthc_read(rthc) : ~UINT64_C(0), - atomic_load32(&rthc_pending, mo_Relaxed)); - if (rthc) { - const uint64_t sign_registered = MDBX_THREAD_RTHC_REGISTERED(rthc); - const uint64_t sign_counted = MDBX_THREAD_RTHC_COUNTED(rthc); - const uint64_t state = rthc_read(rthc); - if (state == sign_registered && - rthc_compare_and_clean(rthc, sign_registered)) { - TRACE("== thread 0x%" PRIxPTR - ", rthc %p, pid %d, self-status %s (0x%08" PRIx64 ")", - osal_thread_self(), __Wpedantic_format_voidptr(rthc), current_pid, - "registered", state); - } else if (state == sign_counted && - rthc_compare_and_clean(rthc, sign_counted)) { - TRACE("== thread 0x%" PRIxPTR - ", rthc %p, pid %d, self-status %s (0x%08" PRIx64 ")", - osal_thread_self(), __Wpedantic_format_voidptr(rthc), current_pid, - "counted", state); - ENSURE(nullptr, atomic_sub32(&rthc_pending, 1) > 0); - } else { - WARNING("thread 0x%" PRIxPTR - ", rthc %p, pid %d, self-status %s (0x%08" PRIx64 ")", - osal_thread_self(), __Wpedantic_format_voidptr(rthc), current_pid, - "wrong", state); - } - } +struct defer_free_item { + struct defer_free_item *next; + uint64_t timestamp; +}; - struct timespec abstime; - ENSURE(nullptr, clock_gettime(CLOCK_REALTIME, &abstime) == 0); - abstime.tv_nsec += 1000000000l / 10; - if (abstime.tv_nsec >= 1000000000l) { - abstime.tv_nsec -= 1000000000l; - abstime.tv_sec += 1; - } -#if MDBX_DEBUG > 0 - abstime.tv_sec += 600; +enum env_flags { + /* Failed to update the meta page. Probably an I/O error. */ + ENV_FATAL_ERROR = INT32_MIN /* 0x80000000 */, + /* Some fields are initialized. */ + ENV_ACTIVE = UINT32_C(0x20000000), + /* me_txkey is set */ + ENV_TXKEY = UINT32_C(0x10000000), + /* Legacy MDBX_MAPASYNC (prior v0.9) */ + DEPRECATED_MAPASYNC = UINT32_C(0x100000), + /* Legacy MDBX_COALESCE (prior v0.12) */ + DEPRECATED_COALESCE = UINT32_C(0x2000000), + ENV_INTERNAL_FLAGS = ENV_FATAL_ERROR | ENV_ACTIVE | ENV_TXKEY, + /* Only a subset of the mdbx_env flags can be changed + * at runtime. 
Changing other flags requires closing the + * environment and re-opening it with the new flags. */ + ENV_CHANGEABLE_FLAGS = MDBX_SAFE_NOSYNC | MDBX_NOMETASYNC | + DEPRECATED_MAPASYNC | MDBX_NOMEMINIT | + DEPRECATED_COALESCE | MDBX_PAGEPERTURB | MDBX_ACCEDE | + MDBX_VALIDATION, + ENV_CHANGELESS_FLAGS = MDBX_NOSUBDIR | MDBX_RDONLY | MDBX_WRITEMAP | + MDBX_NOSTICKYTHREADS | MDBX_NORDAHEAD | + MDBX_LIFORECLAIM | MDBX_EXCLUSIVE, + ENV_USABLE_FLAGS = ENV_CHANGEABLE_FLAGS | ENV_CHANGELESS_FLAGS +}; + +/* The database environment. */ +struct MDBX_env { + /* ----------------------------------------------------- mostly static part */ + mdbx_atomic_uint32_t signature; + uint32_t flags; + unsigned ps; /* DB page size, initialized from me_os_psize */ + osal_mmap_t dxb_mmap; /* The main data file */ +#define lazy_fd dxb_mmap.fd + mdbx_filehandle_t dsync_fd, fd4meta; +#if defined(_WIN32) || defined(_WIN64) + HANDLE dxb_lock_event; +#endif /* Windows */ + osal_mmap_t lck_mmap; /* The lock file */ + lck_t *lck; + + uint16_t leaf_nodemax; /* max size of a leaf-node */ + uint16_t branch_nodemax; /* max size of a branch-node */ + uint16_t subpage_limit; + uint16_t subpage_room_threshold; + uint16_t subpage_reserve_prereq; + uint16_t subpage_reserve_limit; + atomic_pgno_t mlocked_pgno; + uint8_t ps2ln; /* log2 of DB page size */ + int8_t stuck_meta; /* recovery-only: target meta page or less that zero */ + uint16_t merge_threshold, merge_threshold_gc; /* pages emptier than this are + candidates for merging */ + unsigned max_readers; /* size of the reader table */ + MDBX_dbi max_dbi; /* size of the DB table */ + uint32_t pid; /* process ID of this env */ + osal_thread_key_t me_txkey; /* thread-key for readers */ + struct { /* path to the DB files */ + pathchar_t *lck, *dxb, *specified; + void *buffer; + } pathname; + void *page_auxbuf; /* scratch area for DUPSORT put() */ + MDBX_txn *basal_txn; /* preallocated write transaction */ + kvx_t *kvs; /* array of auxiliary key-value properties 
*/ + uint8_t *__restrict dbs_flags; /* array of flags from tree_t.flags */ + mdbx_atomic_uint32_t *dbi_seqs; /* array of dbi sequence numbers */ + unsigned maxgc_large1page; /* Number of pgno_t fit in a single large page */ + unsigned maxgc_per_branch; + uint32_t registered_reader_pid; /* have liveness lock in reader table */ + void *userctx; /* User-settable context */ + MDBX_hsr_func *hsr_callback; /* Callback for kicking laggard readers */ + size_t madv_threshold; + + struct { + unsigned dp_reserve_limit; + unsigned rp_augment_limit; + unsigned dp_limit; + unsigned dp_initial; + uint64_t gc_time_limit; + uint8_t dp_loose_limit; + uint8_t spill_max_denominator; + uint8_t spill_min_denominator; + uint8_t spill_parent4child_denominator; + unsigned merge_threshold_16dot16_percent; +#if !(defined(_WIN32) || defined(_WIN64)) + unsigned writethrough_threshold; +#endif /* Windows */ + bool prefault_write; + bool prefer_waf_insteadof_balance; /* Strive to minimize WAF instead of + balancing pages fullment */ + struct { + uint16_t limit; + uint16_t room_threshold; + uint16_t reserve_prereq; + uint16_t reserve_limit; + } subpage; + + union { + unsigned all; + /* tracks options with non-auto values but tuned by user */ + struct { + unsigned dp_limit : 1; + unsigned rp_augment_limit : 1; + unsigned prefault_write : 1; + } non_auto; + } flags; + } options; + + /* struct geo_in_bytes used for accepting db-geo params from user for the new + * database creation, i.e. when mdbx_env_set_geometry() was called before + * mdbx_env_open(). 
*/ + struct { + size_t lower; /* minimal size of datafile */ + size_t upper; /* maximal size of datafile */ + size_t now; /* current size of datafile */ + size_t grow; /* step to grow datafile */ + size_t shrink; /* threshold to shrink datafile */ + } geo_in_bytes; + +#if MDBX_LOCKING == MDBX_LOCKING_SYSV + union { + key_t key; + int semid; + } me_sysv_ipc; +#endif /* MDBX_LOCKING == MDBX_LOCKING_SYSV */ + bool incore; + bool prefault_write_activated; + +#if MDBX_ENABLE_DBI_LOCKFREE + defer_free_item_t *defer_free; +#endif /* MDBX_ENABLE_DBI_LOCKFREE */ + + /* -------------------------------------------------------------- debugging */ + +#if MDBX_DEBUG + MDBX_assert_func *assert_func; /* Callback for assertion failures */ +#endif +#ifdef ENABLE_MEMCHECK + int valgrind_handle; #endif +#if defined(ENABLE_MEMCHECK) || defined(__SANITIZE_ADDRESS__) + pgno_t poison_edge; +#endif /* ENABLE_MEMCHECK || __SANITIZE_ADDRESS__ */ - for (unsigned left; - (left = atomic_load32(&rthc_pending, mo_AcquireRelease)) > 0;) { - NOTICE("tls-cleanup: pid %d, pending %u, wait for...", current_pid, left); - const int rc = pthread_cond_timedwait(&rthc_cond, &rthc_mutex, &abstime); - if (rc && rc != EINTR) - break; - } - thread_key_delete(rthc_key); +#ifndef xMDBX_DEBUG_SPILLING +#define xMDBX_DEBUG_SPILLING 0 #endif +#if xMDBX_DEBUG_SPILLING == 2 + size_t debug_dirtied_est, debug_dirtied_act; +#endif /* xMDBX_DEBUG_SPILLING */ - for (size_t i = 0; i < rthc_count; ++i) { - MDBX_env *const env = rthc_table[i].env; - if (env->me_pid != current_pid) - continue; - if (!(env->me_flags & MDBX_ENV_TXKEY)) - continue; - MDBX_reader *const begin = &env->me_lck_mmap.lck->mti_readers[0]; - MDBX_reader *const end = - &env->me_lck_mmap.lck->mti_readers[env->me_maxreaders]; - thread_key_delete(env->me_txkey); - bool cleaned = false; - for (MDBX_reader *reader = begin; reader < end; ++reader) { - TRACE("== [%zi] = key %" PRIuPTR ", %p ... 
%p, rthc %p (%+i), " - "rthc-pid %i, current-pid %i", - i, (uintptr_t)env->me_txkey, __Wpedantic_format_voidptr(begin), - __Wpedantic_format_voidptr(end), __Wpedantic_format_voidptr(reader), - (int)(reader - begin), reader->mr_pid.weak, current_pid); - if (atomic_load32(&reader->mr_pid, mo_Relaxed) == current_pid) { - (void)atomic_cas32(&reader->mr_pid, current_pid, 0); - TRACE("== cleanup %p", __Wpedantic_format_voidptr(reader)); - cleaned = true; - } - } - if (cleaned) - atomic_store32(&env->me_lck->mti_readers_refresh_flag, true, mo_Relaxed); - } + /* --------------------------------------------------- mostly volatile part */ - rthc_limit = rthc_count = 0; - if (rthc_table != rthc_table_static) - osal_free(rthc_table); - rthc_table = nullptr; - rthc_unlock(); + MDBX_txn *txn; /* current write transaction */ + osal_fastmutex_t dbi_lock; + unsigned n_dbi; /* number of DBs opened */ + + unsigned shadow_reserve_len; + page_t *__restrict shadow_reserve; /* list of malloc'ed blocks for re-use */ + + osal_ioring_t ioring; #if defined(_WIN32) || defined(_WIN64) - DeleteCriticalSection(&rthc_critical_section); + osal_srwlock_t remap_guard; + /* Workaround for LockFileEx and WriteFile multithread bug */ + CRITICAL_SECTION windowsbug_lock; + char *pathname_char; /* cache of multi-byte representation of pathname + to the DB files */ #else - /* LY: yielding a few timeslices to give a more chance - * to racing destructor(s) for completion. 
*/ - workaround_glibc_bug21031(); + osal_fastmutex_t remap_guard; #endif - osal_dtor(); - TRACE("<< pid %d\n", current_pid); - ENSURE(nullptr, osal_fastmutex_destroy(&debug_lock) == 0); -} + /* ------------------------------------------------- stub for lck-less mode */ + mdbx_atomic_uint64_t + lckless_placeholder[(sizeof(lck_t) + MDBX_CACHELINE_SIZE - 1) / + sizeof(mdbx_atomic_uint64_t)]; +}; -__cold int rthc_register(MDBX_env *const env) { - TRACE(">> env %p, rthc_count %u, rthc_limit %u", - __Wpedantic_format_voidptr(env), rthc_count, rthc_limit); +/*----------------------------------------------------------------------------*/ - int rc = MDBX_SUCCESS; - for (size_t i = 0; i < rthc_count; ++i) - if (unlikely(rthc_table[i].env == env)) { - rc = MDBX_PANIC; - goto bailout; - } +/* pseudo-error code, not exposed outside libmdbx */ +#define MDBX_NO_ROOT (MDBX_LAST_ADDED_ERRCODE + 33) - env->me_txkey = 0; - if (unlikely(rthc_count == rthc_limit)) { - rthc_entry_t *new_table = - osal_realloc((rthc_table == rthc_table_static) ? nullptr : rthc_table, - sizeof(rthc_entry_t) * rthc_limit * 2); - if (unlikely(new_table == nullptr)) { - rc = MDBX_ENOMEM; - goto bailout; - } - if (rthc_table == rthc_table_static) - memcpy(new_table, rthc_table, sizeof(rthc_entry_t) * rthc_limit); - rthc_table = new_table; - rthc_limit *= 2; - } +/* Number of slots in the reader table. + * This value was chosen somewhat arbitrarily. The 61 is a prime number, + * and such readers plus a couple mutexes fit into single 4KB page. + * Applications should set the table size using mdbx_env_set_maxreaders(). 
*/ +#define DEFAULT_READERS 61 - if ((env->me_flags & MDBX_NOSTICKYTHREADS) == 0) { - rc = thread_key_create(&env->me_txkey); - if (unlikely(rc != MDBX_SUCCESS)) - goto bailout; - env->me_flags |= MDBX_ENV_TXKEY; - } +enum db_flags { + DB_PERSISTENT_FLAGS = MDBX_REVERSEKEY | MDBX_DUPSORT | MDBX_INTEGERKEY | + MDBX_DUPFIXED | MDBX_INTEGERDUP | MDBX_REVERSEDUP, - rthc_table[rthc_count].env = env; - TRACE("== [%i] = env %p, key %" PRIuPTR, rthc_count, - __Wpedantic_format_voidptr(env), (uintptr_t)env->me_txkey); - ++rthc_count; + /* mdbx_dbi_open() flags */ + DB_USABLE_FLAGS = DB_PERSISTENT_FLAGS | MDBX_CREATE | MDBX_DB_ACCEDE, -bailout: - TRACE("<< env %p, key %" PRIuPTR ", rthc_count %u, rthc_limit %u, rc %d", - __Wpedantic_format_voidptr(env), (uintptr_t)env->me_txkey, rthc_count, - rthc_limit, rc); - return rc; -} -__cold static int rthc_drown(MDBX_env *const env) { - const uint32_t current_pid = osal_getpid(); - int rc = MDBX_SUCCESS; - MDBX_env *inprocess_neighbor = nullptr; - if (likely(env->me_lck_mmap.lck && current_pid == env->me_pid)) { - MDBX_reader *const begin = &env->me_lck_mmap.lck->mti_readers[0]; - MDBX_reader *const end = - &env->me_lck_mmap.lck->mti_readers[env->me_maxreaders]; - TRACE("== %s env %p pid %d, readers %p ...%p, current-pid %d", - (current_pid == env->me_pid) ? 
"cleanup" : "skip", - __Wpedantic_format_voidptr(env), env->me_pid, - __Wpedantic_format_voidptr(begin), __Wpedantic_format_voidptr(end), - current_pid); - bool cleaned = false; - for (MDBX_reader *r = begin; r < end; ++r) { - if (atomic_load32(&r->mr_pid, mo_Relaxed) == current_pid) { - atomic_store32(&r->mr_pid, 0, mo_AcquireRelease); - TRACE("== cleanup %p", __Wpedantic_format_voidptr(r)); - cleaned = true; - } - } - if (cleaned) - atomic_store32(&env->me_lck_mmap.lck->mti_readers_refresh_flag, true, - mo_Relaxed); - rc = rthc_uniq_check(&env->me_lck_mmap, &inprocess_neighbor); - if (!inprocess_neighbor && env->me_live_reader && - env->me_lfd != INVALID_HANDLE_VALUE) { - int err = osal_rpid_clear(env); - rc = rc ? rc : err; - } - } - int err = osal_lck_destroy(env, inprocess_neighbor, current_pid); - env->me_pid = 0; - return rc ? rc : err; -} + DB_VALID = 0x80u /* DB handle is valid, for dbs_flags */, + DB_POISON = 0x7fu /* update pending */, + DB_INTERNAL_FLAGS = DB_VALID +}; -__cold static int rthc_remove(MDBX_env *const env) { - TRACE(">>> env %p, key %zu, rthc_count %u, rthc_limit %u", - __Wpedantic_format_voidptr(env), (uintptr_t)env->me_txkey, rthc_count, - rthc_limit); +#if !defined(__cplusplus) || CONSTEXPR_ENUM_FLAGS_OPERATIONS +MDBX_MAYBE_UNUSED static void static_checks(void) { + STATIC_ASSERT(MDBX_WORDBITS == sizeof(void *) * CHAR_BIT); + STATIC_ASSERT(UINT64_C(0x80000000) == (uint32_t)ENV_FATAL_ERROR); + STATIC_ASSERT_MSG(INT16_MAX - CORE_DBS == MDBX_MAX_DBI, + "Oops, MDBX_MAX_DBI or CORE_DBS?"); + STATIC_ASSERT_MSG((unsigned)(MDBX_DB_ACCEDE | MDBX_CREATE) == + ((DB_USABLE_FLAGS | DB_INTERNAL_FLAGS) & + (ENV_USABLE_FLAGS | ENV_INTERNAL_FLAGS)), + "Oops, some flags overlapped or wrong"); + STATIC_ASSERT_MSG((DB_INTERNAL_FLAGS & DB_USABLE_FLAGS) == 0, + "Oops, some flags overlapped or wrong"); + STATIC_ASSERT_MSG((DB_PERSISTENT_FLAGS & ~DB_USABLE_FLAGS) == 0, + "Oops, some flags overlapped or wrong"); + STATIC_ASSERT(DB_PERSISTENT_FLAGS <= 
UINT8_MAX); + STATIC_ASSERT_MSG((ENV_INTERNAL_FLAGS & ENV_USABLE_FLAGS) == 0, + "Oops, some flags overlapped or wrong"); - int rc = MDBX_SUCCESS; - if (likely(env->me_pid)) - rc = rthc_drown(env); + STATIC_ASSERT_MSG( + (txn_state_flags & (txn_rw_begin_flags | txn_ro_begin_flags)) == 0, + "Oops, some txn flags overlapped or wrong"); + STATIC_ASSERT_MSG( + ((txn_rw_begin_flags | txn_ro_begin_flags | txn_state_flags) & + txn_shrink_allowed) == 0, + "Oops, some txn flags overlapped or wrong"); - for (size_t i = 0; i < rthc_count; ++i) { - if (rthc_table[i].env == env) { - if (--rthc_count > 0) - rthc_table[i] = rthc_table[rthc_count]; - else if (rthc_table != rthc_table_static) { - void *tmp = rthc_table; - rthc_table = rthc_table_static; - rthc_limit = RTHC_INITIAL_LIMIT; - osal_memory_barrier(); - osal_free(tmp); - } - break; - } - } + STATIC_ASSERT(sizeof(reader_slot_t) == 32); +#if MDBX_LOCKING > 0 + STATIC_ASSERT(offsetof(lck_t, wrt_lock) % MDBX_CACHELINE_SIZE == 0); + STATIC_ASSERT(offsetof(lck_t, rdt_lock) % MDBX_CACHELINE_SIZE == 0); +#else + STATIC_ASSERT(offsetof(lck_t, cached_oldest) % MDBX_CACHELINE_SIZE == 0); + STATIC_ASSERT(offsetof(lck_t, rdt_length) % MDBX_CACHELINE_SIZE == 0); +#endif /* MDBX_LOCKING */ + STATIC_ASSERT(offsetof(lck_t, rdt) % MDBX_CACHELINE_SIZE == 0); - TRACE("<<< %p, key %zu, rthc_count %u, rthc_limit %u", - __Wpedantic_format_voidptr(env), (uintptr_t)env->me_txkey, rthc_count, - rthc_limit); - return rc; +#if FLEXIBLE_ARRAY_MEMBERS + STATIC_ASSERT(NODESIZE == offsetof(node_t, payload)); + STATIC_ASSERT(PAGEHDRSZ == offsetof(page_t, entries)); +#endif /* FLEXIBLE_ARRAY_MEMBERS */ + STATIC_ASSERT(sizeof(clc_t) == 3 * sizeof(void *)); + STATIC_ASSERT(sizeof(kvx_t) == 8 * sizeof(void *)); + +#if MDBX_WORDBITS == 64 +#define KVX_SIZE_LN2 6 +#else +#define KVX_SIZE_LN2 5 +#endif + STATIC_ASSERT(sizeof(kvx_t) == (1u << KVX_SIZE_LN2)); } +#endif /* Disabled for MSVC 19.0 (VisualStudio 2015) */ 
-//------------------------------------------------------------------------------ +/******************************************************************************/ -MDBX_NOTHROW_CONST_FUNCTION static uint64_t rrxmrrxmsx_0(uint64_t v) { - /* Pelle Evensen's mixer, https://bit.ly/2HOfynt */ - v ^= (v << 39 | v >> 25) ^ (v << 14 | v >> 50); - v *= UINT64_C(0xA24BAED4963EE407); - v ^= (v << 40 | v >> 24) ^ (v << 15 | v >> 49); - v *= UINT64_C(0x9FB21C651E98DF25); - return v ^ v >> 28; + + + +/* valid flags for mdbx_node_add() */ +#define NODE_ADD_FLAGS (N_DUPDATA | N_SUBDATA | MDBX_RESERVE | MDBX_APPEND) + +/* Get the page number pointed to by a branch node */ +MDBX_NOTHROW_PURE_FUNCTION static inline pgno_t +node_pgno(const node_t *const __restrict node) { + pgno_t pgno = UNALIGNED_PEEK_32(node, node_t, child_pgno); + return pgno; } -static int uniq_peek(const osal_mmap_t *pending, osal_mmap_t *scan) { - int rc; - uint64_t bait; - MDBX_lockinfo *const pending_lck = pending->lck; - MDBX_lockinfo *const scan_lck = scan->lck; - if (pending_lck) { - bait = atomic_load64(&pending_lck->mti_bait_uniqueness, mo_AcquireRelease); - rc = MDBX_SUCCESS; - } else { - bait = 0 /* hush MSVC warning */; - rc = osal_msync(scan, 0, sizeof(MDBX_lockinfo), MDBX_SYNC_DATA); - if (rc == MDBX_SUCCESS) - rc = osal_pread(pending->fd, &bait, sizeof(scan_lck->mti_bait_uniqueness), - offsetof(MDBX_lockinfo, mti_bait_uniqueness)); - } - if (likely(rc == MDBX_SUCCESS) && - bait == atomic_load64(&scan_lck->mti_bait_uniqueness, mo_AcquireRelease)) - rc = MDBX_RESULT_TRUE; +/* Set the page number in a branch node */ +static inline void node_set_pgno(node_t *const __restrict node, pgno_t pgno) { + assert(pgno >= MIN_PAGENO && pgno <= MAX_PAGENO); - TRACE("uniq-peek: %s, bait 0x%016" PRIx64 ",%s rc %d", - pending_lck ? "mem" : "file", bait, - (rc == MDBX_RESULT_TRUE) ? " found," : (rc ? 
" FAILED," : ""), rc); - return rc; + UNALIGNED_POKE_32(node, node_t, child_pgno, (uint32_t)pgno); } -static int uniq_poke(const osal_mmap_t *pending, osal_mmap_t *scan, - uint64_t *abra) { - if (*abra == 0) { - const uintptr_t tid = osal_thread_self(); - uintptr_t uit = 0; - memcpy(&uit, &tid, (sizeof(tid) < sizeof(uit)) ? sizeof(tid) : sizeof(uit)); - *abra = rrxmrrxmsx_0(osal_monotime() + UINT64_C(5873865991930747) * uit); - } - const uint64_t cadabra = - rrxmrrxmsx_0(*abra + UINT64_C(7680760450171793) * (unsigned)osal_getpid()) - << 24 | - *abra >> 40; - MDBX_lockinfo *const scan_lck = scan->lck; - atomic_store64(&scan_lck->mti_bait_uniqueness, cadabra, mo_AcquireRelease); - *abra = *abra * UINT64_C(6364136223846793005) + 1; - return uniq_peek(pending, scan); +/* Get the size of the data in a leaf node */ +MDBX_NOTHROW_PURE_FUNCTION static inline size_t +node_ds(const node_t *const __restrict node) { + return UNALIGNED_PEEK_32(node, node_t, dsize); } -__cold static int rthc_uniq_check(const osal_mmap_t *pending, - MDBX_env **found) { - *found = nullptr; - uint64_t salt = 0; - for (size_t i = 0; i < rthc_count; ++i) { - MDBX_env *const scan = rthc_table[i].env; - if (!scan->me_lck_mmap.lck || &scan->me_lck_mmap == pending) - continue; - int err = atomic_load64(&scan->me_lck_mmap.lck->mti_bait_uniqueness, - mo_AcquireRelease) - ? uniq_peek(pending, &scan->me_lck_mmap) - : uniq_poke(pending, &scan->me_lck_mmap, &salt); - if (err == MDBX_ENODATA) { - uint64_t length = 0; - if (likely(osal_filesize(pending->fd, &length) == MDBX_SUCCESS && - length == 0)) { - /* LY: skip checking since LCK-file is empty, i.e. just created. 
*/ - DEBUG("%s", "unique (new/empty lck)"); - return MDBX_SUCCESS; - } - } - if (err == MDBX_RESULT_TRUE) - err = uniq_poke(pending, &scan->me_lck_mmap, &salt); - if (err == MDBX_RESULT_TRUE) { - (void)osal_msync(&scan->me_lck_mmap, 0, sizeof(MDBX_lockinfo), - MDBX_SYNC_KICK); - err = uniq_poke(pending, &scan->me_lck_mmap, &salt); - } - if (err == MDBX_RESULT_TRUE) { - err = uniq_poke(pending, &scan->me_lck_mmap, &salt); - *found = scan; - DEBUG("found %p", __Wpedantic_format_voidptr(*found)); - return MDBX_SUCCESS; - } - if (unlikely(err != MDBX_SUCCESS)) { - DEBUG("failed rc %d", err); - return err; - } - } +/* Set the size of the data for a leaf node */ +static inline void node_set_ds(node_t *const __restrict node, size_t size) { + assert(size < INT_MAX); + UNALIGNED_POKE_32(node, node_t, dsize, (uint32_t)size); +} - DEBUG("%s", "unique"); - return MDBX_SUCCESS; +/* The size of a key in a node */ +MDBX_NOTHROW_PURE_FUNCTION static inline size_t +node_ks(const node_t *const __restrict node) { + return UNALIGNED_PEEK_16(node, node_t, ksize); } -/*------------------------------------------------------------------------------ - * LY: State of the art quicksort-based sorting, with internal stack - * and network-sort for small chunks. - * Thanks to John M. Gamble for the http://pages.ripco.net/~jgamble/nw.html */ +/* Set the size of the key for a leaf node */ +static inline void node_set_ks(node_t *const __restrict node, size_t size) { + assert(size < INT16_MAX); + UNALIGNED_POKE_16(node, node_t, ksize, (uint16_t)size); +} -#if MDBX_HAVE_CMOV -#define SORT_CMP_SWAP(TYPE, CMP, a, b) \ - do { \ - const TYPE swap_tmp = (a); \ - const bool swap_cmp = expect_with_probability(CMP(swap_tmp, b), 0, .5); \ - (a) = swap_cmp ? swap_tmp : b; \ - (b) = swap_cmp ? 
b : swap_tmp; \ - } while (0) +MDBX_NOTHROW_PURE_FUNCTION static inline uint8_t +node_flags(const node_t *const __restrict node) { + return UNALIGNED_PEEK_8(node, node_t, flags); +} + +static inline void node_set_flags(node_t *const __restrict node, + uint8_t flags) { + UNALIGNED_POKE_8(node, node_t, flags, flags); +} + +/* Address of the key for the node */ +MDBX_NOTHROW_PURE_FUNCTION static inline void * +node_key(const node_t *const __restrict node) { + return ptr_disp(node, NODESIZE); +} + +/* Address of the data for a node */ +MDBX_NOTHROW_PURE_FUNCTION static inline void * +node_data(const node_t *const __restrict node) { + return ptr_disp(node_key(node), node_ks(node)); +} + +/* Size of a node in a leaf page with a given key and data. + * This is node header plus key plus data size. */ +MDBX_NOTHROW_CONST_FUNCTION static inline size_t +node_size_len(const size_t key_len, const size_t value_len) { + return NODESIZE + EVEN_CEIL(key_len + value_len); +} +MDBX_NOTHROW_PURE_FUNCTION static inline size_t +node_size(const MDBX_val *key, const MDBX_val *value) { + return node_size_len(key ? key->iov_len : 0, value ? 
value->iov_len : 0); +} + +MDBX_NOTHROW_PURE_FUNCTION static inline pgno_t +node_largedata_pgno(const node_t *const __restrict node) { + assert(node_flags(node) & N_BIGDATA); + return peek_pgno(node_data(node)); +} + +MDBX_INTERNAL int __must_check_result node_read_bigdata(MDBX_cursor *mc, + const node_t *node, + MDBX_val *data, + const page_t *mp); + +static inline int __must_check_result node_read(MDBX_cursor *mc, + const node_t *node, + MDBX_val *data, + const page_t *mp) { + data->iov_len = node_ds(node); + data->iov_base = node_data(node); + if (likely(node_flags(node) != N_BIGDATA)) + return MDBX_SUCCESS; + return node_read_bigdata(mc, node, data, mp); +} + +/*----------------------------------------------------------------------------*/ + +MDBX_INTERNAL nsr_t node_search(MDBX_cursor *mc, const MDBX_val *key); + +MDBX_INTERNAL int __must_check_result node_add_branch(MDBX_cursor *mc, + size_t indx, + const MDBX_val *key, + pgno_t pgno); + +MDBX_INTERNAL int __must_check_result node_add_leaf(MDBX_cursor *mc, + size_t indx, + const MDBX_val *key, + MDBX_val *data, + unsigned flags); + +MDBX_INTERNAL int __must_check_result node_add_dupfix(MDBX_cursor *mc, + size_t indx, + const MDBX_val *key); + +MDBX_INTERNAL void node_del(MDBX_cursor *mc, size_t ksize); + +MDBX_INTERNAL node_t *node_shrink(page_t *mp, size_t indx, node_t *node); + + + + +MDBX_MAYBE_UNUSED MDBX_NOTHROW_CONST_FUNCTION MDBX_INTERNAL size_t +dbi_bitmap_ctz_fallback(const MDBX_txn *txn, intptr_t bmi); + +#if MDBX_ENABLE_DBI_SPARSE + +static inline size_t dbi_bitmap_ctz(const MDBX_txn *txn, intptr_t bmi) { + tASSERT(txn, bmi > 0); + STATIC_ASSERT(sizeof(bmi) >= sizeof(txn->dbi_sparse[0])); +#if __GNUC_PREREQ(4, 1) || __has_builtin(__builtin_ctzl) + if (sizeof(txn->dbi_sparse[0]) <= sizeof(int)) + return __builtin_ctz((int)bmi); + if (sizeof(txn->dbi_sparse[0]) == sizeof(long)) + return __builtin_ctzl((long)bmi); +#if (defined(__SIZEOF_LONG_LONG__) && __SIZEOF_LONG_LONG__ == 8) || \ + 
__has_builtin(__builtin_ctzll) + return __builtin_ctzll(bmi); +#endif /* have(long long) && long long == uint64_t */ +#endif /* GNU C */ + +#if defined(_MSC_VER) + unsigned long index; + if (sizeof(txn->dbi_sparse[0]) > 4) { +#if defined(_M_AMD64) || defined(_M_ARM64) || defined(_M_X64) + _BitScanForward64(&index, bmi); + return index; #else -#define SORT_CMP_SWAP(TYPE, CMP, a, b) \ - do \ - if (expect_with_probability(!CMP(a, b), 0, .5)) { \ - const TYPE swap_tmp = (a); \ - (a) = (b); \ - (b) = swap_tmp; \ - } \ - while (0) + if (bmi > UINT32_MAX) { + _BitScanForward(&index, (uint32_t)((uint64_t)bmi >> 32)); + return index; + } #endif + } + _BitScanForward(&index, (uint32_t)bmi); + return index; +#endif /* MSVC */ -// 3 comparators, 3 parallel operations -// o-----^--^--o -// | | -// o--^--|--v--o -// | | -// o--v--v-----o -// -// [[1,2]] -// [[0,2]] -// [[0,1]] -#define SORT_NETWORK_3(TYPE, CMP, begin) \ - do { \ - SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[2]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[2]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[1]); \ - } while (0) + return dbi_bitmap_ctz_fallback(txn, bmi); +} -// 5 comparators, 3 parallel operations -// o--^--^--------o -// | | -// o--v--|--^--^--o -// | | | -// o--^--v--|--v--o -// | | -// o--v-----v-----o -// -// [[0,1],[2,3]] -// [[0,2],[1,3]] -// [[1,2]] -#define SORT_NETWORK_4(TYPE, CMP, begin) \ - do { \ - SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[1]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[2], begin[3]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[2]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[3]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[2]); \ - } while (0) +/* LY: Макрос целенаправленно сделан с одним циклом, чтобы сохранить возможность + * использования оператора break */ +#define TXN_FOREACH_DBI_FROM(TXN, I, FROM) \ + for (size_t bitmap_chunk = CHAR_BIT * sizeof(TXN->dbi_sparse[0]), \ + bitmap_item = TXN->dbi_sparse[0] >> FROM, I = FROM; \ + I < TXN->n_dbi; ++I) \ + if (bitmap_item == 0) 
{ \ + I = (I - 1) | (bitmap_chunk - 1); \ + bitmap_item = TXN->dbi_sparse[(1 + I) / bitmap_chunk]; \ + if (!bitmap_item) \ + I += bitmap_chunk; \ + continue; \ + } else if ((bitmap_item & 1) == 0) { \ + size_t bitmap_skip = dbi_bitmap_ctz(txn, bitmap_item); \ + bitmap_item >>= bitmap_skip; \ + I += bitmap_skip - 1; \ + continue; \ + } else if (bitmap_item >>= 1, TXN->dbi_state[I]) -// 9 comparators, 5 parallel operations -// o--^--^-----^-----------o -// | | | -// o--|--|--^--v-----^--^--o -// | | | | | -// o--|--v--|--^--^--|--v--o -// | | | | | -// o--|-----v--|--v--|--^--o -// | | | | -// o--v--------v-----v--v--o -// -// [[0,4],[1,3]] -// [[0,2]] -// [[2,4],[0,1]] -// [[2,3],[1,4]] -// [[1,2],[3,4]] -#define SORT_NETWORK_5(TYPE, CMP, begin) \ - do { \ - SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[4]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[3]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[2]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[2], begin[4]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[1]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[2], begin[3]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[4]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[2]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[3], begin[4]); \ - } while (0) +#else -// 12 comparators, 6 parallel operations -// o-----^--^--^-----------------o -// | | | -// o--^--|--v--|--^--------^-----o -// | | | | | -// o--v--v-----|--|--^--^--|--^--o -// | | | | | | -// o-----^--^--v--|--|--|--v--v--o -// | | | | | -// o--^--|--v-----v--|--v--------o -// | | | -// o--v--v-----------v-----------o -// -// [[1,2],[4,5]] -// [[0,2],[3,5]] -// [[0,1],[3,4],[2,5]] -// [[0,3],[1,4]] -// [[2,4],[1,3]] -// [[2,3]] -#define SORT_NETWORK_6(TYPE, CMP, begin) \ - do { \ - SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[2]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[4], begin[5]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[2]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[3], begin[5]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[1]); \ - SORT_CMP_SWAP(TYPE, 
CMP, begin[3], begin[4]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[2], begin[5]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[3]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[4]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[2], begin[4]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[3]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[2], begin[3]); \ - } while (0) - -// 16 comparators, 6 parallel operations -// o--^--------^-----^-----------------o -// | | | -// o--|--^-----|--^--v--------^--^-----o -// | | | | | | -// o--|--|--^--v--|--^-----^--|--v-----o -// | | | | | | | -// o--|--|--|-----v--|--^--v--|--^--^--o -// | | | | | | | | -// o--v--|--|--^-----v--|--^--v--|--v--o -// | | | | | | -// o-----v--|--|--------v--v-----|--^--o -// | | | | -// o--------v--v-----------------v--v--o -// -// [[0,4],[1,5],[2,6]] -// [[0,2],[1,3],[4,6]] -// [[2,4],[3,5],[0,1]] -// [[2,3],[4,5]] -// [[1,4],[3,6]] -// [[1,2],[3,4],[5,6]] -#define SORT_NETWORK_7(TYPE, CMP, begin) \ - do { \ - SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[4]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[5]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[2], begin[6]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[2]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[3]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[4], begin[6]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[2], begin[4]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[3], begin[5]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[1]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[2], begin[3]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[4], begin[5]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[4]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[3], begin[6]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[2]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[3], begin[4]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[5], begin[6]); \ - } while (0) +#define TXN_FOREACH_DBI_FROM(TXN, I, SKIP) \ + for (size_t I = SKIP; I < TXN->n_dbi; ++I) \ + if (TXN->dbi_state[I]) -// 19 comparators, 6 parallel operations -// o--^--------^-----^-----------------o -// | | | -// 
o--|--^-----|--^--v--------^--^-----o -// | | | | | | -// o--|--|--^--v--|--^-----^--|--v-----o -// | | | | | | | -// o--|--|--|--^--v--|--^--v--|--^--^--o -// | | | | | | | | | -// o--v--|--|--|--^--v--|--^--v--|--v--o -// | | | | | | | -// o-----v--|--|--|--^--v--v-----|--^--o -// | | | | | | -// o--------v--|--v--|--^--------v--v--o -// | | | -// o-----------v-----v--v--------------o -// -// [[0,4],[1,5],[2,6],[3,7]] -// [[0,2],[1,3],[4,6],[5,7]] -// [[2,4],[3,5],[0,1],[6,7]] -// [[2,3],[4,5]] -// [[1,4],[3,6]] -// [[1,2],[3,4],[5,6]] -#define SORT_NETWORK_8(TYPE, CMP, begin) \ - do { \ - SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[4]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[5]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[2], begin[6]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[3], begin[7]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[2]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[3]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[4], begin[6]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[5], begin[7]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[2], begin[4]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[3], begin[5]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[1]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[6], begin[7]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[2], begin[3]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[4], begin[5]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[4]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[3], begin[6]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[2]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[3], begin[4]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[5], begin[6]); \ - } while (0) +#endif /* MDBX_ENABLE_DBI_SPARSE */ -#define SORT_INNER(TYPE, CMP, begin, end, len) \ - switch (len) { \ - default: \ - assert(false); \ - __unreachable(); \ - case 0: \ - case 1: \ - break; \ - case 2: \ - SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[1]); \ - break; \ - case 3: \ - SORT_NETWORK_3(TYPE, CMP, begin); \ - break; \ - case 4: \ - SORT_NETWORK_4(TYPE, CMP, begin); \ - break; \ - case 5: \ - SORT_NETWORK_5(TYPE, CMP, begin); \ 
- break; \ - case 6: \ - SORT_NETWORK_6(TYPE, CMP, begin); \ - break; \ - case 7: \ - SORT_NETWORK_7(TYPE, CMP, begin); \ - break; \ - case 8: \ - SORT_NETWORK_8(TYPE, CMP, begin); \ - break; \ - } +#define TXN_FOREACH_DBI_ALL(TXN, I) TXN_FOREACH_DBI_FROM(TXN, I, 0) +#define TXN_FOREACH_DBI_USER(TXN, I) TXN_FOREACH_DBI_FROM(TXN, I, CORE_DBS) -#define SORT_SWAP(TYPE, a, b) \ - do { \ - const TYPE swap_tmp = (a); \ - (a) = (b); \ - (b) = swap_tmp; \ - } while (0) +MDBX_INTERNAL int dbi_import(MDBX_txn *txn, const size_t dbi); -#define SORT_PUSH(low, high) \ - do { \ - top->lo = (low); \ - top->hi = (high); \ - ++top; \ - } while (0) +struct dbi_snap_result { + uint32_t sequence; + unsigned flags; +}; +MDBX_INTERNAL struct dbi_snap_result dbi_snap(const MDBX_env *env, + const size_t dbi); -#define SORT_POP(low, high) \ - do { \ - --top; \ - low = top->lo; \ - high = top->hi; \ - } while (0) +MDBX_INTERNAL int dbi_update(MDBX_txn *txn, int keep); -#define SORT_IMPL(NAME, EXPECT_LOW_CARDINALITY_OR_PRESORTED, TYPE, CMP) \ - \ - static __inline bool NAME##_is_sorted(const TYPE *first, const TYPE *last) { \ - while (++first <= last) \ - if (expect_with_probability(CMP(first[0], first[-1]), 1, .1)) \ - return false; \ - return true; \ - } \ - \ - typedef struct { \ - TYPE *lo, *hi; \ - } NAME##_stack; \ - \ - __hot static void NAME(TYPE *const __restrict begin, \ - TYPE *const __restrict end) { \ - NAME##_stack stack[sizeof(size_t) * CHAR_BIT], *__restrict top = stack; \ - \ - TYPE *__restrict hi = end - 1; \ - TYPE *__restrict lo = begin; \ - while (true) { \ - const ptrdiff_t len = hi - lo; \ - if (len < 8) { \ - SORT_INNER(TYPE, CMP, lo, hi + 1, len + 1); \ - if (unlikely(top == stack)) \ - break; \ - SORT_POP(lo, hi); \ - continue; \ - } \ - \ - TYPE *__restrict mid = lo + (len >> 1); \ - SORT_CMP_SWAP(TYPE, CMP, *lo, *mid); \ - SORT_CMP_SWAP(TYPE, CMP, *mid, *hi); \ - SORT_CMP_SWAP(TYPE, CMP, *lo, *mid); \ - \ - TYPE *right = hi - 1; \ - TYPE *left = lo + 1; \ - while 
(1) { \ - while (expect_with_probability(CMP(*left, *mid), 0, .5)) \ - ++left; \ - while (expect_with_probability(CMP(*mid, *right), 0, .5)) \ - --right; \ - if (unlikely(left > right)) { \ - if (EXPECT_LOW_CARDINALITY_OR_PRESORTED) { \ - if (NAME##_is_sorted(lo, right)) \ - lo = right + 1; \ - if (NAME##_is_sorted(left, hi)) \ - hi = left; \ - } \ - break; \ - } \ - SORT_SWAP(TYPE, *left, *right); \ - mid = (mid == left) ? right : (mid == right) ? left : mid; \ - ++left; \ - --right; \ - } \ - \ - if (right - lo > hi - left) { \ - SORT_PUSH(lo, right); \ - lo = left; \ - } else { \ - SORT_PUSH(left, hi); \ - hi = right; \ - } \ - } \ - \ - if (AUDIT_ENABLED()) { \ - for (TYPE *scan = begin + 1; scan < end; ++scan) \ - assert(CMP(scan[-1], scan[0])); \ - } \ - } +static inline uint8_t dbi_state(const MDBX_txn *txn, const size_t dbi) { + STATIC_ASSERT( + (int)DBI_DIRTY == MDBX_DBI_DIRTY && (int)DBI_STALE == MDBX_DBI_STALE && + (int)DBI_FRESH == MDBX_DBI_FRESH && (int)DBI_CREAT == MDBX_DBI_CREAT); -/*------------------------------------------------------------------------------ - * LY: radix sort for large chunks */ +#if MDBX_ENABLE_DBI_SPARSE + const size_t bitmap_chunk = CHAR_BIT * sizeof(txn->dbi_sparse[0]); + const size_t bitmap_indx = dbi / bitmap_chunk; + const size_t bitmap_mask = (size_t)1 << dbi % bitmap_chunk; + return likely(dbi < txn->n_dbi && + (txn->dbi_sparse[bitmap_indx] & bitmap_mask) != 0) + ? txn->dbi_state[dbi] + : 0; +#else + return likely(dbi < txn->n_dbi) ? 
txn->dbi_state[dbi] : 0; +#endif /* MDBX_ENABLE_DBI_SPARSE */ +} -#define RADIXSORT_IMPL(NAME, TYPE, EXTRACT_KEY, BUFFER_PREALLOCATED, END_GAP) \ - \ - __hot static bool NAME##_radixsort(TYPE *const begin, const size_t length) { \ - TYPE *tmp; \ - if (BUFFER_PREALLOCATED) { \ - tmp = begin + length + END_GAP; \ - /* memset(tmp, 0xDeadBeef, sizeof(TYPE) * length); */ \ - } else { \ - tmp = osal_malloc(sizeof(TYPE) * length); \ - if (unlikely(!tmp)) \ - return false; \ - } \ - \ - size_t key_shift = 0, key_diff_mask; \ - do { \ - struct { \ - pgno_t a[256], b[256]; \ - } counters; \ - memset(&counters, 0, sizeof(counters)); \ - \ - key_diff_mask = 0; \ - size_t prev_key = EXTRACT_KEY(begin) >> key_shift; \ - TYPE *r = begin, *end = begin + length; \ - do { \ - const size_t key = EXTRACT_KEY(r) >> key_shift; \ - counters.a[key & 255]++; \ - counters.b[(key >> 8) & 255]++; \ - key_diff_mask |= prev_key ^ key; \ - prev_key = key; \ - } while (++r != end); \ - \ - pgno_t ta = 0, tb = 0; \ - for (size_t i = 0; i < 256; ++i) { \ - const pgno_t ia = counters.a[i]; \ - counters.a[i] = ta; \ - ta += ia; \ - const pgno_t ib = counters.b[i]; \ - counters.b[i] = tb; \ - tb += ib; \ - } \ - \ - r = begin; \ - do { \ - const size_t key = EXTRACT_KEY(r) >> key_shift; \ - tmp[counters.a[key & 255]++] = *r; \ - } while (++r != end); \ - \ - if (unlikely(key_diff_mask < 256)) { \ - memcpy(begin, tmp, ptr_dist(end, begin)); \ - break; \ - } \ - end = (r = tmp) + length; \ - do { \ - const size_t key = EXTRACT_KEY(r) >> key_shift; \ - begin[counters.b[(key >> 8) & 255]++] = *r; \ - } while (++r != end); \ - \ - key_shift += 16; \ - } while (key_diff_mask >> 16); \ - \ - if (!(BUFFER_PREALLOCATED)) \ - osal_free(tmp); \ - return true; \ - } +static inline bool dbi_changed(const MDBX_txn *txn, const size_t dbi) { + const MDBX_env *const env = txn->env; + eASSERT(env, dbi_state(txn, dbi) & DBI_LINDO); + const uint32_t snap_seq = + atomic_load32(&env->dbi_seqs[dbi], mo_AcquireRelease); + 
return snap_seq != txn->dbi_seqs[dbi]; +} -/*------------------------------------------------------------------------------ - * LY: Binary search */ +static inline int dbi_check(const MDBX_txn *txn, const size_t dbi) { + const uint8_t state = dbi_state(txn, dbi); + if (likely((state & DBI_LINDO) != 0 && !dbi_changed(txn, dbi))) + return (state & DBI_VALID) ? MDBX_SUCCESS : MDBX_BAD_DBI; -#if defined(__clang__) && __clang_major__ > 4 && defined(__ia32__) -#define WORKAROUND_FOR_CLANG_OPTIMIZER_BUG(size, flag) \ - do \ - __asm __volatile("" \ - : "+r"(size) \ - : "r" /* the `b` constraint is more suitable here, but \ - cause CLANG to allocate and push/pop an one more \ - register, so using the `r` which avoids this. */ \ - (flag)); \ - while (0) -#else -#define WORKAROUND_FOR_CLANG_OPTIMIZER_BUG(size, flag) \ - do { \ - /* nope for non-clang or non-x86 */; \ - } while (0) -#endif /* Workaround for CLANG */ + /* Медленный путь: ленивая до-инициализацяи и импорт */ + return dbi_import((MDBX_txn *)txn, dbi); +} -#define BINARY_SEARCH_STEP(TYPE_LIST, CMP, it, size, key) \ - do { \ - } while (0) +static inline uint32_t dbi_seq_next(const MDBX_env *const env, size_t dbi) { + uint32_t v = atomic_load32(&env->dbi_seqs[dbi], mo_AcquireRelease) + 1; + return v ? v : 1; +} -#define SEARCH_IMPL(NAME, TYPE_LIST, TYPE_ARG, CMP) \ - static __always_inline const TYPE_LIST *NAME( \ - const TYPE_LIST *it, size_t length, const TYPE_ARG item) { \ - const TYPE_LIST *const begin = it, *const end = begin + length; \ - \ - if (MDBX_HAVE_CMOV) \ - do { \ - /* Адаптивно-упрощенный шаг двоичного поиска: \ - * - без переходов при наличии cmov или аналога; \ - * - допускает лишние итерации; \ - * - но ищет пока size > 2, что требует дозавершения поиска \ - * среди остающихся 0-1-2 элементов. 
*/ \ - const TYPE_LIST *const middle = it + (length >> 1); \ - length = (length + 1) >> 1; \ - const bool flag = expect_with_probability(CMP(*middle, item), 0, .5); \ - WORKAROUND_FOR_CLANG_OPTIMIZER_BUG(length, flag); \ - it = flag ? middle : it; \ - } while (length > 2); \ - else \ - while (length > 2) { \ - /* Вариант с использованием условного перехода. Основное отличие в \ - * том, что при "не равно" (true от компаратора) переход делается на 1 \ - * ближе к концу массива. Алгоритмически это верно и обеспечивает \ - * чуть-чуть более быструю сходимость, но зато требует больше \ - * вычислений при true от компаратора. Также ВАЖНО(!) не допускается \ - * спекулятивное выполнение при size == 0. */ \ - const TYPE_LIST *const middle = it + (length >> 1); \ - length = (length + 1) >> 1; \ - const bool flag = expect_with_probability(CMP(*middle, item), 0, .5); \ - if (flag) { \ - it = middle + 1; \ - length -= 1; \ - } \ - } \ - it += length > 1 && expect_with_probability(CMP(*it, item), 0, .5); \ - it += length > 0 && expect_with_probability(CMP(*it, item), 0, .5); \ - \ - if (AUDIT_ENABLED()) { \ - for (const TYPE_LIST *scan = begin; scan < it; ++scan) \ - assert(CMP(*scan, item)); \ - for (const TYPE_LIST *scan = it; scan < end; ++scan) \ - assert(!CMP(*scan, item)); \ - (void)begin, (void)end; \ - } \ - \ - return it; \ - } +MDBX_INTERNAL int dbi_open(MDBX_txn *txn, const MDBX_val *const name, + unsigned user_flags, MDBX_dbi *dbi, + MDBX_cmp_func *keycmp, MDBX_cmp_func *datacmp); -/*----------------------------------------------------------------------------*/ +MDBX_INTERNAL int dbi_bind(MDBX_txn *txn, const size_t dbi, unsigned user_flags, + MDBX_cmp_func *keycmp, MDBX_cmp_func *datacmp); -static __always_inline size_t pnl_size2bytes(size_t size) { - assert(size > 0 && size <= MDBX_PGL_LIMIT); -#if MDBX_PNL_PREALLOC_FOR_RADIXSORT - size += size; -#endif /* MDBX_PNL_PREALLOC_FOR_RADIXSORT */ - STATIC_ASSERT(MDBX_ASSUME_MALLOC_OVERHEAD + - (MDBX_PGL_LIMIT * 
(MDBX_PNL_PREALLOC_FOR_RADIXSORT + 1) + - MDBX_PNL_GRANULATE + 3) * - sizeof(pgno_t) < - SIZE_MAX / 4 * 3); - size_t bytes = - ceil_powerof2(MDBX_ASSUME_MALLOC_OVERHEAD + sizeof(pgno_t) * (size + 3), - MDBX_PNL_GRANULATE * sizeof(pgno_t)) - - MDBX_ASSUME_MALLOC_OVERHEAD; - return bytes; -} -static __always_inline pgno_t pnl_bytes2size(const size_t bytes) { - size_t size = bytes / sizeof(pgno_t); - assert(size > 3 && size <= MDBX_PGL_LIMIT + /* alignment gap */ 65536); - size -= 3; -#if MDBX_PNL_PREALLOC_FOR_RADIXSORT - size >>= 1; -#endif /* MDBX_PNL_PREALLOC_FOR_RADIXSORT */ - return (pgno_t)size; -} -static MDBX_PNL pnl_alloc(size_t size) { - size_t bytes = pnl_size2bytes(size); - MDBX_PNL pl = osal_malloc(bytes); - if (likely(pl)) { -#if __GLIBC_PREREQ(2, 12) || defined(__FreeBSD__) || defined(malloc_usable_size) - bytes = malloc_usable_size(pl); -#endif /* malloc_usable_size */ - pl[0] = pnl_bytes2size(bytes); - assert(pl[0] >= size); - pl += 1; - *pl = 0; - } - return pl; -} -static void pnl_free(MDBX_PNL pl) { - if (likely(pl)) - osal_free(pl - 1); -} +MDBX_NOTHROW_CONST_FUNCTION MDBX_INTERNAL pgno_t pv2pages(uint16_t pv); -/* Shrink the PNL to the default size if it has grown larger */ -static void pnl_shrink(MDBX_PNL __restrict *__restrict ppl) { - assert(pnl_bytes2size(pnl_size2bytes(MDBX_PNL_INITIAL)) >= MDBX_PNL_INITIAL && - pnl_bytes2size(pnl_size2bytes(MDBX_PNL_INITIAL)) < - MDBX_PNL_INITIAL * 3 / 2); - assert(MDBX_PNL_GETSIZE(*ppl) <= MDBX_PGL_LIMIT && - MDBX_PNL_ALLOCLEN(*ppl) >= MDBX_PNL_GETSIZE(*ppl)); - MDBX_PNL_SETSIZE(*ppl, 0); - if (unlikely(MDBX_PNL_ALLOCLEN(*ppl) > - MDBX_PNL_INITIAL * (MDBX_PNL_PREALLOC_FOR_RADIXSORT ? 
8 : 4) - - MDBX_CACHELINE_SIZE / sizeof(pgno_t))) { - size_t bytes = pnl_size2bytes(MDBX_PNL_INITIAL * 2); - MDBX_PNL pl = osal_realloc(*ppl - 1, bytes); - if (likely(pl)) { -#if __GLIBC_PREREQ(2, 12) || defined(__FreeBSD__) || defined(malloc_usable_size) - bytes = malloc_usable_size(pl); -#endif /* malloc_usable_size */ - *pl = pnl_bytes2size(bytes); - *ppl = pl + 1; - } - } -} +MDBX_NOTHROW_CONST_FUNCTION MDBX_INTERNAL uint16_t pages2pv(size_t pages); -/* Grow the PNL to the size growed to at least given size */ -static int pnl_reserve(MDBX_PNL __restrict *__restrict ppl, - const size_t wanna) { - const size_t allocated = MDBX_PNL_ALLOCLEN(*ppl); - assert(MDBX_PNL_GETSIZE(*ppl) <= MDBX_PGL_LIMIT && - MDBX_PNL_ALLOCLEN(*ppl) >= MDBX_PNL_GETSIZE(*ppl)); - if (likely(allocated >= wanna)) - return MDBX_SUCCESS; +MDBX_MAYBE_UNUSED MDBX_INTERNAL bool pv2pages_verify(void); - if (unlikely(wanna > /* paranoia */ MDBX_PGL_LIMIT)) { - ERROR("PNL too long (%zu > %zu)", wanna, (size_t)MDBX_PGL_LIMIT); - return MDBX_TXN_FULL; +/*------------------------------------------------------------------------------ + * Nodes, Keys & Values length limitation factors: + * + * BRANCH_NODE_MAX + * Branch-page must contain at least two nodes, within each a key and a child + * page number. But page can't be split if it contains less that 4 keys, + * i.e. a page should not overflow before adding the fourth key. Therefore, + * at least 3 branch-node should fit in the single branch-page. Further, the + * first node of a branch-page doesn't contain a key, i.e. the first node + * is always require space just for itself. Thus: + * PAGESPACE = pagesize - page_hdr_len; + * BRANCH_NODE_MAX = even_floor( + * (PAGESPACE - sizeof(indx_t) - NODESIZE) / (3 - 1) - sizeof(indx_t)); + * KEYLEN_MAX = BRANCH_NODE_MAX - node_hdr_len; + * + * LEAF_NODE_MAX + * Leaf-node must fit into single leaf-page, where a value could be placed on + * a large/overflow page. 
However, may require to insert a nearly page-sized + * node between two large nodes are already fill-up a page. In this case the + * page must be split to two if some pair of nodes fits on one page, or + * otherwise the page should be split to the THREE with a single node + * per each of ones. Such 1-into-3 page splitting is costly and complex since + * requires TWO insertion into the parent page, that could lead to split it + * and so on up to the root. Therefore double-splitting is avoided here and + * the maximum node size is half of a leaf page space: + * LEAF_NODE_MAX = even_floor(PAGESPACE / 2 - sizeof(indx_t)); + * DATALEN_NO_OVERFLOW = LEAF_NODE_MAX - NODESIZE - KEYLEN_MAX; + * + * - SubDatabase-node must fit into one leaf-page: + * SUBDB_NAME_MAX = LEAF_NODE_MAX - node_hdr_len - sizeof(tree_t); + * + * - Dupsort values itself are a keys in a dupsort-subdb and couldn't be longer + * than the KEYLEN_MAX. But dupsort node must not great than LEAF_NODE_MAX, + * since dupsort value couldn't be placed on a large/overflow page: + * DUPSORT_DATALEN_MAX = min(KEYLEN_MAX, + * max(DATALEN_NO_OVERFLOW, sizeof(tree_t)); + */ + +#define PAGESPACE(pagesize) ((pagesize) - PAGEHDRSZ) + +#define BRANCH_NODE_MAX(pagesize) \ + (EVEN_FLOOR((PAGESPACE(pagesize) - sizeof(indx_t) - NODESIZE) / (3 - 1) - \ + sizeof(indx_t))) + +#define LEAF_NODE_MAX(pagesize) \ + (EVEN_FLOOR(PAGESPACE(pagesize) / 2) - sizeof(indx_t)) + +#define MAX_GC1OVPAGE(pagesize) (PAGESPACE(pagesize) / sizeof(pgno_t) - 1) + +MDBX_NOTHROW_CONST_FUNCTION static inline size_t +keysize_max(size_t pagesize, MDBX_db_flags_t flags) { + assert(pagesize >= MDBX_MIN_PAGESIZE && pagesize <= MDBX_MAX_PAGESIZE && + is_powerof2(pagesize)); + STATIC_ASSERT(BRANCH_NODE_MAX(MDBX_MIN_PAGESIZE) - NODESIZE >= 8); + if (flags & MDBX_INTEGERKEY) + return 8 /* sizeof(uint64_t) */; + + const intptr_t max_branch_key = BRANCH_NODE_MAX(pagesize) - NODESIZE; + STATIC_ASSERT(LEAF_NODE_MAX(MDBX_MIN_PAGESIZE) - NODESIZE - + /* 
sizeof(uint64) as a key */ 8 > + sizeof(tree_t)); + if (flags & + (MDBX_DUPSORT | MDBX_DUPFIXED | MDBX_REVERSEDUP | MDBX_INTEGERDUP)) { + const intptr_t max_dupsort_leaf_key = + LEAF_NODE_MAX(pagesize) - NODESIZE - sizeof(tree_t); + return (max_branch_key < max_dupsort_leaf_key) ? max_branch_key + : max_dupsort_leaf_key; } + return max_branch_key; +} - const size_t size = (wanna + wanna - allocated < MDBX_PGL_LIMIT) - ? wanna + wanna - allocated - : MDBX_PGL_LIMIT; - size_t bytes = pnl_size2bytes(size); - MDBX_PNL pl = osal_realloc(*ppl - 1, bytes); - if (likely(pl)) { -#if __GLIBC_PREREQ(2, 12) || defined(__FreeBSD__) || defined(malloc_usable_size) - bytes = malloc_usable_size(pl); -#endif /* malloc_usable_size */ - *pl = pnl_bytes2size(bytes); - assert(*pl >= wanna); - *ppl = pl + 1; - return MDBX_SUCCESS; +MDBX_NOTHROW_CONST_FUNCTION static inline size_t +env_keysize_max(const MDBX_env *env, MDBX_db_flags_t flags) { + size_t size_max; + if (flags & MDBX_INTEGERKEY) + size_max = 8 /* sizeof(uint64_t) */; + else { + const intptr_t max_branch_key = env->branch_nodemax - NODESIZE; + STATIC_ASSERT(LEAF_NODE_MAX(MDBX_MIN_PAGESIZE) - NODESIZE - + /* sizeof(uint64) as a key */ 8 > + sizeof(tree_t)); + if (flags & + (MDBX_DUPSORT | MDBX_DUPFIXED | MDBX_REVERSEDUP | MDBX_INTEGERDUP)) { + const intptr_t max_dupsort_leaf_key = + env->leaf_nodemax - NODESIZE - sizeof(tree_t); + size_max = (max_branch_key < max_dupsort_leaf_key) ? 
max_branch_key + : max_dupsort_leaf_key; + } else + size_max = max_branch_key; } - return MDBX_ENOMEM; + eASSERT(env, size_max == keysize_max(env->ps, flags)); + return size_max; } -/* Make room for num additional elements in an PNL */ -static __always_inline int __must_check_result -pnl_need(MDBX_PNL __restrict *__restrict ppl, size_t num) { - assert(MDBX_PNL_GETSIZE(*ppl) <= MDBX_PGL_LIMIT && - MDBX_PNL_ALLOCLEN(*ppl) >= MDBX_PNL_GETSIZE(*ppl)); - assert(num <= MDBX_PGL_LIMIT); - const size_t wanna = MDBX_PNL_GETSIZE(*ppl) + num; - return likely(MDBX_PNL_ALLOCLEN(*ppl) >= wanna) ? MDBX_SUCCESS - : pnl_reserve(ppl, wanna); +MDBX_NOTHROW_CONST_FUNCTION static inline size_t +keysize_min(MDBX_db_flags_t flags) { + return (flags & MDBX_INTEGERKEY) ? 4 /* sizeof(uint32_t) */ : 0; } -static __always_inline void pnl_xappend(__restrict MDBX_PNL pl, pgno_t pgno) { - assert(MDBX_PNL_GETSIZE(pl) < MDBX_PNL_ALLOCLEN(pl)); - if (AUDIT_ENABLED()) { - for (size_t i = MDBX_PNL_GETSIZE(pl); i > 0; --i) - assert(pgno != pl[i]); - } - *pl += 1; - MDBX_PNL_LAST(pl) = pgno; +MDBX_NOTHROW_CONST_FUNCTION static inline size_t +valsize_min(MDBX_db_flags_t flags) { + if (flags & MDBX_INTEGERDUP) + return 4 /* sizeof(uint32_t) */; + else if (flags & MDBX_DUPFIXED) + return sizeof(indx_t); + else + return 0; } -/* Append an pgno range onto an unsorted PNL */ -__always_inline static int __must_check_result pnl_append_range( - bool spilled, __restrict MDBX_PNL *ppl, pgno_t pgno, size_t n) { - assert(n > 0); - int rc = pnl_need(ppl, n); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; +MDBX_NOTHROW_CONST_FUNCTION static inline size_t +valsize_max(size_t pagesize, MDBX_db_flags_t flags) { + assert(pagesize >= MDBX_MIN_PAGESIZE && pagesize <= MDBX_MAX_PAGESIZE && + is_powerof2(pagesize)); - const MDBX_PNL pnl = *ppl; -#if MDBX_PNL_ASCENDING - size_t w = MDBX_PNL_GETSIZE(pnl); - do { - pnl[++w] = pgno; - pgno += spilled ? 
2 : 1; - } while (--n); - MDBX_PNL_SETSIZE(pnl, w); -#else - size_t w = MDBX_PNL_GETSIZE(pnl) + n; - MDBX_PNL_SETSIZE(pnl, w); - do { - pnl[w--] = pgno; - pgno += spilled ? 2 : 1; - } while (--n); -#endif + if (flags & MDBX_INTEGERDUP) + return 8 /* sizeof(uint64_t) */; - return MDBX_SUCCESS; + if (flags & (MDBX_DUPSORT | MDBX_DUPFIXED | MDBX_REVERSEDUP)) + return keysize_max(pagesize, 0); + + const unsigned page_ln2 = log2n_powerof2(pagesize); + const size_t hard = 0x7FF00000ul; + const size_t hard_pages = hard >> page_ln2; + STATIC_ASSERT(PAGELIST_LIMIT <= MAX_PAGENO); + const size_t pages_limit = PAGELIST_LIMIT / 4; + const size_t limit = + (hard_pages < pages_limit) ? hard : (pages_limit << page_ln2); + return (limit < MAX_MAPSIZE / 2) ? limit : MAX_MAPSIZE / 2; } -/* Append an pgno range into the sorted PNL */ -__hot static int __must_check_result pnl_insert_range(__restrict MDBX_PNL *ppl, - pgno_t pgno, size_t n) { - assert(n > 0); - int rc = pnl_need(ppl, n); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; +MDBX_NOTHROW_CONST_FUNCTION static inline size_t +env_valsize_max(const MDBX_env *env, MDBX_db_flags_t flags) { + size_t size_max; + if (flags & MDBX_INTEGERDUP) + size_max = 8 /* sizeof(uint64_t) */; + else if (flags & (MDBX_DUPSORT | MDBX_DUPFIXED | MDBX_REVERSEDUP)) + size_max = env_keysize_max(env, 0); + else { + const size_t hard = 0x7FF00000ul; + const size_t hard_pages = hard >> env->ps2ln; + STATIC_ASSERT(PAGELIST_LIMIT <= MAX_PAGENO); + const size_t pages_limit = PAGELIST_LIMIT / 4; + const size_t limit = + (hard_pages < pages_limit) ? hard : (pages_limit << env->ps2ln); + size_max = (limit < MAX_MAPSIZE / 2) ? 
limit : MAX_MAPSIZE / 2; + } + eASSERT(env, size_max == valsize_max(env->ps, flags)); + return size_max; +} - const MDBX_PNL pnl = *ppl; - size_t r = MDBX_PNL_GETSIZE(pnl), w = r + n; - MDBX_PNL_SETSIZE(pnl, w); - while (r && MDBX_PNL_DISORDERED(pnl[r], pgno)) - pnl[w--] = pnl[r--]; +/*----------------------------------------------------------------------------*/ - for (pgno_t fill = MDBX_PNL_ASCENDING ? pgno + n : pgno; w > r; --w) - pnl[w] = MDBX_PNL_ASCENDING ? --fill : fill++; +MDBX_NOTHROW_PURE_FUNCTION static inline size_t +leaf_size(const MDBX_env *env, const MDBX_val *key, const MDBX_val *data) { + size_t node_bytes = node_size(key, data); + if (node_bytes > env->leaf_nodemax) + /* put on large/overflow page */ + node_bytes = node_size_len(key->iov_len, 0) + sizeof(pgno_t); - return MDBX_SUCCESS; + return node_bytes + sizeof(indx_t); } -__hot static bool pnl_check(const pgno_t *pl, const size_t limit) { - assert(limit >= MIN_PAGENO - MDBX_ENABLE_REFUND); - if (likely(MDBX_PNL_GETSIZE(pl))) { - if (unlikely(MDBX_PNL_GETSIZE(pl) > MDBX_PGL_LIMIT)) - return false; - if (unlikely(MDBX_PNL_LEAST(pl) < MIN_PAGENO)) - return false; - if (unlikely(MDBX_PNL_MOST(pl) >= limit)) - return false; +MDBX_NOTHROW_PURE_FUNCTION static inline size_t +branch_size(const MDBX_env *env, const MDBX_val *key) { + /* Size of a node in a branch page with a given key. + * This is just the node header plus the key, there is no data. 
*/ + size_t node_bytes = node_size(key, nullptr); + if (unlikely(node_bytes > env->branch_nodemax)) { + /* put on large/overflow page, not implemented */ + mdbx_panic("node_size(key) %zu > %u branch_nodemax", node_bytes, + env->branch_nodemax); + node_bytes = node_size(key, nullptr) + sizeof(pgno_t); + } - if ((!MDBX_DISABLE_VALIDATION || AUDIT_ENABLED()) && - likely(MDBX_PNL_GETSIZE(pl) > 1)) { - const pgno_t *scan = MDBX_PNL_BEGIN(pl); - const pgno_t *const end = MDBX_PNL_END(pl); - pgno_t prev = *scan++; - do { - if (unlikely(!MDBX_PNL_ORDERED(prev, *scan))) - return false; - prev = *scan; - } while (likely(++scan != end)); - } + return node_bytes + sizeof(indx_t); +} + +MDBX_NOTHROW_CONST_FUNCTION static inline uint16_t +flags_db2sub(uint16_t db_flags) { + uint16_t sub_flags = db_flags & MDBX_DUPFIXED; + + /* MDBX_INTEGERDUP => MDBX_INTEGERKEY */ +#define SHIFT_INTEGERDUP_TO_INTEGERKEY 2 + STATIC_ASSERT((MDBX_INTEGERDUP >> SHIFT_INTEGERDUP_TO_INTEGERKEY) == + MDBX_INTEGERKEY); + sub_flags |= (db_flags & MDBX_INTEGERDUP) >> SHIFT_INTEGERDUP_TO_INTEGERKEY; + + /* MDBX_REVERSEDUP => MDBX_REVERSEKEY */ +#define SHIFT_REVERSEDUP_TO_REVERSEKEY 5 + STATIC_ASSERT((MDBX_REVERSEDUP >> SHIFT_REVERSEDUP_TO_REVERSEKEY) == + MDBX_REVERSEKEY); + sub_flags |= (db_flags & MDBX_REVERSEDUP) >> SHIFT_REVERSEDUP_TO_REVERSEKEY; + + return sub_flags; +} + +static inline bool check_sdb_flags(unsigned flags) { + switch (flags & ~(MDBX_REVERSEKEY | MDBX_INTEGERKEY)) { + default: + NOTICE("invalid db-flags 0x%x", flags); + return false; + case MDBX_DUPSORT: + case MDBX_DUPSORT | MDBX_REVERSEDUP: + case MDBX_DUPSORT | MDBX_DUPFIXED: + case MDBX_DUPSORT | MDBX_DUPFIXED | MDBX_REVERSEDUP: + case MDBX_DUPSORT | MDBX_DUPFIXED | MDBX_INTEGERDUP: + case MDBX_DUPSORT | MDBX_DUPFIXED | MDBX_INTEGERDUP | MDBX_REVERSEDUP: + case MDBX_DB_DEFAULTS: + return (flags & (MDBX_REVERSEKEY | MDBX_INTEGERKEY)) != + (MDBX_REVERSEKEY | MDBX_INTEGERKEY); } - return true; } -static __always_inline bool 
pnl_check_allocated(const pgno_t *pl, - const size_t limit) { - return pl == nullptr || (MDBX_PNL_ALLOCLEN(pl) >= MDBX_PNL_GETSIZE(pl) && - pnl_check(pl, limit)); +/*----------------------------------------------------------------------------*/ + +MDBX_NOTHROW_PURE_FUNCTION static inline size_t pgno2bytes(const MDBX_env *env, + size_t pgno) { + eASSERT(env, (1u << env->ps2ln) == env->ps); + return ((size_t)pgno) << env->ps2ln; } -static __always_inline void -pnl_merge_inner(pgno_t *__restrict dst, const pgno_t *__restrict src_a, - const pgno_t *__restrict src_b, - const pgno_t *__restrict const src_b_detent) { - do { -#if MDBX_HAVE_CMOV - const bool flag = MDBX_PNL_ORDERED(*src_b, *src_a); -#if defined(__LCC__) || __CLANG_PREREQ(13, 0) - // lcc 1.26: 13ШК (подготовка и первая итерация) + 7ШК (цикл), БЕЗ loop-mode - // gcc>=7: cmp+jmp с возвратом в тело цикла (WTF?) - // gcc<=6: cmov×3 - // clang<=12: cmov×3 - // clang>=13: cmov, set+add/sub - *dst = flag ? *src_a-- : *src_b--; -#else - // gcc: cmov, cmp+set+add/sub - // clang<=5: cmov×2, set+add/sub - // clang>=6: cmov, set+add/sub - *dst = flag ? *src_a : *src_b; - src_b += (ptrdiff_t)flag - 1; - src_a -= flag; -#endif - --dst; -#else /* MDBX_HAVE_CMOV */ - while (MDBX_PNL_ORDERED(*src_b, *src_a)) - *dst-- = *src_a--; - *dst-- = *src_b--; -#endif /* !MDBX_HAVE_CMOV */ - } while (likely(src_b > src_b_detent)); +MDBX_NOTHROW_PURE_FUNCTION static inline page_t *pgno2page(const MDBX_env *env, + size_t pgno) { + return ptr_disp(env->dxb_mmap.base, pgno2bytes(env, pgno)); } -/* Merge a PNL onto a PNL. 
The destination PNL must be big enough */ -__hot static size_t pnl_merge(MDBX_PNL dst, const MDBX_PNL src) { - assert(pnl_check_allocated(dst, MAX_PAGENO + 1)); - assert(pnl_check(src, MAX_PAGENO + 1)); - const size_t src_len = MDBX_PNL_GETSIZE(src); - const size_t dst_len = MDBX_PNL_GETSIZE(dst); - size_t total = dst_len; - assert(MDBX_PNL_ALLOCLEN(dst) >= total); - if (likely(src_len > 0)) { - total += src_len; - if (!MDBX_DEBUG && total < (MDBX_HAVE_CMOV ? 21 : 12)) - goto avoid_call_libc_for_short_cases; - if (dst_len == 0 || - MDBX_PNL_ORDERED(MDBX_PNL_LAST(dst), MDBX_PNL_FIRST(src))) - memcpy(MDBX_PNL_END(dst), MDBX_PNL_BEGIN(src), src_len * sizeof(pgno_t)); - else if (MDBX_PNL_ORDERED(MDBX_PNL_LAST(src), MDBX_PNL_FIRST(dst))) { - memmove(MDBX_PNL_BEGIN(dst) + src_len, MDBX_PNL_BEGIN(dst), - dst_len * sizeof(pgno_t)); - memcpy(MDBX_PNL_BEGIN(dst), MDBX_PNL_BEGIN(src), - src_len * sizeof(pgno_t)); - } else { - avoid_call_libc_for_short_cases: - dst[0] = /* the detent */ (MDBX_PNL_ASCENDING ? 0 : P_INVALID); - pnl_merge_inner(dst + total, dst + dst_len, src + src_len, src); - } - MDBX_PNL_SETSIZE(dst, total); - } - assert(pnl_check_allocated(dst, MAX_PAGENO + 1)); - return total; +MDBX_NOTHROW_PURE_FUNCTION static inline pgno_t bytes2pgno(const MDBX_env *env, + size_t bytes) { + eASSERT(env, (env->ps >> env->ps2ln) == 1); + return (pgno_t)(bytes >> env->ps2ln); } -static void spill_remove(MDBX_txn *txn, size_t idx, size_t npages) { - tASSERT(txn, idx > 0 && idx <= MDBX_PNL_GETSIZE(txn->tw.spilled.list) && - txn->tw.spilled.least_removed > 0); - txn->tw.spilled.least_removed = (idx < txn->tw.spilled.least_removed) - ? 
idx - : txn->tw.spilled.least_removed; - txn->tw.spilled.list[idx] |= 1; - MDBX_PNL_SETSIZE(txn->tw.spilled.list, - MDBX_PNL_GETSIZE(txn->tw.spilled.list) - - (idx == MDBX_PNL_GETSIZE(txn->tw.spilled.list))); +MDBX_NOTHROW_PURE_FUNCTION MDBX_INTERNAL size_t +bytes_align2os_bytes(const MDBX_env *env, size_t bytes); - while (unlikely(npages > 1)) { - const pgno_t pgno = (txn->tw.spilled.list[idx] >> 1) + 1; - if (MDBX_PNL_ASCENDING) { - if (++idx > MDBX_PNL_GETSIZE(txn->tw.spilled.list) || - (txn->tw.spilled.list[idx] >> 1) != pgno) - return; - } else { - if (--idx < 1 || (txn->tw.spilled.list[idx] >> 1) != pgno) - return; - txn->tw.spilled.least_removed = (idx < txn->tw.spilled.least_removed) - ? idx - : txn->tw.spilled.least_removed; - } - txn->tw.spilled.list[idx] |= 1; - MDBX_PNL_SETSIZE(txn->tw.spilled.list, - MDBX_PNL_GETSIZE(txn->tw.spilled.list) - - (idx == MDBX_PNL_GETSIZE(txn->tw.spilled.list))); - --npages; - } +MDBX_NOTHROW_PURE_FUNCTION MDBX_INTERNAL size_t +pgno_align2os_bytes(const MDBX_env *env, size_t pgno); + +MDBX_NOTHROW_PURE_FUNCTION MDBX_INTERNAL pgno_t +pgno_align2os_pgno(const MDBX_env *env, size_t pgno); + +MDBX_NOTHROW_PURE_FUNCTION static inline pgno_t +largechunk_npages(const MDBX_env *env, size_t bytes) { + return bytes2pgno(env, PAGEHDRSZ - 1 + bytes) + 1; } -static MDBX_PNL spill_purge(MDBX_txn *txn) { - tASSERT(txn, txn->tw.spilled.least_removed > 0); - const MDBX_PNL sl = txn->tw.spilled.list; - if (txn->tw.spilled.least_removed != INT_MAX) { - size_t len = MDBX_PNL_GETSIZE(sl), r, w; - for (w = r = txn->tw.spilled.least_removed; r <= len; ++r) { - sl[w] = sl[r]; - w += 1 - (sl[r] & 1); - } - for (size_t i = 1; i < w; ++i) - tASSERT(txn, (sl[i] & 1) == 0); - MDBX_PNL_SETSIZE(sl, w - 1); - txn->tw.spilled.least_removed = INT_MAX; - } else { - for (size_t i = 1; i <= MDBX_PNL_GETSIZE(sl); ++i) - tASSERT(txn, (sl[i] & 1) == 0); - } - return sl; +MDBX_NOTHROW_PURE_FUNCTION static inline MDBX_val get_key(const node_t *node) { + MDBX_val 
key; + key.iov_len = node_ks(node); + key.iov_base = node_key(node); + return key; } -#if MDBX_PNL_ASCENDING -#define MDBX_PNL_EXTRACT_KEY(ptr) (*(ptr)) -#else -#define MDBX_PNL_EXTRACT_KEY(ptr) (P_INVALID - *(ptr)) -#endif -RADIXSORT_IMPL(pgno, pgno_t, MDBX_PNL_EXTRACT_KEY, - MDBX_PNL_PREALLOC_FOR_RADIXSORT, 0) +static inline void get_key_optional(const node_t *node, + MDBX_val *keyptr /* __may_null */) { + if (keyptr) + *keyptr = get_key(node); +} -SORT_IMPL(pgno_sort, false, pgno_t, MDBX_PNL_ORDERED) +MDBX_NOTHROW_PURE_FUNCTION static inline void *page_data(const page_t *mp) { + return ptr_disp(mp, PAGEHDRSZ); +} -__hot __noinline static void pnl_sort_nochk(MDBX_PNL pnl) { - if (likely(MDBX_PNL_GETSIZE(pnl) < MDBX_RADIXSORT_THRESHOLD) || - unlikely(!pgno_radixsort(&MDBX_PNL_FIRST(pnl), MDBX_PNL_GETSIZE(pnl)))) - pgno_sort(MDBX_PNL_BEGIN(pnl), MDBX_PNL_END(pnl)); +MDBX_NOTHROW_PURE_FUNCTION static inline const page_t * +data_page(const void *data) { + return container_of(data, page_t, entries); } -static __inline void pnl_sort(MDBX_PNL pnl, size_t limit4check) { - pnl_sort_nochk(pnl); - assert(pnl_check(pnl, limit4check)); - (void)limit4check; +MDBX_NOTHROW_PURE_FUNCTION static inline meta_t *page_meta(page_t *mp) { + return (meta_t *)page_data(mp); } -/* Search for an pgno in an PNL. - * Returns The index of the first item greater than or equal to pgno. 
*/ -SEARCH_IMPL(pgno_bsearch, pgno_t, pgno_t, MDBX_PNL_ORDERED) +MDBX_NOTHROW_PURE_FUNCTION static inline size_t page_numkeys(const page_t *mp) { + return mp->lower >> 1; +} -__hot __noinline static size_t pnl_search_nochk(const MDBX_PNL pnl, - pgno_t pgno) { - const pgno_t *begin = MDBX_PNL_BEGIN(pnl); - const pgno_t *it = pgno_bsearch(begin, MDBX_PNL_GETSIZE(pnl), pgno); - const pgno_t *end = begin + MDBX_PNL_GETSIZE(pnl); - assert(it >= begin && it <= end); - if (it != begin) - assert(MDBX_PNL_ORDERED(it[-1], pgno)); - if (it != end) - assert(!MDBX_PNL_ORDERED(it[0], pgno)); - return it - begin + 1; +MDBX_NOTHROW_PURE_FUNCTION static inline size_t page_room(const page_t *mp) { + return mp->upper - mp->lower; } -static __inline size_t pnl_search(const MDBX_PNL pnl, pgno_t pgno, - size_t limit) { - assert(pnl_check_allocated(pnl, limit)); - if (MDBX_HAVE_CMOV) { - /* cmov-ускоренный бинарный поиск может читать (но не использовать) один - * элемент за концом данных, этот элемент в пределах выделенного участка - * памяти, но не инициализирован. */ - VALGRIND_MAKE_MEM_DEFINED(MDBX_PNL_END(pnl), sizeof(pgno_t)); - } - assert(pgno < limit); - (void)limit; - size_t n = pnl_search_nochk(pnl, pgno); - if (MDBX_HAVE_CMOV) { - VALGRIND_MAKE_MEM_UNDEFINED(MDBX_PNL_END(pnl), sizeof(pgno_t)); - } - return n; +MDBX_NOTHROW_PURE_FUNCTION static inline size_t +page_space(const MDBX_env *env) { + STATIC_ASSERT(PAGEHDRSZ % 2 == 0); + return env->ps - PAGEHDRSZ; } -static __inline size_t search_spilled(const MDBX_txn *txn, pgno_t pgno) { - tASSERT(txn, (txn->mt_flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC); - const MDBX_PNL pnl = txn->tw.spilled.list; - if (likely(!pnl)) - return 0; - pgno <<= 1; - size_t n = pnl_search(pnl, pgno, (size_t)MAX_PAGENO + MAX_PAGENO + 1); - return (n <= MDBX_PNL_GETSIZE(pnl) && pnl[n] == pgno) ? 
n : 0; +MDBX_NOTHROW_PURE_FUNCTION static inline size_t page_used(const MDBX_env *env, + const page_t *mp) { + return page_space(env) - page_room(mp); } -static __inline bool intersect_spilled(const MDBX_txn *txn, pgno_t pgno, - size_t npages) { - const MDBX_PNL pnl = txn->tw.spilled.list; - if (likely(!pnl)) - return false; - const size_t len = MDBX_PNL_GETSIZE(pnl); - if (LOG_ENABLED(MDBX_LOG_EXTRA)) { - DEBUG_EXTRA("PNL len %zu [", len); - for (size_t i = 1; i <= len; ++i) - DEBUG_EXTRA_PRINT(" %li", (pnl[i] & 1) ? -(long)(pnl[i] >> 1) - : (long)(pnl[i] >> 1)); - DEBUG_EXTRA_PRINT("%s\n", "]"); - } - const pgno_t spilled_range_begin = pgno << 1; - const pgno_t spilled_range_last = ((pgno + (pgno_t)npages) << 1) - 1; -#if MDBX_PNL_ASCENDING - const size_t n = - pnl_search(pnl, spilled_range_begin, (size_t)(MAX_PAGENO + 1) << 1); - assert(n && - (n == MDBX_PNL_GETSIZE(pnl) + 1 || spilled_range_begin <= pnl[n])); - const bool rc = n <= MDBX_PNL_GETSIZE(pnl) && pnl[n] <= spilled_range_last; -#else - const size_t n = - pnl_search(pnl, spilled_range_last, (size_t)MAX_PAGENO + MAX_PAGENO + 1); - assert(n && (n == MDBX_PNL_GETSIZE(pnl) + 1 || spilled_range_last >= pnl[n])); - const bool rc = n <= MDBX_PNL_GETSIZE(pnl) && pnl[n] >= spilled_range_begin; -#endif - if (ASSERT_ENABLED()) { - bool check = false; - for (size_t i = 0; i < npages; ++i) - check |= search_spilled(txn, (pgno_t)(pgno + i)) != 0; - assert(check == rc); - } - return rc; +/* The percentage of space used in the page, in a percents. 
*/ +MDBX_MAYBE_UNUSED MDBX_NOTHROW_PURE_FUNCTION static inline unsigned +page_fill_percentum_x10(const MDBX_env *env, const page_t *mp) { + const size_t space = page_space(env); + return (unsigned)((page_used(env, mp) * 1000 + space / 2) / space); } -/*----------------------------------------------------------------------------*/ - -static __always_inline size_t txl_size2bytes(const size_t size) { - assert(size > 0 && size <= MDBX_TXL_MAX * 2); - size_t bytes = - ceil_powerof2(MDBX_ASSUME_MALLOC_OVERHEAD + sizeof(txnid_t) * (size + 2), - MDBX_TXL_GRANULATE * sizeof(txnid_t)) - - MDBX_ASSUME_MALLOC_OVERHEAD; - return bytes; +MDBX_NOTHROW_PURE_FUNCTION static inline node_t *page_node(const page_t *mp, + size_t i) { + assert(page_type_compat(mp) == P_LEAF || page_type(mp) == P_BRANCH); + assert(page_numkeys(mp) > i); + assert(mp->entries[i] % 2 == 0); + return ptr_disp(mp, mp->entries[i] + PAGEHDRSZ); } -static __always_inline size_t txl_bytes2size(const size_t bytes) { - size_t size = bytes / sizeof(txnid_t); - assert(size > 2 && size <= MDBX_TXL_MAX * 2); - return size - 2; +MDBX_NOTHROW_PURE_FUNCTION static inline void * +page_dupfix_ptr(const page_t *mp, size_t i, size_t keysize) { + assert(page_type_compat(mp) == (P_LEAF | P_DUPFIX) && i == (indx_t)i && + mp->dupfix_ksize == keysize); + (void)keysize; + return ptr_disp(mp, PAGEHDRSZ + mp->dupfix_ksize * (indx_t)i); } -static MDBX_TXL txl_alloc(void) { - size_t bytes = txl_size2bytes(MDBX_TXL_INITIAL); - MDBX_TXL tl = osal_malloc(bytes); - if (likely(tl)) { -#if __GLIBC_PREREQ(2, 12) || defined(__FreeBSD__) || defined(malloc_usable_size) - bytes = malloc_usable_size(tl); -#endif /* malloc_usable_size */ - tl[0] = txl_bytes2size(bytes); - assert(tl[0] >= MDBX_TXL_INITIAL); - tl += 1; - *tl = 0; - } - return tl; +MDBX_NOTHROW_PURE_FUNCTION static inline MDBX_val +page_dupfix_key(const page_t *mp, size_t i, size_t keysize) { + MDBX_val r; + r.iov_base = page_dupfix_ptr(mp, i, keysize); + r.iov_len = mp->dupfix_ksize; 
+ return r; } -static void txl_free(MDBX_TXL tl) { - if (likely(tl)) - osal_free(tl - 1); -} +/*----------------------------------------------------------------------------*/ -static int txl_reserve(MDBX_TXL __restrict *__restrict ptl, - const size_t wanna) { - const size_t allocated = (size_t)MDBX_PNL_ALLOCLEN(*ptl); - assert(MDBX_PNL_GETSIZE(*ptl) <= MDBX_TXL_MAX && - MDBX_PNL_ALLOCLEN(*ptl) >= MDBX_PNL_GETSIZE(*ptl)); - if (likely(allocated >= wanna)) - return MDBX_SUCCESS; +MDBX_NOTHROW_PURE_FUNCTION MDBX_INTERNAL int +cmp_int_unaligned(const MDBX_val *a, const MDBX_val *b); - if (unlikely(wanna > /* paranoia */ MDBX_TXL_MAX)) { - ERROR("TXL too long (%zu > %zu)", wanna, (size_t)MDBX_TXL_MAX); - return MDBX_TXN_FULL; - } +#if MDBX_UNALIGNED_OK < 2 || \ + (MDBX_DEBUG || MDBX_FORCE_ASSERTIONS || !defined(NDEBUG)) +MDBX_NOTHROW_PURE_FUNCTION MDBX_INTERNAL int +/* Compare two items pointing at 2-byte aligned unsigned int's. */ +cmp_int_align2(const MDBX_val *a, const MDBX_val *b); +#else +#define cmp_int_align2 cmp_int_unaligned +#endif /* !MDBX_UNALIGNED_OK || debug */ - const size_t size = (wanna + wanna - allocated < MDBX_TXL_MAX) - ? wanna + wanna - allocated - : MDBX_TXL_MAX; - size_t bytes = txl_size2bytes(size); - MDBX_TXL tl = osal_realloc(*ptl - 1, bytes); - if (likely(tl)) { -#if __GLIBC_PREREQ(2, 12) || defined(__FreeBSD__) || defined(malloc_usable_size) - bytes = malloc_usable_size(tl); -#endif /* malloc_usable_size */ - *tl = txl_bytes2size(bytes); - assert(*tl >= wanna); - *ptl = tl + 1; - return MDBX_SUCCESS; - } - return MDBX_ENOMEM; -} +#if MDBX_UNALIGNED_OK < 4 || \ + (MDBX_DEBUG || MDBX_FORCE_ASSERTIONS || !defined(NDEBUG)) +MDBX_NOTHROW_PURE_FUNCTION MDBX_INTERNAL int +/* Compare two items pointing at 4-byte aligned unsigned int's. 
*/ +cmp_int_align4(const MDBX_val *a, const MDBX_val *b); +#else +#define cmp_int_align4 cmp_int_unaligned +#endif /* !MDBX_UNALIGNED_OK || debug */ -static __always_inline int __must_check_result -txl_need(MDBX_TXL __restrict *__restrict ptl, size_t num) { - assert(MDBX_PNL_GETSIZE(*ptl) <= MDBX_TXL_MAX && - MDBX_PNL_ALLOCLEN(*ptl) >= MDBX_PNL_GETSIZE(*ptl)); - assert(num <= MDBX_PGL_LIMIT); - const size_t wanna = (size_t)MDBX_PNL_GETSIZE(*ptl) + num; - return likely(MDBX_PNL_ALLOCLEN(*ptl) >= wanna) ? MDBX_SUCCESS - : txl_reserve(ptl, wanna); -} +/* Compare two items lexically */ +MDBX_NOTHROW_PURE_FUNCTION MDBX_INTERNAL int cmp_lexical(const MDBX_val *a, + const MDBX_val *b); -static __always_inline void txl_xappend(MDBX_TXL __restrict tl, txnid_t id) { - assert(MDBX_PNL_GETSIZE(tl) < MDBX_PNL_ALLOCLEN(tl)); - tl[0] += 1; - MDBX_PNL_LAST(tl) = id; -} +/* Compare two items in reverse byte order */ +MDBX_NOTHROW_PURE_FUNCTION MDBX_INTERNAL int cmp_reverse(const MDBX_val *a, + const MDBX_val *b); -#define TXNID_SORT_CMP(first, last) ((first) > (last)) -SORT_IMPL(txnid_sort, false, txnid_t, TXNID_SORT_CMP) -static void txl_sort(MDBX_TXL tl) { - txnid_sort(MDBX_PNL_BEGIN(tl), MDBX_PNL_END(tl)); -} +/* Fast non-lexically comparator */ +MDBX_NOTHROW_PURE_FUNCTION MDBX_INTERNAL int cmp_lenfast(const MDBX_val *a, + const MDBX_val *b); -static int __must_check_result txl_append(MDBX_TXL __restrict *ptl, - txnid_t id) { - if (unlikely(MDBX_PNL_GETSIZE(*ptl) == MDBX_PNL_ALLOCLEN(*ptl))) { - int rc = txl_need(ptl, MDBX_TXL_GRANULATE); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; - } - txl_xappend(*ptl, id); - return MDBX_SUCCESS; +MDBX_NOTHROW_PURE_FUNCTION MDBX_INTERNAL bool +eq_fast_slowpath(const uint8_t *a, const uint8_t *b, size_t l); + +MDBX_NOTHROW_PURE_FUNCTION static inline bool eq_fast(const MDBX_val *a, + const MDBX_val *b) { + return unlikely(a->iov_len == b->iov_len) && + eq_fast_slowpath(a->iov_base, b->iov_base, a->iov_len); } 
-/*----------------------------------------------------------------------------*/ +MDBX_NOTHROW_PURE_FUNCTION MDBX_INTERNAL int +cmp_equal_or_greater(const MDBX_val *a, const MDBX_val *b); -#define MDBX_DPL_GAP_MERGESORT 16 -#define MDBX_DPL_GAP_EDGING 2 -#define MDBX_DPL_RESERVE_GAP (MDBX_DPL_GAP_MERGESORT + MDBX_DPL_GAP_EDGING) +MDBX_NOTHROW_PURE_FUNCTION MDBX_INTERNAL int +cmp_equal_or_wrong(const MDBX_val *a, const MDBX_val *b); -static __always_inline size_t dpl_size2bytes(ptrdiff_t size) { - assert(size > CURSOR_STACK && (size_t)size <= MDBX_PGL_LIMIT); -#if MDBX_DPL_PREALLOC_FOR_RADIXSORT - size += size; -#endif /* MDBX_DPL_PREALLOC_FOR_RADIXSORT */ - STATIC_ASSERT(MDBX_ASSUME_MALLOC_OVERHEAD + sizeof(MDBX_dpl) + - (MDBX_PGL_LIMIT * (MDBX_DPL_PREALLOC_FOR_RADIXSORT + 1) + - MDBX_DPL_RESERVE_GAP) * - sizeof(MDBX_dp) + - MDBX_PNL_GRANULATE * sizeof(void *) * 2 < - SIZE_MAX / 4 * 3); - size_t bytes = - ceil_powerof2(MDBX_ASSUME_MALLOC_OVERHEAD + sizeof(MDBX_dpl) + - ((size_t)size + MDBX_DPL_RESERVE_GAP) * sizeof(MDBX_dp), - MDBX_PNL_GRANULATE * sizeof(void *) * 2) - - MDBX_ASSUME_MALLOC_OVERHEAD; - return bytes; +static inline MDBX_cmp_func *builtin_keycmp(MDBX_db_flags_t flags) { + return (flags & MDBX_REVERSEKEY) ? cmp_reverse + : (flags & MDBX_INTEGERKEY) ? cmp_int_align2 + : cmp_lexical; } -static __always_inline size_t dpl_bytes2size(const ptrdiff_t bytes) { - size_t size = (bytes - sizeof(MDBX_dpl)) / sizeof(MDBX_dp); - assert(size > CURSOR_STACK + MDBX_DPL_RESERVE_GAP && - size <= MDBX_PGL_LIMIT + MDBX_PNL_GRANULATE); - size -= MDBX_DPL_RESERVE_GAP; -#if MDBX_DPL_PREALLOC_FOR_RADIXSORT - size >>= 1; -#endif /* MDBX_DPL_PREALLOC_FOR_RADIXSORT */ - return size; +static inline MDBX_cmp_func *builtin_datacmp(MDBX_db_flags_t flags) { + return !(flags & MDBX_DUPSORT) + ? cmp_lenfast + : ((flags & MDBX_INTEGERDUP) + ? cmp_int_unaligned + : ((flags & MDBX_REVERSEDUP) ? 
cmp_reverse : cmp_lexical)); } -static __always_inline size_t dpl_setlen(MDBX_dpl *dl, size_t len) { - static const MDBX_page dpl_stub_pageE = {INVALID_TXNID, - 0, - P_BAD, - {0}, - /* pgno */ ~(pgno_t)0}; - assert(dpl_stub_pageE.mp_flags == P_BAD && - dpl_stub_pageE.mp_pgno == P_INVALID); - dl->length = len; - dl->items[len + 1].ptr = (MDBX_page *)&dpl_stub_pageE; - dl->items[len + 1].pgno = P_INVALID; - dl->items[len + 1].npages = 1; - return len; -} +/*----------------------------------------------------------------------------*/ -static __always_inline void dpl_clear(MDBX_dpl *dl) { - static const MDBX_page dpl_stub_pageB = {INVALID_TXNID, - 0, - P_BAD, - {0}, - /* pgno */ 0}; - assert(dpl_stub_pageB.mp_flags == P_BAD && dpl_stub_pageB.mp_pgno == 0); - dl->sorted = dpl_setlen(dl, 0); - dl->pages_including_loose = 0; - dl->items[0].ptr = (MDBX_page *)&dpl_stub_pageB; - dl->items[0].pgno = 0; - dl->items[0].npages = 1; - assert(dl->items[0].pgno == 0 && dl->items[dl->length + 1].pgno == P_INVALID); +MDBX_INTERNAL uint32_t combine_durability_flags(const uint32_t a, + const uint32_t b); + +MDBX_CONST_FUNCTION static inline lck_t *lckless_stub(const MDBX_env *env) { + uintptr_t stub = (uintptr_t)&env->lckless_placeholder; + /* align to avoid false-positive alarm from UndefinedBehaviorSanitizer */ + stub = (stub + MDBX_CACHELINE_SIZE - 1) & ~(MDBX_CACHELINE_SIZE - 1); + return (lck_t *)stub; } -static void dpl_free(MDBX_txn *txn) { - if (likely(txn->tw.dirtylist)) { - osal_free(txn->tw.dirtylist); - txn->tw.dirtylist = NULL; - } +#if !(defined(_WIN32) || defined(_WIN64)) +MDBX_MAYBE_UNUSED static inline int ignore_enosys(int err) { +#ifdef ENOSYS + if (err == ENOSYS) + return MDBX_RESULT_TRUE; +#endif /* ENOSYS */ +#ifdef ENOIMPL + if (err == ENOIMPL) + return MDBX_RESULT_TRUE; +#endif /* ENOIMPL */ +#ifdef ENOTSUP + if (err == ENOTSUP) + return MDBX_RESULT_TRUE; +#endif /* ENOTSUP */ +#ifdef ENOSUPP + if (err == ENOSUPP) + return MDBX_RESULT_TRUE; +#endif /* ENOSUPP 
*/ +#ifdef EOPNOTSUPP + if (err == EOPNOTSUPP) + return MDBX_RESULT_TRUE; +#endif /* EOPNOTSUPP */ + if (err == EAGAIN) + return MDBX_RESULT_TRUE; + return err; } +#endif /* defined(_WIN32) || defined(_WIN64) */ -static MDBX_dpl *dpl_reserve(MDBX_txn *txn, size_t size) { - tASSERT(txn, (txn->mt_flags & MDBX_TXN_RDONLY) == 0); - tASSERT(txn, (txn->mt_flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC); +static inline int check_env(const MDBX_env *env, const bool wanna_active) { + if (unlikely(!env)) + return MDBX_EINVAL; - size_t bytes = - dpl_size2bytes((size < MDBX_PGL_LIMIT) ? size : MDBX_PGL_LIMIT); - MDBX_dpl *const dl = osal_realloc(txn->tw.dirtylist, bytes); - if (likely(dl)) { -#if __GLIBC_PREREQ(2, 12) || defined(__FreeBSD__) || defined(malloc_usable_size) - bytes = malloc_usable_size(dl); -#endif /* malloc_usable_size */ - dl->detent = dpl_bytes2size(bytes); - tASSERT(txn, txn->tw.dirtylist == NULL || dl->length <= dl->detent); - txn->tw.dirtylist = dl; - } - return dl; -} + if (unlikely(env->signature.weak != env_signature)) + return MDBX_EBADSIGN; -static int dpl_alloc(MDBX_txn *txn) { - tASSERT(txn, (txn->mt_flags & MDBX_TXN_RDONLY) == 0); - tASSERT(txn, (txn->mt_flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC); + if (unlikely(env->flags & ENV_FATAL_ERROR)) + return MDBX_PANIC; - const size_t wanna = (txn->mt_env->me_options.dp_initial < txn->mt_geo.upper) - ? 
txn->mt_env->me_options.dp_initial - : txn->mt_geo.upper; -#if MDBX_FORCE_ASSERTIONS || MDBX_DEBUG - if (txn->tw.dirtylist) - /* обнуляем чтобы не сработал ассерт внутри dpl_reserve() */ - txn->tw.dirtylist->sorted = txn->tw.dirtylist->length = 0; -#endif /* asertions enabled */ - if (unlikely(!txn->tw.dirtylist || txn->tw.dirtylist->detent < wanna || - txn->tw.dirtylist->detent > wanna + wanna) && - unlikely(!dpl_reserve(txn, wanna))) - return MDBX_ENOMEM; + if (wanna_active) { +#if MDBX_ENV_CHECKPID + if (unlikely(env->pid != osal_getpid()) && env->pid) { + ((MDBX_env *)env)->flags |= ENV_FATAL_ERROR; + return MDBX_PANIC; + } +#endif /* MDBX_ENV_CHECKPID */ + if (unlikely((env->flags & ENV_ACTIVE) == 0)) + return MDBX_EPERM; + eASSERT(env, env->dxb_mmap.base != nullptr); + } - dpl_clear(txn->tw.dirtylist); return MDBX_SUCCESS; } -#define MDBX_DPL_EXTRACT_KEY(ptr) ((ptr)->pgno) -RADIXSORT_IMPL(dpl, MDBX_dp, MDBX_DPL_EXTRACT_KEY, - MDBX_DPL_PREALLOC_FOR_RADIXSORT, 1) +static inline int check_txn(const MDBX_txn *txn, int bad_bits) { + if (unlikely(!txn)) + return MDBX_EINVAL; -#define DP_SORT_CMP(first, last) ((first).pgno < (last).pgno) -SORT_IMPL(dp_sort, false, MDBX_dp, DP_SORT_CMP) + if (unlikely(txn->signature != txn_signature)) + return MDBX_EBADSIGN; -__hot __noinline static MDBX_dpl *dpl_sort_slowpath(const MDBX_txn *txn) { - tASSERT(txn, (txn->mt_flags & MDBX_TXN_RDONLY) == 0); - tASSERT(txn, (txn->mt_flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC); + if (unlikely(txn->flags & bad_bits)) + return MDBX_BAD_TXN; - MDBX_dpl *dl = txn->tw.dirtylist; - assert(dl->items[0].pgno == 0 && dl->items[dl->length + 1].pgno == P_INVALID); - const size_t unsorted = dl->length - dl->sorted; - if (likely(unsorted < MDBX_RADIXSORT_THRESHOLD) || - unlikely(!dpl_radixsort(dl->items + 1, dl->length))) { - if (dl->sorted > unsorted / 4 + 4 && - (MDBX_DPL_PREALLOC_FOR_RADIXSORT || - dl->length + unsorted < dl->detent + MDBX_DPL_GAP_MERGESORT)) { - MDBX_dp *const sorted_begin = 
dl->items + 1; - MDBX_dp *const sorted_end = sorted_begin + dl->sorted; - MDBX_dp *const end = - dl->items + (MDBX_DPL_PREALLOC_FOR_RADIXSORT - ? dl->length + dl->length + 1 - : dl->detent + MDBX_DPL_RESERVE_GAP); - MDBX_dp *const tmp = end - unsorted; - assert(dl->items + dl->length + 1 < tmp); - /* copy unsorted to the end of allocated space and sort it */ - memcpy(tmp, sorted_end, unsorted * sizeof(MDBX_dp)); - dp_sort(tmp, tmp + unsorted); - /* merge two parts from end to begin */ - MDBX_dp *__restrict w = dl->items + dl->length; - MDBX_dp *__restrict l = dl->items + dl->sorted; - MDBX_dp *__restrict r = end - 1; - do { - const bool cmp = expect_with_probability(l->pgno > r->pgno, 0, .5); -#if defined(__LCC__) || __CLANG_PREREQ(13, 0) || !MDBX_HAVE_CMOV - *w = cmp ? *l-- : *r--; -#else - *w = cmp ? *l : *r; - l -= cmp; - r += (ptrdiff_t)cmp - 1; -#endif - } while (likely(--w > l)); - assert(r == tmp - 1); - assert(dl->items[0].pgno == 0 && - dl->items[dl->length + 1].pgno == P_INVALID); - if (ASSERT_ENABLED()) - for (size_t i = 0; i <= dl->length; ++i) - assert(dl->items[i].pgno < dl->items[i + 1].pgno); - } else { - dp_sort(dl->items + 1, dl->items + dl->length + 1); - assert(dl->items[0].pgno == 0 && - dl->items[dl->length + 1].pgno == P_INVALID); - } - } else { - assert(dl->items[0].pgno == 0 && - dl->items[dl->length + 1].pgno == P_INVALID); - } - dl->sorted = dl->length; - return dl; + tASSERT(txn, (txn->flags & MDBX_TXN_FINISHED) || + (txn->flags & MDBX_NOSTICKYTHREADS) == + (txn->env->flags & MDBX_NOSTICKYTHREADS)); +#if MDBX_TXN_CHECKOWNER + STATIC_ASSERT((long)MDBX_NOSTICKYTHREADS > (long)MDBX_TXN_FINISHED); + if ((txn->flags & (MDBX_NOSTICKYTHREADS | MDBX_TXN_FINISHED)) < + MDBX_TXN_FINISHED && + unlikely(txn->owner != osal_thread_self())) + return txn->owner ? 
MDBX_THREAD_MISMATCH : MDBX_BAD_TXN; +#endif /* MDBX_TXN_CHECKOWNER */ + + if (bad_bits && unlikely(!txn->env->dxb_mmap.base)) + return MDBX_EPERM; + + return MDBX_SUCCESS; } -static __always_inline MDBX_dpl *dpl_sort(const MDBX_txn *txn) { - tASSERT(txn, (txn->mt_flags & MDBX_TXN_RDONLY) == 0); - tASSERT(txn, (txn->mt_flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC); +static inline int check_txn_rw(const MDBX_txn *txn, int bad_bits) { + int err = check_txn(txn, bad_bits); + if (unlikely(err)) + return err; - MDBX_dpl *dl = txn->tw.dirtylist; - assert(dl->length <= MDBX_PGL_LIMIT); - assert(dl->sorted <= dl->length); - assert(dl->items[0].pgno == 0 && dl->items[dl->length + 1].pgno == P_INVALID); - return likely(dl->sorted == dl->length) ? dl : dpl_sort_slowpath(txn); + if (unlikely(txn->flags & MDBX_TXN_RDONLY)) + return MDBX_EACCESS; + + return MDBX_SUCCESS; } -/* Returns the index of the first dirty-page whose pgno - * member is greater than or equal to id. */ -#define DP_SEARCH_CMP(dp, id) ((dp).pgno < (id)) -SEARCH_IMPL(dp_bsearch, MDBX_dp, pgno_t, DP_SEARCH_CMP) +/*----------------------------------------------------------------------------*/ -__hot __noinline static size_t dpl_search(const MDBX_txn *txn, pgno_t pgno) { - tASSERT(txn, (txn->mt_flags & MDBX_TXN_RDONLY) == 0); - tASSERT(txn, (txn->mt_flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC); +MDBX_INTERNAL void mincore_clean_cache(const MDBX_env *const env); - MDBX_dpl *dl = txn->tw.dirtylist; - assert(dl->items[0].pgno == 0 && dl->items[dl->length + 1].pgno == P_INVALID); - if (AUDIT_ENABLED()) { - for (const MDBX_dp *ptr = dl->items + dl->sorted; --ptr > dl->items;) { - assert(ptr[0].pgno < ptr[1].pgno); - assert(ptr[0].pgno >= NUM_METAS); - } - } +MDBX_INTERNAL void update_mlcnt(const MDBX_env *env, + const pgno_t new_aligned_mlocked_pgno, + const bool lock_not_release); - switch (dl->length - dl->sorted) { - default: - /* sort a whole */ - dpl_sort_slowpath(txn); - break; - case 0: - /* whole sorted 
cases */ - break; +MDBX_INTERNAL void munlock_after(const MDBX_env *env, const pgno_t aligned_pgno, + const size_t end_bytes); -#define LINEAR_SEARCH_CASE(N) \ - case N: \ - if (dl->items[dl->length - N + 1].pgno == pgno) \ - return dl->length - N + 1; \ - __fallthrough +MDBX_INTERNAL void munlock_all(const MDBX_env *env); - /* use linear scan until the threshold */ - LINEAR_SEARCH_CASE(7); /* fall through */ - LINEAR_SEARCH_CASE(6); /* fall through */ - LINEAR_SEARCH_CASE(5); /* fall through */ - LINEAR_SEARCH_CASE(4); /* fall through */ - LINEAR_SEARCH_CASE(3); /* fall through */ - LINEAR_SEARCH_CASE(2); /* fall through */ - case 1: - if (dl->items[dl->length].pgno == pgno) - return dl->length; - /* continue bsearch on the sorted part */ - break; - } - return dp_bsearch(dl->items + 1, dl->sorted, pgno) - dl->items; -} +/*----------------------------------------------------------------------------*/ +/* Cache coherence and mmap invalidation */ +#ifndef MDBX_CPU_WRITEBACK_INCOHERENT +#error "The MDBX_CPU_WRITEBACK_INCOHERENT must be defined before" +#elif MDBX_CPU_WRITEBACK_INCOHERENT +#define osal_flush_incoherent_cpu_writeback() osal_memory_barrier() +#else +#define osal_flush_incoherent_cpu_writeback() osal_compiler_barrier() +#endif /* MDBX_CPU_WRITEBACK_INCOHERENT */ -MDBX_NOTHROW_PURE_FUNCTION static __inline unsigned -dpl_npages(const MDBX_dpl *dl, size_t i) { - assert(0 <= (intptr_t)i && i <= dl->length); - unsigned n = dl->items[i].npages; - assert(n == (IS_OVERFLOW(dl->items[i].ptr) ? 
dl->items[i].ptr->mp_pages : 1)); - return n; -} +MDBX_MAYBE_UNUSED static inline void +osal_flush_incoherent_mmap(const void *addr, size_t nbytes, + const intptr_t pagesize) { +#ifndef MDBX_MMAP_INCOHERENT_FILE_WRITE +#error "The MDBX_MMAP_INCOHERENT_FILE_WRITE must be defined before" +#elif MDBX_MMAP_INCOHERENT_FILE_WRITE + char *const begin = (char *)(-pagesize & (intptr_t)addr); + char *const end = + (char *)(-pagesize & (intptr_t)((char *)addr + nbytes + pagesize - 1)); + int err = msync(begin, end - begin, MS_SYNC | MS_INVALIDATE) ? errno : 0; + eASSERT(nullptr, err == 0); + (void)err; +#else + (void)pagesize; +#endif /* MDBX_MMAP_INCOHERENT_FILE_WRITE */ -MDBX_NOTHROW_PURE_FUNCTION static __inline pgno_t -dpl_endpgno(const MDBX_dpl *dl, size_t i) { - return dpl_npages(dl, i) + dl->items[i].pgno; +#ifndef MDBX_MMAP_INCOHERENT_CPU_CACHE +#error "The MDBX_MMAP_INCOHERENT_CPU_CACHE must be defined before" +#elif MDBX_MMAP_INCOHERENT_CPU_CACHE +#ifdef DCACHE + /* MIPS has cache coherency issues. + * Note: for any nbytes >= on-chip cache size, entire is flushed. 
*/ + cacheflush((void *)addr, nbytes, DCACHE); +#else +#error "Oops, cacheflush() not available" +#endif /* DCACHE */ +#endif /* MDBX_MMAP_INCOHERENT_CPU_CACHE */ + +#if !MDBX_MMAP_INCOHERENT_FILE_WRITE && !MDBX_MMAP_INCOHERENT_CPU_CACHE + (void)addr; + (void)nbytes; +#endif } -static __inline bool dpl_intersect(const MDBX_txn *txn, pgno_t pgno, - size_t npages) { - tASSERT(txn, (txn->mt_flags & MDBX_TXN_RDONLY) == 0); - tASSERT(txn, (txn->mt_flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC); - MDBX_dpl *dl = txn->tw.dirtylist; - assert(dl->sorted == dl->length); - assert(dl->items[0].pgno == 0 && dl->items[dl->length + 1].pgno == P_INVALID); - size_t const n = dpl_search(txn, pgno); - assert(n >= 1 && n <= dl->length + 1); - assert(pgno <= dl->items[n].pgno); - assert(pgno > dl->items[n - 1].pgno); - const bool rc = - /* intersection with founded */ pgno + npages > dl->items[n].pgno || - /* intersection with prev */ dpl_endpgno(dl, n - 1) > pgno; - if (ASSERT_ENABLED()) { - bool check = false; - for (size_t i = 1; i <= dl->length; ++i) { - const MDBX_page *const dp = dl->items[i].ptr; - if (!(dp->mp_pgno /* begin */ >= /* end */ pgno + npages || - dpl_endpgno(dl, i) /* end */ <= /* begin */ pgno)) - check |= true; - } - assert(check == rc); - } - return rc; -} -MDBX_NOTHROW_PURE_FUNCTION static __always_inline size_t -dpl_exist(const MDBX_txn *txn, pgno_t pgno) { - tASSERT(txn, (txn->mt_flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC); - MDBX_dpl *dl = txn->tw.dirtylist; - size_t i = dpl_search(txn, pgno); - assert((int)i > 0); - return (dl->items[i].pgno == pgno) ? 
i : 0; -} -MDBX_MAYBE_UNUSED static const MDBX_page *debug_dpl_find(const MDBX_txn *txn, - const pgno_t pgno) { - tASSERT(txn, (txn->mt_flags & MDBX_TXN_RDONLY) == 0); - const MDBX_dpl *dl = txn->tw.dirtylist; - if (dl) { - tASSERT(txn, (txn->mt_flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC); - assert(dl->items[0].pgno == 0 && - dl->items[dl->length + 1].pgno == P_INVALID); - for (size_t i = dl->length; i > dl->sorted; --i) - if (dl->items[i].pgno == pgno) - return dl->items[i].ptr; +/* Состояние курсора. + * + * 1. Неустановлен/poor: + * - следует пропускать во всех циклах отслеживания/корректировки + * позиций курсоров; + * - нельзя использовать для относительных перемещений; + * - нельзя использовать для CRUD; + * - допускаются только операции предполагающие установку абсолютной позиции; + * - в остальных случаях возвращается ENODATA. + * + * У таких курсоров top = -1 и flags < 0, что позволяет дешево проверять и + * пропускать такие курсоры в циклах отслеживания/корректировки по условию + * probe_cursor->top < this_cursor->top. + * + * 2. Скользкий/slippy: + * - частично инициализированный курсор, но без заполненного стека; + * - в отличии от неустановленного/poor допускаются только операции связанные + * с заполнением стека; + * - при любой проблеме или попытке неуместного использования курсор + * сбрасывается в состояние неустановленного/poor; + * - становится установленным/ready только при успешном завершении операций + * позиционирования или поиска. + * + * У таких курсоров top >= 0, но flags < 0. + * + * 3. Установленный/pointed: + * - допускаются операции относительного позиционирования; + * - может иметь флажки z_after_delete, z_hollow и z_pending_oef, + * при наличии которых пользователю возвращается NOTFOUND при попытке + * получить текущие данные, либо продолжить перемещение в недоступную + * сторону. + * + * 4. Наполненный данными/filled: + * - это установленный/pointed курсор без флагов z_after_delete, + * z_hollow и z_pending_oef. 
+ * - за курсором есть даные, возможны CRUD операции в текущей позиции. + * + * У таких курсоров top >= 0 и (unsigned)flags < z_hollow. + * + * 5. Изменения состояния. + * + * - Сбрасывается состояние курсора посредством top_and_flags |= z_poor_mark, + * что равносильно top = -1 вместе с flags |= z_poor_mark + * - При заполнении стека курсора сначала устанавливается top, а flags + * только в самом конце при отсутстви ошибок. + * - При выходе за конец набора данных и/или отсутствии соседних/sibling + * страниц взводится флажок z_hollow, чего (в текущем понимании) достаточно + * для контроля/обработки всех ситуаций. При этом наличие флажка z_hollow + * не означает что ki[top] >= page_numkeys(pg[top]), так как после + * позиционирования курсора за последний элемент (и установке z_hollow) может + * быть произведена append-вставка в эту позицию через другой курсор. */ +enum cursor_state { + /* Это вложенный курсор для вложенного дерева/страницы и является + inner-элементом struct cursor_couple. */ + z_inner = 0x01, + + /* Происходит подготовка к обновлению GC, + поэтому можно брать страницы из GC даже для FREE_DBI. */ + z_gcu_preparation = 0x02, + + /* Курсор только-что создан, поэтому допускается авто-установка + в начало/конец, вместо возврата ошибки */ + z_fresh = 0x04, + + /* Предыдущей операцией было удаление, поэтому курсор уже указывает + на следующий элемент и соответствующая операция перемещения должна + игнорироваться. */ + z_after_delete = 0x08, + + z_disable_tree_search_fastpath = 0x10, + + /* Курсор логически стоит на конце данных, + * но физически на последней строке и ki[top] == page_numkeys(pg[top]) - 1. */ + z_eof = 0x20, + + /* За курсором нет данных, нельзя делать CRUD операции в текущей позиции. + Как правило, это означает что курсор стоит за последней строкой данных + и ki[top] == page_numkeys(pg[top]). */ + z_hollow = 0x40, + + /* Маски для сброса/установки состояния. 
*/ + z_clear_mask = z_inner | z_gcu_preparation, + z_poor_mark = -128 | z_eof | z_hollow | z_disable_tree_search_fastpath, + z_fresh_mark = z_poor_mark | z_fresh +}; - if (dl->sorted) { - const size_t i = dp_bsearch(dl->items + 1, dl->sorted, pgno) - dl->items; - if (dl->items[i].pgno == pgno) - return dl->items[i].ptr; - } - } else { - tASSERT(txn, (txn->mt_flags & MDBX_WRITEMAP) != 0 && !MDBX_AVOID_MSYNC); - } - return nullptr; +MDBX_MAYBE_UNUSED MDBX_NOTHROW_PURE_FUNCTION static inline bool +is_inner(const MDBX_cursor *mc) { + return (mc->flags & z_inner) != 0; } -static void dpl_remove_ex(const MDBX_txn *txn, size_t i, size_t npages) { - tASSERT(txn, (txn->mt_flags & MDBX_TXN_RDONLY) == 0); - tASSERT(txn, (txn->mt_flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC); +MDBX_MAYBE_UNUSED MDBX_NOTHROW_PURE_FUNCTION static inline bool +is_poor(const MDBX_cursor *mc) { + const bool r = mc->top < 0; + cASSERT(mc, r == (mc->top_and_flags < 0)); + if (r && mc->subcur) + cASSERT(mc, mc->subcur->cursor.flags < 0 && mc->subcur->cursor.top < 0); + return r; +} - MDBX_dpl *dl = txn->tw.dirtylist; - assert((intptr_t)i > 0 && i <= dl->length); - assert(dl->items[0].pgno == 0 && dl->items[dl->length + 1].pgno == P_INVALID); - dl->pages_including_loose -= npages; - dl->sorted -= dl->sorted >= i; - dl->length -= 1; - memmove(dl->items + i, dl->items + i + 1, - (dl->length - i + 2) * sizeof(dl->items[0])); - assert(dl->items[0].pgno == 0 && dl->items[dl->length + 1].pgno == P_INVALID); +MDBX_MAYBE_UNUSED MDBX_NOTHROW_PURE_FUNCTION static inline bool +is_pointed(const MDBX_cursor *mc) { + const bool r = mc->top >= 0; + cASSERT(mc, r == (mc->top_and_flags >= 0)); + if (!r && mc->subcur) + cASSERT(mc, is_poor(&mc->subcur->cursor)); + return r; } -static void dpl_remove(const MDBX_txn *txn, size_t i) { - dpl_remove_ex(txn, i, dpl_npages(txn->tw.dirtylist, i)); +MDBX_MAYBE_UNUSED MDBX_NOTHROW_PURE_FUNCTION static inline bool +is_hollow(const MDBX_cursor *mc) { + const bool r = (z_hollow - 
1) < (uint8_t)mc->flags; + if (!r) + cASSERT(mc, + mc->top >= 0 && mc->ki[mc->top] < page_numkeys(mc->pg[mc->top])); + else if (mc->subcur) + cASSERT(mc, is_poor(&mc->subcur->cursor)); + return r; } -static __noinline void txn_lru_reduce(MDBX_txn *txn) { - NOTICE("lru-reduce %u -> %u", txn->tw.dirtylru, txn->tw.dirtylru >> 1); - tASSERT(txn, (txn->mt_flags & (MDBX_TXN_RDONLY | MDBX_WRITEMAP)) == 0); - do { - txn->tw.dirtylru >>= 1; - MDBX_dpl *dl = txn->tw.dirtylist; - for (size_t i = 1; i <= dl->length; ++i) { - size_t *const ptr = - ptr_disp(dl->items[i].ptr, -(ptrdiff_t)sizeof(size_t)); - *ptr >>= 1; - } - txn = txn->mt_parent; - } while (txn); +MDBX_MAYBE_UNUSED MDBX_NOTHROW_PURE_FUNCTION static inline bool +is_eof(const MDBX_cursor *mc) { + const bool r = (z_eof - 1) < (uint8_t)mc->flags; + return r; } -MDBX_NOTHROW_PURE_FUNCTION static __inline uint32_t dpl_age(const MDBX_txn *txn, - size_t i) { - tASSERT(txn, (txn->mt_flags & (MDBX_TXN_RDONLY | MDBX_WRITEMAP)) == 0); - const MDBX_dpl *dl = txn->tw.dirtylist; - assert((intptr_t)i > 0 && i <= dl->length); - size_t *const ptr = ptr_disp(dl->items[i].ptr, -(ptrdiff_t)sizeof(size_t)); - return txn->tw.dirtylru - (uint32_t)*ptr; +MDBX_MAYBE_UNUSED MDBX_NOTHROW_PURE_FUNCTION static inline bool +is_filled(const MDBX_cursor *mc) { + return !is_hollow(mc); } -static __inline uint32_t txn_lru_turn(MDBX_txn *txn) { - txn->tw.dirtylru += 1; - if (unlikely(txn->tw.dirtylru > UINT32_MAX / 3) && - (txn->mt_flags & MDBX_WRITEMAP) == 0) - txn_lru_reduce(txn); - return txn->tw.dirtylru; +MDBX_MAYBE_UNUSED MDBX_NOTHROW_PURE_FUNCTION static inline bool +inner_filled(const MDBX_cursor *mc) { + return mc->subcur && is_filled(&mc->subcur->cursor); } -static __always_inline int __must_check_result dpl_append(MDBX_txn *txn, - pgno_t pgno, - MDBX_page *page, - size_t npages) { - tASSERT(txn, (txn->mt_flags & MDBX_TXN_RDONLY) == 0); - tASSERT(txn, (txn->mt_flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC); - const MDBX_dp dp = {page, 
pgno, (pgno_t)npages}; - if ((txn->mt_flags & MDBX_WRITEMAP) == 0) { - size_t *const ptr = ptr_disp(page, -(ptrdiff_t)sizeof(size_t)); - *ptr = txn->tw.dirtylru; - } +MDBX_MAYBE_UNUSED MDBX_NOTHROW_PURE_FUNCTION static inline bool +inner_pointed(const MDBX_cursor *mc) { + return mc->subcur && is_pointed(&mc->subcur->cursor); +} - MDBX_dpl *dl = txn->tw.dirtylist; - tASSERT(txn, dl->length <= MDBX_PGL_LIMIT + MDBX_PNL_GRANULATE); - tASSERT(txn, dl->items[0].pgno == 0 && - dl->items[dl->length + 1].pgno == P_INVALID); - if (AUDIT_ENABLED()) { - for (size_t i = dl->length; i > 0; --i) { - assert(dl->items[i].pgno != dp.pgno); - if (unlikely(dl->items[i].pgno == dp.pgno)) { - ERROR("Page %u already exist in the DPL at %zu", dp.pgno, i); - return MDBX_PROBLEM; - } - } +MDBX_MAYBE_UNUSED static inline void inner_gone(MDBX_cursor *mc) { + if (mc->subcur) { + TRACE("reset inner cursor %p", + __Wpedantic_format_voidptr(&mc->subcur->cursor)); + mc->subcur->nested_tree.root = 0; + mc->subcur->cursor.top_and_flags = z_inner | z_poor_mark; } +} - if (unlikely(dl->length == dl->detent)) { - if (unlikely(dl->detent >= MDBX_PGL_LIMIT)) { - ERROR("DPL is full (MDBX_PGL_LIMIT %zu)", MDBX_PGL_LIMIT); - return MDBX_TXN_FULL; - } - const size_t size = (dl->detent < MDBX_PNL_INITIAL * 42) - ? dl->detent + dl->detent - : dl->detent + dl->detent / 2; - dl = dpl_reserve(txn, size); - if (unlikely(!dl)) - return MDBX_ENOMEM; - tASSERT(txn, dl->length < dl->detent); +MDBX_MAYBE_UNUSED static inline void be_poor(MDBX_cursor *mc) { + const bool inner = is_inner(mc); + if (inner) { + mc->tree->root = 0; + mc->top_and_flags = z_inner | z_poor_mark; + } else { + mc->top_and_flags |= z_poor_mark; + inner_gone(mc); } + cASSERT(mc, is_poor(mc) && !is_pointed(mc) && !is_filled(mc)); + cASSERT(mc, inner == is_inner(mc)); +} - /* Сортировка нужна для быстрого поиска, используем несколько тактик: - * 1) Сохраняем упорядоченность при естественной вставке в нужном порядке. 
- * 2) Добавляем в не-сортированный хвост, который сортируем и сливаем - * с отсортированной головой по необходимости, а пока хвост короткий - * ищем в нём сканированием, избегая большой пересортировки. - * 3) Если не-сортированный хвост короткий, а добавляемый элемент близок - * к концу отсортированной головы, то выгоднее сразу вставить элемент - * в нужное место. - * - * Алгоритмически: - * - добавлять в не-сортированный хвост следует только если вставка сильно - * дорогая, т.е. если целевая позиция элемента сильно далека от конца; - * - для быстрой проверки достаточно сравнить добавляемый элемент с отстоящим - * от конца на максимально-приемлемое расстояние; - * - если список короче, либо элемент в этой позиции меньше вставляемого, - * то следует перемещать элементы и вставлять в отсортированную голову; - * - если не-сортированный хвост длиннее, либо элемент в этой позиции больше, - * то следует добавлять в не-сортированный хвост. */ +MDBX_MAYBE_UNUSED static inline void be_hollow(MDBX_cursor *mc) { + mc->flags |= z_eof | z_hollow; + inner_gone(mc); +} - dl->pages_including_loose += npages; - MDBX_dp *i = dl->items + dl->length; +MDBX_MAYBE_UNUSED static inline void be_filled(MDBX_cursor *mc) { + const bool inner = is_inner(mc); + mc->flags &= z_clear_mask; + cASSERT(mc, is_filled(mc)); + cASSERT(mc, inner == is_inner(mc)); +} -#define MDBX_DPL_INSERTION_THRESHOLD 42 - const ptrdiff_t pivot = (ptrdiff_t)dl->length - MDBX_DPL_INSERTION_THRESHOLD; -#if MDBX_HAVE_CMOV - const pgno_t pivot_pgno = - dl->items[(dl->length < MDBX_DPL_INSERTION_THRESHOLD) - ? 0 - : dl->length - MDBX_DPL_INSERTION_THRESHOLD] - .pgno; -#endif /* MDBX_HAVE_CMOV */ +MDBX_MAYBE_UNUSED static inline bool is_related(const MDBX_cursor *base, + const MDBX_cursor *scan) { + cASSERT(base, base->top >= 0); + return base->top <= scan->top && base != scan; +} - /* copy the stub beyond the end */ - i[2] = i[1]; - dl->length += 1; +/* Флаги контроля/проверки курсора. 
*/ +enum cursor_checking { + z_branch = 0x01 /* same as P_BRANCH for check_leaf_type() */, + z_leaf = 0x02 /* same as P_LEAF for check_leaf_type() */, + z_largepage = 0x04 /* same as P_LARGE for check_leaf_type() */, + z_updating = 0x08 /* update/rebalance pending */, + z_ignord = 0x10 /* don't check keys ordering */, + z_dupfix = 0x20 /* same as P_DUPFIX for check_leaf_type() */, + z_retiring = 0x40 /* refs to child pages may be invalid */, + z_pagecheck = 0x80 /* perform page checking, see MDBX_VALIDATION */ +}; - if (likely(pivot <= (ptrdiff_t)dl->sorted) && -#if MDBX_HAVE_CMOV - pivot_pgno < dp.pgno) { -#else - (pivot <= 0 || dl->items[pivot].pgno < dp.pgno)) { -#endif /* MDBX_HAVE_CMOV */ - dl->sorted += 1; +MDBX_INTERNAL int __must_check_result cursor_check(const MDBX_cursor *mc); - /* сдвигаем несортированный хвост */ - while (i >= dl->items + dl->sorted) { -#if !defined(__GNUC__) /* пытаемся избежать вызова memmove() */ - i[1] = *i; -#elif MDBX_WORDBITS == 64 && \ - (defined(__SIZEOF_INT128__) || \ - (defined(_INTEGRAL_MAX_BITS) && _INTEGRAL_MAX_BITS >= 128)) - STATIC_ASSERT(sizeof(MDBX_dp) == sizeof(__uint128_t)); - ((__uint128_t *)i)[1] = *(volatile __uint128_t *)i; -#else - i[1].ptr = i->ptr; - i[1].pgno = i->pgno; - i[1].npages = i->npages; -#endif - --i; - } - /* ищем нужную позицию сдвигая отсортированные элементы */ - while (i->pgno > pgno) { - tASSERT(txn, i > dl->items); - i[1] = *i; - --i; - } - tASSERT(txn, i->pgno < dp.pgno); - } +MDBX_MAYBE_UNUSED MDBX_NOTHROW_PURE_FUNCTION static inline size_t +cursor_dbi(const MDBX_cursor *mc) { + cASSERT(mc, mc->txn && mc->txn->signature == txn_signature); + size_t dbi = mc->dbi_state - mc->txn->dbi_state; + cASSERT(mc, dbi < mc->txn->env->n_dbi); + return dbi; +} - i[1] = dp; - assert(dl->items[0].pgno == 0 && dl->items[dl->length + 1].pgno == P_INVALID); - assert(dl->sorted <= dl->length); - return MDBX_SUCCESS; +MDBX_MAYBE_UNUSED MDBX_NOTHROW_PURE_FUNCTION static inline bool +cursor_dbi_changed(const 
MDBX_cursor *mc) { + return dbi_changed(mc->txn, cursor_dbi(mc)); } -/*----------------------------------------------------------------------------*/ +MDBX_MAYBE_UNUSED MDBX_NOTHROW_PURE_FUNCTION static inline uint8_t * +cursor_dbi_state(const MDBX_cursor *mc) { + return mc->dbi_state; +} -static __must_check_result __inline int page_retire(MDBX_cursor *mc, - MDBX_page *mp); +MDBX_MAYBE_UNUSED MDBX_NOTHROW_PURE_FUNCTION static inline bool +cursor_is_gc(const MDBX_cursor *mc) { + return mc->dbi_state == mc->txn->dbi_state + FREE_DBI; +} -static int __must_check_result page_dirty(MDBX_txn *txn, MDBX_page *mp, - size_t npages); -typedef struct page_result { - MDBX_page *page; - int err; -} pgr_t; +MDBX_MAYBE_UNUSED MDBX_NOTHROW_PURE_FUNCTION static inline bool +cursor_is_main(const MDBX_cursor *mc) { + return mc->dbi_state == mc->txn->dbi_state + MAIN_DBI; +} -static txnid_t kick_longlived_readers(MDBX_env *env, const txnid_t laggard); +MDBX_MAYBE_UNUSED MDBX_NOTHROW_PURE_FUNCTION static inline bool +cursor_is_core(const MDBX_cursor *mc) { + return mc->dbi_state < mc->txn->dbi_state + CORE_DBS; +} -static pgr_t page_new(MDBX_cursor *mc, const unsigned flags); -static pgr_t page_new_large(MDBX_cursor *mc, const size_t npages); -static int page_touch(MDBX_cursor *mc); -static int cursor_touch(MDBX_cursor *const mc, const MDBX_val *key, - const MDBX_val *data); +MDBX_MAYBE_UNUSED static inline int cursor_dbi_dbg(const MDBX_cursor *mc) { + /* Debugging output value of a cursor's DBI: Negative for a sub-cursor. */ + const int dbi = cursor_dbi(mc); + return (mc->flags & z_inner) ? 
-dbi : dbi; +} -#define TXN_END_NAMES \ - { \ - "committed", "empty-commit", "abort", "reset", "reset-tmp", "fail-begin", \ - "fail-beginchild" \ +MDBX_MAYBE_UNUSED static inline int __must_check_result +cursor_push(MDBX_cursor *mc, page_t *mp, indx_t ki) { + TRACE("pushing page %" PRIaPGNO " on db %d cursor %p", mp->pgno, + cursor_dbi_dbg(mc), __Wpedantic_format_voidptr(mc)); + if (unlikely(mc->top >= CURSOR_STACK_SIZE - 1)) { + be_poor(mc); + mc->txn->flags |= MDBX_TXN_ERROR; + return MDBX_CURSOR_FULL; } -enum { - /* txn_end operation number, for logging */ - TXN_END_COMMITTED, - TXN_END_PURE_COMMIT, - TXN_END_ABORT, - TXN_END_RESET, - TXN_END_RESET_TMP, - TXN_END_FAIL_BEGIN, - TXN_END_FAIL_BEGINCHILD -}; -#define TXN_END_OPMASK 0x0F /* mask for txn_end() operation number */ -#define TXN_END_UPDATE 0x10 /* update env state (DBIs) */ -#define TXN_END_FREE 0x20 /* free txn unless it is MDBX_env.me_txn0 */ -#define TXN_END_EOTDONE 0x40 /* txn's cursors already closed */ -#define TXN_END_SLOT 0x80 /* release any reader slot if NOSTICKYTHREADS */ -static int txn_end(MDBX_txn *txn, const unsigned mode); - -static __always_inline pgr_t page_get_inline(const uint16_t ILL, - const MDBX_cursor *const mc, - const pgno_t pgno, - const txnid_t front); - -static pgr_t page_get_any(const MDBX_cursor *const mc, const pgno_t pgno, - const txnid_t front) { - return page_get_inline(P_ILL_BITS, mc, pgno, front); + mc->top += 1; + mc->pg[mc->top] = mp; + mc->ki[mc->top] = ki; + return MDBX_SUCCESS; } -__hot static pgr_t page_get_three(const MDBX_cursor *const mc, - const pgno_t pgno, const txnid_t front) { - return page_get_inline(P_ILL_BITS | P_OVERFLOW, mc, pgno, front); +MDBX_MAYBE_UNUSED static inline void cursor_pop(MDBX_cursor *mc) { + TRACE("popped page %" PRIaPGNO " off db %d cursor %p", mc->pg[mc->top]->pgno, + cursor_dbi_dbg(mc), __Wpedantic_format_voidptr(mc)); + cASSERT(mc, mc->top >= 0); + mc->top -= 1; } -static pgr_t page_get_large(const MDBX_cursor *const mc, const 
pgno_t pgno, - const txnid_t front) { - return page_get_inline(P_ILL_BITS | P_BRANCH | P_LEAF | P_LEAF2, mc, pgno, - front); +MDBX_NOTHROW_PURE_FUNCTION static inline bool +check_leaf_type(const MDBX_cursor *mc, const page_t *mp) { + return (((page_type(mp) ^ mc->checking) & + (z_branch | z_leaf | z_largepage | z_dupfix)) == 0); } -static __always_inline int __must_check_result page_get(const MDBX_cursor *mc, - const pgno_t pgno, - MDBX_page **mp, - const txnid_t front) { - pgr_t ret = page_get_three(mc, pgno, front); - *mp = ret.page; - return ret.err; -} +MDBX_INTERNAL void cursor_eot(MDBX_cursor *mc, const bool merge); +MDBX_INTERNAL int cursor_shadow(MDBX_cursor *parent_cursor, + MDBX_txn *nested_txn, const size_t dbi); -static int __must_check_result page_search_root(MDBX_cursor *mc, - const MDBX_val *key, int flags); +MDBX_INTERNAL MDBX_cursor *cursor_cpstk(const MDBX_cursor *csrc, + MDBX_cursor *cdst); -#define MDBX_PS_MODIFY 1 -#define MDBX_PS_ROOTONLY 2 -#define MDBX_PS_FIRST 4 -#define MDBX_PS_LAST 8 -static int __must_check_result page_search(MDBX_cursor *mc, const MDBX_val *key, - int flags); -static int __must_check_result page_merge(MDBX_cursor *csrc, MDBX_cursor *cdst); +MDBX_INTERNAL int __must_check_result cursor_ops(MDBX_cursor *mc, MDBX_val *key, + MDBX_val *data, + const MDBX_cursor_op op); -#define MDBX_SPLIT_REPLACE MDBX_APPENDDUP /* newkey is not new */ -static int __must_check_result page_split(MDBX_cursor *mc, - const MDBX_val *const newkey, - MDBX_val *const newdata, - pgno_t newpgno, const unsigned naf); - -static int coherency_timeout(uint64_t *timestamp, intptr_t pgno, - const MDBX_env *env); -static int __must_check_result validate_meta_copy(MDBX_env *env, - const MDBX_meta *meta, - MDBX_meta *dest); -static int __must_check_result override_meta(MDBX_env *env, size_t target, - txnid_t txnid, - const MDBX_meta *shape); -static int __must_check_result read_header(MDBX_env *env, MDBX_meta *meta, - const int lck_exclusive, - const 
mdbx_mode_t mode_bits); -static int __must_check_result sync_locked(MDBX_env *env, unsigned flags, - MDBX_meta *const pending, - meta_troika_t *const troika); -static int env_close(MDBX_env *env, bool resurrect_after_fork); - -struct node_result { - MDBX_node *node; - bool exact; -}; +MDBX_INTERNAL int __must_check_result cursor_put_checklen(MDBX_cursor *mc, + const MDBX_val *key, + MDBX_val *data, + unsigned flags); -static struct node_result node_search(MDBX_cursor *mc, const MDBX_val *key); - -static int __must_check_result node_add_branch(MDBX_cursor *mc, size_t indx, - const MDBX_val *key, - pgno_t pgno); -static int __must_check_result node_add_leaf(MDBX_cursor *mc, size_t indx, - const MDBX_val *key, - MDBX_val *data, unsigned flags); -static int __must_check_result node_add_leaf2(MDBX_cursor *mc, size_t indx, - const MDBX_val *key); - -static void node_del(MDBX_cursor *mc, size_t ksize); -static MDBX_node *node_shrink(MDBX_page *mp, size_t indx, MDBX_node *node); -static int __must_check_result node_move(MDBX_cursor *csrc, MDBX_cursor *cdst, - bool fromleft); -static int __must_check_result node_read(MDBX_cursor *mc, const MDBX_node *leaf, - MDBX_val *data, const MDBX_page *mp); -static int __must_check_result rebalance(MDBX_cursor *mc); -static int __must_check_result update_key(MDBX_cursor *mc, const MDBX_val *key); - -static void cursor_pop(MDBX_cursor *mc); -static int __must_check_result cursor_push(MDBX_cursor *mc, MDBX_page *mp); - -static int __must_check_result audit_ex(MDBX_txn *txn, size_t retired_stored, - bool dont_filter_gc); - -static int __must_check_result page_check(const MDBX_cursor *const mc, - const MDBX_page *const mp); -static int __must_check_result cursor_check(const MDBX_cursor *mc); -static int __must_check_result cursor_get(MDBX_cursor *mc, MDBX_val *key, - MDBX_val *data, MDBX_cursor_op op); -static int __must_check_result cursor_put_checklen(MDBX_cursor *mc, - const MDBX_val *key, - MDBX_val *data, - unsigned flags); -static 
int __must_check_result cursor_put_nochecklen(MDBX_cursor *mc, - const MDBX_val *key, - MDBX_val *data, - unsigned flags); -static int __must_check_result cursor_check_updating(MDBX_cursor *mc); -static int __must_check_result cursor_del(MDBX_cursor *mc, - MDBX_put_flags_t flags); -static int __must_check_result delete(MDBX_txn *txn, MDBX_dbi dbi, - const MDBX_val *key, const MDBX_val *data, - unsigned flags); -#define SIBLING_LEFT 0 -#define SIBLING_RIGHT 2 -static int __must_check_result cursor_sibling(MDBX_cursor *mc, int dir); -static int __must_check_result cursor_next(MDBX_cursor *mc, MDBX_val *key, - MDBX_val *data, MDBX_cursor_op op); -static int __must_check_result cursor_prev(MDBX_cursor *mc, MDBX_val *key, - MDBX_val *data, MDBX_cursor_op op); -struct cursor_set_result { +MDBX_INTERNAL int __must_check_result cursor_put(MDBX_cursor *mc, + const MDBX_val *key, + MDBX_val *data, + unsigned flags); + +MDBX_INTERNAL int __must_check_result cursor_check_updating(MDBX_cursor *mc); + +MDBX_INTERNAL int __must_check_result cursor_del(MDBX_cursor *mc, + unsigned flags); + +MDBX_INTERNAL int __must_check_result cursor_sibling_left(MDBX_cursor *mc); +MDBX_INTERNAL int __must_check_result cursor_sibling_right(MDBX_cursor *mc); + +typedef struct cursor_set_result { int err; bool exact; -}; +} csr_t; + +MDBX_INTERNAL csr_t cursor_seek(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, + MDBX_cursor_op op); + +MDBX_INTERNAL int __must_check_result inner_first(MDBX_cursor *__restrict mc, + MDBX_val *__restrict data); +MDBX_INTERNAL int __must_check_result inner_last(MDBX_cursor *__restrict mc, + MDBX_val *__restrict data); +MDBX_INTERNAL int __must_check_result outer_first(MDBX_cursor *__restrict mc, + MDBX_val *__restrict key, + MDBX_val *__restrict data); +MDBX_INTERNAL int __must_check_result outer_last(MDBX_cursor *__restrict mc, + MDBX_val *__restrict key, + MDBX_val *__restrict data); + +MDBX_INTERNAL int __must_check_result inner_next(MDBX_cursor *__restrict mc, + 
MDBX_val *__restrict data); +MDBX_INTERNAL int __must_check_result inner_prev(MDBX_cursor *__restrict mc, + MDBX_val *__restrict data); +MDBX_INTERNAL int __must_check_result outer_next(MDBX_cursor *__restrict mc, + MDBX_val *__restrict key, + MDBX_val *__restrict data, + MDBX_cursor_op op); +MDBX_INTERNAL int __must_check_result outer_prev(MDBX_cursor *__restrict mc, + MDBX_val *__restrict key, + MDBX_val *__restrict data, + MDBX_cursor_op op); + +MDBX_INTERNAL int cursor_init4walk(cursor_couple_t *couple, + const MDBX_txn *const txn, + tree_t *const tree, kvx_t *const kvx); + +MDBX_INTERNAL int __must_check_result cursor_init(MDBX_cursor *mc, + const MDBX_txn *txn, + size_t dbi); + +MDBX_INTERNAL int __must_check_result cursor_dupsort_setup(MDBX_cursor *mc, + const node_t *node, + const page_t *mp); + +MDBX_INTERNAL int __must_check_result cursor_touch(MDBX_cursor *const mc, + const MDBX_val *key, + const MDBX_val *data); -static struct cursor_set_result cursor_set(MDBX_cursor *mc, MDBX_val *key, - MDBX_val *data, MDBX_cursor_op op); -static int __must_check_result cursor_first(MDBX_cursor *mc, MDBX_val *key, - MDBX_val *data); -static int __must_check_result cursor_last(MDBX_cursor *mc, MDBX_val *key, - MDBX_val *data); - -static int __must_check_result cursor_init(MDBX_cursor *mc, const MDBX_txn *txn, - size_t dbi); -static int __must_check_result cursor_xinit0(MDBX_cursor *mc); -static int __must_check_result cursor_xinit1(MDBX_cursor *mc, - const MDBX_node *node, - const MDBX_page *mp); -static int __must_check_result cursor_xinit2(MDBX_cursor *mc, - MDBX_xcursor *src_mx, - bool new_dupdata); -static void cursor_copy(const MDBX_cursor *csrc, MDBX_cursor *cdst); - -static int __must_check_result drop_tree(MDBX_cursor *mc, - const bool may_have_subDBs); -static int __must_check_result fetch_sdb(MDBX_txn *txn, size_t dbi); -static int __must_check_result setup_sdb(MDBX_dbx *const dbx, - const MDBX_db *const db, - const unsigned pagesize); - -static __inline 
MDBX_cmp_func *get_default_keycmp(MDBX_db_flags_t flags); -static __inline MDBX_cmp_func *get_default_datacmp(MDBX_db_flags_t flags); +/*----------------------------------------------------------------------------*/ -__cold const char *mdbx_liberr2str(int errnum) { - /* Table of descriptions for MDBX errors */ - static const char *const tbl[] = { - "MDBX_KEYEXIST: Key/data pair already exists", - "MDBX_NOTFOUND: No matching key/data pair found", - "MDBX_PAGE_NOTFOUND: Requested page not found", - "MDBX_CORRUPTED: Database is corrupted", - "MDBX_PANIC: Environment had fatal error", - "MDBX_VERSION_MISMATCH: DB version mismatch libmdbx", - "MDBX_INVALID: File is not an MDBX file", - "MDBX_MAP_FULL: Environment mapsize limit reached", - "MDBX_DBS_FULL: Too many DBI-handles (maxdbs reached)", - "MDBX_READERS_FULL: Too many readers (maxreaders reached)", - NULL /* MDBX_TLS_FULL (-30789): unused in MDBX */, - "MDBX_TXN_FULL: Transaction has too many dirty pages," - " i.e transaction is too big", - "MDBX_CURSOR_FULL: Cursor stack limit reachedn - this usually indicates" - " corruption, i.e branch-pages loop", - "MDBX_PAGE_FULL: Internal error - Page has no more space", - "MDBX_UNABLE_EXTEND_MAPSIZE: Database engine was unable to extend" - " mapping, e.g. since address space is unavailable or busy," - " or Operation system not supported such operations", - "MDBX_INCOMPATIBLE: Environment or database is not compatible" - " with the requested operation or the specified flags", - "MDBX_BAD_RSLOT: Invalid reuse of reader locktable slot," - " e.g. read-transaction already run for current thread", - "MDBX_BAD_TXN: Transaction is not valid for requested operation," - " e.g. 
had errored and be must aborted, has a child, or is invalid", - "MDBX_BAD_VALSIZE: Invalid size or alignment of key or data" - " for target database, either invalid subDB name", - "MDBX_BAD_DBI: The specified DBI-handle is invalid" - " or changed by another thread/transaction", - "MDBX_PROBLEM: Unexpected internal error, transaction should be aborted", - "MDBX_BUSY: Another write transaction is running," - " or environment is already used while opening with MDBX_EXCLUSIVE flag", - }; +/* Update sub-page pointer, if any, in mc->subcur. + * Needed when the node which contains the sub-page may have moved. + * Called with mp = mc->pg[mc->top], ki = mc->ki[mc->top]. */ +MDBX_MAYBE_UNUSED static inline void +cursor_inner_refresh(const MDBX_cursor *mc, const page_t *mp, unsigned ki) { + cASSERT(mc, is_leaf(mp)); + const node_t *node = page_node(mp, ki); + if ((node_flags(node) & (N_DUPDATA | N_SUBDATA)) == N_DUPDATA) + mc->subcur->cursor.pg[0] = node_data(node); +} - if (errnum >= MDBX_KEYEXIST && errnum <= MDBX_BUSY) { - int i = errnum - MDBX_KEYEXIST; - return tbl[i]; - } +MDBX_MAYBE_UNUSED MDBX_INTERNAL bool cursor_is_tracked(const MDBX_cursor *mc); - switch (errnum) { - case MDBX_SUCCESS: - return "MDBX_SUCCESS: Successful"; - case MDBX_EMULTIVAL: - return "MDBX_EMULTIVAL: The specified key has" - " more than one associated value"; - case MDBX_EBADSIGN: - return "MDBX_EBADSIGN: Wrong signature of a runtime object(s)," - " e.g. memory corruption or double-free"; - case MDBX_WANNA_RECOVERY: - return "MDBX_WANNA_RECOVERY: Database should be recovered," - " but this could NOT be done automatically for now" - " since it opened in read-only mode"; - case MDBX_EKEYMISMATCH: - return "MDBX_EKEYMISMATCH: The given key value is mismatched to the" - " current cursor position"; - case MDBX_TOO_LARGE: - return "MDBX_TOO_LARGE: Database is too large for current system," - " e.g. 
could NOT be mapped into RAM"; - case MDBX_THREAD_MISMATCH: - return "MDBX_THREAD_MISMATCH: A thread has attempted to use a not" - " owned object, e.g. a transaction that started by another thread"; - case MDBX_TXN_OVERLAPPING: - return "MDBX_TXN_OVERLAPPING: Overlapping read and write transactions for" - " the current thread"; - case MDBX_DUPLICATED_CLK: - return "MDBX_DUPLICATED_CLK: Alternative/Duplicate LCK-file is exists," - " please keep one and remove unused other"; - case MDBX_DANGLING_DBI: - return "MDBX_DANGLING_DBI: Some cursors and/or other resources should be" - " closed before subDb or corresponding DBI-handle could be (re)used"; - default: - return NULL; - } + + + +static inline size_t dpl_setlen(dpl_t *dl, size_t len) { + static const page_t dpl_stub_pageE = {INVALID_TXNID, + 0, + P_BAD, + {0}, + /* pgno */ ~(pgno_t)0}; + assert(dpl_stub_pageE.flags == P_BAD && dpl_stub_pageE.pgno == P_INVALID); + dl->length = len; + dl->items[len + 1].ptr = (page_t *)&dpl_stub_pageE; + dl->items[len + 1].pgno = P_INVALID; + dl->items[len + 1].npages = 1; + return len; } -__cold const char *mdbx_strerror_r(int errnum, char *buf, size_t buflen) { - const char *msg = mdbx_liberr2str(errnum); - if (!msg && buflen > 0 && buflen < INT_MAX) { -#if defined(_WIN32) || defined(_WIN64) - const DWORD size = FormatMessageA( - FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS, NULL, - errnum, MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), buf, (DWORD)buflen, - NULL); - return size ? 
buf : "FormatMessageA(FORMAT_MESSAGE_FROM_SYSTEM) failed"; -#elif defined(_GNU_SOURCE) && defined(__GLIBC__) - /* GNU-specific */ - if (errnum > 0) - msg = strerror_r(errnum, buf, buflen); -#elif (_POSIX_C_SOURCE >= 200112L || _XOPEN_SOURCE >= 600) - /* XSI-compliant */ - if (errnum > 0 && strerror_r(errnum, buf, buflen) == 0) - msg = buf; -#else - if (errnum > 0) { - msg = strerror(errnum); - if (msg) { - strncpy(buf, msg, buflen); - msg = buf; - } - } -#endif - if (!msg) { - (void)snprintf(buf, buflen, "error %d", errnum); - msg = buf; - } - buf[buflen - 1] = '\0'; - } - return msg; +static inline void dpl_clear(dpl_t *dl) { + static const page_t dpl_stub_pageB = {INVALID_TXNID, + 0, + P_BAD, + {0}, + /* pgno */ 0}; + assert(dpl_stub_pageB.flags == P_BAD && dpl_stub_pageB.pgno == 0); + dl->sorted = dpl_setlen(dl, 0); + dl->pages_including_loose = 0; + dl->items[0].ptr = (page_t *)&dpl_stub_pageB; + dl->items[0].pgno = 0; + dl->items[0].npages = 1; + assert(dl->items[0].pgno == 0 && dl->items[dl->length + 1].pgno == P_INVALID); } -__cold const char *mdbx_strerror(int errnum) { -#if defined(_WIN32) || defined(_WIN64) - static char buf[1024]; - return mdbx_strerror_r(errnum, buf, sizeof(buf)); -#else - const char *msg = mdbx_liberr2str(errnum); - if (!msg) { - if (errnum > 0) - msg = strerror(errnum); - if (!msg) { - static char buf[32]; - (void)snprintf(buf, sizeof(buf) - 1, "error %d", errnum); - msg = buf; - } - } - return msg; -#endif +MDBX_INTERNAL int __must_check_result dpl_alloc(MDBX_txn *txn); + +MDBX_INTERNAL void dpl_free(MDBX_txn *txn); + +MDBX_INTERNAL dpl_t *dpl_reserve(MDBX_txn *txn, size_t size); + +MDBX_INTERNAL __noinline dpl_t *dpl_sort_slowpath(const MDBX_txn *txn); + +static inline dpl_t *dpl_sort(const MDBX_txn *txn) { + tASSERT(txn, (txn->flags & MDBX_TXN_RDONLY) == 0); + tASSERT(txn, (txn->flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC); + + dpl_t *dl = txn->tw.dirtylist; + tASSERT(txn, dl->length <= PAGELIST_LIMIT); + tASSERT(txn, 
dl->sorted <= dl->length); + tASSERT(txn, dl->items[0].pgno == 0 && + dl->items[dl->length + 1].pgno == P_INVALID); + return likely(dl->sorted == dl->length) ? dl : dpl_sort_slowpath(txn); } -#if defined(_WIN32) || defined(_WIN64) /* Bit of madness for Windows */ -const char *mdbx_strerror_r_ANSI2OEM(int errnum, char *buf, size_t buflen) { - const char *msg = mdbx_liberr2str(errnum); - if (!msg && buflen > 0 && buflen < INT_MAX) { - const DWORD size = FormatMessageA( - FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS, NULL, - errnum, MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), buf, (DWORD)buflen, - NULL); - if (!size) - msg = "FormatMessageA(FORMAT_MESSAGE_FROM_SYSTEM) failed"; - else if (!CharToOemBuffA(buf, buf, size)) - msg = "CharToOemBuffA() failed"; - else - msg = buf; - } - return msg; +MDBX_INTERNAL __noinline size_t dpl_search(const MDBX_txn *txn, pgno_t pgno); + +MDBX_MAYBE_UNUSED MDBX_INTERNAL const page_t * +debug_dpl_find(const MDBX_txn *txn, const pgno_t pgno); + +MDBX_NOTHROW_PURE_FUNCTION static inline unsigned dpl_npages(const dpl_t *dl, + size_t i) { + assert(0 <= (intptr_t)i && i <= dl->length); + unsigned n = dl->items[i].npages; + assert(n == (is_largepage(dl->items[i].ptr) ? 
dl->items[i].ptr->pages : 1)); + return n; } -const char *mdbx_strerror_ANSI2OEM(int errnum) { - static char buf[1024]; - return mdbx_strerror_r_ANSI2OEM(errnum, buf, sizeof(buf)); +MDBX_NOTHROW_PURE_FUNCTION static inline pgno_t dpl_endpgno(const dpl_t *dl, + size_t i) { + return dpl_npages(dl, i) + dl->items[i].pgno; } -#endif /* Bit of madness for Windows */ -__cold void debug_log_va(int level, const char *function, int line, - const char *fmt, va_list args) { - ENSURE(nullptr, osal_fastmutex_acquire(&debug_lock) == 0); - if (mdbx_static.logger.ptr) { - if (mdbx_static.logger_buffer == nullptr) - mdbx_static.logger.fmt(level, function, line, fmt, args); - else { - const int len = vsnprintf(mdbx_static.logger_buffer, - mdbx_static.logger_buffer_size, fmt, args); - if (len > 0) - mdbx_static.logger.nofmt(level, function, line, - mdbx_static.logger_buffer, len); - } - } else { -#if defined(_WIN32) || defined(_WIN64) - if (IsDebuggerPresent()) { - int prefix_len = 0; - char *prefix = nullptr; - if (function && line > 0) - prefix_len = osal_asprintf(&prefix, "%s:%d ", function, line); - else if (function) - prefix_len = osal_asprintf(&prefix, "%s: ", function); - else if (line > 0) - prefix_len = osal_asprintf(&prefix, "%d: ", line); - if (prefix_len > 0 && prefix) { - OutputDebugStringA(prefix); - osal_free(prefix); - } - char *msg = nullptr; - int msg_len = osal_vasprintf(&msg, fmt, args); - if (msg_len > 0 && msg) { - OutputDebugStringA(msg); - osal_free(msg); - } +static inline bool dpl_intersect(const MDBX_txn *txn, pgno_t pgno, + size_t npages) { + tASSERT(txn, (txn->flags & MDBX_TXN_RDONLY) == 0); + tASSERT(txn, (txn->flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC); + + dpl_t *dl = txn->tw.dirtylist; + tASSERT(txn, dl->sorted == dl->length); + tASSERT(txn, dl->items[0].pgno == 0 && + dl->items[dl->length + 1].pgno == P_INVALID); + size_t const n = dpl_search(txn, pgno); + tASSERT(txn, n >= 1 && n <= dl->length + 1); + tASSERT(txn, pgno <= dl->items[n].pgno); 
+ tASSERT(txn, pgno > dl->items[n - 1].pgno); + const bool rc = + /* intersection with founded */ pgno + npages > dl->items[n].pgno || + /* intersection with prev */ dpl_endpgno(dl, n - 1) > pgno; + if (ASSERT_ENABLED()) { + bool check = false; + for (size_t i = 1; i <= dl->length; ++i) { + const page_t *const dp = dl->items[i].ptr; + if (!(dp->pgno /* begin */ >= /* end */ pgno + npages || + dpl_endpgno(dl, i) /* end */ <= /* begin */ pgno)) + check |= true; } -#else - if (function && line > 0) - fprintf(stderr, "%s:%d ", function, line); - else if (function) - fprintf(stderr, "%s: ", function); - else if (line > 0) - fprintf(stderr, "%d: ", line); - vfprintf(stderr, fmt, args); - fflush(stderr); -#endif + tASSERT(txn, check == rc); } - ENSURE(nullptr, osal_fastmutex_release(&debug_lock) == 0); + return rc; } -__cold void debug_log(int level, const char *function, int line, - const char *fmt, ...) { - va_list args; - va_start(args, fmt); - debug_log_va(level, function, line, fmt, args); - va_end(args); +MDBX_NOTHROW_PURE_FUNCTION static inline size_t dpl_exist(const MDBX_txn *txn, + pgno_t pgno) { + tASSERT(txn, (txn->flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC); + dpl_t *dl = txn->tw.dirtylist; + size_t i = dpl_search(txn, pgno); + tASSERT(txn, (int)i > 0); + return (dl->items[i].pgno == pgno) ? i : 0; } -/* Dump a key in ascii or hexadecimal. 
*/ -const char *mdbx_dump_val(const MDBX_val *key, char *const buf, - const size_t bufsize) { - if (!key) - return ""; - if (!key->iov_len) - return ""; - if (!buf || bufsize < 4) - return nullptr; +MDBX_INTERNAL void dpl_remove_ex(const MDBX_txn *txn, size_t i, size_t npages); - bool is_ascii = true; - const uint8_t *const data = key->iov_base; - for (size_t i = 0; i < key->iov_len; i++) - if (data[i] < ' ' || data[i] > '~') { - is_ascii = false; - break; - } +static inline void dpl_remove(const MDBX_txn *txn, size_t i) { + dpl_remove_ex(txn, i, dpl_npages(txn->tw.dirtylist, i)); +} - if (is_ascii) { - int len = - snprintf(buf, bufsize, "%.*s", - (key->iov_len > INT_MAX) ? INT_MAX : (int)key->iov_len, data); - assert(len > 0 && (size_t)len < bufsize); - (void)len; - } else { - char *const detent = buf + bufsize - 2; - char *ptr = buf; - *ptr++ = '<'; - for (size_t i = 0; i < key->iov_len && ptr < detent; i++) { - const char hex[16] = {'0', '1', '2', '3', '4', '5', '6', '7', - '8', '9', 'a', 'b', 'c', 'd', 'e', 'f'}; - *ptr++ = hex[data[i] >> 4]; - *ptr++ = hex[data[i] & 15]; - } - if (ptr < detent) - *ptr++ = '>'; - *ptr = '\0'; - } - return buf; +MDBX_INTERNAL int __must_check_result dpl_append(MDBX_txn *txn, pgno_t pgno, + page_t *page, size_t npages); + +MDBX_MAYBE_UNUSED MDBX_INTERNAL bool dpl_check(MDBX_txn *txn); + +MDBX_NOTHROW_PURE_FUNCTION static inline uint32_t dpl_age(const MDBX_txn *txn, + size_t i) { + tASSERT(txn, (txn->flags & (MDBX_TXN_RDONLY | MDBX_WRITEMAP)) == 0); + const dpl_t *dl = txn->tw.dirtylist; + assert((intptr_t)i > 0 && i <= dl->length); + size_t *const ptr = ptr_disp(dl->items[i].ptr, -(ptrdiff_t)sizeof(size_t)); + return txn->tw.dirtylru - (uint32_t)*ptr; } -/*------------------------------------------------------------------------------ - LY: debug stuff */ +MDBX_INTERNAL void dpl_lru_reduce(MDBX_txn *txn); -static const char *leafnode_type(MDBX_node *n) { - static const char *const tp[2][2] = {{"", ": DB"}, - {": sub-page", ": 
sub-DB"}}; - return (node_flags(n) & F_BIGDATA) - ? ": large page" - : tp[!!(node_flags(n) & F_DUPDATA)][!!(node_flags(n) & F_SUBDATA)]; +static inline uint32_t dpl_lru_turn(MDBX_txn *txn) { + txn->tw.dirtylru += 1; + if (unlikely(txn->tw.dirtylru > UINT32_MAX / 3) && + (txn->flags & MDBX_WRITEMAP) == 0) + dpl_lru_reduce(txn); + return txn->tw.dirtylru; } -/* Display all the keys in the page. */ -MDBX_MAYBE_UNUSED static void page_list(MDBX_page *mp) { - pgno_t pgno = mp->mp_pgno; - const char *type; - MDBX_node *node; - size_t i, nkeys, nsize, total = 0; - MDBX_val key; - DKBUF; +MDBX_INTERNAL void dpl_sift(MDBX_txn *const txn, pnl_t pl, const bool spilled); - switch (PAGETYPE_WHOLE(mp)) { - case P_BRANCH: - type = "Branch page"; - break; - case P_LEAF: - type = "Leaf page"; - break; - case P_LEAF | P_SUBP: - type = "Leaf sub-page"; - break; - case P_LEAF | P_LEAF2: - type = "Leaf2 page"; - break; - case P_LEAF | P_LEAF2 | P_SUBP: - type = "Leaf2 sub-page"; - break; - case P_OVERFLOW: - VERBOSE("Overflow page %" PRIaPGNO " pages %u\n", pgno, mp->mp_pages); - return; - case P_META: - VERBOSE("Meta-page %" PRIaPGNO " txnid %" PRIu64 "\n", pgno, - unaligned_peek_u64(4, page_meta(mp)->mm_txnid_a)); - return; - default: - VERBOSE("Bad page %" PRIaPGNO " flags 0x%X\n", pgno, mp->mp_flags); - return; - } +MDBX_INTERNAL void dpl_release_shadows(MDBX_txn *txn); - nkeys = page_numkeys(mp); - VERBOSE("%s %" PRIaPGNO " numkeys %zu\n", type, pgno, nkeys); - for (i = 0; i < nkeys; i++) { - if (IS_LEAF2(mp)) { /* LEAF2 pages have no mp_ptrs[] or node headers */ - key.iov_len = nsize = mp->mp_leaf2_ksize; - key.iov_base = page_leaf2key(mp, i, nsize); - total += nsize; - VERBOSE("key %zu: nsize %zu, %s\n", i, nsize, DKEY(&key)); - continue; - } - node = page_node(mp, i); - key.iov_len = node_ks(node); - key.iov_base = node->mn_data; - nsize = NODESIZE + key.iov_len; - if (IS_BRANCH(mp)) { - VERBOSE("key %zu: page %" PRIaPGNO ", %s\n", i, node_pgno(node), - DKEY(&key)); - total += 
nsize; - } else { - if (node_flags(node) & F_BIGDATA) - nsize += sizeof(pgno_t); - else - nsize += node_ds(node); - total += nsize; - nsize += sizeof(indx_t); - VERBOSE("key %zu: nsize %zu, %s%s\n", i, nsize, DKEY(&key), - leafnode_type(node)); - } - total = EVEN(total); - } - VERBOSE("Total: header %zu + contents %zu + unused %zu\n", - IS_LEAF2(mp) ? PAGEHDRSZ : PAGEHDRSZ + mp->mp_lower, total, - page_room(mp)); -} -/*----------------------------------------------------------------------------*/ -/* Check if there is an initialized xcursor, so XCURSOR_REFRESH() is proper */ -#define XCURSOR_INITED(mc) \ - ((mc)->mc_xcursor && ((mc)->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED)) +#ifndef MDBX_ENABLE_GC_EXPERIMENTAL +#define MDBX_ENABLE_GC_EXPERIMENTAL 0 +#elif !(MDBX_ENABLE_GC_EXPERIMENTAL == 0 || MDBX_ENABLE_GC_EXPERIMENTAL == 1) +#error MDBX_ENABLE_GC_EXPERIMENTAL must be defined as 0 or 1 +#endif /* MDBX_ENABLE_GC_EXPERIMENTAL */ -/* Update sub-page pointer, if any, in mc->mc_xcursor. - * Needed when the node which contains the sub-page may have moved. - * Called with mp = mc->mc_pg[mc->mc_top], ki = mc->mc_ki[mc->mc_top]. 
*/ -#define XCURSOR_REFRESH(mc, mp, ki) \ - do { \ - MDBX_page *xr_pg = (mp); \ - MDBX_node *xr_node = page_node(xr_pg, ki); \ - if ((node_flags(xr_node) & (F_DUPDATA | F_SUBDATA)) == F_DUPDATA) \ - (mc)->mc_xcursor->mx_cursor.mc_pg[0] = node_data(xr_node); \ - } while (0) +typedef struct gc_update_context { + unsigned loop; + pgno_t prev_first_unallocated; + bool dense; +#if MDBX_ENABLE_GC_EXPERIMENTAL + intptr_t reserve_adj; +#endif /* MDBX_ENABLE_GC_EXPERIMENTAL */ + size_t retired_stored; + size_t amount, reserved, cleaned_slot, reused_slot, fill_idx; + txnid_t cleaned_id, rid; +#if MDBX_ENABLE_BIGFOOT + txnid_t bigfoot; +#endif /* MDBX_ENABLE_BIGFOOT */ + union { + MDBX_cursor cursor; + cursor_couple_t couple; + }; +} gcu_t; -MDBX_MAYBE_UNUSED static bool cursor_is_tracked(const MDBX_cursor *mc) { - for (MDBX_cursor *scan = mc->mc_txn->mt_cursors[mc->mc_dbi]; scan; - scan = scan->mc_next) - if (mc == ((mc->mc_flags & C_SUB) ? &scan->mc_xcursor->mx_cursor : scan)) - return true; - return false; +static inline int gc_update_init(MDBX_txn *txn, gcu_t *ctx) { + memset(ctx, 0, offsetof(gcu_t, cursor)); + ctx->dense = txn->txnid < MIN_TXNID; +#if MDBX_ENABLE_BIGFOOT + ctx->bigfoot = txn->txnid; +#endif /* MDBX_ENABLE_BIGFOOT */ + return cursor_init(&ctx->cursor, txn, FREE_DBI); } -/* Perform act while tracking temporary cursor mn */ -#define WITH_CURSOR_TRACKING(mn, act) \ - do { \ - cASSERT(&(mn), \ - mn.mc_txn->mt_cursors != NULL /* must be not rdonly txt */); \ - cASSERT(&(mn), !cursor_is_tracked(&(mn))); \ - MDBX_cursor mc_dummy; \ - MDBX_cursor **tracking_head = &(mn).mc_txn->mt_cursors[mn.mc_dbi]; \ - MDBX_cursor *tracked = &(mn); \ - if ((mn).mc_flags & C_SUB) { \ - mc_dummy.mc_flags = C_INITIALIZED; \ - mc_dummy.mc_top = 0; \ - mc_dummy.mc_snum = 0; \ - mc_dummy.mc_xcursor = (MDBX_xcursor *)&(mn); \ - tracked = &mc_dummy; \ - } \ - tracked->mc_next = *tracking_head; \ - *tracking_head = tracked; \ - { act; } \ - *tracking_head = tracked->mc_next; \ - } while 
(0) +#define ALLOC_DEFAULT 0 +#define ALLOC_RESERVE 1 +#define ALLOC_UNIMPORTANT 2 +MDBX_INTERNAL pgr_t gc_alloc_ex(const MDBX_cursor *const mc, const size_t num, + uint8_t flags); -static int -env_defer_free_and_release(MDBX_env *const env, - struct mdbx_defer_free_item *const chain) { - size_t length = 0; - struct mdbx_defer_free_item *obsolete_chain = nullptr; -#if MDBX_ENABLE_DBI_LOCKFREE - const uint64_t now = osal_monotime(); - struct mdbx_defer_free_item **scan = &env->me_defer_free; - if (env->me_defer_free) { - const uint64_t threshold_1second = osal_16dot16_to_monotime(1 * 65536); - do { - struct mdbx_defer_free_item *item = *scan; - if (now - item->timestamp < threshold_1second) { - scan = &item->next; - length += 1; - } else { - *scan = item->next; - item->next = obsolete_chain; - obsolete_chain = item; - } - } while (*scan); - } +MDBX_INTERNAL pgr_t gc_alloc_single(const MDBX_cursor *const mc); +MDBX_INTERNAL int gc_update(MDBX_txn *txn, gcu_t *ctx); - eASSERT(env, *scan == nullptr); - if (chain) { - struct mdbx_defer_free_item *item = chain; - do { - item->timestamp = now; - item = item->next; - } while (item); - *scan = chain; - } -#else /* MDBX_ENABLE_DBI_LOCKFREE */ - obsolete_chain = chain; -#endif /* MDBX_ENABLE_DBI_LOCKFREE */ - ENSURE(env, osal_fastmutex_release(&env->me_dbi_lock) == MDBX_SUCCESS); - if (length > 42) { -#if defined(_WIN32) || defined(_WIN64) - SwitchToThread(); -#else - sched_yield(); -#endif /* Windows */ - } - while (obsolete_chain) { - struct mdbx_defer_free_item *item = obsolete_chain; - obsolete_chain = obsolete_chain->next; - osal_free(item); - } - return chain ? 
MDBX_SUCCESS : MDBX_BAD_DBI; -} -#if MDBX_ENABLE_DBI_SPARSE -static __inline size_t dbi_bitmap_ctz(const MDBX_txn *txn, intptr_t bmi) { - tASSERT(txn, bmi > 0); - STATIC_ASSERT(sizeof(bmi) >= sizeof(txn->mt_dbi_sparse[0])); -#if __GNUC_PREREQ(4, 1) || __has_builtin(__builtin_ctzl) - if (sizeof(txn->mt_dbi_sparse[0]) <= sizeof(int)) - return __builtin_ctz((int)bmi); - if (sizeof(txn->mt_dbi_sparse[0]) == sizeof(long)) - return __builtin_ctzl((long)bmi); -#if (defined(__SIZEOF_LONG_LONG__) && __SIZEOF_LONG_LONG__ == 8) || \ - __has_builtin(__builtin_ctzll) - return __builtin_ctzll(bmi); -#endif /* have(long long) && long long == uint64_t */ -#endif /* GNU C */ +MDBX_INTERNAL int lck_setup(MDBX_env *env, mdbx_mode_t mode); +#if MDBX_LOCKING > MDBX_LOCKING_SYSV +MDBX_INTERNAL int lck_ipclock_stubinit(osal_ipclock_t *ipc); +MDBX_INTERNAL int lck_ipclock_destroy(osal_ipclock_t *ipc); +#endif /* MDBX_LOCKING > MDBX_LOCKING_SYSV */ -#if defined(_MSC_VER) - unsigned long index; - if (sizeof(txn->mt_dbi_sparse[0]) > 4) { -#if defined(_M_AMD64) || defined(_M_ARM64) || defined(_M_X64) - _BitScanForward64(&index, bmi); - return index; -#else - if (bmi > UINT32_MAX) { - _BitScanForward(&index, (uint32_t)((uint64_t)bmi >> 32)); - return index; - } -#endif - } - _BitScanForward(&index, (uint32_t)bmi); - return index; -#endif /* MSVC */ +MDBX_INTERNAL int lck_init(MDBX_env *env, MDBX_env *inprocess_neighbor, + int global_uniqueness_flag); - bmi &= -bmi; - if (sizeof(txn->mt_dbi_sparse[0]) > 4) { - static const uint8_t debruijn_ctz64[64] = { - 0, 1, 2, 53, 3, 7, 54, 27, 4, 38, 41, 8, 34, 55, 48, 28, - 62, 5, 39, 46, 44, 42, 22, 9, 24, 35, 59, 56, 49, 18, 29, 11, - 63, 52, 6, 26, 37, 40, 33, 47, 61, 45, 43, 21, 23, 58, 17, 10, - 51, 25, 36, 32, 60, 20, 57, 16, 50, 31, 19, 15, 30, 14, 13, 12}; - return debruijn_ctz64[(UINT64_C(0x022FDD63CC95386D) * (uint64_t)bmi) >> 58]; - } else { - static const uint8_t debruijn_ctz32[32] = { - 0, 1, 28, 2, 29, 14, 24, 3, 30, 22, 20, 15, 25, 17, 4, 
8, - 31, 27, 13, 23, 21, 19, 16, 7, 26, 12, 18, 6, 11, 5, 10, 9}; - return debruijn_ctz32[(UINT32_C(0x077CB531) * (uint32_t)bmi) >> 27]; - } -} +MDBX_INTERNAL int lck_destroy(MDBX_env *env, MDBX_env *inprocess_neighbor, + const uint32_t current_pid); -/* LY: Макрос целенаправленно сделан с одним циклом, чтобы сохранить возможность - * использования оператора break */ -#define TXN_FOREACH_DBI_FROM(TXN, I, FROM) \ - for (size_t bitmap_chunk = CHAR_BIT * sizeof(TXN->mt_dbi_sparse[0]), \ - bitmap_item = TXN->mt_dbi_sparse[0] >> FROM, I = FROM; \ - I < TXN->mt_numdbs; ++I) \ - if (bitmap_item == 0) { \ - I = (I - 1) | (bitmap_chunk - 1); \ - bitmap_item = TXN->mt_dbi_sparse[(1 + I) / bitmap_chunk]; \ - if (!bitmap_item) \ - I += bitmap_chunk; \ - continue; \ - } else if ((bitmap_item & 1) == 0) { \ - size_t bitmap_skip = dbi_bitmap_ctz(txn, bitmap_item); \ - bitmap_item >>= bitmap_skip; \ - I += bitmap_skip - 1; \ - continue; \ - } else if (bitmap_item >>= 1, TXN->mt_dbi_state[I]) -#else -#define TXN_FOREACH_DBI_FROM(TXN, I, SKIP) \ - for (size_t I = SKIP; I < TXN->mt_numdbs; ++I) \ - if (TXN->mt_dbi_state[I]) -#endif /* MDBX_ENABLE_DBI_SPARSE */ +MDBX_INTERNAL int lck_seize(MDBX_env *env); -#define TXN_FOREACH_DBI_ALL(TXN, I) TXN_FOREACH_DBI_FROM(TXN, I, 0) -#define TXN_FOREACH_DBI_USER(TXN, I) TXN_FOREACH_DBI_FROM(TXN, I, CORE_DBS) +MDBX_INTERNAL int lck_downgrade(MDBX_env *env); -/* Back up parent txn's cursor, then grab the original for tracking */ -static int cursor_shadow(MDBX_cursor *parent_cursor, MDBX_txn *nested_txn, - const size_t dbi) { +MDBX_MAYBE_UNUSED MDBX_INTERNAL int lck_upgrade(MDBX_env *env, bool dont_wait); - tASSERT(nested_txn, dbi > FREE_DBI && dbi < nested_txn->mt_numdbs); - const size_t size = parent_cursor->mc_xcursor - ? 
sizeof(MDBX_cursor) + sizeof(MDBX_xcursor) - : sizeof(MDBX_cursor); - for (MDBX_cursor *bk; parent_cursor; parent_cursor = bk->mc_next) { - bk = parent_cursor; - if (parent_cursor->mc_signature != MDBX_MC_LIVE) - continue; - bk = osal_malloc(size); - if (unlikely(!bk)) - return MDBX_ENOMEM; -#if MDBX_DEBUG - memset(bk, 0xCD, size); - VALGRIND_MAKE_MEM_UNDEFINED(bk, size); -#endif /* MDBX_DEBUG */ - *bk = *parent_cursor; - parent_cursor->mc_backup = bk; - /* Kill pointers into src to reduce abuse: The - * user may not use mc until dst ends. But we need a valid - * txn pointer here for cursor fixups to keep working. */ - parent_cursor->mc_txn = nested_txn; - parent_cursor->mc_db = &nested_txn->mt_dbs[dbi]; - parent_cursor->mc_dbi_state = &nested_txn->mt_dbi_state[dbi]; - MDBX_xcursor *mx = parent_cursor->mc_xcursor; - if (mx != NULL) { - *(MDBX_xcursor *)(bk + 1) = *mx; - mx->mx_cursor.mc_txn = nested_txn; - } - parent_cursor->mc_next = nested_txn->mt_cursors[dbi]; - nested_txn->mt_cursors[dbi] = parent_cursor; - } - return MDBX_SUCCESS; -} +MDBX_INTERNAL int lck_rdt_lock(MDBX_env *env); -/* Close this txn's cursors, give parent txn's cursors back to parent. - * - * [in] txn the transaction handle. - * [in] merge true to keep changes to parent cursors, false to revert. - * - * Returns 0 on success, non-zero on failure. 
*/ -static void cursors_eot(MDBX_txn *txn, const bool merge) { - tASSERT(txn, txn->mt_cursors[FREE_DBI] == nullptr); - TXN_FOREACH_DBI_FROM(txn, i, /* skip FREE_DBI */ 1) { - MDBX_cursor *mc = txn->mt_cursors[i]; - if (!mc) - continue; - txn->mt_cursors[i] = nullptr; - do { - const unsigned stage = mc->mc_signature; - MDBX_cursor *const next = mc->mc_next; - MDBX_cursor *const bk = mc->mc_backup; - ENSURE(txn->mt_env, - stage == MDBX_MC_LIVE || (stage == MDBX_MC_WAIT4EOT && bk)); - cASSERT(mc, mc->mc_dbi == (MDBX_dbi)i); - if (bk) { - MDBX_xcursor *mx = mc->mc_xcursor; - tASSERT(txn, txn->mt_parent != NULL); - /* Zap: Using uninitialized memory '*mc->mc_backup'. */ - MDBX_SUPPRESS_GOOFY_MSVC_ANALYZER(6001); - ENSURE(txn->mt_env, bk->mc_signature == MDBX_MC_LIVE); - tASSERT(txn, mx == bk->mc_xcursor); - if (stage == MDBX_MC_WAIT4EOT /* Cursor was closed by user */) - mc->mc_signature = stage /* Promote closed state to parent txn */; - else if (merge) { - /* Restore pointers to parent txn */ - mc->mc_next = bk->mc_next; - mc->mc_backup = bk->mc_backup; - mc->mc_txn = bk->mc_txn; - mc->mc_db = bk->mc_db; - mc->mc_dbi_state = bk->mc_dbi_state; - if (mx) { - if (mx != bk->mc_xcursor) { - *bk->mc_xcursor = *mx; - mx = bk->mc_xcursor; - } - mx->mx_cursor.mc_txn = bk->mc_txn; - } - } else { - /* Restore from backup, i.e. 
rollback/abort nested txn */ - *mc = *bk; - if (mx) - *mx = *(MDBX_xcursor *)(bk + 1); - } - bk->mc_signature = 0; - osal_free(bk); - } else { - ENSURE(txn->mt_env, stage == MDBX_MC_LIVE); - mc->mc_signature = MDBX_MC_READY4CLOSE /* Cursor may be reused */; - mc->mc_flags = 0 /* reset C_UNTRACK */; - } - mc = next; - } while (mc); - } -} +MDBX_INTERNAL void lck_rdt_unlock(MDBX_env *env); -static __noinline int dbi_import(MDBX_txn *txn, const size_t dbi); +MDBX_INTERNAL int lck_txn_lock(MDBX_env *env, bool dont_wait); -static __inline bool db_check_flags(uint16_t db_flags) { - switch (db_flags & ~(DB_VALID | MDBX_REVERSEKEY | MDBX_INTEGERKEY)) { - default: - NOTICE("invalid db-flags 0x%x", db_flags); - return false; - case MDBX_DUPSORT: - case MDBX_DUPSORT | MDBX_REVERSEDUP: - case MDBX_DUPSORT | MDBX_DUPFIXED: - case MDBX_DUPSORT | MDBX_DUPFIXED | MDBX_REVERSEDUP: - case MDBX_DUPSORT | MDBX_DUPFIXED | MDBX_INTEGERDUP: - case MDBX_DUPSORT | MDBX_DUPFIXED | MDBX_INTEGERDUP | MDBX_REVERSEDUP: - case MDBX_DB_DEFAULTS: - return (db_flags & (MDBX_REVERSEKEY | MDBX_INTEGERKEY)) != - (MDBX_REVERSEKEY | MDBX_INTEGERKEY); - } -} +MDBX_INTERNAL void lck_txn_unlock(MDBX_env *env); -static __inline uint8_t dbi_state(const MDBX_txn *txn, const size_t dbi) { - STATIC_ASSERT(DBI_DIRTY == MDBX_DBI_DIRTY && DBI_STALE == MDBX_DBI_STALE && - DBI_FRESH == MDBX_DBI_FRESH && DBI_CREAT == MDBX_DBI_CREAT); +MDBX_INTERNAL int lck_rpid_set(MDBX_env *env); -#if MDBX_ENABLE_DBI_SPARSE - const size_t bitmap_chunk = CHAR_BIT * sizeof(txn->mt_dbi_sparse[0]); - const size_t bitmap_indx = dbi / bitmap_chunk; - const size_t bitmap_mask = (size_t)1 << dbi % bitmap_chunk; - return likely(dbi < txn->mt_numdbs && - (txn->mt_dbi_sparse[bitmap_indx] & bitmap_mask) != 0) - ? 
txn->mt_dbi_state[dbi] - : 0; +MDBX_INTERNAL int lck_rpid_clear(MDBX_env *env); + +MDBX_INTERNAL int lck_rpid_check(MDBX_env *env, uint32_t pid); + + + + +static inline uint64_t meta_sign_calculate(const meta_t *meta) { + uint64_t sign = DATASIGN_NONE; +#if 0 /* TODO */ + sign = hippeus_hash64(...); #else - return likely(dbi < txn->mt_numdbs) ? txn->mt_dbi_state[dbi] : 0; -#endif /* MDBX_ENABLE_DBI_SPARSE */ + (void)meta; +#endif + /* LY: newer returns DATASIGN_NONE or DATASIGN_WEAK */ + return (sign > DATASIGN_WEAK) ? sign : ~sign; } -static __inline bool dbi_changed(const MDBX_txn *txn, const size_t dbi) { - const MDBX_env *const env = txn->mt_env; - eASSERT(env, dbi_state(txn, dbi) & DBI_LINDO); - const uint32_t snap_seq = - atomic_load32(&env->me_dbi_seqs[dbi], mo_AcquireRelease); - return snap_seq != txn->mt_dbi_seqs[dbi]; +static inline uint64_t meta_sign_get(const volatile meta_t *meta) { + return unaligned_peek_u64_volatile(4, meta->sign); } -static __always_inline int dbi_check(const MDBX_txn *txn, const size_t dbi) { - const uint8_t state = dbi_state(txn, dbi); - if (likely((state & DBI_LINDO) != 0 && !dbi_changed(txn, dbi))) - return (state & DBI_VALID) ? MDBX_SUCCESS : MDBX_BAD_DBI; - - /* Медленный путь: ленивая до-инициализацяи и импорт */ - return dbi_import((MDBX_txn *)txn, dbi); +static inline void meta_sign_as_steady(meta_t *meta) { + unaligned_poke_u64(4, meta->sign, meta_sign_calculate(meta)); } -static __inline uint32_t dbi_seq_next(const MDBX_env *const env, size_t dbi) { - uint32_t v = atomic_load32(&env->me_dbi_seqs[dbi], mo_AcquireRelease) + 1; - return v ? 
v : 1; +static inline bool meta_is_steady(const volatile meta_t *meta) { + return SIGN_IS_STEADY(meta_sign_get(meta)); } -struct dbi_snap_result { - uint32_t sequence; - unsigned flags; -}; +MDBX_INTERNAL troika_t meta_tap(const MDBX_env *env); +MDBX_INTERNAL unsigned meta_eq_mask(const troika_t *troika); +MDBX_INTERNAL bool meta_should_retry(const MDBX_env *env, troika_t *troika); +MDBX_MAYBE_UNUSED MDBX_INTERNAL bool troika_verify_fsm(void); -static struct dbi_snap_result dbi_snap(const MDBX_env *env, const size_t dbi) { - eASSERT(env, dbi < env->me_numdbs); - struct dbi_snap_result r; - uint32_t snap = atomic_load32(&env->me_dbi_seqs[dbi], mo_AcquireRelease); - do { - r.sequence = snap; - r.flags = env->me_db_flags[dbi]; - snap = atomic_load32(&env->me_dbi_seqs[dbi], mo_AcquireRelease); - } while (unlikely(snap != r.sequence)); - return r; -} +struct meta_ptr { + txnid_t txnid; + union { + const volatile meta_t *ptr_v; + const meta_t *ptr_c; + }; + size_t is_steady; +}; -static __noinline int dbi_import(MDBX_txn *txn, const size_t dbi) { - const MDBX_env *const env = txn->mt_env; - if (dbi >= env->me_numdbs || !env->me_db_flags[dbi]) - return MDBX_BAD_DBI; +MDBX_INTERNAL meta_ptr_t meta_ptr(const MDBX_env *env, unsigned n); +MDBX_INTERNAL txnid_t meta_txnid(const volatile meta_t *meta); +MDBX_INTERNAL txnid_t recent_committed_txnid(const MDBX_env *env); +MDBX_INTERNAL int meta_sync(const MDBX_env *env, const meta_ptr_t head); -#if MDBX_ENABLE_DBI_SPARSE - const size_t bitmap_chunk = CHAR_BIT * sizeof(txn->mt_dbi_sparse[0]); - const size_t bitmap_indx = dbi / bitmap_chunk; - const size_t bitmap_mask = (size_t)1 << dbi % bitmap_chunk; - if (dbi >= txn->mt_numdbs) { - for (size_t i = (txn->mt_numdbs + bitmap_chunk - 1) / bitmap_chunk; - bitmap_indx >= i; ++i) - txn->mt_dbi_sparse[i] = 0; - eASSERT(env, (txn->mt_dbi_sparse[bitmap_indx] & bitmap_mask) == 0); - MDBX_txn *scan = txn; - do { - eASSERT(env, scan->mt_dbi_sparse == txn->mt_dbi_sparse); - eASSERT(env, 
scan->mt_numdbs < dbi + 1); - scan->mt_numdbs = (unsigned)dbi + 1; - scan->mt_dbi_state[dbi] = 0; - scan = scan->mt_parent; - } while (scan /* && scan->mt_dbi_sparse == txn->mt_dbi_sparse */); - txn->mt_dbi_sparse[bitmap_indx] |= bitmap_mask; - goto lindo; - } - if ((txn->mt_dbi_sparse[bitmap_indx] & bitmap_mask) == 0) { - MDBX_txn *scan = txn; - do { - eASSERT(env, scan->mt_dbi_sparse == txn->mt_dbi_sparse); - eASSERT(env, scan->mt_numdbs == txn->mt_numdbs); - scan->mt_dbi_state[dbi] = 0; - scan = scan->mt_parent; - } while (scan /* && scan->mt_dbi_sparse == txn->mt_dbi_sparse */); - txn->mt_dbi_sparse[bitmap_indx] |= bitmap_mask; - goto lindo; - } -#else - if (dbi >= txn->mt_numdbs) { - size_t i = txn->mt_numdbs; - do - txn->mt_dbi_state[i] = 0; - while (dbi >= ++i); - txn->mt_numdbs = i; - goto lindo; - } -#endif /* MDBX_ENABLE_DBI_SPARSE */ +MDBX_INTERNAL const char *durable_caption(const meta_t *const meta); +MDBX_INTERNAL void meta_troika_dump(const MDBX_env *env, + const troika_t *troika); - if (!txn->mt_dbi_state[dbi]) { - lindo: - /* dbi-слот еще не инициализирован в транзакции, а хендл не использовался */ - txn->mt_cursors[dbi] = nullptr; - MDBX_txn *const parent = txn->mt_parent; - if (parent) { - /* вложенная пишущая транзакция */ - int rc = dbi_check(parent, dbi); - /* копируем состояние subDB очищая new-флаги. 
*/ - eASSERT(env, txn->mt_dbi_seqs == parent->mt_dbi_seqs); - txn->mt_dbi_state[dbi] = - parent->mt_dbi_state[dbi] & ~(DBI_FRESH | DBI_CREAT | DBI_DIRTY); - if (likely(rc == MDBX_SUCCESS)) { - txn->mt_dbs[dbi] = parent->mt_dbs[dbi]; - if (parent->mt_cursors[dbi]) { - rc = cursor_shadow(parent->mt_cursors[dbi], txn, dbi); - if (unlikely(rc != MDBX_SUCCESS)) { - /* не получилось забекапить курсоры */ - txn->mt_dbi_state[dbi] = DBI_OLDEN | DBI_LINDO | DBI_STALE; - txn->mt_flags |= MDBX_TXN_ERROR; - } - } - } - return rc; - } - txn->mt_dbi_seqs[dbi] = 0; - txn->mt_dbi_state[dbi] = DBI_LINDO; - } else { - eASSERT(env, txn->mt_dbi_seqs[dbi] != env->me_dbi_seqs[dbi].weak); - if (unlikely((txn->mt_dbi_state[dbi] & (DBI_VALID | DBI_OLDEN)) || - txn->mt_cursors[dbi])) { - /* хендл уже использовался в транзакции, но был закрыт или переоткрыт, - * либо при явном пере-открытии хендла есть висячие курсоры */ - eASSERT(env, (txn->mt_dbi_state[dbi] & DBI_STALE) == 0); - txn->mt_dbi_seqs[dbi] = env->me_dbi_seqs[dbi].weak; - txn->mt_dbi_state[dbi] = DBI_OLDEN | DBI_LINDO; - return txn->mt_cursors[dbi] ? 
MDBX_DANGLING_DBI : MDBX_BAD_DBI; - } - } +#define METAPAGE(env, n) page_meta(pgno2page(env, n)) +#define METAPAGE_END(env) METAPAGE(env, NUM_METAS) - /* хендл не использовался в транзакции, либо явно пере-отрывается при - * отсутствии висячих курсоров */ - eASSERT(env, (txn->mt_dbi_state[dbi] & DBI_LINDO) && !txn->mt_cursors[dbi]); +static inline meta_ptr_t meta_recent(const MDBX_env *env, + const troika_t *troika) { + meta_ptr_t r; + r.txnid = troika->txnid[troika->recent]; + r.ptr_v = METAPAGE(env, troika->recent); + r.is_steady = (troika->fsm >> troika->recent) & 1; + return r; +} - /* читаем актуальные флаги и sequence */ - struct dbi_snap_result snap = dbi_snap(env, dbi); - txn->mt_dbi_seqs[dbi] = snap.sequence; - if (snap.flags & DB_VALID) { - txn->mt_dbs[dbi].md_flags = snap.flags & DB_PERSISTENT_FLAGS; - txn->mt_dbi_state[dbi] = DBI_LINDO | DBI_VALID | DBI_STALE; - return MDBX_SUCCESS; - } - return MDBX_BAD_DBI; +static inline meta_ptr_t meta_prefer_steady(const MDBX_env *env, + const troika_t *troika) { + meta_ptr_t r; + r.txnid = troika->txnid[troika->prefer_steady]; + r.ptr_v = METAPAGE(env, troika->prefer_steady); + r.is_steady = (troika->fsm >> troika->prefer_steady) & 1; + return r; } -/* Export or close DBI handles opened in this txn. 
*/ -static int dbi_update(MDBX_txn *txn, int keep) { - MDBX_env *const env = txn->mt_env; - tASSERT(txn, !txn->mt_parent && txn == env->me_txn0); - bool locked = false; - struct mdbx_defer_free_item *defer_chain = nullptr; - TXN_FOREACH_DBI_USER(txn, dbi) { - if (likely((txn->mt_dbi_state[dbi] & DBI_CREAT) == 0)) - continue; - if (!locked) { - int err = osal_fastmutex_acquire(&env->me_dbi_lock); - if (unlikely(err != MDBX_SUCCESS)) - return err; - locked = true; - if (dbi >= env->me_numdbs) - /* хендл был закрыт из другого потока пока захватывали блокировку */ - continue; - } - tASSERT(txn, dbi < env->me_numdbs); - if (keep) { - env->me_db_flags[dbi] = txn->mt_dbs[dbi].md_flags | DB_VALID; - } else { - uint32_t seq = dbi_seq_next(env, dbi); - struct mdbx_defer_free_item *item = env->me_dbxs[dbi].md_name.iov_base; - if (item) { - env->me_db_flags[dbi] = 0; - env->me_dbxs[dbi].md_name.iov_len = 0; - env->me_dbxs[dbi].md_name.iov_base = nullptr; - atomic_store32(&env->me_dbi_seqs[dbi], seq, mo_AcquireRelease); - osal_flush_incoherent_cpu_writeback(); - item->next = defer_chain; - defer_chain = item; - } else { - eASSERT(env, env->me_dbxs[dbi].md_name.iov_len == 0); - eASSERT(env, env->me_db_flags[dbi] == 0); - } - } - } +static inline meta_ptr_t meta_tail(const MDBX_env *env, + const troika_t *troika) { + const uint8_t tail = troika->tail_and_flags & 3; + MDBX_ANALYSIS_ASSUME(tail < NUM_METAS); + meta_ptr_t r; + r.txnid = troika->txnid[tail]; + r.ptr_v = METAPAGE(env, tail); + r.is_steady = (troika->fsm >> tail) & 1; + return r; +} - if (locked) { - size_t i = env->me_numdbs; - while ((env->me_db_flags[i - 1] & DB_VALID) == 0) { - --i; - eASSERT(env, i >= CORE_DBS); - eASSERT(env, !env->me_db_flags[i] && !env->me_dbxs[i].md_name.iov_len && - !env->me_dbxs[i].md_name.iov_base); - } - env->me_numdbs = (unsigned)i; - env_defer_free_and_release(env, defer_chain); - } - return MDBX_SUCCESS; +static inline bool meta_bootid_match(const meta_t *meta) { + return 
memcmp(&meta->bootid, &globals.bootid, 16) == 0 && + (globals.bootid.x | globals.bootid.y) != 0; } -int mdbx_cmp(const MDBX_txn *txn, MDBX_dbi dbi, const MDBX_val *a, - const MDBX_val *b) { - eASSERT(NULL, txn->mt_signature == MDBX_MT_SIGNATURE); - tASSERT(txn, (dbi_state(txn, dbi) & DBI_VALID) && !dbi_changed(txn, dbi)); - tASSERT(txn, dbi < txn->mt_env->me_numdbs && - (txn->mt_env->me_db_flags[dbi] & DB_VALID) != 0); - return txn->mt_env->me_dbxs[dbi].md_cmp(a, b); +static inline bool meta_weak_acceptable(const MDBX_env *env, const meta_t *meta, + const int lck_exclusive) { + return lck_exclusive + ? /* exclusive lock */ meta_bootid_match(meta) + : /* db already opened */ env->lck_mmap.lck && + (env->lck_mmap.lck->envmode.weak & MDBX_RDONLY) == 0; } -int mdbx_dcmp(const MDBX_txn *txn, MDBX_dbi dbi, const MDBX_val *a, - const MDBX_val *b) { - eASSERT(NULL, txn->mt_signature == MDBX_MT_SIGNATURE); - tASSERT(txn, (dbi_state(txn, dbi) & DBI_VALID) && !dbi_changed(txn, dbi)); - tASSERT(txn, dbi < txn->mt_env->me_numdbs && - (txn->mt_env->me_db_flags[dbi] & DB_VALID)); - return txn->mt_env->me_dbxs[dbi].md_dcmp(a, b); -} - -/* Allocate memory for a page. - * Re-use old malloc'ed pages first for singletons, otherwise just malloc. - * Set MDBX_TXN_ERROR on failure. 
*/ -static MDBX_page *page_malloc(MDBX_txn *txn, size_t num) { - MDBX_env *env = txn->mt_env; - MDBX_page *np = env->me_dp_reserve; - size_t size = env->me_psize; - if (likely(num == 1 && np)) { - eASSERT(env, env->me_dp_reserve_len > 0); - MDBX_ASAN_UNPOISON_MEMORY_REGION(np, size); - VALGRIND_MEMPOOL_ALLOC(env, ptr_disp(np, -(ptrdiff_t)sizeof(size_t)), - size + sizeof(size_t)); - VALGRIND_MAKE_MEM_DEFINED(&mp_next(np), sizeof(MDBX_page *)); - env->me_dp_reserve = mp_next(np); - env->me_dp_reserve_len -= 1; - } else { - size = pgno2bytes(env, num); - void *const ptr = osal_malloc(size + sizeof(size_t)); - if (unlikely(!ptr)) { - txn->mt_flags |= MDBX_TXN_ERROR; - return nullptr; - } - VALGRIND_MEMPOOL_ALLOC(env, ptr, size + sizeof(size_t)); - np = ptr_disp(ptr, sizeof(size_t)); - } +MDBX_NOTHROW_PURE_FUNCTION static inline txnid_t +constmeta_txnid(const meta_t *meta) { + const txnid_t a = unaligned_peek_u64(4, &meta->txnid_a); + const txnid_t b = unaligned_peek_u64(4, &meta->txnid_b); + return likely(a == b) ? a : 0; +} - if ((env->me_flags & MDBX_NOMEMINIT) == 0) { - /* For a single page alloc, we init everything after the page header. - * For multi-page, we init the final page; if the caller needed that - * many pages they will be filling in at least up to the last page. 
*/ - size_t skip = PAGEHDRSZ; - if (num > 1) - skip += pgno2bytes(env, num - 1); - memset(ptr_disp(np, skip), 0, size - skip); - } -#if MDBX_DEBUG - np->mp_pgno = 0; +static inline void meta_update_begin(const MDBX_env *env, meta_t *meta, + txnid_t txnid) { + eASSERT(env, meta >= METAPAGE(env, 0) && meta < METAPAGE_END(env)); + eASSERT(env, unaligned_peek_u64(4, meta->txnid_a) < txnid && + unaligned_peek_u64(4, meta->txnid_b) < txnid); + (void)env; +#if (defined(__amd64__) || defined(__e2k__)) && !defined(ENABLE_UBSAN) && \ + MDBX_UNALIGNED_OK >= 8 + atomic_store64((mdbx_atomic_uint64_t *)&meta->txnid_b, 0, mo_AcquireRelease); + atomic_store64((mdbx_atomic_uint64_t *)&meta->txnid_a, txnid, + mo_AcquireRelease); +#else + atomic_store32(&meta->txnid_b[__BYTE_ORDER__ != __ORDER_LITTLE_ENDIAN__], 0, + mo_AcquireRelease); + atomic_store32(&meta->txnid_b[__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__], 0, + mo_AcquireRelease); + atomic_store32(&meta->txnid_a[__BYTE_ORDER__ != __ORDER_LITTLE_ENDIAN__], + (uint32_t)txnid, mo_AcquireRelease); + atomic_store32(&meta->txnid_a[__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__], + (uint32_t)(txnid >> 32), mo_AcquireRelease); #endif - VALGRIND_MAKE_MEM_UNDEFINED(np, size); - np->mp_flags = 0; - np->mp_pages = (pgno_t)num; - return np; } -/* Free a shadow dirty page */ -static void dpage_free(MDBX_env *env, MDBX_page *dp, size_t npages) { - VALGRIND_MAKE_MEM_UNDEFINED(dp, pgno2bytes(env, npages)); - MDBX_ASAN_UNPOISON_MEMORY_REGION(dp, pgno2bytes(env, npages)); - if (unlikely(env->me_flags & MDBX_PAGEPERTURB)) - memset(dp, -1, pgno2bytes(env, npages)); - if (npages == 1 && - env->me_dp_reserve_len < env->me_options.dp_reserve_limit) { - MDBX_ASAN_POISON_MEMORY_REGION(dp, env->me_psize); - MDBX_ASAN_UNPOISON_MEMORY_REGION(&mp_next(dp), sizeof(MDBX_page *)); - mp_next(dp) = env->me_dp_reserve; - VALGRIND_MEMPOOL_FREE(env, ptr_disp(dp, -(ptrdiff_t)sizeof(size_t))); - env->me_dp_reserve = dp; - env->me_dp_reserve_len += 1; - } else { - /* large 
pages just get freed directly */ - void *const ptr = ptr_disp(dp, -(ptrdiff_t)sizeof(size_t)); - VALGRIND_MEMPOOL_FREE(env, ptr); - osal_free(ptr); - } +static inline void meta_update_end(const MDBX_env *env, meta_t *meta, + txnid_t txnid) { + eASSERT(env, meta >= METAPAGE(env, 0) && meta < METAPAGE_END(env)); + eASSERT(env, unaligned_peek_u64(4, meta->txnid_a) == txnid); + eASSERT(env, unaligned_peek_u64(4, meta->txnid_b) < txnid); + (void)env; + jitter4testing(true); + memcpy(&meta->bootid, &globals.bootid, 16); +#if (defined(__amd64__) || defined(__e2k__)) && !defined(ENABLE_UBSAN) && \ + MDBX_UNALIGNED_OK >= 8 + atomic_store64((mdbx_atomic_uint64_t *)&meta->txnid_b, txnid, + mo_AcquireRelease); +#else + atomic_store32(&meta->txnid_b[__BYTE_ORDER__ != __ORDER_LITTLE_ENDIAN__], + (uint32_t)txnid, mo_AcquireRelease); + atomic_store32(&meta->txnid_b[__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__], + (uint32_t)(txnid >> 32), mo_AcquireRelease); +#endif } -/* Return all dirty pages to dpage list */ -static void dlist_free(MDBX_txn *txn) { - tASSERT(txn, (txn->mt_flags & (MDBX_TXN_RDONLY | MDBX_WRITEMAP)) == 0); - MDBX_env *env = txn->mt_env; - MDBX_dpl *const dl = txn->tw.dirtylist; +static inline void meta_set_txnid(const MDBX_env *env, meta_t *meta, + const txnid_t txnid) { + eASSERT(env, !env->dxb_mmap.base || meta < METAPAGE(env, 0) || + meta >= METAPAGE_END(env)); + (void)env; + /* update inconsistently since this function used ONLY for filling meta-image + * for writing, but not the actual meta-page */ + memcpy(&meta->bootid, &globals.bootid, 16); + unaligned_poke_u64(4, meta->txnid_a, txnid); + unaligned_poke_u64(4, meta->txnid_b, txnid); +} - for (size_t i = 1; i <= dl->length; i++) - dpage_free(env, dl->items[i].ptr, dpl_npages(dl, i)); +static inline uint8_t meta_cmp2int(txnid_t a, txnid_t b, uint8_t s) { + return unlikely(a == b) ? 1 * s : (a > b) ? 
2 * s : 0 * s; +} - dpl_clear(dl); +static inline uint8_t meta_cmp2recent(uint8_t ab_cmp2int, bool a_steady, + bool b_steady) { + assert(ab_cmp2int < 3 /* && a_steady< 2 && b_steady < 2 */); + return ab_cmp2int > 1 || (ab_cmp2int == 1 && a_steady > b_steady); } -static __always_inline MDBX_db *outer_db(MDBX_cursor *mc) { - cASSERT(mc, (mc->mc_flags & C_SUB) != 0); - MDBX_xcursor *mx = container_of(mc->mc_db, MDBX_xcursor, mx_db); - MDBX_cursor_couple *couple = container_of(mx, MDBX_cursor_couple, inner); - cASSERT(mc, mc->mc_db == &couple->outer.mc_xcursor->mx_db); - cASSERT(mc, mc->mc_dbx == &couple->outer.mc_xcursor->mx_dbx); - return couple->outer.mc_db; +static inline uint8_t meta_cmp2steady(uint8_t ab_cmp2int, bool a_steady, + bool b_steady) { + assert(ab_cmp2int < 3 /* && a_steady< 2 && b_steady < 2 */); + return a_steady > b_steady || (a_steady == b_steady && ab_cmp2int > 1); } -MDBX_MAYBE_UNUSED __cold static bool dirtylist_check(MDBX_txn *txn) { - tASSERT(txn, (txn->mt_flags & MDBX_TXN_RDONLY) == 0); - const MDBX_dpl *const dl = txn->tw.dirtylist; - if (!dl) { - tASSERT(txn, (txn->mt_flags & MDBX_WRITEMAP) != 0 && !MDBX_AVOID_MSYNC); - return true; - } - tASSERT(txn, (txn->mt_flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC); +static inline bool meta_choice_recent(txnid_t a_txnid, bool a_steady, + txnid_t b_txnid, bool b_steady) { + return meta_cmp2recent(meta_cmp2int(a_txnid, b_txnid, 1), a_steady, b_steady); +} - assert(dl->items[0].pgno == 0 && dl->items[dl->length + 1].pgno == P_INVALID); - tASSERT(txn, txn->tw.dirtyroom + dl->length == - (txn->mt_parent ? 
txn->mt_parent->tw.dirtyroom - : txn->mt_env->me_options.dp_limit)); +static inline bool meta_choice_steady(txnid_t a_txnid, bool a_steady, + txnid_t b_txnid, bool b_steady) { + return meta_cmp2steady(meta_cmp2int(a_txnid, b_txnid, 1), a_steady, b_steady); +} - if (!AUDIT_ENABLED()) - return true; +MDBX_INTERNAL meta_t *meta_init_triplet(const MDBX_env *env, void *buffer); - size_t loose = 0, pages = 0; - for (size_t i = dl->length; i > 0; --i) { - const MDBX_page *const dp = dl->items[i].ptr; - if (!dp) - continue; +MDBX_INTERNAL int meta_validate(MDBX_env *env, meta_t *const meta, + const page_t *const page, + const unsigned meta_number, + unsigned *guess_pagesize); - tASSERT(txn, dp->mp_pgno == dl->items[i].pgno); - if (unlikely(dp->mp_pgno != dl->items[i].pgno)) - return false; +MDBX_INTERNAL int __must_check_result meta_validate_copy(MDBX_env *env, + const meta_t *meta, + meta_t *dest); - if ((txn->mt_flags & MDBX_WRITEMAP) == 0) { - const uint32_t age = dpl_age(txn, i); - tASSERT(txn, age < UINT32_MAX / 3); - if (unlikely(age > UINT32_MAX / 3)) - return false; - } +MDBX_INTERNAL int __must_check_result meta_override(MDBX_env *env, + size_t target, + txnid_t txnid, + const meta_t *shape); - tASSERT(txn, dp->mp_flags == P_LOOSE || IS_MODIFIABLE(txn, dp)); - if (dp->mp_flags == P_LOOSE) { - loose += 1; - } else if (unlikely(!IS_MODIFIABLE(txn, dp))) - return false; +MDBX_INTERNAL int meta_wipe_steady(MDBX_env *env, txnid_t inclusive_upto); - const unsigned num = dpl_npages(dl, i); - pages += num; - tASSERT(txn, txn->mt_next_pgno >= dp->mp_pgno + num); - if (unlikely(txn->mt_next_pgno < dp->mp_pgno + num)) - return false; - if (i < dl->sorted) { - tASSERT(txn, dl->items[i + 1].pgno >= dp->mp_pgno + num); - if (unlikely(dl->items[i + 1].pgno < dp->mp_pgno + num)) - return false; - } - const size_t rpa = - pnl_search(txn->tw.relist, dp->mp_pgno, txn->mt_next_pgno); - tASSERT(txn, rpa > MDBX_PNL_GETSIZE(txn->tw.relist) || - txn->tw.relist[rpa] != dp->mp_pgno); - if 
(rpa <= MDBX_PNL_GETSIZE(txn->tw.relist) && - unlikely(txn->tw.relist[rpa] == dp->mp_pgno)) - return false; - if (num > 1) { - const size_t rpb = - pnl_search(txn->tw.relist, dp->mp_pgno + num - 1, txn->mt_next_pgno); - tASSERT(txn, rpa == rpb); - if (unlikely(rpa != rpb)) - return false; - } - } - tASSERT(txn, loose == txn->tw.loose_count); - if (unlikely(loose != txn->tw.loose_count)) - return false; +#if !(defined(_WIN32) || defined(_WIN64)) +#define MDBX_WRITETHROUGH_THRESHOLD_DEFAULT 2 +#endif - tASSERT(txn, pages == dl->pages_including_loose); - if (unlikely(pages != dl->pages_including_loose)) - return false; +struct iov_ctx { + MDBX_env *env; + osal_ioring_t *ior; + mdbx_filehandle_t fd; + int err; +#ifndef MDBX_NEED_WRITTEN_RANGE +#define MDBX_NEED_WRITTEN_RANGE 1 +#endif /* MDBX_NEED_WRITTEN_RANGE */ +#if MDBX_NEED_WRITTEN_RANGE + pgno_t flush_begin; + pgno_t flush_end; +#endif /* MDBX_NEED_WRITTEN_RANGE */ + uint64_t coherency_timestamp; +}; - for (size_t i = 1; i <= MDBX_PNL_GETSIZE(txn->tw.retired_pages); ++i) { - const MDBX_page *const dp = debug_dpl_find(txn, txn->tw.retired_pages[i]); - tASSERT(txn, !dp); - if (unlikely(dp)) - return false; - } +MDBX_INTERNAL __must_check_result int +iov_init(MDBX_txn *const txn, iov_ctx_t *ctx, size_t items, size_t npages, + mdbx_filehandle_t fd, bool check_coherence); - return true; +static inline bool iov_empty(const iov_ctx_t *ctx) { + return osal_ioring_used(ctx->ior) == 0; } -#if MDBX_ENABLE_REFUND -static void refund_reclaimed(MDBX_txn *txn) { - /* Scanning in descend order */ - pgno_t next_pgno = txn->mt_next_pgno; - const MDBX_PNL pnl = txn->tw.relist; - tASSERT(txn, MDBX_PNL_GETSIZE(pnl) && MDBX_PNL_MOST(pnl) == next_pgno - 1); +MDBX_INTERNAL __must_check_result int iov_page(MDBX_txn *txn, iov_ctx_t *ctx, + page_t *dp, size_t npages); + +MDBX_INTERNAL __must_check_result int iov_write(iov_ctx_t *ctx); + + + + +MDBX_INTERNAL void spill_remove(MDBX_txn *txn, size_t idx, size_t npages); +MDBX_INTERNAL pnl_t 
spill_purge(MDBX_txn *txn); +MDBX_INTERNAL int spill_slowpath(MDBX_txn *const txn, MDBX_cursor *const m0, + const intptr_t wanna_spill_entries, + const intptr_t wanna_spill_npages, + const size_t need); +/*----------------------------------------------------------------------------*/ + +static inline size_t spill_search(const MDBX_txn *txn, pgno_t pgno) { + tASSERT(txn, (txn->flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC); + const pnl_t pnl = txn->tw.spilled.list; + if (likely(!pnl)) + return 0; + pgno <<= 1; + size_t n = pnl_search(pnl, pgno, (size_t)MAX_PAGENO + MAX_PAGENO + 1); + return (n <= MDBX_PNL_GETSIZE(pnl) && pnl[n] == pgno) ? n : 0; +} + +static inline bool spill_intersect(const MDBX_txn *txn, pgno_t pgno, + size_t npages) { + const pnl_t pnl = txn->tw.spilled.list; + if (likely(!pnl)) + return false; + const size_t len = MDBX_PNL_GETSIZE(pnl); + if (LOG_ENABLED(MDBX_LOG_EXTRA)) { + DEBUG_EXTRA("PNL len %zu [", len); + for (size_t i = 1; i <= len; ++i) + DEBUG_EXTRA_PRINT(" %li", (pnl[i] & 1) ? 
-(long)(pnl[i] >> 1) + : (long)(pnl[i] >> 1)); + DEBUG_EXTRA_PRINT("%s\n", "]"); + } + const pgno_t spilled_range_begin = pgno << 1; + const pgno_t spilled_range_last = ((pgno + (pgno_t)npages) << 1) - 1; #if MDBX_PNL_ASCENDING - size_t i = MDBX_PNL_GETSIZE(pnl); - tASSERT(txn, pnl[i] == next_pgno - 1); - while (--next_pgno, --i > 0 && pnl[i] == next_pgno - 1) - ; - MDBX_PNL_SETSIZE(pnl, i); + const size_t n = + pnl_search(pnl, spilled_range_begin, (size_t)(MAX_PAGENO + 1) << 1); + tASSERT(txn, n && (n == MDBX_PNL_GETSIZE(pnl) + 1 || + spilled_range_begin <= pnl[n])); + const bool rc = n <= MDBX_PNL_GETSIZE(pnl) && pnl[n] <= spilled_range_last; #else - size_t i = 1; - tASSERT(txn, pnl[i] == next_pgno - 1); - size_t len = MDBX_PNL_GETSIZE(pnl); - while (--next_pgno, ++i <= len && pnl[i] == next_pgno - 1) - ; - MDBX_PNL_SETSIZE(pnl, len -= i - 1); - for (size_t move = 0; move < len; ++move) - pnl[1 + move] = pnl[i + move]; + const size_t n = + pnl_search(pnl, spilled_range_last, (size_t)MAX_PAGENO + MAX_PAGENO + 1); + tASSERT(txn, n && (n == MDBX_PNL_GETSIZE(pnl) + 1 || + spilled_range_last >= pnl[n])); + const bool rc = n <= MDBX_PNL_GETSIZE(pnl) && pnl[n] >= spilled_range_begin; #endif - VERBOSE("refunded %" PRIaPGNO " pages: %" PRIaPGNO " -> %" PRIaPGNO, - txn->mt_next_pgno - next_pgno, txn->mt_next_pgno, next_pgno); - txn->mt_next_pgno = next_pgno; - tASSERT(txn, pnl_check_allocated(txn->tw.relist, txn->mt_next_pgno - 1)); + if (ASSERT_ENABLED()) { + bool check = false; + for (size_t i = 0; i < npages; ++i) + check |= spill_search(txn, (pgno_t)(pgno + i)) != 0; + tASSERT(txn, check == rc); + } + return rc; } -static void refund_loose(MDBX_txn *txn) { - tASSERT(txn, txn->tw.loose_pages != nullptr); - tASSERT(txn, txn->tw.loose_count > 0); +static inline int txn_spill(MDBX_txn *const txn, MDBX_cursor *const m0, + const size_t need) { + tASSERT(txn, (txn->flags & MDBX_TXN_RDONLY) == 0); + tASSERT(txn, !m0 || cursor_is_tracked(m0)); - MDBX_dpl *const dl = 
txn->tw.dirtylist; - if (dl) { - tASSERT(txn, dl->length >= txn->tw.loose_count); - tASSERT(txn, (txn->mt_flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC); - } else { - tASSERT(txn, (txn->mt_flags & MDBX_WRITEMAP) != 0 && !MDBX_AVOID_MSYNC); - } + const intptr_t wanna_spill_entries = + txn->tw.dirtylist ? (need - txn->tw.dirtyroom - txn->tw.loose_count) : 0; + const intptr_t wanna_spill_npages = + need + + (txn->tw.dirtylist ? txn->tw.dirtylist->pages_including_loose + : txn->tw.writemap_dirty_npages) - + txn->tw.loose_count - txn->env->options.dp_limit; - pgno_t onstack[MDBX_CACHELINE_SIZE * 8 / sizeof(pgno_t)]; - MDBX_PNL suitable = onstack; + /* production mode */ + if (likely(wanna_spill_npages < 1 && wanna_spill_entries < 1) +#if xMDBX_DEBUG_SPILLING == 1 + /* debug mode: always try to spill if xMDBX_DEBUG_SPILLING == 1 */ + && txn->txnid % 23 > 11 +#endif + ) + return MDBX_SUCCESS; - if (!dl || dl->length - dl->sorted > txn->tw.loose_count) { - /* Dirty list is useless since unsorted. */ - if (pnl_bytes2size(sizeof(onstack)) < txn->tw.loose_count) { - suitable = pnl_alloc(txn->tw.loose_count); - if (unlikely(!suitable)) - return /* this is not a reason for transaction fail */; - } + return spill_slowpath(txn, m0, wanna_spill_entries, wanna_spill_npages, need); +} - /* Collect loose-pages which may be refunded. */ - tASSERT(txn, txn->mt_next_pgno >= MIN_PAGENO + txn->tw.loose_count); - pgno_t most = MIN_PAGENO; - size_t w = 0; - for (const MDBX_page *lp = txn->tw.loose_pages; lp; lp = mp_next(lp)) { - tASSERT(txn, lp->mp_flags == P_LOOSE); - tASSERT(txn, txn->mt_next_pgno > lp->mp_pgno); - if (likely(txn->mt_next_pgno - txn->tw.loose_count <= lp->mp_pgno)) { - tASSERT(txn, - w < ((suitable == onstack) ? pnl_bytes2size(sizeof(onstack)) - : MDBX_PNL_ALLOCLEN(suitable))); - suitable[++w] = lp->mp_pgno; - most = (lp->mp_pgno > most) ? 
lp->mp_pgno : most; - } - MDBX_ASAN_UNPOISON_MEMORY_REGION(&mp_next(lp), sizeof(MDBX_page *)); - VALGRIND_MAKE_MEM_DEFINED(&mp_next(lp), sizeof(MDBX_page *)); - } - if (most + 1 == txn->mt_next_pgno) { - /* Sort suitable list and refund pages at the tail. */ - MDBX_PNL_SETSIZE(suitable, w); - pnl_sort(suitable, MAX_PAGENO + 1); - /* Scanning in descend order */ - const intptr_t step = MDBX_PNL_ASCENDING ? -1 : 1; - const intptr_t begin = - MDBX_PNL_ASCENDING ? MDBX_PNL_GETSIZE(suitable) : 1; - const intptr_t end = - MDBX_PNL_ASCENDING ? 0 : MDBX_PNL_GETSIZE(suitable) + 1; - tASSERT(txn, suitable[begin] >= suitable[end - step]); - tASSERT(txn, most == suitable[begin]); - for (intptr_t i = begin + step; i != end; i += step) { - if (suitable[i] != most - 1) - break; - most -= 1; - } - const size_t refunded = txn->mt_next_pgno - most; - DEBUG("refund-suitable %zu pages %" PRIaPGNO " -> %" PRIaPGNO, refunded, - most, txn->mt_next_pgno); - txn->mt_next_pgno = most; - txn->tw.loose_count -= refunded; - if (dl) { - txn->tw.dirtyroom += refunded; - dl->pages_including_loose -= refunded; - assert(txn->tw.dirtyroom <= txn->mt_env->me_options.dp_limit); +MDBX_INTERNAL int __must_check_result tree_search_finalize(MDBX_cursor *mc, + const MDBX_val *key, + int flags); +MDBX_INTERNAL int tree_search_lowest(MDBX_cursor *mc); - /* Filter-out dirty list */ - size_t r = 0; - w = 0; - if (dl->sorted) { - do { - if (dl->items[++r].pgno < most) { - if (++w != r) - dl->items[w] = dl->items[r]; - } - } while (r < dl->sorted); - dl->sorted = w; - } - while (r < dl->length) { - if (dl->items[++r].pgno < most) { - if (++w != r) - dl->items[w] = dl->items[r]; - } - } - dpl_setlen(dl, w); - tASSERT(txn, txn->tw.dirtyroom + txn->tw.dirtylist->length == - (txn->mt_parent ? txn->mt_parent->tw.dirtyroom - : txn->mt_env->me_options.dp_limit)); - } - goto unlink_loose; - } - } else { - /* Dirtylist is mostly sorted, just refund loose pages at the end. 
*/ - dpl_sort(txn); - tASSERT(txn, - dl->length < 2 || dl->items[1].pgno < dl->items[dl->length].pgno); - tASSERT(txn, dl->sorted == dl->length); +enum page_search_flags { + Z_MODIFY = 1, + Z_ROOTONLY = 2, + Z_FIRST = 4, + Z_LAST = 8, +}; +MDBX_INTERNAL int __must_check_result tree_search(MDBX_cursor *mc, + const MDBX_val *key, + int flags); - /* Scan dirtylist tail-forward and cutoff suitable pages. */ - size_t n; - for (n = dl->length; dl->items[n].pgno == txn->mt_next_pgno - 1 && - dl->items[n].ptr->mp_flags == P_LOOSE; - --n) { - tASSERT(txn, n > 0); - MDBX_page *dp = dl->items[n].ptr; - DEBUG("refund-sorted page %" PRIaPGNO, dp->mp_pgno); - tASSERT(txn, dp->mp_pgno == dl->items[n].pgno); - txn->mt_next_pgno -= 1; - } - dpl_setlen(dl, n); +#define MDBX_SPLIT_REPLACE MDBX_APPENDDUP /* newkey is not new */ +MDBX_INTERNAL int __must_check_result page_split(MDBX_cursor *mc, + const MDBX_val *const newkey, + MDBX_val *const newdata, + pgno_t newpgno, + const unsigned naf); - if (dl->sorted != dl->length) { - const size_t refunded = dl->sorted - dl->length; - dl->sorted = dl->length; - txn->tw.loose_count -= refunded; - txn->tw.dirtyroom += refunded; - dl->pages_including_loose -= refunded; - tASSERT(txn, txn->tw.dirtyroom + txn->tw.dirtylist->length == - (txn->mt_parent ? txn->mt_parent->tw.dirtyroom - : txn->mt_env->me_options.dp_limit)); +/*----------------------------------------------------------------------------*/ - /* Filter-out loose chain & dispose refunded pages. 
*/ - unlink_loose: - for (MDBX_page *__restrict *__restrict link = &txn->tw.loose_pages; - *link;) { - MDBX_page *dp = *link; - tASSERT(txn, dp->mp_flags == P_LOOSE); - MDBX_ASAN_UNPOISON_MEMORY_REGION(&mp_next(dp), sizeof(MDBX_page *)); - VALGRIND_MAKE_MEM_DEFINED(&mp_next(dp), sizeof(MDBX_page *)); - if (txn->mt_next_pgno > dp->mp_pgno) { - link = &mp_next(dp); - } else { - *link = mp_next(dp); - if ((txn->mt_flags & MDBX_WRITEMAP) == 0) - dpage_free(txn->mt_env, dp, 1); - } - } - } - } +MDBX_INTERNAL int MDBX_PRINTF_ARGS(2, 3) + bad_page(const page_t *mp, const char *fmt, ...); - tASSERT(txn, dirtylist_check(txn)); - if (suitable != onstack) - pnl_free(suitable); - txn->tw.loose_refund_wl = txn->mt_next_pgno; +MDBX_INTERNAL void MDBX_PRINTF_ARGS(2, 3) + poor_page(const page_t *mp, const char *fmt, ...); + +MDBX_NOTHROW_PURE_FUNCTION static inline bool is_frozen(const MDBX_txn *txn, + const page_t *mp) { + return mp->txnid < txn->txnid; } -static bool txn_refund(MDBX_txn *txn) { - const pgno_t before = txn->mt_next_pgno; +MDBX_NOTHROW_PURE_FUNCTION static inline bool is_spilled(const MDBX_txn *txn, + const page_t *mp) { + return mp->txnid == txn->txnid; +} - if (txn->tw.loose_pages && txn->tw.loose_refund_wl > txn->mt_next_pgno) - refund_loose(txn); +MDBX_NOTHROW_PURE_FUNCTION static inline bool is_shadowed(const MDBX_txn *txn, + const page_t *mp) { + return mp->txnid > txn->txnid; +} - while (true) { - if (MDBX_PNL_GETSIZE(txn->tw.relist) == 0 || - MDBX_PNL_MOST(txn->tw.relist) != txn->mt_next_pgno - 1) - break; +MDBX_MAYBE_UNUSED MDBX_NOTHROW_PURE_FUNCTION static inline bool +is_correct(const MDBX_txn *txn, const page_t *mp) { + return mp->txnid <= txn->front_txnid; +} - refund_reclaimed(txn); - if (!txn->tw.loose_pages || txn->tw.loose_refund_wl <= txn->mt_next_pgno) - break; +MDBX_NOTHROW_PURE_FUNCTION static inline bool is_modifable(const MDBX_txn *txn, + const page_t *mp) { + return mp->txnid == txn->front_txnid; +} - const pgno_t memo = txn->mt_next_pgno; 
- refund_loose(txn); - if (memo == txn->mt_next_pgno) - break; - } +MDBX_INTERNAL int __must_check_result page_check(const MDBX_cursor *const mc, + const page_t *const mp); - if (before == txn->mt_next_pgno) - return false; +MDBX_INTERNAL pgr_t page_get_any(const MDBX_cursor *const mc, const pgno_t pgno, + const txnid_t front); - if (txn->tw.spilled.list) - /* Squash deleted pagenums if we refunded any */ - spill_purge(txn); +MDBX_INTERNAL pgr_t page_get_three(const MDBX_cursor *const mc, + const pgno_t pgno, const txnid_t front); - return true; -} -#else /* MDBX_ENABLE_REFUND */ -static __inline bool txn_refund(MDBX_txn *txn) { - (void)txn; - /* No online auto-compactification. */ - return false; +MDBX_INTERNAL pgr_t page_get_large(const MDBX_cursor *const mc, + const pgno_t pgno, const txnid_t front); + +static inline int __must_check_result page_get(const MDBX_cursor *mc, + const pgno_t pgno, page_t **mp, + const txnid_t front) { + pgr_t ret = page_get_three(mc, pgno, front); + *mp = ret.page; + return ret.err; } -#endif /* MDBX_ENABLE_REFUND */ -__cold static void kill_page(MDBX_txn *txn, MDBX_page *mp, pgno_t pgno, - size_t npages) { - MDBX_env *const env = txn->mt_env; - DEBUG("kill %zu page(s) %" PRIaPGNO, npages, pgno); - eASSERT(env, pgno >= NUM_METAS && npages); - if (!IS_FROZEN(txn, mp)) { - const size_t bytes = pgno2bytes(env, npages); - memset(mp, -1, bytes); - mp->mp_pgno = pgno; - if ((txn->mt_flags & MDBX_WRITEMAP) == 0) - osal_pwrite(env->me_lazy_fd, mp, bytes, pgno2bytes(env, pgno)); - } else { - struct iovec iov[MDBX_AUXILARY_IOV_MAX]; - iov[0].iov_len = env->me_psize; - iov[0].iov_base = ptr_disp(env->me_pbuf, env->me_psize); - size_t iov_off = pgno2bytes(env, pgno), n = 1; - while (--npages) { - iov[n] = iov[0]; - if (++n == MDBX_AUXILARY_IOV_MAX) { - osal_pwritev(env->me_lazy_fd, iov, MDBX_AUXILARY_IOV_MAX, iov_off); - iov_off += pgno2bytes(env, MDBX_AUXILARY_IOV_MAX); - n = 0; - } 
+/*----------------------------------------------------------------------------*/ + +MDBX_INTERNAL int __must_check_result page_dirty(MDBX_txn *txn, page_t *mp, + size_t npages); +MDBX_INTERNAL pgr_t page_new(MDBX_cursor *mc, const unsigned flags); +MDBX_INTERNAL pgr_t page_new_large(MDBX_cursor *mc, const size_t npages); +MDBX_INTERNAL int page_touch_modifable(MDBX_txn *txn, const page_t *const mp); +MDBX_INTERNAL int page_touch_unmodifable(MDBX_txn *txn, MDBX_cursor *mc, + const page_t *const mp); + +static inline int page_touch(MDBX_cursor *mc) { + page_t *const mp = mc->pg[mc->top]; + MDBX_txn *txn = mc->txn; + + tASSERT(txn, mc->txn->flags & MDBX_TXN_DIRTY); + tASSERT(txn, + F_ISSET(*cursor_dbi_state(mc), DBI_LINDO | DBI_VALID | DBI_DIRTY)); + tASSERT(txn, !is_largepage(mp)); + if (ASSERT_ENABLED()) { + if (mc->flags & z_inner) { + subcur_t *mx = container_of(mc->tree, subcur_t, nested_tree); + cursor_couple_t *couple = container_of(mx, cursor_couple_t, inner); + tASSERT(txn, mc->tree == &couple->outer.subcur->nested_tree); + tASSERT(txn, &mc->clc->k == &couple->outer.clc->v); + tASSERT(txn, *couple->outer.dbi_state & DBI_DIRTY); + } + tASSERT(txn, dpl_check(txn)); + } + + if (is_modifable(txn, mp)) { + if (!txn->tw.dirtylist) { + tASSERT(txn, (txn->flags & MDBX_WRITEMAP) && !MDBX_AVOID_MSYNC); + return MDBX_SUCCESS; } - osal_pwritev(env->me_lazy_fd, iov, n, iov_off); + return is_subpage(mp) ? 
MDBX_SUCCESS : page_touch_modifable(txn, mp); } + return page_touch_unmodifable(txn, mc, mp); +} + +MDBX_INTERNAL void page_copy(page_t *const dst, const page_t *const src, + const size_t size); +MDBX_INTERNAL pgr_t __must_check_result page_unspill(MDBX_txn *const txn, + const page_t *const mp); + +MDBX_INTERNAL page_t *page_shadow_alloc(MDBX_txn *txn, size_t num); + +MDBX_INTERNAL void page_shadow_release(MDBX_env *env, page_t *dp, + size_t npages); + +MDBX_INTERNAL int page_retire_ex(MDBX_cursor *mc, const pgno_t pgno, + page_t *mp /* maybe null */, + unsigned pageflags /* maybe unknown/zero */); + +static inline int page_retire(MDBX_cursor *mc, page_t *mp) { + return page_retire_ex(mc, mp->pgno, mp, mp->flags); } -/* Remove page from dirty list, etc */ -static __inline void page_wash(MDBX_txn *txn, size_t di, MDBX_page *const mp, - const size_t npages) { - tASSERT(txn, (txn->mt_flags & MDBX_TXN_RDONLY) == 0); - mp->mp_txnid = INVALID_TXNID; - mp->mp_flags = P_BAD; +static inline void page_wash(MDBX_txn *txn, size_t di, page_t *const mp, + const size_t npages) { + tASSERT(txn, (txn->flags & MDBX_TXN_RDONLY) == 0); + mp->txnid = INVALID_TXNID; + mp->flags = P_BAD; if (txn->tw.dirtylist) { - tASSERT(txn, (txn->mt_flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC); + tASSERT(txn, (txn->flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC); tASSERT(txn, MDBX_AVOID_MSYNC || (di && txn->tw.dirtylist->items[di].ptr == mp)); if (!MDBX_AVOID_MSYNC || di) { dpl_remove_ex(txn, di, npages); txn->tw.dirtyroom++; tASSERT(txn, txn->tw.dirtyroom + txn->tw.dirtylist->length == - (txn->mt_parent ? txn->mt_parent->tw.dirtyroom - : txn->mt_env->me_options.dp_limit)); - if (!MDBX_AVOID_MSYNC || !(txn->mt_flags & MDBX_WRITEMAP)) { - dpage_free(txn->mt_env, mp, npages); + (txn->parent ? 
txn->parent->tw.dirtyroom + : txn->env->options.dp_limit)); + if (!MDBX_AVOID_MSYNC || !(txn->flags & MDBX_WRITEMAP)) { + page_shadow_release(txn->env, mp, npages); return; } } } else { - tASSERT(txn, (txn->mt_flags & MDBX_WRITEMAP) && !MDBX_AVOID_MSYNC && !di); + tASSERT(txn, (txn->flags & MDBX_WRITEMAP) && !MDBX_AVOID_MSYNC && !di); txn->tw.writemap_dirty_npages -= (txn->tw.writemap_dirty_npages > npages) ? npages : txn->tw.writemap_dirty_npages; } VALGRIND_MAKE_MEM_UNDEFINED(mp, PAGEHDRSZ); VALGRIND_MAKE_MEM_NOACCESS(page_data(mp), - pgno2bytes(txn->mt_env, npages) - PAGEHDRSZ); + pgno2bytes(txn->env, npages) - PAGEHDRSZ); MDBX_ASAN_POISON_MEMORY_REGION(page_data(mp), - pgno2bytes(txn->mt_env, npages) - PAGEHDRSZ); + pgno2bytes(txn->env, npages) - PAGEHDRSZ); } -static __inline bool suitable4loose(const MDBX_txn *txn, pgno_t pgno) { - /* TODO: - * 1) при включенной "экономии последовательностей" проверить, что - * страница не примыкает к какой-либо из уже находящийся в reclaimed. - * 2) стоит подумать над тем, чтобы при большом loose-списке отбрасывать - половину в reclaimed. */ - return txn->tw.loose_count < txn->mt_env->me_options.dp_loose_limit && - (!MDBX_ENABLE_REFUND || - /* skip pages near to the end in favor of compactification */ - txn->mt_next_pgno > pgno + txn->mt_env->me_options.dp_loose_limit || - txn->mt_next_pgno <= txn->mt_env->me_options.dp_loose_limit); -} +MDBX_INTERNAL size_t page_subleaf2_reserve(const MDBX_env *env, + size_t host_page_room, + size_t subpage_len, size_t item_len); -/* Retire, loosen or free a single page. - * - * For dirty pages, saves single pages to a list for future reuse in this same - * txn. It has been pulled from the GC and already resides on the dirty list, - * but has been deleted. Use these pages first before pulling again from the GC. - * - * If the page wasn't dirtied in this txn, just add it - * to this txn's free list. 
*/ -static int page_retire_ex(MDBX_cursor *mc, const pgno_t pgno, - MDBX_page *mp /* maybe null */, - unsigned pageflags /* maybe unknown/zero */) { - int rc; - MDBX_txn *const txn = mc->mc_txn; - tASSERT(txn, !mp || (mp->mp_pgno == pgno && mp->mp_flags == pageflags)); +#define page_next(mp) \ + (*(page_t **)ptr_disp((mp)->entries, sizeof(void *) - sizeof(uint32_t))) - /* During deleting entire subtrees, it is reasonable and possible to avoid - * reading leaf pages, i.e. significantly reduce hard page-faults & IOPs: - * - mp is null, i.e. the page has not yet been read; - * - pagetype is known and the P_LEAF bit is set; - * - we can determine the page status via scanning the lists - * of dirty and spilled pages. - * - * On the other hand, this could be suboptimal for WRITEMAP mode, since - * requires support the list of dirty pages and avoid explicit spilling. - * So for flexibility and avoid extra internal dependencies we just - * fallback to reading if dirty list was not allocated yet. 
*/ - size_t di = 0, si = 0, npages = 1; - enum page_status { - unknown, - frozen, - spilled, - shadowed, - modifable - } status = unknown; - if (unlikely(!mp)) { - if (ASSERT_ENABLED() && pageflags) { - pgr_t check; - check = page_get_any(mc, pgno, txn->mt_front); - if (unlikely(check.err != MDBX_SUCCESS)) - return check.err; - tASSERT(txn, - (check.page->mp_flags & ~P_SPILLED) == (pageflags & ~P_FROZEN)); - tASSERT(txn, !(pageflags & P_FROZEN) || IS_FROZEN(txn, check.page)); - } - if (pageflags & P_FROZEN) { - status = frozen; - if (ASSERT_ENABLED()) { - for (MDBX_txn *scan = txn; scan; scan = scan->mt_parent) { - tASSERT(txn, !txn->tw.spilled.list || !search_spilled(scan, pgno)); - tASSERT(txn, !scan->tw.dirtylist || !debug_dpl_find(scan, pgno)); - } - } - goto status_done; - } else if (pageflags && txn->tw.dirtylist) { - if ((di = dpl_exist(txn, pgno)) != 0) { - mp = txn->tw.dirtylist->items[di].ptr; - tASSERT(txn, IS_MODIFIABLE(txn, mp)); - status = modifable; - goto status_done; - } - if ((si = search_spilled(txn, pgno)) != 0) { - status = spilled; - goto status_done; - } - for (MDBX_txn *parent = txn->mt_parent; parent; - parent = parent->mt_parent) { - if (dpl_exist(parent, pgno)) { - status = shadowed; - goto status_done; - } - if (search_spilled(parent, pgno)) { - status = spilled; - goto status_done; - } - } - status = frozen; - goto status_done; - } - pgr_t pg = page_get_any(mc, pgno, txn->mt_front); - if (unlikely(pg.err != MDBX_SUCCESS)) - return pg.err; - mp = pg.page; - tASSERT(txn, !pageflags || mp->mp_flags == pageflags); - pageflags = mp->mp_flags; - } - if (IS_FROZEN(txn, mp)) { - status = frozen; - tASSERT(txn, !IS_MODIFIABLE(txn, mp)); - tASSERT(txn, !IS_SPILLED(txn, mp)); - tASSERT(txn, !IS_SHADOWED(txn, mp)); - tASSERT(txn, !debug_dpl_find(txn, pgno)); - tASSERT(txn, !txn->tw.spilled.list || !search_spilled(txn, pgno)); - } else if (IS_MODIFIABLE(txn, mp)) { - status = modifable; - if (txn->tw.dirtylist) - di = dpl_exist(txn, pgno); - 
tASSERT(txn, (txn->mt_flags & MDBX_WRITEMAP) || !IS_SPILLED(txn, mp)); - tASSERT(txn, !txn->tw.spilled.list || !search_spilled(txn, pgno)); - } else if (IS_SHADOWED(txn, mp)) { - status = shadowed; - tASSERT(txn, !txn->tw.spilled.list || !search_spilled(txn, pgno)); - tASSERT(txn, !debug_dpl_find(txn, pgno)); - } else { - tASSERT(txn, IS_SPILLED(txn, mp)); - status = spilled; - si = search_spilled(txn, pgno); - tASSERT(txn, !debug_dpl_find(txn, pgno)); - } +MDBX_INTERNAL void rthc_ctor(void); +MDBX_INTERNAL void rthc_dtor(const uint32_t current_pid); +MDBX_INTERNAL void rthc_lock(void); +MDBX_INTERNAL void rthc_unlock(void); -status_done: - if (likely((pageflags & P_OVERFLOW) == 0)) { - STATIC_ASSERT(P_BRANCH == 1); - const bool is_branch = pageflags & P_BRANCH; - if (unlikely(mc->mc_flags & C_SUB)) { - MDBX_db *outer = outer_db(mc); - cASSERT(mc, !is_branch || outer->md_branch_pages > 0); - outer->md_branch_pages -= is_branch; - cASSERT(mc, is_branch || outer->md_leaf_pages > 0); - outer->md_leaf_pages -= 1 - is_branch; - } - cASSERT(mc, !is_branch || mc->mc_db->md_branch_pages > 0); - mc->mc_db->md_branch_pages -= is_branch; - cASSERT(mc, (pageflags & P_LEAF) == 0 || mc->mc_db->md_leaf_pages > 0); - mc->mc_db->md_leaf_pages -= (pageflags & P_LEAF) != 0; - } else { - npages = mp->mp_pages; - cASSERT(mc, mc->mc_db->md_overflow_pages >= npages); - mc->mc_db->md_overflow_pages -= (pgno_t)npages; - } +MDBX_INTERNAL int rthc_register(MDBX_env *const env); +MDBX_INTERNAL int rthc_remove(MDBX_env *const env); +MDBX_INTERNAL int rthc_uniq_check(const osal_mmap_t *pending, MDBX_env **found); - if (status == frozen) { - retire: - DEBUG("retire %zu page %" PRIaPGNO, npages, pgno); - rc = pnl_append_range(false, &txn->tw.retired_pages, pgno, npages); - tASSERT(txn, dirtylist_check(txn)); - return rc; - } +/* dtor called for thread, i.e. for all mdbx's environment objects */ +MDBX_INTERNAL void rthc_thread_dtor(void *rthc); - /* Возврат страниц в нераспределенный "хвост" БД. 
- * Содержимое страниц не уничтожается, а для вложенных транзакций граница - * нераспределенного "хвоста" БД сдвигается только при их коммите. */ - if (MDBX_ENABLE_REFUND && unlikely(pgno + npages == txn->mt_next_pgno)) { - const char *kind = nullptr; - if (status == modifable) { - /* Страница испачкана в этой транзакции, но до этого могла быть - * аллоцирована, испачкана и пролита в одной из родительских транзакций. - * Её МОЖНО вытолкнуть в нераспределенный хвост. */ - kind = "dirty"; - /* Remove from dirty list */ - page_wash(txn, di, mp, npages); - } else if (si) { - /* Страница пролита в этой транзакции, т.е. она аллоцирована - * и запачкана в этой или одной из родительских транзакций. - * Её МОЖНО вытолкнуть в нераспределенный хвост. */ - kind = "spilled"; - tASSERT(txn, status == spilled); - spill_remove(txn, si, npages); - } else { - /* Страница аллоцирована, запачкана и возможно пролита в одной - * из родительских транзакций. - * Её МОЖНО вытолкнуть в нераспределенный хвост. */ - kind = "parent's"; - if (ASSERT_ENABLED() && mp) { - kind = nullptr; - for (MDBX_txn *parent = txn->mt_parent; parent; - parent = parent->mt_parent) { - if (search_spilled(parent, pgno)) { - kind = "parent-spilled"; - tASSERT(txn, status == spilled); - break; - } - if (mp == debug_dpl_find(parent, pgno)) { - kind = "parent-dirty"; - tASSERT(txn, status == shadowed); - break; - } - } - tASSERT(txn, kind != nullptr); - } - tASSERT(txn, status == spilled || status == shadowed); - } - DEBUG("refunded %zu %s page %" PRIaPGNO, npages, kind, pgno); - txn->mt_next_pgno = pgno; - txn_refund(txn); - return MDBX_SUCCESS; - } +static inline void *thread_rthc_get(osal_thread_key_t key) { +#if defined(_WIN32) || defined(_WIN64) + return TlsGetValue(key); +#else + return pthread_getspecific(key); +#endif +} - if (status == modifable) { - /* Dirty page from this transaction */ - /* If suitable we can reuse it through loose list */ - if (likely(npages == 1 && suitable4loose(txn, pgno)) && - (di || 
!txn->tw.dirtylist)) { - DEBUG("loosen dirty page %" PRIaPGNO, pgno); - if (MDBX_DEBUG != 0 || unlikely(txn->mt_env->me_flags & MDBX_PAGEPERTURB)) - memset(page_data(mp), -1, txn->mt_env->me_psize - PAGEHDRSZ); - mp->mp_txnid = INVALID_TXNID; - mp->mp_flags = P_LOOSE; - mp_next(mp) = txn->tw.loose_pages; - txn->tw.loose_pages = mp; - txn->tw.loose_count++; -#if MDBX_ENABLE_REFUND - txn->tw.loose_refund_wl = (pgno + 2 > txn->tw.loose_refund_wl) - ? pgno + 2 - : txn->tw.loose_refund_wl; -#endif /* MDBX_ENABLE_REFUND */ - VALGRIND_MAKE_MEM_NOACCESS(page_data(mp), - txn->mt_env->me_psize - PAGEHDRSZ); - MDBX_ASAN_POISON_MEMORY_REGION(page_data(mp), - txn->mt_env->me_psize - PAGEHDRSZ); - return MDBX_SUCCESS; - } +MDBX_INTERNAL void thread_rthc_set(osal_thread_key_t key, const void *value); -#if !MDBX_DEBUG && !defined(ENABLE_MEMCHECK) && !defined(__SANITIZE_ADDRESS__) - if (unlikely(txn->mt_env->me_flags & MDBX_PAGEPERTURB)) +#if !defined(_WIN32) && !defined(_WIN64) +MDBX_INTERNAL void rthc_afterfork(void); +MDBX_INTERNAL void workaround_glibc_bug21031(void); +#endif /* !Windows */ + +static inline void thread_key_delete(osal_thread_key_t key) { + TRACE("key = %" PRIuPTR, (uintptr_t)key); +#if defined(_WIN32) || defined(_WIN64) + ENSURE(nullptr, TlsFree(key)); +#else + ENSURE(nullptr, pthread_key_delete(key) == 0); + workaround_glibc_bug21031(); #endif - { - /* Страница могла быть изменена в одной из родительских транзакций, - * в том числе, позже выгружена и затем снова загружена и изменена. 
- * В обоих случаях её нельзя затирать на диске и помечать недоступной - * в asan и/или valgrind */ - for (MDBX_txn *parent = txn->mt_parent; - parent && (parent->mt_flags & MDBX_TXN_SPILLS); - parent = parent->mt_parent) { - if (intersect_spilled(parent, pgno, npages)) - goto skip_invalidate; - if (dpl_intersect(parent, pgno, npages)) - goto skip_invalidate; - } +} -#if defined(ENABLE_MEMCHECK) || defined(__SANITIZE_ADDRESS__) - if (MDBX_DEBUG != 0 || unlikely(txn->mt_env->me_flags & MDBX_PAGEPERTURB)) -#endif - kill_page(txn, mp, pgno, npages); - if ((txn->mt_flags & MDBX_WRITEMAP) == 0) { - VALGRIND_MAKE_MEM_NOACCESS(page_data(pgno2page(txn->mt_env, pgno)), - pgno2bytes(txn->mt_env, npages) - PAGEHDRSZ); - MDBX_ASAN_POISON_MEMORY_REGION(page_data(pgno2page(txn->mt_env, pgno)), - pgno2bytes(txn->mt_env, npages) - - PAGEHDRSZ); - } - } - skip_invalidate: - /* wash dirty page */ - page_wash(txn, di, mp, npages); - reclaim: - DEBUG("reclaim %zu %s page %" PRIaPGNO, npages, "dirty", pgno); - rc = pnl_insert_range(&txn->tw.relist, pgno, npages); - tASSERT(txn, pnl_check_allocated(txn->tw.relist, - txn->mt_next_pgno - MDBX_ENABLE_REFUND)); - tASSERT(txn, dirtylist_check(txn)); - return rc; - } - if (si) { - /* Page ws spilled in this txn */ - spill_remove(txn, si, npages); - /* Страница могла быть выделена и затем пролита в этой транзакции, - * тогда её необходимо поместить в reclaimed-список. - * Либо она могла быть выделена в одной из родительских транзакций и затем - * пролита в этой транзакции, тогда её необходимо поместить в - * retired-список для последующей фильтрации при коммите. */ - for (MDBX_txn *parent = txn->mt_parent; parent; - parent = parent->mt_parent) { - if (dpl_exist(parent, pgno)) - goto retire; - } - /* Страница точно была выделена в этой транзакции - * и теперь может быть использована повторно. 
*/ - goto reclaim; - } +typedef struct walk_sdb { + MDBX_val name; + tree_t *internal, *nested; +} walk_sdb_t; - if (status == shadowed) { - /* Dirty page MUST BE a clone from (one of) parent transaction(s). */ - if (ASSERT_ENABLED()) { - const MDBX_page *parent_dp = nullptr; - /* Check parent(s)'s dirty lists. */ - for (MDBX_txn *parent = txn->mt_parent; parent && !parent_dp; - parent = parent->mt_parent) { - tASSERT(txn, !search_spilled(parent, pgno)); - parent_dp = debug_dpl_find(parent, pgno); - } - tASSERT(txn, parent_dp && (!mp || parent_dp == mp)); - } - /* Страница была выделена в родительской транзакции и теперь может быть - * использована повторно, но только внутри этой транзакции, либо дочерних. - */ - goto reclaim; - } +typedef int walk_func(const size_t pgno, const unsigned number, void *const ctx, + const int deep, const walk_sdb_t *subdb, + const size_t page_size, const page_type_t page_type, + const MDBX_error_t err, const size_t nentries, + const size_t payload_bytes, const size_t header_bytes, + const size_t unused_bytes); - /* Страница может входить в доступный читателям MVCC-снимок, либо же она - * могла быть выделена, а затем пролита в одной из родительских - * транзакций. Поэтому пока помещаем её в retired-список, который будет - * фильтроваться относительно dirty- и spilled-списков родительских - * транзакций при коммите дочерних транзакций, либо же будет записан - * в GC в неизменном виде. 
*/ - goto retire; -} +typedef enum walk_options { dont_check_keys_ordering = 1 } walk_options_t; -static __inline int page_retire(MDBX_cursor *mc, MDBX_page *mp) { - return page_retire_ex(mc, mp->mp_pgno, mp, mp->mp_flags); -} +MDBX_INTERNAL int walk_pages(MDBX_txn *txn, walk_func *visitor, void *user, + walk_options_t options); -typedef struct iov_ctx { - MDBX_env *env; - osal_ioring_t *ior; - mdbx_filehandle_t fd; - int err; -#ifndef MDBX_NEED_WRITTEN_RANGE -#define MDBX_NEED_WRITTEN_RANGE 1 -#endif /* MDBX_NEED_WRITTEN_RANGE */ -#if MDBX_NEED_WRITTEN_RANGE - pgno_t flush_begin; - pgno_t flush_end; -#endif /* MDBX_NEED_WRITTEN_RANGE */ - uint64_t coherency_timestamp; -} iov_ctx_t; - -__must_check_result static int iov_init(MDBX_txn *const txn, iov_ctx_t *ctx, - size_t items, size_t npages, - mdbx_filehandle_t fd, - bool check_coherence) { - ctx->env = txn->mt_env; - ctx->ior = &txn->mt_env->me_ioring; - ctx->fd = fd; - ctx->coherency_timestamp = - (check_coherence || txn->mt_env->me_lck->mti_pgop_stat.incoherence.weak) - ? 0 - : UINT64_MAX /* не выполнять сверку */; - ctx->err = osal_ioring_prepare(ctx->ior, items, - pgno_align2os_bytes(txn->mt_env, npages)); - if (likely(ctx->err == MDBX_SUCCESS)) { -#if MDBX_NEED_WRITTEN_RANGE - ctx->flush_begin = MAX_PAGENO; - ctx->flush_end = MIN_PAGENO; -#endif /* MDBX_NEED_WRITTEN_RANGE */ - osal_ioring_reset(ctx->ior); - } - return ctx->err; -} +/// -static inline bool iov_empty(const iov_ctx_t *ctx) { - return osal_ioring_used(ctx->ior) == 0; -} -static void iov_callback4dirtypages(iov_ctx_t *ctx, size_t offset, void *data, - size_t bytes) { - MDBX_env *const env = ctx->env; - eASSERT(env, (env->me_flags & MDBX_WRITEMAP) == 0); +#define MDBX_RADIXSORT_THRESHOLD 142 - MDBX_page *wp = (MDBX_page *)data; - eASSERT(env, wp->mp_pgno == bytes2pgno(env, offset)); - eASSERT(env, bytes2pgno(env, bytes) >= (IS_OVERFLOW(wp) ? 
wp->mp_pages : 1u)); - eASSERT(env, (wp->mp_flags & P_ILL_BITS) == 0); +/* --------------------------------------------------------------------------- + * LY: State of the art quicksort-based sorting, with internal stack + * and network-sort for small chunks. + * Thanks to John M. Gamble for the http://pages.ripco.net/~jgamble/nw.html */ - if (likely(ctx->err == MDBX_SUCCESS)) { - const MDBX_page *const rp = ptr_disp(env->me_map, offset); - VALGRIND_MAKE_MEM_DEFINED(rp, bytes); - MDBX_ASAN_UNPOISON_MEMORY_REGION(rp, bytes); - osal_flush_incoherent_mmap(rp, bytes, env->me_os_psize); - /* check with timeout as the workaround - * for https://libmdbx.dqdkfa.ru/dead-github/issues/269 - * - * Проблема проявляется только при неупорядоченности: если записанная - * последней мета-страница "обгоняет" ранее записанные, т.е. когда - * записанное в файл позже становится видимым в отображении раньше, - * чем записанное ранее. - * - * Исходно здесь всегда выполнялась полная сверка. Это давало полную - * гарантию защиты от проявления проблемы, но порождало накладные расходы. - * В некоторых сценариях наблюдалось снижение производительности до 10-15%, - * а в синтетических тестах до 30%. Конечно никто не вникал в причины, - * а просто останавливался на мнении "libmdbx не быстрее LMDB", - * например: https://clck.ru/3386er - * - * Поэтому после серии экспериментов и тестов реализовано следующее: - * 0. Посредством опции сборки MDBX_FORCE_CHECK_MMAP_COHERENCY=1 - * можно включить полную сверку после записи. - * Остальные пункты являются взвешенным компромиссом между полной - * гарантией обнаружения проблемы и бесполезными затратами на системах - * без этого недостатка. - * 1. При старте транзакций проверяется соответствие выбранной мета-страницы - * корневым страницам b-tree проверяется. Эта проверка показала себя - * достаточной без сверки после записи. При обнаружении "некогерентности" - * эти случаи подсчитываются, а при их ненулевом счетчике выполняется - * полная сверка. 
Таким образом, произойдет переключение в режим полной - * сверки, если показавшая себя достаточной проверка заметит проявление - * проблемы хоты-бы раз. - * 2. Сверка не выполняется при фиксации транзакции, так как: - * - при наличии проблемы "не-когерентности" (при отложенном копировании - * или обновлении PTE, после возврата из write-syscall), проверка - * в этом процессе не гарантирует актуальность данных в другом - * процессе, который может запустить транзакцию сразу после коммита; - * - сверка только последнего блока позволяет почти восстановить - * производительность в больших транзакциях, но одновременно размывает - * уверенность в отсутствии сбоев, чем обесценивает всю затею; - * - после записи данных будет записана мета-страница, соответствие - * которой корневым страницам b-tree проверяется при старте - * транзакций, и только эта проверка показала себя достаточной; - * 3. При спиллинге производится полная сверка записанных страниц. Тут был - * соблазн сверять не полностью, а например начало и конец каждого блока. - * Но при спиллинге возможна ситуация повторного вытеснения страниц, в - * том числе large/overflow. При этом возникает риск прочитать в текущей - * транзакции старую версию страницы, до повторной записи. В этом случае - * могут возникать крайне редкие невоспроизводимые ошибки. С учетом того - * что спиллинг выполняет крайне редко, решено отказаться от экономии - * в пользу надежности. */ -#ifndef MDBX_FORCE_CHECK_MMAP_COHERENCY -#define MDBX_FORCE_CHECK_MMAP_COHERENCY 0 -#endif /* MDBX_FORCE_CHECK_MMAP_COHERENCY */ - if ((MDBX_FORCE_CHECK_MMAP_COHERENCY || - ctx->coherency_timestamp != UINT64_MAX) && - unlikely(memcmp(wp, rp, bytes))) { - ctx->coherency_timestamp = 0; - env->me_lck->mti_pgop_stat.incoherence.weak = - (env->me_lck->mti_pgop_stat.incoherence.weak >= INT32_MAX) - ? 
INT32_MAX - : env->me_lck->mti_pgop_stat.incoherence.weak + 1; - WARNING("catch delayed/non-arrived page %" PRIaPGNO " %s", wp->mp_pgno, - "(workaround for incoherent flaw of unified page/buffer cache)"); - do - if (coherency_timeout(&ctx->coherency_timestamp, wp->mp_pgno, env) != - MDBX_RESULT_TRUE) { - ctx->err = MDBX_PROBLEM; - break; - } - while (unlikely(memcmp(wp, rp, bytes))); - } - } +#if MDBX_HAVE_CMOV +#define SORT_CMP_SWAP(TYPE, CMP, a, b) \ + do { \ + const TYPE swap_tmp = (a); \ + const bool swap_cmp = expect_with_probability(CMP(swap_tmp, b), 0, .5); \ + (a) = swap_cmp ? swap_tmp : b; \ + (b) = swap_cmp ? b : swap_tmp; \ + } while (0) +#else +#define SORT_CMP_SWAP(TYPE, CMP, a, b) \ + do \ + if (expect_with_probability(!CMP(a, b), 0, .5)) { \ + const TYPE swap_tmp = (a); \ + (a) = (b); \ + (b) = swap_tmp; \ + } \ + while (0) +#endif - if (likely(bytes == env->me_psize)) - dpage_free(env, wp, 1); - else { - do { - eASSERT(env, wp->mp_pgno == bytes2pgno(env, offset)); - eASSERT(env, (wp->mp_flags & P_ILL_BITS) == 0); - size_t npages = IS_OVERFLOW(wp) ? 
wp->mp_pages : 1u; - size_t chunk = pgno2bytes(env, npages); - eASSERT(env, bytes >= chunk); - MDBX_page *next = ptr_disp(wp, chunk); - dpage_free(env, wp, npages); - wp = next; - offset += chunk; - bytes -= chunk; - } while (bytes); - } -} +// 3 comparators, 3 parallel operations +// o-----^--^--o +// | | +// o--^--|--v--o +// | | +// o--v--v-----o +// +// [[1,2]] +// [[0,2]] +// [[0,1]] +#define SORT_NETWORK_3(TYPE, CMP, begin) \ + do { \ + SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[2]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[2]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[1]); \ + } while (0) -static void iov_complete(iov_ctx_t *ctx) { - if ((ctx->env->me_flags & MDBX_WRITEMAP) == 0) - osal_ioring_walk(ctx->ior, ctx, iov_callback4dirtypages); - osal_ioring_reset(ctx->ior); -} +// 5 comparators, 3 parallel operations +// o--^--^--------o +// | | +// o--v--|--^--^--o +// | | | +// o--^--v--|--v--o +// | | +// o--v-----v-----o +// +// [[0,1],[2,3]] +// [[0,2],[1,3]] +// [[1,2]] +#define SORT_NETWORK_4(TYPE, CMP, begin) \ + do { \ + SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[1]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[2], begin[3]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[2]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[3]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[2]); \ + } while (0) -__must_check_result static int iov_write(iov_ctx_t *ctx) { - eASSERT(ctx->env, !iov_empty(ctx)); - osal_ioring_write_result_t r = osal_ioring_write(ctx->ior, ctx->fd); -#if MDBX_ENABLE_PGOP_STAT - ctx->env->me_lck->mti_pgop_stat.wops.weak += r.wops; -#endif /* MDBX_ENABLE_PGOP_STAT */ - ctx->err = r.err; - if (unlikely(ctx->err != MDBX_SUCCESS)) - ERROR("Write error: %s", mdbx_strerror(ctx->err)); - iov_complete(ctx); - return ctx->err; -} +// 9 comparators, 5 parallel operations +// o--^--^-----^-----------o +// | | | +// o--|--|--^--v-----^--^--o +// | | | | | +// o--|--v--|--^--^--|--v--o +// | | | | | +// o--|-----v--|--v--|--^--o +// | | | | +// 
o--v--------v-----v--v--o +// +// [[0,4],[1,3]] +// [[0,2]] +// [[2,4],[0,1]] +// [[2,3],[1,4]] +// [[1,2],[3,4]] +#define SORT_NETWORK_5(TYPE, CMP, begin) \ + do { \ + SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[4]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[3]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[2]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[2], begin[4]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[1]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[2], begin[3]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[4]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[2]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[3], begin[4]); \ + } while (0) -__must_check_result static int iov_page(MDBX_txn *txn, iov_ctx_t *ctx, - MDBX_page *dp, size_t npages) { - MDBX_env *const env = txn->mt_env; - tASSERT(txn, ctx->err == MDBX_SUCCESS); - tASSERT(txn, dp->mp_pgno >= MIN_PAGENO && dp->mp_pgno < txn->mt_next_pgno); - tASSERT(txn, IS_MODIFIABLE(txn, dp)); - tASSERT(txn, !(dp->mp_flags & ~(P_BRANCH | P_LEAF | P_LEAF2 | P_OVERFLOW))); - - if (IS_SHADOWED(txn, dp)) { - tASSERT(txn, !(txn->mt_flags & MDBX_WRITEMAP)); - dp->mp_txnid = txn->mt_txnid; - tASSERT(txn, IS_SPILLED(txn, dp)); -#if MDBX_AVOID_MSYNC - doit:; -#endif /* MDBX_AVOID_MSYNC */ - int err = osal_ioring_add(ctx->ior, pgno2bytes(env, dp->mp_pgno), dp, - pgno2bytes(env, npages)); - if (unlikely(err != MDBX_SUCCESS)) { - ctx->err = err; - if (unlikely(err != MDBX_RESULT_TRUE)) { - iov_complete(ctx); - return err; - } - err = iov_write(ctx); - tASSERT(txn, iov_empty(ctx)); - if (likely(err == MDBX_SUCCESS)) { - err = osal_ioring_add(ctx->ior, pgno2bytes(env, dp->mp_pgno), dp, - pgno2bytes(env, npages)); - if (unlikely(err != MDBX_SUCCESS)) { - iov_complete(ctx); - return ctx->err = err; - } - } - tASSERT(txn, ctx->err == MDBX_SUCCESS); - } - } else { - tASSERT(txn, txn->mt_flags & MDBX_WRITEMAP); -#if MDBX_AVOID_MSYNC - goto doit; -#endif /* MDBX_AVOID_MSYNC */ - } +// 12 comparators, 6 parallel operations +// o-----^--^--^-----------------o 
+// | | | +// o--^--|--v--|--^--------^-----o +// | | | | | +// o--v--v-----|--|--^--^--|--^--o +// | | | | | | +// o-----^--^--v--|--|--|--v--v--o +// | | | | | +// o--^--|--v-----v--|--v--------o +// | | | +// o--v--v-----------v-----------o +// +// [[1,2],[4,5]] +// [[0,2],[3,5]] +// [[0,1],[3,4],[2,5]] +// [[0,3],[1,4]] +// [[2,4],[1,3]] +// [[2,3]] +#define SORT_NETWORK_6(TYPE, CMP, begin) \ + do { \ + SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[2]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[4], begin[5]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[2]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[3], begin[5]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[1]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[3], begin[4]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[2], begin[5]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[3]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[4]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[2], begin[4]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[3]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[2], begin[3]); \ + } while (0) -#if MDBX_NEED_WRITTEN_RANGE - ctx->flush_begin = - (ctx->flush_begin < dp->mp_pgno) ? ctx->flush_begin : dp->mp_pgno; - ctx->flush_end = (ctx->flush_end > dp->mp_pgno + (pgno_t)npages) - ? 
ctx->flush_end - : dp->mp_pgno + (pgno_t)npages; -#endif /* MDBX_NEED_WRITTEN_RANGE */ - return MDBX_SUCCESS; -} +// 16 comparators, 6 parallel operations +// o--^--------^-----^-----------------o +// | | | +// o--|--^-----|--^--v--------^--^-----o +// | | | | | | +// o--|--|--^--v--|--^-----^--|--v-----o +// | | | | | | | +// o--|--|--|-----v--|--^--v--|--^--^--o +// | | | | | | | | +// o--v--|--|--^-----v--|--^--v--|--v--o +// | | | | | | +// o-----v--|--|--------v--v-----|--^--o +// | | | | +// o--------v--v-----------------v--v--o +// +// [[0,4],[1,5],[2,6]] +// [[0,2],[1,3],[4,6]] +// [[2,4],[3,5],[0,1]] +// [[2,3],[4,5]] +// [[1,4],[3,6]] +// [[1,2],[3,4],[5,6]] +#define SORT_NETWORK_7(TYPE, CMP, begin) \ + do { \ + SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[4]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[5]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[2], begin[6]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[2]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[3]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[4], begin[6]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[2], begin[4]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[3], begin[5]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[1]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[2], begin[3]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[4], begin[5]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[4]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[3], begin[6]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[2]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[3], begin[4]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[5], begin[6]); \ + } while (0) -static int spill_page(MDBX_txn *txn, iov_ctx_t *ctx, MDBX_page *dp, - const size_t npages) { - tASSERT(txn, !(txn->mt_flags & MDBX_WRITEMAP)); -#if MDBX_ENABLE_PGOP_STAT - txn->mt_env->me_lck->mti_pgop_stat.spill.weak += npages; -#endif /* MDBX_ENABLE_PGOP_STAT */ - const pgno_t pgno = dp->mp_pgno; - int err = iov_page(txn, ctx, dp, npages); - if (likely(err == MDBX_SUCCESS)) - err = pnl_append_range(true, &txn->tw.spilled.list, pgno << 1, npages); - 
return err; -} - -/* Set unspillable LRU-label for dirty pages watched by txn. - * Returns the number of pages marked as unspillable. */ -static size_t cursor_keep(const MDBX_txn *const txn, const MDBX_cursor *mc) { - tASSERT(txn, (txn->mt_flags & (MDBX_TXN_RDONLY | MDBX_WRITEMAP)) == 0); - size_t keep = 0; - while ((mc->mc_flags & C_INITIALIZED) && mc->mc_snum) { - tASSERT(txn, mc->mc_top == mc->mc_snum - 1); - const MDBX_page *mp; - size_t i = 0; - do { - mp = mc->mc_pg[i]; - tASSERT(txn, !IS_SUBP(mp)); - if (IS_MODIFIABLE(txn, mp)) { - size_t const n = dpl_search(txn, mp->mp_pgno); - if (txn->tw.dirtylist->items[n].pgno == mp->mp_pgno && - /* не считаем дважды */ dpl_age(txn, n)) { - size_t *const ptr = ptr_disp(txn->tw.dirtylist->items[n].ptr, - -(ptrdiff_t)sizeof(size_t)); - *ptr = txn->tw.dirtylru; - tASSERT(txn, dpl_age(txn, n) == 0); - ++keep; - } - } - } while (++i < mc->mc_snum); - - tASSERT(txn, IS_LEAF(mp)); - if (!mc->mc_xcursor || mc->mc_ki[mc->mc_top] >= page_numkeys(mp)) - break; - if (!(node_flags(page_node(mp, mc->mc_ki[mc->mc_top])) & F_SUBDATA)) - break; - mc = &mc->mc_xcursor->mx_cursor; - } - return keep; -} - -static size_t txn_keep(MDBX_txn *txn, MDBX_cursor *m0) { - tASSERT(txn, (txn->mt_flags & (MDBX_TXN_RDONLY | MDBX_WRITEMAP)) == 0); - txn_lru_turn(txn); - size_t keep = m0 ? 
cursor_keep(txn, m0) : 0; +// 19 comparators, 6 parallel operations +// o--^--------^-----^-----------------o +// | | | +// o--|--^-----|--^--v--------^--^-----o +// | | | | | | +// o--|--|--^--v--|--^-----^--|--v-----o +// | | | | | | | +// o--|--|--|--^--v--|--^--v--|--^--^--o +// | | | | | | | | | +// o--v--|--|--|--^--v--|--^--v--|--v--o +// | | | | | | | +// o-----v--|--|--|--^--v--v-----|--^--o +// | | | | | | +// o--------v--|--v--|--^--------v--v--o +// | | | +// o-----------v-----v--v--------------o +// +// [[0,4],[1,5],[2,6],[3,7]] +// [[0,2],[1,3],[4,6],[5,7]] +// [[2,4],[3,5],[0,1],[6,7]] +// [[2,3],[4,5]] +// [[1,4],[3,6]] +// [[1,2],[3,4],[5,6]] +#define SORT_NETWORK_8(TYPE, CMP, begin) \ + do { \ + SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[4]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[5]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[2], begin[6]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[3], begin[7]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[2]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[3]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[4], begin[6]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[5], begin[7]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[2], begin[4]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[3], begin[5]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[1]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[6], begin[7]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[2], begin[3]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[4], begin[5]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[4]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[3], begin[6]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[2]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[3], begin[4]); \ + SORT_CMP_SWAP(TYPE, CMP, begin[5], begin[6]); \ + } while (0) - TXN_FOREACH_DBI_ALL(txn, dbi) { - if (F_ISSET(txn->mt_dbi_state[dbi], DBI_DIRTY | DBI_VALID) && - txn->mt_dbs[dbi].md_root != P_INVALID) - for (MDBX_cursor *mc = txn->mt_cursors[dbi]; mc; mc = mc->mc_next) - if (mc != m0) - keep += cursor_keep(txn, mc); +#define SORT_INNER(TYPE, CMP, begin, end, len) \ + switch 
(len) { \ + default: \ + assert(false); \ + __unreachable(); \ + case 0: \ + case 1: \ + break; \ + case 2: \ + SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[1]); \ + break; \ + case 3: \ + SORT_NETWORK_3(TYPE, CMP, begin); \ + break; \ + case 4: \ + SORT_NETWORK_4(TYPE, CMP, begin); \ + break; \ + case 5: \ + SORT_NETWORK_5(TYPE, CMP, begin); \ + break; \ + case 6: \ + SORT_NETWORK_6(TYPE, CMP, begin); \ + break; \ + case 7: \ + SORT_NETWORK_7(TYPE, CMP, begin); \ + break; \ + case 8: \ + SORT_NETWORK_8(TYPE, CMP, begin); \ + break; \ } - return keep; -} +#define SORT_SWAP(TYPE, a, b) \ + do { \ + const TYPE swap_tmp = (a); \ + (a) = (b); \ + (b) = swap_tmp; \ + } while (0) -/* Returns the spilling priority (0..255) for a dirty page: - * 0 = should be spilled; - * ... - * > 255 = must not be spilled. */ -MDBX_NOTHROW_PURE_FUNCTION static unsigned -spill_prio(const MDBX_txn *txn, const size_t i, const uint32_t reciprocal) { - MDBX_dpl *const dl = txn->tw.dirtylist; - const uint32_t age = dpl_age(txn, i); - const size_t npages = dpl_npages(dl, i); - const pgno_t pgno = dl->items[i].pgno; - if (age == 0) { - DEBUG("skip %s %zu page %" PRIaPGNO, "keep", npages, pgno); - return 256; - } +#define SORT_PUSH(low, high) \ + do { \ + top->lo = (low); \ + top->hi = (high); \ + ++top; \ + } while (0) - MDBX_page *const dp = dl->items[i].ptr; - if (dp->mp_flags & (P_LOOSE | P_SPILLED)) { - DEBUG("skip %s %zu page %" PRIaPGNO, - (dp->mp_flags & P_LOOSE) ? "loose" : "parent-spilled", npages, pgno); - return 256; - } +#define SORT_POP(low, high) \ + do { \ + --top; \ + low = top->lo; \ + high = top->hi; \ + } while (0) - /* Can't spill twice, - * make sure it's not already in a parent's spill list(s). 
*/ - MDBX_txn *parent = txn->mt_parent; - if (parent && (parent->mt_flags & MDBX_TXN_SPILLS)) { - do - if (intersect_spilled(parent, pgno, npages)) { - DEBUG("skip-2 parent-spilled %zu page %" PRIaPGNO, npages, pgno); - dp->mp_flags |= P_SPILLED; - return 256; - } - while ((parent = parent->mt_parent) != nullptr); +#define SORT_IMPL(NAME, EXPECT_LOW_CARDINALITY_OR_PRESORTED, TYPE, CMP) \ + \ + static inline bool NAME##_is_sorted(const TYPE *first, const TYPE *last) { \ + while (++first <= last) \ + if (expect_with_probability(CMP(first[0], first[-1]), 1, .1)) \ + return false; \ + return true; \ + } \ + \ + typedef struct { \ + TYPE *lo, *hi; \ + } NAME##_stack; \ + \ + __hot static void NAME(TYPE *const __restrict begin, \ + TYPE *const __restrict end) { \ + NAME##_stack stack[sizeof(size_t) * CHAR_BIT], *__restrict top = stack; \ + \ + TYPE *__restrict hi = end - 1; \ + TYPE *__restrict lo = begin; \ + while (true) { \ + const ptrdiff_t len = hi - lo; \ + if (len < 8) { \ + SORT_INNER(TYPE, CMP, lo, hi + 1, len + 1); \ + if (unlikely(top == stack)) \ + break; \ + SORT_POP(lo, hi); \ + continue; \ + } \ + \ + TYPE *__restrict mid = lo + (len >> 1); \ + SORT_CMP_SWAP(TYPE, CMP, *lo, *mid); \ + SORT_CMP_SWAP(TYPE, CMP, *mid, *hi); \ + SORT_CMP_SWAP(TYPE, CMP, *lo, *mid); \ + \ + TYPE *right = hi - 1; \ + TYPE *left = lo + 1; \ + while (1) { \ + while (expect_with_probability(CMP(*left, *mid), 0, .5)) \ + ++left; \ + while (expect_with_probability(CMP(*mid, *right), 0, .5)) \ + --right; \ + if (unlikely(left > right)) { \ + if (EXPECT_LOW_CARDINALITY_OR_PRESORTED) { \ + if (NAME##_is_sorted(lo, right)) \ + lo = right + 1; \ + if (NAME##_is_sorted(left, hi)) \ + hi = left; \ + } \ + break; \ + } \ + SORT_SWAP(TYPE, *left, *right); \ + mid = (mid == left) ? right : (mid == right) ? 
left : mid; \ + ++left; \ + --right; \ + } \ + \ + if (right - lo > hi - left) { \ + SORT_PUSH(lo, right); \ + lo = left; \ + } else { \ + SORT_PUSH(left, hi); \ + hi = right; \ + } \ + } \ + \ + if (AUDIT_ENABLED()) { \ + for (TYPE *scan = begin + 1; scan < end; ++scan) \ + assert(CMP(scan[-1], scan[0])); \ + } \ } - tASSERT(txn, age * (uint64_t)reciprocal < UINT32_MAX); - unsigned prio = age * reciprocal >> 24; - tASSERT(txn, prio < 256); - if (likely(npages == 1)) - return prio = 256 - prio; - - /* make a large/overflow pages be likely to spill */ - size_t factor = npages | npages >> 1; - factor |= factor >> 2; - factor |= factor >> 4; - factor |= factor >> 8; - factor |= factor >> 16; - factor = (size_t)prio * log2n_powerof2(factor + 1) + /* golden ratio */ 157; - factor = (factor < 256) ? 255 - factor : 0; - tASSERT(txn, factor < 256 && factor < (256 - prio)); - return prio = (unsigned)factor; -} - -__cold static int txn_spill_slowpath(MDBX_txn *const txn, MDBX_cursor *const m0, - const intptr_t wanna_spill_entries, - const intptr_t wanna_spill_npages, - const size_t need); - -static __inline int txn_spill(MDBX_txn *const txn, MDBX_cursor *const m0, - const size_t need) { - tASSERT(txn, (txn->mt_flags & MDBX_TXN_RDONLY) == 0); - tASSERT(txn, !m0 || cursor_is_tracked(m0)); - - const intptr_t wanna_spill_entries = - txn->tw.dirtylist ? (need - txn->tw.dirtyroom - txn->tw.loose_count) : 0; - const intptr_t wanna_spill_npages = - need + - (txn->tw.dirtylist ? 
txn->tw.dirtylist->pages_including_loose - : txn->tw.writemap_dirty_npages) - - txn->tw.loose_count - txn->mt_env->me_options.dp_limit; - - /* production mode */ - if (likely(wanna_spill_npages < 1 && wanna_spill_entries < 1) -#if xMDBX_DEBUG_SPILLING == 1 - /* debug mode: always try to spill if xMDBX_DEBUG_SPILLING == 1 */ - && txn->mt_txnid % 23 > 11 -#endif - ) - return MDBX_SUCCESS; - - return txn_spill_slowpath(txn, m0, wanna_spill_entries, wanna_spill_npages, - need); -} +/*------------------------------------------------------------------------------ + * LY: radix sort for large chunks */ -static size_t spill_gate(const MDBX_env *env, intptr_t part, - const size_t total) { - const intptr_t spill_min = - env->me_options.spill_min_denominator - ? (total + env->me_options.spill_min_denominator - 1) / - env->me_options.spill_min_denominator - : 1; - const intptr_t spill_max = - total - (env->me_options.spill_max_denominator - ? total / env->me_options.spill_max_denominator - : 0); - part = (part < spill_max) ? part : spill_max; - part = (part > spill_min) ? 
part : spill_min; - eASSERT(env, part >= 0 && (size_t)part <= total); - return (size_t)part; +#define RADIXSORT_IMPL(NAME, TYPE, EXTRACT_KEY, BUFFER_PREALLOCATED, END_GAP) \ + \ + __hot static bool NAME##_radixsort(TYPE *const begin, const size_t length) { \ + TYPE *tmp; \ + if (BUFFER_PREALLOCATED) { \ + tmp = begin + length + END_GAP; \ + /* memset(tmp, 0xDeadBeef, sizeof(TYPE) * length); */ \ + } else { \ + tmp = osal_malloc(sizeof(TYPE) * length); \ + if (unlikely(!tmp)) \ + return false; \ + } \ + \ + size_t key_shift = 0, key_diff_mask; \ + do { \ + struct { \ + pgno_t a[256], b[256]; \ + } counters; \ + memset(&counters, 0, sizeof(counters)); \ + \ + key_diff_mask = 0; \ + size_t prev_key = EXTRACT_KEY(begin) >> key_shift; \ + TYPE *r = begin, *end = begin + length; \ + do { \ + const size_t key = EXTRACT_KEY(r) >> key_shift; \ + counters.a[key & 255]++; \ + counters.b[(key >> 8) & 255]++; \ + key_diff_mask |= prev_key ^ key; \ + prev_key = key; \ + } while (++r != end); \ + \ + pgno_t ta = 0, tb = 0; \ + for (size_t i = 0; i < 256; ++i) { \ + const pgno_t ia = counters.a[i]; \ + counters.a[i] = ta; \ + ta += ia; \ + const pgno_t ib = counters.b[i]; \ + counters.b[i] = tb; \ + tb += ib; \ + } \ + \ + r = begin; \ + do { \ + const size_t key = EXTRACT_KEY(r) >> key_shift; \ + tmp[counters.a[key & 255]++] = *r; \ + } while (++r != end); \ + \ + if (unlikely(key_diff_mask < 256)) { \ + memcpy(begin, tmp, ptr_dist(end, begin)); \ + break; \ + } \ + end = (r = tmp) + length; \ + do { \ + const size_t key = EXTRACT_KEY(r) >> key_shift; \ + begin[counters.b[(key >> 8) & 255]++] = *r; \ + } while (++r != end); \ + \ + key_shift += 16; \ + } while (key_diff_mask >> 16); \ + \ + if (!(BUFFER_PREALLOCATED)) \ + osal_free(tmp); \ + return true; \ + } + +/*------------------------------------------------------------------------------ + * LY: Binary search */ + +#if defined(__clang__) && __clang_major__ > 4 && defined(__ia32__) +#define 
WORKAROUND_FOR_CLANG_OPTIMIZER_BUG(size, flag) \ + do \ + __asm __volatile("" \ + : "+r"(size) \ + : "r" /* the `b` constraint is more suitable here, but \ + cause CLANG to allocate and push/pop an one more \ + register, so using the `r` which avoids this. */ \ + (flag)); \ + while (0) +#else +#define WORKAROUND_FOR_CLANG_OPTIMIZER_BUG(size, flag) \ + do { \ + /* nope for non-clang or non-x86 */; \ + } while (0) +#endif /* Workaround for CLANG */ + +#define SEARCH_IMPL(NAME, TYPE_LIST, TYPE_ARG, CMP) \ + static __always_inline const TYPE_LIST *NAME( \ + const TYPE_LIST *it, size_t length, const TYPE_ARG item) { \ + const TYPE_LIST *const begin = it, *const end = begin + length; \ + \ + if (MDBX_HAVE_CMOV) \ + do { \ + /* Адаптивно-упрощенный шаг двоичного поиска: \ + * - без переходов при наличии cmov или аналога; \ + * - допускает лишние итерации; \ + * - но ищет пока size > 2, что требует дозавершения поиска \ + * среди остающихся 0-1-2 элементов. */ \ + const TYPE_LIST *const middle = it + (length >> 1); \ + length = (length + 1) >> 1; \ + const bool flag = expect_with_probability(CMP(*middle, item), 0, .5); \ + WORKAROUND_FOR_CLANG_OPTIMIZER_BUG(length, flag); \ + it = flag ? middle : it; \ + } while (length > 2); \ + else \ + while (length > 2) { \ + /* Вариант с использованием условного перехода. Основное отличие в \ + * том, что при "не равно" (true от компаратора) переход делается на 1 \ + * ближе к концу массива. Алгоритмически это верно и обеспечивает \ + * чуть-чуть более быструю сходимость, но зато требует больше \ + * вычислений при true от компаратора. Также ВАЖНО(!) не допускается \ + * спекулятивное выполнение при size == 0. 
*/ \ + const TYPE_LIST *const middle = it + (length >> 1); \ + length = (length + 1) >> 1; \ + const bool flag = expect_with_probability(CMP(*middle, item), 0, .5); \ + if (flag) { \ + it = middle + 1; \ + length -= 1; \ + } \ + } \ + it += length > 1 && expect_with_probability(CMP(*it, item), 0, .5); \ + it += length > 0 && expect_with_probability(CMP(*it, item), 0, .5); \ + \ + if (AUDIT_ENABLED()) { \ + for (const TYPE_LIST *scan = begin; scan < it; ++scan) \ + assert(CMP(*scan, item)); \ + for (const TYPE_LIST *scan = it; scan < end; ++scan) \ + assert(!CMP(*scan, item)); \ + (void)begin, (void)end; \ + } \ + \ + return it; \ + } +/// \copyright SPDX-License-Identifier: Apache-2.0 +/// \author Леонид Юрьев aka Leonid Yuriev \date 2015-2024 + + +MDBX_cursor *mdbx_cursor_create(void *context) { + cursor_couple_t *couple = osal_calloc(1, sizeof(cursor_couple_t)); + if (unlikely(!couple)) + return nullptr; + + VALGRIND_MAKE_MEM_UNDEFINED(couple, sizeof(cursor_couple_t)); + couple->outer.signature = cur_signature_ready4dispose; + couple->outer.next = &couple->outer; + couple->userctx = context; + couple->outer.top_and_flags = z_poor_mark; + couple->inner.cursor.top_and_flags = z_poor_mark | z_inner; + VALGRIND_MAKE_MEM_DEFINED(&couple->outer.backup, + sizeof(couple->outer.backup)); + VALGRIND_MAKE_MEM_DEFINED(&couple->outer.tree, sizeof(couple->outer.tree)); + VALGRIND_MAKE_MEM_DEFINED(&couple->outer.clc, sizeof(couple->outer.clc)); + VALGRIND_MAKE_MEM_DEFINED(&couple->outer.dbi_state, + sizeof(couple->outer.dbi_state)); + VALGRIND_MAKE_MEM_DEFINED(&couple->outer.subcur, + sizeof(couple->outer.subcur)); + VALGRIND_MAKE_MEM_DEFINED(&couple->outer.txn, sizeof(couple->outer.txn)); + return &couple->outer; } -__cold static int txn_spill_slowpath(MDBX_txn *const txn, MDBX_cursor *const m0, - const intptr_t wanna_spill_entries, - const intptr_t wanna_spill_npages, - const size_t need) { - tASSERT(txn, (txn->mt_flags & MDBX_TXN_RDONLY) == 0); +int mdbx_cursor_renew(const 
MDBX_txn *txn, MDBX_cursor *mc) { + return likely(mc) + ? mdbx_cursor_bind(txn, mc, (kvx_t *)mc->clc - txn->env->kvs) + : MDBX_EINVAL; +} - int rc = MDBX_SUCCESS; - if (unlikely(txn->tw.loose_count >= - (txn->tw.dirtylist ? txn->tw.dirtylist->pages_including_loose - : txn->tw.writemap_dirty_npages))) - goto done; +int mdbx_cursor_reset(MDBX_cursor *mc) { + if (unlikely(!mc)) + return MDBX_EINVAL; - const size_t dirty_entries = - txn->tw.dirtylist ? (txn->tw.dirtylist->length - txn->tw.loose_count) : 1; - const size_t dirty_npages = - (txn->tw.dirtylist ? txn->tw.dirtylist->pages_including_loose - : txn->tw.writemap_dirty_npages) - - txn->tw.loose_count; - const size_t need_spill_entries = - spill_gate(txn->mt_env, wanna_spill_entries, dirty_entries); - const size_t need_spill_npages = - spill_gate(txn->mt_env, wanna_spill_npages, dirty_npages); + if (unlikely(mc->signature != cur_signature_ready4dispose && + mc->signature != cur_signature_live)) + return MDBX_EBADSIGN; - const size_t need_spill = (need_spill_entries > need_spill_npages) - ? 
need_spill_entries - : need_spill_npages; - if (!need_spill) - goto done; + cursor_couple_t *couple = (cursor_couple_t *)mc; + couple->outer.top_and_flags = z_poor_mark; + couple->inner.cursor.top_and_flags = z_poor_mark | z_inner; + return MDBX_SUCCESS; +} - if (txn->mt_flags & MDBX_WRITEMAP) { - NOTICE("%s-spilling %zu dirty-entries, %zu dirty-npages", "msync", - dirty_entries, dirty_npages); - const MDBX_env *env = txn->mt_env; - tASSERT(txn, txn->tw.spilled.list == nullptr); - rc = - osal_msync(&txn->mt_env->me_dxb_mmap, 0, - pgno_align2os_bytes(env, txn->mt_next_pgno), MDBX_SYNC_KICK); - if (unlikely(rc != MDBX_SUCCESS)) - goto bailout; -#if MDBX_AVOID_MSYNC - MDBX_ANALYSIS_ASSUME(txn->tw.dirtylist != nullptr); - tASSERT(txn, dirtylist_check(txn)); - env->me_lck->mti_unsynced_pages.weak += - txn->tw.dirtylist->pages_including_loose - txn->tw.loose_count; - dpl_clear(txn->tw.dirtylist); - txn->tw.dirtyroom = env->me_options.dp_limit - txn->tw.loose_count; - for (MDBX_page *lp = txn->tw.loose_pages; lp != nullptr; lp = mp_next(lp)) { - tASSERT(txn, lp->mp_flags == P_LOOSE); - rc = dpl_append(txn, lp->mp_pgno, lp, 1); - if (unlikely(rc != MDBX_SUCCESS)) - goto bailout; - MDBX_ASAN_UNPOISON_MEMORY_REGION(&mp_next(lp), sizeof(MDBX_page *)); - VALGRIND_MAKE_MEM_DEFINED(&mp_next(lp), sizeof(MDBX_page *)); - } - tASSERT(txn, dirtylist_check(txn)); -#else - tASSERT(txn, txn->tw.dirtylist == nullptr); - env->me_lck->mti_unsynced_pages.weak += txn->tw.writemap_dirty_npages; - txn->tw.writemap_spilled_npages += txn->tw.writemap_dirty_npages; - txn->tw.writemap_dirty_npages = 0; -#endif /* MDBX_AVOID_MSYNC */ - goto done; +int mdbx_cursor_bind(const MDBX_txn *txn, MDBX_cursor *mc, MDBX_dbi dbi) { + if (unlikely(!mc)) + return MDBX_EINVAL; + + if (unlikely(mc->signature != cur_signature_ready4dispose && + mc->signature != cur_signature_live)) + return MDBX_EBADSIGN; + + int rc = check_txn(txn, MDBX_TXN_BLOCKED); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; + + rc = 
dbi_check(txn, dbi); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; + + if (unlikely(dbi == FREE_DBI && !(txn->flags & MDBX_TXN_RDONLY))) + return MDBX_EACCESS; + + if (unlikely(mc->backup)) /* Cursor from parent transaction */ { + cASSERT(mc, mc->signature == cur_signature_live); + if (unlikely(cursor_dbi(mc) != dbi || + /* paranoia */ mc->signature != cur_signature_live || + mc->txn != txn)) + return MDBX_EINVAL; + + cASSERT(mc, mc->tree == &txn->dbs[dbi]); + cASSERT(mc, mc->clc == &txn->env->kvs[dbi].clc); + cASSERT(mc, cursor_dbi(mc) == dbi); + return likely(cursor_dbi(mc) == dbi && + /* paranoia */ mc->signature == cur_signature_live && + mc->txn == txn) + ? MDBX_SUCCESS + : MDBX_EINVAL /* Disallow change DBI in nested transactions */; } - NOTICE("%s-spilling %zu dirty-entries, %zu dirty-npages", "write", - need_spill_entries, need_spill_npages); - MDBX_ANALYSIS_ASSUME(txn->tw.dirtylist != nullptr); - tASSERT(txn, txn->tw.dirtylist->length - txn->tw.loose_count >= 1); - tASSERT(txn, txn->tw.dirtylist->pages_including_loose - txn->tw.loose_count >= - need_spill_npages); - if (!txn->tw.spilled.list) { - txn->tw.spilled.least_removed = INT_MAX; - txn->tw.spilled.list = pnl_alloc(need_spill); - if (unlikely(!txn->tw.spilled.list)) { - rc = MDBX_ENOMEM; - bailout: - txn->mt_flags |= MDBX_TXN_ERROR; + if (mc->signature == cur_signature_live) { + rc = mdbx_cursor_unbind(mc); + if (unlikely(rc != MDBX_SUCCESS)) return rc; - } - } else { - /* purge deleted slots */ - spill_purge(txn); - rc = pnl_reserve(&txn->tw.spilled.list, need_spill); - (void)rc /* ignore since the resulting list may be shorter - and pnl_append() will increase pnl on demand */ - ; } + cASSERT(mc, mc->next == mc); - /* Сортируем чтобы запись на диск была полее последовательна */ - MDBX_dpl *const dl = dpl_sort(txn); + rc = cursor_init(mc, txn, dbi); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; - /* Preserve pages which may soon be dirtied again */ - const size_t unspillable = txn_keep(txn, 
m0); - if (unspillable + txn->tw.loose_count >= dl->length) { -#if xMDBX_DEBUG_SPILLING == 1 /* avoid false failure in debug mode */ - if (likely(txn->tw.dirtyroom + txn->tw.loose_count >= need)) - return MDBX_SUCCESS; -#endif /* xMDBX_DEBUG_SPILLING */ - ERROR("all %zu dirty pages are unspillable since referenced " - "by a cursor(s), use fewer cursors or increase " - "MDBX_opt_txn_dp_limit", - unspillable); - goto done; - } + mc->next = txn->cursors[dbi]; + txn->cursors[dbi] = mc; + return MDBX_SUCCESS; +} - /* Подзадача: Вытолкнуть часть страниц на диск в соответствии с LRU, - * но при этом учесть важные поправки: - * - лучше выталкивать старые large/overflow страницы, так будет освобождено - * больше памяти, а также так как они (в текущем понимании) гораздо реже - * повторно изменяются; - * - при прочих равных лучше выталкивать смежные страницы, так будет - * меньше I/O операций; - * - желательно потратить на это меньше времени чем std::partial_sort_copy; - * - * Решение: - * - Квантуем весь диапазон lru-меток до 256 значений и задействуем один - * проход 8-битного radix-sort. В результате получаем 256 уровней - * "свежести", в том числе значение lru-метки, старее которой страницы - * должны быть выгружены; - * - Двигаемся последовательно в сторону увеличения номеров страниц - * и выталкиваем страницы с lru-меткой старее отсекающего значения, - * пока не вытолкнем достаточно; - * - Встречая страницы смежные с выталкиваемыми для уменьшения кол-ва - * I/O операций выталкиваем и их, если они попадают в первую половину - * между выталкиваемыми и самыми свежими lru-метками; - * - дополнительно при сортировке умышленно старим large/overflow страницы, - * тем самым повышая их шансы на выталкивание. */ +int mdbx_cursor_unbind(MDBX_cursor *mc) { + if (unlikely(!mc)) + return MDBX_EINVAL; - /* get min/max of LRU-labels */ - uint32_t age_max = 0; - for (size_t i = 1; i <= dl->length; ++i) { - const uint32_t age = dpl_age(txn, i); - age_max = (age_max >= age) ? 
age_max : age; + if (unlikely(mc->signature != cur_signature_live)) + return (mc->signature == cur_signature_ready4dispose) ? MDBX_SUCCESS + : MDBX_EBADSIGN; + + if (unlikely(mc->backup)) /* Cursor from parent transaction */ + return MDBX_EINVAL; + + eASSERT(nullptr, mc->txn && mc->txn->signature == txn_signature); + cASSERT(mc, mc->signature == cur_signature_live); + cASSERT(mc, !mc->backup); + if (unlikely(!mc->txn || mc->txn->signature != txn_signature)) { + ERROR("Wrong cursor's transaction %p 0x%x", + __Wpedantic_format_voidptr(mc->txn), + mc->txn ? mc->txn->signature : 0); + return MDBX_PROBLEM; } + if (mc->next != mc) { + const size_t dbi = (kvx_t *)mc->clc - mc->txn->env->kvs; + cASSERT(mc, cursor_dbi(mc) == dbi); + cASSERT(mc, dbi < mc->txn->n_dbi); + if (dbi < mc->txn->n_dbi) { + MDBX_cursor **prev = &mc->txn->cursors[dbi]; + while (*prev && *prev != mc) + prev = &(*prev)->next; + cASSERT(mc, *prev == mc); + *prev = mc->next; + } + mc->next = mc; + } + mc->signature = cur_signature_ready4dispose; + mc->flags = 0; + return MDBX_SUCCESS; +} - VERBOSE("lru-head %u, age-max %u", txn->tw.dirtylru, age_max); +int mdbx_cursor_open(const MDBX_txn *txn, MDBX_dbi dbi, MDBX_cursor **ret) { + if (unlikely(!ret)) + return MDBX_EINVAL; + *ret = nullptr; - /* half of 8-bit radix-sort */ - pgno_t radix_entries[256], radix_npages[256]; - memset(&radix_entries, 0, sizeof(radix_entries)); - memset(&radix_npages, 0, sizeof(radix_npages)); - size_t spillable_entries = 0, spillable_npages = 0; - const uint32_t reciprocal = (UINT32_C(255) << 24) / (age_max + 1); - for (size_t i = 1; i <= dl->length; ++i) { - const unsigned prio = spill_prio(txn, i, reciprocal); - size_t *const ptr = ptr_disp(dl->items[i].ptr, -(ptrdiff_t)sizeof(size_t)); - TRACE("page %" PRIaPGNO - ", lru %zu, is_multi %c, npages %u, age %u of %u, prio %u", - dl->items[i].pgno, *ptr, (dl->items[i].npages > 1) ? 
'Y' : 'N', - dpl_npages(dl, i), dpl_age(txn, i), age_max, prio); - if (prio < 256) { - radix_entries[prio] += 1; - spillable_entries += 1; - const pgno_t npages = dpl_npages(dl, i); - radix_npages[prio] += npages; - spillable_npages += npages; - } + MDBX_cursor *const mc = mdbx_cursor_create(nullptr); + if (unlikely(!mc)) + return MDBX_ENOMEM; + + int rc = mdbx_cursor_bind(txn, mc, dbi); + if (unlikely(rc != MDBX_SUCCESS)) { + mdbx_cursor_close(mc); + return rc; } - tASSERT(txn, spillable_npages >= spillable_entries); - pgno_t spilled_entries = 0, spilled_npages = 0; - if (likely(spillable_entries > 0)) { - size_t prio2spill = 0, prio2adjacent = 128, - amount_entries = radix_entries[0], amount_npages = radix_npages[0]; - for (size_t i = 1; i < 256; i++) { - if (amount_entries < need_spill_entries || - amount_npages < need_spill_npages) { - prio2spill = i; - prio2adjacent = i + (257 - i) / 2; - amount_entries += radix_entries[i]; - amount_npages += radix_npages[i]; - } else if (amount_entries + amount_entries < - spillable_entries + need_spill_entries - /* РАВНОЗНАЧНО: amount - need_spill < spillable - amount */ - || amount_npages + amount_npages < - spillable_npages + need_spill_npages) { - prio2adjacent = i; - amount_entries += radix_entries[i]; - amount_npages += radix_npages[i]; - } else - break; + *ret = mc; + return MDBX_SUCCESS; +} + +void mdbx_cursor_close(MDBX_cursor *mc) { + if (likely(mc)) { + ENSURE(nullptr, mc->signature == cur_signature_live || + mc->signature == cur_signature_ready4dispose); + MDBX_txn *const txn = mc->txn; + if (!mc->backup) { + mc->txn = nullptr; + /* Unlink from txn, if tracked. 
*/ + if (mc->next != mc) { + ENSURE(txn->env, check_txn(txn, 0) == MDBX_SUCCESS); + const size_t dbi = (kvx_t *)mc->clc - txn->env->kvs; + tASSERT(txn, dbi < txn->n_dbi); + if (dbi < txn->n_dbi) { + MDBX_cursor **prev = &txn->cursors[dbi]; + while (*prev && *prev != mc) + prev = &(*prev)->next; + tASSERT(txn, *prev == mc); + *prev = mc->next; + } + mc->next = mc; + } + mc->signature = 0; + osal_free(mc); + } else { + /* Cursor closed before nested txn ends */ + tASSERT(txn, mc->signature == cur_signature_live); + ENSURE(txn->env, check_txn_rw(txn, 0) == MDBX_SUCCESS); + mc->signature = cur_signature_wait4eot; } + } +} - VERBOSE("prio2spill %zu, prio2adjacent %zu, spillable %zu/%zu," - " wanna-spill %zu/%zu, amount %zu/%zu", - prio2spill, prio2adjacent, spillable_entries, spillable_npages, - need_spill_entries, need_spill_npages, amount_entries, - amount_npages); - tASSERT(txn, prio2spill < prio2adjacent && prio2adjacent <= 256); +int mdbx_cursor_copy(const MDBX_cursor *src, MDBX_cursor *dest) { + if (unlikely(!src)) + return MDBX_EINVAL; + if (unlikely(src->signature != cur_signature_live)) + return (src->signature == cur_signature_ready4dispose) ? MDBX_EINVAL + : MDBX_EBADSIGN; - iov_ctx_t ctx; - rc = - iov_init(txn, &ctx, amount_entries, amount_npages, -#if defined(_WIN32) || defined(_WIN64) - txn->mt_env->me_overlapped_fd ? 
txn->mt_env->me_overlapped_fd : -#endif - txn->mt_env->me_lazy_fd, - true); - if (unlikely(rc != MDBX_SUCCESS)) - goto bailout; + int rc = mdbx_cursor_bind(src->txn, dest, cursor_dbi(src)); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; - size_t r = 0, w = 0; - pgno_t last = 0; - while (r < dl->length && (spilled_entries < need_spill_entries || - spilled_npages < need_spill_npages)) { - dl->items[++w] = dl->items[++r]; - unsigned prio = spill_prio(txn, w, reciprocal); - if (prio > prio2spill && - (prio >= prio2adjacent || last != dl->items[w].pgno)) - continue; + assert(dest->tree == src->tree); + assert(cursor_dbi(dest) == cursor_dbi(src)); +again: + assert(dest->clc == src->clc); + assert(dest->txn == src->txn); + dest->top_and_flags = src->top_and_flags; + for (intptr_t i = 0; i <= src->top; ++i) { + dest->ki[i] = src->ki[i]; + dest->pg[i] = src->pg[i]; + } + + if (src->subcur) { + dest->subcur->nested_tree = src->subcur->nested_tree; + src = &src->subcur->cursor; + dest = &dest->subcur->cursor; + goto again; + } - const size_t e = w; - last = dpl_endpgno(dl, w); - while (--w && dpl_endpgno(dl, w) == dl->items[w + 1].pgno && - spill_prio(txn, w, reciprocal) < prio2adjacent) - ; + return MDBX_SUCCESS; +} - for (size_t i = w; ++i <= e;) { - const unsigned npages = dpl_npages(dl, i); - prio = spill_prio(txn, i, reciprocal); - DEBUG("%sspill[%zu] %u page %" PRIaPGNO " (age %d, prio %u)", - (prio > prio2spill) ? 
"co-" : "", i, npages, dl->items[i].pgno, - dpl_age(txn, i), prio); - tASSERT(txn, prio < 256); - ++spilled_entries; - spilled_npages += npages; - rc = spill_page(txn, &ctx, dl->items[i].ptr, npages); - if (unlikely(rc != MDBX_SUCCESS)) - goto failed; +int mdbx_txn_release_all_cursors(const MDBX_txn *txn, bool unbind) { + int rc = check_txn(txn, MDBX_TXN_FINISHED | MDBX_TXN_HAS_CHILD); + if (likely(rc == MDBX_SUCCESS)) { + TXN_FOREACH_DBI_FROM(txn, i, MAIN_DBI) { + while (txn->cursors[i]) { + MDBX_cursor *mc = txn->cursors[i]; + ENSURE(nullptr, mc->signature == cur_signature_live && + (mc->next != mc) && !mc->backup); + rc = likely(rc < INT_MAX) ? rc + 1 : rc; + txn->cursors[i] = mc->next; + mc->next = mc; + if (unbind) { + mc->signature = cur_signature_ready4dispose; + mc->flags = 0; + } else { + mc->signature = 0; + osal_free(mc); + } } } + } else { + eASSERT(nullptr, rc < 0); + } + return rc; +} - VERBOSE("spilled entries %u, spilled npages %u", spilled_entries, - spilled_npages); - tASSERT(txn, spillable_entries == 0 || spilled_entries > 0); - tASSERT(txn, spilled_npages >= spilled_entries); +int mdbx_cursor_compare(const MDBX_cursor *l, const MDBX_cursor *r, + bool ignore_multival) { + const int incomparable = INT16_MAX + 1; + if (unlikely(!l)) + return r ? -incomparable * 9 : 0; + else if (unlikely(!r)) + return incomparable * 9; - failed: - while (r < dl->length) - dl->items[++w] = dl->items[++r]; - tASSERT(txn, r - w == spilled_entries || rc != MDBX_SUCCESS); + if (unlikely(l->signature != cur_signature_live)) + return (r->signature == cur_signature_live) ? -incomparable * 8 : 0; + if (unlikely(r->signature != cur_signature_live)) + return (l->signature == cur_signature_live) ? 
incomparable * 8 : 0; - dl->sorted = dpl_setlen(dl, w); - txn->tw.dirtyroom += spilled_entries; - txn->tw.dirtylist->pages_including_loose -= spilled_npages; - tASSERT(txn, dirtylist_check(txn)); + if (unlikely(l->clc != r->clc)) { + if (l->txn->env != r->txn->env) + return (l->txn->env > r->txn->env) ? incomparable * 7 : -incomparable * 7; + if (l->txn->txnid != r->txn->txnid) + return (l->txn->txnid > r->txn->txnid) ? incomparable * 6 + : -incomparable * 6; + return (l->clc > r->clc) ? incomparable * 5 : -incomparable * 5; + } + assert(cursor_dbi(l) == cursor_dbi(r)); - if (!iov_empty(&ctx)) { - tASSERT(txn, rc == MDBX_SUCCESS); - rc = iov_write(&ctx); - } - if (unlikely(rc != MDBX_SUCCESS)) - goto bailout; + int diff = is_pointed(l) - is_pointed(r); + if (unlikely(diff)) + return (diff > 0) ? incomparable * 4 : -incomparable * 4; + if (unlikely(!is_pointed(l))) + return 0; - txn->mt_env->me_lck->mti_unsynced_pages.weak += spilled_npages; - pnl_sort(txn->tw.spilled.list, (size_t)txn->mt_next_pgno << 1); - txn->mt_flags |= MDBX_TXN_SPILLS; - NOTICE("spilled %u dirty-entries, %u dirty-npages, now have %zu dirty-room", - spilled_entries, spilled_npages, txn->tw.dirtyroom); - } else { - tASSERT(txn, rc == MDBX_SUCCESS); - for (size_t i = 1; i <= dl->length; ++i) { - MDBX_page *dp = dl->items[i].ptr; - VERBOSE( - "unspillable[%zu]: pgno %u, npages %u, flags 0x%04X, age %u, prio %u", - i, dp->mp_pgno, dpl_npages(dl, i), dp->mp_flags, dpl_age(txn, i), - spill_prio(txn, i, reciprocal)); - } + intptr_t detent = (l->top <= r->top) ? l->top : r->top; + for (intptr_t i = 0; i <= detent; ++i) { + diff = l->ki[i] - r->ki[i]; + if (diff) + return diff; } + if (unlikely(l->top != r->top)) + return (l->top > r->top) ? 
incomparable * 3 : -incomparable * 3; -#if xMDBX_DEBUG_SPILLING == 2 - if (txn->tw.loose_count + txn->tw.dirtyroom <= need / 2 + 1) - ERROR("dirty-list length: before %zu, after %zu, parent %zi, loose %zu; " - "needed %zu, spillable %zu; " - "spilled %u dirty-entries, now have %zu dirty-room", - dl->length + spilled_entries, dl->length, - (txn->mt_parent && txn->mt_parent->tw.dirtylist) - ? (intptr_t)txn->mt_parent->tw.dirtylist->length - : -1, - txn->tw.loose_count, need, spillable_entries, spilled_entries, - txn->tw.dirtyroom); - ENSURE(txn->mt_env, txn->tw.loose_count + txn->tw.dirtyroom > need / 2); -#endif /* xMDBX_DEBUG_SPILLING */ + assert((l->subcur != nullptr) == (r->subcur != nullptr)); + if (unlikely((l->subcur != nullptr) != (r->subcur != nullptr))) + return l->subcur ? incomparable * 2 : -incomparable * 2; + if (ignore_multival || !l->subcur) + return 0; -done: - return likely(txn->tw.dirtyroom + txn->tw.loose_count > - ((need > CURSOR_STACK) ? CURSOR_STACK : need)) - ? MDBX_SUCCESS - : MDBX_TXN_FULL; -} +#if MDBX_DEBUG + if (is_pointed(&l->subcur->cursor)) { + const page_t *mp = l->pg[l->top]; + const node_t *node = page_node(mp, l->ki[l->top]); + assert(node_flags(node) & N_DUPDATA); + } + if (is_pointed(&r->subcur->cursor)) { + const page_t *mp = r->pg[r->top]; + const node_t *node = page_node(mp, r->ki[r->top]); + assert(node_flags(node) & N_DUPDATA); + } +#endif /* MDBX_DEBUG */ -/*----------------------------------------------------------------------------*/ + l = &l->subcur->cursor; + r = &r->subcur->cursor; + diff = is_pointed(l) - is_pointed(r); + if (unlikely(diff)) + return (diff > 0) ? incomparable * 2 : -incomparable * 2; + if (unlikely(!is_pointed(l))) + return 0; -static bool meta_bootid_match(const MDBX_meta *meta) { - return memcmp(&meta->mm_bootid, &bootid, 16) == 0 && - (bootid.x | bootid.y) != 0; -} + detent = (l->top <= r->top) ? 
l->top : r->top; + for (intptr_t i = 0; i <= detent; ++i) { + diff = l->ki[i] - r->ki[i]; + if (diff) + return diff; + } + if (unlikely(l->top != r->top)) + return (l->top > r->top) ? incomparable : -incomparable; -static bool meta_weak_acceptable(const MDBX_env *env, const MDBX_meta *meta, - const int lck_exclusive) { - return lck_exclusive - ? /* exclusive lock */ meta_bootid_match(meta) - : /* db already opened */ env->me_lck_mmap.lck && - (env->me_lck_mmap.lck->mti_envmode.weak & MDBX_RDONLY) == 0; + return (l->flags & (z_eof | z_hollow)) - (r->flags & (z_eof | z_hollow)); } -#define METAPAGE(env, n) page_meta(pgno2page(env, n)) -#define METAPAGE_END(env) METAPAGE(env, NUM_METAS) +/* Return the count of duplicate data items for the current key */ +int mdbx_cursor_count(const MDBX_cursor *mc, size_t *countp) { + if (unlikely(mc == nullptr)) + return MDBX_EINVAL; -MDBX_NOTHROW_PURE_FUNCTION static txnid_t -constmeta_txnid(const MDBX_meta *meta) { - const txnid_t a = unaligned_peek_u64(4, &meta->mm_txnid_a); - const txnid_t b = unaligned_peek_u64(4, &meta->mm_txnid_b); - return likely(a == b) ? a : 0; -} + if (unlikely(mc->signature != cur_signature_live)) + return (mc->signature == cur_signature_ready4dispose) ? 
MDBX_EINVAL + : MDBX_EBADSIGN; -typedef struct { - uint64_t txnid; - size_t is_steady; -} meta_snap_t; + int rc = check_txn(mc->txn, MDBX_TXN_BLOCKED); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; -static __always_inline txnid_t -atomic_load_txnid(const volatile MDBX_atomic_uint32_t *ptr) { -#if (defined(__amd64__) || defined(__e2k__)) && !defined(ENABLE_UBSAN) && \ - MDBX_UNALIGNED_OK >= 8 - return atomic_load64((const volatile MDBX_atomic_uint64_t *)ptr, - mo_AcquireRelease); -#else - const uint32_t l = atomic_load32( - &ptr[__BYTE_ORDER__ != __ORDER_LITTLE_ENDIAN__], mo_AcquireRelease); - const uint32_t h = atomic_load32( - &ptr[__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__], mo_AcquireRelease); - return (uint64_t)h << 32 | l; -#endif + if (unlikely(countp == nullptr)) + return MDBX_EINVAL; + + if ((*countp = is_filled(mc)) > 0) { + if (inner_filled(mc)) { + const page_t *mp = mc->pg[mc->top]; + const node_t *node = page_node(mp, mc->ki[mc->top]); + cASSERT(mc, node_flags(node) & N_DUPDATA); + *countp = unlikely(mc->subcur->nested_tree.items > PTRDIFF_MAX) + ? 
PTRDIFF_MAX + : (size_t)mc->subcur->nested_tree.items; + } + } + return MDBX_SUCCESS; } -static __inline meta_snap_t meta_snap(const volatile MDBX_meta *meta) { - txnid_t txnid = atomic_load_txnid(meta->mm_txnid_a); - jitter4testing(true); - size_t is_steady = META_IS_STEADY(meta) && txnid >= MIN_TXNID; - jitter4testing(true); - if (unlikely(txnid != atomic_load_txnid(meta->mm_txnid_b))) - txnid = is_steady = 0; - meta_snap_t r = {txnid, is_steady}; - return r; -} - -static __inline txnid_t meta_txnid(const volatile MDBX_meta *meta) { - return meta_snap(meta).txnid; -} +int mdbx_cursor_on_first(const MDBX_cursor *mc) { + if (unlikely(mc == nullptr)) + return MDBX_EINVAL; -static __inline void meta_update_begin(const MDBX_env *env, MDBX_meta *meta, - txnid_t txnid) { - eASSERT(env, meta >= METAPAGE(env, 0) && meta < METAPAGE_END(env)); - eASSERT(env, unaligned_peek_u64(4, meta->mm_txnid_a) < txnid && - unaligned_peek_u64(4, meta->mm_txnid_b) < txnid); - (void)env; -#if (defined(__amd64__) || defined(__e2k__)) && !defined(ENABLE_UBSAN) && \ - MDBX_UNALIGNED_OK >= 8 - atomic_store64((MDBX_atomic_uint64_t *)&meta->mm_txnid_b, 0, - mo_AcquireRelease); - atomic_store64((MDBX_atomic_uint64_t *)&meta->mm_txnid_a, txnid, - mo_AcquireRelease); -#else - atomic_store32(&meta->mm_txnid_b[__BYTE_ORDER__ != __ORDER_LITTLE_ENDIAN__], - 0, mo_AcquireRelease); - atomic_store32(&meta->mm_txnid_b[__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__], - 0, mo_AcquireRelease); - atomic_store32(&meta->mm_txnid_a[__BYTE_ORDER__ != __ORDER_LITTLE_ENDIAN__], - (uint32_t)txnid, mo_AcquireRelease); - atomic_store32(&meta->mm_txnid_a[__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__], - (uint32_t)(txnid >> 32), mo_AcquireRelease); -#endif -} + if (unlikely(mc->signature != cur_signature_live)) + return (mc->signature == cur_signature_ready4dispose) ? 
MDBX_EINVAL + : MDBX_EBADSIGN; -static __inline void meta_update_end(const MDBX_env *env, MDBX_meta *meta, - txnid_t txnid) { - eASSERT(env, meta >= METAPAGE(env, 0) && meta < METAPAGE_END(env)); - eASSERT(env, unaligned_peek_u64(4, meta->mm_txnid_a) == txnid); - eASSERT(env, unaligned_peek_u64(4, meta->mm_txnid_b) < txnid); - (void)env; - jitter4testing(true); - memcpy(&meta->mm_bootid, &bootid, 16); -#if (defined(__amd64__) || defined(__e2k__)) && !defined(ENABLE_UBSAN) && \ - MDBX_UNALIGNED_OK >= 8 - atomic_store64((MDBX_atomic_uint64_t *)&meta->mm_txnid_b, txnid, - mo_AcquireRelease); -#else - atomic_store32(&meta->mm_txnid_b[__BYTE_ORDER__ != __ORDER_LITTLE_ENDIAN__], - (uint32_t)txnid, mo_AcquireRelease); - atomic_store32(&meta->mm_txnid_b[__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__], - (uint32_t)(txnid >> 32), mo_AcquireRelease); -#endif -} + for (intptr_t i = 0; i <= mc->top; ++i) { + if (mc->ki[i]) + return MDBX_RESULT_FALSE; + } -static __inline void meta_set_txnid(const MDBX_env *env, MDBX_meta *meta, - const txnid_t txnid) { - eASSERT(env, - !env->me_map || meta < METAPAGE(env, 0) || meta >= METAPAGE_END(env)); - (void)env; - /* update inconsistently since this function used ONLY for filling meta-image - * for writing, but not the actual meta-page */ - memcpy(&meta->mm_bootid, &bootid, 16); - unaligned_poke_u64(4, meta->mm_txnid_a, txnid); - unaligned_poke_u64(4, meta->mm_txnid_b, txnid); + return MDBX_RESULT_TRUE; } -static __inline uint64_t meta_sign(const MDBX_meta *meta) { - uint64_t sign = MDBX_DATASIGN_NONE; -#if 0 /* TODO */ - sign = hippeus_hash64(...); -#else - (void)meta; -#endif - /* LY: newer returns MDBX_DATASIGN_NONE or MDBX_DATASIGN_WEAK */ - return (sign > MDBX_DATASIGN_WEAK) ? 
sign : ~sign; -} +int mdbx_cursor_on_first_dup(const MDBX_cursor *mc) { + if (unlikely(mc == nullptr)) + return MDBX_EINVAL; -typedef struct { - txnid_t txnid; - union { - const volatile MDBX_meta *ptr_v; - const MDBX_meta *ptr_c; - }; - size_t is_steady; -} meta_ptr_t; + if (unlikely(mc->signature != cur_signature_live)) + return (mc->signature == cur_signature_ready4dispose) ? MDBX_EINVAL + : MDBX_EBADSIGN; -static meta_ptr_t meta_ptr(const MDBX_env *env, unsigned n) { - eASSERT(env, n < NUM_METAS); - meta_ptr_t r; - meta_snap_t snap = meta_snap(r.ptr_v = METAPAGE(env, n)); - r.txnid = snap.txnid; - r.is_steady = snap.is_steady; - return r; -} + if (is_filled(mc) && mc->subcur) { + mc = &mc->subcur->cursor; + for (intptr_t i = 0; i <= mc->top; ++i) { + if (mc->ki[i]) + return MDBX_RESULT_FALSE; + } + } -static __always_inline uint8_t meta_cmp2int(txnid_t a, txnid_t b, uint8_t s) { - return unlikely(a == b) ? 1 * s : (a > b) ? 2 * s : 0 * s; + return MDBX_RESULT_TRUE; } -static __always_inline uint8_t meta_cmp2recent(uint8_t ab_cmp2int, - bool a_steady, bool b_steady) { - assert(ab_cmp2int < 3 /* && a_steady< 2 && b_steady < 2 */); - return ab_cmp2int > 1 || (ab_cmp2int == 1 && a_steady > b_steady); -} +int mdbx_cursor_on_last(const MDBX_cursor *mc) { + if (unlikely(mc == nullptr)) + return MDBX_EINVAL; -static __always_inline uint8_t meta_cmp2steady(uint8_t ab_cmp2int, - bool a_steady, bool b_steady) { - assert(ab_cmp2int < 3 /* && a_steady< 2 && b_steady < 2 */); - return a_steady > b_steady || (a_steady == b_steady && ab_cmp2int > 1); -} + if (unlikely(mc->signature != cur_signature_live)) + return (mc->signature == cur_signature_ready4dispose) ? 
MDBX_EINVAL + : MDBX_EBADSIGN; -static __inline bool meta_choice_recent(txnid_t a_txnid, bool a_steady, - txnid_t b_txnid, bool b_steady) { - return meta_cmp2recent(meta_cmp2int(a_txnid, b_txnid, 1), a_steady, b_steady); -} + for (intptr_t i = 0; i <= mc->top; ++i) { + size_t nkeys = page_numkeys(mc->pg[i]); + if (mc->ki[i] < nkeys - 1) + return MDBX_RESULT_FALSE; + } -static __inline bool meta_choice_steady(txnid_t a_txnid, bool a_steady, - txnid_t b_txnid, bool b_steady) { - return meta_cmp2steady(meta_cmp2int(a_txnid, b_txnid, 1), a_steady, b_steady); + return MDBX_RESULT_TRUE; } -MDBX_MAYBE_UNUSED static uint8_t meta_cmp2pack(uint8_t c01, uint8_t c02, - uint8_t c12, bool s0, bool s1, - bool s2) { - assert(c01 < 3 && c02 < 3 && c12 < 3); - /* assert(s0 < 2 && s1 < 2 && s2 < 2); */ - const uint8_t recent = meta_cmp2recent(c01, s0, s1) - ? (meta_cmp2recent(c02, s0, s2) ? 0 : 2) - : (meta_cmp2recent(c12, s1, s2) ? 1 : 2); - const uint8_t prefer_steady = meta_cmp2steady(c01, s0, s1) - ? (meta_cmp2steady(c02, s0, s2) ? 0 : 2) - : (meta_cmp2steady(c12, s1, s2) ? 1 : 2); +int mdbx_cursor_on_last_dup(const MDBX_cursor *mc) { + if (unlikely(mc == nullptr)) + return MDBX_EINVAL; - uint8_t tail; - if (recent == 0) - tail = meta_cmp2steady(c12, s1, s2) ? 2 : 1; - else if (recent == 1) - tail = meta_cmp2steady(c02, s0, s2) ? 2 : 0; - else - tail = meta_cmp2steady(c01, s0, s1) ? 1 : 0; + if (unlikely(mc->signature != cur_signature_live)) + return (mc->signature == cur_signature_ready4dispose) ? 
MDBX_EINVAL + : MDBX_EBADSIGN; - const bool valid = - c01 != 1 || s0 != s1 || c02 != 1 || s0 != s2 || c12 != 1 || s1 != s2; - const bool strict = (c01 != 1 || s0 != s1) && (c02 != 1 || s0 != s2) && - (c12 != 1 || s1 != s2); - return tail | recent << 2 | prefer_steady << 4 | strict << 6 | valid << 7; -} + if (is_filled(mc) && mc->subcur) { + mc = &mc->subcur->cursor; + for (intptr_t i = 0; i <= mc->top; ++i) { + size_t nkeys = page_numkeys(mc->pg[i]); + if (mc->ki[i] < nkeys - 1) + return MDBX_RESULT_FALSE; + } + } -static __inline void meta_troika_unpack(meta_troika_t *troika, - const uint8_t packed) { - troika->recent = (packed >> 2) & 3; - troika->prefer_steady = (packed >> 4) & 3; - troika->tail_and_flags = packed & 0xC3; -#if MDBX_WORDBITS > 32 /* Workaround for false-positives from Valgrind */ - troika->unused_pad = 0; -#endif + return MDBX_RESULT_TRUE; } -static const uint8_t troika_fsm_map[2 * 2 * 2 * 3 * 3 * 3] = { - 232, 201, 216, 216, 232, 233, 232, 232, 168, 201, 216, 152, 168, 233, 232, - 168, 233, 201, 216, 201, 233, 233, 232, 233, 168, 201, 152, 216, 232, 169, - 232, 168, 168, 193, 152, 152, 168, 169, 232, 168, 169, 193, 152, 194, 233, - 169, 232, 169, 232, 201, 216, 216, 232, 201, 232, 232, 168, 193, 216, 152, - 168, 193, 232, 168, 193, 193, 210, 194, 225, 193, 225, 193, 168, 137, 212, - 214, 232, 233, 168, 168, 168, 137, 212, 150, 168, 233, 168, 168, 169, 137, - 216, 201, 233, 233, 168, 169, 168, 137, 148, 214, 232, 169, 168, 168, 40, - 129, 148, 150, 168, 169, 168, 40, 169, 129, 152, 194, 233, 169, 168, 169, - 168, 137, 214, 214, 232, 201, 168, 168, 168, 129, 214, 150, 168, 193, 168, - 168, 129, 129, 210, 194, 225, 193, 161, 129, 212, 198, 212, 214, 228, 228, - 212, 212, 148, 201, 212, 150, 164, 233, 212, 148, 233, 201, 216, 201, 233, - 233, 216, 233, 148, 198, 148, 214, 228, 164, 212, 148, 148, 194, 148, 150, - 164, 169, 212, 148, 169, 194, 152, 194, 233, 169, 216, 169, 214, 198, 214, - 214, 228, 198, 212, 214, 150, 194, 214, 150, 164, 193, 212, 
150, 194, 194, - 210, 194, 225, 193, 210, 194}; - -__hot static meta_troika_t meta_tap(const MDBX_env *env) { - meta_snap_t snap; - meta_troika_t troika; - snap = meta_snap(METAPAGE(env, 0)); - troika.txnid[0] = snap.txnid; - troika.fsm = (uint8_t)snap.is_steady << 0; - snap = meta_snap(METAPAGE(env, 1)); - troika.txnid[1] = snap.txnid; - troika.fsm += (uint8_t)snap.is_steady << 1; - troika.fsm += meta_cmp2int(troika.txnid[0], troika.txnid[1], 8); - snap = meta_snap(METAPAGE(env, 2)); - troika.txnid[2] = snap.txnid; - troika.fsm += (uint8_t)snap.is_steady << 2; - troika.fsm += meta_cmp2int(troika.txnid[0], troika.txnid[2], 8 * 3); - troika.fsm += meta_cmp2int(troika.txnid[1], troika.txnid[2], 8 * 3 * 3); +int mdbx_cursor_eof(const MDBX_cursor *mc) { + if (unlikely(mc == nullptr)) + return MDBX_EINVAL; - meta_troika_unpack(&troika, troika_fsm_map[troika.fsm]); - return troika; -} + if (unlikely(mc->signature != cur_signature_live)) + return (mc->signature == cur_signature_ready4dispose) ? MDBX_EINVAL + : MDBX_EBADSIGN; -static txnid_t recent_committed_txnid(const MDBX_env *env) { - const txnid_t m0 = meta_txnid(METAPAGE(env, 0)); - const txnid_t m1 = meta_txnid(METAPAGE(env, 1)); - const txnid_t m2 = meta_txnid(METAPAGE(env, 2)); - return (m0 > m1) ? ((m0 > m2) ? m0 : m2) : ((m1 > m2) ? m1 : m2); + return is_eof(mc) ? 
MDBX_RESULT_TRUE : MDBX_RESULT_FALSE; } -static __inline bool meta_eq(const meta_troika_t *troika, size_t a, size_t b) { - assert(a < NUM_METAS && b < NUM_METAS); - return troika->txnid[a] == troika->txnid[b] && - (((troika->fsm >> a) ^ (troika->fsm >> b)) & 1) == 0 && - troika->txnid[a]; -} +int mdbx_cursor_get(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, + MDBX_cursor_op op) { + if (unlikely(mc == nullptr)) + return MDBX_EINVAL; -static unsigned meta_eq_mask(const meta_troika_t *troika) { - return meta_eq(troika, 0, 1) | meta_eq(troika, 1, 2) << 1 | - meta_eq(troika, 2, 0) << 2; -} + if (unlikely(mc->signature != cur_signature_live)) + return (mc->signature == cur_signature_ready4dispose) ? MDBX_EINVAL + : MDBX_EBADSIGN; -__hot static bool meta_should_retry(const MDBX_env *env, - meta_troika_t *troika) { - const meta_troika_t prev = *troika; - *troika = meta_tap(env); - return prev.fsm != troika->fsm || prev.txnid[0] != troika->txnid[0] || - prev.txnid[1] != troika->txnid[1] || prev.txnid[2] != troika->txnid[2]; -} + int rc = check_txn(mc->txn, MDBX_TXN_BLOCKED); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; -static __always_inline meta_ptr_t meta_recent(const MDBX_env *env, - const meta_troika_t *troika) { - meta_ptr_t r; - r.txnid = troika->txnid[troika->recent]; - r.ptr_v = METAPAGE(env, troika->recent); - r.is_steady = (troika->fsm >> troika->recent) & 1; - return r; -} + if (unlikely(cursor_dbi_changed(mc))) + return MDBX_BAD_DBI; -static __always_inline meta_ptr_t -meta_prefer_steady(const MDBX_env *env, const meta_troika_t *troika) { - meta_ptr_t r; - r.txnid = troika->txnid[troika->prefer_steady]; - r.ptr_v = METAPAGE(env, troika->prefer_steady); - r.is_steady = (troika->fsm >> troika->prefer_steady) & 1; - return r; + return cursor_ops(mc, key, data, op); } -static __always_inline meta_ptr_t meta_tail(const MDBX_env *env, - const meta_troika_t *troika) { - const uint8_t tail = troika->tail_and_flags & 3; - MDBX_ANALYSIS_ASSUME(tail < NUM_METAS); - 
meta_ptr_t r; - r.txnid = troika->txnid[tail]; - r.ptr_v = METAPAGE(env, tail); - r.is_steady = (troika->fsm >> tail) & 1; - return r; -} +__hot static int scan_confinue(MDBX_cursor *mc, MDBX_predicate_func *predicate, + void *context, void *arg, MDBX_val *key, + MDBX_val *value, MDBX_cursor_op turn_op) { + int rc; + switch (turn_op) { + case MDBX_NEXT: + case MDBX_NEXT_NODUP: + for (;;) { + rc = predicate(context, key, value, arg); + if (rc != MDBX_RESULT_FALSE) + return rc; + rc = outer_next(mc, key, value, turn_op); + if (unlikely(rc != MDBX_SUCCESS)) + return (rc == MDBX_NOTFOUND) ? MDBX_RESULT_FALSE : rc; + } -static const char *durable_caption(const volatile MDBX_meta *const meta) { - if (META_IS_STEADY(meta)) - return (unaligned_peek_u64_volatile(4, meta->mm_sign) == - meta_sign((const MDBX_meta *)meta)) - ? "Steady" - : "Tainted"; - return "Weak"; -} + case MDBX_PREV: + case MDBX_PREV_NODUP: + for (;;) { + rc = predicate(context, key, value, arg); + if (rc != MDBX_RESULT_FALSE) + return rc; + rc = outer_prev(mc, key, value, turn_op); + if (unlikely(rc != MDBX_SUCCESS)) + return (rc == MDBX_NOTFOUND) ? MDBX_RESULT_FALSE : rc; + } -__cold static void meta_troika_dump(const MDBX_env *env, - const meta_troika_t *troika) { - const meta_ptr_t recent = meta_recent(env, troika); - const meta_ptr_t prefer_steady = meta_prefer_steady(env, troika); - const meta_ptr_t tail = meta_tail(env, troika); - NOTICE("troika: %" PRIaTXN ".%c:%" PRIaTXN ".%c:%" PRIaTXN ".%c, fsm=0x%02x, " - "head=%d-%" PRIaTXN ".%c, " - "base=%d-%" PRIaTXN ".%c, " - "tail=%d-%" PRIaTXN ".%c, " - "valid %c, strict %c", - troika->txnid[0], (troika->fsm & 1) ? 's' : 'w', troika->txnid[1], - (troika->fsm & 2) ? 's' : 'w', troika->txnid[2], - (troika->fsm & 4) ? 's' : 'w', troika->fsm, troika->recent, - recent.txnid, recent.is_steady ? 's' : 'w', troika->prefer_steady, - prefer_steady.txnid, prefer_steady.is_steady ? 's' : 'w', - troika->tail_and_flags % NUM_METAS, tail.txnid, - tail.is_steady ? 
's' : 'w', TROIKA_VALID(troika) ? 'Y' : 'N', - TROIKA_STRICT_VALID(troika) ? 'Y' : 'N'); -} + case MDBX_NEXT_DUP: + if (mc->subcur) + for (;;) { + rc = predicate(context, key, value, arg); + if (rc != MDBX_RESULT_FALSE) + return rc; + rc = inner_next(&mc->subcur->cursor, value); + if (unlikely(rc != MDBX_SUCCESS)) + return (rc == MDBX_NOTFOUND) ? MDBX_RESULT_FALSE : rc; + } + return MDBX_NOTFOUND; -/*----------------------------------------------------------------------------*/ + case MDBX_PREV_DUP: + if (mc->subcur) + for (;;) { + rc = predicate(context, key, value, arg); + if (rc != MDBX_RESULT_FALSE) + return rc; + rc = inner_prev(&mc->subcur->cursor, value); + if (unlikely(rc != MDBX_SUCCESS)) + return (rc == MDBX_NOTFOUND) ? MDBX_RESULT_FALSE : rc; + } + return MDBX_NOTFOUND; -static __inline MDBX_CONST_FUNCTION MDBX_lockinfo * -lckless_stub(const MDBX_env *env) { - uintptr_t stub = (uintptr_t)&env->x_lckless_stub; - /* align to avoid false-positive alarm from UndefinedBehaviorSanitizer */ - stub = (stub + MDBX_CACHELINE_SIZE - 1) & ~(MDBX_CACHELINE_SIZE - 1); - return (MDBX_lockinfo *)stub; + default: + for (;;) { + rc = predicate(context, key, value, arg); + if (rc != MDBX_RESULT_FALSE) + return rc; + rc = cursor_ops(mc, key, value, turn_op); + if (unlikely(rc != MDBX_SUCCESS)) + return (rc == MDBX_NOTFOUND) ? MDBX_RESULT_FALSE : rc; + } + } } -/* Find oldest txnid still referenced. 
*/ -static txnid_t find_oldest_reader(MDBX_env *const env, const txnid_t steady) { - const uint32_t nothing_changed = MDBX_STRING_TETRAD("None"); - eASSERT(env, steady <= env->me_txn0->mt_txnid); - - MDBX_lockinfo *const lck = env->me_lck_mmap.lck; - if (unlikely(lck == NULL /* exclusive without-lck mode */)) { - eASSERT(env, env->me_lck == lckless_stub(env)); - env->me_lck->mti_readers_refresh_flag.weak = nothing_changed; - return env->me_lck->mti_oldest_reader.weak = steady; - } +int mdbx_cursor_scan(MDBX_cursor *mc, MDBX_predicate_func *predicate, + void *context, MDBX_cursor_op start_op, + MDBX_cursor_op turn_op, void *arg) { + if (unlikely(!predicate)) + return MDBX_EINVAL; - const txnid_t prev_oldest = - atomic_load64(&lck->mti_oldest_reader, mo_AcquireRelease); - eASSERT(env, steady >= prev_oldest); + const unsigned valid_start_mask = + 1 << MDBX_FIRST | 1 << MDBX_FIRST_DUP | 1 << MDBX_LAST | + 1 << MDBX_LAST_DUP | 1 << MDBX_GET_CURRENT | 1 << MDBX_GET_MULTIPLE; + if (unlikely(start_op > 30 || ((1 << start_op) & valid_start_mask) == 0)) + return MDBX_EINVAL; - txnid_t new_oldest = prev_oldest; - while (nothing_changed != - atomic_load32(&lck->mti_readers_refresh_flag, mo_AcquireRelease)) { - lck->mti_readers_refresh_flag.weak = nothing_changed; - jitter4testing(false); - const size_t snap_nreaders = - atomic_load32(&lck->mti_numreaders, mo_AcquireRelease); - new_oldest = steady; + const unsigned valid_turn_mask = + 1 << MDBX_NEXT | 1 << MDBX_NEXT_DUP | 1 << MDBX_NEXT_NODUP | + 1 << MDBX_PREV | 1 << MDBX_PREV_DUP | 1 << MDBX_PREV_NODUP | + 1 << MDBX_NEXT_MULTIPLE | 1 << MDBX_PREV_MULTIPLE; + if (unlikely(turn_op > 30 || ((1 << turn_op) & valid_turn_mask) == 0)) + return MDBX_EINVAL; - for (size_t i = 0; i < snap_nreaders; ++i) { - const uint32_t pid = - atomic_load32(&lck->mti_readers[i].mr_pid, mo_AcquireRelease); - if (!pid) - continue; - jitter4testing(true); + MDBX_val key = {nullptr, 0}, value = {nullptr, 0}; + int rc = mdbx_cursor_get(mc, &key, &value, 
start_op); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; + return scan_confinue(mc, predicate, context, arg, &key, &value, turn_op); +} - const txnid_t rtxn = safe64_read(&lck->mti_readers[i].mr_txnid); - if (unlikely(rtxn < prev_oldest)) { - if (unlikely(nothing_changed == - atomic_load32(&lck->mti_readers_refresh_flag, - mo_AcquireRelease)) && - safe64_reset_compare(&lck->mti_readers[i].mr_txnid, rtxn)) { - NOTICE("kick stuck reader[%zu of %zu].pid_%u %" PRIaTXN - " < prev-oldest %" PRIaTXN ", steady-txn %" PRIaTXN, - i, snap_nreaders, pid, rtxn, prev_oldest, steady); - } - continue; - } +int mdbx_cursor_scan_from(MDBX_cursor *mc, MDBX_predicate_func *predicate, + void *context, MDBX_cursor_op from_op, MDBX_val *key, + MDBX_val *value, MDBX_cursor_op turn_op, void *arg) { + if (unlikely(!predicate || !key)) + return MDBX_EINVAL; - if (rtxn < new_oldest) { - new_oldest = rtxn; - if (!MDBX_DEBUG && !MDBX_FORCE_ASSERTIONS && new_oldest == prev_oldest) - break; - } - } - } + const unsigned valid_start_mask = + 1 << MDBX_GET_BOTH | 1 << MDBX_GET_BOTH_RANGE | 1 << MDBX_SET_KEY | + 1 << MDBX_GET_MULTIPLE | 1 << MDBX_SET_LOWERBOUND | + 1 << MDBX_SET_UPPERBOUND; + if (unlikely(from_op < MDBX_TO_KEY_LESSER_THAN && + ((1 << from_op) & valid_start_mask) == 0)) + return MDBX_EINVAL; - if (new_oldest != prev_oldest) { - VERBOSE("update oldest %" PRIaTXN " -> %" PRIaTXN, prev_oldest, new_oldest); - eASSERT(env, new_oldest >= lck->mti_oldest_reader.weak); - atomic_store64(&lck->mti_oldest_reader, new_oldest, mo_Relaxed); + const unsigned valid_turn_mask = + 1 << MDBX_NEXT | 1 << MDBX_NEXT_DUP | 1 << MDBX_NEXT_NODUP | + 1 << MDBX_PREV | 1 << MDBX_PREV_DUP | 1 << MDBX_PREV_NODUP | + 1 << MDBX_NEXT_MULTIPLE | 1 << MDBX_PREV_MULTIPLE; + if (unlikely(turn_op > 30 || ((1 << turn_op) & valid_turn_mask) == 0)) + return MDBX_EINVAL; + + int rc = mdbx_cursor_get(mc, key, value, from_op); + if (unlikely(MDBX_IS_ERROR(rc))) + return rc; + + cASSERT(mc, key != nullptr); + MDBX_val stub; + 
if (!value) { + value = &stub; + rc = cursor_ops(mc, key, value, MDBX_GET_CURRENT); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; } - return new_oldest; + return scan_confinue(mc, predicate, context, arg, key, value, turn_op); } -static txnid_t txn_oldest_reader(const MDBX_txn *const txn) { - return find_oldest_reader(txn->mt_env, - txn->tw.troika.txnid[txn->tw.troika.prefer_steady]); -} +int mdbx_cursor_get_batch(MDBX_cursor *mc, size_t *count, MDBX_val *pairs, + size_t limit, MDBX_cursor_op op) { + if (unlikely(!count)) + return MDBX_EINVAL; -/* Find largest mvcc-snapshot still referenced. */ -static pgno_t find_largest_snapshot(const MDBX_env *env, - pgno_t last_used_page) { - MDBX_lockinfo *const lck = env->me_lck_mmap.lck; - if (likely(lck != NULL /* check for exclusive without-lck mode */)) { - retry:; - const size_t snap_nreaders = - atomic_load32(&lck->mti_numreaders, mo_AcquireRelease); - for (size_t i = 0; i < snap_nreaders; ++i) { - if (atomic_load32(&lck->mti_readers[i].mr_pid, mo_AcquireRelease)) { - /* jitter4testing(true); */ - const pgno_t snap_pages = atomic_load32( - &lck->mti_readers[i].mr_snapshot_pages_used, mo_Relaxed); - const txnid_t snap_txnid = safe64_read(&lck->mti_readers[i].mr_txnid); - if (unlikely( - snap_pages != - atomic_load32(&lck->mti_readers[i].mr_snapshot_pages_used, - mo_AcquireRelease) || - snap_txnid != safe64_read(&lck->mti_readers[i].mr_txnid))) - goto retry; - if (last_used_page < snap_pages && snap_txnid <= env->me_txn0->mt_txnid) - last_used_page = snap_pages; - } + *count = 0; + if (unlikely(mc == nullptr || limit < 4 || limit > INTPTR_MAX - 2)) + return MDBX_EINVAL; + + if (unlikely(mc->signature != cur_signature_live)) + return (mc->signature == cur_signature_ready4dispose) ? 
MDBX_EINVAL + : MDBX_EBADSIGN; + + int rc = check_txn(mc->txn, MDBX_TXN_BLOCKED); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; + + if (unlikely(cursor_dbi_changed(mc))) + return MDBX_BAD_DBI; + + if (unlikely(mc->subcur)) + return MDBX_INCOMPATIBLE /* must be a non-dupsort subDB */; + + switch (op) { + case MDBX_NEXT: + if (unlikely(is_eof(mc))) + return is_pointed(mc) ? MDBX_NOTFOUND : MDBX_ENODATA; + break; + + case MDBX_FIRST: + if (!is_filled(mc)) { + rc = outer_first(mc, nullptr, nullptr); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; } + break; + + default: + DEBUG("unhandled/unimplemented cursor operation %u", op); + return MDBX_EINVAL; } - return last_used_page; -} + const page_t *mp = mc->pg[mc->top]; + size_t nkeys = page_numkeys(mp); + size_t ki = mc->ki[mc->top]; + size_t n = 0; + while (n + 2 <= limit) { + cASSERT(mc, ki < nkeys); + if (unlikely(ki >= nkeys)) { + be_hollow(mc); + return MDBX_NOTFOUND; + } -/* Add a page to the txn's dirty list */ -__hot static int __must_check_result page_dirty(MDBX_txn *txn, MDBX_page *mp, - size_t npages) { - tASSERT(txn, (txn->mt_flags & MDBX_TXN_RDONLY) == 0); - mp->mp_txnid = txn->mt_front; - if (!txn->tw.dirtylist) { - tASSERT(txn, (txn->mt_flags & MDBX_WRITEMAP) != 0 && !MDBX_AVOID_MSYNC); - txn->tw.writemap_dirty_npages += npages; - tASSERT(txn, txn->tw.spilled.list == nullptr); - return MDBX_SUCCESS; - } - tASSERT(txn, (txn->mt_flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC); + const node_t *leaf = page_node(mp, ki); + pairs[n] = get_key(leaf); + rc = node_read(mc, leaf, &pairs[n + 1], mp); + if (unlikely(rc != MDBX_SUCCESS)) + goto bailout; -#if xMDBX_DEBUG_SPILLING == 2 - txn->mt_env->debug_dirtied_act += 1; - ENSURE(txn->mt_env, - txn->mt_env->debug_dirtied_act < txn->mt_env->debug_dirtied_est); - ENSURE(txn->mt_env, txn->tw.dirtyroom + txn->tw.loose_count > 0); -#endif /* xMDBX_DEBUG_SPILLING == 2 */ + n += 2; + if (++ki == nkeys) { + rc = cursor_sibling_right(mc); + if (rc != MDBX_SUCCESS) { + if 
(rc == MDBX_NOTFOUND) + rc = MDBX_RESULT_TRUE; + goto bailout; + } - int rc; - if (unlikely(txn->tw.dirtyroom == 0)) { - if (txn->tw.loose_count) { - MDBX_page *lp = txn->tw.loose_pages; - DEBUG("purge-and-reclaim loose page %" PRIaPGNO, lp->mp_pgno); - rc = pnl_insert_range(&txn->tw.relist, lp->mp_pgno, 1); - if (unlikely(rc != MDBX_SUCCESS)) + mp = mc->pg[mc->top]; + DEBUG("next page is %" PRIaPGNO ", key index %u", mp->pgno, + mc->ki[mc->top]); + if (!MDBX_DISABLE_VALIDATION && unlikely(!check_leaf_type(mc, mp))) { + ERROR("unexpected leaf-page #%" PRIaPGNO " type 0x%x seen by cursor", + mp->pgno, mp->flags); + rc = MDBX_CORRUPTED; goto bailout; - size_t di = dpl_search(txn, lp->mp_pgno); - tASSERT(txn, txn->tw.dirtylist->items[di].ptr == lp); - dpl_remove(txn, di); - MDBX_ASAN_UNPOISON_MEMORY_REGION(&mp_next(lp), sizeof(MDBX_page *)); - VALGRIND_MAKE_MEM_DEFINED(&mp_next(lp), sizeof(MDBX_page *)); - txn->tw.loose_pages = mp_next(lp); - txn->tw.loose_count--; - txn->tw.dirtyroom++; - if (!MDBX_AVOID_MSYNC || !(txn->mt_flags & MDBX_WRITEMAP)) - dpage_free(txn->mt_env, lp, 1); - } else { - ERROR("Dirtyroom is depleted, DPL length %zu", txn->tw.dirtylist->length); - if (!MDBX_AVOID_MSYNC || !(txn->mt_flags & MDBX_WRITEMAP)) - dpage_free(txn->mt_env, mp, npages); - return MDBX_TXN_FULL; + } + nkeys = page_numkeys(mp); + ki = 0; } } + mc->ki[mc->top] = (indx_t)ki; - rc = dpl_append(txn, mp->mp_pgno, mp, npages); - if (unlikely(rc != MDBX_SUCCESS)) { - bailout: - txn->mt_flags |= MDBX_TXN_ERROR; - return rc; - } - txn->tw.dirtyroom--; - tASSERT(txn, dirtylist_check(txn)); +bailout: + *count = n; + return rc; +} + +/*----------------------------------------------------------------------------*/ + +int mdbx_cursor_set_userctx(MDBX_cursor *mc, void *ctx) { + if (unlikely(!mc)) + return MDBX_EINVAL; + + if (unlikely(mc->signature != cur_signature_ready4dispose && + mc->signature != cur_signature_live)) + return MDBX_EBADSIGN; + + cursor_couple_t *couple = container_of(mc, 
cursor_couple_t, outer); + couple->userctx = ctx; return MDBX_SUCCESS; } -static void mincore_clean_cache(const MDBX_env *const env) { - memset(env->me_lck->mti_mincore_cache.begin, -1, - sizeof(env->me_lck->mti_mincore_cache.begin)); +void *mdbx_cursor_get_userctx(const MDBX_cursor *mc) { + if (unlikely(!mc)) + return nullptr; + + if (unlikely(mc->signature != cur_signature_ready4dispose && + mc->signature != cur_signature_live)) + return nullptr; + + cursor_couple_t *couple = container_of(mc, cursor_couple_t, outer); + return couple->userctx; } -#if !(defined(_WIN32) || defined(_WIN64)) -MDBX_MAYBE_UNUSED static __always_inline int ignore_enosys(int err) { -#ifdef ENOSYS - if (err == ENOSYS) - return MDBX_RESULT_TRUE; -#endif /* ENOSYS */ -#ifdef ENOIMPL - if (err == ENOIMPL) - return MDBX_RESULT_TRUE; -#endif /* ENOIMPL */ -#ifdef ENOTSUP - if (err == ENOTSUP) - return MDBX_RESULT_TRUE; -#endif /* ENOTSUP */ -#ifdef ENOSUPP - if (err == ENOSUPP) - return MDBX_RESULT_TRUE; -#endif /* ENOSUPP */ -#ifdef EOPNOTSUPP - if (err == EOPNOTSUPP) - return MDBX_RESULT_TRUE; -#endif /* EOPNOTSUPP */ - if (err == EAGAIN) - return MDBX_RESULT_TRUE; - return err; +MDBX_txn *mdbx_cursor_txn(const MDBX_cursor *mc) { + if (unlikely(!mc || mc->signature != cur_signature_live)) + return nullptr; + MDBX_txn *txn = mc->txn; + if (unlikely(!txn || txn->signature != txn_signature)) + return nullptr; + if (unlikely(txn->flags & MDBX_TXN_FINISHED)) + return nullptr; + return txn; } -#endif /* defined(_WIN32) || defined(_WIN64) */ -#if MDBX_ENABLE_MADVISE -/* Turn on/off readahead. It's harmful when the DB is larger than RAM. 
*/ -__cold static int set_readahead(const MDBX_env *env, const pgno_t edge, - const bool enable, const bool force_whole) { - eASSERT(env, edge >= NUM_METAS && edge <= MAX_PAGENO + 1); - eASSERT(env, (enable & 1) == (enable != 0)); - const bool toggle = force_whole || - ((enable ^ env->me_lck->mti_readahead_anchor) & 1) || - !env->me_lck->mti_readahead_anchor; - const pgno_t prev_edge = env->me_lck->mti_readahead_anchor >> 1; - const size_t limit = env->me_dxb_mmap.limit; - size_t offset = - toggle ? 0 - : pgno_align2os_bytes(env, (prev_edge < edge) ? prev_edge : edge); - offset = (offset < limit) ? offset : limit; +MDBX_dbi mdbx_cursor_dbi(const MDBX_cursor *mc) { + if (unlikely(!mc || mc->signature != cur_signature_live)) + return UINT_MAX; + return cursor_dbi(mc); +} - size_t length = - pgno_align2os_bytes(env, (prev_edge < edge) ? edge : prev_edge); - length = (length < limit) ? length : limit; - length -= offset; +/*----------------------------------------------------------------------------*/ - eASSERT(env, 0 <= (intptr_t)length); - if (length == 0) - return MDBX_SUCCESS; +int mdbx_cursor_put(MDBX_cursor *mc, const MDBX_val *key, MDBX_val *data, + MDBX_put_flags_t flags) { + if (unlikely(mc == nullptr || key == nullptr || data == nullptr)) + return MDBX_EINVAL; - NOTICE("readahead %s %u..%u", enable ? "ON" : "OFF", bytes2pgno(env, offset), - bytes2pgno(env, offset + length)); + if (unlikely(mc->signature != cur_signature_live)) + return (mc->signature == cur_signature_ready4dispose) ? MDBX_EINVAL + : MDBX_EBADSIGN; -#if defined(F_RDAHEAD) - if (toggle && unlikely(fcntl(env->me_lazy_fd, F_RDAHEAD, enable) == -1)) - return errno; -#endif /* F_RDAHEAD */ + int rc = check_txn_rw(mc->txn, MDBX_TXN_BLOCKED); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; - int err; - void *const ptr = ptr_disp(env->me_map, offset); - if (enable) { -#if defined(MADV_NORMAL) - err = - madvise(ptr, length, MADV_NORMAL) ? 
ignore_enosys(errno) : MDBX_SUCCESS; - if (unlikely(MDBX_IS_ERROR(err))) - return err; -#elif defined(POSIX_MADV_NORMAL) - err = ignore_enosys(posix_madvise(ptr, length, POSIX_MADV_NORMAL)); - if (unlikely(MDBX_IS_ERROR(err))) - return err; -#elif defined(POSIX_FADV_NORMAL) && defined(POSIX_FADV_WILLNEED) - err = ignore_enosys( - posix_fadvise(env->me_lazy_fd, offset, length, POSIX_FADV_NORMAL)); - if (unlikely(MDBX_IS_ERROR(err))) - return err; -#elif defined(_WIN32) || defined(_WIN64) - /* no madvise on Windows */ -#else -#warning "FIXME" -#endif - if (toggle) { - /* NOTE: Seems there is a bug in the Mach/Darwin/OSX kernel, - * because MADV_WILLNEED with offset != 0 may cause SIGBUS - * on following access to the hinted region. - * 19.6.0 Darwin Kernel Version 19.6.0: Tue Jan 12 22:13:05 PST 2021; - * root:xnu-6153.141.16~1/RELEASE_X86_64 x86_64 */ -#if defined(F_RDADVISE) - struct radvisory hint; - hint.ra_offset = offset; - hint.ra_count = - unlikely(length > INT_MAX && sizeof(length) > sizeof(hint.ra_count)) - ? INT_MAX - : (int)length; - (void)/* Ignore ENOTTY for DB on the ram-disk and so on */ fcntl( - env->me_lazy_fd, F_RDADVISE, &hint); -#elif defined(MADV_WILLNEED) - err = madvise(ptr, length, MADV_WILLNEED) ? 
ignore_enosys(errno) - : MDBX_SUCCESS; - if (unlikely(MDBX_IS_ERROR(err))) - return err; -#elif defined(POSIX_MADV_WILLNEED) - err = ignore_enosys(posix_madvise(ptr, length, POSIX_MADV_WILLNEED)); - if (unlikely(MDBX_IS_ERROR(err))) - return err; -#elif defined(_WIN32) || defined(_WIN64) - if (mdbx_PrefetchVirtualMemory) { - WIN32_MEMORY_RANGE_ENTRY hint; - hint.VirtualAddress = ptr; - hint.NumberOfBytes = length; - (void)mdbx_PrefetchVirtualMemory(GetCurrentProcess(), 1, &hint, 0); - } -#elif defined(POSIX_FADV_WILLNEED) - err = ignore_enosys( - posix_fadvise(env->me_lazy_fd, offset, length, POSIX_FADV_WILLNEED)); - if (unlikely(MDBX_IS_ERROR(err))) - return err; -#else -#warning "FIXME" -#endif - } - } else { - mincore_clean_cache(env); -#if defined(MADV_RANDOM) - err = - madvise(ptr, length, MADV_RANDOM) ? ignore_enosys(errno) : MDBX_SUCCESS; - if (unlikely(MDBX_IS_ERROR(err))) - return err; -#elif defined(POSIX_MADV_RANDOM) - err = ignore_enosys(posix_madvise(ptr, length, POSIX_MADV_RANDOM)); - if (unlikely(MDBX_IS_ERROR(err))) - return err; -#elif defined(POSIX_FADV_RANDOM) - err = ignore_enosys( - posix_fadvise(env->me_lazy_fd, offset, length, POSIX_FADV_RANDOM)); - if (unlikely(MDBX_IS_ERROR(err))) - return err; -#elif defined(_WIN32) || defined(_WIN64) - /* no madvise on Windows */ -#else -#warning "FIXME" -#endif /* MADV_RANDOM */ - } + if (unlikely(cursor_dbi_changed(mc))) + return MDBX_BAD_DBI; - env->me_lck->mti_readahead_anchor = (enable & 1) + (edge << 1); - err = MDBX_SUCCESS; - return err; -} -#endif /* MDBX_ENABLE_MADVISE */ + cASSERT(mc, cursor_is_tracked(mc)); -__cold static void update_mlcnt(const MDBX_env *env, - const pgno_t new_aligned_mlocked_pgno, - const bool lock_not_release) { - for (;;) { - const pgno_t mlock_pgno_before = - atomic_load32(&env->me_mlocked_pgno, mo_AcquireRelease); - eASSERT(env, - pgno_align2os_pgno(env, mlock_pgno_before) == mlock_pgno_before); - eASSERT(env, pgno_align2os_pgno(env, new_aligned_mlocked_pgno) == - 
new_aligned_mlocked_pgno); - if (lock_not_release ? (mlock_pgno_before >= new_aligned_mlocked_pgno) - : (mlock_pgno_before <= new_aligned_mlocked_pgno)) - break; - if (likely(atomic_cas32(&((MDBX_env *)env)->me_mlocked_pgno, - mlock_pgno_before, new_aligned_mlocked_pgno))) - for (;;) { - MDBX_atomic_uint32_t *const mlcnt = env->me_lck->mti_mlcnt; - const int32_t snap_locked = atomic_load32(mlcnt + 0, mo_Relaxed); - const int32_t snap_unlocked = atomic_load32(mlcnt + 1, mo_Relaxed); - if (mlock_pgno_before == 0 && (snap_locked - snap_unlocked) < INT_MAX) { - eASSERT(env, lock_not_release); - if (unlikely(!atomic_cas32(mlcnt + 0, snap_locked, snap_locked + 1))) - continue; - } - if (new_aligned_mlocked_pgno == 0 && - (snap_locked - snap_unlocked) > 0) { - eASSERT(env, !lock_not_release); - if (unlikely( - !atomic_cas32(mlcnt + 1, snap_unlocked, snap_unlocked + 1))) - continue; - } - NOTICE("%s-pages %u..%u, mlocked-process(es) %u -> %u", - lock_not_release ? "lock" : "unlock", - lock_not_release ? mlock_pgno_before : new_aligned_mlocked_pgno, - lock_not_release ? new_aligned_mlocked_pgno : mlock_pgno_before, - snap_locked - snap_unlocked, - atomic_load32(mlcnt + 0, mo_Relaxed) - - atomic_load32(mlcnt + 1, mo_Relaxed)); - return; - } + /* Check this first so counter will always be zero on any early failures. 
*/ + if (unlikely(flags & MDBX_MULTIPLE)) { + if (unlikely(flags & MDBX_RESERVE)) + return MDBX_EINVAL; + if (unlikely(!(mc->tree->flags & MDBX_DUPFIXED))) + return MDBX_INCOMPATIBLE; + const size_t dcount = data[1].iov_len; + if (unlikely(dcount < 2 || data->iov_len == 0)) + return MDBX_BAD_VALSIZE; + if (unlikely(mc->tree->dupfix_size != data->iov_len) && + mc->tree->dupfix_size) + return MDBX_BAD_VALSIZE; + if (unlikely(dcount > + MAX_MAPSIZE / 2 / + (BRANCH_NODE_MAX(MDBX_MAX_PAGESIZE) - NODESIZE))) { + /* checking for multiplication overflow */ + if (unlikely(dcount > MAX_MAPSIZE / 2 / data->iov_len)) + return MDBX_TOO_LARGE; + } } -} -__cold static void munlock_after(const MDBX_env *env, const pgno_t aligned_pgno, - const size_t end_bytes) { - if (atomic_load32(&env->me_mlocked_pgno, mo_AcquireRelease) > aligned_pgno) { - int err = MDBX_ENOSYS; - const size_t munlock_begin = pgno2bytes(env, aligned_pgno); - const size_t munlock_size = end_bytes - munlock_begin; - eASSERT(env, end_bytes % env->me_os_psize == 0 && - munlock_begin % env->me_os_psize == 0 && - munlock_size % env->me_os_psize == 0); -#if defined(_WIN32) || defined(_WIN64) - err = VirtualUnlock(ptr_disp(env->me_map, munlock_begin), munlock_size) - ? MDBX_SUCCESS - : (int)GetLastError(); - if (err == ERROR_NOT_LOCKED) - err = MDBX_SUCCESS; -#elif defined(_POSIX_MEMLOCK_RANGE) - err = munlock(ptr_disp(env->me_map, munlock_begin), munlock_size) - ? 
errno - : MDBX_SUCCESS; -#endif - if (likely(err == MDBX_SUCCESS)) - update_mlcnt(env, aligned_pgno, false); - else { -#if defined(_WIN32) || defined(_WIN64) - WARNING("VirtualUnlock(%zu, %zu) error %d", munlock_begin, munlock_size, - err); -#else - WARNING("munlock(%zu, %zu) error %d", munlock_begin, munlock_size, err); -#endif - } + if (flags & MDBX_RESERVE) { + if (unlikely(mc->tree->flags & (MDBX_DUPSORT | MDBX_REVERSEDUP | + MDBX_INTEGERDUP | MDBX_DUPFIXED))) + return MDBX_INCOMPATIBLE; + data->iov_base = nullptr; } -} -__cold static void munlock_all(const MDBX_env *env) { - munlock_after(env, 0, bytes_align2os_bytes(env, env->me_dxb_mmap.current)); -} + if (unlikely(mc->txn->flags & (MDBX_TXN_RDONLY | MDBX_TXN_BLOCKED))) + return (mc->txn->flags & MDBX_TXN_RDONLY) ? MDBX_EACCESS : MDBX_BAD_TXN; -__cold static unsigned default_rp_augment_limit(const MDBX_env *env) { - const size_t timeframe = /* 16 секунд */ 16 << 16; - const size_t remain_1sec = - (env->me_options.gc_time_limit < timeframe) - ? timeframe - (size_t)env->me_options.gc_time_limit - : 0; - const size_t minimum = (env->me_maxgc_ov1page * 2 > MDBX_PNL_INITIAL) - ? env->me_maxgc_ov1page * 2 - : MDBX_PNL_INITIAL; - const size_t one_third = env->me_dbgeo.now / 3 >> env->me_psize2log; - const size_t augment_limit = - (one_third > minimum) - ? 
minimum + (one_third - minimum) / timeframe * remain_1sec - : minimum; - eASSERT(env, augment_limit < MDBX_PGL_LIMIT); - return pnl_bytes2size(pnl_size2bytes(augment_limit)); + return cursor_put_checklen(mc, key, data, flags); } -static bool default_prefault_write(const MDBX_env *env) { - return !MDBX_MMAP_INCOHERENT_FILE_WRITE && !env->me_incore && - (env->me_flags & (MDBX_WRITEMAP | MDBX_RDONLY)) == MDBX_WRITEMAP; -} +int mdbx_cursor_del(MDBX_cursor *mc, MDBX_put_flags_t flags) { + if (unlikely(!mc)) + return MDBX_EINVAL; -static void adjust_defaults(MDBX_env *env) { - if (!env->me_options.flags.non_auto.rp_augment_limit) - env->me_options.rp_augment_limit = default_rp_augment_limit(env); - if (!env->me_options.flags.non_auto.prefault_write) - env->me_options.prefault_write = default_prefault_write(env); + if (unlikely(mc->signature != cur_signature_live)) + return (mc->signature == cur_signature_ready4dispose) ? MDBX_EINVAL + : MDBX_EBADSIGN; - const size_t basis = env->me_dbgeo.now; - /* TODO: use options? */ - const unsigned factor = 9; - size_t threshold = (basis < ((size_t)65536 << factor)) - ? 65536 /* minimal threshold */ - : (basis > (MEGABYTE * 4 << factor)) - ? MEGABYTE * 4 /* maximal threshold */ - : basis >> factor; - threshold = (threshold < env->me_dbgeo.shrink || !env->me_dbgeo.shrink) - ? 
threshold - : env->me_dbgeo.shrink; + int rc = check_txn_rw(mc->txn, MDBX_TXN_BLOCKED); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; + + if (unlikely(cursor_dbi_changed(mc))) + return MDBX_BAD_DBI; - env->me_madv_threshold = - bytes2pgno(env, bytes_align2os_bytes(env, threshold)); + return cursor_del(mc, flags); } -enum resize_mode { implicit_grow, impilict_shrink, explicit_resize }; +__cold int mdbx_cursor_ignord(MDBX_cursor *mc) { + if (unlikely(!mc)) + return MDBX_EINVAL; -__cold static int dxb_resize(MDBX_env *const env, const pgno_t used_pgno, - const pgno_t size_pgno, pgno_t limit_pgno, - const enum resize_mode mode) { - /* Acquire guard to avoid collision between read and write txns - * around me_dbgeo and me_dxb_mmap */ -#if defined(_WIN32) || defined(_WIN64) - osal_srwlock_AcquireExclusive(&env->me_remap_guard); - int rc = MDBX_SUCCESS; - mdbx_handle_array_t *suspended = NULL; - mdbx_handle_array_t array_onstack; -#else - int rc = osal_fastmutex_acquire(&env->me_remap_guard); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; -#endif + if (unlikely(mc->signature != cur_signature_live)) + return (mc->signature == cur_signature_ready4dispose) ? MDBX_EINVAL + : MDBX_EBADSIGN; - const size_t prev_size = env->me_dxb_mmap.current; - const size_t prev_limit = env->me_dxb_mmap.limit; - const pgno_t prev_limit_pgno = bytes2pgno(env, prev_limit); - eASSERT(env, limit_pgno >= size_pgno); - eASSERT(env, size_pgno >= used_pgno); - if (mode < explicit_resize && size_pgno <= prev_limit_pgno) { - /* The actual mapsize may be less since the geo.upper may be changed - * by other process. Avoids remapping until it necessary. 
*/ - limit_pgno = prev_limit_pgno; - } - const size_t limit_bytes = pgno_align2os_bytes(env, limit_pgno); - const size_t size_bytes = pgno_align2os_bytes(env, size_pgno); -#if MDBX_ENABLE_MADVISE || defined(ENABLE_MEMCHECK) - const void *const prev_map = env->me_dxb_mmap.base; -#endif /* MDBX_ENABLE_MADVISE || ENABLE_MEMCHECK */ + mc->checking |= z_ignord; + if (mc->subcur) + mc->subcur->cursor.checking |= z_ignord; - VERBOSE("resize/%d datafile/mapping: " - "present %" PRIuPTR " -> %" PRIuPTR ", " - "limit %" PRIuPTR " -> %" PRIuPTR, - mode, prev_size, size_bytes, prev_limit, limit_bytes); + return MDBX_SUCCESS; +} +/// \copyright SPDX-License-Identifier: Apache-2.0 +/// \author Леонид Юрьев aka Leonid Yuriev \date 2015-2024 - eASSERT(env, limit_bytes >= size_bytes); - eASSERT(env, bytes2pgno(env, size_bytes) >= size_pgno); - eASSERT(env, bytes2pgno(env, limit_bytes) >= limit_pgno); - unsigned mresize_flags = - env->me_flags & (MDBX_RDONLY | MDBX_WRITEMAP | MDBX_UTTERLY_NOSYNC); - if (mode >= impilict_shrink) - mresize_flags |= MDBX_SHRINK_ALLOWED; +__cold static intptr_t reasonable_db_maxsize(void) { + static intptr_t cached_result; + if (cached_result == 0) { + intptr_t pagesize, total_ram_pages; + if (unlikely(mdbx_get_sysraminfo(&pagesize, &total_ram_pages, nullptr) != + MDBX_SUCCESS)) + /* the 32-bit limit is good enough for fallback */ + return cached_result = MAX_MAPSIZE32; - if (limit_bytes == env->me_dxb_mmap.limit && - size_bytes == env->me_dxb_mmap.current && - size_bytes == env->me_dxb_mmap.filesize) - goto bailout; + if (unlikely((size_t)total_ram_pages * 2 > MAX_MAPSIZE / (size_t)pagesize)) + return cached_result = MAX_MAPSIZE; + assert(MAX_MAPSIZE >= (size_t)(total_ram_pages * pagesize * 2)); - /* При использовании MDBX_NOSTICKYTHREADS с транзакциями могут работать любые - * потоки и у нас нет информации о том, какие именно. Поэтому нет возможности - * выполнить remap-действия требующие приостановки работающих с БД потоков. 
*/ - if ((env->me_flags & MDBX_NOSTICKYTHREADS) == 0) { -#if defined(_WIN32) || defined(_WIN64) - if ((size_bytes < env->me_dxb_mmap.current && mode > implicit_grow) || - limit_bytes != env->me_dxb_mmap.limit) { - /* 1) Windows allows only extending a read-write section, but not a - * corresponding mapped view. Therefore in other cases we must suspend - * the local threads for safe remap. - * 2) At least on Windows 10 1803 the entire mapped section is unavailable - * for short time during NtExtendSection() or VirtualAlloc() execution. - * 3) Under Wine runtime environment on Linux a section extending is not - * supported. - * - * THEREFORE LOCAL THREADS SUSPENDING IS ALWAYS REQUIRED! */ - array_onstack.limit = ARRAY_LENGTH(array_onstack.handles); - array_onstack.count = 0; - suspended = &array_onstack; - rc = osal_suspend_threads_before_remap(env, &suspended); - if (rc != MDBX_SUCCESS) { - ERROR("failed suspend-for-remap: errcode %d", rc); - goto bailout; - } - mresize_flags |= (mode < explicit_resize) - ? MDBX_MRESIZE_MAY_UNMAP - : MDBX_MRESIZE_MAY_UNMAP | MDBX_MRESIZE_MAY_MOVE; - } -#else /* Windows */ - MDBX_lockinfo *const lck = env->me_lck_mmap.lck; - if (mode == explicit_resize && limit_bytes != env->me_dxb_mmap.limit) { - mresize_flags |= MDBX_MRESIZE_MAY_UNMAP | MDBX_MRESIZE_MAY_MOVE; - if (lck) { - int err = osal_rdt_lock(env) /* lock readers table until remap done */; - if (unlikely(MDBX_IS_ERROR(err))) { - rc = err; - goto bailout; - } + /* Suggesting should not be more than golden ratio of the size of RAM. 
*/ + cached_result = (intptr_t)((size_t)total_ram_pages * 207 >> 7) * pagesize; - /* looking for readers from this process */ - const size_t snap_nreaders = - atomic_load32(&lck->mti_numreaders, mo_AcquireRelease); - eASSERT(env, mode == explicit_resize); - for (size_t i = 0; i < snap_nreaders; ++i) { - if (lck->mti_readers[i].mr_pid.weak == env->me_pid && - lck->mti_readers[i].mr_tid.weak != osal_thread_self()) { - /* the base address of the mapping can't be changed since - * the other reader thread from this process exists. */ - osal_rdt_unlock(env); - mresize_flags &= ~(MDBX_MRESIZE_MAY_UNMAP | MDBX_MRESIZE_MAY_MOVE); - break; - } - } - } + /* Round to the nearest human-readable granulation. */ + for (size_t unit = MEGABYTE; unit; unit <<= 5) { + const size_t floor = floor_powerof2(cached_result, unit); + const size_t ceil = ceil_powerof2(cached_result, unit); + const size_t threshold = (size_t)cached_result >> 4; + const bool down = + cached_result - floor < ceil - cached_result || ceil > MAX_MAPSIZE; + if (threshold < (down ? cached_result - floor : ceil - cached_result)) + break; + cached_result = down ? floor : ceil; } -#endif /* ! Windows */ } + return cached_result; +} - const pgno_t aligned_munlock_pgno = - (mresize_flags & (MDBX_MRESIZE_MAY_UNMAP | MDBX_MRESIZE_MAY_MOVE)) - ? 
0 - : bytes2pgno(env, size_bytes); - if (mresize_flags & (MDBX_MRESIZE_MAY_UNMAP | MDBX_MRESIZE_MAY_MOVE)) { - mincore_clean_cache(env); - if ((env->me_flags & MDBX_WRITEMAP) && - env->me_lck->mti_unsynced_pages.weak) { -#if MDBX_ENABLE_PGOP_STAT - env->me_lck->mti_pgop_stat.msync.weak += 1; -#endif /* MDBX_ENABLE_PGOP_STAT */ - rc = osal_msync(&env->me_dxb_mmap, 0, pgno_align2os_bytes(env, used_pgno), - MDBX_SYNC_NONE); - if (unlikely(rc != MDBX_SUCCESS)) - goto bailout; - } +__cold static int check_alternative_lck_absent(const pathchar_t *lck_pathname) { + int err = osal_fileexists(lck_pathname); + if (unlikely(err != MDBX_RESULT_FALSE)) { + if (err == MDBX_RESULT_TRUE) + err = MDBX_DUPLICATED_CLK; + ERROR("Alternative/Duplicate LCK-file '%" MDBX_PRIsPATH "' error %d", + lck_pathname, err); } - munlock_after(env, aligned_munlock_pgno, size_bytes); + return err; +} -#if MDBX_ENABLE_MADVISE - if (size_bytes < prev_size && mode > implicit_grow) { - NOTICE("resize-MADV_%s %u..%u", - (env->me_flags & MDBX_WRITEMAP) ? "REMOVE" : "DONTNEED", size_pgno, - bytes2pgno(env, prev_size)); - const uint32_t munlocks_before = - atomic_load32(&env->me_lck->mti_mlcnt[1], mo_Relaxed); - rc = MDBX_RESULT_TRUE; -#if defined(MADV_REMOVE) - if (env->me_flags & MDBX_WRITEMAP) - rc = madvise(ptr_disp(env->me_map, size_bytes), prev_size - size_bytes, - MADV_REMOVE) - ? ignore_enosys(errno) - : MDBX_SUCCESS; -#endif /* MADV_REMOVE */ -#if defined(MADV_DONTNEED) - if (rc == MDBX_RESULT_TRUE) - rc = madvise(ptr_disp(env->me_map, size_bytes), prev_size - size_bytes, - MADV_DONTNEED) - ? 
ignore_enosys(errno) - : MDBX_SUCCESS; -#elif defined(POSIX_MADV_DONTNEED) - if (rc == MDBX_RESULT_TRUE) - rc = ignore_enosys(posix_madvise(ptr_disp(env->me_map, size_bytes), - prev_size - size_bytes, - POSIX_MADV_DONTNEED)); -#elif defined(POSIX_FADV_DONTNEED) - if (rc == MDBX_RESULT_TRUE) - rc = ignore_enosys(posix_fadvise(env->me_lazy_fd, size_bytes, - prev_size - size_bytes, - POSIX_FADV_DONTNEED)); -#endif /* MADV_DONTNEED */ - if (unlikely(MDBX_IS_ERROR(rc))) { - const uint32_t mlocks_after = - atomic_load32(&env->me_lck->mti_mlcnt[0], mo_Relaxed); - if (rc == MDBX_EINVAL) { - const int severity = - (mlocks_after - munlocks_before) ? MDBX_LOG_NOTICE : MDBX_LOG_WARN; - if (LOG_ENABLED(severity)) - debug_log(severity, __func__, __LINE__, - "%s-madvise: ignore EINVAL (%d) since some pages maybe " - "locked (%u/%u mlcnt-processes)", - "resize", rc, mlocks_after, munlocks_before); - } else { - ERROR("%s-madvise(%s, %zu, +%zu), %u/%u mlcnt-processes, err %d", - "mresize", "DONTNEED", size_bytes, prev_size - size_bytes, - mlocks_after, munlocks_before, rc); - goto bailout; - } - } else - env->me_lck->mti_discarded_tail.weak = size_pgno; - } -#endif /* MDBX_ENABLE_MADVISE */ - - rc = osal_mresize(mresize_flags, &env->me_dxb_mmap, size_bytes, limit_bytes); - eASSERT(env, env->me_dxb_mmap.limit >= env->me_dxb_mmap.current); +__cold static int env_handle_pathname(MDBX_env *env, const pathchar_t *pathname, + const mdbx_mode_t mode) { + memset(&env->pathname, 0, sizeof(env->pathname)); + if (unlikely(!pathname || !*pathname)) + return MDBX_EINVAL; -#if MDBX_ENABLE_MADVISE - if (rc == MDBX_SUCCESS) { - eASSERT(env, limit_bytes == env->me_dxb_mmap.limit); - eASSERT(env, size_bytes <= env->me_dxb_mmap.filesize); - if (mode == explicit_resize) - eASSERT(env, size_bytes == env->me_dxb_mmap.current); - else - eASSERT(env, size_bytes <= env->me_dxb_mmap.current); - env->me_lck->mti_discarded_tail.weak = size_pgno; - const bool readahead = - !(env->me_flags & MDBX_NORDAHEAD) && - 
mdbx_is_readahead_reasonable(size_bytes, -(intptr_t)prev_size); - const bool force = limit_bytes != prev_limit || - env->me_dxb_mmap.base != prev_map + int rc; #if defined(_WIN32) || defined(_WIN64) - || prev_size > size_bytes -#endif /* Windows */ - ; - rc = set_readahead(env, size_pgno, readahead, force); - } -#endif /* MDBX_ENABLE_MADVISE */ + const DWORD dwAttrib = GetFileAttributesW(pathname); + if (dwAttrib == INVALID_FILE_ATTRIBUTES) { + rc = GetLastError(); + if (rc != MDBX_ENOFILE) + return rc; + if (mode == 0 || (env->flags & MDBX_RDONLY) != 0) + /* can't open existing */ + return rc; -bailout: - if (rc == MDBX_SUCCESS) { - eASSERT(env, env->me_dxb_mmap.limit >= env->me_dxb_mmap.current); - eASSERT(env, limit_bytes == env->me_dxb_mmap.limit); - eASSERT(env, size_bytes <= env->me_dxb_mmap.filesize); - if (mode == explicit_resize) - eASSERT(env, size_bytes == env->me_dxb_mmap.current); - else - eASSERT(env, size_bytes <= env->me_dxb_mmap.current); - /* update env-geo to avoid influences */ - env->me_dbgeo.now = env->me_dxb_mmap.current; - env->me_dbgeo.upper = env->me_dxb_mmap.limit; - adjust_defaults(env); -#ifdef ENABLE_MEMCHECK - if (prev_limit != env->me_dxb_mmap.limit || prev_map != env->me_map) { - VALGRIND_DISCARD(env->me_valgrind_handle); - env->me_valgrind_handle = 0; - if (env->me_dxb_mmap.limit) - env->me_valgrind_handle = - VALGRIND_CREATE_BLOCK(env->me_map, env->me_dxb_mmap.limit, "mdbx"); + /* auto-create directory if requested */ + if ((env->flags & MDBX_NOSUBDIR) == 0 && + !CreateDirectoryW(pathname, nullptr)) { + rc = GetLastError(); + if (rc != ERROR_ALREADY_EXISTS) + return rc; } -#endif /* ENABLE_MEMCHECK */ } else { - if (rc != MDBX_UNABLE_EXTEND_MAPSIZE && rc != MDBX_EPERM) { - ERROR("failed resize datafile/mapping: " - "present %" PRIuPTR " -> %" PRIuPTR ", " - "limit %" PRIuPTR " -> %" PRIuPTR ", errcode %d", - prev_size, size_bytes, prev_limit, limit_bytes, rc); - } else { - WARNING("unable resize datafile/mapping: " - "present %" 
PRIuPTR " -> %" PRIuPTR ", " - "limit %" PRIuPTR " -> %" PRIuPTR ", errcode %d", - prev_size, size_bytes, prev_limit, limit_bytes, rc); - eASSERT(env, env->me_dxb_mmap.limit >= env->me_dxb_mmap.current); - } - if (!env->me_dxb_mmap.base) { - env->me_flags |= MDBX_FATAL_ERROR; - if (env->me_txn) - env->me_txn->mt_flags |= MDBX_TXN_ERROR; - rc = MDBX_PANIC; + /* ignore passed MDBX_NOSUBDIR flag and set it automatically */ + env->flags |= MDBX_NOSUBDIR; + if (dwAttrib & FILE_ATTRIBUTE_DIRECTORY) + env->flags -= MDBX_NOSUBDIR; + } +#else + struct stat st; + if (stat(pathname, &st) != 0) { + rc = errno; + if (rc != MDBX_ENOFILE) + return rc; + if (mode == 0 || (env->flags & MDBX_RDONLY) != 0) + /* can't open non-existing */ + return rc /* MDBX_ENOFILE */; + + /* auto-create directory if requested */ + const mdbx_mode_t dir_mode = + (/* inherit read/write permissions for group and others */ mode & + (S_IRGRP | S_IWGRP | S_IROTH | S_IWOTH)) | + /* always add read/write/search for owner */ S_IRWXU | + ((mode & S_IRGRP) ? /* +search if readable by group */ S_IXGRP : 0) | + ((mode & S_IROTH) ? 
/* +search if readable by others */ S_IXOTH : 0); + if ((env->flags & MDBX_NOSUBDIR) == 0 && mkdir(pathname, dir_mode)) { + rc = errno; + if (rc != EEXIST) + return rc; } + } else { + /* ignore passed MDBX_NOSUBDIR flag and set it automatically */ + env->flags |= MDBX_NOSUBDIR; + if (S_ISDIR(st.st_mode)) + env->flags -= MDBX_NOSUBDIR; } +#endif + + static const pathchar_t dxb_name[] = MDBX_DATANAME; + static const pathchar_t lck_name[] = MDBX_LOCKNAME; + static const pathchar_t lock_suffix[] = MDBX_LOCK_SUFFIX; #if defined(_WIN32) || defined(_WIN64) - int err = MDBX_SUCCESS; - osal_srwlock_ReleaseExclusive(&env->me_remap_guard); - if (suspended) { - err = osal_resume_threads_after_remap(suspended); - if (suspended != &array_onstack) - osal_free(suspended); - } + assert(dxb_name[0] == '\\' && lck_name[0] == '\\'); + const size_t pathname_len = wcslen(pathname); #else - if (env->me_lck_mmap.lck && - (mresize_flags & (MDBX_MRESIZE_MAY_UNMAP | MDBX_MRESIZE_MAY_MOVE)) != 0) - osal_rdt_unlock(env); - int err = osal_fastmutex_release(&env->me_remap_guard); -#endif /* Windows */ - if (err != MDBX_SUCCESS) { - FATAL("failed resume-after-remap: errcode %d", err); - return MDBX_PANIC; + assert(dxb_name[0] == '/' && lck_name[0] == '/'); + const size_t pathname_len = strlen(pathname); +#endif + assert(!osal_isdirsep(lock_suffix[0])); + size_t base_len = pathname_len; + static const size_t dxb_name_len = ARRAY_LENGTH(dxb_name) - 1; + if (env->flags & MDBX_NOSUBDIR) { + if (base_len > dxb_name_len && + osal_pathequal(pathname + base_len - dxb_name_len, dxb_name, + dxb_name_len)) { + env->flags -= MDBX_NOSUBDIR; + base_len -= dxb_name_len; + } else if (base_len == dxb_name_len - 1 && osal_isdirsep(dxb_name[0]) && + osal_isdirsep(lck_name[0]) && + osal_pathequal(pathname + base_len - dxb_name_len + 1, + dxb_name + 1, dxb_name_len - 1)) { + env->flags -= MDBX_NOSUBDIR; + base_len -= dxb_name_len - 1; + } + } + + const size_t suflen_with_NOSUBDIR = sizeof(lock_suffix) + 
sizeof(pathchar_t); + const size_t suflen_without_NOSUBDIR = sizeof(lck_name) + sizeof(dxb_name); + const size_t enough4any = (suflen_with_NOSUBDIR > suflen_without_NOSUBDIR) + ? suflen_with_NOSUBDIR + : suflen_without_NOSUBDIR; + const size_t bytes_needed = + sizeof(pathchar_t) * (base_len * 2 + pathname_len + 1) + enough4any; + env->pathname.buffer = osal_malloc(bytes_needed); + if (!env->pathname.buffer) + return MDBX_ENOMEM; + + env->pathname.specified = env->pathname.buffer; + env->pathname.dxb = env->pathname.specified + pathname_len + 1; + env->pathname.lck = env->pathname.dxb + base_len + dxb_name_len + 1; + rc = MDBX_SUCCESS; + pathchar_t *const buf = env->pathname.buffer; + if (base_len) { + memcpy(buf, pathname, sizeof(pathchar_t) * pathname_len); + if (env->flags & MDBX_NOSUBDIR) { + const pathchar_t *const lck_ext = + osal_fileext(lck_name, ARRAY_LENGTH(lck_name)); + if (lck_ext) { + pathchar_t *pathname_ext = osal_fileext(buf, pathname_len); + memcpy(pathname_ext ? pathname_ext : buf + pathname_len, lck_ext, + sizeof(pathchar_t) * (ARRAY_END(lck_name) - lck_ext)); + rc = check_alternative_lck_absent(buf); + } + } else { + memcpy(buf + base_len, dxb_name, sizeof(dxb_name)); + memcpy(buf + base_len + dxb_name_len, lock_suffix, sizeof(lock_suffix)); + rc = check_alternative_lck_absent(buf); + } + + memcpy(env->pathname.dxb, pathname, sizeof(pathchar_t) * (base_len + 1)); + memcpy(env->pathname.lck, pathname, sizeof(pathchar_t) * base_len); + if (env->flags & MDBX_NOSUBDIR) { + memcpy(env->pathname.lck + base_len, lock_suffix, sizeof(lock_suffix)); + } else { + memcpy(env->pathname.dxb + base_len, dxb_name, sizeof(dxb_name)); + memcpy(env->pathname.lck + base_len, lck_name, sizeof(lck_name)); + } + } else { + assert(!(env->flags & MDBX_NOSUBDIR)); + memcpy(buf, dxb_name + 1, sizeof(dxb_name) - sizeof(pathchar_t)); + memcpy(buf + dxb_name_len - 1, lock_suffix, sizeof(lock_suffix)); + rc = check_alternative_lck_absent(buf); + + memcpy(env->pathname.dxb, 
dxb_name + 1, + sizeof(dxb_name) - sizeof(pathchar_t)); + memcpy(env->pathname.lck, lck_name + 1, + sizeof(lck_name) - sizeof(pathchar_t)); } + + memcpy(env->pathname.specified, pathname, + sizeof(pathchar_t) * (pathname_len + 1)); return rc; } -static int meta_unsteady(int err, MDBX_env *env, const txnid_t early_than, - const pgno_t pgno) { - MDBX_meta *const meta = METAPAGE(env, pgno); - const txnid_t txnid = constmeta_txnid(meta); - if (unlikely(err != MDBX_SUCCESS) || !META_IS_STEADY(meta) || - !(txnid < early_than)) - return err; +/*----------------------------------------------------------------------------*/ - WARNING("wipe txn #%" PRIaTXN ", meta %" PRIaPGNO, txnid, pgno); - const uint64_t wipe = MDBX_DATASIGN_NONE; - const void *ptr = &wipe; - size_t bytes = sizeof(meta->mm_sign), - offset = ptr_dist(&meta->mm_sign, env->me_map); - if (env->me_flags & MDBX_WRITEMAP) { - unaligned_poke_u64(4, meta->mm_sign, wipe); - osal_flush_incoherent_cpu_writeback(); - if (!MDBX_AVOID_MSYNC) { - err = - osal_msync(&env->me_dxb_mmap, 0, pgno_align2os_bytes(env, NUM_METAS), - MDBX_SYNC_DATA | MDBX_SYNC_IODQ); -#if MDBX_ENABLE_PGOP_STAT - env->me_lck->mti_pgop_stat.msync.weak += 1; -#endif /* MDBX_ENABLE_PGOP_STAT */ - return err; - } - ptr = data_page(meta); - offset = ptr_dist(ptr, env->me_map); - bytes = env->me_psize; +__cold int mdbx_env_create(MDBX_env **penv) { + if (unlikely(!penv)) + return MDBX_EINVAL; + *penv = nullptr; + +#ifdef MDBX_HAVE_C11ATOMICS + if (unlikely(!atomic_is_lock_free((const volatile uint32_t *)penv))) { + ERROR("lock-free atomic ops for %u-bit types is required", 32); + return MDBX_INCOMPATIBLE; + } +#if MDBX_64BIT_ATOMIC + if (unlikely(!atomic_is_lock_free((const volatile uint64_t *)penv))) { + ERROR("lock-free atomic ops for %u-bit types is required", 64); + return MDBX_INCOMPATIBLE; } +#endif /* MDBX_64BIT_ATOMIC */ +#endif /* MDBX_HAVE_C11ATOMICS */ -#if MDBX_ENABLE_PGOP_STAT - env->me_lck->mti_pgop_stat.wops.weak += 1; -#endif /* 
MDBX_ENABLE_PGOP_STAT */ - err = osal_pwrite(env->me_fd4meta, ptr, bytes, offset); - if (likely(err == MDBX_SUCCESS) && env->me_fd4meta == env->me_lazy_fd) { - err = osal_fsync(env->me_lazy_fd, MDBX_SYNC_DATA | MDBX_SYNC_IODQ); -#if MDBX_ENABLE_PGOP_STAT - env->me_lck->mti_pgop_stat.fsync.weak += 1; -#endif /* MDBX_ENABLE_PGOP_STAT */ + if (unlikely(!is_powerof2(globals.sys_pagesize) || + globals.sys_pagesize < MDBX_MIN_PAGESIZE)) { + ERROR("unsuitable system pagesize %u", globals.sys_pagesize); + return MDBX_INCOMPATIBLE; } - return err; -} -__cold static int wipe_steady(MDBX_txn *txn, txnid_t last_steady) { - MDBX_env *const env = txn->mt_env; - int err = MDBX_SUCCESS; +#if defined(__linux__) || defined(__gnu_linux__) + if (unlikely(globals.linux_kernel_version < 0x04000000)) { + /* 2022-09-01: Прошло уже больше двух после окончания какой-либо поддержки + * самого "долгоиграющего" ядра 3.16.85 ветки 3.x */ + ERROR("too old linux kernel %u.%u.%u.%u, the >= 4.0.0 is required", + globals.linux_kernel_version >> 24, + (globals.linux_kernel_version >> 16) & 255, + (globals.linux_kernel_version >> 8) & 255, + globals.linux_kernel_version & 255); + return MDBX_INCOMPATIBLE; + } +#endif /* Linux */ - /* early than last_steady */ - err = meta_unsteady(err, env, last_steady, 0); - err = meta_unsteady(err, env, last_steady, 1); - err = meta_unsteady(err, env, last_steady, 2); + MDBX_env *env = osal_calloc(1, sizeof(MDBX_env)); + if (unlikely(!env)) + return MDBX_ENOMEM; - /* the last_steady */ - err = meta_unsteady(err, env, last_steady + 1, 0); - err = meta_unsteady(err, env, last_steady + 1, 1); - err = meta_unsteady(err, env, last_steady + 1, 2); + env->max_readers = DEFAULT_READERS; + env->max_dbi = env->n_dbi = CORE_DBS; + env->lazy_fd = env->dsync_fd = env->fd4meta = env->lck_mmap.fd = + INVALID_HANDLE_VALUE; + env->stuck_meta = -1; - osal_flush_incoherent_mmap(env->me_map, pgno2bytes(env, NUM_METAS), - env->me_os_psize); + env_options_init(env); + 
env_setup_pagesize(env, (globals.sys_pagesize < MDBX_MAX_PAGESIZE) + ? globals.sys_pagesize + : MDBX_MAX_PAGESIZE); - /* force oldest refresh */ - atomic_store32(&env->me_lck->mti_readers_refresh_flag, true, mo_Relaxed); + int rc = osal_fastmutex_init(&env->dbi_lock); + if (unlikely(rc != MDBX_SUCCESS)) + goto bailout; - tASSERT(txn, (txn->mt_flags & MDBX_TXN_RDONLY) == 0); - txn->tw.troika = meta_tap(env); - for (MDBX_txn *scan = txn->mt_env->me_txn0; scan; scan = scan->mt_child) - if (scan != txn) - scan->tw.troika = txn->tw.troika; - return err; -} +#if defined(_WIN32) || defined(_WIN64) + imports.srwl_Init(&env->remap_guard); + InitializeCriticalSection(&env->windowsbug_lock); +#else + rc = osal_fastmutex_init(&env->remap_guard); + if (unlikely(rc != MDBX_SUCCESS)) { + osal_fastmutex_destroy(&env->dbi_lock); + goto bailout; + } -//------------------------------------------------------------------------------ +#if MDBX_LOCKING > MDBX_LOCKING_SYSV + lck_t *const stub = lckless_stub(env); + rc = lck_ipclock_stubinit(&stub->wrt_lock); +#endif /* MDBX_LOCKING */ + if (unlikely(rc != MDBX_SUCCESS)) { + osal_fastmutex_destroy(&env->remap_guard); + osal_fastmutex_destroy(&env->dbi_lock); + goto bailout; + } +#endif /* Windows */ -MDBX_MAYBE_UNUSED __hot static pgno_t * -scan4seq_fallback(pgno_t *range, const size_t len, const size_t seq) { - assert(seq > 0 && len > seq); -#if MDBX_PNL_ASCENDING - assert(range[-1] == len); - const pgno_t *const detent = range + len - seq; - const ptrdiff_t offset = (ptrdiff_t)seq; - const pgno_t target = (pgno_t)offset; - if (likely(len > seq + 3)) { - do { - const pgno_t diff0 = range[offset + 0] - range[0]; - const pgno_t diff1 = range[offset + 1] - range[1]; - const pgno_t diff2 = range[offset + 2] - range[2]; - const pgno_t diff3 = range[offset + 3] - range[3]; - if (diff0 == target) - return range + 0; - if (diff1 == target) - return range + 1; - if (diff2 == target) - return range + 2; - if (diff3 == target) - return range + 3; - 
range += 4; - } while (range + 3 < detent); - if (range == detent) - return nullptr; - } - do - if (range[offset] - *range == target) - return range; - while (++range < detent); -#else - assert(range[-(ptrdiff_t)len] == len); - const pgno_t *const detent = range - len + seq; - const ptrdiff_t offset = -(ptrdiff_t)seq; - const pgno_t target = (pgno_t)offset; - if (likely(len > seq + 3)) { - do { - const pgno_t diff0 = range[-0] - range[offset - 0]; - const pgno_t diff1 = range[-1] - range[offset - 1]; - const pgno_t diff2 = range[-2] - range[offset - 2]; - const pgno_t diff3 = range[-3] - range[offset - 3]; - /* Смысл вычислений до ветвлений в том, чтобы позволить компилятору - * загружать и вычислять все значения параллельно. */ - if (diff0 == target) - return range - 0; - if (diff1 == target) - return range - 1; - if (diff2 == target) - return range - 2; - if (diff3 == target) - return range - 3; - range -= 4; - } while (range > detent + 3); - if (range == detent) - return nullptr; - } - do - if (*range - range[offset] == target) - return range; - while (--range > detent); -#endif /* MDBX_PNL sort-order */ - return nullptr; + VALGRIND_CREATE_MEMPOOL(env, 0, 0); + env->signature.weak = env_signature; + *penv = env; + return MDBX_SUCCESS; + +bailout: + osal_free(env); + return rc; } -MDBX_MAYBE_UNUSED static const pgno_t *scan4range_checker(const MDBX_PNL pnl, - const size_t seq) { - size_t begin = MDBX_PNL_ASCENDING ? 
1 : MDBX_PNL_GETSIZE(pnl); -#if MDBX_PNL_ASCENDING - while (seq <= MDBX_PNL_GETSIZE(pnl) - begin) { - if (pnl[begin + seq] - pnl[begin] == seq) - return pnl + begin; - ++begin; +__cold int mdbx_env_turn_for_recovery(MDBX_env *env, unsigned target) { + if (unlikely(target >= NUM_METAS)) + return MDBX_EINVAL; + int rc = check_env(env, true); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; + + if (unlikely((env->flags & (MDBX_EXCLUSIVE | MDBX_RDONLY)) != MDBX_EXCLUSIVE)) + return MDBX_EPERM; + + const meta_t *const target_meta = METAPAGE(env, target); + txnid_t new_txnid = constmeta_txnid(target_meta); + if (new_txnid < MIN_TXNID) + new_txnid = MIN_TXNID; + for (unsigned n = 0; n < NUM_METAS; ++n) { + if (n == target) + continue; + page_t *const page = pgno2page(env, n); + meta_t meta = *page_meta(page); + if (meta_validate(env, &meta, page, n, nullptr) != MDBX_SUCCESS) { + int err = meta_override(env, n, 0, nullptr); + if (unlikely(err != MDBX_SUCCESS)) + return err; + } else { + txnid_t txnid = constmeta_txnid(&meta); + if (new_txnid <= txnid) + new_txnid = safe64_txnid_next(txnid); + } } -#else - while (begin > seq) { - if (pnl[begin - seq] - pnl[begin] == seq) - return pnl + begin; - --begin; + + if (unlikely(new_txnid > MAX_TXNID)) { + ERROR("txnid overflow, raise %d", MDBX_TXN_FULL); + return MDBX_TXN_FULL; } -#endif /* MDBX_PNL sort-order */ - return nullptr; + return meta_override(env, target, new_txnid, target_meta); } -#if defined(_MSC_VER) && !defined(__builtin_clz) && \ - !__has_builtin(__builtin_clz) -MDBX_MAYBE_UNUSED static __always_inline size_t __builtin_clz(uint32_t value) { - unsigned long index; - _BitScanReverse(&index, value); - return 31 - index; +__cold int mdbx_env_open_for_recovery(MDBX_env *env, const char *pathname, + unsigned target_meta, bool writeable) { +#if defined(_WIN32) || defined(_WIN64) + wchar_t *pathnameW = nullptr; + int rc = osal_mb2w(pathname, &pathnameW); + if (likely(rc == MDBX_SUCCESS)) { + rc = 
mdbx_env_open_for_recoveryW(env, pathnameW, target_meta, writeable); + osal_free(pathnameW); + } + return rc; } -#endif /* _MSC_VER */ -#if defined(_MSC_VER) && !defined(__builtin_clzl) && \ - !__has_builtin(__builtin_clzl) -MDBX_MAYBE_UNUSED static __always_inline size_t __builtin_clzl(size_t value) { - unsigned long index; -#ifdef _WIN64 - assert(sizeof(value) == 8); - _BitScanReverse64(&index, value); - return 63 - index; +__cold int mdbx_env_open_for_recoveryW(MDBX_env *env, const wchar_t *pathname, + unsigned target_meta, bool writeable) { +#endif /* Windows */ + + if (unlikely(target_meta >= NUM_METAS)) + return MDBX_EINVAL; + int rc = check_env(env, false); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; + if (unlikely(env->dxb_mmap.base)) + return MDBX_EPERM; + + env->stuck_meta = (int8_t)target_meta; + return +#if defined(_WIN32) || defined(_WIN64) + mdbx_env_openW #else - assert(sizeof(value) == 4); - _BitScanReverse(&index, value); - return 31 - index; -#endif + mdbx_env_open +#endif /* Windows */ + (env, pathname, writeable ? 
MDBX_EXCLUSIVE : MDBX_EXCLUSIVE | MDBX_RDONLY, + 0); } -#endif /* _MSC_VER */ -#if !MDBX_PNL_ASCENDING +__cold int mdbx_env_delete(const char *pathname, MDBX_env_delete_mode_t mode) { +#if defined(_WIN32) || defined(_WIN64) + wchar_t *pathnameW = nullptr; + int rc = osal_mb2w(pathname, &pathnameW); + if (likely(rc == MDBX_SUCCESS)) { + rc = mdbx_env_deleteW(pathnameW, mode); + osal_free(pathnameW); + } + return rc; +} -#if !defined(MDBX_ATTRIBUTE_TARGET) && \ - (__has_attribute(__target__) || __GNUC_PREREQ(5, 0)) -#define MDBX_ATTRIBUTE_TARGET(target) __attribute__((__target__(target))) -#endif /* MDBX_ATTRIBUTE_TARGET */ +__cold int mdbx_env_deleteW(const wchar_t *pathname, + MDBX_env_delete_mode_t mode) { +#endif /* Windows */ -#ifndef MDBX_GCC_FASTMATH_i686_SIMD_WORKAROUND -/* Workaround for GCC's bug with `-m32 -march=i686 -Ofast` - * gcc/i686-buildroot-linux-gnu/12.2.0/include/xmmintrin.h:814:1: - * error: inlining failed in call to 'always_inline' '_mm_movemask_ps': - * target specific option mismatch */ -#if !defined(__FAST_MATH__) || !__FAST_MATH__ || !defined(__GNUC__) || \ - defined(__e2k__) || defined(__clang__) || defined(__amd64__) || \ - defined(__SSE2__) -#define MDBX_GCC_FASTMATH_i686_SIMD_WORKAROUND 0 + switch (mode) { + default: + return MDBX_EINVAL; + case MDBX_ENV_JUST_DELETE: + case MDBX_ENV_ENSURE_UNUSED: + case MDBX_ENV_WAIT_FOR_UNUSED: + break; + } + +#ifdef __e2k__ /* https://bugs.mcst.ru/bugzilla/show_bug.cgi?id=6011 */ + MDBX_env *const dummy_env = alloca(sizeof(MDBX_env)); #else -#define MDBX_GCC_FASTMATH_i686_SIMD_WORKAROUND 1 + MDBX_env dummy_env_silo, *const dummy_env = &dummy_env_silo; #endif -#endif /* MDBX_GCC_FASTMATH_i686_SIMD_WORKAROUND */ - -#if defined(__SSE2__) && defined(__SSE__) -#define MDBX_ATTRIBUTE_TARGET_SSE2 /* nope */ -#elif (defined(_M_IX86_FP) && _M_IX86_FP >= 2) || defined(__amd64__) -#define __SSE2__ -#define MDBX_ATTRIBUTE_TARGET_SSE2 /* nope */ -#elif defined(MDBX_ATTRIBUTE_TARGET) && defined(__ia32__) && \ - 
!MDBX_GCC_FASTMATH_i686_SIMD_WORKAROUND -#define MDBX_ATTRIBUTE_TARGET_SSE2 MDBX_ATTRIBUTE_TARGET("sse,sse2") -#endif /* __SSE2__ */ + memset(dummy_env, 0, sizeof(*dummy_env)); + dummy_env->flags = + (mode == MDBX_ENV_ENSURE_UNUSED) ? MDBX_EXCLUSIVE : MDBX_ENV_DEFAULTS; + dummy_env->ps = (unsigned)mdbx_default_pagesize(); -#if defined(__AVX2__) -#define MDBX_ATTRIBUTE_TARGET_AVX2 /* nope */ -#elif defined(MDBX_ATTRIBUTE_TARGET) && defined(__ia32__) && \ - !MDBX_GCC_FASTMATH_i686_SIMD_WORKAROUND -#define MDBX_ATTRIBUTE_TARGET_AVX2 MDBX_ATTRIBUTE_TARGET("sse,sse2,avx,avx2") -#endif /* __AVX2__ */ + STATIC_ASSERT(sizeof(dummy_env->flags) == sizeof(MDBX_env_flags_t)); + int rc = MDBX_RESULT_TRUE, err = env_handle_pathname(dummy_env, pathname, 0); + if (likely(err == MDBX_SUCCESS)) { + mdbx_filehandle_t clk_handle = INVALID_HANDLE_VALUE, + dxb_handle = INVALID_HANDLE_VALUE; + if (mode > MDBX_ENV_JUST_DELETE) { + err = osal_openfile(MDBX_OPEN_DELETE, dummy_env, dummy_env->pathname.dxb, + &dxb_handle, 0); + err = (err == MDBX_ENOFILE) ? MDBX_SUCCESS : err; + if (err == MDBX_SUCCESS) { + err = osal_openfile(MDBX_OPEN_DELETE, dummy_env, + dummy_env->pathname.lck, &clk_handle, 0); + err = (err == MDBX_ENOFILE) ? 
MDBX_SUCCESS : err; + } + if (err == MDBX_SUCCESS && clk_handle != INVALID_HANDLE_VALUE) + err = osal_lockfile(clk_handle, mode == MDBX_ENV_WAIT_FOR_UNUSED); + if (err == MDBX_SUCCESS && dxb_handle != INVALID_HANDLE_VALUE) + err = osal_lockfile(dxb_handle, mode == MDBX_ENV_WAIT_FOR_UNUSED); + } -#if defined(MDBX_ATTRIBUTE_TARGET_AVX2) -#if defined(__AVX512BW__) -#define MDBX_ATTRIBUTE_TARGET_AVX512BW /* nope */ -#elif defined(MDBX_ATTRIBUTE_TARGET) && defined(__ia32__) && \ - !MDBX_GCC_FASTMATH_i686_SIMD_WORKAROUND && \ - (__GNUC_PREREQ(6, 0) || __CLANG_PREREQ(5, 0)) -#define MDBX_ATTRIBUTE_TARGET_AVX512BW \ - MDBX_ATTRIBUTE_TARGET("sse,sse2,avx,avx2,avx512bw") -#endif /* __AVX512BW__ */ -#endif /* MDBX_ATTRIBUTE_TARGET_AVX2 for MDBX_ATTRIBUTE_TARGET_AVX512BW */ + if (err == MDBX_SUCCESS) { + err = osal_removefile(dummy_env->pathname.dxb); + if (err == MDBX_SUCCESS) + rc = MDBX_SUCCESS; + else if (err == MDBX_ENOFILE) + err = MDBX_SUCCESS; + } -#ifdef MDBX_ATTRIBUTE_TARGET_SSE2 -MDBX_ATTRIBUTE_TARGET_SSE2 static __always_inline unsigned -diffcmp2mask_sse2(const pgno_t *const ptr, const ptrdiff_t offset, - const __m128i pattern) { - const __m128i f = _mm_loadu_si128((const __m128i *)ptr); - const __m128i l = _mm_loadu_si128((const __m128i *)(ptr + offset)); - const __m128i cmp = _mm_cmpeq_epi32(_mm_sub_epi32(f, l), pattern); - return _mm_movemask_ps(*(const __m128 *)&cmp); -} + if (err == MDBX_SUCCESS) { + err = osal_removefile(dummy_env->pathname.lck); + if (err == MDBX_SUCCESS) + rc = MDBX_SUCCESS; + else if (err == MDBX_ENOFILE) + err = MDBX_SUCCESS; + } -MDBX_MAYBE_UNUSED __hot MDBX_ATTRIBUTE_TARGET_SSE2 static pgno_t * -scan4seq_sse2(pgno_t *range, const size_t len, const size_t seq) { - assert(seq > 0 && len > seq); -#if MDBX_PNL_ASCENDING -#error "FIXME: Not implemented" -#endif /* MDBX_PNL_ASCENDING */ - assert(range[-(ptrdiff_t)len] == len); - pgno_t *const detent = range - len + seq; - const ptrdiff_t offset = -(ptrdiff_t)seq; - const pgno_t target = 
(pgno_t)offset; - const __m128i pattern = _mm_set1_epi32(target); - uint8_t mask; - if (likely(len > seq + 3)) { - do { - mask = (uint8_t)diffcmp2mask_sse2(range - 3, offset, pattern); - if (mask) { -#if !defined(ENABLE_MEMCHECK) && !defined(__SANITIZE_ADDRESS__) - found: -#endif /* !ENABLE_MEMCHECK && !__SANITIZE_ADDRESS__ */ - return range + 28 - __builtin_clz(mask); - } - range -= 4; - } while (range > detent + 3); - if (range == detent) - return nullptr; - } + if (err == MDBX_SUCCESS && !(dummy_env->flags & MDBX_NOSUBDIR) && + (/* pathname != "." */ pathname[0] != '.' || pathname[1] != 0) && + (/* pathname != ".." */ pathname[0] != '.' || pathname[1] != '.' || + pathname[2] != 0)) { + err = osal_removedirectory(pathname); + if (err == MDBX_SUCCESS) + rc = MDBX_SUCCESS; + else if (err == MDBX_ENOFILE) + err = MDBX_SUCCESS; + } - /* Далее происходит чтение от 4 до 12 лишних байт, которые могут быть не - * только за пределами региона выделенного под PNL, но и пересекать границу - * страницы памяти. Что может приводить как к ошибкам ASAN, так и к падению. - * Поэтому проверяем смещение на странице, а с ASAN всегда страхуемся. 
*/ -#if !defined(ENABLE_MEMCHECK) && !defined(__SANITIZE_ADDRESS__) - const unsigned on_page_safe_mask = 0xff0 /* enough for '-15' bytes offset */; - if (likely(on_page_safe_mask & (uintptr_t)(range + offset)) && - !RUNNING_ON_VALGRIND) { - const unsigned extra = (unsigned)(detent + 4 - range); - assert(extra > 0 && extra < 4); - mask = 0xF << extra; - mask &= diffcmp2mask_sse2(range - 3, offset, pattern); - if (mask) - goto found; - return nullptr; - } -#endif /* !ENABLE_MEMCHECK && !__SANITIZE_ADDRESS__ */ - do - if (*range - range[offset] == target) - return range; - while (--range != detent); - return nullptr; -} -#endif /* MDBX_ATTRIBUTE_TARGET_SSE2 */ + if (dxb_handle != INVALID_HANDLE_VALUE) + osal_closefile(dxb_handle); + if (clk_handle != INVALID_HANDLE_VALUE) + osal_closefile(clk_handle); + } else if (err == MDBX_ENOFILE) + err = MDBX_SUCCESS; -#ifdef MDBX_ATTRIBUTE_TARGET_AVX2 -MDBX_ATTRIBUTE_TARGET_AVX2 static __always_inline unsigned -diffcmp2mask_avx2(const pgno_t *const ptr, const ptrdiff_t offset, - const __m256i pattern) { - const __m256i f = _mm256_loadu_si256((const __m256i *)ptr); - const __m256i l = _mm256_loadu_si256((const __m256i *)(ptr + offset)); - const __m256i cmp = _mm256_cmpeq_epi32(_mm256_sub_epi32(f, l), pattern); - return _mm256_movemask_ps(*(const __m256 *)&cmp); + osal_free(dummy_env->pathname.buffer); + return (err == MDBX_SUCCESS) ? 
rc : err; } -MDBX_ATTRIBUTE_TARGET_AVX2 static __always_inline unsigned -diffcmp2mask_sse2avx(const pgno_t *const ptr, const ptrdiff_t offset, - const __m128i pattern) { - const __m128i f = _mm_loadu_si128((const __m128i *)ptr); - const __m128i l = _mm_loadu_si128((const __m128i *)(ptr + offset)); - const __m128i cmp = _mm_cmpeq_epi32(_mm_sub_epi32(f, l), pattern); - return _mm_movemask_ps(*(const __m128 *)&cmp); +__cold int mdbx_env_open(MDBX_env *env, const char *pathname, + MDBX_env_flags_t flags, mdbx_mode_t mode) { +#if defined(_WIN32) || defined(_WIN64) + wchar_t *pathnameW = nullptr; + int rc = osal_mb2w(pathname, &pathnameW); + if (likely(rc == MDBX_SUCCESS)) { + rc = mdbx_env_openW(env, pathnameW, flags, mode); + osal_free(pathnameW); + if (rc == MDBX_SUCCESS) + /* force to make cache of the multi-byte pathname representation */ + mdbx_env_get_path(env, &pathname); + } + return rc; } -MDBX_MAYBE_UNUSED __hot MDBX_ATTRIBUTE_TARGET_AVX2 static pgno_t * -scan4seq_avx2(pgno_t *range, const size_t len, const size_t seq) { - assert(seq > 0 && len > seq); -#if MDBX_PNL_ASCENDING -#error "FIXME: Not implemented" -#endif /* MDBX_PNL_ASCENDING */ - assert(range[-(ptrdiff_t)len] == len); - pgno_t *const detent = range - len + seq; - const ptrdiff_t offset = -(ptrdiff_t)seq; - const pgno_t target = (pgno_t)offset; - const __m256i pattern = _mm256_set1_epi32(target); - uint8_t mask; - if (likely(len > seq + 7)) { - do { - mask = (uint8_t)diffcmp2mask_avx2(range - 7, offset, pattern); - if (mask) { -#if !defined(ENABLE_MEMCHECK) && !defined(__SANITIZE_ADDRESS__) - found: -#endif /* !ENABLE_MEMCHECK && !__SANITIZE_ADDRESS__ */ - return range + 24 - __builtin_clz(mask); +__cold int mdbx_env_openW(MDBX_env *env, const wchar_t *pathname, + MDBX_env_flags_t flags, mdbx_mode_t mode) { +#endif /* Windows */ + + int rc = check_env(env, false); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; + + if (unlikely(flags & ~ENV_USABLE_FLAGS)) + return MDBX_EINVAL; + + if 
(unlikely(env->lazy_fd != INVALID_HANDLE_VALUE || + (env->flags & ENV_ACTIVE) != 0 || env->dxb_mmap.base)) + return MDBX_EPERM; + + /* Pickup previously mdbx_env_set_flags(), + * but avoid MDBX_UTTERLY_NOSYNC by disjunction */ + const uint32_t saved_me_flags = env->flags; + flags = combine_durability_flags(flags | DEPRECATED_COALESCE, env->flags); + + if (flags & MDBX_RDONLY) { + /* Silently ignore irrelevant flags when we're only getting read access */ + flags &= ~(MDBX_WRITEMAP | DEPRECATED_MAPASYNC | MDBX_SAFE_NOSYNC | + MDBX_NOMETASYNC | DEPRECATED_COALESCE | MDBX_LIFORECLAIM | + MDBX_NOMEMINIT | MDBX_ACCEDE); + mode = 0; + } else { +#if MDBX_MMAP_INCOHERENT_FILE_WRITE + /* Temporary `workaround` for OpenBSD kernel's flaw. + * See https://libmdbx.dqdkfa.ru/dead-github/issues/67 */ + if ((flags & MDBX_WRITEMAP) == 0) { + if (flags & MDBX_ACCEDE) + flags |= MDBX_WRITEMAP; + else { + debug_log(MDBX_LOG_ERROR, __func__, __LINE__, + "System (i.e. OpenBSD) requires MDBX_WRITEMAP because " + "of an internal flaw(s) in a file/buffer/page cache.\n"); + return 42 /* ENOPROTOOPT */; } - range -= 8; - } while (range > detent + 7); - if (range == detent) - return nullptr; + } +#endif /* MDBX_MMAP_INCOHERENT_FILE_WRITE */ } - /* Далее происходит чтение от 4 до 28 лишних байт, которые могут быть не - * только за пределами региона выделенного под PNL, но и пересекать границу - * страницы памяти. Что может приводить как к ошибкам ASAN, так и к падению. - * Поэтому проверяем смещение на странице, а с ASAN всегда страхуемся. 
*/ -#if !defined(ENABLE_MEMCHECK) && !defined(__SANITIZE_ADDRESS__) - const unsigned on_page_safe_mask = 0xfe0 /* enough for '-31' bytes offset */; - if (likely(on_page_safe_mask & (uintptr_t)(range + offset)) && - !RUNNING_ON_VALGRIND) { - const unsigned extra = (unsigned)(detent + 8 - range); - assert(extra > 0 && extra < 8); - mask = 0xFF << extra; - mask &= diffcmp2mask_avx2(range - 7, offset, pattern); - if (mask) - goto found; - return nullptr; + env->flags = (flags & ~ENV_FATAL_ERROR); + rc = env_handle_pathname(env, pathname, mode); + if (unlikely(rc != MDBX_SUCCESS)) + goto bailout; + + env->kvs = osal_calloc(env->max_dbi, sizeof(env->kvs[0])); + env->dbs_flags = osal_calloc(env->max_dbi, sizeof(env->dbs_flags[0])); + env->dbi_seqs = osal_calloc(env->max_dbi, sizeof(env->dbi_seqs[0])); + if (unlikely(!(env->kvs && env->dbs_flags && env->dbi_seqs))) { + rc = MDBX_ENOMEM; + goto bailout; } -#endif /* !ENABLE_MEMCHECK && !__SANITIZE_ADDRESS__ */ - if (range - 3 > detent) { - mask = diffcmp2mask_sse2avx(range - 3, offset, *(const __m128i *)&pattern); - if (mask) - return range + 28 - __builtin_clz(mask); - range -= 4; + + if ((flags & MDBX_RDONLY) == 0) { + MDBX_txn *txn = nullptr; + const intptr_t bitmap_bytes = +#if MDBX_ENABLE_DBI_SPARSE + ceil_powerof2(env->max_dbi, CHAR_BIT * sizeof(txn->dbi_sparse[0])) / + CHAR_BIT; +#else + 0; +#endif /* MDBX_ENABLE_DBI_SPARSE */ + const size_t base = sizeof(MDBX_txn) + sizeof(cursor_couple_t); + const size_t size = + base + bitmap_bytes + + env->max_dbi * (sizeof(txn->dbs[0]) + sizeof(txn->cursors[0]) + + sizeof(txn->dbi_seqs[0]) + sizeof(txn->dbi_state[0])); + rc = env_page_auxbuffer(env); + if (unlikely(rc != MDBX_SUCCESS)) + goto bailout; + + memset(env->page_auxbuf, -1, env->ps * (size_t)2); + memset(ptr_disp(env->page_auxbuf, env->ps * (size_t)2), 0, env->ps); + txn = osal_calloc(1, size); + if (unlikely(!txn)) { + rc = MDBX_ENOMEM; + goto bailout; + } + txn->dbs = ptr_disp(txn, base); + txn->cursors = 
ptr_disp(txn->dbs, env->max_dbi * sizeof(txn->dbs[0])); + txn->dbi_seqs = + ptr_disp(txn->cursors, env->max_dbi * sizeof(txn->cursors[0])); + txn->dbi_state = + ptr_disp(txn, size - env->max_dbi * sizeof(txn->dbi_state[0])); +#if MDBX_ENABLE_DBI_SPARSE + txn->dbi_sparse = ptr_disp(txn->dbi_state, -bitmap_bytes); +#endif /* MDBX_ENABLE_DBI_SPARSE */ + txn->env = env; + txn->flags = MDBX_TXN_FINISHED; + env->basal_txn = txn; + txn->tw.retired_pages = pnl_alloc(MDBX_PNL_INITIAL); + txn->tw.relist = pnl_alloc(MDBX_PNL_INITIAL); + if (unlikely(!txn->tw.retired_pages || !txn->tw.relist)) { + rc = MDBX_ENOMEM; + goto bailout; + } + env_options_adjust_defaults(env); } - while (range > detent) { - if (*range - range[offset] == target) - return range; - --range; + + rc = env_open(env, mode); + if (unlikely(rc != MDBX_SUCCESS)) + goto bailout; + +#if MDBX_DEBUG + const troika_t troika = meta_tap(env); + const meta_ptr_t head = meta_recent(env, &troika); + const tree_t *db = &head.ptr_c->trees.main; + + DEBUG("opened database version %u, pagesize %u", + (uint8_t)unaligned_peek_u64(4, head.ptr_c->magic_and_version), env->ps); + DEBUG("using meta page %" PRIaPGNO ", txn %" PRIaTXN, + data_page(head.ptr_c)->pgno, head.txnid); + DEBUG("depth: %u", db->height); + DEBUG("entries: %" PRIu64, db->items); + DEBUG("branch pages: %" PRIaPGNO, db->branch_pages); + DEBUG("leaf pages: %" PRIaPGNO, db->leaf_pages); + DEBUG("large/overflow pages: %" PRIaPGNO, db->large_pages); + DEBUG("root: %" PRIaPGNO, db->root); + DEBUG("schema_altered: %" PRIaTXN, db->mod_txnid); +#endif /* MDBX_DEBUG */ + + if (likely(rc == MDBX_SUCCESS)) { + dxb_sanitize_tail(env, nullptr); + } else { + bailout: + if (likely(env_close(env, false) == MDBX_SUCCESS)) { + env->flags = saved_me_flags; + } else { + rc = MDBX_PANIC; + env->flags = saved_me_flags | ENV_FATAL_ERROR; + } } - return nullptr; + return rc; } -#endif /* MDBX_ATTRIBUTE_TARGET_AVX2 */ -#ifdef MDBX_ATTRIBUTE_TARGET_AVX512BW 
-MDBX_ATTRIBUTE_TARGET_AVX512BW static __always_inline unsigned -diffcmp2mask_avx512bw(const pgno_t *const ptr, const ptrdiff_t offset, - const __m512i pattern) { - const __m512i f = _mm512_loadu_si512((const __m512i *)ptr); - const __m512i l = _mm512_loadu_si512((const __m512i *)(ptr + offset)); - return _mm512_cmpeq_epi32_mask(_mm512_sub_epi32(f, l), pattern); -} +/*----------------------------------------------------------------------------*/ -MDBX_MAYBE_UNUSED __hot MDBX_ATTRIBUTE_TARGET_AVX512BW static pgno_t * -scan4seq_avx512bw(pgno_t *range, const size_t len, const size_t seq) { - assert(seq > 0 && len > seq); -#if MDBX_PNL_ASCENDING -#error "FIXME: Not implemented" -#endif /* MDBX_PNL_ASCENDING */ - assert(range[-(ptrdiff_t)len] == len); - pgno_t *const detent = range - len + seq; - const ptrdiff_t offset = -(ptrdiff_t)seq; - const pgno_t target = (pgno_t)offset; - const __m512i pattern = _mm512_set1_epi32(target); - unsigned mask; - if (likely(len > seq + 15)) { - do { - mask = diffcmp2mask_avx512bw(range - 15, offset, pattern); - if (mask) { -#if !defined(ENABLE_MEMCHECK) && !defined(__SANITIZE_ADDRESS__) - found: -#endif /* !ENABLE_MEMCHECK && !__SANITIZE_ADDRESS__ */ - return range + 16 - __builtin_clz(mask); - } - range -= 16; - } while (range > detent + 15); - if (range == detent) - return nullptr; - } +#if !(defined(_WIN32) || defined(_WIN64)) +__cold int mdbx_env_resurrect_after_fork(MDBX_env *env) { + if (unlikely(!env)) + return MDBX_EINVAL; - /* Далее происходит чтение от 4 до 60 лишних байт, которые могут быть не - * только за пределами региона выделенного под PNL, но и пересекать границу - * страницы памяти. Что может приводить как к ошибкам ASAN, так и к падению. - * Поэтому проверяем смещение на странице, а с ASAN всегда страхуемся. 
*/ -#if !defined(ENABLE_MEMCHECK) && !defined(__SANITIZE_ADDRESS__) - const unsigned on_page_safe_mask = 0xfc0 /* enough for '-63' bytes offset */; - if (likely(on_page_safe_mask & (uintptr_t)(range + offset)) && - !RUNNING_ON_VALGRIND) { - const unsigned extra = (unsigned)(detent + 16 - range); - assert(extra > 0 && extra < 16); - mask = 0xFFFF << extra; - mask &= diffcmp2mask_avx512bw(range - 15, offset, pattern); - if (mask) - goto found; - return nullptr; - } -#endif /* !ENABLE_MEMCHECK && !__SANITIZE_ADDRESS__ */ - if (range - 7 > detent) { - mask = diffcmp2mask_avx2(range - 7, offset, *(const __m256i *)&pattern); - if (mask) - return range + 24 - __builtin_clz(mask); - range -= 8; - } - if (range - 3 > detent) { - mask = diffcmp2mask_sse2avx(range - 3, offset, *(const __m128i *)&pattern); - if (mask) - return range + 28 - __builtin_clz(mask); - range -= 4; - } - while (range > detent) { - if (*range - range[offset] == target) - return range; - --range; - } - return nullptr; -} -#endif /* MDBX_ATTRIBUTE_TARGET_AVX512BW */ + if (unlikely(env->signature.weak != env_signature)) + return MDBX_EBADSIGN; -#if (defined(__ARM_NEON) || defined(__ARM_NEON__)) && \ - (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) -static __always_inline size_t diffcmp2mask_neon(const pgno_t *const ptr, - const ptrdiff_t offset, - const uint32x4_t pattern) { - const uint32x4_t f = vld1q_u32(ptr); - const uint32x4_t l = vld1q_u32(ptr + offset); - const uint16x4_t cmp = vmovn_u32(vceqq_u32(vsubq_u32(f, l), pattern)); - if (sizeof(size_t) > 7) - return vget_lane_u64(vreinterpret_u64_u16(cmp), 0); - else - return vget_lane_u32(vreinterpret_u32_u8(vmovn_u16(vcombine_u16(cmp, cmp))), - 0); -} + if (unlikely(env->flags & ENV_FATAL_ERROR)) + return MDBX_PANIC; -__hot static pgno_t *scan4seq_neon(pgno_t *range, const size_t len, - const size_t seq) { - assert(seq > 0 && len > seq); -#if MDBX_PNL_ASCENDING -#error "FIXME: Not implemented" -#endif /* MDBX_PNL_ASCENDING */ - 
assert(range[-(ptrdiff_t)len] == len); - pgno_t *const detent = range - len + seq; - const ptrdiff_t offset = -(ptrdiff_t)seq; - const pgno_t target = (pgno_t)offset; - const uint32x4_t pattern = vmovq_n_u32(target); - size_t mask; - if (likely(len > seq + 3)) { - do { - mask = diffcmp2mask_neon(range - 3, offset, pattern); - if (mask) { -#if !defined(ENABLE_MEMCHECK) && !defined(__SANITIZE_ADDRESS__) - found: -#endif /* !ENABLE_MEMCHECK && !__SANITIZE_ADDRESS__ */ - return ptr_disp(range, -(__builtin_clzl(mask) >> sizeof(size_t) / 4)); - } - range -= 4; - } while (range > detent + 3); - if (range == detent) - return nullptr; - } + if (unlikely((env->flags & ENV_ACTIVE) == 0)) + return MDBX_SUCCESS; - /* Далее происходит чтение от 4 до 12 лишних байт, которые могут быть не - * только за пределами региона выделенного под PNL, но и пересекать границу - * страницы памяти. Что может приводить как к ошибкам ASAN, так и к падению. - * Поэтому проверяем смещение на странице, а с ASAN всегда страхуемся. */ -#if !defined(ENABLE_MEMCHECK) && !defined(__SANITIZE_ADDRESS__) - const unsigned on_page_safe_mask = 0xff0 /* enough for '-15' bytes offset */; - if (likely(on_page_safe_mask & (uintptr_t)(range + offset)) && - !RUNNING_ON_VALGRIND) { - const unsigned extra = (unsigned)(detent + 4 - range); - assert(extra > 0 && extra < 4); - mask = (~(size_t)0) << (extra * sizeof(size_t) * 2); - mask &= diffcmp2mask_neon(range - 3, offset, pattern); - if (mask) - goto found; - return nullptr; + const uint32_t new_pid = osal_getpid(); + if (unlikely(env->pid == new_pid)) + return MDBX_SUCCESS; + + if (!atomic_cas32(&env->signature, env_signature, ~env_signature)) + return MDBX_EBADSIGN; + + if (env->txn) + txn_abort(env->basal_txn); + env->registered_reader_pid = 0; + int rc = env_close(env, true); + env->signature.weak = env_signature; + if (likely(rc == MDBX_SUCCESS)) { + rc = (env->flags & MDBX_EXCLUSIVE) ? 
MDBX_BUSY : env_open(env, 0); + if (unlikely(rc != MDBX_SUCCESS && env_close(env, false) != MDBX_SUCCESS)) { + rc = MDBX_PANIC; + env->flags |= ENV_FATAL_ERROR; + } } -#endif /* !ENABLE_MEMCHECK && !__SANITIZE_ADDRESS__ */ - do - if (*range - range[offset] == target) - return range; - while (--range != detent); - return nullptr; + return rc; } -#endif /* __ARM_NEON || __ARM_NEON__ */ +#endif /* Windows */ -#if defined(__AVX512BW__) && defined(MDBX_ATTRIBUTE_TARGET_AVX512BW) -#define scan4seq_default scan4seq_avx512bw -#define scan4seq_impl scan4seq_default -#elif defined(__AVX2__) && defined(MDBX_ATTRIBUTE_TARGET_AVX2) -#define scan4seq_default scan4seq_avx2 -#elif defined(__SSE2__) && defined(MDBX_ATTRIBUTE_TARGET_SSE2) -#define scan4seq_default scan4seq_sse2 -#elif (defined(__ARM_NEON) || defined(__ARM_NEON__)) && \ - (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) -#define scan4seq_default scan4seq_neon -/* Choosing of another variants should be added here. */ -#endif /* scan4seq_default */ +__cold int mdbx_env_close_ex(MDBX_env *env, bool dont_sync) { + page_t *dp; + int rc = MDBX_SUCCESS; -#endif /* MDBX_PNL_ASCENDING */ + if (unlikely(!env)) + return MDBX_EINVAL; -#ifndef scan4seq_default -#define scan4seq_default scan4seq_fallback -#endif /* scan4seq_default */ + if (unlikely(env->signature.weak != env_signature)) + return MDBX_EBADSIGN; -#ifdef scan4seq_impl -/* The scan4seq_impl() is the best or no alternatives */ -#elif !MDBX_HAVE_BUILTIN_CPU_SUPPORTS -/* The scan4seq_default() will be used since no cpu-features detection support - * from compiler. Please don't ask to implement cpuid-based detection and don't - * make such PRs. */ -#define scan4seq_impl scan4seq_default -#else -/* Selecting the most appropriate implementation at runtime, - * depending on the available CPU features. 
*/ -static pgno_t *scan4seq_resolver(pgno_t *range, const size_t len, - const size_t seq); -static pgno_t *(*scan4seq_impl)(pgno_t *range, const size_t len, - const size_t seq) = scan4seq_resolver; +#if MDBX_ENV_CHECKPID || !(defined(_WIN32) || defined(_WIN64)) + /* Check the PID even if MDBX_ENV_CHECKPID=0 on non-Windows + * platforms (i.e. where fork() is available). + * This is required to legitimize a call after fork() + * from a child process, that should be allowed to free resources. */ + if (unlikely(env->pid != osal_getpid())) + env->flags |= ENV_FATAL_ERROR; +#endif /* MDBX_ENV_CHECKPID */ -static pgno_t *scan4seq_resolver(pgno_t *range, const size_t len, - const size_t seq) { - pgno_t *(*choice)(pgno_t *range, const size_t len, const size_t seq) = - nullptr; -#if __has_builtin(__builtin_cpu_init) || defined(__BUILTIN_CPU_INIT__) || \ - __GNUC_PREREQ(4, 8) - __builtin_cpu_init(); -#endif /* __builtin_cpu_init() */ -#ifdef MDBX_ATTRIBUTE_TARGET_SSE2 - if (__builtin_cpu_supports("sse2")) - choice = scan4seq_sse2; -#endif /* MDBX_ATTRIBUTE_TARGET_SSE2 */ -#ifdef MDBX_ATTRIBUTE_TARGET_AVX2 - if (__builtin_cpu_supports("avx2")) - choice = scan4seq_avx2; -#endif /* MDBX_ATTRIBUTE_TARGET_AVX2 */ -#ifdef MDBX_ATTRIBUTE_TARGET_AVX512BW - if (__builtin_cpu_supports("avx512bw")) - choice = scan4seq_avx512bw; -#endif /* MDBX_ATTRIBUTE_TARGET_AVX512BW */ - /* Choosing of another variants should be added here. */ - scan4seq_impl = choice ? 
choice : scan4seq_default; - return scan4seq_impl(range, len, seq); -} -#endif /* scan4seq_impl */ + if (env->dxb_mmap.base && + (env->flags & (MDBX_RDONLY | ENV_FATAL_ERROR)) == 0 && env->basal_txn) { + if (env->basal_txn->owner && env->basal_txn->owner != osal_thread_self()) + return MDBX_BUSY; + } else + dont_sync = true; -//------------------------------------------------------------------------------ + if (!atomic_cas32(&env->signature, env_signature, 0)) + return MDBX_EBADSIGN; -#define MDBX_ALLOC_DEFAULT 0 -#define MDBX_ALLOC_RESERVE 1 -#define MDBX_ALLOC_UNIMPORTANT 2 -#define MDBX_ALLOC_COALESCE 4 /* внутреннее состояние */ -#define MDBX_ALLOC_SHOULD_SCAN 8 /* внутреннее состояние */ -#define MDBX_ALLOC_LIFO 16 /* внутреннее состояние */ + if (!dont_sync) { +#if defined(_WIN32) || defined(_WIN64) + /* On windows, without blocking is impossible to determine whether another + * process is running a writing transaction or not. + * Because in the "owner died" condition kernel don't release + * file lock immediately. */ + rc = env_sync(env, true, false); + rc = (rc == MDBX_RESULT_TRUE) ? MDBX_SUCCESS : rc; +#else + struct stat st; + if (unlikely(fstat(env->lazy_fd, &st))) + rc = errno; + else if (st.st_nlink > 0 /* don't sync deleted files */) { + rc = env_sync(env, true, true); + rc = (rc == MDBX_BUSY || rc == EAGAIN || rc == EACCES || rc == EBUSY || + rc == EWOULDBLOCK || rc == MDBX_RESULT_TRUE) + ? MDBX_SUCCESS + : rc; + } +#endif /* Windows */ + } -static __inline bool is_gc_usable(MDBX_txn *txn, const MDBX_cursor *mc, - const uint8_t flags) { - /* If txn is updating the GC, then the retired-list cannot play catch-up with - * itself by growing while trying to save it. 
*/ - if (mc->mc_dbi == FREE_DBI && !(flags & MDBX_ALLOC_RESERVE) && - !(mc->mc_flags & C_GCU)) - return false; + if (env->basal_txn && env->basal_txn->owner == osal_thread_self()) + lck_txn_unlock(env); - /* avoid search inside empty tree and while tree is updating, - https://libmdbx.dqdkfa.ru/dead-github/issues/31 */ - if (unlikely(txn->mt_dbs[FREE_DBI].md_entries == 0)) { - txn->mt_flags |= MDBX_TXN_DRAINED_GC; - return false; + eASSERT(env, env->signature.weak == 0); + rc = env_close(env, false) ? MDBX_PANIC : rc; + ENSURE(env, osal_fastmutex_destroy(&env->dbi_lock) == MDBX_SUCCESS); +#if defined(_WIN32) || defined(_WIN64) + /* remap_guard don't have destructor (Slim Reader/Writer Lock) */ + DeleteCriticalSection(&env->windowsbug_lock); +#else + ENSURE(env, osal_fastmutex_destroy(&env->remap_guard) == MDBX_SUCCESS); +#endif /* Windows */ + +#if MDBX_LOCKING > MDBX_LOCKING_SYSV + lck_t *const stub = lckless_stub(env); + /* может вернуть ошибку в дочернем процессе после fork() */ + lck_ipclock_destroy(&stub->wrt_lock); +#endif /* MDBX_LOCKING */ + + while ((dp = env->shadow_reserve) != nullptr) { + MDBX_ASAN_UNPOISON_MEMORY_REGION(dp, env->ps); + VALGRIND_MAKE_MEM_DEFINED(&page_next(dp), sizeof(page_t *)); + env->shadow_reserve = page_next(dp); + void *const ptr = ptr_disp(dp, -(ptrdiff_t)sizeof(size_t)); + osal_free(ptr); } + VALGRIND_DESTROY_MEMPOOL(env); + osal_free(env); - return true; + return rc; } -__hot static bool is_already_reclaimed(const MDBX_txn *txn, txnid_t id) { - const size_t len = MDBX_PNL_GETSIZE(txn->tw.lifo_reclaimed); - for (size_t i = 1; i <= len; ++i) - if (txn->tw.lifo_reclaimed[i] == id) - return true; - return false; -} +/*----------------------------------------------------------------------------*/ -__hot static pgno_t relist_get_single(MDBX_txn *txn) { - const size_t len = MDBX_PNL_GETSIZE(txn->tw.relist); - assert(len > 0); - pgno_t *target = MDBX_PNL_EDGE(txn->tw.relist); - const ptrdiff_t dir = MDBX_PNL_ASCENDING ? 
1 : -1; +static int env_info_snap(const MDBX_env *env, const MDBX_txn *txn, + MDBX_envinfo *out, const size_t bytes, + troika_t *const troika) { + const size_t size_before_bootid = offsetof(MDBX_envinfo, mi_bootid); + const size_t size_before_pgop_stat = offsetof(MDBX_envinfo, mi_pgop_stat); + if (unlikely(env->flags & ENV_FATAL_ERROR)) + return MDBX_PANIC; - /* Есть ТРИ потенциально выигрышные, но противо-направленные тактики: - * - * 1. Стараться использовать страницы с наименьшими номерами. Так обмен с - * диском будет более кучным, а у страниц ближе к концу БД будет больше шансов - * попасть под авто-компактификацию. Частично эта тактика уже реализована, но - * для её эффективности требуется явно приоритезировать выделение страниц: - * - поддерживать для relist, для ближних и для дальних страниц; - * - использовать страницы из дальнего списка, если первый пуст, - * а второй слишком большой, либо при пустой GC. - * - * 2. Стараться выделять страницы последовательно. Так записываемые на диск - * регионы будут линейными, что принципиально ускоряет запись на HDD. - * Одновременно, в среднем это не повлияет на чтение, точнее говоря, если - * порядок чтения не совпадает с порядком изменения (иначе говоря, если - * чтение не коррклирует с обновлениями и/или вставками) то не повлияет, иначе - * может ускорить. Однако, последовательности в среднем достаточно редки. - * Поэтому для эффективности требуется аккумулировать и поддерживать в ОЗУ - * огромные списки страниц, а затем сохранять их обратно в БД. Текущий формат - * БД (без битовых карт) для этого крайне не удачен. Поэтому эта тактика не - * имеет шансов быть успешной без смены формата БД (Mithril). - * - * 3. Стараться экономить последовательности страниц. Это позволяет избегать - * лишнего чтения/поиска в GC при более-менее постоянном размещении и/или - * обновлении данных требующих более одной страницы. 
Проблема в том, что без - * информации от приложения библиотека не может знать насколько - * востребованными будут последовательности в ближайшей перспективе, а - * экономия последовательностей "на всякий случай" не только затратна - * сама-по-себе, но и работает во вред. - * - * Поэтому: - * - в TODO добавляется разделение relist на «ближние» и «дальние» страницы, - * с последующей реализацией первой тактики; - * - преимущественное использование последовательностей отправляется - * в MithrilDB как составляющая "HDD frendly" feature; - * - реализованная в 3757eb72f7c6b46862f8f17881ac88e8cecc1979 экономия - * последовательностей отключается через MDBX_ENABLE_SAVING_SEQUENCES=0. - * - * В качестве альтернативы для безусловной «экономии» последовательностей, - * в следующих версиях libmdbx, вероятно, будет предложено - * API для взаимодействия с GC: - * - получение размера GC, включая гистограммы размеров последовательностей - * и близости к концу БД; - * - включение формирования "линейного запаса" для последующего использования - * в рамках текущей транзакции; - * - намеренная загрузка GC в память для коагуляции и "выпрямления"; - * - намеренное копирование данных из страниц в конце БД для последующего - * из освобождения, т.е. контролируемая компактификация по запросу. */ - -#ifndef MDBX_ENABLE_SAVING_SEQUENCES -#define MDBX_ENABLE_SAVING_SEQUENCES 0 -#endif - if (MDBX_ENABLE_SAVING_SEQUENCES && unlikely(target[dir] == *target + 1) && - len > 2) { - /* Пытаемся пропускать последовательности при наличии одиночных элементов. - * TODO: необходимо кэшировать пропускаемые последовательности - * чтобы не сканировать список сначала при каждом выделении. */ - pgno_t *scan = target + dir + dir; - size_t left = len; - do { - if (likely(scan[-dir] != *scan - 1 && *scan + 1 != scan[dir])) { -#if MDBX_PNL_ASCENDING - target = scan; - break; + /* is the environment open? 
+ * (https://libmdbx.dqdkfa.ru/dead-github/issues/171) */ + if (unlikely(!env->dxb_mmap.base)) { + /* environment not yet opened */ +#if 1 + /* default behavior: returns the available info but zeroed the rest */ + memset(out, 0, bytes); + out->mi_geo.lower = env->geo_in_bytes.lower; + out->mi_geo.upper = env->geo_in_bytes.upper; + out->mi_geo.shrink = env->geo_in_bytes.shrink; + out->mi_geo.grow = env->geo_in_bytes.grow; + out->mi_geo.current = env->geo_in_bytes.now; + out->mi_maxreaders = env->max_readers; + out->mi_dxb_pagesize = env->ps; + out->mi_sys_pagesize = globals.sys_pagesize; + if (likely(bytes > size_before_bootid)) { + out->mi_bootid.current.x = globals.bootid.x; + out->mi_bootid.current.y = globals.bootid.y; + } + return MDBX_SUCCESS; #else - /* вырезаем элемент с перемещением хвоста */ - const pgno_t pgno = *scan; - MDBX_PNL_SETSIZE(txn->tw.relist, len - 1); - while (++scan <= target) - scan[-1] = *scan; - return pgno; + /* some users may prefer this behavior: return appropriate error */ + return MDBX_EPERM; #endif - } - scan += dir; - } while (--left > 2); } - const pgno_t pgno = *target; -#if MDBX_PNL_ASCENDING - /* вырезаем элемент с перемещением хвоста */ - MDBX_PNL_SETSIZE(txn->tw.relist, len - 1); - for (const pgno_t *const end = txn->tw.relist + len - 1; target <= end; - ++target) - *target = target[1]; -#else - /* перемещать хвост не нужно, просто усекам список */ - MDBX_PNL_SETSIZE(txn->tw.relist, len - 1); -#endif - return pgno; -} - -__hot static pgno_t relist_get_sequence(MDBX_txn *txn, const size_t num, - uint8_t flags) { - const size_t len = MDBX_PNL_GETSIZE(txn->tw.relist); - pgno_t *edge = MDBX_PNL_EDGE(txn->tw.relist); - assert(len >= num && num > 1); - const size_t seq = num - 1; -#if !MDBX_PNL_ASCENDING - if (edge[-(ptrdiff_t)seq] - *edge == seq) { - if (unlikely(flags & MDBX_ALLOC_RESERVE)) - return P_INVALID; - assert(edge == scan4range_checker(txn->tw.relist, seq)); - /* перемещать хвост не нужно, просто усекам список */ - 
MDBX_PNL_SETSIZE(txn->tw.relist, len - num); - return *edge; - } -#endif - pgno_t *target = scan4seq_impl(edge, len, seq); - assert(target == scan4range_checker(txn->tw.relist, seq)); - if (target) { - if (unlikely(flags & MDBX_ALLOC_RESERVE)) - return P_INVALID; - const pgno_t pgno = *target; - /* вырезаем найденную последовательность с перемещением хвоста */ - MDBX_PNL_SETSIZE(txn->tw.relist, len - num); -#if MDBX_PNL_ASCENDING - for (const pgno_t *const end = txn->tw.relist + len - num; target <= end; - ++target) - *target = target[num]; -#else - for (const pgno_t *const end = txn->tw.relist + len; ++target <= end;) - target[-(ptrdiff_t)num] = *target; -#endif - return pgno; + *troika = + (txn && !(txn->flags & MDBX_TXN_RDONLY)) ? txn->tw.troika : meta_tap(env); + const meta_ptr_t head = meta_recent(env, troika); + const meta_t *const meta0 = METAPAGE(env, 0); + const meta_t *const meta1 = METAPAGE(env, 1); + const meta_t *const meta2 = METAPAGE(env, 2); + out->mi_recent_txnid = head.txnid; + out->mi_meta_txnid[0] = troika->txnid[0]; + out->mi_meta_sign[0] = unaligned_peek_u64(4, meta0->sign); + out->mi_meta_txnid[1] = troika->txnid[1]; + out->mi_meta_sign[1] = unaligned_peek_u64(4, meta1->sign); + out->mi_meta_txnid[2] = troika->txnid[2]; + out->mi_meta_sign[2] = unaligned_peek_u64(4, meta2->sign); + if (likely(bytes > size_before_bootid)) { + memcpy(&out->mi_bootid.meta[0], &meta0->bootid, 16); + memcpy(&out->mi_bootid.meta[1], &meta1->bootid, 16); + memcpy(&out->mi_bootid.meta[2], &meta2->bootid, 16); } - return 0; -} -#if MDBX_ENABLE_MINCORE -static __inline bool bit_tas(uint64_t *field, char bit) { - const uint64_t m = UINT64_C(1) << bit; - const bool r = (*field & m) != 0; - *field |= m; - return r; -} + const volatile meta_t *txn_meta = head.ptr_v; + out->mi_last_pgno = txn_meta->geometry.first_unallocated - 1; + out->mi_geo.current = pgno2bytes(env, txn_meta->geometry.now); + if (txn) { + out->mi_last_pgno = txn->geo.first_unallocated - 1; + 
out->mi_geo.current = pgno2bytes(env, txn->geo.end_pgno); -static bool mincore_fetch(MDBX_env *const env, const size_t unit_begin) { - MDBX_lockinfo *const lck = env->me_lck; - for (size_t i = 1; i < ARRAY_LENGTH(lck->mti_mincore_cache.begin); ++i) { - const ptrdiff_t dist = unit_begin - lck->mti_mincore_cache.begin[i]; - if (likely(dist >= 0 && dist < 64)) { - const pgno_t tmp_begin = lck->mti_mincore_cache.begin[i]; - const uint64_t tmp_mask = lck->mti_mincore_cache.mask[i]; - do { - lck->mti_mincore_cache.begin[i] = lck->mti_mincore_cache.begin[i - 1]; - lck->mti_mincore_cache.mask[i] = lck->mti_mincore_cache.mask[i - 1]; - } while (--i); - lck->mti_mincore_cache.begin[0] = tmp_begin; - lck->mti_mincore_cache.mask[0] = tmp_mask; - return bit_tas(lck->mti_mincore_cache.mask, (char)dist); - } + const txnid_t wanna_meta_txnid = (txn->flags & MDBX_TXN_RDONLY) + ? txn->txnid + : txn->txnid - xMDBX_TXNID_STEP; + txn_meta = (out->mi_meta_txnid[0] == wanna_meta_txnid) ? meta0 : txn_meta; + txn_meta = (out->mi_meta_txnid[1] == wanna_meta_txnid) ? meta1 : txn_meta; + txn_meta = (out->mi_meta_txnid[2] == wanna_meta_txnid) ? meta2 : txn_meta; } + out->mi_geo.lower = pgno2bytes(env, txn_meta->geometry.lower); + out->mi_geo.upper = pgno2bytes(env, txn_meta->geometry.upper); + out->mi_geo.shrink = pgno2bytes(env, pv2pages(txn_meta->geometry.shrink_pv)); + out->mi_geo.grow = pgno2bytes(env, pv2pages(txn_meta->geometry.grow_pv)); + out->mi_mapsize = env->dxb_mmap.limit; - size_t pages = 64; - unsigned unit_log = sys_pagesize_ln2; - unsigned shift = 0; - if (env->me_psize > env->me_os_psize) { - unit_log = env->me_psize2log; - shift = env->me_psize2log - sys_pagesize_ln2; - pages <<= shift; - } + const lck_t *const lck = env->lck; + out->mi_maxreaders = env->max_readers; + out->mi_numreaders = env->lck_mmap.lck + ? 
atomic_load32(&lck->rdt_length, mo_Relaxed) + : INT32_MAX; + out->mi_dxb_pagesize = env->ps; + out->mi_sys_pagesize = globals.sys_pagesize; - const size_t offset = unit_begin << unit_log; - size_t length = pages << sys_pagesize_ln2; - if (offset + length > env->me_dxb_mmap.current) { - length = env->me_dxb_mmap.current - offset; - pages = length >> sys_pagesize_ln2; + if (likely(bytes > size_before_bootid)) { + const uint64_t unsynced_pages = + atomic_load64(&lck->unsynced_pages, mo_Relaxed) + + ((uint32_t)out->mi_recent_txnid != + atomic_load32(&lck->meta_sync_txnid, mo_Relaxed)); + out->mi_unsync_volume = pgno2bytes(env, (size_t)unsynced_pages); + const uint64_t monotime_now = osal_monotime(); + uint64_t ts = atomic_load64(&lck->eoos_timestamp, mo_Relaxed); + out->mi_since_sync_seconds16dot16 = + ts ? osal_monotime_to_16dot16_noUnderflow(monotime_now - ts) : 0; + ts = atomic_load64(&lck->readers_check_timestamp, mo_Relaxed); + out->mi_since_reader_check_seconds16dot16 = + ts ? osal_monotime_to_16dot16_noUnderflow(monotime_now - ts) : 0; + out->mi_autosync_threshold = + pgno2bytes(env, atomic_load32(&lck->autosync_threshold, mo_Relaxed)); + out->mi_autosync_period_seconds16dot16 = + osal_monotime_to_16dot16_noUnderflow( + atomic_load64(&lck->autosync_period, mo_Relaxed)); + out->mi_bootid.current.x = globals.bootid.x; + out->mi_bootid.current.y = globals.bootid.y; + out->mi_mode = env->lck_mmap.lck ? 
lck->envmode.weak : env->flags; } + if (likely(bytes > size_before_pgop_stat)) { #if MDBX_ENABLE_PGOP_STAT - env->me_lck->mti_pgop_stat.mincore.weak += 1; -#endif /* MDBX_ENABLE_PGOP_STAT */ - uint8_t *const vector = alloca(pages); - if (unlikely(mincore(ptr_disp(env->me_dxb_mmap.base, offset), length, - (void *)vector))) { - NOTICE("mincore(+%zu, %zu), err %d", offset, length, errno); - return false; - } - - for (size_t i = 1; i < ARRAY_LENGTH(lck->mti_mincore_cache.begin); ++i) { - lck->mti_mincore_cache.begin[i] = lck->mti_mincore_cache.begin[i - 1]; - lck->mti_mincore_cache.mask[i] = lck->mti_mincore_cache.mask[i - 1]; + out->mi_pgop_stat.newly = atomic_load64(&lck->pgops.newly, mo_Relaxed); + out->mi_pgop_stat.cow = atomic_load64(&lck->pgops.cow, mo_Relaxed); + out->mi_pgop_stat.clone = atomic_load64(&lck->pgops.clone, mo_Relaxed); + out->mi_pgop_stat.split = atomic_load64(&lck->pgops.split, mo_Relaxed); + out->mi_pgop_stat.merge = atomic_load64(&lck->pgops.merge, mo_Relaxed); + out->mi_pgop_stat.spill = atomic_load64(&lck->pgops.spill, mo_Relaxed); + out->mi_pgop_stat.unspill = atomic_load64(&lck->pgops.unspill, mo_Relaxed); + out->mi_pgop_stat.wops = atomic_load64(&lck->pgops.wops, mo_Relaxed); + out->mi_pgop_stat.prefault = + atomic_load64(&lck->pgops.prefault, mo_Relaxed); + out->mi_pgop_stat.mincore = atomic_load64(&lck->pgops.mincore, mo_Relaxed); + out->mi_pgop_stat.msync = atomic_load64(&lck->pgops.msync, mo_Relaxed); + out->mi_pgop_stat.fsync = atomic_load64(&lck->pgops.fsync, mo_Relaxed); +#else + memset(&out->mi_pgop_stat, 0, sizeof(out->mi_pgop_stat)); +#endif /* MDBX_ENABLE_PGOP_STAT*/ } - lck->mti_mincore_cache.begin[0] = unit_begin; - uint64_t mask = 0; -#ifdef MINCORE_INCORE - STATIC_ASSERT(MINCORE_INCORE == 1); -#endif - for (size_t i = 0; i < pages; ++i) { - uint64_t bit = (vector[i] & 1) == 0; - bit <<= i >> shift; - mask |= bit; + txnid_t overall_latter_reader_txnid = out->mi_recent_txnid; + txnid_t self_latter_reader_txnid = 
overall_latter_reader_txnid; + if (env->lck_mmap.lck) { + for (size_t i = 0; i < out->mi_numreaders; ++i) { + const uint32_t pid = atomic_load32(&lck->rdt[i].pid, mo_AcquireRelease); + if (pid) { + const txnid_t txnid = safe64_read(&lck->rdt[i].txnid); + if (overall_latter_reader_txnid > txnid) + overall_latter_reader_txnid = txnid; + if (pid == env->pid && self_latter_reader_txnid > txnid) + self_latter_reader_txnid = txnid; + } + } } + out->mi_self_latter_reader_txnid = self_latter_reader_txnid; + out->mi_latter_reader_txnid = overall_latter_reader_txnid; - lck->mti_mincore_cache.mask[0] = ~mask; - return bit_tas(lck->mti_mincore_cache.mask, 0); + osal_compiler_barrier(); + return MDBX_SUCCESS; } -#endif /* MDBX_ENABLE_MINCORE */ -MDBX_MAYBE_UNUSED static __inline bool mincore_probe(MDBX_env *const env, - const pgno_t pgno) { -#if MDBX_ENABLE_MINCORE - const size_t offset_aligned = - floor_powerof2(pgno2bytes(env, pgno), env->me_os_psize); - const unsigned unit_log2 = (env->me_psize2log > sys_pagesize_ln2) - ? 
env->me_psize2log - : sys_pagesize_ln2; - const size_t unit_begin = offset_aligned >> unit_log2; - eASSERT(env, (unit_begin << unit_log2) == offset_aligned); - const ptrdiff_t dist = unit_begin - env->me_lck->mti_mincore_cache.begin[0]; - if (likely(dist >= 0 && dist < 64)) - return bit_tas(env->me_lck->mti_mincore_cache.mask, (char)dist); - return mincore_fetch(env, unit_begin); -#else - (void)env; - (void)pgno; - return false; -#endif /* MDBX_ENABLE_MINCORE */ +__cold int env_info(const MDBX_env *env, const MDBX_txn *txn, MDBX_envinfo *out, + size_t bytes, troika_t *troika) { + MDBX_envinfo snap; + int rc = env_info_snap(env, txn, &snap, sizeof(snap), troika); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; + + eASSERT(env, sizeof(snap) >= bytes); + while (1) { + rc = env_info_snap(env, txn, out, bytes, troika); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; + snap.mi_since_sync_seconds16dot16 = out->mi_since_sync_seconds16dot16; + snap.mi_since_reader_check_seconds16dot16 = + out->mi_since_reader_check_seconds16dot16; + if (likely(memcmp(&snap, out, bytes) == 0)) + return MDBX_SUCCESS; + memcpy(&snap, out, bytes); + } } -static __inline pgr_t page_alloc_finalize(MDBX_env *const env, - MDBX_txn *const txn, - const MDBX_cursor *const mc, - const pgno_t pgno, const size_t num) { -#if MDBX_ENABLE_PROFGC - size_t majflt_before; - const uint64_t cputime_before = osal_cputime(&majflt_before); - profgc_stat_t *const prof = (mc->mc_dbi == FREE_DBI) - ? 
&env->me_lck->mti_pgop_stat.gc_prof.self - : &env->me_lck->mti_pgop_stat.gc_prof.work; -#else - (void)mc; -#endif /* MDBX_ENABLE_PROFGC */ - ENSURE(env, pgno >= NUM_METAS); +__cold int mdbx_env_info_ex(const MDBX_env *env, const MDBX_txn *txn, + MDBX_envinfo *arg, size_t bytes) { + if (unlikely((env == nullptr && txn == nullptr) || arg == nullptr)) + return MDBX_EINVAL; - pgr_t ret; - bool need_clean = (env->me_flags & MDBX_PAGEPERTURB) != 0; - if (env->me_flags & MDBX_WRITEMAP) { - ret.page = pgno2page(env, pgno); - MDBX_ASAN_UNPOISON_MEMORY_REGION(ret.page, pgno2bytes(env, num)); - VALGRIND_MAKE_MEM_UNDEFINED(ret.page, pgno2bytes(env, num)); + const size_t size_before_bootid = offsetof(MDBX_envinfo, mi_bootid); + const size_t size_before_pgop_stat = offsetof(MDBX_envinfo, mi_pgop_stat); + if (unlikely(bytes != sizeof(MDBX_envinfo)) && bytes != size_before_bootid && + bytes != size_before_pgop_stat) + return MDBX_EINVAL; - /* Содержимое выделенной страницы не нужно, но если страница отсутствует - * в ОЗУ (что весьма вероятно), то любое обращение к ней приведет - * к page-fault: - * - прерыванию по отсутствию страницы; - * - переключение контекста в режим ядра с засыпанием процесса; - * - чтение страницы с диска; - * - обновление PTE и пробуждением процесса; - * - переключение контекста по доступности ЦПУ. - * - * Пытаемся минимизировать накладные расходы записывая страницу, что при - * наличии unified page cache приведет к появлению страницы в ОЗУ без чтения - * с диска. При этом запись на диск должна быть отложена адекватным ядром, - * так как страница отображена в память в режиме чтения-записи и следом в - * неё пишет ЦПУ. */ - - /* В случае если страница в памяти процесса, то излишняя запись может быть - * достаточно дорогой. 
Кроме системного вызова и копирования данных, в особо - * одаренных ОС при этом могут включаться файловая система, выделяться - * временная страница, пополняться очереди асинхронного выполнения, - * обновляться PTE с последующей генерацией page-fault и чтением данных из - * грязной I/O очереди. Из-за этого штраф за лишнюю запись может быть - * сравним с избегаемым ненужным чтением. */ - if (env->me_prefault_write) { - void *const pattern = ptr_disp( - env->me_pbuf, need_clean ? env->me_psize : env->me_psize * 2); - size_t file_offset = pgno2bytes(env, pgno); - if (likely(num == 1)) { - if (!mincore_probe(env, pgno)) { - osal_pwrite(env->me_lazy_fd, pattern, env->me_psize, file_offset); -#if MDBX_ENABLE_PGOP_STAT - env->me_lck->mti_pgop_stat.prefault.weak += 1; -#endif /* MDBX_ENABLE_PGOP_STAT */ - need_clean = false; - } - } else { - struct iovec iov[MDBX_AUXILARY_IOV_MAX]; - size_t n = 0, cleared = 0; - for (size_t i = 0; i < num; ++i) { - if (!mincore_probe(env, pgno + (pgno_t)i)) { - ++cleared; - iov[n].iov_len = env->me_psize; - iov[n].iov_base = pattern; - if (unlikely(++n == MDBX_AUXILARY_IOV_MAX)) { - osal_pwritev(env->me_lazy_fd, iov, MDBX_AUXILARY_IOV_MAX, - file_offset); -#if MDBX_ENABLE_PGOP_STAT - env->me_lck->mti_pgop_stat.prefault.weak += 1; -#endif /* MDBX_ENABLE_PGOP_STAT */ - file_offset += pgno2bytes(env, MDBX_AUXILARY_IOV_MAX); - n = 0; - } - } - } - if (likely(n > 0)) { - osal_pwritev(env->me_lazy_fd, iov, n, file_offset); -#if MDBX_ENABLE_PGOP_STAT - env->me_lck->mti_pgop_stat.prefault.weak += 1; -#endif /* MDBX_ENABLE_PGOP_STAT */ - } - if (cleared == num) - need_clean = false; - } - } - } else { - ret.page = page_malloc(txn, num); - if (unlikely(!ret.page)) { - ret.err = MDBX_ENOMEM; - goto bailout; - } + if (txn) { + int err = check_txn(txn, MDBX_TXN_BLOCKED - MDBX_TXN_ERROR); + if (unlikely(err != MDBX_SUCCESS)) + return err; } - - if (unlikely(need_clean)) - memset(ret.page, -1, pgno2bytes(env, num)); - - 
VALGRIND_MAKE_MEM_UNDEFINED(ret.page, pgno2bytes(env, num)); - ret.page->mp_pgno = pgno; - ret.page->mp_leaf2_ksize = 0; - ret.page->mp_flags = 0; - if ((ASSERT_ENABLED() || AUDIT_ENABLED()) && num > 1) { - ret.page->mp_pages = (pgno_t)num; - ret.page->mp_flags = P_OVERFLOW; + if (env) { + int err = check_env(env, false); + if (unlikely(err != MDBX_SUCCESS)) + return err; + if (txn && unlikely(txn->env != env)) + return MDBX_EINVAL; + } else { + env = txn->env; } - ret.err = page_dirty(txn, ret.page, (pgno_t)num); -bailout: - tASSERT(txn, pnl_check_allocated(txn->tw.relist, - txn->mt_next_pgno - MDBX_ENABLE_REFUND)); -#if MDBX_ENABLE_PROFGC - size_t majflt_after; - prof->xtime_cpu += osal_cputime(&majflt_after) - cputime_before; - prof->majflt += (uint32_t)(majflt_after - majflt_before); -#endif /* MDBX_ENABLE_PROFGC */ - return ret; + troika_t troika; + return env_info(env, txn, arg, bytes, &troika); } -struct monotime_cache { - uint64_t value; - int expire_countdown; -}; - -static __inline uint64_t monotime_since_cached(uint64_t begin_timestamp, - struct monotime_cache *cache) { - if (cache->expire_countdown) - cache->expire_countdown -= 1; - else { - cache->value = osal_monotime(); - cache->expire_countdown = 42 / 3; +__cold int mdbx_preopen_snapinfo(const char *pathname, MDBX_envinfo *out, + size_t bytes) { +#if defined(_WIN32) || defined(_WIN64) + wchar_t *pathnameW = nullptr; + int rc = osal_mb2w(pathname, &pathnameW); + if (likely(rc == MDBX_SUCCESS)) { + rc = mdbx_preopen_snapinfoW(pathnameW, out, bytes); + osal_free(pathnameW); } - return cache->value - begin_timestamp; + return rc; } -static pgr_t page_alloc_slowpath(const MDBX_cursor *const mc, const size_t num, - uint8_t flags) { - pgr_t ret; - MDBX_txn *const txn = mc->mc_txn; - MDBX_env *const env = txn->mt_env; -#if MDBX_ENABLE_PROFGC - profgc_stat_t *const prof = (mc->mc_dbi == FREE_DBI) - ? 
&env->me_lck->mti_pgop_stat.gc_prof.self - : &env->me_lck->mti_pgop_stat.gc_prof.work; - prof->spe_counter += 1; -#endif /* MDBX_ENABLE_PROFGC */ +__cold int mdbx_preopen_snapinfoW(const wchar_t *pathname, MDBX_envinfo *out, + size_t bytes) { +#endif /* Windows */ + if (unlikely(!out)) + return MDBX_EINVAL; - eASSERT(env, num > 0 || (flags & MDBX_ALLOC_RESERVE)); - eASSERT(env, pnl_check_allocated(txn->tw.relist, - txn->mt_next_pgno - MDBX_ENABLE_REFUND)); + const size_t size_before_bootid = offsetof(MDBX_envinfo, mi_bootid); + const size_t size_before_pgop_stat = offsetof(MDBX_envinfo, mi_pgop_stat); + if (unlikely(bytes != sizeof(MDBX_envinfo)) && bytes != size_before_bootid && + bytes != size_before_pgop_stat) + return MDBX_EINVAL; - size_t newnext; - const uint64_t monotime_begin = - (MDBX_ENABLE_PROFGC || (num > 1 && env->me_options.gc_time_limit)) - ? osal_monotime() - : 0; - struct monotime_cache now_cache; - now_cache.expire_countdown = - 1 /* старт с 1 позволяет избавиться как от лишних системных вызовов когда - лимит времени задан нулевой или уже исчерпан, так и от подсчета - времени при не-достижении rp_augment_limit */ - ; - now_cache.value = monotime_begin; - pgno_t pgno = 0; - if (num > 1) { -#if MDBX_ENABLE_PROFGC - prof->xpages += 1; -#endif /* MDBX_ENABLE_PROFGC */ - if (MDBX_PNL_GETSIZE(txn->tw.relist) >= num) { - eASSERT(env, MDBX_PNL_LAST(txn->tw.relist) < txn->mt_next_pgno && - MDBX_PNL_FIRST(txn->tw.relist) < txn->mt_next_pgno); - pgno = relist_get_sequence(txn, num, flags); - if (likely(pgno)) - goto done; - } - } else { - eASSERT(env, num == 0 || MDBX_PNL_GETSIZE(txn->tw.relist) == 0); - eASSERT(env, !(flags & MDBX_ALLOC_RESERVE) || num == 0); + memset(out, 0, bytes); + if (likely(bytes > size_before_bootid)) { + out->mi_bootid.current.x = globals.bootid.x; + out->mi_bootid.current.y = globals.bootid.y; } - //--------------------------------------------------------------------------- - - if (unlikely(!is_gc_usable(txn, mc, flags))) { - 
eASSERT(env, (txn->mt_flags & MDBX_TXN_DRAINED_GC) || num > 1); - goto no_gc; + MDBX_env env; + memset(&env, 0, sizeof(env)); + env.pid = osal_getpid(); + if (unlikely(!is_powerof2(globals.sys_pagesize) || + globals.sys_pagesize < MDBX_MIN_PAGESIZE)) { + ERROR("unsuitable system pagesize %u", globals.sys_pagesize); + return MDBX_INCOMPATIBLE; } + out->mi_sys_pagesize = globals.sys_pagesize; + env.flags = MDBX_RDONLY | MDBX_NORDAHEAD | MDBX_ACCEDE | MDBX_VALIDATION; + env.stuck_meta = -1; + env.lck_mmap.fd = INVALID_HANDLE_VALUE; + env.lazy_fd = INVALID_HANDLE_VALUE; + env.dsync_fd = INVALID_HANDLE_VALUE; + env.fd4meta = INVALID_HANDLE_VALUE; +#if defined(_WIN32) || defined(_WIN64) + env.dxb_lock_event = INVALID_HANDLE_VALUE; + env.ioring.overlapped_fd = INVALID_HANDLE_VALUE; +#endif /* Windows */ + env_options_init(&env); - eASSERT(env, (flags & (MDBX_ALLOC_COALESCE | MDBX_ALLOC_LIFO | - MDBX_ALLOC_SHOULD_SCAN)) == 0); - flags += (env->me_flags & MDBX_LIFORECLAIM) ? MDBX_ALLOC_LIFO : 0; + int rc = env_handle_pathname(&env, pathname, 0); + if (unlikely(rc != MDBX_SUCCESS)) + goto bailout; + rc = osal_openfile(MDBX_OPEN_DXB_READ, &env, env.pathname.dxb, &env.lazy_fd, + 0); + if (unlikely(rc != MDBX_SUCCESS)) + goto bailout; - if (/* Не коагулируем записи при подготовке резерва для обновления GC. - * Иначе попытка увеличить резерв может приводить к необходимости ещё - * большего резерва из-за увеличения списка переработанных страниц. 
*/ - (flags & MDBX_ALLOC_RESERVE) == 0) { - if (txn->mt_dbs[FREE_DBI].md_branch_pages && - MDBX_PNL_GETSIZE(txn->tw.relist) < env->me_maxgc_ov1page / 2) - flags += MDBX_ALLOC_COALESCE; - } + meta_t header; + rc = dxb_read_header(&env, &header, 0, 0); + if (unlikely(rc != MDBX_SUCCESS)) + goto bailout; - MDBX_cursor *const gc = ptr_disp(env->me_txn0, sizeof(MDBX_txn)); - eASSERT(env, mc != gc && gc->mc_next == nullptr); - gc->mc_txn = txn; - gc->mc_flags = 0; + out->mi_dxb_pagesize = env_setup_pagesize(&env, header.pagesize); + out->mi_geo.lower = pgno2bytes(&env, header.geometry.lower); + out->mi_geo.upper = pgno2bytes(&env, header.geometry.upper); + out->mi_geo.shrink = pgno2bytes(&env, pv2pages(header.geometry.shrink_pv)); + out->mi_geo.grow = pgno2bytes(&env, pv2pages(header.geometry.grow_pv)); + out->mi_geo.current = pgno2bytes(&env, header.geometry.now); + out->mi_last_pgno = header.geometry.first_unallocated - 1; - env->me_prefault_write = env->me_options.prefault_write; - if (env->me_prefault_write) { - /* Проверка посредством minicore() существенно снижает затраты, но в - * простейших случаях (тривиальный бенчмарк) интегральная производительность - * становится вдвое меньше. А на платформах без mincore() и с проблемной - * подсистемой виртуальной памяти ситуация может быть многократно хуже. - * Поэтому избегаем затрат в ситуациях когда prefault-write скорее всего не - * нужна. 
*/ - const bool readahead_enabled = env->me_lck->mti_readahead_anchor & 1; - const pgno_t readahead_edge = env->me_lck->mti_readahead_anchor >> 1; - if (/* Не суетимся если GC почти пустая и БД маленькая */ - (txn->mt_dbs[FREE_DBI].md_branch_pages == 0 && - txn->mt_geo.now < 1234) || - /* Не суетимся если страница в зоне включенного упреждающего чтения */ - (readahead_enabled && pgno + num < readahead_edge)) - env->me_prefault_write = false; - } + const unsigned n = 0; + out->mi_recent_txnid = constmeta_txnid(&header); + out->mi_meta_sign[n] = unaligned_peek_u64(4, &header.sign); + if (likely(bytes > size_before_bootid)) + memcpy(&out->mi_bootid.meta[n], &header.bootid, 16); -retry_gc_refresh_oldest:; - txnid_t oldest = txn_oldest_reader(txn); -retry_gc_have_oldest: - if (unlikely(oldest >= txn->mt_txnid)) { - ERROR("unexpected/invalid oldest-readed txnid %" PRIaTXN - " for current-txnid %" PRIaTXN, - oldest, txn->mt_txnid); - ret.err = MDBX_PROBLEM; - goto fail; - } - const txnid_t detent = oldest + 1; +bailout: + env_close(&env, false); + return rc; +} - txnid_t id = 0; - MDBX_cursor_op op = MDBX_FIRST; - if (flags & MDBX_ALLOC_LIFO) { - if (!txn->tw.lifo_reclaimed) { - txn->tw.lifo_reclaimed = txl_alloc(); - if (unlikely(!txn->tw.lifo_reclaimed)) { - ret.err = MDBX_ENOMEM; - goto fail; - } - } - /* Begin lookup backward from oldest reader */ - id = detent - 1; - op = MDBX_SET_RANGE; - } else if (txn->tw.last_reclaimed) { - /* Continue lookup forward from last-reclaimed */ - id = txn->tw.last_reclaimed + 1; - if (id >= detent) - goto depleted_gc; - op = MDBX_SET_RANGE; - } +/*----------------------------------------------------------------------------*/ -next_gc:; - MDBX_val key; - key.iov_base = &id; - key.iov_len = sizeof(id); +__cold int mdbx_env_set_geometry(MDBX_env *env, intptr_t size_lower, + intptr_t size_now, intptr_t size_upper, + intptr_t growth_step, + intptr_t shrink_threshold, intptr_t pagesize) { + int rc = check_env(env, false); + if (unlikely(rc 
!= MDBX_SUCCESS)) + return rc; -#if MDBX_ENABLE_PROFGC - prof->rsteps += 1; -#endif /* MDBX_ENABLE_PROFGC */ + const bool txn0_owned = env->basal_txn && env_txn0_owned(env); + const bool inside_txn = txn0_owned && env->txn; + bool should_unlock = false; - /* Seek first/next GC record */ - ret.err = cursor_get(gc, &key, NULL, op); - if (unlikely(ret.err != MDBX_SUCCESS)) { - if (unlikely(ret.err != MDBX_NOTFOUND)) - goto fail; - if ((flags & MDBX_ALLOC_LIFO) && op == MDBX_SET_RANGE) { - op = MDBX_PREV; - goto next_gc; - } - goto depleted_gc; - } - if (unlikely(key.iov_len != sizeof(txnid_t))) { - ERROR("%s/%d: %s", "MDBX_CORRUPTED", MDBX_CORRUPTED, - "invalid GC key-length"); - ret.err = MDBX_CORRUPTED; - goto fail; - } - id = unaligned_peek_u64(4, key.iov_base); - if (flags & MDBX_ALLOC_LIFO) { - op = MDBX_PREV; - if (id >= detent || is_already_reclaimed(txn, id)) - goto next_gc; - } else { - op = MDBX_NEXT; - if (unlikely(id >= detent)) - goto depleted_gc; +#if MDBX_DEBUG + if (growth_step < 0) { + growth_step = 1; + if (shrink_threshold < 0) + shrink_threshold = 1; } - txn->mt_flags &= ~MDBX_TXN_DRAINED_GC; +#endif /* MDBX_DEBUG */ - /* Reading next GC record */ - MDBX_val data; - MDBX_page *const mp = gc->mc_pg[gc->mc_top]; - if (unlikely((ret.err = node_read(gc, page_node(mp, gc->mc_ki[gc->mc_top]), - &data, mp)) != MDBX_SUCCESS)) - goto fail; + if (env->dxb_mmap.base) { + /* env already mapped */ + if (unlikely(env->flags & MDBX_RDONLY)) + return MDBX_EACCESS; - pgno_t *gc_pnl = (pgno_t *)data.iov_base; - if (unlikely(data.iov_len % sizeof(pgno_t) || - data.iov_len < MDBX_PNL_SIZEOF(gc_pnl) || - !pnl_check(gc_pnl, txn->mt_next_pgno))) { - ERROR("%s/%d: %s", "MDBX_CORRUPTED", MDBX_CORRUPTED, - "invalid GC value-length"); - ret.err = MDBX_CORRUPTED; - goto fail; - } + if (!txn0_owned) { + int err = lck_txn_lock(env, false); + if (unlikely(err != MDBX_SUCCESS)) + return err; + should_unlock = true; + env->basal_txn->tw.troika = meta_tap(env); + eASSERT(env, 
!env->txn && !env->basal_txn->nested); + env->basal_txn->txnid = + env->basal_txn->tw.troika.txnid[env->basal_txn->tw.troika.recent]; + txn_snapshot_oldest(env->basal_txn); + } - const size_t gc_len = MDBX_PNL_GETSIZE(gc_pnl); - TRACE("gc-read: id #%" PRIaTXN " len %zu, re-list will %zu ", id, gc_len, - gc_len + MDBX_PNL_GETSIZE(txn->tw.relist)); + /* get untouched params from current TXN or DB */ + if (pagesize <= 0 || pagesize >= INT_MAX) + pagesize = env->ps; + const geo_t *const geo = + inside_txn + ? &env->txn->geo + : &meta_recent(env, &env->basal_txn->tw.troika).ptr_c->geometry; + if (size_lower < 0) + size_lower = pgno2bytes(env, geo->lower); + if (size_now < 0) + size_now = pgno2bytes(env, geo->now); + if (size_upper < 0) + size_upper = pgno2bytes(env, geo->upper); + if (growth_step < 0) + growth_step = pgno2bytes(env, pv2pages(geo->grow_pv)); + if (shrink_threshold < 0) + shrink_threshold = pgno2bytes(env, pv2pages(geo->shrink_pv)); - if (unlikely(gc_len + MDBX_PNL_GETSIZE(txn->tw.relist) >= - env->me_maxgc_ov1page)) { - /* Don't try to coalesce too much. 
*/ - if (flags & MDBX_ALLOC_SHOULD_SCAN) { - eASSERT(env, flags & MDBX_ALLOC_COALESCE); - eASSERT(env, !(flags & MDBX_ALLOC_RESERVE)); - eASSERT(env, num > 0); -#if MDBX_ENABLE_PROFGC - env->me_lck->mti_pgop_stat.gc_prof.coalescences += 1; -#endif /* MDBX_ENABLE_PROFGC */ - TRACE("clear %s %s", "MDBX_ALLOC_COALESCE", "since got threshold"); - if (MDBX_PNL_GETSIZE(txn->tw.relist) >= num) { - eASSERT(env, MDBX_PNL_LAST(txn->tw.relist) < txn->mt_next_pgno && - MDBX_PNL_FIRST(txn->tw.relist) < txn->mt_next_pgno); - if (likely(num == 1)) { - pgno = relist_get_single(txn); - goto done; - } - pgno = relist_get_sequence(txn, num, flags); - if (likely(pgno)) - goto done; - } - flags -= MDBX_ALLOC_COALESCE | MDBX_ALLOC_SHOULD_SCAN; + if (pagesize != (intptr_t)env->ps) { + rc = MDBX_EINVAL; + goto bailout; } - if (unlikely(/* list is too long already */ MDBX_PNL_GETSIZE( - txn->tw.relist) >= env->me_options.rp_augment_limit) && - ((/* not a slot-request from gc-update */ num && - /* have enough unallocated space */ txn->mt_geo.upper >= - txn->mt_next_pgno + num && - monotime_since_cached(monotime_begin, &now_cache) + - txn->tw.gc_time_acc >= - env->me_options.gc_time_limit) || - gc_len + MDBX_PNL_GETSIZE(txn->tw.relist) >= MDBX_PGL_LIMIT)) { - /* Stop reclaiming to avoid large/overflow the page list. This is a rare - * case while search for a continuously multi-page region in a - * large database, see https://libmdbx.dqdkfa.ru/dead-github/issues/123 */ - NOTICE("stop reclaiming %s: %zu (current) + %zu " - "(chunk) -> %zu, rp_augment_limit %u", - likely(gc_len + MDBX_PNL_GETSIZE(txn->tw.relist) < MDBX_PGL_LIMIT) - ? 
"since rp_augment_limit was reached" - : "to avoid PNL overflow", - MDBX_PNL_GETSIZE(txn->tw.relist), gc_len, - gc_len + MDBX_PNL_GETSIZE(txn->tw.relist), - env->me_options.rp_augment_limit); - goto depleted_gc; + const size_t usedbytes = + pgno2bytes(env, mvcc_snapshot_largest(env, geo->first_unallocated)); + if ((size_t)size_upper < usedbytes) { + rc = MDBX_MAP_FULL; + goto bailout; } - } - - /* Remember ID of readed GC record */ - txn->tw.last_reclaimed = id; - if (flags & MDBX_ALLOC_LIFO) { - ret.err = txl_append(&txn->tw.lifo_reclaimed, id); - if (unlikely(ret.err != MDBX_SUCCESS)) - goto fail; - } - - /* Append PNL from GC record to tw.relist */ - ret.err = pnl_need(&txn->tw.relist, gc_len); - if (unlikely(ret.err != MDBX_SUCCESS)) - goto fail; + if ((size_t)size_now < usedbytes) + size_now = usedbytes; + } else { + /* env NOT yet mapped */ + if (unlikely(inside_txn)) + return MDBX_PANIC; - if (LOG_ENABLED(MDBX_LOG_EXTRA)) { - DEBUG_EXTRA("readed GC-pnl txn %" PRIaTXN " root %" PRIaPGNO - " len %zu, PNL", - id, txn->mt_dbs[FREE_DBI].md_root, gc_len); - for (size_t i = gc_len; i; i--) - DEBUG_EXTRA_PRINT(" %" PRIaPGNO, gc_pnl[i]); - DEBUG_EXTRA_PRINT(", next_pgno %u\n", txn->mt_next_pgno); - } + /* is requested some auto-value for pagesize ? 
*/ + if (pagesize >= INT_MAX /* maximal */) + pagesize = MDBX_MAX_PAGESIZE; + else if (pagesize <= 0) { + if (pagesize < 0 /* default */) { + pagesize = globals.sys_pagesize; + if ((uintptr_t)pagesize > MDBX_MAX_PAGESIZE) + pagesize = MDBX_MAX_PAGESIZE; + eASSERT(env, (uintptr_t)pagesize >= MDBX_MIN_PAGESIZE); + } else if (pagesize == 0 /* minimal */) + pagesize = MDBX_MIN_PAGESIZE; - /* Merge in descending sorted order */ - pnl_merge(txn->tw.relist, gc_pnl); - flags |= MDBX_ALLOC_SHOULD_SCAN; - if (AUDIT_ENABLED()) { - if (unlikely(!pnl_check(txn->tw.relist, txn->mt_next_pgno))) { - ERROR("%s/%d: %s", "MDBX_CORRUPTED", MDBX_CORRUPTED, - "invalid txn retired-list"); - ret.err = MDBX_CORRUPTED; - goto fail; + /* choose pagesize */ + intptr_t top = (size_now > size_lower) ? size_now : size_lower; + if (size_upper > top) + top = size_upper; + if (top < 0 /* default */) + top = reasonable_db_maxsize(); + else if (top == 0 /* minimal */) + top = MIN_MAPSIZE; + else if (top >= (intptr_t)MAX_MAPSIZE /* maximal */) + top = MAX_MAPSIZE; + + while (top > pagesize * (int64_t)(MAX_PAGENO + 1) && + pagesize < MDBX_MAX_PAGESIZE) + pagesize <<= 1; } - } else { - eASSERT(env, pnl_check_allocated(txn->tw.relist, txn->mt_next_pgno)); } - eASSERT(env, dirtylist_check(txn)); - eASSERT(env, MDBX_PNL_GETSIZE(txn->tw.relist) == 0 || - MDBX_PNL_MOST(txn->tw.relist) < txn->mt_next_pgno); - if (MDBX_ENABLE_REFUND && MDBX_PNL_GETSIZE(txn->tw.relist) && - unlikely(MDBX_PNL_MOST(txn->tw.relist) == txn->mt_next_pgno - 1)) { - /* Refund suitable pages into "unallocated" space */ - txn_refund(txn); + if (pagesize < (intptr_t)MDBX_MIN_PAGESIZE || + pagesize > (intptr_t)MDBX_MAX_PAGESIZE || !is_powerof2(pagesize)) { + rc = MDBX_EINVAL; + goto bailout; } - eASSERT(env, pnl_check_allocated(txn->tw.relist, - txn->mt_next_pgno - MDBX_ENABLE_REFUND)); - /* Done for a kick-reclaim mode, actually no page needed */ - if (unlikely(num == 0)) { - eASSERT(env, ret.err == MDBX_SUCCESS); - TRACE("%s: last id 
#%" PRIaTXN ", re-len %zu", "early-exit for slot", id, - MDBX_PNL_GETSIZE(txn->tw.relist)); - goto early_exit; + if (size_lower <= 0) { + size_lower = MIN_MAPSIZE; + if (MIN_MAPSIZE / pagesize < MIN_PAGENO) + size_lower = MIN_PAGENO * pagesize; } - - /* TODO: delete reclaimed records */ - - eASSERT(env, op == MDBX_PREV || op == MDBX_NEXT); - if (flags & MDBX_ALLOC_COALESCE) { - TRACE("%s: last id #%" PRIaTXN ", re-len %zu", "coalesce-continue", id, - MDBX_PNL_GETSIZE(txn->tw.relist)); - goto next_gc; + if (size_lower >= INTPTR_MAX) { + size_lower = reasonable_db_maxsize(); + if ((size_t)size_lower / pagesize > MAX_PAGENO + 1) + size_lower = pagesize * (MAX_PAGENO + 1); } -scan: - eASSERT(env, flags & MDBX_ALLOC_SHOULD_SCAN); - eASSERT(env, num > 0); - if (MDBX_PNL_GETSIZE(txn->tw.relist) >= num) { - eASSERT(env, MDBX_PNL_LAST(txn->tw.relist) < txn->mt_next_pgno && - MDBX_PNL_FIRST(txn->tw.relist) < txn->mt_next_pgno); - if (likely(num == 1)) { - eASSERT(env, !(flags & MDBX_ALLOC_RESERVE)); - pgno = relist_get_single(txn); - goto done; - } - pgno = relist_get_sequence(txn, num, flags); - if (likely(pgno)) - goto done; + if (size_now <= 0) { + size_now = size_lower; + if (size_upper >= size_lower && size_now > size_upper) + size_now = size_upper; } - flags -= MDBX_ALLOC_SHOULD_SCAN; - if (ret.err == MDBX_SUCCESS) { - TRACE("%s: last id #%" PRIaTXN ", re-len %zu", "continue-search", id, - MDBX_PNL_GETSIZE(txn->tw.relist)); - goto next_gc; + if (size_now >= INTPTR_MAX) { + size_now = reasonable_db_maxsize(); + if ((size_t)size_now / pagesize > MAX_PAGENO + 1) + size_now = pagesize * (MAX_PAGENO + 1); } -depleted_gc: - TRACE("%s: last id #%" PRIaTXN ", re-len %zu", "gc-depleted", id, - MDBX_PNL_GETSIZE(txn->tw.relist)); - ret.err = MDBX_NOTFOUND; - if (flags & MDBX_ALLOC_SHOULD_SCAN) - goto scan; - txn->mt_flags |= MDBX_TXN_DRAINED_GC; - - //------------------------------------------------------------------------- - - /* There is no suitable pages in the GC and to be 
able to allocate - * we should CHOICE one of: - * - make a new steady checkpoint if reclaiming was stopped by - * the last steady-sync, or wipe it in the MDBX_UTTERLY_NOSYNC mode; - * - kick lagging reader(s) if reclaiming was stopped by ones of it. - * - extend the database file. */ - - /* Will use new pages from the map if nothing is suitable in the GC. */ - newnext = txn->mt_next_pgno + num; - - /* Does reclaiming stopped at the last steady point? */ - const meta_ptr_t recent = meta_recent(env, &txn->tw.troika); - const meta_ptr_t prefer_steady = meta_prefer_steady(env, &txn->tw.troika); - if (recent.ptr_c != prefer_steady.ptr_c && prefer_steady.is_steady && - detent == prefer_steady.txnid + 1) { - DEBUG("gc-kick-steady: recent %" PRIaTXN "-%s, steady %" PRIaTXN - "-%s, detent %" PRIaTXN, - recent.txnid, durable_caption(recent.ptr_c), prefer_steady.txnid, - durable_caption(prefer_steady.ptr_c), detent); - const pgno_t autosync_threshold = - atomic_load32(&env->me_lck->mti_autosync_threshold, mo_Relaxed); - const uint64_t autosync_period = - atomic_load64(&env->me_lck->mti_autosync_period, mo_Relaxed); - uint64_t eoos_timestamp; - /* wipe the last steady-point if one of: - * - UTTERLY_NOSYNC mode AND auto-sync threshold is NOT specified - * - UTTERLY_NOSYNC mode AND free space at steady-point is exhausted - * otherwise, make a new steady-point if one of: - * - auto-sync threshold is specified and reached; - * - upper limit of database size is reached; - * - database is full (with the current file size) - * AND auto-sync threshold it NOT specified */ - if (F_ISSET(env->me_flags, MDBX_UTTERLY_NOSYNC) && - ((autosync_threshold | autosync_period) == 0 || - newnext >= prefer_steady.ptr_c->mm_geo.now)) { - /* wipe steady checkpoint in MDBX_UTTERLY_NOSYNC mode - * without any auto-sync threshold(s). 
*/ -#if MDBX_ENABLE_PROFGC - env->me_lck->mti_pgop_stat.gc_prof.wipes += 1; -#endif /* MDBX_ENABLE_PROFGC */ - ret.err = wipe_steady(txn, detent); - DEBUG("gc-wipe-steady, rc %d", ret.err); - if (unlikely(ret.err != MDBX_SUCCESS)) - goto fail; - eASSERT(env, prefer_steady.ptr_c != - meta_prefer_steady(env, &txn->tw.troika).ptr_c); - goto retry_gc_refresh_oldest; - } - if ((autosync_threshold && - atomic_load64(&env->me_lck->mti_unsynced_pages, mo_Relaxed) >= - autosync_threshold) || - (autosync_period && - (eoos_timestamp = - atomic_load64(&env->me_lck->mti_eoos_timestamp, mo_Relaxed)) && - osal_monotime() - eoos_timestamp >= autosync_period) || - newnext >= txn->mt_geo.upper || - ((num == 0 || newnext >= txn->mt_end_pgno) && - (autosync_threshold | autosync_period) == 0)) { - /* make steady checkpoint. */ -#if MDBX_ENABLE_PROFGC - env->me_lck->mti_pgop_stat.gc_prof.flushes += 1; -#endif /* MDBX_ENABLE_PROFGC */ - MDBX_meta meta = *recent.ptr_c; - ret.err = sync_locked(env, env->me_flags & MDBX_WRITEMAP, &meta, - &txn->tw.troika); - DEBUG("gc-make-steady, rc %d", ret.err); - eASSERT(env, ret.err != MDBX_RESULT_TRUE); - if (unlikely(ret.err != MDBX_SUCCESS)) - goto fail; - eASSERT(env, prefer_steady.ptr_c != - meta_prefer_steady(env, &txn->tw.troika).ptr_c); - goto retry_gc_refresh_oldest; + if (size_upper <= 0) { + if (growth_step == 0 || size_upper == 0) + size_upper = size_now; + else if (size_now >= reasonable_db_maxsize() / 2) + size_upper = reasonable_db_maxsize(); + else if ((size_t)size_now >= MAX_MAPSIZE32 / 2 && + (size_t)size_now <= MAX_MAPSIZE32 / 4 * 3) + size_upper = MAX_MAPSIZE32; + else { + size_upper = ceil_powerof2(((size_t)size_now < MAX_MAPSIZE / 4) + ? 
size_now + size_now + : size_now + size_now / 2, + MEGABYTE * MDBX_WORDBITS * MDBX_WORDBITS / 32); + if ((size_t)size_upper > MAX_MAPSIZE) + size_upper = MAX_MAPSIZE; } + if ((size_t)size_upper / pagesize > (MAX_PAGENO + 1)) + size_upper = pagesize * (MAX_PAGENO + 1); + } else if (size_upper >= INTPTR_MAX) { + size_upper = reasonable_db_maxsize(); + if ((size_t)size_upper / pagesize > MAX_PAGENO + 1) + size_upper = pagesize * (MAX_PAGENO + 1); } - if (unlikely(true == atomic_load32(&env->me_lck->mti_readers_refresh_flag, - mo_AcquireRelease))) { - oldest = txn_oldest_reader(txn); - if (oldest >= detent) - goto retry_gc_have_oldest; + if (unlikely(size_lower < (intptr_t)MIN_MAPSIZE || size_lower > size_upper)) { + rc = MDBX_EINVAL; + goto bailout; } - /* Avoid kick lagging reader(s) if is enough unallocated space - * at the end of database file. */ - if (!(flags & MDBX_ALLOC_RESERVE) && newnext <= txn->mt_end_pgno) { - eASSERT(env, pgno == 0); - goto done; + if ((uint64_t)size_lower / pagesize < MIN_PAGENO) { + size_lower = pagesize * MIN_PAGENO; + if (unlikely(size_lower > size_upper)) { + rc = MDBX_EINVAL; + goto bailout; + } + if (size_now < size_lower) + size_now = size_lower; } - if (oldest < txn->mt_txnid - xMDBX_TXNID_STEP) { - oldest = kick_longlived_readers(env, oldest); - if (oldest >= detent) - goto retry_gc_have_oldest; + if (unlikely((size_t)size_upper > MAX_MAPSIZE || + (uint64_t)size_upper / pagesize > MAX_PAGENO + 1)) { + rc = MDBX_TOO_LARGE; + goto bailout; } - //--------------------------------------------------------------------------- + const size_t unit = (globals.sys_pagesize > (size_t)pagesize) + ? 
globals.sys_pagesize + : (size_t)pagesize; + size_lower = ceil_powerof2(size_lower, unit); + size_upper = ceil_powerof2(size_upper, unit); + size_now = ceil_powerof2(size_now, unit); -no_gc: - eASSERT(env, pgno == 0); -#ifndef MDBX_ENABLE_BACKLOG_DEPLETED -#define MDBX_ENABLE_BACKLOG_DEPLETED 0 -#endif /* MDBX_ENABLE_BACKLOG_DEPLETED*/ - if (MDBX_ENABLE_BACKLOG_DEPLETED && - unlikely(!(txn->mt_flags & MDBX_TXN_DRAINED_GC))) { - ret.err = MDBX_BACKLOG_DEPLETED; - goto fail; - } - if (flags & MDBX_ALLOC_RESERVE) { - ret.err = MDBX_NOTFOUND; - goto fail; + /* LY: подбираем значение size_upper: + * - кратное размеру страницы + * - без нарушения MAX_MAPSIZE и MAX_PAGENO */ + while (unlikely((size_t)size_upper > MAX_MAPSIZE || + (uint64_t)size_upper / pagesize > MAX_PAGENO + 1)) { + if ((size_t)size_upper < unit + MIN_MAPSIZE || + (size_t)size_upper < (size_t)pagesize * (MIN_PAGENO + 1)) { + /* паранойа на случай переполнения при невероятных значениях */ + rc = MDBX_EINVAL; + goto bailout; + } + size_upper -= unit; + if ((size_t)size_upper < (size_t)size_lower) + size_lower = size_upper; } + eASSERT(env, (size_upper - size_lower) % globals.sys_pagesize == 0); - /* Will use new pages from the map if nothing is suitable in the GC. 
*/ - newnext = txn->mt_next_pgno + num; - if (newnext <= txn->mt_end_pgno) - goto done; + if (size_now < size_lower) + size_now = size_lower; + if (size_now > size_upper) + size_now = size_upper; - if (newnext > txn->mt_geo.upper || !txn->mt_geo.grow_pv) { - NOTICE("gc-alloc: next %zu > upper %" PRIaPGNO, newnext, txn->mt_geo.upper); - ret.err = MDBX_MAP_FULL; - goto fail; + if (growth_step < 0) { + growth_step = ((size_t)(size_upper - size_lower)) / 42; + if (growth_step > size_lower && size_lower < (intptr_t)MEGABYTE) + growth_step = size_lower; + if (growth_step < 65536) + growth_step = 65536; + if ((size_t)growth_step > MAX_MAPSIZE / 64) + growth_step = MAX_MAPSIZE / 64; } + if (growth_step == 0 && shrink_threshold > 0) + growth_step = 1; + growth_step = ceil_powerof2(growth_step, unit); - eASSERT(env, newnext > txn->mt_end_pgno); - const size_t grow_step = pv2pages(txn->mt_geo.grow_pv); - size_t aligned = pgno_align2os_pgno( - env, (pgno_t)(newnext + grow_step - newnext % grow_step)); + if (shrink_threshold < 0) + shrink_threshold = growth_step + growth_step; + shrink_threshold = ceil_powerof2(shrink_threshold, unit); - if (aligned > txn->mt_geo.upper) - aligned = txn->mt_geo.upper; - eASSERT(env, aligned >= newnext); + //---------------------------------------------------------------------------- - VERBOSE("try growth datafile to %zu pages (+%zu)", aligned, - aligned - txn->mt_end_pgno); - ret.err = dxb_resize(env, txn->mt_next_pgno, (pgno_t)aligned, - txn->mt_geo.upper, implicit_grow); - if (ret.err != MDBX_SUCCESS) { - ERROR("unable growth datafile to %zu pages (+%zu), errcode %d", aligned, - aligned - txn->mt_end_pgno, ret.err); - goto fail; - } - env->me_txn->mt_end_pgno = (pgno_t)aligned; - eASSERT(env, pgno == 0); + if (!env->dxb_mmap.base) { + /* save user's geo-params for future open/create */ + if (pagesize != (intptr_t)env->ps) + env_setup_pagesize(env, pagesize); + env->geo_in_bytes.lower = size_lower; + env->geo_in_bytes.now = size_now; + 
env->geo_in_bytes.upper = size_upper; + env->geo_in_bytes.grow = + pgno2bytes(env, pv2pages(pages2pv(bytes2pgno(env, growth_step)))); + env->geo_in_bytes.shrink = + pgno2bytes(env, pv2pages(pages2pv(bytes2pgno(env, shrink_threshold)))); + env_options_adjust_defaults(env); - //--------------------------------------------------------------------------- + ENSURE(env, env->geo_in_bytes.lower >= MIN_MAPSIZE); + ENSURE(env, env->geo_in_bytes.lower / (unsigned)pagesize >= MIN_PAGENO); + ENSURE(env, env->geo_in_bytes.lower % (unsigned)pagesize == 0); + ENSURE(env, env->geo_in_bytes.lower % globals.sys_pagesize == 0); -done: - ret.err = MDBX_SUCCESS; - if (likely((flags & MDBX_ALLOC_RESERVE) == 0)) { - if (pgno) { - eASSERT(env, pgno + num <= txn->mt_next_pgno && pgno >= NUM_METAS); - eASSERT(env, pnl_check_allocated(txn->tw.relist, - txn->mt_next_pgno - MDBX_ENABLE_REFUND)); - } else { - pgno = txn->mt_next_pgno; - txn->mt_next_pgno += (pgno_t)num; - eASSERT(env, txn->mt_next_pgno <= txn->mt_end_pgno); - eASSERT(env, pgno >= NUM_METAS && pgno + num <= txn->mt_next_pgno); - } + ENSURE(env, env->geo_in_bytes.upper <= MAX_MAPSIZE); + ENSURE(env, env->geo_in_bytes.upper / (unsigned)pagesize <= MAX_PAGENO + 1); + ENSURE(env, env->geo_in_bytes.upper % (unsigned)pagesize == 0); + ENSURE(env, env->geo_in_bytes.upper % globals.sys_pagesize == 0); - ret = page_alloc_finalize(env, txn, mc, pgno, num); - if (unlikely(ret.err != MDBX_SUCCESS)) { - fail: - eASSERT(env, ret.err != MDBX_SUCCESS); - eASSERT(env, pnl_check_allocated(txn->tw.relist, - txn->mt_next_pgno - MDBX_ENABLE_REFUND)); - int level; - const char *what; - if (flags & MDBX_ALLOC_RESERVE) { - level = - (flags & MDBX_ALLOC_UNIMPORTANT) ? MDBX_LOG_DEBUG : MDBX_LOG_NOTICE; - what = num ? 
"reserve-pages" : "fetch-slot"; - } else { - txn->mt_flags |= MDBX_TXN_ERROR; - level = MDBX_LOG_ERROR; - what = "pages"; + ENSURE(env, env->geo_in_bytes.now >= env->geo_in_bytes.lower); + ENSURE(env, env->geo_in_bytes.now <= env->geo_in_bytes.upper); + ENSURE(env, env->geo_in_bytes.now % (unsigned)pagesize == 0); + ENSURE(env, env->geo_in_bytes.now % globals.sys_pagesize == 0); + + ENSURE(env, env->geo_in_bytes.grow % (unsigned)pagesize == 0); + ENSURE(env, env->geo_in_bytes.grow % globals.sys_pagesize == 0); + ENSURE(env, env->geo_in_bytes.shrink % (unsigned)pagesize == 0); + ENSURE(env, env->geo_in_bytes.shrink % globals.sys_pagesize == 0); + + rc = MDBX_SUCCESS; + } else { + /* apply new params to opened environment */ + ENSURE(env, pagesize == (intptr_t)env->ps); + meta_t meta; + memset(&meta, 0, sizeof(meta)); + if (!inside_txn) { + eASSERT(env, should_unlock); + const meta_ptr_t head = meta_recent(env, &env->basal_txn->tw.troika); + + uint64_t timestamp = 0; + while ("workaround for " + "https://libmdbx.dqdkfa.ru/dead-github/issues/269") { + rc = coherency_check_head(env->basal_txn, head, ×tamp); + if (likely(rc == MDBX_SUCCESS)) + break; + if (unlikely(rc != MDBX_RESULT_TRUE)) + goto bailout; } - if (LOG_ENABLED(level)) - debug_log(level, __func__, __LINE__, - "unable alloc %zu %s, alloc-flags 0x%x, err %d, txn-flags " - "0x%x, re-list-len %zu, loose-count %zu, gc: height %u, " - "branch %zu, leaf %zu, large %zu, entries %zu\n", - num, what, flags, ret.err, txn->mt_flags, - MDBX_PNL_GETSIZE(txn->tw.relist), txn->tw.loose_count, - txn->mt_dbs[FREE_DBI].md_depth, - (size_t)txn->mt_dbs[FREE_DBI].md_branch_pages, - (size_t)txn->mt_dbs[FREE_DBI].md_leaf_pages, - (size_t)txn->mt_dbs[FREE_DBI].md_overflow_pages, - (size_t)txn->mt_dbs[FREE_DBI].md_entries); - ret.page = NULL; + meta = *head.ptr_c; + const txnid_t txnid = safe64_txnid_next(head.txnid); + if (unlikely(txnid > MAX_TXNID)) { + rc = MDBX_TXN_FULL; + ERROR("txnid overflow, raise %d", rc); + goto bailout; 
+ } + meta_set_txnid(env, &meta, txnid); } - if (num > 1) - txn->tw.gc_time_acc += monotime_since_cached(monotime_begin, &now_cache); - } else { - early_exit: - DEBUG("return NULL for %zu pages for ALLOC_%s, rc %d", num, - num ? "RESERVE" : "SLOT", ret.err); - ret.page = NULL; - } -#if MDBX_ENABLE_PROFGC - prof->rtime_monotonic += osal_monotime() - monotime_begin; -#endif /* MDBX_ENABLE_PROFGC */ - return ret; -} + const geo_t *const current_geo = + &(env->txn ? env->txn : env->basal_txn)->geo; + /* update env-geo to avoid influences */ + env->geo_in_bytes.now = pgno2bytes(env, current_geo->now); + env->geo_in_bytes.lower = pgno2bytes(env, current_geo->lower); + env->geo_in_bytes.upper = pgno2bytes(env, current_geo->upper); + env->geo_in_bytes.grow = pgno2bytes(env, pv2pages(current_geo->grow_pv)); + env->geo_in_bytes.shrink = + pgno2bytes(env, pv2pages(current_geo->shrink_pv)); + + geo_t new_geo; + new_geo.lower = bytes2pgno(env, size_lower); + new_geo.now = bytes2pgno(env, size_now); + new_geo.upper = bytes2pgno(env, size_upper); + new_geo.grow_pv = pages2pv(bytes2pgno(env, growth_step)); + new_geo.shrink_pv = pages2pv(bytes2pgno(env, shrink_threshold)); + new_geo.first_unallocated = current_geo->first_unallocated; -__hot static pgr_t page_alloc(const MDBX_cursor *const mc) { - MDBX_txn *const txn = mc->mc_txn; - tASSERT(txn, mc->mc_txn->mt_flags & MDBX_TXN_DIRTY); - tASSERT(txn, F_ISSET(dbi_state(txn, mc->mc_dbi), - DBI_LINDO | DBI_VALID | DBI_DIRTY)); + ENSURE(env, pgno_align2os_bytes(env, new_geo.lower) == (size_t)size_lower); + ENSURE(env, pgno_align2os_bytes(env, new_geo.upper) == (size_t)size_upper); + ENSURE(env, pgno_align2os_bytes(env, new_geo.now) == (size_t)size_now); + ENSURE(env, new_geo.grow_pv == pages2pv(pv2pages(new_geo.grow_pv))); + ENSURE(env, new_geo.shrink_pv == pages2pv(pv2pages(new_geo.shrink_pv))); - /* If there are any loose pages, just use them */ - while (likely(txn->tw.loose_pages)) { -#if MDBX_ENABLE_REFUND - if 
(unlikely(txn->tw.loose_refund_wl > txn->mt_next_pgno)) { - txn_refund(txn); - if (!txn->tw.loose_pages) - break; - } -#endif /* MDBX_ENABLE_REFUND */ + ENSURE(env, (size_t)size_lower >= MIN_MAPSIZE); + ENSURE(env, new_geo.lower >= MIN_PAGENO); + ENSURE(env, (size_t)size_upper <= MAX_MAPSIZE); + ENSURE(env, new_geo.upper <= MAX_PAGENO + 1); + ENSURE(env, new_geo.now >= new_geo.first_unallocated); + ENSURE(env, new_geo.upper >= new_geo.now); + ENSURE(env, new_geo.now >= new_geo.lower); - MDBX_page *lp = txn->tw.loose_pages; - MDBX_ASAN_UNPOISON_MEMORY_REGION(lp, txn->mt_env->me_psize); - VALGRIND_MAKE_MEM_DEFINED(&mp_next(lp), sizeof(MDBX_page *)); - txn->tw.loose_pages = mp_next(lp); - txn->tw.loose_count--; - DEBUG_EXTRA("db %d use loose page %" PRIaPGNO, DDBI(mc), lp->mp_pgno); - tASSERT(txn, lp->mp_pgno < txn->mt_next_pgno); - tASSERT(txn, lp->mp_pgno >= NUM_METAS); - VALGRIND_MAKE_MEM_UNDEFINED(page_data(lp), page_space(txn->mt_env)); - lp->mp_txnid = txn->mt_front; - pgr_t ret = {lp, MDBX_SUCCESS}; - return ret; - } + if (memcmp(current_geo, &new_geo, sizeof(geo_t)) != 0) { +#if defined(_WIN32) || defined(_WIN64) + /* Was DB shrinking disabled before and now it will be enabled? 
*/ + if (new_geo.lower < new_geo.upper && new_geo.shrink_pv && + !(current_geo->lower < current_geo->upper && + current_geo->shrink_pv)) { + if (!env->lck_mmap.lck) { + rc = MDBX_EPERM; + goto bailout; + } + int err = lck_rdt_lock(env); + if (unlikely(MDBX_IS_ERROR(err))) { + rc = err; + goto bailout; + } - if (likely(MDBX_PNL_GETSIZE(txn->tw.relist) > 0)) - return page_alloc_finalize(txn->mt_env, txn, mc, relist_get_single(txn), 1); + /* Check if there are any reading threads that do not use the SRWL */ + const size_t CurrentTid = GetCurrentThreadId(); + const reader_slot_t *const begin = env->lck_mmap.lck->rdt; + const reader_slot_t *const end = + begin + + atomic_load32(&env->lck_mmap.lck->rdt_length, mo_AcquireRelease); + for (const reader_slot_t *reader = begin; reader < end; ++reader) { + if (reader->pid.weak == env->pid && reader->tid.weak && + reader->tid.weak != CurrentTid) { + /* At least one thread may don't use SRWL */ + rc = MDBX_EPERM; + break; + } + } - return page_alloc_slowpath(mc, 1, MDBX_ALLOC_DEFAULT); -} + lck_rdt_unlock(env); + if (unlikely(rc != MDBX_SUCCESS)) + goto bailout; + } +#endif /* Windows */ -/* Copy the used portions of a page. */ -__hot static void page_copy(MDBX_page *const dst, const MDBX_page *const src, - const size_t size) { - STATIC_ASSERT(UINT16_MAX > MAX_PAGESIZE - PAGEHDRSZ); - STATIC_ASSERT(MIN_PAGESIZE > PAGEHDRSZ + NODESIZE * 4); - void *copy_dst = dst; - const void *copy_src = src; - size_t copy_len = size; - if (src->mp_flags & P_LEAF2) { - copy_len = PAGEHDRSZ + src->mp_leaf2_ksize * page_numkeys(src); - if (unlikely(copy_len > size)) - goto bailout; - } - if ((src->mp_flags & (P_LEAF2 | P_OVERFLOW)) == 0) { - size_t upper = src->mp_upper, lower = src->mp_lower; - intptr_t unused = upper - lower; - /* If page isn't full, just copy the used portion. Adjust - * alignment so memcpy may copy words instead of bytes. 
*/ - if (unused > MDBX_CACHELINE_SIZE * 3) { - lower = ceil_powerof2(lower + PAGEHDRSZ, sizeof(void *)); - upper = floor_powerof2(upper + PAGEHDRSZ, sizeof(void *)); - if (unlikely(upper > copy_len)) - goto bailout; - memcpy(copy_dst, copy_src, lower); - copy_dst = ptr_disp(copy_dst, upper); - copy_src = ptr_disp(copy_src, upper); - copy_len -= upper; + if (new_geo.now != current_geo->now || + new_geo.upper != current_geo->upper) { + rc = dxb_resize(env, current_geo->first_unallocated, new_geo.now, + new_geo.upper, explicit_resize); + if (unlikely(rc != MDBX_SUCCESS)) + goto bailout; + } + if (inside_txn) { + env->txn->geo = new_geo; + env->txn->flags |= MDBX_TXN_DIRTY; + } else { + meta.geometry = new_geo; + rc = + dxb_sync_locked(env, env->flags, &meta, &env->basal_txn->tw.troika); + if (likely(rc == MDBX_SUCCESS)) { + env->geo_in_bytes.now = + pgno2bytes(env, new_geo.now = meta.geometry.now); + env->geo_in_bytes.upper = + pgno2bytes(env, new_geo.upper = meta.geometry.upper); + } + } + } + if (likely(rc == MDBX_SUCCESS)) { + /* update env-geo to avoid influences */ + eASSERT(env, env->geo_in_bytes.now == pgno2bytes(env, new_geo.now)); + env->geo_in_bytes.lower = pgno2bytes(env, new_geo.lower); + eASSERT(env, env->geo_in_bytes.upper == pgno2bytes(env, new_geo.upper)); + env->geo_in_bytes.grow = pgno2bytes(env, pv2pages(new_geo.grow_pv)); + env->geo_in_bytes.shrink = pgno2bytes(env, pv2pages(new_geo.shrink_pv)); } } - memcpy(copy_dst, copy_src, copy_len); - return; bailout: - if (src->mp_flags & P_LEAF2) - bad_page(src, "%s addr %p, n-keys %zu, ksize %u", - "invalid/corrupted source page", __Wpedantic_format_voidptr(src), - page_numkeys(src), src->mp_leaf2_ksize); - else - bad_page(src, "%s addr %p, upper %u", "invalid/corrupted source page", - __Wpedantic_format_voidptr(src), src->mp_upper); - memset(dst, -1, size); + if (should_unlock) + lck_txn_unlock(env); + return rc; } -/* Pull a page off the txn's spill list, if present. 
- * - * If a page being referenced was spilled to disk in this txn, bring - * it back and make it dirty/writable again. */ -static pgr_t __must_check_result page_unspill(MDBX_txn *const txn, - const MDBX_page *const mp) { - VERBOSE("unspill page %" PRIaPGNO, mp->mp_pgno); - tASSERT(txn, (txn->mt_flags & MDBX_WRITEMAP) == 0); - tASSERT(txn, IS_SPILLED(txn, mp)); - const MDBX_txn *scan = txn; - pgr_t ret; - do { - tASSERT(txn, (scan->mt_flags & MDBX_TXN_SPILLS) != 0); - const size_t si = search_spilled(scan, mp->mp_pgno); - if (!si) - continue; - const unsigned npages = IS_OVERFLOW(mp) ? mp->mp_pages : 1; - ret.page = page_malloc(txn, npages); - if (unlikely(!ret.page)) { - ret.err = MDBX_ENOMEM; - return ret; - } - page_copy(ret.page, mp, pgno2bytes(txn->mt_env, npages)); - if (scan == txn) { - /* If in current txn, this page is no longer spilled. - * If it happens to be the last page, truncate the spill list. - * Otherwise mark it as deleted by setting the LSB. */ - spill_remove(txn, si, npages); - } /* otherwise, if belonging to a parent txn, the - * page remains spilled until child commits */ +__cold int mdbx_env_sync_ex(MDBX_env *env, bool force, bool nonblock) { + int rc = check_env(env, true); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; - ret.err = page_dirty(txn, ret.page, npages); - if (unlikely(ret.err != MDBX_SUCCESS)) - return ret; -#if MDBX_ENABLE_PGOP_STAT - txn->mt_env->me_lck->mti_pgop_stat.unspill.weak += npages; -#endif /* MDBX_ENABLE_PGOP_STAT */ - ret.page->mp_flags |= (scan == txn) ? 
0 : P_SPILLED; - ret.err = MDBX_SUCCESS; - return ret; - } while (likely((scan = scan->mt_parent) != nullptr && - (scan->mt_flags & MDBX_TXN_SPILLS) != 0)); - ERROR("Page %" PRIaPGNO " mod-txnid %" PRIaTXN - " not found in the spill-list(s), current txn %" PRIaTXN - " front %" PRIaTXN ", root txn %" PRIaTXN " front %" PRIaTXN, - mp->mp_pgno, mp->mp_txnid, txn->mt_txnid, txn->mt_front, - txn->mt_env->me_txn0->mt_txnid, txn->mt_env->me_txn0->mt_front); - ret.err = MDBX_PROBLEM; - ret.page = NULL; - return ret; + return env_sync(env, force, nonblock); } +/// \copyright SPDX-License-Identifier: Apache-2.0 +/// \author Леонид Юрьев aka Leonid Yuriev \date 2015-2024 -/* Touch a page: make it dirty and re-insert into tree with updated pgno. - * Set MDBX_TXN_ERROR on failure. - * - * [in] mc cursor pointing to the page to be touched - * - * Returns 0 on success, non-zero on failure. */ -__hot static int page_touch(MDBX_cursor *mc) { - const MDBX_page *const mp = mc->mc_pg[mc->mc_top]; - MDBX_page *np; - MDBX_txn *txn = mc->mc_txn; - int rc; - tASSERT(txn, mc->mc_txn->mt_flags & MDBX_TXN_DIRTY); - tASSERT(txn, F_ISSET(*mc->mc_dbi_state, DBI_LINDO | DBI_VALID | DBI_DIRTY)); - tASSERT(txn, !IS_OVERFLOW(mp)); - if (ASSERT_ENABLED()) { - if (mc->mc_flags & C_SUB) { - MDBX_xcursor *mx = container_of(mc->mc_db, MDBX_xcursor, mx_db); - MDBX_cursor_couple *couple = container_of(mx, MDBX_cursor_couple, inner); - tASSERT(txn, mc->mc_db == &couple->outer.mc_xcursor->mx_db); - tASSERT(txn, mc->mc_dbx == &couple->outer.mc_xcursor->mx_dbx); - tASSERT(txn, *couple->outer.mc_dbi_state & DBI_DIRTY); - } - tASSERT(txn, dirtylist_check(txn)); - } +/*------------------------------------------------------------------------------ + * Readers API */ - if (IS_MODIFIABLE(txn, mp)) { - if (!txn->tw.dirtylist) { - tASSERT(txn, (txn->mt_flags & MDBX_WRITEMAP) && !MDBX_AVOID_MSYNC); - return MDBX_SUCCESS; - } - if (IS_SUBP(mp)) - return MDBX_SUCCESS; - tASSERT(txn, (txn->mt_flags & MDBX_WRITEMAP) == 0 
|| MDBX_AVOID_MSYNC); - const size_t n = dpl_search(txn, mp->mp_pgno); - if (MDBX_AVOID_MSYNC && - unlikely(txn->tw.dirtylist->items[n].pgno != mp->mp_pgno)) { - tASSERT(txn, (txn->mt_flags & MDBX_WRITEMAP)); - tASSERT(txn, n > 0 && n <= txn->tw.dirtylist->length + 1); - VERBOSE("unspill page %" PRIaPGNO, mp->mp_pgno); - np = (MDBX_page *)mp; -#if MDBX_ENABLE_PGOP_STAT - txn->mt_env->me_lck->mti_pgop_stat.unspill.weak += 1; -#endif /* MDBX_ENABLE_PGOP_STAT */ - return page_dirty(txn, np, 1); - } - tASSERT(txn, n > 0 && n <= txn->tw.dirtylist->length); - tASSERT(txn, txn->tw.dirtylist->items[n].pgno == mp->mp_pgno && - txn->tw.dirtylist->items[n].ptr == mp); - if (!MDBX_AVOID_MSYNC || (txn->mt_flags & MDBX_WRITEMAP) == 0) { - size_t *const ptr = - ptr_disp(txn->tw.dirtylist->items[n].ptr, -(ptrdiff_t)sizeof(size_t)); - *ptr = txn->tw.dirtylru; - } - return MDBX_SUCCESS; - } - if (IS_SUBP(mp)) { - np = (MDBX_page *)mp; - np->mp_txnid = txn->mt_front; - return MDBX_SUCCESS; - } - tASSERT(txn, !IS_OVERFLOW(mp) && !IS_SUBP(mp)); +__cold int mdbx_reader_list(const MDBX_env *env, MDBX_reader_list_func *func, + void *ctx) { + int rc = check_env(env, true); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; - if (IS_FROZEN(txn, mp)) { - /* CoW the page */ - rc = pnl_need(&txn->tw.retired_pages, 1); - if (unlikely(rc != MDBX_SUCCESS)) - goto fail; - const pgr_t par = page_alloc(mc); - rc = par.err; - np = par.page; - if (unlikely(rc != MDBX_SUCCESS)) - goto fail; - - const pgno_t pgno = np->mp_pgno; - DEBUG("touched db %d page %" PRIaPGNO " -> %" PRIaPGNO, DDBI(mc), - mp->mp_pgno, pgno); - tASSERT(txn, mp->mp_pgno != pgno); - pnl_xappend(txn->tw.retired_pages, mp->mp_pgno); - /* Update the parent page, if any, to point to the new page */ - if (mc->mc_top) { - MDBX_page *parent = mc->mc_pg[mc->mc_top - 1]; - MDBX_node *node = page_node(parent, mc->mc_ki[mc->mc_top - 1]); - node_set_pgno(node, pgno); - } else { - mc->mc_db->md_root = pgno; - } - -#if MDBX_ENABLE_PGOP_STAT - 
txn->mt_env->me_lck->mti_pgop_stat.cow.weak += 1; -#endif /* MDBX_ENABLE_PGOP_STAT */ - page_copy(np, mp, txn->mt_env->me_psize); - np->mp_pgno = pgno; - np->mp_txnid = txn->mt_front; - } else if (IS_SPILLED(txn, mp)) { - pgr_t pur = page_unspill(txn, mp); - np = pur.page; - rc = pur.err; - if (likely(rc == MDBX_SUCCESS)) { - tASSERT(txn, np != nullptr); - goto done; - } - goto fail; - } else { - if (unlikely(!txn->mt_parent)) { - ERROR("Unexpected not frozen/modifiable/spilled but shadowed %s " - "page %" PRIaPGNO " mod-txnid %" PRIaTXN "," - " without parent transaction, current txn %" PRIaTXN - " front %" PRIaTXN, - IS_BRANCH(mp) ? "branch" : "leaf", mp->mp_pgno, mp->mp_txnid, - mc->mc_txn->mt_txnid, mc->mc_txn->mt_front); - rc = MDBX_PROBLEM; - goto fail; - } + if (unlikely(!func)) + return MDBX_EINVAL; - DEBUG("clone db %d page %" PRIaPGNO, DDBI(mc), mp->mp_pgno); - tASSERT(txn, - txn->tw.dirtylist->length <= MDBX_PGL_LIMIT + MDBX_PNL_GRANULATE); - /* No - copy it */ - np = page_malloc(txn, 1); - if (unlikely(!np)) { - rc = MDBX_ENOMEM; - goto fail; - } - page_copy(np, mp, txn->mt_env->me_psize); + rc = MDBX_RESULT_TRUE; + int serial = 0; + lck_t *const lck = env->lck_mmap.lck; + if (likely(lck)) { + const size_t snap_nreaders = + atomic_load32(&lck->rdt_length, mo_AcquireRelease); + for (size_t i = 0; i < snap_nreaders; i++) { + const reader_slot_t *r = lck->rdt + i; + retry_reader:; + const uint32_t pid = atomic_load32(&r->pid, mo_AcquireRelease); + if (!pid) + continue; + txnid_t txnid = safe64_read(&r->txnid); + const uint64_t tid = atomic_load64(&r->tid, mo_Relaxed); + const pgno_t pages_used = + atomic_load32(&r->snapshot_pages_used, mo_Relaxed); + const uint64_t reader_pages_retired = + atomic_load64(&r->snapshot_pages_retired, mo_Relaxed); + if (unlikely(txnid != safe64_read(&r->txnid) || + pid != atomic_load32(&r->pid, mo_AcquireRelease) || + tid != atomic_load64(&r->tid, mo_Relaxed) || + pages_used != + atomic_load32(&r->snapshot_pages_used, 
mo_Relaxed) || + reader_pages_retired != + atomic_load64(&r->snapshot_pages_retired, mo_Relaxed))) + goto retry_reader; - /* insert a clone of parent's dirty page, so don't touch dirtyroom */ - rc = page_dirty(txn, np, 1); - if (unlikely(rc != MDBX_SUCCESS)) - goto fail; + eASSERT(env, txnid > 0); + if (txnid >= SAFE64_INVALID_THRESHOLD) + txnid = 0; -#if MDBX_ENABLE_PGOP_STAT - txn->mt_env->me_lck->mti_pgop_stat.clone.weak += 1; -#endif /* MDBX_ENABLE_PGOP_STAT */ - } + size_t bytes_used = 0; + size_t bytes_retained = 0; + uint64_t lag = 0; + if (txnid) { + troika_t troika = meta_tap(env); + retry_header:; + const meta_ptr_t head = meta_recent(env, &troika); + const uint64_t head_pages_retired = + unaligned_peek_u64_volatile(4, head.ptr_v->pages_retired); + if (unlikely(meta_should_retry(env, &troika) || + head_pages_retired != unaligned_peek_u64_volatile( + 4, head.ptr_v->pages_retired))) + goto retry_header; -done: - /* Adjust cursors pointing to mp */ - mc->mc_pg[mc->mc_top] = np; - MDBX_cursor *m2 = txn->mt_cursors[mc->mc_dbi]; - if (mc->mc_flags & C_SUB) { - for (; m2; m2 = m2->mc_next) { - MDBX_cursor *m3 = &m2->mc_xcursor->mx_cursor; - if (m3->mc_snum < mc->mc_snum) - continue; - if (m3->mc_pg[mc->mc_top] == mp) - m3->mc_pg[mc->mc_top] = np; - } - } else { - for (; m2; m2 = m2->mc_next) { - if (m2->mc_snum < mc->mc_snum) - continue; - if (m2 == mc) - continue; - if (m2->mc_pg[mc->mc_top] == mp) { - m2->mc_pg[mc->mc_top] = np; - if (XCURSOR_INITED(m2) && IS_LEAF(np)) - XCURSOR_REFRESH(m2, np, m2->mc_ki[mc->mc_top]); + lag = (head.txnid - txnid) / xMDBX_TXNID_STEP; + bytes_used = pgno2bytes(env, pages_used); + bytes_retained = (head_pages_retired > reader_pages_retired) + ? 
pgno2bytes(env, (pgno_t)(head_pages_retired - + reader_pages_retired)) + : 0; } + rc = func(ctx, ++serial, (unsigned)i, pid, (mdbx_tid_t)((intptr_t)tid), + txnid, lag, bytes_used, bytes_retained); + if (unlikely(rc != MDBX_SUCCESS)) + break; } } - return MDBX_SUCCESS; -fail: - txn->mt_flags |= MDBX_TXN_ERROR; return rc; } -static int meta_sync(const MDBX_env *env, const meta_ptr_t head) { - eASSERT(env, atomic_load32(&env->me_lck->mti_meta_sync_txnid, mo_Relaxed) != - (uint32_t)head.txnid); - /* Функция может вызываться (в том числе) при (env->me_flags & - * MDBX_NOMETASYNC) == 0 и env->me_fd4meta == env->me_dsync_fd, например если - * предыдущая транзакция была выполненна с флагом MDBX_NOMETASYNC. */ +__cold int mdbx_reader_check(MDBX_env *env, int *dead) { + if (dead) + *dead = 0; + return mvcc_cleanup_dead(env, false, dead); +} - int rc = MDBX_RESULT_TRUE; - if (env->me_flags & MDBX_WRITEMAP) { - if (!MDBX_AVOID_MSYNC) { - rc = osal_msync(&env->me_dxb_mmap, 0, pgno_align2os_bytes(env, NUM_METAS), - MDBX_SYNC_DATA | MDBX_SYNC_IODQ); -#if MDBX_ENABLE_PGOP_STAT - env->me_lck->mti_pgop_stat.msync.weak += 1; -#endif /* MDBX_ENABLE_PGOP_STAT */ - } else { -#if MDBX_ENABLE_PGOP_ST - env->me_lck->mti_pgop_stat.wops.weak += 1; -#endif /* MDBX_ENABLE_PGOP_STAT */ - const MDBX_page *page = data_page(head.ptr_c); - rc = osal_pwrite(env->me_fd4meta, page, env->me_psize, - ptr_dist(page, env->me_map)); +/*------------------------------------------------------------------------------ + * Locking API */ - if (likely(rc == MDBX_SUCCESS) && env->me_fd4meta == env->me_lazy_fd) { - rc = osal_fsync(env->me_lazy_fd, MDBX_SYNC_DATA | MDBX_SYNC_IODQ); -#if MDBX_ENABLE_PGOP_STAT - env->me_lck->mti_pgop_stat.fsync.weak += 1; -#endif /* MDBX_ENABLE_PGOP_STAT */ - } - } - } else { - rc = osal_fsync(env->me_lazy_fd, MDBX_SYNC_DATA | MDBX_SYNC_IODQ); -#if MDBX_ENABLE_PGOP_STAT - env->me_lck->mti_pgop_stat.fsync.weak += 1; -#endif /* MDBX_ENABLE_PGOP_STAT */ - } +int mdbx_txn_lock(MDBX_env 
*env, bool dont_wait) { + int rc = check_env(env, true); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; - if (likely(rc == MDBX_SUCCESS)) - env->me_lck->mti_meta_sync_txnid.weak = (uint32_t)head.txnid; - return rc; -} + if (unlikely(env->flags & MDBX_RDONLY)) + return MDBX_EACCESS; + if (unlikely(env->basal_txn->owner || + (env->basal_txn->flags & MDBX_TXN_FINISHED) == 0)) + return MDBX_BUSY; -static __inline bool env_txn0_owned(const MDBX_env *env) { - return (env->me_flags & MDBX_NOSTICKYTHREADS) - ? (env->me_txn0->mt_owner != 0) - : (env->me_txn0->mt_owner == osal_thread_self()); + return lck_txn_lock(env, dont_wait); } -__cold static int env_sync(MDBX_env *env, bool force, bool nonblock) { - if (unlikely(env->me_flags & MDBX_RDONLY)) +int mdbx_txn_unlock(MDBX_env *env) { + int rc = check_env(env, true); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; + + if (unlikely(env->flags & MDBX_RDONLY)) return MDBX_EACCESS; + if (unlikely(env->basal_txn->owner != osal_thread_self())) + return MDBX_THREAD_MISMATCH; + if (unlikely((env->basal_txn->flags & MDBX_TXN_FINISHED) == 0)) + return MDBX_BUSY; - const bool txn0_owned = env_txn0_owned(env); - bool should_unlock = false; - int rc = MDBX_RESULT_TRUE /* means "nothing to sync" */; + lck_txn_unlock(env); + return MDBX_SUCCESS; +} +/// \copyright SPDX-License-Identifier: Apache-2.0 +/// \author Леонид Юрьев aka Leonid Yuriev \date 2015-2024 -retry:; - unsigned flags = env->me_flags & ~(MDBX_NOMETASYNC | MDBX_SHRINK_ALLOWED); - if (unlikely((flags & (MDBX_FATAL_ERROR | MDBX_ENV_ACTIVE)) != - MDBX_ENV_ACTIVE)) { - rc = (flags & MDBX_FATAL_ERROR) ? MDBX_PANIC : MDBX_EPERM; - goto bailout; - } - const meta_troika_t troika = - (txn0_owned | should_unlock) ? 
env->me_txn0->tw.troika : meta_tap(env); - const meta_ptr_t head = meta_recent(env, &troika); - const uint64_t unsynced_pages = - atomic_load64(&env->me_lck->mti_unsynced_pages, mo_Relaxed); - if (unsynced_pages == 0) { - const uint32_t synched_meta_txnid_u32 = - atomic_load32(&env->me_lck->mti_meta_sync_txnid, mo_Relaxed); - if (synched_meta_txnid_u32 == (uint32_t)head.txnid && head.is_steady) - goto bailout; - } +static inline double key2double(const int64_t key) { + union { + uint64_t u; + double f; + } casting; - if (should_unlock && (env->me_flags & MDBX_WRITEMAP) && - unlikely(head.ptr_c->mm_geo.next > - bytes2pgno(env, env->me_dxb_mmap.current))) { + casting.u = (key < 0) ? key + UINT64_C(0x8000000000000000) + : UINT64_C(0xffffFFFFffffFFFF) - key; + return casting.f; +} - if (unlikely(env->me_stuck_meta >= 0) && - troika.recent != (uint8_t)env->me_stuck_meta) { - NOTICE("skip %s since wagering meta-page (%u) is mispatch the recent " - "meta-page (%u)", - "sync datafile", env->me_stuck_meta, troika.recent); - rc = MDBX_RESULT_TRUE; - } else { - rc = dxb_resize(env, head.ptr_c->mm_geo.next, head.ptr_c->mm_geo.now, - head.ptr_c->mm_geo.upper, implicit_grow); - if (unlikely(rc != MDBX_SUCCESS)) - goto bailout; - } +static inline uint64_t double2key(const double *const ptr) { + STATIC_ASSERT(sizeof(double) == sizeof(int64_t)); + const int64_t i = *(const int64_t *)ptr; + const uint64_t u = (i < 0) ? 
UINT64_C(0xffffFFFFffffFFFF) - i + : i + UINT64_C(0x8000000000000000); + if (ASSERT_ENABLED()) { + const double f = key2double(u); + assert(memcmp(&f, ptr, sizeof(double)) == 0); } + return u; +} - const size_t autosync_threshold = - atomic_load32(&env->me_lck->mti_autosync_threshold, mo_Relaxed); - const uint64_t autosync_period = - atomic_load64(&env->me_lck->mti_autosync_period, mo_Relaxed); - uint64_t eoos_timestamp; - if (force || (autosync_threshold && unsynced_pages >= autosync_threshold) || - (autosync_period && - (eoos_timestamp = - atomic_load64(&env->me_lck->mti_eoos_timestamp, mo_Relaxed)) && - osal_monotime() - eoos_timestamp >= autosync_period)) - flags &= MDBX_WRITEMAP /* clear flags for full steady sync */; - - if (!txn0_owned) { - if (!should_unlock) { -#if MDBX_ENABLE_PGOP_STAT - unsigned wops = 0; -#endif /* MDBX_ENABLE_PGOP_STAT */ - - int err; - /* pre-sync to avoid latency for writer */ - if (unsynced_pages > /* FIXME: define threshold */ 42 && - (flags & MDBX_SAFE_NOSYNC) == 0) { - eASSERT(env, ((flags ^ env->me_flags) & MDBX_WRITEMAP) == 0); - if (flags & MDBX_WRITEMAP) { - /* Acquire guard to avoid collision with remap */ -#if defined(_WIN32) || defined(_WIN64) - osal_srwlock_AcquireShared(&env->me_remap_guard); -#else - err = osal_fastmutex_acquire(&env->me_remap_guard); - if (unlikely(err != MDBX_SUCCESS)) - return err; -#endif - const size_t usedbytes = - pgno_align2os_bytes(env, head.ptr_c->mm_geo.next); - err = osal_msync(&env->me_dxb_mmap, 0, usedbytes, MDBX_SYNC_DATA); -#if defined(_WIN32) || defined(_WIN64) - osal_srwlock_ReleaseShared(&env->me_remap_guard); -#else - int unlock_err = osal_fastmutex_release(&env->me_remap_guard); - if (unlikely(unlock_err != MDBX_SUCCESS) && err == MDBX_SUCCESS) - err = unlock_err; -#endif - } else - err = osal_fsync(env->me_lazy_fd, MDBX_SYNC_DATA); - - if (unlikely(err != MDBX_SUCCESS)) - return err; - -#if MDBX_ENABLE_PGOP_STAT - wops = 1; -#endif /* MDBX_ENABLE_PGOP_STAT */ - /* pre-sync done */ 
- rc = MDBX_SUCCESS /* means "some data was synced" */; - } +static inline float key2float(const int32_t key) { + union { + uint32_t u; + float f; + } casting; - err = osal_txn_lock(env, nonblock); - if (unlikely(err != MDBX_SUCCESS)) - return err; + casting.u = + (key < 0) ? key + UINT32_C(0x80000000) : UINT32_C(0xffffFFFF) - key; + return casting.f; +} - should_unlock = true; -#if MDBX_ENABLE_PGOP_STAT - env->me_lck->mti_pgop_stat.wops.weak += wops; -#endif /* MDBX_ENABLE_PGOP_STAT */ - env->me_txn0->tw.troika = meta_tap(env); - eASSERT(env, !env->me_txn && !env->me_txn0->mt_child); - goto retry; - } - eASSERT(env, head.txnid == recent_committed_txnid(env)); - env->me_txn0->mt_txnid = head.txnid; - txn_oldest_reader(env->me_txn0); - flags |= MDBX_SHRINK_ALLOWED; +static inline uint32_t float2key(const float *const ptr) { + STATIC_ASSERT(sizeof(float) == sizeof(int32_t)); + const int32_t i = *(const int32_t *)ptr; + const uint32_t u = + (i < 0) ? UINT32_C(0xffffFFFF) - i : i + UINT32_C(0x80000000); + if (ASSERT_ENABLED()) { + const float f = key2float(u); + assert(memcmp(&f, ptr, sizeof(float)) == 0); } + return u; +} - eASSERT(env, txn0_owned || should_unlock); - eASSERT(env, !txn0_owned || (flags & MDBX_SHRINK_ALLOWED) == 0); +uint64_t mdbx_key_from_double(const double ieee754_64bit) { + return double2key(&ieee754_64bit); +} - if (!head.is_steady && unlikely(env->me_stuck_meta >= 0) && - troika.recent != (uint8_t)env->me_stuck_meta) { - NOTICE("skip %s since wagering meta-page (%u) is mispatch the recent " - "meta-page (%u)", - "sync datafile", env->me_stuck_meta, troika.recent); - rc = MDBX_RESULT_TRUE; - goto bailout; - } - if (!head.is_steady || ((flags & MDBX_SAFE_NOSYNC) == 0 && unsynced_pages)) { - DEBUG("meta-head %" PRIaPGNO ", %s, sync_pending %" PRIu64, - data_page(head.ptr_c)->mp_pgno, durable_caption(head.ptr_c), - unsynced_pages); - MDBX_meta meta = *head.ptr_c; - rc = sync_locked(env, flags, &meta, &env->me_txn0->tw.troika); - if (unlikely(rc != 
MDBX_SUCCESS)) - goto bailout; - } +uint64_t mdbx_key_from_ptrdouble(const double *const ieee754_64bit) { + return double2key(ieee754_64bit); +} - /* LY: sync meta-pages if MDBX_NOMETASYNC enabled - * and someone was not synced above. */ - if (atomic_load32(&env->me_lck->mti_meta_sync_txnid, mo_Relaxed) != - (uint32_t)head.txnid) - rc = meta_sync(env, head); +uint32_t mdbx_key_from_float(const float ieee754_32bit) { + return float2key(&ieee754_32bit); +} -bailout: - if (should_unlock) - osal_txn_unlock(env); - return rc; +uint32_t mdbx_key_from_ptrfloat(const float *const ieee754_32bit) { + return float2key(ieee754_32bit); } -static __inline int check_env(const MDBX_env *env, const bool wanna_active) { - if (unlikely(!env)) - return MDBX_EINVAL; - - if (unlikely(env->me_signature.weak != MDBX_ME_SIGNATURE)) - return MDBX_EBADSIGN; +#define IEEE754_DOUBLE_MANTISSA_SIZE 52 +#define IEEE754_DOUBLE_EXPONENTA_BIAS 0x3FF +#define IEEE754_DOUBLE_EXPONENTA_MAX 0x7FF +#define IEEE754_DOUBLE_IMPLICIT_LEAD UINT64_C(0x0010000000000000) +#define IEEE754_DOUBLE_MANTISSA_MASK UINT64_C(0x000FFFFFFFFFFFFF) +#define IEEE754_DOUBLE_MANTISSA_AMAX UINT64_C(0x001FFFFFFFFFFFFF) - if (unlikely(env->me_flags & MDBX_FATAL_ERROR)) - return MDBX_PANIC; +static inline int clz64(uint64_t value) { +#if __GNUC_PREREQ(4, 1) || __has_builtin(__builtin_clzl) + if (sizeof(value) == sizeof(int)) + return __builtin_clz(value); + if (sizeof(value) == sizeof(long)) + return __builtin_clzl(value); +#if (defined(__SIZEOF_LONG_LONG__) && __SIZEOF_LONG_LONG__ == 8) || \ + __has_builtin(__builtin_clzll) + return __builtin_clzll(value); +#endif /* have(long long) && long long == uint64_t */ +#endif /* GNU C */ - if (wanna_active) { -#if MDBX_ENV_CHECKPID - if (unlikely(env->me_pid != osal_getpid()) && env->me_pid) { - ((MDBX_env *)env)->me_flags |= MDBX_FATAL_ERROR; - return MDBX_PANIC; - } -#endif /* MDBX_ENV_CHECKPID */ - if (unlikely((env->me_flags & MDBX_ENV_ACTIVE) == 0)) - return MDBX_EPERM; - 
eASSERT(env, env->me_map != nullptr); +#if defined(_MSC_VER) + unsigned long index; +#if defined(_M_AMD64) || defined(_M_ARM64) || defined(_M_X64) + _BitScanReverse64(&index, value); + return 63 - index; +#else + if (value > UINT32_MAX) { + _BitScanReverse(&index, (uint32_t)(value >> 32)); + return 31 - index; } + _BitScanReverse(&index, (uint32_t)value); + return 63 - index; +#endif +#endif /* MSVC */ - return MDBX_SUCCESS; + value |= value >> 1; + value |= value >> 2; + value |= value >> 4; + value |= value >> 8; + value |= value >> 16; + value |= value >> 32; + static const uint8_t debruijn_clz64[64] = { + 63, 16, 62, 7, 15, 36, 61, 3, 6, 14, 22, 26, 35, 47, 60, 2, + 9, 5, 28, 11, 13, 21, 42, 19, 25, 31, 34, 40, 46, 52, 59, 1, + 17, 8, 37, 4, 23, 27, 48, 10, 29, 12, 43, 20, 32, 41, 53, 18, + 38, 24, 49, 30, 44, 33, 54, 39, 50, 45, 55, 51, 56, 57, 58, 0}; + return debruijn_clz64[value * UINT64_C(0x03F79D71B4CB0A89) >> 58]; } -__cold int mdbx_env_sync_ex(MDBX_env *env, bool force, bool nonblock) { - int rc = check_env(env, true); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; - - return env_sync(env, force, nonblock); +static inline uint64_t round_mantissa(const uint64_t u64, int shift) { + assert(shift < 0 && u64 > 0); + shift = -shift; + const unsigned half = 1 << (shift - 1); + const unsigned lsb = 1 & (unsigned)(u64 >> shift); + const unsigned tie2even = 1 ^ lsb; + return (u64 + half - tie2even) >> shift; } -#if defined(ENABLE_MEMCHECK) || defined(__SANITIZE_ADDRESS__) -/* Find largest mvcc-snapshot still referenced by this process. 
*/ -static pgno_t find_largest_this(MDBX_env *env, pgno_t largest) { - MDBX_lockinfo *const lck = env->me_lck_mmap.lck; - if (likely(lck != NULL /* exclusive mode */)) { - const size_t snap_nreaders = - atomic_load32(&lck->mti_numreaders, mo_AcquireRelease); - for (size_t i = 0; i < snap_nreaders; ++i) { - retry: - if (atomic_load32(&lck->mti_readers[i].mr_pid, mo_AcquireRelease) == - env->me_pid) { - /* jitter4testing(true); */ - const pgno_t snap_pages = atomic_load32( - &lck->mti_readers[i].mr_snapshot_pages_used, mo_Relaxed); - const txnid_t snap_txnid = safe64_read(&lck->mti_readers[i].mr_txnid); - if (unlikely( - snap_pages != - atomic_load32(&lck->mti_readers[i].mr_snapshot_pages_used, - mo_AcquireRelease) || - snap_txnid != safe64_read(&lck->mti_readers[i].mr_txnid))) - goto retry; - if (largest < snap_pages && - atomic_load64(&lck->mti_oldest_reader, mo_AcquireRelease) <= - /* ignore pending updates */ snap_txnid && - snap_txnid <= MAX_TXNID) - largest = snap_pages; - } +uint64_t mdbx_key_from_jsonInteger(const int64_t json_integer) { + const uint64_t bias = UINT64_C(0x8000000000000000); + if (json_integer > 0) { + const uint64_t u64 = json_integer; + int shift = clz64(u64) - (64 - IEEE754_DOUBLE_MANTISSA_SIZE - 1); + uint64_t mantissa = u64 << shift; + if (unlikely(shift < 0)) { + mantissa = round_mantissa(u64, shift); + if (mantissa > IEEE754_DOUBLE_MANTISSA_AMAX) + mantissa = round_mantissa(u64, --shift); } - } - return largest; -} -static void txn_valgrind(MDBX_env *env, MDBX_txn *txn) { -#if !defined(__SANITIZE_ADDRESS__) - if (!RUNNING_ON_VALGRIND) - return; -#endif + assert(mantissa >= IEEE754_DOUBLE_IMPLICIT_LEAD && + mantissa <= IEEE754_DOUBLE_MANTISSA_AMAX); + const uint64_t exponent = (uint64_t)IEEE754_DOUBLE_EXPONENTA_BIAS + + IEEE754_DOUBLE_MANTISSA_SIZE - shift; + assert(exponent > 0 && exponent <= IEEE754_DOUBLE_EXPONENTA_MAX); + const uint64_t key = bias + (exponent << IEEE754_DOUBLE_MANTISSA_SIZE) + + (mantissa - 
IEEE754_DOUBLE_IMPLICIT_LEAD); +#if !defined(_MSC_VER) || \ + defined( \ + _DEBUG) /* Workaround for MSVC error LNK2019: unresolved external \ + symbol __except1 referenced in function __ftol3_except */ + assert(key == mdbx_key_from_double((double)json_integer)); +#endif /* Workaround for MSVC */ + return key; + } - if (txn) { /* transaction start */ - if (env->me_poison_edge < txn->mt_next_pgno) - env->me_poison_edge = txn->mt_next_pgno; - VALGRIND_MAKE_MEM_DEFINED(env->me_map, pgno2bytes(env, txn->mt_next_pgno)); - MDBX_ASAN_UNPOISON_MEMORY_REGION(env->me_map, - pgno2bytes(env, txn->mt_next_pgno)); - /* don't touch more, it should be already poisoned */ - } else { /* transaction end */ - bool should_unlock = false; - pgno_t last = MAX_PAGENO + 1; - if (env->me_pid != osal_getpid()) { - /* resurrect after fork */ - return; - } else if (env->me_txn && env_txn0_owned(env)) { - /* inside write-txn */ - last = meta_recent(env, &env->me_txn0->tw.troika).ptr_v->mm_geo.next; - } else if (env->me_flags & MDBX_RDONLY) { - /* read-only mode, no write-txn, no wlock mutex */ - last = NUM_METAS; - } else if (osal_txn_lock(env, true) == MDBX_SUCCESS) { - /* no write-txn */ - last = NUM_METAS; - should_unlock = true; - } else { - /* write txn is running, therefore shouldn't poison any memory range */ - return; + if (json_integer < 0) { + const uint64_t u64 = -json_integer; + int shift = clz64(u64) - (64 - IEEE754_DOUBLE_MANTISSA_SIZE - 1); + uint64_t mantissa = u64 << shift; + if (unlikely(shift < 0)) { + mantissa = round_mantissa(u64, shift); + if (mantissa > IEEE754_DOUBLE_MANTISSA_AMAX) + mantissa = round_mantissa(u64, --shift); } - last = find_largest_this(env, last); - const pgno_t edge = env->me_poison_edge; - if (edge > last) { - eASSERT(env, last >= NUM_METAS); - env->me_poison_edge = last; - VALGRIND_MAKE_MEM_NOACCESS(ptr_disp(env->me_map, pgno2bytes(env, last)), - pgno2bytes(env, edge - last)); - MDBX_ASAN_POISON_MEMORY_REGION( - ptr_disp(env->me_map, pgno2bytes(env, 
last)), - pgno2bytes(env, edge - last)); - } - if (should_unlock) - osal_txn_unlock(env); + assert(mantissa >= IEEE754_DOUBLE_IMPLICIT_LEAD && + mantissa <= IEEE754_DOUBLE_MANTISSA_AMAX); + const uint64_t exponent = (uint64_t)IEEE754_DOUBLE_EXPONENTA_BIAS + + IEEE754_DOUBLE_MANTISSA_SIZE - shift; + assert(exponent > 0 && exponent <= IEEE754_DOUBLE_EXPONENTA_MAX); + const uint64_t key = bias - 1 - (exponent << IEEE754_DOUBLE_MANTISSA_SIZE) - + (mantissa - IEEE754_DOUBLE_IMPLICIT_LEAD); +#if !defined(_MSC_VER) || \ + defined( \ + _DEBUG) /* Workaround for MSVC error LNK2019: unresolved external \ + symbol __except1 referenced in function __ftol3_except */ + assert(key == mdbx_key_from_double((double)json_integer)); +#endif /* Workaround for MSVC */ + return key; } + + return bias; } -#endif /* ENABLE_MEMCHECK || __SANITIZE_ADDRESS__ */ -typedef struct { - int err; - MDBX_reader *rslot; -} bind_rslot_result; +int64_t mdbx_jsonInteger_from_key(const MDBX_val v) { + assert(v.iov_len == 8); + const uint64_t key = unaligned_peek_u64(2, v.iov_base); + const uint64_t bias = UINT64_C(0x8000000000000000); + const uint64_t covalent = (key > bias) ? key - bias : bias - key - 1; + const int shift = IEEE754_DOUBLE_EXPONENTA_BIAS + 63 - + (IEEE754_DOUBLE_EXPONENTA_MAX & + (int)(covalent >> IEEE754_DOUBLE_MANTISSA_SIZE)); + if (unlikely(shift < 1)) + return (key < bias) ? INT64_MIN : INT64_MAX; + if (unlikely(shift > 63)) + return 0; -static bind_rslot_result bind_rslot(MDBX_env *env, const uintptr_t tid) { - eASSERT(env, env->me_lck_mmap.lck); - eASSERT(env, env->me_lck->mti_magic_and_version == MDBX_LOCK_MAGIC); - eASSERT(env, env->me_lck->mti_os_and_format == MDBX_LOCK_FORMAT); + const uint64_t unscaled = ((covalent & IEEE754_DOUBLE_MANTISSA_MASK) + << (63 - IEEE754_DOUBLE_MANTISSA_SIZE)) + + bias; + const int64_t absolute = unscaled >> shift; + const int64_t value = (key < bias) ? 
-absolute : absolute; + assert(key == mdbx_key_from_jsonInteger(value) || + (mdbx_key_from_jsonInteger(value - 1) < key && + key < mdbx_key_from_jsonInteger(value + 1))); + return value; +} - bind_rslot_result result = {osal_rdt_lock(env), nullptr}; - if (unlikely(MDBX_IS_ERROR(result.err))) - return result; - if (unlikely(env->me_flags & MDBX_FATAL_ERROR)) { - osal_rdt_unlock(env); - result.err = MDBX_PANIC; - return result; - } - if (unlikely(!env->me_map)) { - osal_rdt_unlock(env); - result.err = MDBX_EPERM; - return result; - } +double mdbx_double_from_key(const MDBX_val v) { + assert(v.iov_len == 8); + return key2double(unaligned_peek_u64(2, v.iov_base)); +} - if (unlikely(env->me_live_reader != env->me_pid)) { - result.err = osal_rpid_set(env); - if (unlikely(result.err != MDBX_SUCCESS)) { - osal_rdt_unlock(env); - return result; - } - env->me_live_reader = env->me_pid; - } +float mdbx_float_from_key(const MDBX_val v) { + assert(v.iov_len == 4); + return key2float(unaligned_peek_u32(2, v.iov_base)); +} - result.err = MDBX_SUCCESS; - size_t slot, nreaders; - while (1) { - nreaders = env->me_lck->mti_numreaders.weak; - for (slot = 0; slot < nreaders; slot++) - if (!atomic_load32(&env->me_lck->mti_readers[slot].mr_pid, - mo_AcquireRelease)) - break; +int32_t mdbx_int32_from_key(const MDBX_val v) { + assert(v.iov_len == 4); + return (int32_t)(unaligned_peek_u32(2, v.iov_base) - UINT32_C(0x80000000)); +} - if (likely(slot < env->me_maxreaders)) - break; +int64_t mdbx_int64_from_key(const MDBX_val v) { + assert(v.iov_len == 8); + return (int64_t)(unaligned_peek_u64(2, v.iov_base) - + UINT64_C(0x8000000000000000)); +} +/// \copyright SPDX-License-Identifier: Apache-2.0 +/// \author Леонид Юрьев aka Leonid Yuriev \date 2015-2024 - result.err = cleanup_dead_readers(env, true, NULL); - if (result.err != MDBX_RESULT_TRUE) { - osal_rdt_unlock(env); - result.err = - (result.err == MDBX_SUCCESS) ? 
MDBX_READERS_FULL : result.err; - return result; - } - } - result.rslot = &env->me_lck->mti_readers[slot]; - /* Claim the reader slot, carefully since other code - * uses the reader table un-mutexed: First reset the - * slot, next publish it in lck->mti_numreaders. After - * that, it is safe for mdbx_env_close() to touch it. - * When it will be closed, we can finally claim it. */ - atomic_store32(&result.rslot->mr_pid, 0, mo_AcquireRelease); - safe64_reset(&result.rslot->mr_txnid, true); - if (slot == nreaders) - env->me_lck->mti_numreaders.weak = (uint32_t)++nreaders; - result.rslot->mr_tid.weak = (env->me_flags & MDBX_NOSTICKYTHREADS) ? 0 : tid; - atomic_store32(&result.rslot->mr_pid, env->me_pid, mo_AcquireRelease); - osal_rdt_unlock(env); +#ifdef __SANITIZE_THREAD__ +/* LY: avoid tsan-trap by txn, mm_last_pg and geo.first_unallocated */ +__attribute__((__no_sanitize_thread__, __noinline__)) +#endif +int mdbx_txn_straggler(const MDBX_txn *txn, int *percent) +{ + int rc = check_txn(txn, MDBX_TXN_BLOCKED); + if (unlikely(rc != MDBX_SUCCESS)) + return (rc > 0) ? -rc : rc; - if (likely(env->me_flags & MDBX_ENV_TXKEY)) { - eASSERT(env, env->me_live_reader == env->me_pid); - thread_rthc_set(env->me_txkey, result.rslot); + MDBX_env *env = txn->env; + if (unlikely((txn->flags & MDBX_TXN_RDONLY) == 0)) { + if (percent) + *percent = (int)((txn->geo.first_unallocated * UINT64_C(100) + + txn->geo.end_pgno / 2) / + txn->geo.end_pgno); + return 0; } - return result; + + txnid_t lag; + troika_t troika = meta_tap(env); + do { + const meta_ptr_t head = meta_recent(env, &troika); + if (percent) { + const pgno_t maxpg = head.ptr_v->geometry.now; + *percent = (int)((head.ptr_v->geometry.first_unallocated * UINT64_C(100) + + maxpg / 2) / + maxpg); + } + lag = (head.txnid - txn->txnid) / xMDBX_TXNID_STEP; + } while (unlikely(meta_should_retry(env, &troika))); + + return (lag > INT_MAX) ? 
INT_MAX : (int)lag; } -__cold int mdbx_thread_register(const MDBX_env *env) { - int rc = check_env(env, true); +__cold int mdbx_dbi_dupsort_depthmask(const MDBX_txn *txn, MDBX_dbi dbi, + uint32_t *mask) { + int rc = check_txn(txn, MDBX_TXN_BLOCKED); if (unlikely(rc != MDBX_SUCCESS)) return rc; - if (unlikely(!env->me_lck_mmap.lck)) - return (env->me_flags & MDBX_EXCLUSIVE) ? MDBX_EINVAL : MDBX_EPERM; + if (unlikely(!mask)) + return MDBX_EINVAL; - if (unlikely((env->me_flags & MDBX_ENV_TXKEY) == 0)) { - eASSERT(env, env->me_flags & MDBX_NOSTICKYTHREADS); - return MDBX_EINVAL /* MDBX_NOSTICKYTHREADS mode */; - } + cursor_couple_t cx; + rc = cursor_init(&cx.outer, txn, dbi); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; + if ((cx.outer.tree->flags & MDBX_DUPSORT) == 0) + return MDBX_RESULT_TRUE; - eASSERT(env, (env->me_flags & (MDBX_NOSTICKYTHREADS | MDBX_ENV_TXKEY)) == - MDBX_ENV_TXKEY); - MDBX_reader *r = thread_rthc_get(env->me_txkey); - if (unlikely(r != NULL)) { - eASSERT(env, r->mr_pid.weak == env->me_pid); - eASSERT(env, r->mr_tid.weak == osal_thread_self()); - if (unlikely(r->mr_pid.weak != env->me_pid)) - return MDBX_BAD_RSLOT; - return MDBX_RESULT_TRUE /* already registered */; + MDBX_val key, data; + rc = outer_first(&cx.outer, &key, &data); + *mask = 0; + while (rc == MDBX_SUCCESS) { + const node_t *node = + page_node(cx.outer.pg[cx.outer.top], cx.outer.ki[cx.outer.top]); + const tree_t *db = node_data(node); + const unsigned flags = node_flags(node); + switch (flags) { + case N_BIGDATA: + case 0: + /* single-value entry, deep = 0 */ + *mask |= 1 << 0; + break; + case N_DUPDATA: + /* single sub-page, deep = 1 */ + *mask |= 1 << 1; + break; + case N_DUPDATA | N_SUBDATA: + /* sub-tree */ + *mask |= 1 << UNALIGNED_PEEK_16(db, tree_t, height); + break; + default: + ERROR("%s/%d: %s %u", "MDBX_CORRUPTED", MDBX_CORRUPTED, + "invalid node-size", flags); + return MDBX_CORRUPTED; + } + rc = outer_next(&cx.outer, &key, &data, MDBX_NEXT_NODUP); } - const uintptr_t 
tid = osal_thread_self(); - if (env->me_txn && unlikely(env->me_txn0->mt_owner == tid)) - return MDBX_TXN_OVERLAPPING; - return bind_rslot((MDBX_env *)env, tid).err; + return (rc == MDBX_NOTFOUND) ? MDBX_SUCCESS : rc; } -__cold int mdbx_thread_unregister(const MDBX_env *env) { - int rc = check_env(env, true); +int mdbx_canary_get(const MDBX_txn *txn, MDBX_canary *canary) { + int rc = check_txn(txn, MDBX_TXN_BLOCKED); if (unlikely(rc != MDBX_SUCCESS)) return rc; - if (unlikely(!env->me_lck_mmap.lck)) - return MDBX_RESULT_TRUE; + if (unlikely(canary == nullptr)) + return MDBX_EINVAL; - if (unlikely((env->me_flags & MDBX_ENV_TXKEY) == 0)) { - eASSERT(env, env->me_flags & MDBX_NOSTICKYTHREADS); - return MDBX_RESULT_TRUE /* MDBX_NOSTICKYTHREADS mode */; - } + *canary = txn->canary; + return MDBX_SUCCESS; +} - eASSERT(env, (env->me_flags & (MDBX_NOSTICKYTHREADS | MDBX_ENV_TXKEY)) == - MDBX_ENV_TXKEY); - MDBX_reader *r = thread_rthc_get(env->me_txkey); - if (unlikely(r == NULL)) - return MDBX_RESULT_TRUE /* not registered */; +int mdbx_get(const MDBX_txn *txn, MDBX_dbi dbi, const MDBX_val *key, + MDBX_val *data) { + DKBUF_DEBUG; + DEBUG("===> get db %u key [%s]", dbi, DKEY_DEBUG(key)); - eASSERT(env, r->mr_pid.weak == env->me_pid); - eASSERT(env, r->mr_tid.weak == osal_thread_self()); - if (unlikely(r->mr_pid.weak != env->me_pid || - r->mr_tid.weak != osal_thread_self())) - return MDBX_BAD_RSLOT; + int rc = check_txn(txn, MDBX_TXN_BLOCKED); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; - eASSERT(env, r->mr_txnid.weak >= SAFE64_INVALID_THRESHOLD); - if (unlikely(r->mr_txnid.weak < SAFE64_INVALID_THRESHOLD)) - return MDBX_BUSY /* transaction is still active */; + if (unlikely(!key || !data)) + return MDBX_EINVAL; - atomic_store32(&r->mr_pid, 0, mo_Relaxed); - atomic_store32(&env->me_lck->mti_readers_refresh_flag, true, - mo_AcquireRelease); - thread_rthc_set(env->me_txkey, nullptr); - return MDBX_SUCCESS; + cursor_couple_t cx; + rc = cursor_init(&cx.outer, txn, dbi); + 
if (unlikely(rc != MDBX_SUCCESS)) + return rc; + + return cursor_seek(&cx.outer, (MDBX_val *)key, data, MDBX_SET).err; } -/* check against https://libmdbx.dqdkfa.ru/dead-github/issues/269 */ -static bool coherency_check(const MDBX_env *env, const txnid_t txnid, - const volatile MDBX_db *dbs, - const volatile MDBX_meta *meta, bool report) { - const txnid_t freedb_mod_txnid = dbs[FREE_DBI].md_mod_txnid; - const txnid_t maindb_mod_txnid = dbs[MAIN_DBI].md_mod_txnid; - const pgno_t last_pgno = meta->mm_geo.now; - - const pgno_t freedb_root_pgno = dbs[FREE_DBI].md_root; - const MDBX_page *freedb_root = (env->me_map && freedb_root_pgno < last_pgno) - ? pgno2page(env, freedb_root_pgno) - : nullptr; - - const pgno_t maindb_root_pgno = dbs[MAIN_DBI].md_root; - const MDBX_page *maindb_root = (env->me_map && maindb_root_pgno < last_pgno) - ? pgno2page(env, maindb_root_pgno) - : nullptr; - const uint64_t magic_and_version = - unaligned_peek_u64_volatile(4, &meta->mm_magic_and_version); +int mdbx_get_equal_or_great(const MDBX_txn *txn, MDBX_dbi dbi, MDBX_val *key, + MDBX_val *data) { + int rc = check_txn(txn, MDBX_TXN_BLOCKED); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; - bool ok = true; - if (freedb_root_pgno != P_INVALID && - unlikely(freedb_root_pgno >= last_pgno)) { - if (report) - WARNING( - "catch invalid %sdb root %" PRIaPGNO " for meta_txnid %" PRIaTXN - " %s", - "free", freedb_root_pgno, txnid, - (env->me_stuck_meta < 0) - ? "(workaround for incoherent flaw of unified page/buffer cache)" - : "(wagering meta)"); - ok = false; - } - if (maindb_root_pgno != P_INVALID && - unlikely(maindb_root_pgno >= last_pgno)) { - if (report) - WARNING( - "catch invalid %sdb root %" PRIaPGNO " for meta_txnid %" PRIaTXN - " %s", - "main", maindb_root_pgno, txnid, - (env->me_stuck_meta < 0) - ? 
"(workaround for incoherent flaw of unified page/buffer cache)" - : "(wagering meta)"); - ok = false; - } - if (unlikely(txnid < freedb_mod_txnid || - (!freedb_mod_txnid && freedb_root && - likely(magic_and_version == MDBX_DATA_MAGIC)))) { - if (report) - WARNING( - "catch invalid %sdb.mod_txnid %" PRIaTXN " for meta_txnid %" PRIaTXN - " %s", - "free", freedb_mod_txnid, txnid, - (env->me_stuck_meta < 0) - ? "(workaround for incoherent flaw of unified page/buffer cache)" - : "(wagering meta)"); - ok = false; - } - if (unlikely(txnid < maindb_mod_txnid || - (!maindb_mod_txnid && maindb_root && - likely(magic_and_version == MDBX_DATA_MAGIC)))) { - if (report) - WARNING( - "catch invalid %sdb.mod_txnid %" PRIaTXN " for meta_txnid %" PRIaTXN - " %s", - "main", maindb_mod_txnid, txnid, - (env->me_stuck_meta < 0) - ? "(workaround for incoherent flaw of unified page/buffer cache)" - : "(wagering meta)"); - ok = false; - } - if (likely(freedb_root && freedb_mod_txnid)) { - VALGRIND_MAKE_MEM_DEFINED(freedb_root, sizeof(freedb_root->mp_txnid)); - MDBX_ASAN_UNPOISON_MEMORY_REGION(freedb_root, - sizeof(freedb_root->mp_txnid)); - const txnid_t root_txnid = freedb_root->mp_txnid; - if (unlikely(root_txnid != freedb_mod_txnid)) { - if (report) - WARNING("catch invalid root_page %" PRIaPGNO " mod_txnid %" PRIaTXN - " for %sdb.mod_txnid %" PRIaTXN " %s", - freedb_root_pgno, root_txnid, "free", freedb_mod_txnid, - (env->me_stuck_meta < 0) ? 
"(workaround for incoherent flaw of " - "unified page/buffer cache)" - : "(wagering meta)"); - ok = false; - } - } - if (likely(maindb_root && maindb_mod_txnid)) { - VALGRIND_MAKE_MEM_DEFINED(maindb_root, sizeof(maindb_root->mp_txnid)); - MDBX_ASAN_UNPOISON_MEMORY_REGION(maindb_root, - sizeof(maindb_root->mp_txnid)); - const txnid_t root_txnid = maindb_root->mp_txnid; - if (unlikely(root_txnid != maindb_mod_txnid)) { - if (report) - WARNING("catch invalid root_page %" PRIaPGNO " mod_txnid %" PRIaTXN - " for %sdb.mod_txnid %" PRIaTXN " %s", - maindb_root_pgno, root_txnid, "main", maindb_mod_txnid, - (env->me_stuck_meta < 0) ? "(workaround for incoherent flaw of " - "unified page/buffer cache)" - : "(wagering meta)"); - ok = false; - } - } - if (unlikely(!ok) && report) - env->me_lck->mti_pgop_stat.incoherence.weak = - (env->me_lck->mti_pgop_stat.incoherence.weak >= INT32_MAX) - ? INT32_MAX - : env->me_lck->mti_pgop_stat.incoherence.weak + 1; - return ok; -} + if (unlikely(!key || !data)) + return MDBX_EINVAL; -__cold static int coherency_timeout(uint64_t *timestamp, intptr_t pgno, - const MDBX_env *env) { - if (likely(timestamp && *timestamp == 0)) - *timestamp = osal_monotime(); - else if (unlikely(!timestamp || osal_monotime() - *timestamp > - osal_16dot16_to_monotime(65536 / 10))) { - if (pgno >= 0 && pgno != env->me_stuck_meta) - ERROR("bailout waiting for %" PRIuSIZE " page arrival %s", pgno, - "(workaround for incoherent flaw of unified page/buffer cache)"); - else if (env->me_stuck_meta < 0) - ERROR("bailout waiting for valid snapshot (%s)", - "workaround for incoherent flaw of unified page/buffer cache"); - return MDBX_PROBLEM; - } + if (unlikely(txn->flags & MDBX_TXN_BLOCKED)) + return MDBX_BAD_TXN; - osal_memory_fence(mo_AcquireRelease, true); -#if defined(_WIN32) || defined(_WIN64) - SwitchToThread(); -#elif defined(__linux__) || defined(__gnu_linux__) || defined(_UNIX03_SOURCE) - sched_yield(); -#elif (defined(_GNU_SOURCE) && __GLIBC_PREREQ(2, 1)) || 
defined(_OPEN_THREADS) - pthread_yield(); -#else - usleep(42); -#endif - return MDBX_RESULT_TRUE; -} + cursor_couple_t cx; + rc = cursor_init(&cx.outer, txn, dbi); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; -/* check with timeout as the workaround - * for https://libmdbx.dqdkfa.ru/dead-github/issues/269 */ -__hot static int coherency_check_head(MDBX_txn *txn, const meta_ptr_t head, - uint64_t *timestamp) { - /* Copy the DB info and flags */ - txn->mt_geo = head.ptr_v->mm_geo; - memcpy(txn->mt_dbs, head.ptr_c->mm_dbs, CORE_DBS * sizeof(MDBX_db)); - VALGRIND_MAKE_MEM_UNDEFINED(txn->mt_dbs + CORE_DBS, - txn->mt_env->me_maxdbs - CORE_DBS); - txn->mt_canary = head.ptr_v->mm_canary; - - if (unlikely(!coherency_check(txn->mt_env, head.txnid, txn->mt_dbs, - head.ptr_v, *timestamp == 0))) - return coherency_timeout(timestamp, -1, txn->mt_env); - - tASSERT(txn, txn->mt_dbs[FREE_DBI].md_flags == MDBX_INTEGERKEY); - tASSERT(txn, db_check_flags(txn->mt_dbs[MAIN_DBI].md_flags)); - return MDBX_SUCCESS; + return cursor_ops(&cx.outer, key, data, MDBX_SET_LOWERBOUND); } -static int coherency_check_written(const MDBX_env *env, const txnid_t txnid, - const volatile MDBX_meta *meta, - const intptr_t pgno, uint64_t *timestamp) { - const bool report = !(timestamp && *timestamp); - const txnid_t head_txnid = meta_txnid(meta); - if (unlikely(head_txnid < MIN_TXNID || head_txnid < txnid)) { - if (report) { - env->me_lck->mti_pgop_stat.incoherence.weak = - (env->me_lck->mti_pgop_stat.incoherence.weak >= INT32_MAX) - ? INT32_MAX - : env->me_lck->mti_pgop_stat.incoherence.weak + 1; - WARNING("catch %s txnid %" PRIaTXN " for meta_%" PRIaPGNO " %s", - (head_txnid < MIN_TXNID) ? 
"invalid" : "unexpected", head_txnid, - bytes2pgno(env, ptr_dist(meta, env->me_map)), - "(workaround for incoherent flaw of unified page/buffer cache)"); - } - return coherency_timeout(timestamp, pgno, env); - } - if (unlikely(!coherency_check(env, head_txnid, meta->mm_dbs, meta, report))) - return coherency_timeout(timestamp, pgno, env); +int mdbx_get_ex(const MDBX_txn *txn, MDBX_dbi dbi, MDBX_val *key, + MDBX_val *data, size_t *values_count) { + DKBUF_DEBUG; + DEBUG("===> get db %u key [%s]", dbi, DKEY_DEBUG(key)); - eASSERT(env, meta->mm_dbs[FREE_DBI].md_flags == MDBX_INTEGERKEY); - eASSERT(env, db_check_flags(meta->mm_dbs[MAIN_DBI].md_flags)); - return MDBX_SUCCESS; -} + int rc = check_txn(txn, MDBX_TXN_BLOCKED); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; -static bool check_meta_coherency(const MDBX_env *env, - const volatile MDBX_meta *meta, bool report) { - uint64_t timestamp = 0; - return coherency_check_written(env, 0, meta, -1, - report ? ×tamp : nullptr) == MDBX_SUCCESS; -} + if (unlikely(!key || !data)) + return MDBX_EINVAL; -/* Common code for mdbx_txn_begin() and mdbx_txn_renew(). 
*/ -static int txn_renew(MDBX_txn *txn, unsigned flags) { - MDBX_env *env = txn->mt_env; - int rc; + cursor_couple_t cx; + rc = cursor_init(&cx.outer, txn, dbi); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; -#if MDBX_ENV_CHECKPID - if (unlikely(env->me_pid != osal_getpid())) { - env->me_flags |= MDBX_FATAL_ERROR; - return MDBX_PANIC; + rc = cursor_seek(&cx.outer, key, data, MDBX_SET_KEY).err; + if (unlikely(rc != MDBX_SUCCESS)) { + if (values_count) + *values_count = 0; + return rc; } -#endif /* MDBX_ENV_CHECKPID */ - STATIC_ASSERT(sizeof(MDBX_reader) == 32); -#if MDBX_LOCKING > 0 - STATIC_ASSERT(offsetof(MDBX_lockinfo, mti_wlock) % MDBX_CACHELINE_SIZE == 0); - STATIC_ASSERT(offsetof(MDBX_lockinfo, mti_rlock) % MDBX_CACHELINE_SIZE == 0); -#else - STATIC_ASSERT( - offsetof(MDBX_lockinfo, mti_oldest_reader) % MDBX_CACHELINE_SIZE == 0); - STATIC_ASSERT(offsetof(MDBX_lockinfo, mti_numreaders) % MDBX_CACHELINE_SIZE == - 0); -#endif /* MDBX_LOCKING */ - STATIC_ASSERT(offsetof(MDBX_lockinfo, mti_readers) % MDBX_CACHELINE_SIZE == - 0); + if (values_count) { + *values_count = 1; + if (inner_pointed(&cx.outer)) + *values_count = + (sizeof(*values_count) >= sizeof(cx.inner.nested_tree.items) || + cx.inner.nested_tree.items <= PTRDIFF_MAX) + ? 
(size_t)cx.inner.nested_tree.items + : PTRDIFF_MAX; + } + return MDBX_SUCCESS; +} - const uintptr_t tid = osal_thread_self(); - flags |= env->me_flags & (MDBX_NOSTICKYTHREADS | MDBX_WRITEMAP); - if (flags & MDBX_TXN_RDONLY) { - eASSERT(env, (flags & ~(MDBX_TXN_RO_BEGIN_FLAGS | MDBX_WRITEMAP | - MDBX_NOSTICKYTHREADS)) == 0); - txn->mt_flags = flags; - MDBX_reader *r = txn->to.reader; - STATIC_ASSERT(sizeof(uintptr_t) <= sizeof(r->mr_tid)); - if (likely(env->me_flags & MDBX_ENV_TXKEY)) { - eASSERT(env, !(env->me_flags & MDBX_NOSTICKYTHREADS)); - r = thread_rthc_get(env->me_txkey); - if (likely(r)) { - if (unlikely(!r->mr_pid.weak) && - (mdbx_static.flags & MDBX_DBG_LEGACY_MULTIOPEN)) { - thread_rthc_set(env->me_txkey, nullptr); - r = nullptr; - } else { - eASSERT(env, r->mr_pid.weak == env->me_pid); - eASSERT(env, r->mr_tid.weak == osal_thread_self()); - } - } - } else { - eASSERT(env, - !env->me_lck_mmap.lck || (env->me_flags & MDBX_NOSTICKYTHREADS)); - } +/*----------------------------------------------------------------------------*/ - if (likely(r)) { - if (unlikely(r->mr_pid.weak != env->me_pid || - r->mr_txnid.weak < SAFE64_INVALID_THRESHOLD)) - return MDBX_BAD_RSLOT; - } else if (env->me_lck_mmap.lck) { - bind_rslot_result brs = bind_rslot(env, tid); - if (unlikely(brs.err != MDBX_SUCCESS)) - return brs.err; - r = brs.rslot; - } - txn->to.reader = r; - STATIC_ASSERT(MDBX_TXN_RDONLY_PREPARE > MDBX_TXN_RDONLY); - if (flags & (MDBX_TXN_RDONLY_PREPARE - MDBX_TXN_RDONLY)) { - eASSERT(env, txn->mt_txnid == 0); - eASSERT(env, txn->mt_owner == 0); - eASSERT(env, txn->mt_numdbs == 0); - if (likely(r)) { - eASSERT(env, r->mr_snapshot_pages_used.weak == 0); - eASSERT(env, r->mr_txnid.weak >= SAFE64_INVALID_THRESHOLD); - atomic_store32(&r->mr_snapshot_pages_used, 0, mo_Relaxed); - } - txn->mt_flags = MDBX_TXN_RDONLY | MDBX_TXN_FINISHED; +int mdbx_canary_put(MDBX_txn *txn, const MDBX_canary *canary) { + int rc = check_txn_rw(txn, MDBX_TXN_BLOCKED); + if (unlikely(rc != 
MDBX_SUCCESS)) + return rc; + + if (likely(canary)) { + if (txn->canary.x == canary->x && txn->canary.y == canary->y && + txn->canary.z == canary->z) return MDBX_SUCCESS; - } - txn->mt_owner = tid; + txn->canary.x = canary->x; + txn->canary.y = canary->y; + txn->canary.z = canary->z; + } + txn->canary.v = txn->txnid; + txn->flags |= MDBX_TXN_DIRTY; - /* Seek & fetch the last meta */ - uint64_t timestamp = 0; - size_t loop = 0; - meta_troika_t troika = meta_tap(env); - while (1) { - const meta_ptr_t head = - likely(env->me_stuck_meta < 0) - ? /* regular */ meta_recent(env, &troika) - : /* recovery mode */ meta_ptr(env, env->me_stuck_meta); - if (likely(r)) { - safe64_reset(&r->mr_txnid, false); - atomic_store32(&r->mr_snapshot_pages_used, head.ptr_v->mm_geo.next, - mo_Relaxed); - atomic_store64( - &r->mr_snapshot_pages_retired, - unaligned_peek_u64_volatile(4, head.ptr_v->mm_pages_retired), - mo_Relaxed); - safe64_write(&r->mr_txnid, head.txnid); - eASSERT(env, r->mr_pid.weak == osal_getpid()); - eASSERT(env, r->mr_tid.weak == ((env->me_flags & MDBX_NOSTICKYTHREADS) - ? 
0 - : osal_thread_self())); - eASSERT(env, r->mr_txnid.weak == head.txnid || - (r->mr_txnid.weak >= SAFE64_INVALID_THRESHOLD && - head.txnid < env->me_lck->mti_oldest_reader.weak)); - atomic_store32(&env->me_lck->mti_readers_refresh_flag, true, - mo_AcquireRelease); - } else { - /* exclusive mode without lck */ - eASSERT(env, !env->me_lck_mmap.lck && env->me_lck == lckless_stub(env)); - } - jitter4testing(true); - - /* Snap the state from current meta-head */ - txn->mt_txnid = head.txnid; - if (likely(env->me_stuck_meta < 0) && - unlikely(meta_should_retry(env, &troika) || - head.txnid < atomic_load64(&env->me_lck->mti_oldest_reader, - mo_AcquireRelease))) { - if (unlikely(++loop > 42)) { - ERROR("bailout waiting for valid snapshot (%s)", - "metapages are too volatile"); - rc = MDBX_PROBLEM; - txn->mt_txnid = INVALID_TXNID; - if (likely(r)) - safe64_reset(&r->mr_txnid, false); - goto bailout; - } - timestamp = 0; - continue; - } + return MDBX_SUCCESS; +} - rc = coherency_check_head(txn, head, ×tamp); - jitter4testing(false); - if (likely(rc == MDBX_SUCCESS)) - break; +/* Функция сообщает находится ли указанный адрес в "грязной" странице у + * заданной пишущей транзакции. В конечном счете это позволяет избавиться от + * лишнего копирования данных из НЕ-грязных страниц. + * + * "Грязные" страницы - это те, которые уже были изменены в ходе пишущей + * транзакции. Соответственно, какие-либо дальнейшие изменения могут привести + * к перезаписи таких страниц. Поэтому все функции, выполняющие изменения, в + * качестве аргументов НЕ должны получать указатели на данные в таких + * страницах. В свою очередь "НЕ грязные" страницы перед модификацией будут + * скопированы. + * + * Другими словами, данные из "грязных" страниц должны быть либо скопированы + * перед передачей в качестве аргументов для дальнейших модификаций, либо + * отвергнуты на стадии проверки корректности аргументов. 
+ * + * Таким образом, функция позволяет как избавится от лишнего копирования, + * так и выполнить более полную проверку аргументов. + * + * ВАЖНО: Передаваемый указатель должен указывать на начало данных. Только + * так гарантируется что актуальный заголовок страницы будет физически + * расположен в той-же странице памяти, в том числе для многостраничных + * P_LARGE страниц с длинными данными. */ +int mdbx_is_dirty(const MDBX_txn *txn, const void *ptr) { + int rc = check_txn(txn, MDBX_TXN_BLOCKED); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; - if (unlikely(rc != MDBX_RESULT_TRUE)) { - txn->mt_txnid = INVALID_TXNID; - if (likely(r)) - safe64_reset(&r->mr_txnid, false); - goto bailout; + const MDBX_env *env = txn->env; + const ptrdiff_t offset = ptr_dist(ptr, env->dxb_mmap.base); + if (offset >= 0) { + const pgno_t pgno = bytes2pgno(env, offset); + if (likely(pgno < txn->geo.first_unallocated)) { + const page_t *page = pgno2page(env, pgno); + if (unlikely(page->pgno != pgno || (page->flags & P_ILL_BITS) != 0)) { + /* The ptr pointed into middle of a large page, + * not to the beginning of a data. */ + return MDBX_EINVAL; } + return ((txn->flags & MDBX_TXN_RDONLY) || !is_modifable(txn, page)) + ? 
MDBX_RESULT_FALSE + : MDBX_RESULT_TRUE; } - - if (unlikely(txn->mt_txnid < MIN_TXNID || txn->mt_txnid > MAX_TXNID)) { - ERROR("%s", "environment corrupted by died writer, must shutdown!"); - if (likely(r)) - safe64_reset(&r->mr_txnid, false); - txn->mt_txnid = INVALID_TXNID; - rc = MDBX_CORRUPTED; - goto bailout; - } - ENSURE(env, txn->mt_txnid >= - /* paranoia is appropriate here */ env->me_lck - ->mti_oldest_reader.weak); - tASSERT(txn, txn->mt_dbs[FREE_DBI].md_flags == MDBX_INTEGERKEY); - tASSERT(txn, db_check_flags(txn->mt_dbs[MAIN_DBI].md_flags)); - } else { - eASSERT(env, (flags & ~(MDBX_TXN_RW_BEGIN_FLAGS | MDBX_TXN_SPILLS | - MDBX_WRITEMAP | MDBX_NOSTICKYTHREADS)) == 0); - if (unlikely(txn->mt_owner == tid || - /* not recovery mode */ env->me_stuck_meta >= 0)) - return MDBX_BUSY; - MDBX_lockinfo *const lck = env->me_lck_mmap.lck; - if (lck && (env->me_flags & MDBX_NOSTICKYTHREADS) == 0 && - (mdbx_static.flags & MDBX_DBG_LEGACY_OVERLAP) == 0) { - const size_t snap_nreaders = - atomic_load32(&lck->mti_numreaders, mo_AcquireRelease); - for (size_t i = 0; i < snap_nreaders; ++i) { - if (atomic_load32(&lck->mti_readers[i].mr_pid, mo_Relaxed) == - env->me_pid && - unlikely(atomic_load64(&lck->mti_readers[i].mr_tid, mo_Relaxed) == - tid)) { - const txnid_t txnid = safe64_read(&lck->mti_readers[i].mr_txnid); - if (txnid >= MIN_TXNID && txnid <= MAX_TXNID) - return MDBX_TXN_OVERLAPPING; - } - } + if ((size_t)offset < env->dxb_mmap.limit) { + /* Указатель адресует что-то в пределах mmap, но за границей + * распределенных страниц. Такое может случится если mdbx_is_dirty() + * вызывается после операции, в ходе которой грязная страница была + * возвращена в нераспределенное пространство. */ + return (txn->flags & MDBX_TXN_RDONLY) ? 
MDBX_EINVAL : MDBX_RESULT_TRUE; } + } - /* Not yet touching txn == env->me_txn0, it may be active */ - jitter4testing(false); - rc = osal_txn_lock(env, !!(flags & MDBX_TXN_TRY)); - if (unlikely(rc)) - return rc; - if (unlikely(env->me_flags & MDBX_FATAL_ERROR)) { - osal_txn_unlock(env); - return MDBX_PANIC; - } -#if defined(_WIN32) || defined(_WIN64) - if (unlikely(!env->me_map)) { - osal_txn_unlock(env); - return MDBX_EPERM; - } -#endif /* Windows */ + /* Страница вне используемого mmap-диапазона, т.е. либо в функцию был + * передан некорректный адрес, либо адрес в теневой странице, которая была + * выделена посредством malloc(). + * + * Для режима MDBX_WRITE_MAP режима страница однозначно "не грязная", + * а для режимов без MDBX_WRITE_MAP однозначно "не чистая". */ + return (txn->flags & (MDBX_WRITEMAP | MDBX_TXN_RDONLY)) ? MDBX_EINVAL + : MDBX_RESULT_TRUE; +} - txn->tw.troika = meta_tap(env); - const meta_ptr_t head = meta_recent(env, &txn->tw.troika); - uint64_t timestamp = 0; - while ("workaround for https://libmdbx.dqdkfa.ru/dead-github/issues/269") { - rc = coherency_check_head(txn, head, ×tamp); - if (likely(rc == MDBX_SUCCESS)) - break; - if (unlikely(rc != MDBX_RESULT_TRUE)) - goto bailout; - } - eASSERT(env, meta_txnid(head.ptr_v) == head.txnid); - txn->mt_txnid = safe64_txnid_next(head.txnid); - if (unlikely(txn->mt_txnid > MAX_TXNID)) { - rc = MDBX_TXN_FULL; - ERROR("txnid overflow, raise %d", rc); - goto bailout; - } +int mdbx_del(MDBX_txn *txn, MDBX_dbi dbi, const MDBX_val *key, + const MDBX_val *data) { + int rc = check_txn_rw(txn, MDBX_TXN_BLOCKED); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; - tASSERT(txn, txn->mt_dbs[FREE_DBI].md_flags == MDBX_INTEGERKEY); - tASSERT(txn, db_check_flags(txn->mt_dbs[MAIN_DBI].md_flags)); - txn->mt_flags = flags; - txn->mt_child = NULL; - txn->tw.loose_pages = NULL; - txn->tw.loose_count = 0; -#if MDBX_ENABLE_REFUND - txn->tw.loose_refund_wl = 0; -#endif /* MDBX_ENABLE_REFUND */ - 
MDBX_PNL_SETSIZE(txn->tw.retired_pages, 0); - txn->tw.spilled.list = NULL; - txn->tw.spilled.least_removed = 0; - txn->tw.gc_time_acc = 0; - txn->tw.last_reclaimed = 0; - if (txn->tw.lifo_reclaimed) - MDBX_PNL_SETSIZE(txn->tw.lifo_reclaimed, 0); - env->me_txn = txn; + if (unlikely(!key)) + return MDBX_EINVAL; - if ((txn->mt_flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC) { - rc = dpl_alloc(txn); - if (unlikely(rc != MDBX_SUCCESS)) - goto bailout; - txn->tw.dirtyroom = txn->mt_env->me_options.dp_limit; - txn->tw.dirtylru = MDBX_DEBUG ? UINT32_MAX / 3 - 42 : 0; - } else { - tASSERT(txn, txn->tw.dirtylist == nullptr); - txn->tw.dirtylist = nullptr; - txn->tw.dirtyroom = MAX_PAGENO; - txn->tw.dirtylru = 0; - } - eASSERT(env, txn->tw.writemap_dirty_npages == 0); - eASSERT(env, txn->tw.writemap_spilled_npages == 0); + if (unlikely(dbi <= FREE_DBI)) + return MDBX_BAD_DBI; + + if (unlikely(txn->flags & (MDBX_TXN_RDONLY | MDBX_TXN_BLOCKED))) + return (txn->flags & MDBX_TXN_RDONLY) ? MDBX_EACCESS : MDBX_BAD_TXN; + + cursor_couple_t cx; + rc = cursor_init(&cx.outer, txn, dbi); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; + + MDBX_val proxy; + MDBX_cursor_op op = MDBX_SET; + unsigned flags = MDBX_ALLDUPS; + if (data) { + proxy = *data; + data = &proxy; + op = MDBX_GET_BOTH; + flags = 0; } + rc = cursor_seek(&cx.outer, (MDBX_val *)key, (MDBX_val *)data, op).err; + if (unlikely(rc != MDBX_SUCCESS)) + return rc; - txn->mt_front = - txn->mt_txnid + ((flags & (MDBX_WRITEMAP | MDBX_RDONLY)) == 0); + cx.outer.next = txn->cursors[dbi]; + txn->cursors[dbi] = &cx.outer; + rc = cursor_del(&cx.outer, flags); + txn->cursors[dbi] = cx.outer.next; + return rc; +} - /* Setup db info */ - tASSERT(txn, txn->mt_dbs[FREE_DBI].md_flags == MDBX_INTEGERKEY); - tASSERT(txn, db_check_flags(txn->mt_dbs[MAIN_DBI].md_flags)); - VALGRIND_MAKE_MEM_UNDEFINED(txn->mt_dbi_state, env->me_maxdbs); -#if MDBX_ENABLE_DBI_SPARSE - txn->mt_numdbs = CORE_DBS; - VALGRIND_MAKE_MEM_UNDEFINED( - txn->mt_dbi_sparse, 
- ceil_powerof2(env->me_maxdbs, CHAR_BIT * sizeof(txn->mt_dbi_sparse[0])) / - CHAR_BIT); - txn->mt_dbi_sparse[0] = (1 << CORE_DBS) - 1; -#else - txn->mt_numdbs = (env->me_numdbs < 8) ? env->me_numdbs : 8; - if (txn->mt_numdbs > CORE_DBS) - memset(txn->mt_dbi_state + CORE_DBS, 0, txn->mt_numdbs - CORE_DBS); -#endif /* MDBX_ENABLE_DBI_SPARSE */ - txn->mt_dbi_state[FREE_DBI] = DBI_LINDO | DBI_VALID; - txn->mt_dbi_state[MAIN_DBI] = DBI_LINDO | DBI_VALID; - txn->mt_cursors[FREE_DBI] = nullptr; - txn->mt_cursors[MAIN_DBI] = nullptr; - txn->mt_dbi_seqs[FREE_DBI] = 0; - txn->mt_dbi_seqs[MAIN_DBI] = - atomic_load32(&env->me_dbi_seqs[MAIN_DBI], mo_AcquireRelease); - - if (unlikely(env->me_db_flags[MAIN_DBI] != - (DB_VALID | txn->mt_dbs[MAIN_DBI].md_flags))) { - const bool need_txn_lock = env->me_txn0 && env->me_txn0->mt_owner != tid; - bool should_unlock = false; - if (need_txn_lock) { - rc = osal_txn_lock(env, true); - if (rc == MDBX_SUCCESS) - should_unlock = true; - else if (rc != MDBX_BUSY && rc != MDBX_EDEADLK) - goto bailout; - } - rc = osal_fastmutex_acquire(&env->me_dbi_lock); - if (likely(rc == MDBX_SUCCESS)) { - uint32_t seq = dbi_seq_next(env, MAIN_DBI); - /* проверяем повторно после захвата блокировки */ - if (env->me_db_flags[MAIN_DBI] != - (DB_VALID | txn->mt_dbs[MAIN_DBI].md_flags)) { - if (!need_txn_lock || should_unlock || - /* если нет активной пишущей транзакции, - * то следующая будет ждать на me_dbi_lock */ - !env->me_txn) { - if (env->me_db_flags[MAIN_DBI] != 0 || MDBX_DEBUG) - NOTICE("renew MainDB for %s-txn %" PRIaTXN - " since db-flags changes 0x%x -> 0x%x", - (txn->mt_flags & MDBX_TXN_RDONLY) ? 
"ro" : "rw", - txn->mt_txnid, env->me_db_flags[MAIN_DBI] & ~DB_VALID, - txn->mt_dbs[MAIN_DBI].md_flags); - env->me_db_flags[MAIN_DBI] = DB_POISON; - atomic_store32(&env->me_dbi_seqs[MAIN_DBI], seq, mo_AcquireRelease); - rc = setup_sdb(&env->me_dbxs[MAIN_DBI], &txn->mt_dbs[MAIN_DBI], - env->me_psize); - if (likely(rc == MDBX_SUCCESS)) { - seq = dbi_seq_next(env, MAIN_DBI); - env->me_db_flags[MAIN_DBI] = - DB_VALID | txn->mt_dbs[MAIN_DBI].md_flags; - txn->mt_dbi_seqs[MAIN_DBI] = atomic_store32( - &env->me_dbi_seqs[MAIN_DBI], seq, mo_AcquireRelease); - } - } else { - ERROR("MainDB db-flags changes 0x%x -> 0x%x ahead of read-txn " - "%" PRIaTXN, - txn->mt_dbs[MAIN_DBI].md_flags, - env->me_db_flags[MAIN_DBI] & ~DB_VALID, txn->mt_txnid); - rc = MDBX_INCOMPATIBLE; +int mdbx_put(MDBX_txn *txn, MDBX_dbi dbi, const MDBX_val *key, MDBX_val *data, + MDBX_put_flags_t flags) { + int rc = check_txn_rw(txn, MDBX_TXN_BLOCKED); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; + + if (unlikely(!key || !data)) + return MDBX_EINVAL; + + if (unlikely(dbi <= FREE_DBI)) + return MDBX_BAD_DBI; + + if (unlikely(flags & ~(MDBX_NOOVERWRITE | MDBX_NODUPDATA | MDBX_ALLDUPS | + MDBX_ALLDUPS | MDBX_RESERVE | MDBX_APPEND | + MDBX_APPENDDUP | MDBX_CURRENT | MDBX_MULTIPLE))) + return MDBX_EINVAL; + + if (unlikely(txn->flags & (MDBX_TXN_RDONLY | MDBX_TXN_BLOCKED))) + return (txn->flags & MDBX_TXN_RDONLY) ? 
MDBX_EACCESS : MDBX_BAD_TXN; + + cursor_couple_t cx; + rc = cursor_init(&cx.outer, txn, dbi); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; + cx.outer.next = txn->cursors[dbi]; + txn->cursors[dbi] = &cx.outer; + + /* LY: support for update (explicit overwrite) */ + if (flags & MDBX_CURRENT) { + rc = cursor_seek(&cx.outer, (MDBX_val *)key, nullptr, MDBX_SET).err; + if (likely(rc == MDBX_SUCCESS) && (txn->dbs[dbi].flags & MDBX_DUPSORT) && + (flags & MDBX_ALLDUPS) == 0) { + /* LY: allows update (explicit overwrite) only for unique keys */ + node_t *node = + page_node(cx.outer.pg[cx.outer.top], cx.outer.ki[cx.outer.top]); + if (node_flags(node) & N_DUPDATA) { + tASSERT(txn, inner_pointed(&cx.outer) && + cx.outer.subcur->nested_tree.items > 1); + rc = MDBX_EMULTIVAL; + if ((flags & MDBX_NOOVERWRITE) == 0) { + flags -= MDBX_CURRENT; + rc = cursor_del(&cx.outer, MDBX_ALLDUPS); } } - ENSURE(env, osal_fastmutex_release(&env->me_dbi_lock) == MDBX_SUCCESS); - } else { - DEBUG("me_dbi_lock failed, err %d", rc); } - if (should_unlock) - osal_txn_unlock(env); - if (unlikely(rc != MDBX_SUCCESS)) - goto bailout; } - if (unlikely(txn->mt_dbs[FREE_DBI].md_flags != MDBX_INTEGERKEY)) { - ERROR("unexpected/invalid db-flags 0x%u for GC/FreeDB", - txn->mt_dbs[FREE_DBI].md_flags); - rc = MDBX_INCOMPATIBLE; - goto bailout; - } + if (likely(rc == MDBX_SUCCESS)) + rc = cursor_put_checklen(&cx.outer, key, data, flags); + txn->cursors[dbi] = cx.outer.next; - tASSERT(txn, txn->mt_dbs[FREE_DBI].md_flags == MDBX_INTEGERKEY); - tASSERT(txn, db_check_flags(txn->mt_dbs[MAIN_DBI].md_flags)); - if (unlikely(env->me_flags & MDBX_FATAL_ERROR)) { - WARNING("%s", "environment had fatal error, must shutdown!"); - rc = MDBX_PANIC; - } else { - const size_t size_bytes = pgno2bytes(env, txn->mt_end_pgno); - const size_t used_bytes = pgno2bytes(env, txn->mt_next_pgno); - const size_t required_bytes = - (txn->mt_flags & MDBX_TXN_RDONLY) ? 
used_bytes : size_bytes; - eASSERT(env, env->me_dxb_mmap.limit >= env->me_dxb_mmap.current); - if (unlikely(required_bytes > env->me_dxb_mmap.current)) { - /* Размер БД (для пишущих транзакций) или используемых данных (для - * читающих транзакций) больше предыдущего/текущего размера внутри - * процесса, увеличиваем. Сюда также попадает случай увеличения верхней - * границы размера БД и отображения. В читающих транзакциях нельзя - * изменять размер файла, который может быть больше необходимого этой - * транзакции. */ - if (txn->mt_geo.upper > MAX_PAGENO + 1 || - bytes2pgno(env, pgno2bytes(env, txn->mt_geo.upper)) != - txn->mt_geo.upper) { - rc = MDBX_UNABLE_EXTEND_MAPSIZE; - goto bailout; - } - rc = dxb_resize(env, txn->mt_next_pgno, txn->mt_end_pgno, - txn->mt_geo.upper, implicit_grow); - if (unlikely(rc != MDBX_SUCCESS)) - goto bailout; - eASSERT(env, env->me_dxb_mmap.limit >= env->me_dxb_mmap.current); - } else if (unlikely(size_bytes < env->me_dxb_mmap.current)) { - /* Размер БД меньше предыдущего/текущего размера внутри процесса, можно - * уменьшить, но всё сложнее: - * - размер файла согласован со всеми читаемыми снимками на момент - * коммита последней транзакции; - * - в читающей транзакции размер файла может быть больше и него нельзя - * изменять, в том числе менять madvise (меньша размера файла нельзя, - * а за размером нет смысла). - * - в пишущей транзакции уменьшать размер файла можно только после - * проверки размера читаемых снимков, но в этом нет смысла, так как - * это будет сделано при фиксации транзакции. - * - * В сухом остатке, можно только установить dxb_mmap.current равным - * размеру файла, а это проще сделать без вызова dxb_resize() и усложения - * внутренней логики. - * - * В этой тактике есть недостаток: если пишущите транзакции не регулярны, - * и при завершении такой транзакции файл БД остаётся не-уменьшеным из-за - * читающих транзакций использующих предыдущие снимки. 
*/ -#if defined(_WIN32) || defined(_WIN64) - osal_srwlock_AcquireShared(&env->me_remap_guard); -#else - rc = osal_fastmutex_acquire(&env->me_remap_guard); -#endif - if (likely(rc == MDBX_SUCCESS)) { - eASSERT(env, env->me_dxb_mmap.limit >= env->me_dxb_mmap.current); - rc = osal_filesize(env->me_dxb_mmap.fd, &env->me_dxb_mmap.filesize); - if (likely(rc == MDBX_SUCCESS)) { - eASSERT(env, env->me_dxb_mmap.filesize >= required_bytes); - if (env->me_dxb_mmap.current > env->me_dxb_mmap.filesize) - env->me_dxb_mmap.current = - (env->me_dxb_mmap.limit < env->me_dxb_mmap.filesize) - ? env->me_dxb_mmap.limit - : (size_t)env->me_dxb_mmap.filesize; - } -#if defined(_WIN32) || defined(_WIN64) - osal_srwlock_ReleaseShared(&env->me_remap_guard); -#else - int err = osal_fastmutex_release(&env->me_remap_guard); - if (unlikely(err) && likely(rc == MDBX_SUCCESS)) - rc = err; -#endif - } - if (unlikely(rc != MDBX_SUCCESS)) - goto bailout; - } - eASSERT(env, - pgno2bytes(env, txn->mt_next_pgno) <= env->me_dxb_mmap.current); - eASSERT(env, env->me_dxb_mmap.limit >= env->me_dxb_mmap.current); - if (txn->mt_flags & MDBX_TXN_RDONLY) { -#if defined(_WIN32) || defined(_WIN64) - if (((used_bytes > env->me_dbgeo.lower && env->me_dbgeo.shrink) || - (mdbx_RunningUnderWine() && - /* under Wine acquisition of remap_guard is always required, - * since Wine don't support section extending, - * i.e. in both cases unmap+map are required. 
*/ - used_bytes < env->me_dbgeo.upper && env->me_dbgeo.grow)) && - /* avoid recursive use SRW */ (txn->mt_flags & - MDBX_NOSTICKYTHREADS) == 0) { - txn->mt_flags |= MDBX_SHRINK_ALLOWED; - osal_srwlock_AcquireShared(&env->me_remap_guard); - } -#endif /* Windows */ - } else { - tASSERT(txn, txn == env->me_txn0); - MDBX_cursor *const gc = ptr_disp(txn, sizeof(MDBX_txn)); - rc = cursor_init(gc, txn, FREE_DBI); - if (rc != MDBX_SUCCESS) - goto bailout; - } -#if defined(ENABLE_MEMCHECK) || defined(__SANITIZE_ADDRESS__) - txn_valgrind(env, txn); -#endif /* ENABLE_MEMCHECK || __SANITIZE_ADDRESS__ */ - return MDBX_SUCCESS; - } -bailout: - tASSERT(txn, rc != MDBX_SUCCESS); - txn_end(txn, TXN_END_SLOT | TXN_END_FAIL_BEGIN); return rc; } -static __always_inline int check_txn(const MDBX_txn *txn, int bad_bits) { - if (unlikely(!txn)) - return MDBX_EINVAL; +//------------------------------------------------------------------------------ - if (unlikely(txn->mt_signature != MDBX_MT_SIGNATURE)) - return MDBX_EBADSIGN; +/* Позволяет обновить или удалить существующую запись с получением + * в old_data предыдущего значения данных. При этом если new_data равен + * нулю, то выполняется удаление, иначе обновление/вставка. + * + * Текущее значение может находиться в уже измененной (грязной) странице. + * В этом случае страница будет перезаписана при обновлении, а само старое + * значение утрачено. Поэтому исходно в old_data должен быть передан + * дополнительный буфер для копирования старого значения. + * Если переданный буфер слишком мал, то функция вернет -1, установив + * old_data->iov_len в соответствующее значение. + * + * Для не-уникальных ключей также возможен второй сценарий использования, + * когда посредством old_data из записей с одинаковым ключом для + * удаления/обновления выбирается конкретная. Для выбора этого сценария + * во flags следует одновременно указать MDBX_CURRENT и MDBX_NOOVERWRITE. 
+ * Именно эта комбинация выбрана, так как она лишена смысла, и этим позволяет + * идентифицировать запрос такого сценария. + * + * Функция может быть замещена соответствующими операциями с курсорами + * после двух доработок (TODO): + * - внешняя аллокация курсоров, в том числе на стеке (без malloc). + * - получения dirty-статуса страницы по адресу (знать о MUTABLE/WRITEABLE). + */ - if (unlikely(txn->mt_flags & bad_bits)) - return MDBX_BAD_TXN; +int mdbx_replace_ex(MDBX_txn *txn, MDBX_dbi dbi, const MDBX_val *key, + MDBX_val *new_data, MDBX_val *old_data, + MDBX_put_flags_t flags, MDBX_preserve_func preserver, + void *preserver_context) { + int rc = check_txn_rw(txn, MDBX_TXN_BLOCKED); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; - tASSERT(txn, (txn->mt_flags & MDBX_TXN_FINISHED) || - (txn->mt_flags & MDBX_NOSTICKYTHREADS) == - (txn->mt_env->me_flags & MDBX_NOSTICKYTHREADS)); -#if MDBX_TXN_CHECKOWNER - STATIC_ASSERT((long)MDBX_NOSTICKYTHREADS > (long)MDBX_TXN_FINISHED); - if ((txn->mt_flags & (MDBX_NOSTICKYTHREADS | MDBX_TXN_FINISHED)) < - MDBX_TXN_FINISHED && - unlikely(txn->mt_owner != osal_thread_self())) - return txn->mt_owner ? 
MDBX_THREAD_MISMATCH : MDBX_BAD_TXN; -#endif /* MDBX_TXN_CHECKOWNER */ + if (unlikely(!key || !old_data || old_data == new_data)) + return MDBX_EINVAL; - if (bad_bits && unlikely(!txn->mt_env->me_map)) - return MDBX_EPERM; + if (unlikely(old_data->iov_base == nullptr && old_data->iov_len)) + return MDBX_EINVAL; - return MDBX_SUCCESS; -} + if (unlikely(new_data == nullptr && + (flags & (MDBX_CURRENT | MDBX_RESERVE)) != MDBX_CURRENT)) + return MDBX_EINVAL; -static __always_inline int check_txn_rw(const MDBX_txn *txn, int bad_bits) { - int err = check_txn(txn, bad_bits); - if (unlikely(err)) - return err; + if (unlikely(dbi <= FREE_DBI)) + return MDBX_BAD_DBI; - if (unlikely(txn->mt_flags & MDBX_TXN_RDONLY)) - return MDBX_EACCESS; + if (unlikely(flags & + ~(MDBX_NOOVERWRITE | MDBX_NODUPDATA | MDBX_ALLDUPS | + MDBX_RESERVE | MDBX_APPEND | MDBX_APPENDDUP | MDBX_CURRENT))) + return MDBX_EINVAL; - return MDBX_SUCCESS; -} + cursor_couple_t cx; + rc = cursor_init(&cx.outer, txn, dbi); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; + cx.outer.next = txn->cursors[dbi]; + txn->cursors[dbi] = &cx.outer; -int mdbx_txn_renew(MDBX_txn *txn) { - if (unlikely(!txn)) - return MDBX_EINVAL; + MDBX_val present_key = *key; + if (F_ISSET(flags, MDBX_CURRENT | MDBX_NOOVERWRITE)) { + /* в old_data значение для выбора конкретного дубликата */ + if (unlikely(!(txn->dbs[dbi].flags & MDBX_DUPSORT))) { + rc = MDBX_EINVAL; + goto bailout; + } - if (unlikely(txn->mt_signature != MDBX_MT_SIGNATURE)) - return MDBX_EBADSIGN; + /* убираем лишний бит, он был признаком запрошенного режима */ + flags -= MDBX_NOOVERWRITE; - if (unlikely((txn->mt_flags & MDBX_TXN_RDONLY) == 0)) - return MDBX_EINVAL; + rc = cursor_seek(&cx.outer, &present_key, old_data, MDBX_GET_BOTH).err; + if (rc != MDBX_SUCCESS) + goto bailout; + } else { + /* в old_data буфер для сохранения предыдущего значения */ + if (unlikely(new_data && old_data->iov_base == new_data->iov_base)) + return MDBX_EINVAL; + MDBX_val present_data; + rc 
= cursor_seek(&cx.outer, &present_key, &present_data, MDBX_SET_KEY).err; + if (unlikely(rc != MDBX_SUCCESS)) { + old_data->iov_base = nullptr; + old_data->iov_len = 0; + if (rc != MDBX_NOTFOUND || (flags & MDBX_CURRENT)) + goto bailout; + } else if (flags & MDBX_NOOVERWRITE) { + rc = MDBX_KEYEXIST; + *old_data = present_data; + goto bailout; + } else { + page_t *page = cx.outer.pg[cx.outer.top]; + if (txn->dbs[dbi].flags & MDBX_DUPSORT) { + if (flags & MDBX_CURRENT) { + /* disallow update/delete for multi-values */ + node_t *node = page_node(page, cx.outer.ki[cx.outer.top]); + if (node_flags(node) & N_DUPDATA) { + tASSERT(txn, inner_pointed(&cx.outer) && + cx.outer.subcur->nested_tree.items > 1); + if (cx.outer.subcur->nested_tree.items > 1) { + rc = MDBX_EMULTIVAL; + goto bailout; + } + } + /* В LMDB флажок MDBX_CURRENT здесь приведет + * к замене данных без учета MDBX_DUPSORT сортировки, + * но здесь это в любом случае допустимо, так как мы + * проверили что для ключа есть только одно значение. */ + } + } - int rc; - if (unlikely(txn->mt_owner != 0 || !(txn->mt_flags & MDBX_TXN_FINISHED))) { - rc = mdbx_txn_reset(txn); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; + if (is_modifable(txn, page)) { + if (new_data && cmp_lenfast(&present_data, new_data) == 0) { + /* если данные совпадают, то ничего делать не надо */ + *old_data = *new_data; + goto bailout; + } + rc = preserver ? preserver(preserver_context, old_data, + present_data.iov_base, present_data.iov_len) + : MDBX_SUCCESS; + if (unlikely(rc != MDBX_SUCCESS)) + goto bailout; + } else { + *old_data = present_data; + } + flags |= MDBX_CURRENT; + } } - rc = txn_renew(txn, MDBX_TXN_RDONLY); - if (rc == MDBX_SUCCESS) { - tASSERT(txn, txn->mt_owner == osal_thread_self()); - DEBUG("renew txn %" PRIaTXN "%c %p on env %p, root page %" PRIaPGNO - "/%" PRIaPGNO, - txn->mt_txnid, (txn->mt_flags & MDBX_TXN_RDONLY) ? 
'r' : 'w', - (void *)txn, (void *)txn->mt_env, txn->mt_dbs[MAIN_DBI].md_root, - txn->mt_dbs[FREE_DBI].md_root); - } + if (likely(new_data)) + rc = cursor_put_checklen(&cx.outer, key, new_data, flags); + else + rc = cursor_del(&cx.outer, flags & MDBX_ALLDUPS); + +bailout: + txn->cursors[dbi] = cx.outer.next; return rc; } -int mdbx_txn_set_userctx(MDBX_txn *txn, void *ctx) { - int rc = check_txn(txn, MDBX_TXN_FINISHED); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; - - txn->mt_userctx = ctx; +static int default_value_preserver(void *context, MDBX_val *target, + const void *src, size_t bytes) { + (void)context; + if (unlikely(target->iov_len < bytes)) { + target->iov_base = nullptr; + target->iov_len = bytes; + return MDBX_RESULT_TRUE; + } + memcpy(target->iov_base, src, target->iov_len = bytes); return MDBX_SUCCESS; } -void *mdbx_txn_get_userctx(const MDBX_txn *txn) { - return check_txn(txn, MDBX_TXN_FINISHED) ? nullptr : txn->mt_userctx; +int mdbx_replace(MDBX_txn *txn, MDBX_dbi dbi, const MDBX_val *key, + MDBX_val *new_data, MDBX_val *old_data, + MDBX_put_flags_t flags) { + return mdbx_replace_ex(txn, dbi, key, new_data, old_data, flags, + default_value_preserver, nullptr); } +/// \copyright SPDX-License-Identifier: Apache-2.0 +/// \author Леонид Юрьев aka Leonid Yuriev \date 2015-2024 -int mdbx_txn_begin_ex(MDBX_env *env, MDBX_txn *parent, MDBX_txn_flags_t flags, - MDBX_txn **ret, void *context) { - if (unlikely(!ret)) - return MDBX_EINVAL; - *ret = NULL; - if (unlikely((flags & ~MDBX_TXN_RW_BEGIN_FLAGS) && - (flags & ~MDBX_TXN_RO_BEGIN_FLAGS))) - return MDBX_EINVAL; +__cold static tree_t *audit_db_dig(const MDBX_txn *txn, const size_t dbi, + tree_t *fallback) { + const MDBX_txn *dig = txn; + do { + tASSERT(txn, txn->n_dbi == dig->n_dbi); + const uint8_t state = dbi_state(dig, dbi); + if (state & DBI_LINDO) + switch (state & (DBI_VALID | DBI_STALE | DBI_OLDEN)) { + case DBI_VALID: + case DBI_OLDEN: + return dig->dbs + dbi; + case 0: + return nullptr; + case 
DBI_VALID | DBI_STALE: + case DBI_OLDEN | DBI_STALE: + break; + default: + tASSERT(txn, !!"unexpected dig->dbi_state[dbi]"); + } + dig = dig->parent; + } while (dig); + return fallback; +} - int rc = check_env(env, true); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; +static size_t audit_db_used(const tree_t *db) { + return db ? (size_t)db->branch_pages + (size_t)db->leaf_pages + + (size_t)db->large_pages + : 0; +} - if (unlikely(env->me_flags & MDBX_RDONLY & - ~flags)) /* write txn in RDONLY env */ - return MDBX_EACCESS; +__cold static int audit_ex_locked(MDBX_txn *txn, size_t retired_stored, + bool dont_filter_gc) { + const MDBX_env *const env = txn->env; + size_t pending = 0; + if ((txn->flags & MDBX_TXN_RDONLY) == 0) + pending = txn->tw.loose_count + MDBX_PNL_GETSIZE(txn->tw.relist) + + (MDBX_PNL_GETSIZE(txn->tw.retired_pages) - retired_stored); - MDBX_txn *txn = nullptr; - if (parent) { - /* Nested transactions: Max 1 child, write txns only, no writemap */ - rc = check_txn_rw(parent, - MDBX_TXN_RDONLY | MDBX_WRITEMAP | MDBX_TXN_BLOCKED); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; + cursor_couple_t cx; + int rc = cursor_init(&cx.outer, txn, FREE_DBI); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; - if (env->me_options.spill_parent4child_denominator) { - /* Spill dirty-pages of parent to provide dirtyroom for child txn */ - rc = txn_spill(parent, nullptr, - parent->tw.dirtylist->length / - env->me_options.spill_parent4child_denominator); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; + size_t gc = 0; + MDBX_val key, data; + rc = outer_first(&cx.outer, &key, &data); + while (rc == MDBX_SUCCESS) { + if (!dont_filter_gc) { + if (unlikely(key.iov_len != sizeof(txnid_t))) { + ERROR("%s/%d: %s %u", "MDBX_CORRUPTED", MDBX_CORRUPTED, + "invalid GC-key size", (unsigned)key.iov_len); + return MDBX_CORRUPTED; + } + txnid_t id = unaligned_peek_u64(4, key.iov_base); + if (txn->tw.gc.reclaimed) { + for (size_t i = 1; i <= MDBX_PNL_GETSIZE(txn->tw.gc.reclaimed); 
++i) + if (id == txn->tw.gc.reclaimed[i]) + goto skip; + } else if (id <= txn->tw.gc.last_reclaimed) + goto skip; } - tASSERT(parent, audit_ex(parent, 0, false) == 0); - - flags |= parent->mt_flags & (MDBX_TXN_RW_BEGIN_FLAGS | MDBX_TXN_SPILLS | - MDBX_NOSTICKYTHREADS | MDBX_WRITEMAP); - } else if (flags & MDBX_TXN_RDONLY) { - if ((env->me_flags & MDBX_NOSTICKYTHREADS) == 0 && env->me_txn && - unlikely(env->me_txn0->mt_owner == osal_thread_self()) && - (mdbx_static.flags & MDBX_DBG_LEGACY_OVERLAP) == 0) - return MDBX_TXN_OVERLAPPING; - } else { - /* Reuse preallocated write txn. However, do not touch it until - * txn_renew() succeeds, since it currently may be active. */ - txn = env->me_txn0; - goto renew; + gc += *(pgno_t *)data.iov_base; + skip: + rc = outer_next(&cx.outer, &key, &data, MDBX_NEXT); } + tASSERT(txn, rc == MDBX_NOTFOUND); - const intptr_t bitmap_bytes = + const size_t done_bitmap_size = (txn->n_dbi + CHAR_BIT - 1) / CHAR_BIT; + uint8_t *const done_bitmap = alloca(done_bitmap_size); + memset(done_bitmap, 0, done_bitmap_size); + if (txn->parent) { + tASSERT(txn, txn->n_dbi == txn->parent->n_dbi && + txn->n_dbi == txn->env->txn->n_dbi); #if MDBX_ENABLE_DBI_SPARSE - ceil_powerof2(env->me_maxdbs, CHAR_BIT * sizeof(txn->mt_dbi_sparse[0])) / - CHAR_BIT; -#else - 0; + tASSERT(txn, txn->dbi_sparse == txn->parent->dbi_sparse && + txn->dbi_sparse == txn->env->txn->dbi_sparse); #endif /* MDBX_ENABLE_DBI_SPARSE */ - STATIC_ASSERT(sizeof(txn->tw) > sizeof(txn->to)); - const size_t base = (flags & MDBX_TXN_RDONLY) - ? sizeof(MDBX_txn) - sizeof(txn->tw) + sizeof(txn->to) - : sizeof(MDBX_txn); - const size_t size = - base + - ((flags & MDBX_TXN_RDONLY) - ? 
(size_t)bitmap_bytes + env->me_maxdbs * sizeof(txn->mt_dbi_seqs[0]) - : 0) + - env->me_maxdbs * (sizeof(txn->mt_dbs[0]) + sizeof(txn->mt_cursors[0]) + - sizeof(txn->mt_dbi_state[0])); - txn = osal_malloc(size); - if (unlikely(txn == nullptr)) { - DEBUG("calloc: %s", "failed"); - return MDBX_ENOMEM; } -#if MDBX_DEBUG - memset(txn, 0xCD, size); - VALGRIND_MAKE_MEM_UNDEFINED(txn, size); -#endif /* MDBX_DEBUG */ - MDBX_ANALYSIS_ASSUME(size > base); - memset(txn, 0, - (MDBX_GOOFY_MSVC_STATIC_ANALYZER && base > size) ? size : base); - txn->mt_dbs = ptr_disp(txn, base); - txn->mt_cursors = - ptr_disp(txn->mt_dbs, env->me_maxdbs * sizeof(txn->mt_dbs[0])); -#if MDBX_DEBUG - txn->mt_cursors[FREE_DBI] = nullptr; /* avoid SIGSEGV in an assertion later */ -#endif - txn->mt_dbi_state = - ptr_disp(txn, size - env->me_maxdbs * sizeof(txn->mt_dbi_state[0])); - txn->mt_flags = flags; - txn->mt_env = env; - - if (parent) { - tASSERT(parent, dirtylist_check(parent)); -#if MDBX_ENABLE_DBI_SPARSE - txn->mt_dbi_sparse = parent->mt_dbi_sparse; -#endif /* MDBX_ENABLE_DBI_SPARSE */ - txn->mt_dbi_seqs = parent->mt_dbi_seqs; - txn->mt_geo = parent->mt_geo; - rc = dpl_alloc(txn); - if (likely(rc == MDBX_SUCCESS)) { - const size_t len = - MDBX_PNL_GETSIZE(parent->tw.relist) + parent->tw.loose_count; - txn->tw.relist = - pnl_alloc((len > MDBX_PNL_INITIAL) ? 
len : MDBX_PNL_INITIAL); - if (unlikely(!txn->tw.relist)) - rc = MDBX_ENOMEM; - } - if (unlikely(rc != MDBX_SUCCESS)) { - nested_failed: - pnl_free(txn->tw.relist); - dpl_free(txn); - osal_free(txn); - return rc; - } - - /* Move loose pages to reclaimed list */ - if (parent->tw.loose_count) { - do { - MDBX_page *lp = parent->tw.loose_pages; - tASSERT(parent, lp->mp_flags == P_LOOSE); - rc = pnl_insert_range(&parent->tw.relist, lp->mp_pgno, 1); - if (unlikely(rc != MDBX_SUCCESS)) - goto nested_failed; - MDBX_ASAN_UNPOISON_MEMORY_REGION(&mp_next(lp), sizeof(MDBX_page *)); - VALGRIND_MAKE_MEM_DEFINED(&mp_next(lp), sizeof(MDBX_page *)); - parent->tw.loose_pages = mp_next(lp); - /* Remove from dirty list */ - page_wash(parent, dpl_exist(parent, lp->mp_pgno), lp, 1); - } while (parent->tw.loose_pages); - parent->tw.loose_count = 0; -#if MDBX_ENABLE_REFUND - parent->tw.loose_refund_wl = 0; -#endif /* MDBX_ENABLE_REFUND */ - tASSERT(parent, dirtylist_check(parent)); - } - txn->tw.dirtyroom = parent->tw.dirtyroom; - txn->tw.dirtylru = parent->tw.dirtylru; - - dpl_sort(parent); - if (parent->tw.spilled.list) - spill_purge(parent); - - tASSERT(txn, MDBX_PNL_ALLOCLEN(txn->tw.relist) >= - MDBX_PNL_GETSIZE(parent->tw.relist)); - memcpy(txn->tw.relist, parent->tw.relist, - MDBX_PNL_SIZEOF(parent->tw.relist)); - eASSERT(env, pnl_check_allocated( - txn->tw.relist, - (txn->mt_next_pgno /* LY: intentional assignment here, - only for assertion */ - = parent->mt_next_pgno) - - MDBX_ENABLE_REFUND)); - txn->tw.gc_time_acc = parent->tw.gc_time_acc; - txn->tw.last_reclaimed = parent->tw.last_reclaimed; - if (parent->tw.lifo_reclaimed) { - txn->tw.lifo_reclaimed = parent->tw.lifo_reclaimed; - parent->tw.lifo_reclaimed = - (void *)(intptr_t)MDBX_PNL_GETSIZE(parent->tw.lifo_reclaimed); - } + size_t used = NUM_METAS + + audit_db_used(audit_db_dig(txn, FREE_DBI, nullptr)) + + audit_db_used(audit_db_dig(txn, MAIN_DBI, nullptr)); + rc = cursor_init(&cx.outer, txn, MAIN_DBI); + if (unlikely(rc != 
MDBX_SUCCESS)) + return rc; - txn->tw.retired_pages = parent->tw.retired_pages; - parent->tw.retired_pages = - (void *)(intptr_t)MDBX_PNL_GETSIZE(parent->tw.retired_pages); + rc = tree_search(&cx.outer, nullptr, Z_FIRST); + while (rc == MDBX_SUCCESS) { + page_t *mp = cx.outer.pg[cx.outer.top]; + for (size_t k = 0; k < page_numkeys(mp); k++) { + node_t *node = page_node(mp, k); + if (node_flags(node) != N_SUBDATA) + continue; + if (unlikely(node_ds(node) != sizeof(tree_t))) { + ERROR("%s/%d: %s %u", "MDBX_CORRUPTED", MDBX_CORRUPTED, + "invalid dupsort sub-tree node size", (unsigned)node_ds(node)); + return MDBX_CORRUPTED; + } - txn->mt_txnid = parent->mt_txnid; - txn->mt_front = parent->mt_front + 1; -#if MDBX_ENABLE_REFUND - txn->tw.loose_refund_wl = 0; -#endif /* MDBX_ENABLE_REFUND */ - txn->mt_canary = parent->mt_canary; - parent->mt_flags |= MDBX_TXN_HAS_CHILD; - parent->mt_child = txn; - txn->mt_parent = parent; - txn->mt_owner = parent->mt_owner; - txn->tw.troika = parent->tw.troika; + tree_t reside; + const tree_t *db = memcpy(&reside, node_data(node), sizeof(reside)); + const MDBX_val name = {node_key(node), node_ks(node)}; + for (size_t dbi = CORE_DBS; dbi < env->n_dbi; ++dbi) { + if (dbi >= txn->n_dbi || !(env->dbs_flags[dbi] & DB_VALID)) + continue; + if (env->kvs[MAIN_DBI].clc.k.cmp(&name, &env->kvs[dbi].name)) + continue; - txn->mt_cursors[FREE_DBI] = nullptr; - txn->mt_cursors[MAIN_DBI] = nullptr; - txn->mt_dbi_state[FREE_DBI] = - parent->mt_dbi_state[FREE_DBI] & ~(DBI_FRESH | DBI_CREAT | DBI_DIRTY); - txn->mt_dbi_state[MAIN_DBI] = - parent->mt_dbi_state[MAIN_DBI] & ~(DBI_FRESH | DBI_CREAT | DBI_DIRTY); - memset(txn->mt_dbi_state + CORE_DBS, 0, - (txn->mt_numdbs = parent->mt_numdbs) - CORE_DBS); - memcpy(txn->mt_dbs, parent->mt_dbs, sizeof(txn->mt_dbs[0]) * CORE_DBS); - - tASSERT(parent, - parent->tw.dirtyroom + parent->tw.dirtylist->length == - (parent->mt_parent ? 
parent->mt_parent->tw.dirtyroom - : parent->mt_env->me_options.dp_limit)); - tASSERT(txn, txn->tw.dirtyroom + txn->tw.dirtylist->length == - (txn->mt_parent ? txn->mt_parent->tw.dirtyroom - : txn->mt_env->me_options.dp_limit)); - env->me_txn = txn; - tASSERT(parent, parent->mt_cursors[FREE_DBI] == nullptr); - rc = parent->mt_cursors[MAIN_DBI] - ? cursor_shadow(parent->mt_cursors[MAIN_DBI], txn, MAIN_DBI) - : MDBX_SUCCESS; - if (AUDIT_ENABLED() && ASSERT_ENABLED()) { - txn->mt_signature = MDBX_MT_SIGNATURE; - tASSERT(txn, audit_ex(txn, 0, false) == 0); + done_bitmap[dbi / CHAR_BIT] |= 1 << dbi % CHAR_BIT; + db = audit_db_dig(txn, dbi, &reside); + break; + } + used += audit_db_used(db); } - if (unlikely(rc != MDBX_SUCCESS)) - txn_end(txn, TXN_END_FAIL_BEGINCHILD); - } else { /* MDBX_TXN_RDONLY */ - txn->mt_dbi_seqs = - ptr_disp(txn->mt_cursors, env->me_maxdbs * sizeof(txn->mt_cursors[0])); -#if MDBX_ENABLE_DBI_SPARSE - txn->mt_dbi_sparse = ptr_disp(txn->mt_dbi_state, -bitmap_bytes); -#endif /* MDBX_ENABLE_DBI_SPARSE */ - renew: - rc = txn_renew(txn, flags); + rc = cursor_sibling_right(&cx.outer); } + tASSERT(txn, rc == MDBX_NOTFOUND); - if (unlikely(rc != MDBX_SUCCESS)) { - if (txn != env->me_txn0) - osal_free(txn); - } else { - if (flags & (MDBX_TXN_RDONLY_PREPARE - MDBX_TXN_RDONLY)) - eASSERT(env, txn->mt_flags == (MDBX_TXN_RDONLY | MDBX_TXN_FINISHED)); - else if (flags & MDBX_TXN_RDONLY) - eASSERT(env, (txn->mt_flags & - ~(MDBX_NOSTICKYTHREADS | MDBX_TXN_RDONLY | MDBX_WRITEMAP | - /* Win32: SRWL flag */ MDBX_SHRINK_ALLOWED)) == 0); - else { - eASSERT(env, - (txn->mt_flags & - ~(MDBX_NOSTICKYTHREADS | MDBX_WRITEMAP | MDBX_SHRINK_ALLOWED | - MDBX_NOMETASYNC | MDBX_SAFE_NOSYNC | MDBX_TXN_SPILLS)) == 0); - assert(!txn->tw.spilled.list && !txn->tw.spilled.least_removed); - } - txn->mt_signature = MDBX_MT_SIGNATURE; - txn->mt_userctx = context; - *ret = txn; - DEBUG("begin txn %" PRIaTXN "%c %p on env %p, root page %" PRIaPGNO - "/%" PRIaPGNO, - txn->mt_txnid, (flags & 
MDBX_TXN_RDONLY) ? 'r' : 'w', (void *)txn, - (void *)env, txn->mt_dbs[MAIN_DBI].md_root, - txn->mt_dbs[FREE_DBI].md_root); + for (size_t dbi = CORE_DBS; dbi < txn->n_dbi; ++dbi) { + if (done_bitmap[dbi / CHAR_BIT] & (1 << dbi % CHAR_BIT)) + continue; + const tree_t *db = audit_db_dig(txn, dbi, nullptr); + if (db) + used += audit_db_used(db); + else if (dbi_state(txn, dbi)) + WARNING("audit %s@%" PRIaTXN + ": unable account dbi %zd / \"%*s\", state 0x%02x", + txn->parent ? "nested-" : "", txn->txnid, dbi, + (int)env->kvs[dbi].name.iov_len, + (const char *)env->kvs[dbi].name.iov_base, dbi_state(txn, dbi)); } - return rc; -} - -int mdbx_txn_info(const MDBX_txn *txn, MDBX_txn_info *info, bool scan_rlt) { - int rc = check_txn(txn, MDBX_TXN_FINISHED); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; + if (pending + gc + used == txn->geo.first_unallocated) + return MDBX_SUCCESS; - if (unlikely(!info)) - return MDBX_EINVAL; + if ((txn->flags & MDBX_TXN_RDONLY) == 0) + ERROR("audit @%" PRIaTXN ": %zu(pending) = %zu(loose) + " + "%zu(reclaimed) + %zu(retired-pending) - %zu(retired-stored)", + txn->txnid, pending, txn->tw.loose_count, + MDBX_PNL_GETSIZE(txn->tw.relist), + txn->tw.retired_pages ? 
MDBX_PNL_GETSIZE(txn->tw.retired_pages) : 0, + retired_stored); + ERROR("audit @%" PRIaTXN ": %zu(pending) + %zu" + "(gc) + %zu(count) = %zu(total) <> %zu" + "(allocated)", + txn->txnid, pending, gc, used, pending + gc + used, + (size_t)txn->geo.first_unallocated); + return MDBX_PROBLEM; +} - MDBX_env *const env = txn->mt_env; -#if MDBX_ENV_CHECKPID - if (unlikely(env->me_pid != osal_getpid())) { - env->me_flags |= MDBX_FATAL_ERROR; - return MDBX_PANIC; +__cold int audit_ex(MDBX_txn *txn, size_t retired_stored, bool dont_filter_gc) { + MDBX_env *const env = txn->env; + int rc = osal_fastmutex_acquire(&env->dbi_lock); + if (likely(rc == MDBX_SUCCESS)) { + rc = audit_ex_locked(txn, retired_stored, dont_filter_gc); + ENSURE(txn->env, osal_fastmutex_release(&env->dbi_lock) == MDBX_SUCCESS); } -#endif /* MDBX_ENV_CHECKPID */ + return rc; +} +/// \copyright SPDX-License-Identifier: Apache-2.0 +/// \author Леонид Юрьев aka Leonid Yuriev \date 2015-2024 - info->txn_id = txn->mt_txnid; - info->txn_space_used = pgno2bytes(env, txn->mt_geo.next); - if (txn->mt_flags & MDBX_TXN_RDONLY) { - meta_ptr_t head; - uint64_t head_retired; - meta_troika_t troika = meta_tap(env); - do { - /* fetch info from volatile head */ - head = meta_recent(env, &troika); - head_retired = - unaligned_peek_u64_volatile(4, head.ptr_v->mm_pages_retired); - info->txn_space_limit_soft = pgno2bytes(env, head.ptr_v->mm_geo.now); - info->txn_space_limit_hard = pgno2bytes(env, head.ptr_v->mm_geo.upper); - info->txn_space_leftover = - pgno2bytes(env, head.ptr_v->mm_geo.now - head.ptr_v->mm_geo.next); - } while (unlikely(meta_should_retry(env, &troika))); +typedef struct MDBX_chk_internal { + MDBX_chk_context_t *usr; + const struct MDBX_chk_callbacks *cb; + uint64_t monotime_timeout; - info->txn_reader_lag = head.txnid - info->txn_id; - info->txn_space_dirty = info->txn_space_retired = 0; - uint64_t reader_snapshot_pages_retired; - if (txn->to.reader && - head_retired > - (reader_snapshot_pages_retired = 
atomic_load64( - &txn->to.reader->mr_snapshot_pages_retired, mo_Relaxed))) { - info->txn_space_dirty = info->txn_space_retired = pgno2bytes( - env, (pgno_t)(head_retired - reader_snapshot_pages_retired)); + size_t *problem_counter; + uint8_t flags; + bool got_break; + bool write_locked; + uint8_t scope_depth; - size_t retired_next_reader = 0; - MDBX_lockinfo *const lck = env->me_lck_mmap.lck; - if (scan_rlt && info->txn_reader_lag > 1 && lck) { - /* find next more recent reader */ - txnid_t next_reader = head.txnid; - const size_t snap_nreaders = - atomic_load32(&lck->mti_numreaders, mo_AcquireRelease); - for (size_t i = 0; i < snap_nreaders; ++i) { - retry: - if (atomic_load32(&lck->mti_readers[i].mr_pid, mo_AcquireRelease)) { - jitter4testing(true); - const txnid_t snap_txnid = - safe64_read(&lck->mti_readers[i].mr_txnid); - const uint64_t snap_retired = - atomic_load64(&lck->mti_readers[i].mr_snapshot_pages_retired, - mo_AcquireRelease); - if (unlikely(snap_retired != - atomic_load64( - &lck->mti_readers[i].mr_snapshot_pages_retired, - mo_Relaxed)) || - snap_txnid != safe64_read(&lck->mti_readers[i].mr_txnid)) - goto retry; - if (snap_txnid <= txn->mt_txnid) { - retired_next_reader = 0; - break; - } - if (snap_txnid < next_reader) { - next_reader = snap_txnid; - retired_next_reader = pgno2bytes( - env, (pgno_t)(snap_retired - - atomic_load64( - &txn->to.reader->mr_snapshot_pages_retired, - mo_Relaxed))); - } - } - } - } - info->txn_space_dirty = retired_next_reader; - } - } else { - info->txn_space_limit_soft = pgno2bytes(env, txn->mt_geo.now); - info->txn_space_limit_hard = pgno2bytes(env, txn->mt_geo.upper); - info->txn_space_retired = pgno2bytes( - env, txn->mt_child ? (size_t)txn->tw.retired_pages - : MDBX_PNL_GETSIZE(txn->tw.retired_pages)); - info->txn_space_leftover = pgno2bytes(env, txn->tw.dirtyroom); - info->txn_space_dirty = pgno2bytes( - env, txn->tw.dirtylist ? 
txn->tw.dirtylist->pages_including_loose - : (txn->tw.writemap_dirty_npages + - txn->tw.writemap_spilled_npages)); - info->txn_reader_lag = INT64_MAX; - MDBX_lockinfo *const lck = env->me_lck_mmap.lck; - if (scan_rlt && lck) { - txnid_t oldest_snapshot = txn->mt_txnid; - const size_t snap_nreaders = - atomic_load32(&lck->mti_numreaders, mo_AcquireRelease); - if (snap_nreaders) { - oldest_snapshot = txn_oldest_reader(txn); - if (oldest_snapshot == txn->mt_txnid - 1) { - /* check if there is at least one reader */ - bool exists = false; - for (size_t i = 0; i < snap_nreaders; ++i) { - if (atomic_load32(&lck->mti_readers[i].mr_pid, mo_Relaxed) && - txn->mt_txnid > safe64_read(&lck->mti_readers[i].mr_txnid)) { - exists = true; - break; - } - } - oldest_snapshot += !exists; - } - } - info->txn_reader_lag = txn->mt_txnid - oldest_snapshot; - } - } + MDBX_chk_subdb_t subdb_gc, subdb_main; + int16_t *pagemap; + MDBX_chk_subdb_t *last_lookup; + const void *last_nested; + MDBX_chk_scope_t scope_stack[12]; + MDBX_chk_subdb_t *subdb[MDBX_MAX_DBI + CORE_DBS]; - return MDBX_SUCCESS; -} + MDBX_envinfo envinfo; + troika_t troika; + MDBX_val v2a_buf; +} MDBX_chk_internal_t; -MDBX_env *mdbx_txn_env(const MDBX_txn *txn) { - if (unlikely(!txn || txn->mt_signature != MDBX_MT_SIGNATURE || - txn->mt_env->me_signature.weak != MDBX_ME_SIGNATURE)) - return NULL; - return txn->mt_env; +__cold static int chk_check_break(MDBX_chk_scope_t *const scope) { + MDBX_chk_internal_t *const chk = scope->internal; + return (chk->got_break || (chk->cb->check_break && + (chk->got_break = chk->cb->check_break(chk->usr)))) + ? 
MDBX_RESULT_TRUE + : MDBX_RESULT_FALSE; } -uint64_t mdbx_txn_id(const MDBX_txn *txn) { - if (unlikely(!txn || txn->mt_signature != MDBX_MT_SIGNATURE)) - return 0; - return txn->mt_txnid; +__cold static void chk_line_end(MDBX_chk_line_t *line) { + if (likely(line)) { + MDBX_chk_internal_t *chk = line->ctx->internal; + assert(line->begin <= line->end && line->begin <= line->out && + line->out <= line->end); + if (likely(chk->cb->print_done)) + chk->cb->print_done(line); + } } -int mdbx_txn_flags(const MDBX_txn *txn) { - STATIC_ASSERT( - (MDBX_TXN_INVALID & - (MDBX_TXN_FINISHED | MDBX_TXN_ERROR | MDBX_TXN_DIRTY | MDBX_TXN_SPILLS | - MDBX_TXN_HAS_CHILD | MDBX_TXN_DRAINED_GC | MDBX_SHRINK_ALLOWED | - MDBX_TXN_RW_BEGIN_FLAGS | MDBX_TXN_RO_BEGIN_FLAGS)) == 0); - if (unlikely(!txn || txn->mt_signature != MDBX_MT_SIGNATURE)) - return MDBX_TXN_INVALID; - assert(0 == (int)(txn->mt_flags & MDBX_TXN_INVALID)); - return txn->mt_flags; +__cold __must_check_result static MDBX_chk_line_t * +chk_line_begin(MDBX_chk_scope_t *const scope, enum MDBX_chk_severity severity) { + MDBX_chk_internal_t *const chk = scope->internal; + if (severity < MDBX_chk_warning) + mdbx_env_chk_encount_problem(chk->usr); + MDBX_chk_line_t *line = nullptr; + if (likely(chk->cb->print_begin)) { + line = chk->cb->print_begin(chk->usr, severity); + if (likely(line)) { + assert(line->ctx == nullptr || (line->ctx == chk->usr && line->empty)); + assert(line->begin <= line->end && line->begin <= line->out && + line->out <= line->end); + line->ctx = chk->usr; + } + } + return line; } -/* Filter-out pgno list from transaction's dirty-page list */ -static void dpl_sift(MDBX_txn *const txn, MDBX_PNL pl, const bool spilled) { - tASSERT(txn, (txn->mt_flags & MDBX_TXN_RDONLY) == 0); - tASSERT(txn, (txn->mt_flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC); - if (MDBX_PNL_GETSIZE(pl) && txn->tw.dirtylist->length) { - tASSERT(txn, pnl_check_allocated(pl, (size_t)txn->mt_next_pgno << spilled)); - MDBX_dpl *dl = 
dpl_sort(txn); - - /* Scanning in ascend order */ - const intptr_t step = MDBX_PNL_ASCENDING ? 1 : -1; - const intptr_t begin = MDBX_PNL_ASCENDING ? 1 : MDBX_PNL_GETSIZE(pl); - const intptr_t end = MDBX_PNL_ASCENDING ? MDBX_PNL_GETSIZE(pl) + 1 : 0; - tASSERT(txn, pl[begin] <= pl[end - step]); - - size_t w, r = dpl_search(txn, pl[begin] >> spilled); - tASSERT(txn, dl->sorted == dl->length); - for (intptr_t i = begin; r <= dl->length;) { /* scan loop */ - assert(i != end); - tASSERT(txn, !spilled || (pl[i] & 1) == 0); - pgno_t pl_pgno = pl[i] >> spilled; - pgno_t dp_pgno = dl->items[r].pgno; - if (likely(dp_pgno != pl_pgno)) { - const bool cmp = dp_pgno < pl_pgno; - r += cmp; - i += cmp ? 0 : step; - if (likely(i != end)) - continue; - return; - } +__cold static MDBX_chk_line_t *chk_line_feed(MDBX_chk_line_t *line) { + if (likely(line)) { + MDBX_chk_internal_t *chk = line->ctx->internal; + enum MDBX_chk_severity severity = line->severity; + chk_line_end(line); + line = chk_line_begin(chk->usr->scope, severity); + } + return line; +} - /* update loop */ - unsigned npages; - w = r; - remove_dl: - npages = dpl_npages(dl, r); - dl->pages_including_loose -= npages; - if (!MDBX_AVOID_MSYNC || !(txn->mt_flags & MDBX_WRITEMAP)) - dpage_free(txn->mt_env, dl->items[r].ptr, npages); - ++r; - next_i: - i += step; - if (unlikely(i == end)) { - while (r <= dl->length) - dl->items[w++] = dl->items[r++]; - } else { - while (r <= dl->length) { - assert(i != end); - tASSERT(txn, !spilled || (pl[i] & 1) == 0); - pl_pgno = pl[i] >> spilled; - dp_pgno = dl->items[r].pgno; - if (dp_pgno < pl_pgno) - dl->items[w++] = dl->items[r++]; - else if (dp_pgno > pl_pgno) - goto next_i; - else - goto remove_dl; - } - } - dl->sorted = dpl_setlen(dl, w - 1); - txn->tw.dirtyroom += r - w; - tASSERT(txn, txn->tw.dirtyroom + txn->tw.dirtylist->length == - (txn->mt_parent ? 
txn->mt_parent->tw.dirtyroom - : txn->mt_env->me_options.dp_limit)); - return; +__cold static MDBX_chk_line_t *chk_flush(MDBX_chk_line_t *line) { + if (likely(line)) { + MDBX_chk_internal_t *chk = line->ctx->internal; + assert(line->begin <= line->end && line->begin <= line->out && + line->out <= line->end); + if (likely(chk->cb->print_flush)) { + chk->cb->print_flush(line); + assert(line->begin <= line->end && line->begin <= line->out && + line->out <= line->end); + line->out = line->begin; } } + return line; } -/* End a transaction, except successful commit of a nested transaction. - * May be called twice for readonly txns: First reset it, then abort. - * [in] txn the transaction handle to end - * [in] mode why and how to end the transaction */ -static int txn_end(MDBX_txn *txn, const unsigned mode) { - MDBX_env *env = txn->mt_env; - static const char *const names[] = TXN_END_NAMES; - - DEBUG("%s txn %" PRIaTXN "%c %p on env %p, root page %" PRIaPGNO - "/%" PRIaPGNO, - names[mode & TXN_END_OPMASK], txn->mt_txnid, - (txn->mt_flags & MDBX_TXN_RDONLY) ? 
'r' : 'w', (void *)txn, (void *)env, - txn->mt_dbs[MAIN_DBI].md_root, txn->mt_dbs[FREE_DBI].md_root); - - if (!(mode & TXN_END_EOTDONE)) /* !(already closed cursors) */ - cursors_eot(txn, false); - - int rc = MDBX_SUCCESS; - if (txn->mt_flags & MDBX_TXN_RDONLY) { - if (txn->to.reader) { - MDBX_reader *slot = txn->to.reader; - eASSERT(env, slot->mr_pid.weak == env->me_pid); - if (likely(!(txn->mt_flags & MDBX_TXN_FINISHED))) { - ENSURE(env, txn->mt_txnid >= - /* paranoia is appropriate here */ env->me_lck - ->mti_oldest_reader.weak); - eASSERT(env, - txn->mt_txnid == slot->mr_txnid.weak && - slot->mr_txnid.weak >= env->me_lck->mti_oldest_reader.weak); -#if defined(ENABLE_MEMCHECK) || defined(__SANITIZE_ADDRESS__) - txn_valgrind(env, nullptr); -#endif /* ENABLE_MEMCHECK || __SANITIZE_ADDRESS__ */ - atomic_store32(&slot->mr_snapshot_pages_used, 0, mo_Relaxed); - safe64_reset(&slot->mr_txnid, false); - atomic_store32(&env->me_lck->mti_readers_refresh_flag, true, - mo_Relaxed); - } else { - eASSERT(env, slot->mr_pid.weak == env->me_pid); - eASSERT(env, slot->mr_txnid.weak >= SAFE64_INVALID_THRESHOLD); - } - if (mode & TXN_END_SLOT) { - if ((env->me_flags & MDBX_ENV_TXKEY) == 0) - atomic_store32(&slot->mr_pid, 0, mo_Relaxed); - txn->to.reader = NULL; - } +__cold static size_t chk_print_wanna(MDBX_chk_line_t *line, size_t need) { + if (likely(line && need)) { + size_t have = line->end - line->out; + assert(line->begin <= line->end && line->begin <= line->out && + line->out <= line->end); + if (need > have) { + line = chk_flush(line); + have = line->end - line->out; } -#if defined(_WIN32) || defined(_WIN64) - if (txn->mt_flags & MDBX_SHRINK_ALLOWED) - osal_srwlock_ReleaseShared(&env->me_remap_guard); -#endif - txn->mt_numdbs = 0; /* prevent further DBI activity */ - txn->mt_flags = MDBX_TXN_RDONLY | MDBX_TXN_FINISHED; - txn->mt_owner = 0; - } else if (!(txn->mt_flags & MDBX_TXN_FINISHED)) { - ENSURE(env, txn->mt_txnid >= - /* paranoia is appropriate here */ env->me_lck - 
->mti_oldest_reader.weak); -#if defined(ENABLE_MEMCHECK) || defined(__SANITIZE_ADDRESS__) - if (txn == env->me_txn0) - txn_valgrind(env, nullptr); -#endif /* ENABLE_MEMCHECK || __SANITIZE_ADDRESS__ */ - - txn->mt_flags = MDBX_TXN_FINISHED; - env->me_txn = txn->mt_parent; - pnl_free(txn->tw.spilled.list); - txn->tw.spilled.list = nullptr; - if (txn == env->me_txn0) { - eASSERT(env, txn->mt_parent == NULL); - /* Export or close DBI handles created in this txn */ - rc = dbi_update(txn, mode & TXN_END_UPDATE); - pnl_shrink(&txn->tw.retired_pages); - pnl_shrink(&txn->tw.relist); - if (!(env->me_flags & MDBX_WRITEMAP)) - dlist_free(txn); - /* The writer mutex was locked in mdbx_txn_begin. */ - osal_txn_unlock(env); - } else { - eASSERT(env, txn->mt_parent != NULL); - MDBX_txn *const parent = txn->mt_parent; - eASSERT(env, parent->mt_signature == MDBX_MT_SIGNATURE); - eASSERT(env, parent->mt_child == txn && - (parent->mt_flags & MDBX_TXN_HAS_CHILD) != 0); - eASSERT(env, pnl_check_allocated(txn->tw.relist, - txn->mt_next_pgno - MDBX_ENABLE_REFUND)); - eASSERT(env, memcmp(&txn->tw.troika, &parent->tw.troika, - sizeof(meta_troika_t)) == 0); - - txn->mt_owner = 0; - if (txn->tw.lifo_reclaimed) { - eASSERT(env, MDBX_PNL_GETSIZE(txn->tw.lifo_reclaimed) >= - (uintptr_t)parent->tw.lifo_reclaimed); - MDBX_PNL_SETSIZE(txn->tw.lifo_reclaimed, - (uintptr_t)parent->tw.lifo_reclaimed); - parent->tw.lifo_reclaimed = txn->tw.lifo_reclaimed; - } - - if (txn->tw.retired_pages) { - eASSERT(env, MDBX_PNL_GETSIZE(txn->tw.retired_pages) >= - (uintptr_t)parent->tw.retired_pages); - MDBX_PNL_SETSIZE(txn->tw.retired_pages, - (uintptr_t)parent->tw.retired_pages); - parent->tw.retired_pages = txn->tw.retired_pages; - } + return (need < have) ? 
need : have; + } + return 0; +} - parent->mt_child = nullptr; - parent->mt_flags &= ~MDBX_TXN_HAS_CHILD; - parent->tw.dirtylru = txn->tw.dirtylru; - tASSERT(parent, dirtylist_check(parent)); - tASSERT(parent, audit_ex(parent, 0, false) == 0); - dlist_free(txn); - dpl_free(txn); - pnl_free(txn->tw.relist); +__cold static MDBX_chk_line_t *chk_puts(MDBX_chk_line_t *line, + const char *str) { + if (likely(line && str && *str)) { + MDBX_chk_internal_t *chk = line->ctx->internal; + size_t left = strlen(str); + assert(line->begin <= line->end && line->begin <= line->out && + line->out <= line->end); + if (chk->cb->print_chars) { + chk->cb->print_chars(line, str, left); + assert(line->begin <= line->end && line->begin <= line->out && + line->out <= line->end); + } else + do { + size_t chunk = chk_print_wanna(line, left); + assert(chunk <= left); + if (unlikely(!chunk)) + break; + memcpy(line->out, str, chunk); + line->out += chunk; + assert(line->begin <= line->end && line->begin <= line->out && + line->out <= line->end); + str += chunk; + left -= chunk; + } while (left); + line->empty = false; + } + return line; +} - if (parent->mt_geo.upper != txn->mt_geo.upper || - parent->mt_geo.now != txn->mt_geo.now) { - /* undo resize performed by child txn */ - rc = dxb_resize(env, parent->mt_next_pgno, parent->mt_geo.now, - parent->mt_geo.upper, impilict_shrink); - if (rc == MDBX_EPERM) { - /* unable undo resize (it is regular for Windows), - * therefore promote size changes from child to the parent txn */ - WARNING("unable undo resize performed by child txn, promote to " - "the parent (%u->%u, %u->%u)", - txn->mt_geo.now, parent->mt_geo.now, txn->mt_geo.upper, - parent->mt_geo.upper); - parent->mt_geo.now = txn->mt_geo.now; - parent->mt_geo.upper = txn->mt_geo.upper; - parent->mt_flags |= MDBX_TXN_DIRTY; - rc = MDBX_SUCCESS; - } else if (unlikely(rc != MDBX_SUCCESS)) { - ERROR("error %d while undo resize performed by child txn, fail " - "the parent", - rc); - parent->mt_flags |= 
MDBX_TXN_ERROR; - if (!env->me_dxb_mmap.base) - env->me_flags |= MDBX_FATAL_ERROR; +__cold static MDBX_chk_line_t *chk_print_va(MDBX_chk_line_t *line, + const char *fmt, va_list args) { + if (likely(line)) { + MDBX_chk_internal_t *chk = line->ctx->internal; + assert(line->begin <= line->end && line->begin <= line->out && + line->out <= line->end); + if (chk->cb->print_format) { + chk->cb->print_format(line, fmt, args); + assert(line->begin <= line->end && line->begin <= line->out && + line->out <= line->end); + } else { + va_list ones; + va_copy(ones, args); + const int needed = vsnprintf(nullptr, 0, fmt, ones); + va_end(ones); + if (likely(needed > 0)) { + const size_t have = chk_print_wanna(line, needed); + if (likely(have > 0)) { + int written = vsnprintf(line->out, have, fmt, args); + if (likely(written > 0)) + line->out += written; + assert(line->begin <= line->end && line->begin <= line->out && + line->out <= line->end); } } } + line->empty = false; } - - eASSERT(env, txn == env->me_txn0 || txn->mt_owner == 0); - if ((mode & TXN_END_FREE) != 0 && txn != env->me_txn0) { - txn->mt_signature = 0; - osal_free(txn); - } - - return rc; + return line; } -int mdbx_txn_reset(MDBX_txn *txn) { - int rc = check_txn(txn, 0); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; - - /* This call is only valid for read-only txns */ - if (unlikely((txn->mt_flags & MDBX_TXN_RDONLY) == 0)) - return MDBX_EINVAL; - - /* LY: don't close DBI-handles */ - rc = txn_end(txn, TXN_END_RESET | TXN_END_UPDATE); - if (rc == MDBX_SUCCESS) { - tASSERT(txn, txn->mt_signature == MDBX_MT_SIGNATURE); - tASSERT(txn, txn->mt_owner == 0); +__cold static MDBX_chk_line_t *MDBX_PRINTF_ARGS(2, 3) + chk_print(MDBX_chk_line_t *line, const char *fmt, ...) 
{ + if (likely(line)) { + // MDBX_chk_internal_t *chk = line->ctx->internal; + va_list args; + va_start(args, fmt); + line = chk_print_va(line, fmt, args); + va_end(args); + line->empty = false; } - return rc; -} - -int mdbx_txn_break(MDBX_txn *txn) { - do { - int rc = check_txn(txn, 0); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; - txn->mt_flags |= MDBX_TXN_ERROR; - if (txn->mt_flags & MDBX_TXN_RDONLY) - break; - txn = txn->mt_child; - } while (txn); - return MDBX_SUCCESS; + return line; } -static int txn_abort(MDBX_txn *txn) { - if (txn->mt_flags & MDBX_TXN_RDONLY) - /* LY: don't close DBI-handles */ - return txn_end(txn, TXN_END_ABORT | TXN_END_UPDATE | TXN_END_SLOT | - TXN_END_FREE); - - if (unlikely(txn->mt_flags & MDBX_TXN_FINISHED)) - return MDBX_BAD_TXN; - - if (txn->mt_child) - txn_abort(txn->mt_child); - - tASSERT(txn, (txn->mt_flags & MDBX_TXN_ERROR) || dirtylist_check(txn)); - return txn_end(txn, TXN_END_ABORT | TXN_END_SLOT | TXN_END_FREE); +__cold static MDBX_chk_line_t *chk_print_size(MDBX_chk_line_t *line, + const char *prefix, + const uint64_t value, + const char *suffix) { + static const char sf[] = + "KMGTPEZY"; /* LY: Kilo, Mega, Giga, Tera, Peta, Exa, Zetta, Yotta! */ + if (likely(line)) { + MDBX_chk_internal_t *chk = line->ctx->internal; + prefix = prefix ? prefix : ""; + suffix = suffix ? 
suffix : ""; + if (chk->cb->print_size) + chk->cb->print_size(line, prefix, value, suffix); + else + for (unsigned i = 0;; ++i) { + const unsigned scale = 10 + i * 10; + const uint64_t rounded = value + (UINT64_C(5) << (scale - 10)); + const uint64_t integer = rounded >> scale; + const uint64_t fractional = + (rounded - (integer << scale)) * 100u >> scale; + if ((rounded >> scale) <= 1000) + return chk_print(line, "%s%" PRIu64 " (%u.%02u %ciB)%s", prefix, + value, (unsigned)integer, (unsigned)fractional, + sf[i], suffix); + } + line->empty = false; + } + return line; } -int mdbx_txn_abort(MDBX_txn *txn) { - int rc = check_txn(txn, 0); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; - - rc = check_env(txn->mt_env, true); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; +__cold static int chk_error_rc(MDBX_chk_scope_t *const scope, int err, + const char *subj) { + MDBX_chk_line_t *line = chk_line_begin(scope, MDBX_chk_error); + if (line) + chk_line_end(chk_flush(chk_print(line, "%s() failed, error %s (%d)", subj, + mdbx_strerror(err), err))); + else + debug_log(MDBX_LOG_ERROR, "mdbx_env_chk", 0, "%s() failed, error %s (%d)", + subj, mdbx_strerror(err), err); + return err; +} - if ((txn->mt_flags & (MDBX_TXN_RDONLY | MDBX_NOSTICKYTHREADS)) == - MDBX_NOSTICKYTHREADS && - unlikely(txn->mt_owner != osal_thread_self())) { - mdbx_txn_break(txn); - return MDBX_THREAD_MISMATCH; +__cold static void MDBX_PRINTF_ARGS(5, 6) + chk_object_issue(MDBX_chk_scope_t *const scope, const char *object, + uint64_t entry_number, const char *caption, + const char *extra_fmt, ...) 
{ + MDBX_chk_internal_t *const chk = scope->internal; + MDBX_chk_issue_t *issue = chk->usr->scope->issues; + while (issue) { + if (issue->caption == caption) { + issue->count += 1; + break; + } else + issue = issue->next; + } + const bool fresh = issue == nullptr; + if (fresh) { + issue = osal_malloc(sizeof(*issue)); + if (likely(issue)) { + issue->caption = caption; + issue->count = 1; + issue->next = chk->usr->scope->issues; + chk->usr->scope->issues = issue; + } else + chk_error_rc(scope, ENOMEM, "adding issue"); } - return txn_abort(txn); -} - -__cold static MDBX_db *audit_db_dig(const MDBX_txn *txn, const size_t dbi, - MDBX_db *fallback) { - const MDBX_txn *dig = txn; - do { - tASSERT(txn, txn->mt_numdbs == dig->mt_numdbs); - const uint8_t state = dbi_state(dig, dbi); - if (state & DBI_LINDO) - switch (state & (DBI_VALID | DBI_STALE | DBI_OLDEN)) { - case DBI_VALID: - case DBI_OLDEN: - return dig->mt_dbs + dbi; - case 0: - return nullptr; - case DBI_VALID | DBI_STALE: - case DBI_OLDEN | DBI_STALE: - break; - default: - tASSERT(txn, !!"unexpected dig->mt_dbi_state[dbi]"); - } - dig = dig->mt_parent; - } while (dig); - return fallback; + va_list args; + va_start(args, extra_fmt); + if (chk->cb->issue) { + mdbx_env_chk_encount_problem(chk->usr); + chk->cb->issue(chk->usr, object, entry_number, caption, extra_fmt, args); + } else { + MDBX_chk_line_t *line = chk_line_begin(scope, MDBX_chk_error); + if (entry_number != UINT64_MAX) + chk_print(line, "%s #%" PRIu64 ": %s", object, entry_number, caption); + else + chk_print(line, "%s: %s", object, caption); + if (extra_fmt) + chk_puts(chk_print_va(chk_puts(line, " ("), extra_fmt, args), ")"); + chk_line_end(fresh ? chk_flush(line) : line); + } + va_end(args); } -static size_t audit_db_used(const MDBX_db *db) { - return db ? 
(size_t)db->md_branch_pages + (size_t)db->md_leaf_pages + - (size_t)db->md_overflow_pages - : 0; +__cold static void MDBX_PRINTF_ARGS(2, 3) + chk_scope_issue(MDBX_chk_scope_t *const scope, const char *fmt, ...) { + MDBX_chk_internal_t *const chk = scope->internal; + va_list args; + va_start(args, fmt); + if (likely(chk->cb->issue)) { + mdbx_env_chk_encount_problem(chk->usr); + chk->cb->issue(chk->usr, nullptr, 0, nullptr, fmt, args); + } else + chk_line_end( + chk_print_va(chk_line_begin(scope, MDBX_chk_error), fmt, args)); + va_end(args); } -/* Count all the pages in each DB and in the GC and make sure - * it matches the actual number of pages being used. */ -__cold static int audit_ex_locked(MDBX_txn *txn, size_t retired_stored, - bool dont_filter_gc) { - const MDBX_env *const env = txn->mt_env; - size_t pending = 0; - if ((txn->mt_flags & MDBX_TXN_RDONLY) == 0) - pending = txn->tw.loose_count + MDBX_PNL_GETSIZE(txn->tw.relist) + - (MDBX_PNL_GETSIZE(txn->tw.retired_pages) - retired_stored); - - MDBX_cursor_couple cx; - int rc = cursor_init(&cx.outer, txn, FREE_DBI); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; - - size_t gc = 0; - MDBX_val key, data; - while ((rc = cursor_get(&cx.outer, &key, &data, MDBX_NEXT)) == 0) { - if (!dont_filter_gc) { - if (unlikely(key.iov_len != sizeof(txnid_t))) { - ERROR("%s/%d: %s %u", "MDBX_CORRUPTED", MDBX_CORRUPTED, - "invalid GC-key size", (unsigned)key.iov_len); - return MDBX_CORRUPTED; - } - txnid_t id = unaligned_peek_u64(4, key.iov_base); - if (txn->tw.lifo_reclaimed) { - for (size_t i = 1; i <= MDBX_PNL_GETSIZE(txn->tw.lifo_reclaimed); ++i) - if (id == txn->tw.lifo_reclaimed[i]) - goto skip; - } else if (id <= txn->tw.last_reclaimed) - goto skip; +__cold static int chk_scope_end(MDBX_chk_internal_t *chk, int err) { + assert(chk->scope_depth > 0); + MDBX_chk_scope_t *const inner = chk->scope_stack + chk->scope_depth; + MDBX_chk_scope_t *const outer = chk->scope_depth ? 
inner - 1 : nullptr; + if (!outer || outer->stage != inner->stage) { + if (err == MDBX_SUCCESS && *chk->problem_counter) + err = MDBX_PROBLEM; + else if (*chk->problem_counter == 0 && MDBX_IS_ERROR(err)) + *chk->problem_counter = 1; + if (chk->problem_counter != &chk->usr->result.total_problems) { + chk->usr->result.total_problems += *chk->problem_counter; + chk->problem_counter = &chk->usr->result.total_problems; } - - gc += *(pgno_t *)data.iov_base; - skip:; + if (chk->cb->stage_end) + err = chk->cb->stage_end(chk->usr, inner->stage, err); } - tASSERT(txn, rc == MDBX_NOTFOUND); + if (chk->cb->scope_conclude) + err = chk->cb->scope_conclude(chk->usr, outer, inner, err); + chk->usr->scope = outer; + chk->usr->scope_nesting = chk->scope_depth -= 1; + if (outer) + outer->subtotal_issues += inner->subtotal_issues; + if (chk->cb->scope_pop) + chk->cb->scope_pop(chk->usr, outer, inner); - const size_t done_bitmap_size = (txn->mt_numdbs + CHAR_BIT - 1) / CHAR_BIT; - uint8_t *const done_bitmap = alloca(done_bitmap_size); - memset(done_bitmap, 0, done_bitmap_size); - if (txn->mt_parent) { - tASSERT(txn, txn->mt_numdbs == txn->mt_parent->mt_numdbs && - txn->mt_numdbs == txn->mt_env->me_txn->mt_numdbs); -#if MDBX_ENABLE_DBI_SPARSE - tASSERT(txn, txn->mt_dbi_sparse == txn->mt_parent->mt_dbi_sparse && - txn->mt_dbi_sparse == txn->mt_env->me_txn->mt_dbi_sparse); -#endif /* MDBX_ENABLE_DBI_SPARSE */ + while (inner->issues) { + MDBX_chk_issue_t *next = inner->issues->next; + osal_free(inner->issues); + inner->issues = next; } + memset(inner, -1, sizeof(*inner)); + return err; +} - size_t used = NUM_METAS + - audit_db_used(audit_db_dig(txn, FREE_DBI, nullptr)) + - audit_db_used(audit_db_dig(txn, MAIN_DBI, nullptr)); - rc = cursor_init(&cx.outer, txn, MAIN_DBI); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; +__cold static int chk_scope_begin_args(MDBX_chk_internal_t *chk, + int verbosity_adjustment, + enum MDBX_chk_stage stage, + const void *object, size_t *problems, + const 
char *fmt, va_list args) { + if (unlikely(chk->scope_depth + 1u >= ARRAY_LENGTH(chk->scope_stack))) + return MDBX_BACKLOG_DEPLETED; - for (rc = page_search(&cx.outer, NULL, MDBX_PS_FIRST); rc == MDBX_SUCCESS; - rc = cursor_sibling(&cx.outer, SIBLING_RIGHT)) { - MDBX_page *mp = cx.outer.mc_pg[cx.outer.mc_top]; - for (size_t k = 0; k < page_numkeys(mp); k++) { - MDBX_node *node = page_node(mp, k); - if (node_flags(node) != F_SUBDATA) - continue; - if (unlikely(node_ds(node) != sizeof(MDBX_db))) { - ERROR("%s/%d: %s %u", "MDBX_CORRUPTED", MDBX_CORRUPTED, - "invalid dupsort sub-tree node size", (unsigned)node_ds(node)); - return MDBX_CORRUPTED; - } + MDBX_chk_scope_t *const outer = chk->scope_stack + chk->scope_depth; + const int verbosity = + outer->verbosity + + (verbosity_adjustment - 1) * (1 << MDBX_chk_severity_prio_shift); + MDBX_chk_scope_t *const inner = outer + 1; + memset(inner, 0, sizeof(*inner)); + inner->internal = outer->internal; + inner->stage = stage ? stage : (stage = outer->stage); + inner->object = object; + inner->verbosity = (verbosity < MDBX_chk_warning) + ? 
MDBX_chk_warning + : (enum MDBX_chk_severity)verbosity; + if (problems) + chk->problem_counter = problems; + else if (!chk->problem_counter || outer->stage != stage) + chk->problem_counter = &chk->usr->result.total_problems; - MDBX_db reside; - const MDBX_db *db = memcpy(&reside, node_data(node), sizeof(reside)); - const MDBX_val name = {node_key(node), node_ks(node)}; - for (size_t dbi = CORE_DBS; dbi < env->me_numdbs; ++dbi) { - if (dbi >= txn->mt_numdbs || !(env->me_db_flags[dbi] & DB_VALID)) - continue; - if (env->me_dbxs[MAIN_DBI].md_cmp(&name, &env->me_dbxs[dbi].md_name)) - continue; + if (chk->cb->scope_push) { + const int err = chk->cb->scope_push(chk->usr, outer, inner, fmt, args); + if (unlikely(err != MDBX_SUCCESS)) + return err; + } + chk->usr->scope = inner; + chk->usr->scope_nesting = chk->scope_depth += 1; - done_bitmap[dbi / CHAR_BIT] |= 1 << dbi % CHAR_BIT; - db = audit_db_dig(txn, dbi, &reside); - break; - } - used += audit_db_used(db); + if (stage != outer->stage && chk->cb->stage_begin) { + int err = chk->cb->stage_begin(chk->usr, stage); + if (unlikely(err != MDBX_SUCCESS)) { + err = chk_scope_end(chk, err); + assert(err != MDBX_SUCCESS); + return err ? err : MDBX_RESULT_TRUE; } } - tASSERT(txn, rc == MDBX_NOTFOUND); + return MDBX_SUCCESS; +} - for (size_t dbi = CORE_DBS; dbi < txn->mt_numdbs; ++dbi) { - if (done_bitmap[dbi / CHAR_BIT] & (1 << dbi % CHAR_BIT)) - continue; - const MDBX_db *db = audit_db_dig(txn, dbi, nullptr); - if (db) - used += audit_db_used(db); - else if (dbi_state(txn, dbi)) - WARNING("audit %s@%" PRIaTXN - ": unable account dbi %zd / \"%*s\", state 0x%02x", - txn->mt_parent ? 
"nested-" : "", txn->mt_txnid, dbi, - (int)env->me_dbxs[dbi].md_name.iov_len, - (const char *)env->me_dbxs[dbi].md_name.iov_base, - dbi_state(txn, dbi)); - } +__cold static int MDBX_PRINTF_ARGS(6, 7) + chk_scope_begin(MDBX_chk_internal_t *chk, int verbosity_adjustment, + enum MDBX_chk_stage stage, const void *object, + size_t *problems, const char *fmt, ...) { + va_list args; + va_start(args, fmt); + int rc = chk_scope_begin_args(chk, verbosity_adjustment, stage, object, + problems, fmt, args); + va_end(args); + return rc; +} - if (pending + gc + used == txn->mt_next_pgno) - return MDBX_SUCCESS; - - if ((txn->mt_flags & MDBX_TXN_RDONLY) == 0) - ERROR("audit @%" PRIaTXN ": %zu(pending) = %zu(loose) + " - "%zu(reclaimed) + %zu(retired-pending) - %zu(retired-stored)", - txn->mt_txnid, pending, txn->tw.loose_count, - MDBX_PNL_GETSIZE(txn->tw.relist), - txn->tw.retired_pages ? MDBX_PNL_GETSIZE(txn->tw.retired_pages) : 0, - retired_stored); - ERROR("audit @%" PRIaTXN ": %zu(pending) + %zu" - "(gc) + %zu(count) = %zu(total) <> %zu" - "(allocated)", - txn->mt_txnid, pending, gc, used, pending + gc + used, - (size_t)txn->mt_next_pgno); - return MDBX_PROBLEM; +__cold static int chk_scope_restore(MDBX_chk_scope_t *const target, int err) { + MDBX_chk_internal_t *const chk = target->internal; + assert(target <= chk->usr->scope); + while (chk->usr->scope > target) + err = chk_scope_end(chk, err); + return err; } -__cold static int audit_ex(MDBX_txn *txn, size_t retired_stored, - bool dont_filter_gc) { - MDBX_env *const env = txn->mt_env; - int rc = osal_fastmutex_acquire(&env->me_dbi_lock); - if (likely(rc == MDBX_SUCCESS)) { - rc = audit_ex_locked(txn, retired_stored, dont_filter_gc); - ENSURE(txn->mt_env, - osal_fastmutex_release(&env->me_dbi_lock) == MDBX_SUCCESS); - } - return rc; +__cold void chk_scope_pop(MDBX_chk_scope_t *const inner) { + if (inner && inner > inner->internal->scope_stack) + chk_scope_restore(inner - 1, MDBX_SUCCESS); } -typedef struct gc_update_context { 
- size_t loop, reserve_adj; - size_t retired_stored; - size_t reserved, cleaned_slot, reused_slot, fill_idx; - txnid_t cleaned_id, rid; - bool lifo, dense; -#if MDBX_ENABLE_BIGFOOT - txnid_t bigfoot; -#endif /* MDBX_ENABLE_BIGFOOT */ - MDBX_cursor cursor; -} gcu_context_t; - -static __inline int gcu_context_init(MDBX_txn *txn, gcu_context_t *ctx) { - memset(ctx, 0, offsetof(gcu_context_t, cursor)); - ctx->lifo = (txn->mt_env->me_flags & MDBX_LIFORECLAIM) != 0; -#if MDBX_ENABLE_BIGFOOT - ctx->bigfoot = txn->mt_txnid; -#endif /* MDBX_ENABLE_BIGFOOT */ - return cursor_init(&ctx->cursor, txn, FREE_DBI); +__cold static MDBX_chk_scope_t *MDBX_PRINTF_ARGS(3, 4) + chk_scope_push(MDBX_chk_scope_t *const scope, int verbosity_adjustment, + const char *fmt, ...) { + chk_scope_restore(scope, MDBX_SUCCESS); + va_list args; + va_start(args, fmt); + int err = chk_scope_begin_args(scope->internal, verbosity_adjustment, + scope->stage, nullptr, nullptr, fmt, args); + va_end(args); + return err ? nullptr : scope + 1; } -static __always_inline size_t gcu_backlog_size(MDBX_txn *txn) { - return MDBX_PNL_GETSIZE(txn->tw.relist) + txn->tw.loose_count; -} +__cold static const char *chk_v2a(MDBX_chk_internal_t *chk, + const MDBX_val *val) { + if (val == MDBX_CHK_MAIN) + return "@MAIN"; + if (val == MDBX_CHK_GC) + return "@GC"; + if (val == MDBX_CHK_META) + return "@META"; -static int gcu_clean_stored_retired(MDBX_txn *txn, gcu_context_t *ctx) { - int err = MDBX_SUCCESS; - if (ctx->retired_stored) { - MDBX_cursor *const gc = ptr_disp(txn, sizeof(MDBX_txn)); - tASSERT(txn, txn == txn->mt_env->me_txn0 && gc->mc_next == nullptr); - gc->mc_txn = txn; - gc->mc_flags = 0; - gc->mc_next = txn->mt_cursors[FREE_DBI]; - txn->mt_cursors[FREE_DBI] = gc; - do { - MDBX_val key, val; -#if MDBX_ENABLE_BIGFOOT - key.iov_base = &ctx->bigfoot; -#else - key.iov_base = &txn->mt_txnid; -#endif /* MDBX_ENABLE_BIGFOOT */ - key.iov_len = sizeof(txnid_t); - const struct cursor_set_result csr = cursor_set(gc, &key, 
&val, MDBX_SET); - if (csr.err == MDBX_SUCCESS && csr.exact) { - ctx->retired_stored = 0; - err = cursor_del(gc, 0); - TRACE("== clear-4linear, backlog %zu, err %d", gcu_backlog_size(txn), - err); - } else - err = (err == MDBX_NOTFOUND) ? MDBX_SUCCESS : err; + const unsigned char *const data = val->iov_base; + const size_t len = val->iov_len; + if (data == MDBX_CHK_MAIN) + return "@MAIN"; + if (data == MDBX_CHK_GC) + return "@GC"; + if (data == MDBX_CHK_META) + return "@META"; + + if (!len) + return ""; + if (!data) + return ""; + if (len > 65536) { + const size_t enough = 42; + if (chk->v2a_buf.iov_len < enough) { + void *ptr = osal_realloc(chk->v2a_buf.iov_base, enough); + if (unlikely(!ptr)) + return ""; + chk->v2a_buf.iov_base = ptr; + chk->v2a_buf.iov_len = enough; } -#if MDBX_ENABLE_BIGFOOT - while (!err && --ctx->bigfoot >= txn->mt_txnid); -#else - while (0); -#endif /* MDBX_ENABLE_BIGFOOT */ - txn->mt_cursors[FREE_DBI] = gc->mc_next; - gc->mc_next = nullptr; + snprintf(chk->v2a_buf.iov_base, chk->v2a_buf.iov_len, + "", len); + return chk->v2a_buf.iov_base; } - return err; -} - -static int gcu_touch(gcu_context_t *ctx) { - MDBX_val key, val; - key.iov_base = val.iov_base = nullptr; - key.iov_len = sizeof(txnid_t); - val.iov_len = MDBX_PNL_SIZEOF(ctx->cursor.mc_txn->tw.retired_pages); - ctx->cursor.mc_flags |= C_GCU; - int err = cursor_touch(&ctx->cursor, &key, &val); - ctx->cursor.mc_flags -= C_GCU; - return err; -} - -/* Prepare a backlog of pages to modify GC itself, while reclaiming is - * prohibited. It should be enough to prevent search in page_alloc_slowpath() - * during a deleting, when GC tree is unbalanced. 
*/ -static int gcu_prepare_backlog(MDBX_txn *txn, gcu_context_t *ctx) { - const size_t for_cow = txn->mt_dbs[FREE_DBI].md_depth; - const size_t for_rebalance = for_cow + 1 + - (txn->mt_dbs[FREE_DBI].md_depth + 1ul >= - txn->mt_dbs[FREE_DBI].md_branch_pages); - size_t for_split = ctx->retired_stored == 0; - const intptr_t retired_left = - MDBX_PNL_SIZEOF(txn->tw.retired_pages) - ctx->retired_stored; - size_t for_relist = 0; - if (MDBX_ENABLE_BIGFOOT && retired_left > 0) { - for_relist = (retired_left + txn->mt_env->me_maxgc_ov1page - 1) / - txn->mt_env->me_maxgc_ov1page; - const size_t per_branch_page = txn->mt_env->me_maxgc_per_branch; - for (size_t entries = for_relist; entries > 1; for_split += entries) - entries = (entries + per_branch_page - 1) / per_branch_page; - } else if (!MDBX_ENABLE_BIGFOOT && retired_left != 0) { - for_relist = - number_of_ovpages(txn->mt_env, MDBX_PNL_SIZEOF(txn->tw.retired_pages)); + bool printable = true; + bool quoting = false; + size_t xchars = 0; + for (size_t i = 0; i < len && printable; ++i) { + quoting = quoting || !(data[i] == '_' || isalnum(data[i])); + printable = + isprint(data[i]) || (data[i] < ' ' && ++xchars < 4 && len > xchars * 4); } - const size_t for_tree_before_touch = for_cow + for_rebalance + for_split; - const size_t for_tree_after_touch = for_rebalance + for_split; - const size_t for_all_before_touch = for_relist + for_tree_before_touch; - const size_t for_all_after_touch = for_relist + for_tree_after_touch; - - if (likely(for_relist < 2 && gcu_backlog_size(txn) > for_all_before_touch) && - (ctx->cursor.mc_snum == 0 || - IS_MODIFIABLE(txn, ctx->cursor.mc_pg[ctx->cursor.mc_top]))) - return MDBX_SUCCESS; - - TRACE(">> retired-stored %zu, left %zi, backlog %zu, need %zu (4list %zu, " - "4split %zu, " - "4cow %zu, 4tree %zu)", - ctx->retired_stored, retired_left, gcu_backlog_size(txn), - for_all_before_touch, for_relist, for_split, for_cow, - for_tree_before_touch); - - int err = gcu_touch(ctx); - TRACE("== 
after-touch, backlog %zu, err %d", gcu_backlog_size(txn), err); + size_t need = len + 1; + if (quoting || !printable) + need += len + /* quotes */ 2 + 2 * /* max xchars */ 4; + if (need > chk->v2a_buf.iov_len) { + void *ptr = osal_realloc(chk->v2a_buf.iov_base, need); + if (unlikely(!ptr)) + return ""; + chk->v2a_buf.iov_base = ptr; + chk->v2a_buf.iov_len = need; + } - if (!MDBX_ENABLE_BIGFOOT && unlikely(for_relist > 1) && - MDBX_PNL_GETSIZE(txn->tw.retired_pages) != ctx->retired_stored && - err == MDBX_SUCCESS) { - if (unlikely(ctx->retired_stored)) { - err = gcu_clean_stored_retired(txn, ctx); - if (unlikely(err != MDBX_SUCCESS)) - return err; - if (!ctx->retired_stored) - return /* restart by tail-recursion */ gcu_prepare_backlog(txn, ctx); + static const char hex[] = "0123456789abcdef"; + char *w = chk->v2a_buf.iov_base; + if (!quoting) { + memcpy(w, data, len); + w += len; + } else if (printable) { + *w++ = '\''; + for (size_t i = 0; i < len; ++i) { + if (data[i] < ' ') { + assert((char *)chk->v2a_buf.iov_base + chk->v2a_buf.iov_len > w + 4); + w[0] = '\\'; + w[1] = 'x'; + w[2] = hex[data[i] >> 4]; + w[3] = hex[data[i] & 15]; + w += 4; + } else if (strchr("\"'`\\", data[i])) { + assert((char *)chk->v2a_buf.iov_base + chk->v2a_buf.iov_len > w + 2); + w[0] = '\\'; + w[1] = data[i]; + w += 2; + } else { + assert((char *)chk->v2a_buf.iov_base + chk->v2a_buf.iov_len > w + 1); + *w++ = data[i]; + } + } + *w++ = '\''; + } else { + *w++ = '\\'; + *w++ = 'x'; + for (size_t i = 0; i < len; ++i) { + assert((char *)chk->v2a_buf.iov_base + chk->v2a_buf.iov_len > w + 2); + w[0] = hex[data[i] >> 4]; + w[1] = hex[data[i] & 15]; + w += 2; } - err = page_alloc_slowpath(&ctx->cursor, for_relist, MDBX_ALLOC_RESERVE).err; - TRACE("== after-4linear, backlog %zu, err %d", gcu_backlog_size(txn), err); - cASSERT(&ctx->cursor, - gcu_backlog_size(txn) >= for_relist || err != MDBX_SUCCESS); } - - while (gcu_backlog_size(txn) < for_all_after_touch && err == MDBX_SUCCESS) - err = 
page_alloc_slowpath(&ctx->cursor, 0, - MDBX_ALLOC_RESERVE | MDBX_ALLOC_UNIMPORTANT) - .err; - - TRACE("<< backlog %zu, err %d, gc: height %u, branch %zu, leaf %zu, large " - "%zu, entries %zu", - gcu_backlog_size(txn), err, txn->mt_dbs[FREE_DBI].md_depth, - (size_t)txn->mt_dbs[FREE_DBI].md_branch_pages, - (size_t)txn->mt_dbs[FREE_DBI].md_leaf_pages, - (size_t)txn->mt_dbs[FREE_DBI].md_overflow_pages, - (size_t)txn->mt_dbs[FREE_DBI].md_entries); - tASSERT(txn, - err != MDBX_NOTFOUND || (txn->mt_flags & MDBX_TXN_DRAINED_GC) != 0); - return (err != MDBX_NOTFOUND) ? err : MDBX_SUCCESS; + assert((char *)chk->v2a_buf.iov_base + chk->v2a_buf.iov_len > w); + *w = 0; + return chk->v2a_buf.iov_base; } -static __inline void gcu_clean_reserved(MDBX_env *env, MDBX_val pnl) { -#if MDBX_DEBUG && (defined(ENABLE_MEMCHECK) || defined(__SANITIZE_ADDRESS__)) - /* Для предотвращения предупреждения Valgrind из mdbx_dump_val() - * вызванное через макрос DVAL_DEBUG() на выходе - * из cursor_set(MDBX_SET_KEY), которая вызывается ниже внутри update_gc() в - * цикле очистки и цикле заполнения зарезервированных элементов. 
*/ - memset(pnl.iov_base, 0xBB, pnl.iov_len); -#endif /* MDBX_DEBUG && (ENABLE_MEMCHECK || __SANITIZE_ADDRESS__) */ +__cold static void chk_dispose(MDBX_chk_internal_t *chk) { + assert(chk->subdb[FREE_DBI] == &chk->subdb_gc); + assert(chk->subdb[MAIN_DBI] == &chk->subdb_main); + for (size_t i = 0; i < ARRAY_LENGTH(chk->subdb); ++i) { + MDBX_chk_subdb_t *const sdb = chk->subdb[i]; + if (sdb) { + chk->subdb[i] = nullptr; + if (chk->cb->subdb_dispose && sdb->cookie) { + chk->cb->subdb_dispose(chk->usr, sdb); + sdb->cookie = nullptr; + } + if (sdb != &chk->subdb_gc && sdb != &chk->subdb_main) { + osal_free(sdb); + } + } + } + osal_free(chk->v2a_buf.iov_base); + osal_free(chk->pagemap); + chk->usr->internal = nullptr; + chk->usr->scope = nullptr; + chk->pagemap = nullptr; + memset(chk, 0xDD, sizeof(*chk)); + osal_free(chk); +} - /* PNL is initially empty, zero out at least the length */ - memset(pnl.iov_base, 0, sizeof(pgno_t)); - if ((env->me_flags & (MDBX_WRITEMAP | MDBX_NOMEMINIT)) == 0) - /* zero out to avoid leaking values from uninitialized malloc'ed memory - * to the file in non-writemap mode if length of the saving page-list - * was changed during space reservation. */ - memset(pnl.iov_base, 0, pnl.iov_len); +static size_t div_8s(size_t numerator, size_t divider) { + assert(numerator <= (SIZE_MAX >> 8)); + return (numerator << 8) / divider; } -/* Cleanups reclaimed GC (aka freeDB) records, saves the retired-list (aka - * freelist) of current transaction to GC, puts back into GC leftover of the - * reclaimed pages with chunking. This recursive changes the reclaimed-list, - * loose-list and retired-list. Keep trying until it stabilizes. - * - * NOTE: This code is a consequence of many iterations of adding crutches (aka - * "checks and balances") to partially bypass the fundamental design problems - * inherited from LMDB. So do not try to understand it completely in order to - * avoid your madness. 
*/ -static int update_gc(MDBX_txn *txn, gcu_context_t *ctx) { - TRACE("\n>>> @%" PRIaTXN, txn->mt_txnid); - MDBX_env *const env = txn->mt_env; - const char *const dbg_prefix_mode = ctx->lifo ? " lifo" : " fifo"; - (void)dbg_prefix_mode; - ctx->cursor.mc_next = txn->mt_cursors[FREE_DBI]; - txn->mt_cursors[FREE_DBI] = &ctx->cursor; +static size_t mul_8s(size_t quotient, size_t multiplier) { + size_t hi = multiplier * (quotient >> 8); + size_t lo = multiplier * (quotient & 255) + 128; + return hi + (lo >> 8); +} - /* txn->tw.relist[] can grow and shrink during this call. - * txn->tw.last_reclaimed and txn->tw.retired_pages[] can only grow. - * But page numbers cannot disappear from txn->tw.retired_pages[]. */ -retry_clean_adj: - ctx->reserve_adj = 0; -retry: - if (ctx->loop++) - TRACE("%s", " >> restart"); - int rc = MDBX_SUCCESS; - tASSERT(txn, pnl_check_allocated(txn->tw.relist, - txn->mt_next_pgno - MDBX_ENABLE_REFUND)); - tASSERT(txn, dirtylist_check(txn)); - if (unlikely(/* paranoia */ ctx->loop > ((MDBX_DEBUG > 0) ? 
12 : 42))) { - ERROR("too more loops %zu, bailout", ctx->loop); - rc = MDBX_PROBLEM; - goto bailout; - } - - if (unlikely(ctx->dense)) { - rc = gcu_clean_stored_retired(txn, ctx); - if (unlikely(rc != MDBX_SUCCESS)) - goto bailout; - } - - ctx->reserved = 0; - ctx->cleaned_slot = 0; - ctx->reused_slot = 0; - ctx->fill_idx = ~0u; - ctx->cleaned_id = 0; - ctx->rid = txn->tw.last_reclaimed; - while (true) { - /* Come back here after each Put() in case retired-list changed */ - TRACE("%s", " >> continue"); - - if (ctx->retired_stored != MDBX_PNL_GETSIZE(txn->tw.retired_pages) && - (ctx->loop == 1 || ctx->retired_stored > env->me_maxgc_ov1page || - MDBX_PNL_GETSIZE(txn->tw.retired_pages) > env->me_maxgc_ov1page)) { - rc = gcu_prepare_backlog(txn, ctx); - if (unlikely(rc != MDBX_SUCCESS)) - goto bailout; +static void histogram_reduce(struct MDBX_chk_histogram *p) { + const size_t size = ARRAY_LENGTH(p->ranges), last = size - 1; + // ищем пару для слияния с минимальной ошибкой + size_t min_err = SIZE_MAX, min_i = last - 1; + for (size_t i = 0; i < last; ++i) { + const size_t b1 = p->ranges[i].begin, e1 = p->ranges[i].end, + s1 = p->ranges[i].amount; + const size_t b2 = p->ranges[i + 1].begin, e2 = p->ranges[i + 1].end, + s2 = p->ranges[i + 1].amount; + const size_t l1 = e1 - b1, l2 = e2 - b2, lx = e2 - b1, sx = s1 + s2; + assert(s1 > 0 && b1 > 0 && b1 < e1); + assert(s2 > 0 && b2 > 0 && b2 < e2); + assert(e1 <= b2); + // за ошибку принимаем площадь изменений на гистограмме при слиянии + const size_t h1 = div_8s(s1, l1), h2 = div_8s(s2, l2), hx = div_8s(sx, lx); + const size_t d1 = mul_8s((h1 > hx) ? h1 - hx : hx - h1, l1); + const size_t d2 = mul_8s((h2 > hx) ? 
h2 - hx : hx - h2, l2); + const size_t dx = mul_8s(hx, b2 - e1); + const size_t err = d1 + d2 + dx; + if (min_err >= err) { + min_i = i; + min_err = err; } + } + // объединяем + p->ranges[min_i].end = p->ranges[min_i + 1].end; + p->ranges[min_i].amount += p->ranges[min_i + 1].amount; + p->ranges[min_i].count += p->ranges[min_i + 1].count; + if (min_i < last) + // перемещаем хвост + memmove(p->ranges + min_i, p->ranges + min_i + 1, + (last - min_i) * sizeof(p->ranges[0])); + // обнуляем последний элемент и продолжаем + p->ranges[last].count = 0; +} - tASSERT(txn, pnl_check_allocated(txn->tw.relist, - txn->mt_next_pgno - MDBX_ENABLE_REFUND)); - MDBX_val key, data; - if (ctx->lifo) { - if (ctx->cleaned_slot < (txn->tw.lifo_reclaimed - ? MDBX_PNL_GETSIZE(txn->tw.lifo_reclaimed) - : 0)) { - ctx->reserved = 0; - ctx->cleaned_slot = 0; - ctx->reused_slot = 0; - ctx->fill_idx = ~0u; - /* LY: cleanup reclaimed records. */ - do { - ctx->cleaned_id = txn->tw.lifo_reclaimed[++ctx->cleaned_slot]; - tASSERT(txn, - ctx->cleaned_slot > 0 && - ctx->cleaned_id <= env->me_lck->mti_oldest_reader.weak); - key.iov_base = &ctx->cleaned_id; - key.iov_len = sizeof(ctx->cleaned_id); - rc = cursor_set(&ctx->cursor, &key, NULL, MDBX_SET).err; - if (rc == MDBX_NOTFOUND) - continue; - if (unlikely(rc != MDBX_SUCCESS)) - goto bailout; - if (likely(!ctx->dense)) { - rc = gcu_prepare_backlog(txn, ctx); - if (unlikely(rc != MDBX_SUCCESS)) - goto bailout; - } - tASSERT(txn, ctx->cleaned_id <= env->me_lck->mti_oldest_reader.weak); - TRACE("%s: cleanup-reclaimed-id [%zu]%" PRIaTXN, dbg_prefix_mode, - ctx->cleaned_slot, ctx->cleaned_id); - tASSERT(txn, *txn->mt_cursors == &ctx->cursor); - rc = cursor_del(&ctx->cursor, 0); - if (unlikely(rc != MDBX_SUCCESS)) - goto bailout; - } while (ctx->cleaned_slot < MDBX_PNL_GETSIZE(txn->tw.lifo_reclaimed)); - txl_sort(txn->tw.lifo_reclaimed); - } - } else { - /* Удаляем оставшиеся вынутые из GC записи. 
*/ - while (ctx->cleaned_id <= txn->tw.last_reclaimed) { - rc = cursor_first(&ctx->cursor, &key, NULL); - if (rc == MDBX_NOTFOUND) - break; - if (unlikely(rc != MDBX_SUCCESS)) - goto bailout; - if (!MDBX_DISABLE_VALIDATION && - unlikely(key.iov_len != sizeof(txnid_t))) { - ERROR("%s/%d: %s %u", "MDBX_CORRUPTED", MDBX_CORRUPTED, - "invalid GC-key size", (unsigned)key.iov_len); - rc = MDBX_CORRUPTED; - goto bailout; +static void histogram_acc(const size_t n, struct MDBX_chk_histogram *p) { + STATIC_ASSERT(ARRAY_LENGTH(p->ranges) > 2); + p->amount += n; + p->count += 1; + if (likely(n < 2)) { + p->ones += n; + p->pad += 1; + } else + for (;;) { + const size_t size = ARRAY_LENGTH(p->ranges), last = size - 1; + size_t i = 0; + while (i < size && p->ranges[i].count && n >= p->ranges[i].begin) { + if (n < p->ranges[i].end) { + // значение попадает в существующий интервал + p->ranges[i].amount += n; + p->ranges[i].count += 1; + return; } - ctx->rid = ctx->cleaned_id; - ctx->reserved = 0; - ctx->reused_slot = 0; - ctx->cleaned_id = unaligned_peek_u64(4, key.iov_base); - if (ctx->cleaned_id > txn->tw.last_reclaimed) - break; - if (likely(!ctx->dense)) { - rc = gcu_prepare_backlog(txn, ctx); - if (unlikely(rc != MDBX_SUCCESS)) - goto bailout; + ++i; + } + if (p->ranges[last].count == 0) { + // использованы еще не все слоты, добавляем интервал + assert(i < size); + if (p->ranges[i].count) { + assert(i < last); + // раздвигаем +#ifdef __COVERITY__ + if (i < last) /* avoid Coverity false-positive issue */ +#endif /* __COVERITY__ */ + memmove(p->ranges + i + 1, p->ranges + i, + (last - i) * sizeof(p->ranges[0])); } - tASSERT(txn, ctx->cleaned_id <= txn->tw.last_reclaimed); - tASSERT(txn, ctx->cleaned_id <= env->me_lck->mti_oldest_reader.weak); - TRACE("%s: cleanup-reclaimed-id %" PRIaTXN, dbg_prefix_mode, - ctx->cleaned_id); - tASSERT(txn, *txn->mt_cursors == &ctx->cursor); - rc = cursor_del(&ctx->cursor, 0); - if (unlikely(rc != MDBX_SUCCESS)) - goto bailout; + 
p->ranges[i].begin = n; + p->ranges[i].end = n + 1; + p->ranges[i].amount = n; + p->ranges[i].count = 1; + return; } + histogram_reduce(p); } +} - tASSERT(txn, pnl_check_allocated(txn->tw.relist, - txn->mt_next_pgno - MDBX_ENABLE_REFUND)); - tASSERT(txn, dirtylist_check(txn)); - if (AUDIT_ENABLED()) { - rc = audit_ex(txn, ctx->retired_stored, false); - if (unlikely(rc != MDBX_SUCCESS)) - goto bailout; +__cold static MDBX_chk_line_t * +histogram_dist(MDBX_chk_line_t *line, + const struct MDBX_chk_histogram *histogram, const char *prefix, + const char *first, bool amount) { + line = chk_print(line, "%s:", prefix); + const char *comma = ""; + const size_t first_val = amount ? histogram->ones : histogram->pad; + if (first_val) { + chk_print(line, " %s=%" PRIuSIZE, first, first_val); + comma = ","; + } + for (size_t n = 0; n < ARRAY_LENGTH(histogram->ranges); ++n) + if (histogram->ranges[n].count) { + chk_print(line, "%s %" PRIuSIZE, comma, histogram->ranges[n].begin); + if (histogram->ranges[n].begin != histogram->ranges[n].end - 1) + chk_print(line, "-%" PRIuSIZE, histogram->ranges[n].end - 1); + line = chk_print(line, "=%" PRIuSIZE, + amount ? histogram->ranges[n].amount + : histogram->ranges[n].count); + comma = ","; } + return line; +} - /* return suitable into unallocated space */ - if (txn_refund(txn)) { - tASSERT(txn, pnl_check_allocated(txn->tw.relist, - txn->mt_next_pgno - MDBX_ENABLE_REFUND)); - if (AUDIT_ENABLED()) { - rc = audit_ex(txn, ctx->retired_stored, false); - if (unlikely(rc != MDBX_SUCCESS)) - goto bailout; - } - } +__cold static MDBX_chk_line_t * +histogram_print(MDBX_chk_scope_t *scope, MDBX_chk_line_t *line, + const struct MDBX_chk_histogram *histogram, const char *prefix, + const char *first, bool amount) { + if (histogram->count) { + line = chk_print(line, "%s %" PRIuSIZE, prefix, + amount ? 
histogram->amount : histogram->count); + if (scope->verbosity > MDBX_chk_info) + line = chk_puts( + histogram_dist(line, histogram, " (distribution", first, amount), + ")"); + } + return line; +} - /* handle loose pages - put ones into the reclaimed- or retired-list */ - if (txn->tw.loose_pages) { - tASSERT(txn, txn->tw.loose_count > 0); - /* Return loose page numbers to tw.relist, - * though usually none are left at this point. - * The pages themselves remain in dirtylist. */ - if (unlikely(!txn->tw.lifo_reclaimed && txn->tw.last_reclaimed < 1)) { - TRACE("%s: try allocate gc-slot for %zu loose-pages", dbg_prefix_mode, - txn->tw.loose_count); - rc = page_alloc_slowpath(&ctx->cursor, 0, MDBX_ALLOC_RESERVE).err; - if (rc == MDBX_SUCCESS) { - TRACE("%s: retry since gc-slot for %zu loose-pages available", - dbg_prefix_mode, txn->tw.loose_count); - continue; - } +//----------------------------------------------------------------------------- - /* Put loose page numbers in tw.retired_pages, - * since unable to return them to tw.relist. 
*/ - if (unlikely((rc = pnl_need(&txn->tw.retired_pages, - txn->tw.loose_count)) != 0)) - goto bailout; - for (MDBX_page *lp = txn->tw.loose_pages; lp; lp = mp_next(lp)) { - pnl_xappend(txn->tw.retired_pages, lp->mp_pgno); - MDBX_ASAN_UNPOISON_MEMORY_REGION(&mp_next(lp), sizeof(MDBX_page *)); - VALGRIND_MAKE_MEM_DEFINED(&mp_next(lp), sizeof(MDBX_page *)); - } - TRACE("%s: append %zu loose-pages to retired-pages", dbg_prefix_mode, - txn->tw.loose_count); - } else { - /* Room for loose pages + temp PNL with same */ - rc = pnl_need(&txn->tw.relist, 2 * txn->tw.loose_count + 2); - if (unlikely(rc != MDBX_SUCCESS)) - goto bailout; - MDBX_PNL loose = txn->tw.relist + MDBX_PNL_ALLOCLEN(txn->tw.relist) - - txn->tw.loose_count - 1; - size_t count = 0; - for (MDBX_page *lp = txn->tw.loose_pages; lp; lp = mp_next(lp)) { - tASSERT(txn, lp->mp_flags == P_LOOSE); - loose[++count] = lp->mp_pgno; - MDBX_ASAN_UNPOISON_MEMORY_REGION(&mp_next(lp), sizeof(MDBX_page *)); - VALGRIND_MAKE_MEM_DEFINED(&mp_next(lp), sizeof(MDBX_page *)); - } - tASSERT(txn, count == txn->tw.loose_count); - MDBX_PNL_SETSIZE(loose, count); - pnl_sort(loose, txn->mt_next_pgno); - pnl_merge(txn->tw.relist, loose); - TRACE("%s: append %zu loose-pages to reclaimed-pages", dbg_prefix_mode, - txn->tw.loose_count); - } +__cold static int chk_get_sdb(MDBX_chk_scope_t *const scope, + const walk_sdb_t *in, MDBX_chk_subdb_t **out) { + MDBX_chk_internal_t *const chk = scope->internal; + if (chk->last_lookup && + chk->last_lookup->name.iov_base == in->name.iov_base) { + *out = chk->last_lookup; + return MDBX_SUCCESS; + } - /* filter-out list of dirty-pages from loose-pages */ - MDBX_dpl *const dl = txn->tw.dirtylist; - if (dl) { - tASSERT(txn, (txn->mt_flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC); - tASSERT(txn, dl->sorted <= dl->length); - size_t w = 0, sorted_out = 0; - for (size_t r = w; ++r <= dl->length;) { - MDBX_page *dp = dl->items[r].ptr; - tASSERT(txn, dp->mp_flags == P_LOOSE || IS_MODIFIABLE(txn, dp)); - 
tASSERT(txn, dpl_endpgno(dl, r) <= txn->mt_next_pgno); - if ((dp->mp_flags & P_LOOSE) == 0) { - if (++w != r) - dl->items[w] = dl->items[r]; - } else { - tASSERT(txn, dp->mp_flags == P_LOOSE); - sorted_out += dl->sorted >= r; - if (!MDBX_AVOID_MSYNC || !(env->me_flags & MDBX_WRITEMAP)) { - tASSERT(txn, (txn->mt_flags & MDBX_WRITEMAP) == 0); - dpage_free(env, dp, 1); - } - } - } - TRACE("%s: filtered-out loose-pages from %zu -> %zu dirty-pages", - dbg_prefix_mode, dl->length, w); - tASSERT(txn, txn->tw.loose_count == dl->length - w); - dl->sorted -= sorted_out; - tASSERT(txn, dl->sorted <= w); - dpl_setlen(dl, w); - dl->pages_including_loose -= txn->tw.loose_count; - txn->tw.dirtyroom += txn->tw.loose_count; - tASSERT(txn, txn->tw.dirtyroom + txn->tw.dirtylist->length == - (txn->mt_parent ? txn->mt_parent->tw.dirtyroom - : txn->mt_env->me_options.dp_limit)); - } else { - tASSERT(txn, (txn->mt_flags & MDBX_WRITEMAP) != 0 && !MDBX_AVOID_MSYNC); + for (size_t i = 0; i < ARRAY_LENGTH(chk->subdb); ++i) { + MDBX_chk_subdb_t *sdb = chk->subdb[i]; + if (!sdb) { + sdb = osal_calloc(1, sizeof(MDBX_chk_subdb_t)); + if (unlikely(!sdb)) { + *out = nullptr; + return chk_error_rc(scope, MDBX_ENOMEM, "alloc_subDB"); } - txn->tw.loose_pages = NULL; - txn->tw.loose_count = 0; -#if MDBX_ENABLE_REFUND - txn->tw.loose_refund_wl = 0; -#endif /* MDBX_ENABLE_REFUND */ + chk->subdb[i] = sdb; + sdb->flags = in->internal->flags; + sdb->id = -1; + sdb->name = in->name; } - - const size_t amount = MDBX_PNL_GETSIZE(txn->tw.relist); - /* handle retired-list - store ones into single gc-record */ - if (ctx->retired_stored < MDBX_PNL_GETSIZE(txn->tw.retired_pages)) { - if (unlikely(!ctx->retired_stored)) { - /* Make sure last page of GC is touched and on retired-list */ - rc = cursor_last(&ctx->cursor, nullptr, nullptr); - if (likely(rc == MDBX_SUCCESS)) - rc = gcu_touch(ctx); - if (unlikely(rc != MDBX_SUCCESS) && rc != MDBX_NOTFOUND) - goto bailout; + if (sdb->name.iov_base == in->name.iov_base) { 
+ if (sdb->id < 0) { + sdb->id = (int)i; + sdb->cookie = + chk->cb->subdb_filter + ? chk->cb->subdb_filter(chk->usr, &sdb->name, sdb->flags) + : (void *)(intptr_t)-1; } + *out = (chk->last_lookup = sdb); + return MDBX_SUCCESS; + } + } + chk_scope_issue(scope, "too many subDBs > %u", + (unsigned)ARRAY_LENGTH(chk->subdb) - CORE_DBS - /* meta */ 1); + *out = nullptr; + return MDBX_PROBLEM; +} -#if MDBX_ENABLE_BIGFOOT - size_t retired_pages_before; - do { - if (ctx->bigfoot > txn->mt_txnid) { - rc = gcu_clean_stored_retired(txn, ctx); - if (unlikely(rc != MDBX_SUCCESS)) - goto bailout; - tASSERT(txn, ctx->bigfoot <= txn->mt_txnid); - } +//------------------------------------------------------------------------------ - retired_pages_before = MDBX_PNL_GETSIZE(txn->tw.retired_pages); - rc = gcu_prepare_backlog(txn, ctx); - if (unlikely(rc != MDBX_SUCCESS)) - goto bailout; - if (retired_pages_before != MDBX_PNL_GETSIZE(txn->tw.retired_pages)) { - TRACE("%s: retired-list changed (%zu -> %zu), retry", dbg_prefix_mode, - retired_pages_before, MDBX_PNL_GETSIZE(txn->tw.retired_pages)); - break; - } +__cold static void chk_verbose_meta(MDBX_chk_scope_t *const scope, + const unsigned num) { + MDBX_chk_line_t *line = chk_line_begin(scope, MDBX_chk_verbose); + MDBX_chk_internal_t *const chk = scope->internal; + if (line) { + MDBX_env *const env = chk->usr->env; + const bool have_bootid = (chk->envinfo.mi_bootid.current.x | + chk->envinfo.mi_bootid.current.y) != 0; + const bool bootid_match = + have_bootid && memcmp(&chk->envinfo.mi_bootid.meta[num], + &chk->envinfo.mi_bootid.current, + sizeof(chk->envinfo.mi_bootid.current)) == 0; - pnl_sort(txn->tw.retired_pages, txn->mt_next_pgno); - ctx->retired_stored = 0; - ctx->bigfoot = txn->mt_txnid; - do { - if (ctx->retired_stored) { - rc = gcu_prepare_backlog(txn, ctx); - if (unlikely(rc != MDBX_SUCCESS)) - goto bailout; - if (ctx->retired_stored >= - MDBX_PNL_GETSIZE(txn->tw.retired_pages)) { - TRACE("%s: retired-list changed (%zu -> 
%zu), retry", - dbg_prefix_mode, retired_pages_before, - MDBX_PNL_GETSIZE(txn->tw.retired_pages)); - break; - } - } - key.iov_len = sizeof(txnid_t); - key.iov_base = &ctx->bigfoot; - const size_t left = - MDBX_PNL_GETSIZE(txn->tw.retired_pages) - ctx->retired_stored; - const size_t chunk = - (left > env->me_maxgc_ov1page && ctx->bigfoot < MAX_TXNID) - ? env->me_maxgc_ov1page - : left; - data.iov_len = (chunk + 1) * sizeof(pgno_t); - rc = cursor_put_nochecklen(&ctx->cursor, &key, &data, MDBX_RESERVE); - if (unlikely(rc != MDBX_SUCCESS)) - goto bailout; + const char *status = "stay"; + if (num == chk->troika.recent) + status = "head"; + else if (num == TROIKA_TAIL(&chk->troika)) + status = "tail"; + line = chk_print(line, "meta-%u: %s, ", num, status); -#if MDBX_DEBUG && (defined(ENABLE_MEMCHECK) || defined(__SANITIZE_ADDRESS__)) - /* Для предотвращения предупреждения Valgrind из mdbx_dump_val() - * вызванное через макрос DVAL_DEBUG() на выходе - * из cursor_set(MDBX_SET_KEY), которая вызывается как выше в цикле - * очистки, так и ниже в цикле заполнения зарезервированных элементов. - */ - memset(data.iov_base, 0xBB, data.iov_len); -#endif /* MDBX_DEBUG && (ENABLE_MEMCHECK || __SANITIZE_ADDRESS__) */ + switch (chk->envinfo.mi_meta_sign[num]) { + case DATASIGN_NONE: + line = chk_puts(line, "no-sync/legacy"); + break; + case DATASIGN_WEAK: + line = chk_print(line, "weak-%s", + have_bootid + ? (bootid_match ? "intact (same boot-id)" : "dead") + : "unknown (no boot-id)"); + break; + default: + line = chk_puts(line, "steady"); + break; + } + const txnid_t meta_txnid = chk->envinfo.mi_meta_txnid[num]; + line = chk_print(line, " txn#%" PRIaTXN ", ", meta_txnid); + if (chk->envinfo.mi_bootid.meta[num].x | chk->envinfo.mi_bootid.meta[num].y) + line = chk_print(line, "boot-id %" PRIx64 "-%" PRIx64 " (%s)", + chk->envinfo.mi_bootid.meta[num].x, + chk->envinfo.mi_bootid.meta[num].y, + bootid_match ? 
"live" : "not match"); + else + line = chk_puts(line, "no boot-id"); - if (retired_pages_before == MDBX_PNL_GETSIZE(txn->tw.retired_pages)) { - const size_t at = (ctx->lifo == MDBX_PNL_ASCENDING) - ? left - chunk - : ctx->retired_stored; - pgno_t *const begin = txn->tw.retired_pages + at; - /* MDBX_PNL_ASCENDING == false && LIFO == false: - * - the larger pgno is at the beginning of retired list - * and should be placed with the larger txnid. - * MDBX_PNL_ASCENDING == true && LIFO == true: - * - the larger pgno is at the ending of retired list - * and should be placed with the smaller txnid. - */ - const pgno_t save = *begin; - *begin = (pgno_t)chunk; - memcpy(data.iov_base, begin, data.iov_len); - *begin = save; - TRACE("%s: put-retired/bigfoot @ %" PRIaTXN - " (slice #%u) #%zu [%zu..%zu] of %zu", - dbg_prefix_mode, ctx->bigfoot, - (unsigned)(ctx->bigfoot - txn->mt_txnid), chunk, at, - at + chunk, retired_pages_before); - } - ctx->retired_stored += chunk; - } while (ctx->retired_stored < - MDBX_PNL_GETSIZE(txn->tw.retired_pages) && - (++ctx->bigfoot, true)); - } while (retired_pages_before != MDBX_PNL_GETSIZE(txn->tw.retired_pages)); -#else - /* Write to last page of GC */ - key.iov_len = sizeof(txnid_t); - key.iov_base = &txn->mt_txnid; - do { - gcu_prepare_backlog(txn, ctx); - data.iov_len = MDBX_PNL_SIZEOF(txn->tw.retired_pages); - rc = cursor_put_nochecklen(&ctx->cursor, &key, &data, MDBX_RESERVE); - if (unlikely(rc != MDBX_SUCCESS)) - goto bailout; + if (env->stuck_meta >= 0) { + if (num == (unsigned)env->stuck_meta) + line = chk_print(line, ", %s", "forced for checking"); + } else if (meta_txnid > chk->envinfo.mi_recent_txnid && + (env->flags & (MDBX_EXCLUSIVE | MDBX_RDONLY)) == MDBX_EXCLUSIVE) + line = chk_print(line, + ", rolled-back %" PRIu64 " commit(s) (%" PRIu64 + " >>> %" PRIu64 ")", + meta_txnid - chk->envinfo.mi_recent_txnid, meta_txnid, + chk->envinfo.mi_recent_txnid); + chk_line_end(line); + } +} -#if MDBX_DEBUG && (defined(ENABLE_MEMCHECK) || 
defined(__SANITIZE_ADDRESS__)) - /* Для предотвращения предупреждения Valgrind из mdbx_dump_val() - * вызванное через макрос DVAL_DEBUG() на выходе - * из cursor_set(MDBX_SET_KEY), которая вызывается как выше в цикле - * очистки, так и ниже в цикле заполнения зарезервированных элементов. - */ - memset(data.iov_base, 0xBB, data.iov_len); -#endif /* MDBX_DEBUG && (ENABLE_MEMCHECK || __SANITIZE_ADDRESS__) */ +__cold static int +chk_pgvisitor(const size_t pgno, const unsigned npages, void *const ctx, + const int deep, const walk_sdb_t *sdb_info, + const size_t page_size, const page_type_t pagetype, + const MDBX_error_t page_err, const size_t nentries, + const size_t payload_bytes, const size_t header_bytes, + const size_t unused_bytes) { + MDBX_chk_scope_t *const scope = ctx; + MDBX_chk_internal_t *const chk = scope->internal; + MDBX_chk_context_t *const usr = chk->usr; + MDBX_env *const env = usr->env; - /* Retry if tw.retired_pages[] grew during the Put() */ - } while (data.iov_len < MDBX_PNL_SIZEOF(txn->tw.retired_pages)); + MDBX_chk_subdb_t *sdb; + int err = chk_get_sdb(scope, sdb_info, &sdb); + if (unlikely(err)) + return err; - ctx->retired_stored = MDBX_PNL_GETSIZE(txn->tw.retired_pages); - pnl_sort(txn->tw.retired_pages, txn->mt_next_pgno); - eASSERT(env, data.iov_len == MDBX_PNL_SIZEOF(txn->tw.retired_pages)); - memcpy(data.iov_base, txn->tw.retired_pages, data.iov_len); + if (deep > 42) { + chk_scope_issue(scope, "too deeply %u", deep); + return MDBX_CORRUPTED /* avoid infinite loop/recursion */; + } + histogram_acc(deep, &sdb->histogram.deep); + usr->result.processed_pages += npages; + const size_t page_bytes = payload_bytes + header_bytes + unused_bytes; - TRACE("%s: put-retired #%zu @ %" PRIaTXN, dbg_prefix_mode, - ctx->retired_stored, txn->mt_txnid); -#endif /* MDBX_ENABLE_BIGFOOT */ - if (LOG_ENABLED(MDBX_LOG_EXTRA)) { - size_t i = ctx->retired_stored; - DEBUG_EXTRA("txn %" PRIaTXN " root %" PRIaPGNO " num %zu, retired-PNL", - txn->mt_txnid, 
txn->mt_dbs[FREE_DBI].md_root, i); - for (; i; i--) - DEBUG_EXTRA_PRINT(" %" PRIaPGNO, txn->tw.retired_pages[i]); - DEBUG_EXTRA_PRINT("%s\n", "."); - } - if (unlikely(amount != MDBX_PNL_GETSIZE(txn->tw.relist) && - ctx->reserved)) { - TRACE("%s: reclaimed-list changed %zu -> %zu, retry", dbg_prefix_mode, - amount, MDBX_PNL_GETSIZE(txn->tw.relist)); - goto retry_clean_adj /* rare case, but avoids GC fragmentation - and one cycle. */ - ; - } - continue; + int height = deep + 1; + if (sdb->id >= CORE_DBS) + height -= usr->txn->dbs[MAIN_DBI].height; + const tree_t *nested = sdb_info->nested; + if (nested) { + if (sdb->flags & MDBX_DUPSORT) + height -= sdb_info->internal->height; + else { + chk_object_issue(scope, "nested tree", pgno, "unexpected", + "subDb %s flags 0x%x, deep %i", chk_v2a(chk, &sdb->name), + sdb->flags, deep); + nested = nullptr; } + } else + chk->last_nested = nullptr; - /* handle reclaimed and lost pages - merge and store both into gc */ - tASSERT(txn, pnl_check_allocated(txn->tw.relist, - txn->mt_next_pgno - MDBX_ENABLE_REFUND)); - tASSERT(txn, txn->tw.loose_count == 0); + const char *pagetype_caption; + bool branch = false; + switch (pagetype) { + default: + chk_object_issue(scope, "page", pgno, "unknown page-type", + "type %u, deep %i", (unsigned)pagetype, deep); + pagetype_caption = "unknown"; + sdb->pages.other += npages; + break; + case page_broken: + assert(page_err != MDBX_SUCCESS); + pagetype_caption = "broken"; + sdb->pages.other += npages; + break; + case page_sub_broken: + assert(page_err != MDBX_SUCCESS); + pagetype_caption = "broken-subpage"; + sdb->pages.other += npages; + break; + case page_large: + pagetype_caption = "large"; + histogram_acc(npages, &sdb->histogram.large_pages); + if (sdb->flags & MDBX_DUPSORT) + chk_object_issue(scope, "page", pgno, "unexpected", + "type %u, subDb %s flags 0x%x, deep %i", + (unsigned)pagetype, chk_v2a(chk, &sdb->name), sdb->flags, + deep); + break; + case page_branch: + branch = true; + if (!nested) 
{ + pagetype_caption = "branch"; + sdb->pages.branch += 1; + } else { + pagetype_caption = "nested-branch"; + sdb->pages.nested_branch += 1; + } + break; + case page_dupfix_leaf: + if (!nested) + chk_object_issue(scope, "page", pgno, "unexpected", + "type %u, subDb %s flags 0x%x, deep %i", + (unsigned)pagetype, chk_v2a(chk, &sdb->name), sdb->flags, + deep); + /* fall through */ + __fallthrough; + case page_leaf: + if (!nested) { + pagetype_caption = "leaf"; + sdb->pages.leaf += 1; + if (height != sdb_info->internal->height) + chk_object_issue(scope, "page", pgno, "wrong tree height", + "actual %i != %i subDb %s", height, + sdb_info->internal->height, chk_v2a(chk, &sdb->name)); + } else { + pagetype_caption = + (pagetype == page_leaf) ? "nested-leaf" : "nested-leaf-dupfix"; + sdb->pages.nested_leaf += 1; + if (chk->last_nested != nested) { + histogram_acc(height, &sdb->histogram.nested_tree); + chk->last_nested = nested; + } + if (height != nested->height) + chk_object_issue(scope, "page", pgno, "wrong nested-tree height", + "actual %i != %i dupsort-node %s", height, + nested->height, chk_v2a(chk, &sdb->name)); + } + break; + case page_sub_dupfix_leaf: + case page_sub_leaf: + pagetype_caption = + (pagetype == page_sub_leaf) ? 
"subleaf-dupsort" : "subleaf-dupfix"; + sdb->pages.nested_subleaf += 1; + if ((sdb->flags & MDBX_DUPSORT) == 0 || nested) + chk_object_issue(scope, "page", pgno, "unexpected", + "type %u, subDb %s flags 0x%x, deep %i", + (unsigned)pagetype, chk_v2a(chk, &sdb->name), sdb->flags, + deep); + break; + } - TRACE("%s", " >> reserving"); - if (AUDIT_ENABLED()) { - rc = audit_ex(txn, ctx->retired_stored, false); - if (unlikely(rc != MDBX_SUCCESS)) - goto bailout; + if (npages) { + if (sdb->cookie) { + MDBX_chk_line_t *line = chk_line_begin(scope, MDBX_chk_extra); + if (npages == 1) + chk_print(line, "%s-page %" PRIuSIZE, pagetype_caption, pgno); + else + chk_print(line, "%s-span %" PRIuSIZE "[%u]", pagetype_caption, pgno, + npages); + chk_line_end(chk_print(line, + " of %s: header %" PRIiPTR ", %s %" PRIiPTR + ", payload %" PRIiPTR ", unused %" PRIiPTR + ", deep %i", + chk_v2a(chk, &sdb->name), header_bytes, + (pagetype == page_branch) ? "keys" : "entries", + nentries, payload_bytes, unused_bytes, deep)); } - const size_t left = amount - ctx->reserved - ctx->reserve_adj; - TRACE("%s: amount %zu, settled %zd, reserve_adj %zu, left %zd, " - "lifo-reclaimed-slots %zu, " - "reused-gc-slots %zu", - dbg_prefix_mode, amount, ctx->reserved, ctx->reserve_adj, left, - txn->tw.lifo_reclaimed ? MDBX_PNL_GETSIZE(txn->tw.lifo_reclaimed) : 0, - ctx->reused_slot); - if (0 >= (intptr_t)left) - break; - const size_t prefer_max_scatter = MDBX_ENABLE_BIGFOOT ? 
MDBX_TXL_MAX : 257; - txnid_t reservation_gc_id; - if (ctx->lifo) { - if (txn->tw.lifo_reclaimed == nullptr) { - txn->tw.lifo_reclaimed = txl_alloc(); - if (unlikely(!txn->tw.lifo_reclaimed)) { - rc = MDBX_ENOMEM; - goto bailout; - } + bool already_used = false; + for (unsigned n = 0; n < npages; ++n) { + const size_t spanpgno = pgno + n; + if (spanpgno >= usr->result.alloc_pages) { + chk_object_issue(scope, "page", spanpgno, "wrong page-no", + "%s-page: %" PRIuSIZE " > %" PRIuSIZE ", deep %i", + pagetype_caption, spanpgno, usr->result.alloc_pages, + deep); + sdb->pages.all += 1; + } else if (chk->pagemap[spanpgno]) { + const MDBX_chk_subdb_t *const rival = + chk->subdb[chk->pagemap[spanpgno] - 1]; + chk_object_issue(scope, "page", spanpgno, + (branch && rival == sdb) ? "loop" : "already used", + "%s-page: by %s, deep %i", pagetype_caption, + chk_v2a(chk, &rival->name), deep); + already_used = true; + } else { + chk->pagemap[spanpgno] = (int16_t)sdb->id + 1; + sdb->pages.all += 1; } - if (MDBX_PNL_GETSIZE(txn->tw.lifo_reclaimed) < prefer_max_scatter && - left > (MDBX_PNL_GETSIZE(txn->tw.lifo_reclaimed) - ctx->reused_slot) * - env->me_maxgc_ov1page && - !ctx->dense) { - /* Hужен свободный для для сохранения списка страниц. */ - bool need_cleanup = false; - txnid_t snap_oldest = 0; - retry_rid: - do { - rc = page_alloc_slowpath(&ctx->cursor, 0, MDBX_ALLOC_RESERVE).err; - snap_oldest = env->me_lck->mti_oldest_reader.weak; - if (likely(rc == MDBX_SUCCESS)) { - TRACE("%s: took @%" PRIaTXN " from GC", dbg_prefix_mode, - MDBX_PNL_LAST(txn->tw.lifo_reclaimed)); - need_cleanup = true; - } - } while ( - rc == MDBX_SUCCESS && - MDBX_PNL_GETSIZE(txn->tw.lifo_reclaimed) < prefer_max_scatter && - left > - (MDBX_PNL_GETSIZE(txn->tw.lifo_reclaimed) - ctx->reused_slot) * - env->me_maxgc_ov1page); - - if (likely(rc == MDBX_SUCCESS)) { - TRACE("%s: got enough from GC.", dbg_prefix_mode); - continue; - } else if (unlikely(rc != MDBX_NOTFOUND)) - /* LY: some troubles... 
*/ - goto bailout; + } - if (MDBX_PNL_GETSIZE(txn->tw.lifo_reclaimed)) { - if (need_cleanup) { - txl_sort(txn->tw.lifo_reclaimed); - ctx->cleaned_slot = 0; - } - ctx->rid = MDBX_PNL_LAST(txn->tw.lifo_reclaimed); - } else { - tASSERT(txn, txn->tw.last_reclaimed == 0); - if (unlikely(txn_oldest_reader(txn) != snap_oldest)) - /* should retry page_alloc_slowpath() - * if the oldest reader changes since the last attempt */ - goto retry_rid; - /* no reclaimable GC entries, - * therefore no entries with ID < mdbx_find_oldest(txn) */ - txn->tw.last_reclaimed = ctx->rid = snap_oldest; - TRACE("%s: none recycled yet, set rid to @%" PRIaTXN, dbg_prefix_mode, - ctx->rid); - } + if (already_used) + return branch ? MDBX_RESULT_TRUE /* avoid infinite loop/recursion */ + : MDBX_SUCCESS; + } - /* В GC нет годных к переработке записей, - * будем использовать свободные id в обратном порядке. */ - while (MDBX_PNL_GETSIZE(txn->tw.lifo_reclaimed) < prefer_max_scatter && - left > (MDBX_PNL_GETSIZE(txn->tw.lifo_reclaimed) - - ctx->reused_slot) * - env->me_maxgc_ov1page) { - if (unlikely(ctx->rid <= MIN_TXNID)) { - if (unlikely(MDBX_PNL_GETSIZE(txn->tw.lifo_reclaimed) <= - ctx->reused_slot)) { - NOTICE("** restart: reserve depleted (reused_gc_slot %zu >= " - "lifo_reclaimed %zu" PRIaTXN, - ctx->reused_slot, - MDBX_PNL_GETSIZE(txn->tw.lifo_reclaimed)); - goto retry; - } - break; - } + if (MDBX_IS_ERROR(page_err)) { + chk_object_issue(scope, "page", pgno, "invalid/corrupted", "%s-page", + pagetype_caption); + } else { + if (unused_bytes > page_size) + chk_object_issue(scope, "page", pgno, "illegal unused-bytes", + "%s-page: %u < %" PRIuSIZE " < %u", pagetype_caption, 0, + unused_bytes, env->ps); - tASSERT(txn, ctx->rid >= MIN_TXNID && ctx->rid <= MAX_TXNID); - ctx->rid -= 1; - key.iov_base = &ctx->rid; - key.iov_len = sizeof(ctx->rid); - rc = cursor_set(&ctx->cursor, &key, &data, MDBX_SET_KEY).err; - if (unlikely(rc == MDBX_SUCCESS)) { - DEBUG("%s: GC's id %" PRIaTXN " is present, going to 
first", - dbg_prefix_mode, ctx->rid); - rc = cursor_first(&ctx->cursor, &key, nullptr); - if (unlikely(rc != MDBX_SUCCESS || - key.iov_len != sizeof(txnid_t))) { - ERROR("%s/%d: %s %u", "MDBX_CORRUPTED", MDBX_CORRUPTED, - "invalid GC-key size", (unsigned)key.iov_len); - rc = MDBX_CORRUPTED; - goto bailout; - } - const txnid_t gc_first = unaligned_peek_u64(4, key.iov_base); - if (gc_first <= MIN_TXNID) { - DEBUG("%s: no free GC's id(s) less than %" PRIaTXN - " (going dense-mode)", - dbg_prefix_mode, ctx->rid); - ctx->dense = true; - break; - } - ctx->rid = gc_first - 1; - } + if (header_bytes < (int)sizeof(long) || + (size_t)header_bytes >= env->ps - sizeof(long)) { + chk_object_issue(scope, "page", pgno, "illegal header-length", + "%s-page: %" PRIuSIZE " < %" PRIuSIZE " < %" PRIuSIZE, + pagetype_caption, sizeof(long), header_bytes, + env->ps - sizeof(long)); + } + if (nentries < 1 || (pagetype == page_branch && nentries < 2)) { + chk_object_issue(scope, "page", pgno, nentries ? "half-empty" : "empty", + "%s-page: payload %" PRIuSIZE " bytes, %" PRIuSIZE + " entries, deep %i", + pagetype_caption, payload_bytes, nentries, deep); + sdb->pages.empty += 1; + } - eASSERT(env, !ctx->dense); - rc = txl_append(&txn->tw.lifo_reclaimed, ctx->rid); - if (unlikely(rc != MDBX_SUCCESS)) - goto bailout; + if (npages) { + if (page_bytes != page_size) { + chk_object_issue(scope, "page", pgno, "misused", + "%s-page: %" PRIuPTR " != %" PRIuPTR " (%" PRIuPTR + "h + %" PRIuPTR "p + %" PRIuPTR "u), deep %i", + pagetype_caption, page_size, page_bytes, header_bytes, + payload_bytes, unused_bytes, deep); + if (page_size > page_bytes) + sdb->lost_bytes += page_size - page_bytes; + } else { + sdb->payload_bytes += payload_bytes + header_bytes; + usr->result.total_payload_bytes += payload_bytes + header_bytes; + } + } + } + return chk_check_break(scope); +} - if (ctx->reused_slot) - /* rare case, but it is better to clear and re-create GC entries - * with less fragmentation. 
*/ - need_cleanup = true; - else - ctx->cleaned_slot += - 1 /* mark cleanup is not needed for added slot. */; +__cold static int chk_tree(MDBX_chk_scope_t *const scope) { + MDBX_chk_internal_t *const chk = scope->internal; + MDBX_chk_context_t *const usr = chk->usr; + MDBX_env *const env = usr->env; + MDBX_txn *const txn = usr->txn; - TRACE("%s: append @%" PRIaTXN - " to lifo-reclaimed, cleaned-gc-slot = %zu", - dbg_prefix_mode, ctx->rid, ctx->cleaned_slot); - } +#if defined(_WIN32) || defined(_WIN64) + SetLastError(ERROR_SUCCESS); +#else + errno = 0; +#endif /* Windows */ + chk->pagemap = osal_calloc(usr->result.alloc_pages, sizeof(*chk->pagemap)); + if (!chk->pagemap) { + int err = osal_get_errno(); + return chk_error_rc(scope, err ? err : MDBX_ENOMEM, "calloc"); + } - if (need_cleanup || ctx->dense) { - if (ctx->cleaned_slot) { - TRACE("%s: restart to clear and re-create GC entries", - dbg_prefix_mode); - goto retry; - } - continue; - } - } + if (scope->verbosity > MDBX_chk_info) + chk_scope_push(scope, 0, "Walking pages..."); + /* always skip key ordering checking + * to avoid MDBX_CORRUPTED in case custom comparators were used */ + usr->result.processed_pages = NUM_METAS; + int err = walk_pages(txn, chk_pgvisitor, scope, dont_check_keys_ordering); + if (MDBX_IS_ERROR(err) && err != MDBX_EINTR) + chk_error_rc(scope, err, "walk_pages"); - const size_t i = - MDBX_PNL_GETSIZE(txn->tw.lifo_reclaimed) - ctx->reused_slot; - tASSERT(txn, i > 0 && i <= MDBX_PNL_GETSIZE(txn->tw.lifo_reclaimed)); - reservation_gc_id = txn->tw.lifo_reclaimed[i]; - TRACE("%s: take @%" PRIaTXN " from lifo-reclaimed[%zu]", dbg_prefix_mode, - reservation_gc_id, i); - } else { - tASSERT(txn, txn->tw.lifo_reclaimed == NULL); - if (unlikely(ctx->rid == 0)) { - ctx->rid = txn_oldest_reader(txn); - rc = cursor_first(&ctx->cursor, &key, nullptr); - if (likely(rc == MDBX_SUCCESS)) { - if (unlikely(key.iov_len != sizeof(txnid_t))) { - ERROR("%s/%d: %s %u", "MDBX_CORRUPTED", MDBX_CORRUPTED, - "invalid 
GC-key size", (unsigned)key.iov_len); - rc = MDBX_CORRUPTED; - goto bailout; - } - const txnid_t gc_first = unaligned_peek_u64(4, key.iov_base); - if (ctx->rid >= gc_first) - ctx->rid = gc_first - 1; - if (unlikely(ctx->rid == 0)) { - ERROR("%s", "** no GC tail-space to store (going dense-mode)"); - ctx->dense = true; - goto retry_clean_adj; - } - } else if (rc != MDBX_NOTFOUND) - goto bailout; - txn->tw.last_reclaimed = ctx->rid; - ctx->cleaned_id = ctx->rid + 1; - } - reservation_gc_id = ctx->rid--; - TRACE("%s: take @%" PRIaTXN " from head-gc-id", dbg_prefix_mode, - reservation_gc_id); - } - ++ctx->reused_slot; + for (size_t n = NUM_METAS; n < usr->result.alloc_pages; ++n) + if (!chk->pagemap[n]) + usr->result.unused_pages += 1; - size_t chunk = left; - if (unlikely(chunk > env->me_maxgc_ov1page)) { - const size_t avail_gc_slots = - txn->tw.lifo_reclaimed - ? MDBX_PNL_GETSIZE(txn->tw.lifo_reclaimed) - ctx->reused_slot + 1 - : (ctx->rid < INT16_MAX) ? (size_t)ctx->rid - : INT16_MAX; - if (avail_gc_slots > 1) { -#if MDBX_ENABLE_BIGFOOT - chunk = (chunk < env->me_maxgc_ov1page * (size_t)2) - ? chunk / 2 - : env->me_maxgc_ov1page; -#else - if (chunk < env->me_maxgc_ov1page * 2) - chunk /= 2; - else { - const size_t threshold = - env->me_maxgc_ov1page * ((avail_gc_slots < prefer_max_scatter) - ? avail_gc_slots - : prefer_max_scatter); - if (left < threshold) - chunk = env->me_maxgc_ov1page; - else { - const size_t tail = left - threshold + env->me_maxgc_ov1page + 1; - size_t span = 1; - size_t avail = ((pgno2bytes(env, span) - PAGEHDRSZ) / - sizeof(pgno_t)) /* - 1 + span */; - if (tail > avail) { - for (size_t i = amount - span; i > 0; --i) { - if (MDBX_PNL_ASCENDING ? 
(txn->tw.relist[i] + span) - : (txn->tw.relist[i] - span) == - txn->tw.relist[i + span]) { - span += 1; - avail = - ((pgno2bytes(env, span) - PAGEHDRSZ) / sizeof(pgno_t)) - - 1 + span; - if (avail >= tail) - break; - } - } - } + MDBX_chk_subdb_t total; + memset(&total, 0, sizeof(total)); + total.pages.all = NUM_METAS; + for (size_t i = 0; i < ARRAY_LENGTH(chk->subdb) && chk->subdb[i]; ++i) { + MDBX_chk_subdb_t *const sdb = chk->subdb[i]; + total.payload_bytes += sdb->payload_bytes; + total.lost_bytes += sdb->lost_bytes; + total.pages.all += sdb->pages.all; + total.pages.empty += sdb->pages.empty; + total.pages.other += sdb->pages.other; + total.pages.branch += sdb->pages.branch; + total.pages.leaf += sdb->pages.leaf; + total.pages.nested_branch += sdb->pages.nested_branch; + total.pages.nested_leaf += sdb->pages.nested_leaf; + total.pages.nested_subleaf += sdb->pages.nested_subleaf; + } + assert(total.pages.all == usr->result.processed_pages); - chunk = (avail >= tail) ? tail - span - : (avail_gc_slots > 3 && - ctx->reused_slot < prefer_max_scatter - 3) - ? 
avail - span - : tail; + const size_t total_page_bytes = pgno2bytes(env, total.pages.all); + if (usr->scope->subtotal_issues || usr->scope->verbosity >= MDBX_chk_verbose) + chk_line_end(chk_print(chk_line_begin(usr->scope, MDBX_chk_resolution), + "walked %zu pages, left/unused %zu" + ", %" PRIuSIZE " problem(s)", + usr->result.processed_pages, + usr->result.unused_pages, + usr->scope->subtotal_issues)); + + err = chk_scope_restore(scope, err); + if (scope->verbosity > MDBX_chk_info) { + for (size_t i = 0; i < ARRAY_LENGTH(chk->subdb) && chk->subdb[i]; ++i) { + MDBX_chk_subdb_t *const sdb = chk->subdb[i]; + MDBX_chk_scope_t *inner = + chk_scope_push(scope, 0, "tree %s:", chk_v2a(chk, &sdb->name)); + if (sdb->pages.all == 0) + chk_line_end( + chk_print(chk_line_begin(inner, MDBX_chk_resolution), "empty")); + else { + MDBX_chk_line_t *line = chk_line_begin(inner, MDBX_chk_info); + if (line) { + line = chk_print(line, "page usage: subtotal %" PRIuSIZE, + sdb->pages.all); + const size_t branch_pages = + sdb->pages.branch + sdb->pages.nested_branch; + const size_t leaf_pages = sdb->pages.leaf + sdb->pages.nested_leaf + + sdb->pages.nested_subleaf; + if (sdb->pages.other) + line = chk_print(line, ", other %" PRIuSIZE, sdb->pages.other); + if (sdb->pages.other == 0 || + (branch_pages | leaf_pages | sdb->histogram.large_pages.count) != + 0) { + line = chk_print(line, ", branch %" PRIuSIZE ", leaf %" PRIuSIZE, + branch_pages, leaf_pages); + if (sdb->histogram.large_pages.count || + (sdb->flags & MDBX_DUPSORT) == 0) { + line = chk_print(line, ", large %" PRIuSIZE, + sdb->histogram.large_pages.count); + if (sdb->histogram.large_pages.amount | + sdb->histogram.large_pages.count) + line = histogram_print(inner, line, &sdb->histogram.large_pages, + " amount", "single", true); + } + } + line = histogram_dist(chk_line_feed(line), &sdb->histogram.deep, + "tree deep density", "1", false); + if (sdb != &chk->subdb_gc && sdb->histogram.nested_tree.count) { + line = 
chk_print(chk_line_feed(line), "nested tree(s) %" PRIuSIZE, + sdb->histogram.nested_tree.count); + line = histogram_dist(line, &sdb->histogram.nested_tree, " density", + "1", false); + line = chk_print(chk_line_feed(line), + "nested tree(s) pages %" PRIuSIZE + ": branch %" PRIuSIZE ", leaf %" PRIuSIZE + ", subleaf %" PRIuSIZE, + sdb->pages.nested_branch + sdb->pages.nested_leaf, + sdb->pages.nested_branch, sdb->pages.nested_leaf, + sdb->pages.nested_subleaf); } + + const size_t bytes = pgno2bytes(env, sdb->pages.all); + line = chk_print( + chk_line_feed(line), + "page filling: subtotal %" PRIuSIZE + " bytes (%.1f%%), payload %" PRIuSIZE + " (%.1f%%), unused %" PRIuSIZE " (%.1f%%)", + bytes, bytes * 100.0 / total_page_bytes, sdb->payload_bytes, + sdb->payload_bytes * 100.0 / bytes, bytes - sdb->payload_bytes, + (bytes - sdb->payload_bytes) * 100.0 / bytes); + if (sdb->pages.empty) + line = chk_print(line, ", %" PRIuSIZE " empty pages", + sdb->pages.empty); + if (sdb->lost_bytes) + line = + chk_print(line, ", %" PRIuSIZE " bytes lost", sdb->lost_bytes); + chk_line_end(line); } -#endif /* MDBX_ENABLE_BIGFOOT */ } + chk_scope_restore(scope, 0); } - tASSERT(txn, chunk > 0); + } - TRACE("%s: gc_rid %" PRIaTXN ", reused_gc_slot %zu, reservation-id " - "%" PRIaTXN, - dbg_prefix_mode, ctx->rid, ctx->reused_slot, reservation_gc_id); + MDBX_chk_line_t *line = chk_line_begin(scope, MDBX_chk_resolution); + line = chk_print(line, + "summary: total %" PRIuSIZE " bytes, payload %" PRIuSIZE + " (%.1f%%), unused %" PRIuSIZE " (%.1f%%)," + " average fill %.1f%%", + total_page_bytes, usr->result.total_payload_bytes, + usr->result.total_payload_bytes * 100.0 / total_page_bytes, + total_page_bytes - usr->result.total_payload_bytes, + (total_page_bytes - usr->result.total_payload_bytes) * + 100.0 / total_page_bytes, + usr->result.total_payload_bytes * 100.0 / total_page_bytes); + if (total.pages.empty) + line = chk_print(line, ", %" PRIuSIZE " empty pages", total.pages.empty); + if 
(total.lost_bytes) + line = chk_print(line, ", %" PRIuSIZE " bytes lost", total.lost_bytes); + chk_line_end(line); + return err; +} - TRACE("%s: chunk %zu, gc-per-ovpage %u", dbg_prefix_mode, chunk, - env->me_maxgc_ov1page); +typedef int(chk_kv_visitor)(MDBX_chk_scope_t *const scope, + MDBX_chk_subdb_t *sdb, const size_t record_number, + const MDBX_val *key, const MDBX_val *data); - tASSERT(txn, reservation_gc_id <= env->me_lck->mti_oldest_reader.weak); - if (unlikely( - reservation_gc_id < MIN_TXNID || - reservation_gc_id > - atomic_load64(&env->me_lck->mti_oldest_reader, mo_Relaxed))) { - ERROR("** internal error (reservation_gc_id %" PRIaTXN ")", - reservation_gc_id); - rc = MDBX_PROBLEM; - goto bailout; - } +__cold static int chk_handle_kv(MDBX_chk_scope_t *const scope, + MDBX_chk_subdb_t *sdb, + const size_t record_number, const MDBX_val *key, + const MDBX_val *data) { + MDBX_chk_internal_t *const chk = scope->internal; + int err = MDBX_SUCCESS; + assert(sdb->cookie); + if (chk->cb->subdb_handle_kv) + err = chk->cb->subdb_handle_kv(chk->usr, sdb, record_number, key, data); + return err ? 
err : chk_check_break(scope); +} - key.iov_len = sizeof(reservation_gc_id); - key.iov_base = &reservation_gc_id; - data.iov_len = (chunk + 1) * sizeof(pgno_t); - TRACE("%s: reserve %zu [%zu...%zu) @%" PRIaTXN, dbg_prefix_mode, chunk, - ctx->reserved + 1, ctx->reserved + chunk + 1, reservation_gc_id); - gcu_prepare_backlog(txn, ctx); - rc = cursor_put_nochecklen(&ctx->cursor, &key, &data, - MDBX_RESERVE | MDBX_NOOVERWRITE); - tASSERT(txn, pnl_check_allocated(txn->tw.relist, - txn->mt_next_pgno - MDBX_ENABLE_REFUND)); - if (unlikely(rc != MDBX_SUCCESS)) - goto bailout; +__cold static int chk_db(MDBX_chk_scope_t *const scope, MDBX_dbi dbi, + MDBX_chk_subdb_t *sdb, chk_kv_visitor *handler) { + MDBX_chk_internal_t *const chk = scope->internal; + MDBX_chk_context_t *const usr = chk->usr; + MDBX_env *const env = usr->env; + MDBX_txn *const txn = usr->txn; + MDBX_cursor *cursor = nullptr; + size_t record_count = 0, dups = 0, sub_databases = 0; + int err; - gcu_clean_reserved(env, data); - ctx->reserved += chunk; - TRACE("%s: settled %zu (+%zu), continue", dbg_prefix_mode, ctx->reserved, - chunk); + if ((MDBX_TXN_FINISHED | MDBX_TXN_ERROR) & txn->flags) { + chk_line_end( + chk_flush(chk_print(chk_line_begin(scope, MDBX_chk_error), + "abort processing %s due to a previous error", + chk_v2a(chk, &sdb->name)))); + err = MDBX_BAD_TXN; + goto bailout; + } - if (txn->tw.lifo_reclaimed && - unlikely(amount < MDBX_PNL_GETSIZE(txn->tw.relist)) && - (ctx->loop < 5 || MDBX_PNL_GETSIZE(txn->tw.relist) - amount > - env->me_maxgc_ov1page / 2)) { - NOTICE("** restart: reclaimed-list growth %zu -> %zu", amount, - MDBX_PNL_GETSIZE(txn->tw.relist)); - goto retry_clean_adj; + if (0 > (int)dbi) { + err = dbi_open( + txn, &sdb->name, MDBX_DB_ACCEDE, &dbi, + (chk->flags & MDBX_CHK_IGNORE_ORDER) ? cmp_equal_or_greater : nullptr, + (chk->flags & MDBX_CHK_IGNORE_ORDER) ? 
cmp_equal_or_greater : nullptr); + if (unlikely(err)) { + tASSERT(txn, dbi >= txn->env->n_dbi || + (txn->env->dbs_flags[dbi] & DB_VALID) == 0); + chk_error_rc(scope, err, "mdbx_dbi_open"); + goto bailout; } - - continue; + tASSERT(txn, dbi < txn->env->n_dbi && + (txn->env->dbs_flags[dbi] & DB_VALID) != 0); } - tASSERT(txn, - ctx->cleaned_slot == (txn->tw.lifo_reclaimed - ? MDBX_PNL_GETSIZE(txn->tw.lifo_reclaimed) - : 0)); - - TRACE("%s", " >> filling"); - /* Fill in the reserved records */ - size_t excess_slots = 0; - ctx->fill_idx = - txn->tw.lifo_reclaimed - ? MDBX_PNL_GETSIZE(txn->tw.lifo_reclaimed) - ctx->reused_slot - : ctx->reused_slot; - rc = MDBX_SUCCESS; - tASSERT(txn, pnl_check_allocated(txn->tw.relist, - txn->mt_next_pgno - MDBX_ENABLE_REFUND)); - tASSERT(txn, dirtylist_check(txn)); - if (ctx->reserved || MDBX_PNL_GETSIZE(txn->tw.relist)) { - MDBX_val key, data; - key.iov_len = data.iov_len = 0; /* avoid MSVC warning */ - key.iov_base = data.iov_base = NULL; - - const size_t amount = MDBX_PNL_GETSIZE(txn->tw.relist); - size_t left = amount, excess = 0; - if (txn->tw.lifo_reclaimed == nullptr) { - tASSERT(txn, ctx->lifo == 0); - rc = cursor_first(&ctx->cursor, &key, &data); - if (unlikely(rc != MDBX_SUCCESS)) { - if (rc == MDBX_NOTFOUND && ctx->reserve_adj) - goto retry_clean_adj; - goto bailout; - } - } else { - tASSERT(txn, ctx->lifo != 0); + const tree_t *const db = txn->dbs + dbi; + if (handler) { + const char *key_mode = nullptr; + switch (sdb->flags & (MDBX_REVERSEKEY | MDBX_INTEGERKEY)) { + case 0: + key_mode = "usual"; + break; + case MDBX_REVERSEKEY: + key_mode = "reserve"; + break; + case MDBX_INTEGERKEY: + key_mode = "ordinal"; + break; + case MDBX_REVERSEKEY | MDBX_INTEGERKEY: + key_mode = "msgpack"; + break; + default: + key_mode = "inconsistent"; + chk_scope_issue(scope, "wrong key-mode (0x%x)", + sdb->flags & (MDBX_REVERSEKEY | MDBX_INTEGERKEY)); } - while (true) { - txnid_t fill_gc_id; - TRACE("%s: left %zu of %zu", dbg_prefix_mode, left, 
- MDBX_PNL_GETSIZE(txn->tw.relist)); - if (txn->tw.lifo_reclaimed == nullptr) { - tASSERT(txn, ctx->lifo == 0); - fill_gc_id = unaligned_peek_u64(4, key.iov_base); - if (ctx->fill_idx == 0 || fill_gc_id > txn->tw.last_reclaimed) { - if (!left) - break; - NOTICE("** restart: reserve depleted (fill_idx %zu, fill_id %" PRIaTXN - " > last_reclaimed %" PRIaTXN ", left %zu", - ctx->fill_idx, fill_gc_id, txn->tw.last_reclaimed, left); - ctx->reserve_adj = - (ctx->reserve_adj > left) ? ctx->reserve_adj - left : 0; - goto retry; - } - ctx->fill_idx -= 1; - } else { - tASSERT(txn, ctx->lifo != 0); - if (ctx->fill_idx >= MDBX_PNL_GETSIZE(txn->tw.lifo_reclaimed)) { - if (!left) - break; - NOTICE("** restart: reserve depleted (fill_idx %zu >= " - "lifo_reclaimed %zu, left %zu", - ctx->fill_idx, MDBX_PNL_GETSIZE(txn->tw.lifo_reclaimed), left); - ctx->reserve_adj = - (ctx->reserve_adj > left) ? ctx->reserve_adj - left : 0; - goto retry; - } - ctx->fill_idx += 1; - fill_gc_id = txn->tw.lifo_reclaimed[ctx->fill_idx]; - TRACE("%s: seek-reservation @%" PRIaTXN " at lifo_reclaimed[%zu]", - dbg_prefix_mode, fill_gc_id, ctx->fill_idx); - key.iov_base = &fill_gc_id; - key.iov_len = sizeof(fill_gc_id); - rc = cursor_set(&ctx->cursor, &key, &data, MDBX_SET_KEY).err; - if (unlikely(rc != MDBX_SUCCESS)) - goto bailout; - } - tASSERT(txn, ctx->cleaned_slot == - (txn->tw.lifo_reclaimed - ? 
MDBX_PNL_GETSIZE(txn->tw.lifo_reclaimed) - : 0)); - tASSERT(txn, fill_gc_id > 0 && - fill_gc_id <= env->me_lck->mti_oldest_reader.weak); - key.iov_base = &fill_gc_id; - key.iov_len = sizeof(fill_gc_id); - - tASSERT(txn, data.iov_len >= sizeof(pgno_t) * 2); - size_t chunk = data.iov_len / sizeof(pgno_t) - 1; - if (unlikely(chunk > left)) { - const size_t delta = chunk - left; - excess += delta; - if (!left) { - excess_slots += 1; - goto next; - } - TRACE("%s: chunk %zu > left %zu, @%" PRIaTXN, dbg_prefix_mode, chunk, - left, fill_gc_id); - if ((ctx->loop < 5 && delta > (ctx->loop / 2)) || - delta > env->me_maxgc_ov1page) - data.iov_len = (left + 1) * sizeof(pgno_t); - chunk = left; - } - rc = cursor_put_nochecklen(&ctx->cursor, &key, &data, - MDBX_CURRENT | MDBX_RESERVE); - if (unlikely(rc != MDBX_SUCCESS)) - goto bailout; - gcu_clean_reserved(env, data); - - if (unlikely(txn->tw.loose_count || - amount != MDBX_PNL_GETSIZE(txn->tw.relist))) { - NOTICE("** restart: reclaimed-list growth (%zu -> %zu, loose +%zu)", - amount, MDBX_PNL_GETSIZE(txn->tw.relist), txn->tw.loose_count); - goto retry_clean_adj; - } - if (unlikely(txn->tw.lifo_reclaimed - ? 
ctx->cleaned_slot < - MDBX_PNL_GETSIZE(txn->tw.lifo_reclaimed) - : ctx->cleaned_id < txn->tw.last_reclaimed)) { - NOTICE("%s", "** restart: reclaimed-slots changed"); - goto retry_clean_adj; - } - if (unlikely(ctx->retired_stored != - MDBX_PNL_GETSIZE(txn->tw.retired_pages))) { - tASSERT(txn, - ctx->retired_stored < MDBX_PNL_GETSIZE(txn->tw.retired_pages)); - NOTICE("** restart: retired-list growth (%zu -> %zu)", - ctx->retired_stored, MDBX_PNL_GETSIZE(txn->tw.retired_pages)); - goto retry_clean_adj; - } - - pgno_t *dst = data.iov_base; - *dst++ = (pgno_t)chunk; - pgno_t *src = MDBX_PNL_BEGIN(txn->tw.relist) + left - chunk; - memcpy(dst, src, chunk * sizeof(pgno_t)); - pgno_t *from = src, *to = src + chunk; - TRACE("%s: fill %zu [ %zu:%" PRIaPGNO "...%zu:%" PRIaPGNO "] @%" PRIaTXN, - dbg_prefix_mode, chunk, from - txn->tw.relist, from[0], - to - txn->tw.relist, to[-1], fill_gc_id); - - left -= chunk; - if (AUDIT_ENABLED()) { - rc = audit_ex(txn, ctx->retired_stored + amount - left, true); - if (unlikely(rc != MDBX_SUCCESS)) - goto bailout; - } - - next: - if (txn->tw.lifo_reclaimed == nullptr) { - tASSERT(txn, ctx->lifo == 0); - rc = cursor_next(&ctx->cursor, &key, &data, MDBX_NEXT); - if (unlikely(rc != MDBX_SUCCESS)) { - if (rc != MDBX_NOTFOUND) - goto bailout; - rc = MDBX_SUCCESS; - break; - } - } else { - tASSERT(txn, ctx->lifo != 0); - } + const char *value_mode = nullptr; + switch (sdb->flags & (MDBX_DUPSORT | MDBX_REVERSEDUP | MDBX_DUPFIXED | + MDBX_INTEGERDUP)) { + case 0: + value_mode = "single"; + break; + case MDBX_DUPSORT: + value_mode = "multi"; + break; + case MDBX_DUPSORT | MDBX_REVERSEDUP: + value_mode = "multi-reverse"; + break; + case MDBX_DUPSORT | MDBX_DUPFIXED: + value_mode = "multi-samelength"; + break; + case MDBX_DUPSORT | MDBX_DUPFIXED | MDBX_REVERSEDUP: + value_mode = "multi-reverse-samelength"; + break; + case MDBX_DUPSORT | MDBX_DUPFIXED | MDBX_INTEGERDUP: + value_mode = "multi-ordinal"; + break; + case MDBX_DUPSORT | MDBX_INTEGERDUP | 
MDBX_REVERSEDUP: + value_mode = "multi-msgpack"; + break; + case MDBX_DUPSORT | MDBX_DUPFIXED | MDBX_INTEGERDUP | MDBX_REVERSEDUP: + value_mode = "reserved"; + break; + default: + value_mode = "inconsistent"; + chk_scope_issue(scope, "wrong value-mode (0x%x)", + sdb->flags & (MDBX_DUPSORT | MDBX_REVERSEDUP | + MDBX_DUPFIXED | MDBX_INTEGERDUP)); } - if (excess) { - size_t n = excess, adj = excess; - while (n >= env->me_maxgc_ov1page) - adj -= n /= env->me_maxgc_ov1page; - ctx->reserve_adj += adj; - TRACE("%s: extra %zu reserved space, adj +%zu (%zu)", dbg_prefix_mode, - excess, adj, ctx->reserve_adj); + MDBX_chk_line_t *line = chk_line_begin(scope, MDBX_chk_info); + line = chk_print(line, "key-value kind: %s-key => %s-value", key_mode, + value_mode); + line = chk_print(line, ", flags:"); + if (!sdb->flags) + line = chk_print(line, " none"); + else { + const uint8_t f[] = {MDBX_DUPSORT, + MDBX_INTEGERKEY, + MDBX_REVERSEKEY, + MDBX_DUPFIXED, + MDBX_REVERSEDUP, + MDBX_INTEGERDUP, + 0}; + const char *const t[] = {"dupsort", "integerkey", "reversekey", + "dupfix", "reversedup", "integerdup"}; + for (size_t i = 0; f[i]; i++) + if (sdb->flags & f[i]) + line = chk_print(line, " %s", t[i]); } - } - - tASSERT(txn, rc == MDBX_SUCCESS); - if (unlikely(txn->tw.loose_count != 0)) { - NOTICE("** restart: got %zu loose pages", txn->tw.loose_count); - goto retry_clean_adj; - } - - if (unlikely(excess_slots)) { - const bool will_retry = ctx->loop < 5 || excess_slots > 1; - NOTICE("** %s: reserve excess (excess-slots %zu, filled-slot %zu, adj %zu, " - "loop %zu)", - will_retry ? 
"restart" : "ignore", excess_slots, ctx->fill_idx, - ctx->reserve_adj, ctx->loop); - if (will_retry) - goto retry; - } - - tASSERT(txn, - txn->tw.lifo_reclaimed == NULL || - ctx->cleaned_slot == MDBX_PNL_GETSIZE(txn->tw.lifo_reclaimed)); - -bailout: - txn->mt_cursors[FREE_DBI] = ctx->cursor.mc_next; + chk_line_end(chk_print(line, " (0x%02X)", sdb->flags)); - MDBX_PNL_SETSIZE(txn->tw.relist, 0); -#if MDBX_ENABLE_PROFGC - env->me_lck->mti_pgop_stat.gc_prof.wloops += (uint32_t)ctx->loop; -#endif /* MDBX_ENABLE_PROFGC */ - TRACE("<<< %zu loops, rc = %d", ctx->loop, rc); - return rc; -} + line = chk_print(chk_line_begin(scope, MDBX_chk_verbose), + "entries %" PRIu64 ", sequence %" PRIu64, db->items, + db->sequence); + if (db->mod_txnid) + line = + chk_print(line, ", last modification txn#%" PRIaTXN, db->mod_txnid); + if (db->root != P_INVALID) + line = chk_print(line, ", root #%" PRIaPGNO, db->root); + chk_line_end(line); + chk_line_end(chk_print(chk_line_begin(scope, MDBX_chk_verbose), + "b-tree depth %u, pages: branch %" PRIaPGNO + ", leaf %" PRIaPGNO ", large %" PRIaPGNO, + db->height, db->branch_pages, db->leaf_pages, + db->large_pages)); -static int txn_write(MDBX_txn *txn, iov_ctx_t *ctx) { - tASSERT(txn, (txn->mt_flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC); - MDBX_dpl *const dl = dpl_sort(txn); - int rc = MDBX_SUCCESS; - size_t r, w, total_npages = 0; - for (w = 0, r = 1; r <= dl->length; ++r) { - MDBX_page *dp = dl->items[r].ptr; - if (dp->mp_flags & P_LOOSE) { - dl->items[++w] = dl->items[r]; - continue; + if ((chk->flags & MDBX_CHK_SKIP_BTREE_TRAVERSAL) == 0) { + const size_t branch_pages = sdb->pages.branch + sdb->pages.nested_branch; + const size_t leaf_pages = sdb->pages.leaf + sdb->pages.nested_leaf; + const size_t subtotal_pages = + db->branch_pages + db->leaf_pages + db->large_pages; + if (subtotal_pages != sdb->pages.all) + chk_scope_issue( + scope, "%s pages mismatch (%" PRIuSIZE " != walked %" PRIuSIZE ")", + "subtotal", subtotal_pages, 
sdb->pages.all); + if (db->branch_pages != branch_pages) + chk_scope_issue( + scope, "%s pages mismatch (%" PRIaPGNO " != walked %" PRIuSIZE ")", + "branch", db->branch_pages, branch_pages); + if (db->leaf_pages != leaf_pages) + chk_scope_issue( + scope, "%s pages mismatch (%" PRIaPGNO " != walked %" PRIuSIZE ")", + "all-leaf", db->leaf_pages, leaf_pages); + if (db->large_pages != sdb->histogram.large_pages.amount) + chk_scope_issue( + scope, "%s pages mismatch (%" PRIaPGNO " != walked %" PRIuSIZE ")", + "large/overlow", db->large_pages, + sdb->histogram.large_pages.amount); } - unsigned npages = dpl_npages(dl, r); - total_npages += npages; - rc = iov_page(txn, ctx, dp, npages); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; } - if (!iov_empty(ctx)) { - tASSERT(txn, rc == MDBX_SUCCESS); - rc = iov_write(ctx); + err = mdbx_cursor_open(txn, dbi, &cursor); + if (unlikely(err)) { + chk_error_rc(scope, err, "mdbx_cursor_open"); + goto bailout; } - - if (likely(rc == MDBX_SUCCESS) && ctx->fd == txn->mt_env->me_lazy_fd) { - txn->mt_env->me_lck->mti_unsynced_pages.weak += total_npages; - if (!txn->mt_env->me_lck->mti_eoos_timestamp.weak) - txn->mt_env->me_lck->mti_eoos_timestamp.weak = osal_monotime(); + if (chk->flags & MDBX_CHK_IGNORE_ORDER) { + cursor->checking |= z_ignord | z_pagecheck; + if (cursor->subcur) + cursor->subcur->cursor.checking |= z_ignord | z_pagecheck; } - txn->tw.dirtylist->pages_including_loose -= total_npages; - while (r <= dl->length) - dl->items[++w] = dl->items[r++]; + const size_t maxkeysize = mdbx_env_get_maxkeysize_ex(env, sdb->flags); + MDBX_val prev_key = {nullptr, 0}, prev_data = {nullptr, 0}; + MDBX_val key, data; + err = mdbx_cursor_get(cursor, &key, &data, MDBX_FIRST); + while (err == MDBX_SUCCESS) { + err = chk_check_break(scope); + if (unlikely(err)) + goto bailout; - dl->sorted = dpl_setlen(dl, w); - txn->tw.dirtyroom += r - 1 - w; - tASSERT(txn, txn->tw.dirtyroom + txn->tw.dirtylist->length == - (txn->mt_parent ? 
txn->mt_parent->tw.dirtyroom - : txn->mt_env->me_options.dp_limit)); - tASSERT(txn, txn->tw.dirtylist->length == txn->tw.loose_count); - tASSERT(txn, txn->tw.dirtylist->pages_including_loose == txn->tw.loose_count); - return rc; -} - -/* Merge child txn into parent */ -static __inline void txn_merge(MDBX_txn *const parent, MDBX_txn *const txn, - const size_t parent_retired_len) { - tASSERT(txn, (txn->mt_flags & MDBX_WRITEMAP) == 0); - MDBX_dpl *const src = dpl_sort(txn); - - /* Remove refunded pages from parent's dirty list */ - MDBX_dpl *const dst = dpl_sort(parent); - if (MDBX_ENABLE_REFUND) { - size_t n = dst->length; - while (n && dst->items[n].pgno >= parent->mt_next_pgno) { - const unsigned npages = dpl_npages(dst, n); - dpage_free(txn->mt_env, dst->items[n].ptr, npages); - --n; + bool bad_key = false; + if (key.iov_len > maxkeysize) { + chk_object_issue(scope, "entry", record_count, + "key length exceeds max-key-size", + "%" PRIuPTR " > %" PRIuPTR, key.iov_len, maxkeysize); + bad_key = true; + } else if ((sdb->flags & MDBX_INTEGERKEY) && key.iov_len != 8 && + key.iov_len != 4) { + chk_object_issue(scope, "entry", record_count, "wrong key length", + "%" PRIuPTR " != 4or8", key.iov_len); + bad_key = true; } - parent->tw.dirtyroom += dst->sorted - n; - dst->sorted = dpl_setlen(dst, n); - tASSERT(parent, - parent->tw.dirtyroom + parent->tw.dirtylist->length == - (parent->mt_parent ? parent->mt_parent->tw.dirtyroom - : parent->mt_env->me_options.dp_limit)); - } - - /* Remove reclaimed pages from parent's dirty list */ - const MDBX_PNL reclaimed_list = parent->tw.relist; - dpl_sift(parent, reclaimed_list, false); - - /* Move retired pages from parent's dirty & spilled list to reclaimed */ - size_t r, w, d, s, l; - for (r = w = parent_retired_len; - ++r <= MDBX_PNL_GETSIZE(parent->tw.retired_pages);) { - const pgno_t pgno = parent->tw.retired_pages[r]; - const size_t di = dpl_exist(parent, pgno); - const size_t si = !di ? 
search_spilled(parent, pgno) : 0; - unsigned npages; - const char *kind; - if (di) { - MDBX_page *dp = dst->items[di].ptr; - tASSERT(parent, (dp->mp_flags & ~(P_LEAF | P_LEAF2 | P_BRANCH | - P_OVERFLOW | P_SPILLED)) == 0); - npages = dpl_npages(dst, di); - page_wash(parent, di, dp, npages); - kind = "dirty"; - l = 1; - if (unlikely(npages > l)) { - /* OVERFLOW-страница могла быть переиспользована по частям. Тогда - * в retired-списке может быть только начало последовательности, - * а остаток растащен по dirty, spilled и reclaimed спискам. Поэтому - * переносим в reclaimed с проверкой на обрыв последовательности. - * В любом случае, все осколки будут учтены и отфильтрованы, т.е. если - * страница была разбита на части, то важно удалить dirty-элемент, - * а все осколки будут учтены отдельно. */ - /* Список retired страниц не сортирован, но для ускорения сортировки - * дополняется в соответствии с MDBX_PNL_ASCENDING */ -#if MDBX_PNL_ASCENDING - const size_t len = MDBX_PNL_GETSIZE(parent->tw.retired_pages); - while (r < len && parent->tw.retired_pages[r + 1] == pgno + l) { - ++r; - if (++l == npages) - break; - } -#else - while (w > parent_retired_len && - parent->tw.retired_pages[w - 1] == pgno + l) { - --w; - if (++l == npages) - break; - } -#endif - } - } else if (unlikely(si)) { - l = npages = 1; - spill_remove(parent, si, 1); - kind = "spilled"; - } else { - parent->tw.retired_pages[++w] = pgno; - continue; + bool bad_data = false; + if ((sdb->flags & MDBX_INTEGERDUP) && data.iov_len != 8 && + data.iov_len != 4) { + chk_object_issue(scope, "entry", record_count, "wrong data length", + "%" PRIuPTR " != 4or8", data.iov_len); + bad_data = true; } - DEBUG("reclaim retired parent's %u -> %zu %s page %" PRIaPGNO, npages, l, - kind, pgno); - int err = pnl_insert_range(&parent->tw.relist, pgno, l); - ENSURE(txn->mt_env, err == MDBX_SUCCESS); - } - MDBX_PNL_SETSIZE(parent->tw.retired_pages, w); - - /* Filter-out parent spill list */ - if (parent->tw.spilled.list && - 
MDBX_PNL_GETSIZE(parent->tw.spilled.list) > 0) { - const MDBX_PNL sl = spill_purge(parent); - size_t len = MDBX_PNL_GETSIZE(sl); - if (len) { - /* Remove refunded pages from parent's spill list */ - if (MDBX_ENABLE_REFUND && - MDBX_PNL_MOST(sl) >= (parent->mt_next_pgno << 1)) { -#if MDBX_PNL_ASCENDING - size_t i = MDBX_PNL_GETSIZE(sl); - assert(MDBX_PNL_MOST(sl) == MDBX_PNL_LAST(sl)); - do { - if ((sl[i] & 1) == 0) - DEBUG("refund parent's spilled page %" PRIaPGNO, sl[i] >> 1); - i -= 1; - } while (i && sl[i] >= (parent->mt_next_pgno << 1)); - MDBX_PNL_SETSIZE(sl, i); -#else - assert(MDBX_PNL_MOST(sl) == MDBX_PNL_FIRST(sl)); - size_t i = 0; - do { - ++i; - if ((sl[i] & 1) == 0) - DEBUG("refund parent's spilled page %" PRIaPGNO, sl[i] >> 1); - } while (i < len && sl[i + 1] >= (parent->mt_next_pgno << 1)); - MDBX_PNL_SETSIZE(sl, len -= i); - memmove(sl + 1, sl + 1 + i, len * sizeof(sl[0])); -#endif + if (prev_key.iov_base) { + if (prev_data.iov_base && !bad_data && (sdb->flags & MDBX_DUPFIXED) && + prev_data.iov_len != data.iov_len) { + chk_object_issue(scope, "entry", record_count, "different data length", + "%" PRIuPTR " != %" PRIuPTR, prev_data.iov_len, + data.iov_len); + bad_data = true; } - tASSERT(txn, pnl_check_allocated(sl, (size_t)parent->mt_next_pgno << 1)); - /* Remove reclaimed pages from parent's spill list */ - s = MDBX_PNL_GETSIZE(sl), r = MDBX_PNL_GETSIZE(reclaimed_list); - /* Scanning from end to begin */ - while (s && r) { - if (sl[s] & 1) { - --s; - continue; - } - const pgno_t spilled_pgno = sl[s] >> 1; - const pgno_t reclaimed_pgno = reclaimed_list[r]; - if (reclaimed_pgno != spilled_pgno) { - const bool cmp = MDBX_PNL_ORDERED(spilled_pgno, reclaimed_pgno); - s -= !cmp; - r -= cmp; - } else { - DEBUG("remove reclaimed parent's spilled page %" PRIaPGNO, - reclaimed_pgno); - spill_remove(parent, s, 1); - --s; - --r; - } + if (!bad_key) { + int cmp = mdbx_cmp(txn, dbi, &key, &prev_key); + if (cmp == 0) { + ++dups; + if ((sdb->flags & MDBX_DUPSORT) 
== 0) { + chk_object_issue(scope, "entry", record_count, "duplicated entries", + nullptr); + if (prev_data.iov_base && data.iov_len == prev_data.iov_len && + memcmp(data.iov_base, prev_data.iov_base, data.iov_len) == 0) + chk_object_issue(scope, "entry", record_count, + "complete duplicate", nullptr); + } else if (!bad_data && prev_data.iov_base) { + cmp = mdbx_dcmp(txn, dbi, &data, &prev_data); + if (cmp == 0) + chk_object_issue(scope, "entry", record_count, + "complete duplicate", nullptr); + else if (cmp < 0 && !(chk->flags & MDBX_CHK_IGNORE_ORDER)) + chk_object_issue(scope, "entry", record_count, + "wrong order of multi-values", nullptr); + } + } else if (cmp < 0 && !(chk->flags & MDBX_CHK_IGNORE_ORDER)) + chk_object_issue(scope, "entry", record_count, + "wrong order of entries", nullptr); } + } - /* Remove anything in our dirty list from parent's spill list */ - /* Scanning spill list in descend order */ - const intptr_t step = MDBX_PNL_ASCENDING ? -1 : 1; - s = MDBX_PNL_ASCENDING ? MDBX_PNL_GETSIZE(sl) : 1; - d = src->length; - while (d && (MDBX_PNL_ASCENDING ? 
s > 0 : s <= MDBX_PNL_GETSIZE(sl))) { - if (sl[s] & 1) { - s += step; - continue; - } - const pgno_t spilled_pgno = sl[s] >> 1; - const pgno_t dirty_pgno_form = src->items[d].pgno; - const unsigned npages = dpl_npages(src, d); - const pgno_t dirty_pgno_to = dirty_pgno_form + npages; - if (dirty_pgno_form > spilled_pgno) { - --d; - continue; - } - if (dirty_pgno_to <= spilled_pgno) { - s += step; - continue; - } + if (!bad_key) { + if (!prev_key.iov_base && (sdb->flags & MDBX_INTEGERKEY)) + chk_line_end(chk_print(chk_line_begin(scope, MDBX_chk_info), + "fixed key-size %" PRIuSIZE, key.iov_len)); + prev_key = key; + } + if (!bad_data) { + if (!prev_data.iov_base && + (sdb->flags & (MDBX_INTEGERDUP | MDBX_DUPFIXED))) + chk_line_end(chk_print(chk_line_begin(scope, MDBX_chk_info), + "fixed data-size %" PRIuSIZE, data.iov_len)); + prev_data = data; + } - DEBUG("remove dirtied parent's spilled %u page %" PRIaPGNO, npages, - dirty_pgno_form); - spill_remove(parent, s, 1); - s += step; - } + record_count++; + histogram_acc(key.iov_len, &sdb->histogram.key_len); + histogram_acc(data.iov_len, &sdb->histogram.val_len); - /* Squash deleted pagenums if we deleted any */ - spill_purge(parent); + const node_t *const node = + page_node(cursor->pg[cursor->top], cursor->ki[cursor->top]); + if (node_flags(node) == N_SUBDATA) { + if (dbi != MAIN_DBI || (sdb->flags & (MDBX_DUPSORT | MDBX_DUPFIXED | + MDBX_REVERSEDUP | MDBX_INTEGERDUP))) + chk_object_issue(scope, "entry", record_count, + "unexpected sub-database", "node-flags 0x%x", + node_flags(node)); + else if (data.iov_len != sizeof(tree_t)) + chk_object_issue(scope, "entry", record_count, + "wrong sub-database node size", + "node-size %" PRIuSIZE " != %" PRIuSIZE, data.iov_len, + sizeof(tree_t)); + else if (scope->stage == MDBX_chk_maindb) + /* подсчитываем subDB при первом проходе */ + sub_databases += 1; + else { + /* обработка subDB при втором проходе */ + tree_t aligned_db; + memcpy(&aligned_db, data.iov_base, 
sizeof(aligned_db)); + walk_sdb_t sdb_info = {.name = key}; + sdb_info.internal = &aligned_db; + MDBX_chk_subdb_t *subdb; + err = chk_get_sdb(scope, &sdb_info, &subdb); + if (unlikely(err)) + goto bailout; + if (subdb->cookie) { + err = chk_scope_begin( + chk, 0, MDBX_chk_subdbs, subdb, &usr->result.problems_kv, + "Processing subDB %s...", chk_v2a(chk, &subdb->name)); + if (likely(!err)) { + err = chk_db(usr->scope, (MDBX_dbi)-1, subdb, chk_handle_kv); + if (err != MDBX_EINTR && err != MDBX_RESULT_TRUE) + usr->result.subdb_processed += 1; + } + err = chk_scope_restore(scope, err); + if (unlikely(err)) + goto bailout; + } else + chk_line_end(chk_flush( + chk_print(chk_line_begin(scope, MDBX_chk_processing), + "Skip processing %s...", chk_v2a(chk, &subdb->name)))); + } + } else if (handler) { + err = handler(scope, sdb, record_count, &key, &data); + if (unlikely(err)) + goto bailout; } - } - /* Remove anything in our spill list from parent's dirty list */ - if (txn->tw.spilled.list) { - tASSERT(txn, pnl_check_allocated(txn->tw.spilled.list, - (size_t)parent->mt_next_pgno << 1)); - dpl_sift(parent, txn->tw.spilled.list, true); - tASSERT(parent, - parent->tw.dirtyroom + parent->tw.dirtylist->length == - (parent->mt_parent ? 
parent->mt_parent->tw.dirtyroom - : parent->mt_env->me_options.dp_limit)); + err = mdbx_cursor_get(cursor, &key, &data, MDBX_NEXT); } - /* Find length of merging our dirty list with parent's and release - * filter-out pages */ - for (l = 0, d = dst->length, s = src->length; d > 0 && s > 0;) { - MDBX_page *sp = src->items[s].ptr; - tASSERT(parent, (sp->mp_flags & ~(P_LEAF | P_LEAF2 | P_BRANCH | P_OVERFLOW | - P_LOOSE | P_SPILLED)) == 0); - const unsigned s_npages = dpl_npages(src, s); - const pgno_t s_pgno = src->items[s].pgno; - - MDBX_page *dp = dst->items[d].ptr; - tASSERT(parent, (dp->mp_flags & ~(P_LEAF | P_LEAF2 | P_BRANCH | P_OVERFLOW | - P_SPILLED)) == 0); - const unsigned d_npages = dpl_npages(dst, d); - const pgno_t d_pgno = dst->items[d].pgno; - - if (d_pgno >= s_pgno + s_npages) { - --d; - ++l; - } else if (d_pgno + d_npages <= s_pgno) { - if (sp->mp_flags != P_LOOSE) { - sp->mp_txnid = parent->mt_front; - sp->mp_flags &= ~P_SPILLED; + err = (err != MDBX_NOTFOUND) ? chk_error_rc(scope, err, "mdbx_cursor_get") + : MDBX_SUCCESS; + if (err == MDBX_SUCCESS && record_count != db->items) + chk_scope_issue(scope, + "different number of entries %" PRIuSIZE " != %" PRIu64, + record_count, db->items); +bailout: + if (cursor) { + if (handler) { + if (sdb->histogram.key_len.count) { + MDBX_chk_line_t *line = chk_line_begin(scope, MDBX_chk_info); + line = histogram_dist(line, &sdb->histogram.key_len, + "key length density", "0/1", false); + chk_line_feed(line); + line = histogram_dist(line, &sdb->histogram.val_len, + "value length density", "0/1", false); + chk_line_end(line); } - --s; - ++l; - } else { - dst->items[d--].ptr = nullptr; - dpage_free(txn->mt_env, dp, d_npages); + if (scope->stage == MDBX_chk_maindb) + usr->result.subdb_total = sub_databases; + if (chk->cb->subdb_conclude) + err = chk->cb->subdb_conclude(usr, sdb, cursor, err); + MDBX_chk_line_t *line = chk_line_begin(scope, MDBX_chk_resolution); + line = chk_print(line, "summary: %" PRIuSIZE " 
records,", record_count); + if (dups || (sdb->flags & (MDBX_DUPSORT | MDBX_DUPFIXED | + MDBX_REVERSEDUP | MDBX_INTEGERDUP))) + line = chk_print(line, " %" PRIuSIZE " dups,", dups); + if (sub_databases || dbi == MAIN_DBI) + line = chk_print(line, " %" PRIuSIZE " sub-databases,", sub_databases); + line = chk_print(line, + " %" PRIuSIZE " key's bytes," + " %" PRIuSIZE " data's bytes," + " %" PRIuSIZE " problem(s)", + sdb->histogram.key_len.amount, + sdb->histogram.val_len.amount, scope->subtotal_issues); + chk_line_end(chk_flush(line)); } - } - assert(dst->sorted == dst->length); - tASSERT(parent, dst->detent >= l + d + s); - dst->sorted = l + d + s; /* the merged length */ - while (s > 0) { - MDBX_page *sp = src->items[s].ptr; - tASSERT(parent, (sp->mp_flags & ~(P_LEAF | P_LEAF2 | P_BRANCH | P_OVERFLOW | - P_LOOSE | P_SPILLED)) == 0); - if (sp->mp_flags != P_LOOSE) { - sp->mp_txnid = parent->mt_front; - sp->mp_flags &= ~P_SPILLED; - } - --s; + mdbx_cursor_close(cursor); + if (!txn->cursors[dbi] && (txn->dbi_state[dbi] & DBI_FRESH)) + mdbx_dbi_close(env, dbi); } + return err; +} - /* Merge our dirty list into parent's, i.e. 
merge(dst, src) -> dst */ - if (dst->sorted >= dst->length) { - /* from end to begin with dst extending */ - for (l = dst->sorted, s = src->length, d = dst->length; s > 0 && d > 0;) { - if (unlikely(l <= d)) { - /* squash to get a gap of free space for merge */ - for (r = w = 1; r <= d; ++r) - if (dst->items[r].ptr) { - if (w != r) { - dst->items[w] = dst->items[r]; - dst->items[r].ptr = nullptr; - } - ++w; +__cold static int chk_handle_gc(MDBX_chk_scope_t *const scope, + MDBX_chk_subdb_t *sdb, + const size_t record_number, const MDBX_val *key, + const MDBX_val *data) { + MDBX_chk_internal_t *const chk = scope->internal; + MDBX_chk_context_t *const usr = chk->usr; + assert(sdb == &chk->subdb_gc); + (void)sdb; + const char *bad = ""; + pgno_t *iptr = data->iov_base; + + if (key->iov_len != sizeof(txnid_t)) + chk_object_issue(scope, "entry", record_number, "wrong txn-id size", + "key-size %" PRIuSIZE, key->iov_len); + else { + txnid_t txnid; + memcpy(&txnid, key->iov_base, sizeof(txnid)); + if (txnid < 1 || txnid > usr->txn->txnid) + chk_object_issue(scope, "entry", record_number, "wrong txn-id", + "%" PRIaTXN, txnid); + else { + if (data->iov_len < sizeof(pgno_t) || data->iov_len % sizeof(pgno_t)) + chk_object_issue(scope, "entry", txnid, "wrong idl size", "%" PRIuPTR, + data->iov_len); + size_t number = (data->iov_len >= sizeof(pgno_t)) ? *iptr++ : 0; + if (number > PAGELIST_LIMIT) + chk_object_issue(scope, "entry", txnid, "wrong idl length", "%" PRIuPTR, + number); + else if ((number + 1) * sizeof(pgno_t) > data->iov_len) { + chk_object_issue(scope, "entry", txnid, "trimmed idl", + "%" PRIuSIZE " > %" PRIuSIZE " (corruption)", + (number + 1) * sizeof(pgno_t), data->iov_len); + number = data->iov_len / sizeof(pgno_t) - 1; + } else if (data->iov_len - (number + 1) * sizeof(pgno_t) >= + /* LY: allow gap up to one page. 
it is ok + * and better than shink-and-retry inside gc_update() */ + usr->env->ps) + chk_object_issue(scope, "entry", txnid, "extra idl space", + "%" PRIuSIZE " < %" PRIuSIZE " (minor, not a trouble)", + (number + 1) * sizeof(pgno_t), data->iov_len); + + usr->result.gc_pages += number; + if (chk->envinfo.mi_latter_reader_txnid > txnid) + usr->result.reclaimable_pages += number; + + size_t prev = + MDBX_PNL_ASCENDING ? NUM_METAS - 1 : usr->txn->geo.first_unallocated; + size_t span = 1; + for (size_t i = 0; i < number; ++i) { + const size_t pgno = iptr[i]; + if (pgno < NUM_METAS) + chk_object_issue(scope, "entry", txnid, "wrong idl entry", + "pgno %" PRIuSIZE " < meta-pages %u", pgno, + NUM_METAS); + else if (pgno >= usr->result.backed_pages) + chk_object_issue(scope, "entry", txnid, "wrong idl entry", + "pgno %" PRIuSIZE " > backed-pages %" PRIuSIZE, pgno, + usr->result.backed_pages); + else if (pgno >= usr->result.alloc_pages) + chk_object_issue(scope, "entry", txnid, "wrong idl entry", + "pgno %" PRIuSIZE " > alloc-pages %" PRIuSIZE, pgno, + usr->result.alloc_pages - 1); + else { + if (MDBX_PNL_DISORDERED(prev, pgno)) { + bad = " [bad sequence]"; + chk_object_issue( + scope, "entry", txnid, "bad sequence", + "%" PRIuSIZE " %c [%" PRIuSIZE "].%" PRIuSIZE, prev, + (prev == pgno) ? '=' : (MDBX_PNL_ASCENDING ? '>' : '<'), i, + pgno); } - NOTICE("squash to begin for extending-merge %zu -> %zu", d, w - 1); - d = w - 1; - continue; - } - assert(l > d); - if (dst->items[d].ptr) { - dst->items[l--] = (dst->items[d].pgno > src->items[s].pgno) - ? 
dst->items[d--] - : src->items[s--]; - } else - --d; - } - if (s > 0) { - assert(l == s); - while (d > 0) { - assert(dst->items[d].ptr == nullptr); - --d; - } - do { - assert(l > 0); - dst->items[l--] = src->items[s--]; - } while (s > 0); - } else { - assert(l == d); - while (l > 0) { - assert(dst->items[l].ptr != nullptr); - --l; - } - } - } else { - /* from begin to end with shrinking (a lot of new large/overflow pages) */ - for (l = s = d = 1; s <= src->length && d <= dst->length;) { - if (unlikely(l >= d)) { - /* squash to get a gap of free space for merge */ - for (r = w = dst->length; r >= d; --r) - if (dst->items[r].ptr) { - if (w != r) { - dst->items[w] = dst->items[r]; - dst->items[r].ptr = nullptr; - } - --w; + if (chk->pagemap) { + const intptr_t id = chk->pagemap[pgno]; + if (id == 0) + chk->pagemap[pgno] = -1 /* mark the pgno listed in GC */; + else if (id > 0) { + assert(id - 1 <= (intptr_t)ARRAY_LENGTH(chk->subdb)); + chk_object_issue(scope, "page", pgno, "already used", "by %s", + chk_v2a(chk, &chk->subdb[id - 1]->name)); + } else + chk_object_issue(scope, "page", pgno, "already listed in GC", + nullptr); } - NOTICE("squash to end for shrinking-merge %zu -> %zu", d, w + 1); - d = w + 1; - continue; - } - assert(l < d); - if (dst->items[d].ptr) { - dst->items[l++] = (dst->items[d].pgno < src->items[s].pgno) - ? dst->items[d++] - : src->items[s++]; - } else - ++d; - } - if (s <= src->length) { - assert(dst->sorted - l == src->length - s); - while (d <= dst->length) { - assert(dst->items[d].ptr == nullptr); - --d; + } + prev = pgno; + while (i + span < number && + iptr[i + span] == (MDBX_PNL_ASCENDING ? 
pgno_add(pgno, span) + : pgno_sub(pgno, span))) + ++span; } - do { - assert(l <= dst->sorted); - dst->items[l++] = src->items[s++]; - } while (s <= src->length); - } else { - assert(dst->sorted - l == dst->length - d); - while (l <= dst->sorted) { - assert(l <= d && d <= dst->length && dst->items[d].ptr); - dst->items[l++] = dst->items[d++]; + if (sdb->cookie) { + chk_line_end(chk_print(chk_line_begin(scope, MDBX_chk_details), + "transaction %" PRIaTXN ", %" PRIuSIZE + " pages, maxspan %" PRIuSIZE "%s", + txnid, number, span, bad)); + for (size_t i = 0; i < number; i += span) { + const size_t pgno = iptr[i]; + for (span = 1; + i + span < number && + iptr[i + span] == (MDBX_PNL_ASCENDING ? pgno_add(pgno, span) + : pgno_sub(pgno, span)); + ++span) + ; + histogram_acc(span, &sdb->histogram.nested_tree); + MDBX_chk_line_t *line = chk_line_begin(scope, MDBX_chk_extra); + if (line) { + if (span > 1) + line = + chk_print(line, "%9" PRIuSIZE "[%" PRIuSIZE "]", pgno, span); + else + line = chk_print(line, "%9" PRIuSIZE, pgno); + chk_line_end(line); + int err = chk_check_break(scope); + if (err) + return err; + } + } } } } - parent->tw.dirtyroom -= dst->sorted - dst->length; - assert(parent->tw.dirtyroom <= parent->mt_env->me_options.dp_limit); - dpl_setlen(dst, dst->sorted); - parent->tw.dirtylru = txn->tw.dirtylru; - - /* В текущем понимании выгоднее пересчитать кол-во страниц, - * чем подмешивать лишние ветвления и вычисления в циклы выше. */ - dst->pages_including_loose = 0; - for (r = 1; r <= dst->length; ++r) - dst->pages_including_loose += dpl_npages(dst, r); - - tASSERT(parent, dirtylist_check(parent)); - dpl_free(txn); + return chk_check_break(scope); +} - if (txn->tw.spilled.list) { - if (parent->tw.spilled.list) { - /* Must not fail since space was preserved above. 
*/ - pnl_merge(parent->tw.spilled.list, txn->tw.spilled.list); - pnl_free(txn->tw.spilled.list); - } else { - parent->tw.spilled.list = txn->tw.spilled.list; - parent->tw.spilled.least_removed = txn->tw.spilled.least_removed; - } - tASSERT(parent, dirtylist_check(parent)); - } - - parent->mt_flags &= ~MDBX_TXN_HAS_CHILD; - if (parent->tw.spilled.list) { - assert(pnl_check_allocated(parent->tw.spilled.list, - (size_t)parent->mt_next_pgno << 1)); - if (MDBX_PNL_GETSIZE(parent->tw.spilled.list)) - parent->mt_flags |= MDBX_TXN_SPILLS; - } -} - -static void take_gcprof(MDBX_txn *txn, MDBX_commit_latency *latency) { - MDBX_env *const env = txn->mt_env; - if (MDBX_ENABLE_PROFGC) { - pgop_stat_t *const ptr = &env->me_lck->mti_pgop_stat; - latency->gc_prof.work_counter = ptr->gc_prof.work.spe_counter; - latency->gc_prof.work_rtime_monotonic = - osal_monotime_to_16dot16(ptr->gc_prof.work.rtime_monotonic); - latency->gc_prof.work_xtime_cpu = - osal_monotime_to_16dot16(ptr->gc_prof.work.xtime_cpu); - latency->gc_prof.work_rsteps = ptr->gc_prof.work.rsteps; - latency->gc_prof.work_xpages = ptr->gc_prof.work.xpages; - latency->gc_prof.work_majflt = ptr->gc_prof.work.majflt; - - latency->gc_prof.self_counter = ptr->gc_prof.self.spe_counter; - latency->gc_prof.self_rtime_monotonic = - osal_monotime_to_16dot16(ptr->gc_prof.self.rtime_monotonic); - latency->gc_prof.self_xtime_cpu = - osal_monotime_to_16dot16(ptr->gc_prof.self.xtime_cpu); - latency->gc_prof.self_rsteps = ptr->gc_prof.self.rsteps; - latency->gc_prof.self_xpages = ptr->gc_prof.self.xpages; - latency->gc_prof.self_majflt = ptr->gc_prof.self.majflt; - - latency->gc_prof.wloops = ptr->gc_prof.wloops; - latency->gc_prof.coalescences = ptr->gc_prof.coalescences; - latency->gc_prof.wipes = ptr->gc_prof.wipes; - latency->gc_prof.flushes = ptr->gc_prof.flushes; - latency->gc_prof.kicks = ptr->gc_prof.kicks; - if (txn == env->me_txn0) - memset(&ptr->gc_prof, 0, sizeof(ptr->gc_prof)); - } else - memset(&latency->gc_prof, 0, 
sizeof(latency->gc_prof)); -} - -int mdbx_txn_commit_ex(MDBX_txn *txn, MDBX_commit_latency *latency) { - STATIC_ASSERT(MDBX_TXN_FINISHED == - MDBX_TXN_BLOCKED - MDBX_TXN_HAS_CHILD - MDBX_TXN_ERROR); - const uint64_t ts_0 = latency ? osal_monotime() : 0; - uint64_t ts_1 = 0, ts_2 = 0, ts_3 = 0, ts_4 = 0, ts_5 = 0, gc_cputime = 0; - - int rc = check_txn(txn, MDBX_TXN_FINISHED); - if (unlikely(rc != MDBX_SUCCESS)) { - if (latency) - memset(latency, 0, sizeof(*latency)); - return rc; - } - - MDBX_env *const env = txn->mt_env; -#if MDBX_ENV_CHECKPID - if (unlikely(env->me_pid != osal_getpid())) { - env->me_flags |= MDBX_FATAL_ERROR; - if (latency) - memset(latency, 0, sizeof(*latency)); - return MDBX_PANIC; - } -#endif /* MDBX_ENV_CHECKPID */ - - if (unlikely(txn->mt_flags & MDBX_TXN_ERROR)) { - rc = MDBX_RESULT_TRUE; - goto fail; - } - - /* txn_end() mode for a commit which writes nothing */ - unsigned end_mode = - TXN_END_PURE_COMMIT | TXN_END_UPDATE | TXN_END_SLOT | TXN_END_FREE; - if (unlikely(txn->mt_flags & MDBX_TXN_RDONLY)) - goto done; - - if ((txn->mt_flags & MDBX_NOSTICKYTHREADS) && - unlikely(txn->mt_owner != osal_thread_self())) { - rc = MDBX_THREAD_MISMATCH; - goto fail; - } - - if (txn->mt_child) { - rc = mdbx_txn_commit_ex(txn->mt_child, NULL); - tASSERT(txn, txn->mt_child == NULL); - if (unlikely(rc != MDBX_SUCCESS)) - goto fail; - } +__cold static int env_chk(MDBX_chk_scope_t *const scope) { + MDBX_chk_internal_t *const chk = scope->internal; + MDBX_chk_context_t *const usr = chk->usr; + MDBX_env *const env = usr->env; + MDBX_txn *const txn = usr->txn; + int err = + env_info(env, txn, &chk->envinfo, sizeof(chk->envinfo), &chk->troika); + if (unlikely(err)) + return chk_error_rc(scope, err, "env_info"); - if (unlikely(txn != env->me_txn)) { - DEBUG("%s", "attempt to commit unknown transaction"); - rc = MDBX_EINVAL; - goto fail; - } + MDBX_chk_line_t *line = + chk_puts(chk_line_begin(scope, MDBX_chk_info), "current boot-id "); + if 
(chk->envinfo.mi_bootid.current.x | chk->envinfo.mi_bootid.current.y) + line = chk_print(line, "%016" PRIx64 "-%016" PRIx64, + chk->envinfo.mi_bootid.current.x, + chk->envinfo.mi_bootid.current.y); + else + line = chk_puts(line, "unavailable"); + chk_line_end(line); - if (txn->mt_parent) { - tASSERT(txn, audit_ex(txn, 0, false) == 0); - eASSERT(env, txn != env->me_txn0); - MDBX_txn *const parent = txn->mt_parent; - eASSERT(env, parent->mt_signature == MDBX_MT_SIGNATURE); - eASSERT(env, parent->mt_child == txn && - (parent->mt_flags & MDBX_TXN_HAS_CHILD) != 0); - eASSERT(env, dirtylist_check(txn)); - - if (txn->tw.dirtylist->length == 0 && !(txn->mt_flags & MDBX_TXN_DIRTY) && - parent->mt_numdbs == txn->mt_numdbs) { - TXN_FOREACH_DBI_ALL(txn, i) { - tASSERT(txn, (txn->mt_dbi_state[i] & DBI_DIRTY) == 0); - if ((txn->mt_dbi_state[i] & DBI_STALE) && - !(parent->mt_dbi_state[i] & DBI_STALE)) - tASSERT(txn, memcmp(&parent->mt_dbs[i], &txn->mt_dbs[i], - sizeof(MDBX_db)) == 0); - } + err = osal_filesize(env->lazy_fd, &env->dxb_mmap.filesize); + if (unlikely(err)) + return chk_error_rc(scope, err, "osal_filesize"); - tASSERT(txn, memcmp(&parent->mt_geo, &txn->mt_geo, - sizeof(parent->mt_geo)) == 0); - tASSERT(txn, memcmp(&parent->mt_canary, &txn->mt_canary, - sizeof(parent->mt_canary)) == 0); - tASSERT(txn, !txn->tw.spilled.list || - MDBX_PNL_GETSIZE(txn->tw.spilled.list) == 0); - tASSERT(txn, txn->tw.loose_count == 0); + //-------------------------------------------------------------------------- - /* fast completion of pure nested transaction */ - VERBOSE("fast-complete pure nested txn %" PRIaTXN, txn->mt_txnid); - end_mode = TXN_END_PURE_COMMIT | TXN_END_SLOT | TXN_END_FREE; - goto done; + err = chk_scope_begin(chk, 1, MDBX_chk_meta, nullptr, + &usr->result.problems_meta, "Peek the meta-pages..."); + if (likely(!err)) { + MDBX_chk_scope_t *const inner = usr->scope; + const uint64_t dxbfile_pages = env->dxb_mmap.filesize >> env->ps2ln; + usr->result.alloc_pages = 
txn->geo.first_unallocated; + usr->result.backed_pages = bytes2pgno(env, env->dxb_mmap.current); + if (unlikely(usr->result.backed_pages > dxbfile_pages)) + chk_scope_issue(inner, "backed-pages %zu > file-pages %" PRIu64, + usr->result.backed_pages, dxbfile_pages); + if (unlikely(dxbfile_pages < NUM_METAS)) + chk_scope_issue(inner, "file-pages %" PRIu64 " < %u", dxbfile_pages, + NUM_METAS); + if (unlikely(usr->result.backed_pages < NUM_METAS)) + chk_scope_issue(inner, "backed-pages %zu < %u", usr->result.backed_pages, + NUM_METAS); + if (unlikely(usr->result.backed_pages < NUM_METAS)) { + chk_scope_issue(inner, "backed-pages %zu < num-metas %u", + usr->result.backed_pages, NUM_METAS); + return MDBX_CORRUPTED; } - - /* Preserve space for spill list to avoid parent's state corruption - * if allocation fails. */ - const size_t parent_retired_len = (uintptr_t)parent->tw.retired_pages; - tASSERT(txn, parent_retired_len <= MDBX_PNL_GETSIZE(txn->tw.retired_pages)); - const size_t retired_delta = - MDBX_PNL_GETSIZE(txn->tw.retired_pages) - parent_retired_len; - if (retired_delta) { - rc = pnl_need(&txn->tw.relist, retired_delta); - if (unlikely(rc != MDBX_SUCCESS)) - goto fail; + if (unlikely(dxbfile_pages < NUM_METAS)) { + chk_scope_issue(inner, "backed-pages %zu < num-metas %u", + usr->result.backed_pages, NUM_METAS); + return MDBX_CORRUPTED; } - - if (txn->tw.spilled.list) { - if (parent->tw.spilled.list) { - rc = pnl_need(&parent->tw.spilled.list, - MDBX_PNL_GETSIZE(txn->tw.spilled.list)); - if (unlikely(rc != MDBX_SUCCESS)) - goto fail; - } - spill_purge(txn); + if (unlikely(usr->result.backed_pages > (size_t)MAX_PAGENO + 1)) { + chk_scope_issue(inner, "backed-pages %zu > max-pages %zu", + usr->result.backed_pages, (size_t)MAX_PAGENO + 1); + usr->result.backed_pages = MAX_PAGENO + 1; } - if (unlikely(txn->tw.dirtylist->length + parent->tw.dirtylist->length > - parent->tw.dirtylist->detent && - !dpl_reserve(parent, txn->tw.dirtylist->length + - 
parent->tw.dirtylist->length))) { - rc = MDBX_ENOMEM; - goto fail; + if ((env->flags & (MDBX_EXCLUSIVE | MDBX_RDONLY)) != MDBX_RDONLY) { + if (unlikely(usr->result.backed_pages > dxbfile_pages)) { + chk_scope_issue(inner, "backed-pages %zu > file-pages %" PRIu64, + usr->result.backed_pages, dxbfile_pages); + usr->result.backed_pages = (size_t)dxbfile_pages; + } + if (unlikely(usr->result.alloc_pages > usr->result.backed_pages)) { + chk_scope_issue(scope, "alloc-pages %zu > backed-pages %zu", + usr->result.alloc_pages, usr->result.backed_pages); + usr->result.alloc_pages = usr->result.backed_pages; + } + } else { + /* DB may be shrunk by writer down to the allocated (but unused) pages. */ + if (unlikely(usr->result.alloc_pages > usr->result.backed_pages)) { + chk_scope_issue(inner, "alloc-pages %zu > backed-pages %zu", + usr->result.alloc_pages, usr->result.backed_pages); + usr->result.alloc_pages = usr->result.backed_pages; + } + if (unlikely(usr->result.alloc_pages > dxbfile_pages)) { + chk_scope_issue(inner, "alloc-pages %zu > file-pages %" PRIu64, + usr->result.alloc_pages, dxbfile_pages); + usr->result.alloc_pages = (size_t)dxbfile_pages; + } + if (unlikely(usr->result.backed_pages > dxbfile_pages)) + usr->result.backed_pages = (size_t)dxbfile_pages; } - //------------------------------------------------------------------------- - - parent->tw.lifo_reclaimed = txn->tw.lifo_reclaimed; - txn->tw.lifo_reclaimed = NULL; - - parent->tw.retired_pages = txn->tw.retired_pages; - txn->tw.retired_pages = NULL; - - pnl_free(parent->tw.relist); - parent->tw.relist = txn->tw.relist; - txn->tw.relist = NULL; - parent->tw.gc_time_acc = txn->tw.gc_time_acc; - parent->tw.last_reclaimed = txn->tw.last_reclaimed; - - parent->mt_geo = txn->mt_geo; - parent->mt_canary = txn->mt_canary; - parent->mt_flags |= txn->mt_flags & MDBX_TXN_DIRTY; - - /* Move loose pages to parent */ -#if MDBX_ENABLE_REFUND - parent->tw.loose_refund_wl = txn->tw.loose_refund_wl; -#endif /* 
MDBX_ENABLE_REFUND */ - parent->tw.loose_count = txn->tw.loose_count; - parent->tw.loose_pages = txn->tw.loose_pages; - - /* Merge our cursors into parent's and close them */ - cursors_eot(txn, true); - end_mode |= TXN_END_EOTDONE; + line = chk_line_feed(chk_print( + chk_line_begin(inner, MDBX_chk_info), + "pagesize %u (%u system), max keysize %u..%u" + ", max readers %u", + env->ps, globals.sys_pagesize, + mdbx_env_get_maxkeysize_ex(env, MDBX_DUPSORT), + mdbx_env_get_maxkeysize_ex(env, MDBX_DB_DEFAULTS), env->max_readers)); + line = chk_line_feed( + chk_print_size(line, "mapsize ", env->dxb_mmap.current, nullptr)); + if (txn->geo.lower == txn->geo.upper) + line = chk_print_size( + line, "fixed datafile: ", chk->envinfo.mi_geo.current, nullptr); + else { + line = chk_print_size( + line, "dynamic datafile: ", chk->envinfo.mi_geo.lower, nullptr); + line = chk_print_size(line, " .. ", chk->envinfo.mi_geo.upper, ", "); + line = chk_print_size(line, "+", chk->envinfo.mi_geo.grow, ", "); - /* Update parent's DBs array */ - eASSERT(env, parent->mt_numdbs == txn->mt_numdbs); - TXN_FOREACH_DBI_ALL(txn, dbi) { - if (txn->mt_dbi_state[dbi] & (DBI_CREAT | DBI_FRESH | DBI_DIRTY)) { - parent->mt_dbs[dbi] = txn->mt_dbs[dbi]; - /* preserve parent's status */ - const uint8_t state = - txn->mt_dbi_state[dbi] | - (parent->mt_dbi_state[dbi] & (DBI_CREAT | DBI_FRESH | DBI_DIRTY)); - DEBUG("dbi %zu dbi-state %s 0x%02x -> 0x%02x", dbi, - (parent->mt_dbi_state[dbi] != state) ? 
"update" : "still", - parent->mt_dbi_state[dbi], state); - parent->mt_dbi_state[dbi] = state; - } else { - eASSERT(env, txn->mt_dbi_state[dbi] == - (parent->mt_dbi_state[dbi] & - ~(DBI_FRESH | DBI_CREAT | DBI_DIRTY))); - } + line = chk_line_feed( + chk_print_size(line, "-", chk->envinfo.mi_geo.shrink, nullptr)); + line = chk_print_size( + line, "current datafile: ", chk->envinfo.mi_geo.current, nullptr); } - - if (latency) { - ts_1 = osal_monotime(); - ts_2 = /* no gc-update */ ts_1; - ts_3 = /* no audit */ ts_2; - ts_4 = /* no write */ ts_3; - ts_5 = /* no sync */ ts_4; + tASSERT(txn, txn->geo.now == chk->envinfo.mi_geo.current / + chk->envinfo.mi_dxb_pagesize); + chk_line_end(chk_print(line, ", %u pages", txn->geo.now)); +#if defined(_WIN32) || defined(_WIN64) || MDBX_DEBUG + if (txn->geo.shrink_pv && txn->geo.now != txn->geo.upper && + scope->verbosity >= MDBX_chk_verbose) { + line = chk_line_begin(inner, MDBX_chk_notice); + chk_line_feed(chk_print( + line, " > WARNING: Due Windows system limitations a file couldn't")); + chk_line_feed(chk_print( + line, " > be truncated while the database is opened. 
So, the size")); + chk_line_feed(chk_print( + line, " > database file of may by large than the database itself,")); + chk_line_end(chk_print( + line, " > until it will be closed or reopened in read-write mode.")); } - txn_merge(parent, txn, parent_retired_len); - env->me_txn = parent; - parent->mt_child = NULL; - tASSERT(parent, dirtylist_check(parent)); +#endif /* Windows || Debug */ + chk_verbose_meta(inner, 0); + chk_verbose_meta(inner, 1); + chk_verbose_meta(inner, 2); -#if MDBX_ENABLE_REFUND - txn_refund(parent); - if (ASSERT_ENABLED()) { - /* Check parent's loose pages not suitable for refund */ - for (MDBX_page *lp = parent->tw.loose_pages; lp; lp = mp_next(lp)) { - tASSERT(parent, lp->mp_pgno < parent->tw.loose_refund_wl && - lp->mp_pgno + 1 < parent->mt_next_pgno); - MDBX_ASAN_UNPOISON_MEMORY_REGION(&mp_next(lp), sizeof(MDBX_page *)); - VALGRIND_MAKE_MEM_DEFINED(&mp_next(lp), sizeof(MDBX_page *)); + if (env->stuck_meta >= 0) { + chk_line_end(chk_print(chk_line_begin(inner, MDBX_chk_processing), + "skip checking meta-pages since the %u" + " is selected for verification", + env->stuck_meta)); + line = chk_line_feed( + chk_print(chk_line_begin(inner, MDBX_chk_resolution), + "transactions: recent %" PRIu64 ", " + "selected for verification %" PRIu64 ", lag %" PRIi64, + chk->envinfo.mi_recent_txnid, + chk->envinfo.mi_meta_txnid[env->stuck_meta], + chk->envinfo.mi_recent_txnid - + chk->envinfo.mi_meta_txnid[env->stuck_meta])); + chk_line_end(line); + } else { + chk_line_end(chk_puts(chk_line_begin(inner, MDBX_chk_verbose), + "performs check for meta-pages clashes")); + const unsigned meta_clash_mask = meta_eq_mask(&chk->troika); + if (meta_clash_mask & 1) + chk_scope_issue(inner, "meta-%d and meta-%d are clashed", 0, 1); + if (meta_clash_mask & 2) + chk_scope_issue(inner, "meta-%d and meta-%d are clashed", 1, 2); + if (meta_clash_mask & 4) + chk_scope_issue(inner, "meta-%d and meta-%d are clashed", 2, 0); + + const unsigned prefer_steady_metanum = 
chk->troika.prefer_steady; + const uint64_t prefer_steady_txnid = + chk->troika.txnid[prefer_steady_metanum]; + const unsigned recent_metanum = chk->troika.recent; + const uint64_t recent_txnid = chk->troika.txnid[recent_metanum]; + if (env->flags & MDBX_EXCLUSIVE) { + chk_line_end( + chk_puts(chk_line_begin(inner, MDBX_chk_verbose), + "performs full check recent-txn-id with meta-pages")); + eASSERT(env, recent_txnid == chk->envinfo.mi_recent_txnid); + if (prefer_steady_txnid != recent_txnid) { + if ((chk->flags & MDBX_CHK_READWRITE) != 0 && + (env->flags & MDBX_RDONLY) == 0 && + recent_txnid > prefer_steady_txnid && + (chk->envinfo.mi_bootid.current.x | + chk->envinfo.mi_bootid.current.y) != 0 && + chk->envinfo.mi_bootid.current.x == + chk->envinfo.mi_bootid.meta[recent_metanum].x && + chk->envinfo.mi_bootid.current.y == + chk->envinfo.mi_bootid.meta[recent_metanum].y) { + chk_line_end( + chk_print(chk_line_begin(inner, MDBX_chk_verbose), + "recent meta-%u is weak, but boot-id match current" + " (will synced upon successful check)", + recent_metanum)); + } else + chk_scope_issue( + inner, + "steady meta-%d txn-id mismatch recent-txn-id (%" PRIi64 + " != %" PRIi64 ")", + prefer_steady_metanum, prefer_steady_txnid, recent_txnid); + } + } else if (chk->write_locked) { + chk_line_end( + chk_puts(chk_line_begin(inner, MDBX_chk_verbose), + "performs lite check recent-txn-id with meta-pages (not a " + "monopolistic mode)")); + if (recent_txnid != chk->envinfo.mi_recent_txnid) { + chk_scope_issue(inner, + "weak meta-%d txn-id mismatch recent-txn-id (%" PRIi64 + " != %" PRIi64 ")", + recent_metanum, recent_txnid, + chk->envinfo.mi_recent_txnid); + } + } else { + chk_line_end(chk_puts( + chk_line_begin(inner, MDBX_chk_verbose), + "skip check recent-txn-id with meta-pages (monopolistic or " + "read-write mode only)")); } - /* Check parent's reclaimed pages not suitable for refund */ - if (MDBX_PNL_GETSIZE(parent->tw.relist)) - tASSERT(parent, - 
MDBX_PNL_MOST(parent->tw.relist) + 1 < parent->mt_next_pgno); - } -#endif /* MDBX_ENABLE_REFUND */ - txn->mt_signature = 0; - osal_free(txn); - tASSERT(parent, audit_ex(parent, 0, false) == 0); - rc = MDBX_SUCCESS; - goto provide_latency; + chk_line_end(chk_print( + chk_line_begin(inner, MDBX_chk_resolution), + "transactions: recent %" PRIu64 ", latter reader %" PRIu64 + ", lag %" PRIi64, + chk->envinfo.mi_recent_txnid, chk->envinfo.mi_latter_reader_txnid, + chk->envinfo.mi_recent_txnid - chk->envinfo.mi_latter_reader_txnid)); + } } + err = chk_scope_restore(scope, err); - if (!txn->tw.dirtylist) { - tASSERT(txn, (txn->mt_flags & MDBX_WRITEMAP) != 0 && !MDBX_AVOID_MSYNC); - } else { - tASSERT(txn, (txn->mt_flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC); - tASSERT(txn, txn->tw.dirtyroom + txn->tw.dirtylist->length == - (txn->mt_parent ? txn->mt_parent->tw.dirtyroom - : env->me_options.dp_limit)); - } - cursors_eot(txn, false); - end_mode |= TXN_END_EOTDONE; + //-------------------------------------------------------------------------- - if ((!txn->tw.dirtylist || txn->tw.dirtylist->length == 0) && - (txn->mt_flags & (MDBX_TXN_DIRTY | MDBX_TXN_SPILLS)) == 0) { - TXN_FOREACH_DBI_ALL(txn, i) { - tASSERT(txn, !(txn->mt_dbi_state[i] & DBI_DIRTY)); - } -#if defined(MDBX_NOSUCCESS_EMPTY_COMMIT) && MDBX_NOSUCCESS_EMPTY_COMMIT - rc = txn_end(txn, end_mode); - if (unlikely(rc != MDBX_SUCCESS)) - goto fail; - rc = MDBX_RESULT_TRUE; - goto provide_latency; -#else - goto done; -#endif /* MDBX_NOSUCCESS_EMPTY_COMMIT */ + const char *const subj_tree = "B-Trees"; + if (chk->flags & MDBX_CHK_SKIP_BTREE_TRAVERSAL) + chk_line_end(chk_print(chk_line_begin(scope, MDBX_chk_processing), + "Skipping %s traversal...", subj_tree)); + else { + err = chk_scope_begin( + chk, -1, MDBX_chk_tree, nullptr, &usr->result.tree_problems, + "Traversal %s by txn#%" PRIaTXN "...", subj_tree, txn->txnid); + if (likely(!err)) + err = chk_tree(usr->scope); + if (usr->result.tree_problems && 
usr->result.gc_tree_problems == 0) + usr->result.gc_tree_problems = usr->result.tree_problems; + if (usr->result.tree_problems && usr->result.kv_tree_problems == 0) + usr->result.kv_tree_problems = usr->result.tree_problems; + chk_scope_restore(scope, err); } - DEBUG("committing txn %" PRIaTXN " %p on env %p, root page %" PRIaPGNO - "/%" PRIaPGNO, - txn->mt_txnid, (void *)txn, (void *)env, txn->mt_dbs[MAIN_DBI].md_root, - txn->mt_dbs[FREE_DBI].md_root); - - if (txn->mt_numdbs > CORE_DBS) { - /* Update subDB root pointers */ - MDBX_cursor_couple cx; - rc = cursor_init(&cx.outer, txn, MAIN_DBI); - if (unlikely(rc != MDBX_SUCCESS)) - goto fail; - cx.outer.mc_next = txn->mt_cursors[MAIN_DBI]; - txn->mt_cursors[MAIN_DBI] = &cx.outer; - TXN_FOREACH_DBI_USER(txn, i) { - if ((txn->mt_dbi_state[i] & DBI_DIRTY) == 0) - continue; - MDBX_db *const db = &txn->mt_dbs[i]; - DEBUG("update main's entry for sub-db %zu, mod_txnid %" PRIaTXN - " -> %" PRIaTXN, - i, db->md_mod_txnid, txn->mt_txnid); - /* Может быть mod_txnid > front после коммита вложенных тразакций */ - db->md_mod_txnid = txn->mt_txnid; - MDBX_val data = {db, sizeof(MDBX_db)}; - rc = cursor_put_nochecklen(&cx.outer, &env->me_dbxs[i].md_name, &data, - F_SUBDATA); - if (unlikely(rc != MDBX_SUCCESS)) { - txn->mt_cursors[MAIN_DBI] = cx.outer.mc_next; - goto fail; - } + const char *const subj_gc = chk_v2a(chk, MDBX_CHK_GC); + if (usr->result.gc_tree_problems > 0) + chk_line_end(chk_print( + chk_line_begin(scope, MDBX_chk_processing), + "Skip processing %s since %s is corrupted (%" PRIuSIZE " problem(s))", + subj_gc, subj_tree, + usr->result.problems_gc = usr->result.gc_tree_problems)); + else { + err = chk_scope_begin( + chk, -1, MDBX_chk_gc, &chk->subdb_gc, &usr->result.problems_gc, + "Processing %s by txn#%" PRIaTXN "...", subj_gc, txn->txnid); + if (likely(!err)) + err = chk_db(usr->scope, FREE_DBI, &chk->subdb_gc, chk_handle_gc); + line = chk_line_begin(scope, MDBX_chk_info); + if (line) { + histogram_print(scope, 
line, &chk->subdb_gc.histogram.nested_tree, + "span(s)", "single", false); + chk_line_end(line); + } + if (usr->result.problems_gc == 0 && + (chk->flags & MDBX_CHK_SKIP_BTREE_TRAVERSAL) == 0) { + const size_t used_pages = usr->result.alloc_pages - usr->result.gc_pages; + if (usr->result.processed_pages != used_pages) + chk_scope_issue(usr->scope, + "used pages mismatch (%" PRIuSIZE + "(walked) != %" PRIuSIZE "(allocated - GC))", + usr->result.processed_pages, used_pages); + if (usr->result.unused_pages != usr->result.gc_pages) + chk_scope_issue(usr->scope, + "GC pages mismatch (%" PRIuSIZE + "(expected) != %" PRIuSIZE "(GC))", + usr->result.unused_pages, usr->result.gc_pages); } - txn->mt_cursors[MAIN_DBI] = cx.outer.mc_next; } + chk_scope_restore(scope, err); - ts_1 = latency ? osal_monotime() : 0; - - gcu_context_t gcu_ctx; - gc_cputime = latency ? osal_cputime(nullptr) : 0; - rc = gcu_context_init(txn, &gcu_ctx); - if (unlikely(rc != MDBX_SUCCESS)) - goto fail; - rc = update_gc(txn, &gcu_ctx); - gc_cputime = latency ? osal_cputime(nullptr) - gc_cputime : 0; - if (unlikely(rc != MDBX_SUCCESS)) - goto fail; + //-------------------------------------------------------------------------- - tASSERT(txn, txn->tw.loose_count == 0); - txn->mt_dbs[FREE_DBI].md_mod_txnid = (txn->mt_dbi_state[FREE_DBI] & DBI_DIRTY) - ? 
txn->mt_txnid - : txn->mt_dbs[FREE_DBI].md_mod_txnid; + err = chk_scope_begin(chk, 1, MDBX_chk_space, nullptr, nullptr, + "Page allocation:"); + const double percent_boundary_reciprocal = 100.0 / txn->geo.upper; + const double percent_backed_reciprocal = 100.0 / usr->result.backed_pages; + const size_t detained = usr->result.gc_pages - usr->result.reclaimable_pages; + const size_t available2boundary = + txn->geo.upper - usr->result.alloc_pages + usr->result.reclaimable_pages; + const size_t available2backed = usr->result.backed_pages - + usr->result.alloc_pages + + usr->result.reclaimable_pages; + const size_t remained2boundary = txn->geo.upper - usr->result.alloc_pages; + const size_t remained2backed = + usr->result.backed_pages - usr->result.alloc_pages; - txn->mt_dbs[MAIN_DBI].md_mod_txnid = (txn->mt_dbi_state[MAIN_DBI] & DBI_DIRTY) - ? txn->mt_txnid - : txn->mt_dbs[MAIN_DBI].md_mod_txnid; + const size_t used = (chk->flags & MDBX_CHK_SKIP_BTREE_TRAVERSAL) + ? usr->result.alloc_pages - usr->result.gc_pages + : usr->result.processed_pages; - ts_2 = latency ? 
osal_monotime() : 0; - ts_3 = ts_2; - if (AUDIT_ENABLED()) { - rc = audit_ex(txn, MDBX_PNL_GETSIZE(txn->tw.retired_pages), true); - ts_3 = osal_monotime(); - if (unlikely(rc != MDBX_SUCCESS)) - goto fail; - } + line = chk_line_begin(usr->scope, MDBX_chk_info); + line = chk_print(line, + "backed by file: %" PRIuSIZE " pages (%.1f%%)" + ", %" PRIuSIZE " left to boundary (%.1f%%)", + usr->result.backed_pages, + usr->result.backed_pages * percent_boundary_reciprocal, + txn->geo.upper - usr->result.backed_pages, + (txn->geo.upper - usr->result.backed_pages) * + percent_boundary_reciprocal); + line = chk_line_feed(line); - bool need_flush_for_nometasync = false; - const meta_ptr_t head = meta_recent(env, &txn->tw.troika); - const uint32_t meta_sync_txnid = - atomic_load32(&env->me_lck->mti_meta_sync_txnid, mo_Relaxed); - /* sync prev meta */ - if (head.is_steady && meta_sync_txnid != (uint32_t)head.txnid) { - /* Исправление унаследованного от LMDB недочета: - * - * Всё хорошо, если все процессы работающие с БД не используют WRITEMAP. - * Тогда мета-страница (обновленная, но не сброшенная на диск) будет - * сохранена в результате fdatasync() при записи данных этой транзакции. - * - * Всё хорошо, если все процессы работающие с БД используют WRITEMAP - * без MDBX_AVOID_MSYNC. - * Тогда мета-страница (обновленная, но не сброшенная на диск) будет - * сохранена в результате msync() при записи данных этой транзакции. - * - * Если же в процессах работающих с БД используется оба метода, как sync() - * в режиме MDBX_WRITEMAP, так и записи через файловый дескриптор, то - * становится невозможным обеспечить фиксацию на диске мета-страницы - * предыдущей транзакции и данных текущей транзакции, за счет одной - * sync-операцией выполняемой после записи данных текущей транзакции. - * Соответственно, требуется явно обновлять мета-страницу, что полностью - * уничтожает выгоду от NOMETASYNC. */ - const uint32_t txnid_dist = - ((txn->mt_flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC) - ? 
MDBX_NOMETASYNC_LAZY_FD - : MDBX_NOMETASYNC_LAZY_WRITEMAP; - /* Смысл "магии" в том, чтобы избежать отдельного вызова fdatasync() - * или msync() для гарантированной фиксации на диске мета-страницы, - * которая была "лениво" отправлена на запись в предыдущей транзакции, - * но не сброшена на диск из-за активного режима MDBX_NOMETASYNC. */ - if ( -#if defined(_WIN32) || defined(_WIN64) - !env->me_overlapped_fd && -#endif - meta_sync_txnid == (uint32_t)head.txnid - txnid_dist) - need_flush_for_nometasync = true; - else { - rc = meta_sync(env, head); - if (unlikely(rc != MDBX_SUCCESS)) { - ERROR("txn-%s: error %d", "presync-meta", rc); - goto fail; - } - } - } + line = chk_print( + line, "%s: %" PRIuSIZE " page(s), %.1f%% of backed, %.1f%% of boundary", + "used", used, used * percent_backed_reciprocal, + used * percent_boundary_reciprocal); + line = chk_line_feed(line); - if (txn->tw.dirtylist) { - tASSERT(txn, (txn->mt_flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC); - tASSERT(txn, txn->tw.loose_count == 0); + line = chk_print( + line, + "%s: %" PRIuSIZE " page(s) (%.1f%%) of backed, %" PRIuSIZE + " to boundary (%.1f%% of boundary)", + "remained", remained2backed, remained2backed * percent_backed_reciprocal, + remained2boundary, remained2boundary * percent_boundary_reciprocal); + line = chk_line_feed(line); - mdbx_filehandle_t fd = -#if defined(_WIN32) || defined(_WIN64) - env->me_overlapped_fd ? env->me_overlapped_fd : env->me_lazy_fd; - (void)need_flush_for_nometasync; -#else -#define MDBX_WRITETHROUGH_THRESHOLD_DEFAULT 2 - (need_flush_for_nometasync || - env->me_dsync_fd == INVALID_HANDLE_VALUE || - txn->tw.dirtylist->length > env->me_options.writethrough_threshold || - atomic_load64(&env->me_lck->mti_unsynced_pages, mo_Relaxed)) - ? 
env->me_lazy_fd - : env->me_dsync_fd; -#endif /* Windows */ + line = chk_print( + line, + "reclaimable: %" PRIuSIZE " (%.1f%% of backed, %.1f%% of boundary)" + ", GC %" PRIuSIZE " (%.1f%% of backed, %.1f%% of boundary)", + usr->result.reclaimable_pages, + usr->result.reclaimable_pages * percent_backed_reciprocal, + usr->result.reclaimable_pages * percent_boundary_reciprocal, + usr->result.gc_pages, usr->result.gc_pages * percent_backed_reciprocal, + usr->result.gc_pages * percent_boundary_reciprocal); + line = chk_line_feed(line); - iov_ctx_t write_ctx; - rc = iov_init(txn, &write_ctx, txn->tw.dirtylist->length, - txn->tw.dirtylist->pages_including_loose, fd, false); - if (unlikely(rc != MDBX_SUCCESS)) { - ERROR("txn-%s: error %d", "iov-init", rc); - goto fail; - } + line = chk_print( + line, + "detained by reader(s): %" PRIuSIZE + " (%.1f%% of backed, %.1f%% of boundary)" + ", %u reader(s), lag %" PRIi64, + detained, detained * percent_backed_reciprocal, + detained * percent_boundary_reciprocal, chk->envinfo.mi_numreaders, + chk->envinfo.mi_recent_txnid - chk->envinfo.mi_latter_reader_txnid); + line = chk_line_feed(line); - rc = txn_write(txn, &write_ctx); - if (unlikely(rc != MDBX_SUCCESS)) { - ERROR("txn-%s: error %d", "write", rc); - goto fail; - } - } else { - tASSERT(txn, (txn->mt_flags & MDBX_WRITEMAP) != 0 && !MDBX_AVOID_MSYNC); - env->me_lck->mti_unsynced_pages.weak += txn->tw.writemap_dirty_npages; - if (!env->me_lck->mti_eoos_timestamp.weak) - env->me_lck->mti_eoos_timestamp.weak = osal_monotime(); - } + line = chk_print( + line, "%s: %" PRIuSIZE " page(s), %.1f%% of backed, %.1f%% of boundary", + "allocated", usr->result.alloc_pages, + usr->result.alloc_pages * percent_backed_reciprocal, + usr->result.alloc_pages * percent_boundary_reciprocal); + line = chk_line_feed(line); - /* TODO: use ctx.flush_begin & ctx.flush_end for range-sync */ - ts_4 = latency ? 
osal_monotime() : 0; + line = chk_print(line, + "%s: %" PRIuSIZE " page(s) (%.1f%%) of backed, %" PRIuSIZE + " to boundary (%.1f%% of boundary)", + "available", available2backed, + available2backed * percent_backed_reciprocal, + available2boundary, + available2boundary * percent_boundary_reciprocal); + chk_line_end(line); - MDBX_meta meta; - memcpy(meta.mm_magic_and_version, head.ptr_c->mm_magic_and_version, 8); - meta.mm_extra_flags = head.ptr_c->mm_extra_flags; - meta.mm_validator_id = head.ptr_c->mm_validator_id; - meta.mm_extra_pagehdr = head.ptr_c->mm_extra_pagehdr; - unaligned_poke_u64(4, meta.mm_pages_retired, - unaligned_peek_u64(4, head.ptr_c->mm_pages_retired) + - MDBX_PNL_GETSIZE(txn->tw.retired_pages)); - meta.mm_geo = txn->mt_geo; - meta.mm_dbs[FREE_DBI] = txn->mt_dbs[FREE_DBI]; - meta.mm_dbs[MAIN_DBI] = txn->mt_dbs[MAIN_DBI]; - meta.mm_canary = txn->mt_canary; + line = chk_line_begin(usr->scope, MDBX_chk_resolution); + line = chk_print(line, "%s %" PRIaPGNO " pages", + (txn->geo.upper == txn->geo.now) ? 
"total" : "upto", + txn->geo.upper); + line = chk_print(line, ", backed %" PRIuSIZE " (%.1f%%)", + usr->result.backed_pages, + usr->result.backed_pages * percent_boundary_reciprocal); + line = chk_print(line, ", allocated %" PRIuSIZE " (%.1f%%)", + usr->result.alloc_pages, + usr->result.alloc_pages * percent_boundary_reciprocal); + line = + chk_print(line, ", available %" PRIuSIZE " (%.1f%%)", available2boundary, + available2boundary * percent_boundary_reciprocal); + chk_line_end(line); + chk_scope_restore(scope, err); - txnid_t commit_txnid = txn->mt_txnid; -#if MDBX_ENABLE_BIGFOOT - if (gcu_ctx.bigfoot > txn->mt_txnid) { - commit_txnid = gcu_ctx.bigfoot; - TRACE("use @%" PRIaTXN " (+%zu) for commit bigfoot-txn", commit_txnid, - (size_t)(commit_txnid - txn->mt_txnid)); + //-------------------------------------------------------------------------- + + const char *const subj_main = chk_v2a(chk, MDBX_CHK_MAIN); + if (chk->flags & MDBX_CHK_SKIP_KV_TRAVERSAL) + chk_line_end(chk_print(chk_line_begin(scope, MDBX_chk_processing), + "Skip processing %s...", subj_main)); + else if ((usr->result.problems_kv = usr->result.kv_tree_problems) > 0) + chk_line_end(chk_print( + chk_line_begin(scope, MDBX_chk_processing), + "Skip processing %s since %s is corrupted (%" PRIuSIZE " problem(s))", + subj_main, subj_tree, + usr->result.problems_kv = usr->result.kv_tree_problems)); + else { + err = chk_scope_begin(chk, 0, MDBX_chk_maindb, &chk->subdb_main, + &usr->result.problems_kv, "Processing %s...", + subj_main); + if (likely(!err)) + err = chk_db(usr->scope, MAIN_DBI, &chk->subdb_main, chk_handle_kv); + chk_scope_restore(scope, err); + + const char *const subj_subdbs = "sub-database(s)"; + if (usr->result.problems_kv && usr->result.subdb_total) + chk_line_end(chk_print(chk_line_begin(scope, MDBX_chk_processing), + "Skip processing %s", subj_subdbs)); + else if (usr->result.problems_kv == 0 && usr->result.subdb_total == 0) + chk_line_end(chk_print(chk_line_begin(scope, MDBX_chk_info), 
"No %s", + subj_subdbs)); + else if (usr->result.problems_kv == 0 && usr->result.subdb_total) { + err = chk_scope_begin( + chk, 1, MDBX_chk_subdbs, nullptr, &usr->result.problems_kv, + "Processing %s by txn#%" PRIaTXN "...", subj_subdbs, txn->txnid); + if (!err) + err = chk_db(usr->scope, MAIN_DBI, &chk->subdb_main, nullptr); + if (usr->scope->subtotal_issues) + chk_line_end(chk_print(chk_line_begin(usr->scope, MDBX_chk_resolution), + "processed %" PRIuSIZE " of %" PRIuSIZE + " %s, %" PRIuSIZE " problems(s)", + usr->result.subdb_processed, + usr->result.subdb_total, subj_subdbs, + usr->scope->subtotal_issues)); + } + chk_scope_restore(scope, err); } -#endif - meta.unsafe_sign = MDBX_DATASIGN_NONE; - meta_set_txnid(env, &meta, commit_txnid); - rc = sync_locked(env, env->me_flags | txn->mt_flags | MDBX_SHRINK_ALLOWED, - &meta, &txn->tw.troika); + return chk_scope_end(chk, chk_scope_begin(chk, 0, MDBX_chk_conclude, nullptr, + nullptr, nullptr)); +} - ts_5 = latency ? osal_monotime() : 0; - if (unlikely(rc != MDBX_SUCCESS)) { - env->me_flags |= MDBX_FATAL_ERROR; - ERROR("txn-%s: error %d", "sync", rc); - goto fail; +__cold int mdbx_env_chk_encount_problem(MDBX_chk_context_t *ctx) { + if (likely(ctx && ctx->internal && ctx->internal->usr == ctx && + ctx->internal->problem_counter && ctx->scope)) { + *ctx->internal->problem_counter += 1; + ctx->scope->subtotal_issues += 1; + return MDBX_SUCCESS; } + return MDBX_EINVAL; +} - end_mode = TXN_END_COMMITTED | TXN_END_UPDATE | TXN_END_EOTDONE; +__cold int mdbx_env_chk(MDBX_env *env, const struct MDBX_chk_callbacks *cb, + MDBX_chk_context_t *ctx, const MDBX_chk_flags_t flags, + MDBX_chk_severity_t verbosity, + unsigned timeout_seconds_16dot16) { + int err, rc = check_env(env, false); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; + if (unlikely(!cb || !ctx || ctx->internal)) + return MDBX_EINVAL; -done: - if (latency) - take_gcprof(txn, latency); - rc = txn_end(txn, end_mode); + MDBX_chk_internal_t *const chk = osal_calloc(1, 
sizeof(MDBX_chk_internal_t)); + if (unlikely(!chk)) + return MDBX_ENOMEM; -provide_latency: - if (latency) { - latency->preparation = ts_1 ? osal_monotime_to_16dot16(ts_1 - ts_0) : 0; - latency->gc_wallclock = - (ts_2 > ts_1) ? osal_monotime_to_16dot16(ts_2 - ts_1) : 0; - latency->gc_cputime = gc_cputime ? osal_monotime_to_16dot16(gc_cputime) : 0; - latency->audit = (ts_3 > ts_2) ? osal_monotime_to_16dot16(ts_3 - ts_2) : 0; - latency->write = (ts_4 > ts_3) ? osal_monotime_to_16dot16(ts_4 - ts_3) : 0; - latency->sync = (ts_5 > ts_4) ? osal_monotime_to_16dot16(ts_5 - ts_4) : 0; - const uint64_t ts_6 = osal_monotime(); - latency->ending = ts_5 ? osal_monotime_to_16dot16(ts_6 - ts_5) : 0; - latency->whole = osal_monotime_to_16dot16_noUnderflow(ts_6 - ts_0); - } - return rc; + chk->cb = cb; + chk->usr = ctx; + chk->usr->internal = chk; + chk->usr->env = env; + chk->flags = flags; -fail: - txn->mt_flags |= MDBX_TXN_ERROR; - if (latency) - take_gcprof(txn, latency); - mdbx_txn_abort(txn); - goto provide_latency; -} + chk->subdb_gc.id = -1; + chk->subdb_gc.name.iov_base = MDBX_CHK_GC; + chk->subdb[FREE_DBI] = &chk->subdb_gc; -static __always_inline int cmp_int_inline(const size_t expected_alignment, - const MDBX_val *a, - const MDBX_val *b) { - if (likely(a->iov_len == b->iov_len)) { - if (sizeof(size_t) > 7 && likely(a->iov_len == 8)) - return CMP2INT(unaligned_peek_u64(expected_alignment, a->iov_base), - unaligned_peek_u64(expected_alignment, b->iov_base)); - if (likely(a->iov_len == 4)) - return CMP2INT(unaligned_peek_u32(expected_alignment, a->iov_base), - unaligned_peek_u32(expected_alignment, b->iov_base)); - if (sizeof(size_t) < 8 && likely(a->iov_len == 8)) - return CMP2INT(unaligned_peek_u64(expected_alignment, a->iov_base), - unaligned_peek_u64(expected_alignment, b->iov_base)); + chk->subdb_main.id = -1; + chk->subdb_main.name.iov_base = MDBX_CHK_MAIN; + chk->subdb[MAIN_DBI] = &chk->subdb_main; + + chk->monotime_timeout = + timeout_seconds_16dot16 + ? 
osal_16dot16_to_monotime(timeout_seconds_16dot16) + osal_monotime() + : 0; + chk->usr->scope_nesting = 0; + chk->usr->result.subdbs = (const void *)&chk->subdb; + + MDBX_chk_scope_t *const top = chk->scope_stack; + top->verbosity = verbosity; + top->internal = chk; + + // init + rc = chk_scope_end( + chk, chk_scope_begin(chk, 0, MDBX_chk_init, nullptr, nullptr, nullptr)); + + // lock + if (likely(!rc)) + rc = chk_scope_begin( + chk, 0, MDBX_chk_lock, nullptr, nullptr, "Taking %slock...", + (env->flags & (MDBX_RDONLY | MDBX_EXCLUSIVE)) ? "" : "read "); + if (likely(!rc) && (env->flags & (MDBX_RDONLY | MDBX_EXCLUSIVE)) == 0 && + (flags & MDBX_CHK_READWRITE)) { + rc = mdbx_txn_lock(env, false); + if (unlikely(rc)) + chk_error_rc(ctx->scope, rc, "mdbx_txn_lock"); + else + chk->write_locked = true; } - ERROR("mismatch and/or invalid size %p.%zu/%p.%zu for INTEGERKEY/INTEGERDUP", - a->iov_base, a->iov_len, b->iov_base, b->iov_len); - return 0; -} + if (likely(!rc)) { + rc = mdbx_txn_begin(env, nullptr, MDBX_TXN_RDONLY, &ctx->txn); + if (unlikely(rc)) + chk_error_rc(ctx->scope, rc, "mdbx_txn_begin"); + } + chk_scope_end(chk, rc); -__hot static int cmp_int_unaligned(const MDBX_val *a, const MDBX_val *b) { - return cmp_int_inline(1, a, b); -} + // doit + if (likely(!rc)) { + chk->subdb_gc.flags = ctx->txn->dbs[FREE_DBI].flags; + chk->subdb_main.flags = ctx->txn->dbs[MAIN_DBI].flags; + rc = env_chk(top); + } -/* Compare two items pointing at 2-byte aligned unsigned int's. 
*/ -#if MDBX_UNALIGNED_OK < 2 || \ - (MDBX_DEBUG || MDBX_FORCE_ASSERTIONS || !defined(NDEBUG)) -__hot static int cmp_int_align2(const MDBX_val *a, const MDBX_val *b) { - return cmp_int_inline(2, a, b); -} -#else -#define cmp_int_align2 cmp_int_unaligned -#endif /* !MDBX_UNALIGNED_OK || debug */ + // unlock + if (ctx->txn || chk->write_locked) { + chk_scope_begin(chk, 0, MDBX_chk_unlock, nullptr, nullptr, nullptr); + if (ctx->txn) { + err = mdbx_txn_abort(ctx->txn); + if (err && !rc) + rc = err; + ctx->txn = nullptr; + } + if (chk->write_locked) + mdbx_txn_unlock(env); + rc = chk_scope_end(chk, rc); + } -/* Compare two items pointing at aligned unsigned int's. */ -#if MDBX_UNALIGNED_OK < 4 || \ - (MDBX_DEBUG || MDBX_FORCE_ASSERTIONS || !defined(NDEBUG)) -__hot static int cmp_int_align4(const MDBX_val *a, const MDBX_val *b) { - return cmp_int_inline(4, a, b); + // finalize + err = chk_scope_begin(chk, 0, MDBX_chk_finalize, nullptr, nullptr, nullptr); + rc = chk_scope_end(chk, err ? err : rc); + chk_dispose(chk); + return rc; } -#else -#define cmp_int_align4 cmp_int_unaligned -#endif /* !MDBX_UNALIGNED_OK || debug */ +/// \copyright SPDX-License-Identifier: Apache-2.0 +/// \author Леонид Юрьев aka Leonid Yuriev \date 2015-2024 -/* Compare two items lexically */ -__hot static int cmp_lexical(const MDBX_val *a, const MDBX_val *b) { - if (a->iov_len == b->iov_len) - return a->iov_len ? memcmp(a->iov_base, b->iov_base, a->iov_len) : 0; - const int diff_len = (a->iov_len < b->iov_len) ? -1 : 1; - const size_t shortest = (a->iov_len < b->iov_len) ? a->iov_len : b->iov_len; - int diff_data = shortest ? memcmp(a->iov_base, b->iov_base, shortest) : 0; - return likely(diff_data) ? 
diff_data : diff_len; +/*------------------------------------------------------------------------------ + * Pack/Unpack 16-bit values for Grow step & Shrink threshold */ + +MDBX_NOTHROW_CONST_FUNCTION static inline pgno_t me2v(size_t m, size_t e) { + assert(m < 2048 && e < 8); + return (pgno_t)(32768 + ((m + 1) << (e + 8))); } -MDBX_NOTHROW_PURE_FUNCTION static __always_inline unsigned -tail3le(const uint8_t *p, size_t l) { - STATIC_ASSERT(sizeof(unsigned) > 2); - // 1: 0 0 0 +MDBX_NOTHROW_CONST_FUNCTION static inline uint16_t v2me(size_t v, size_t e) { + assert(v > (e ? me2v(2047, e - 1) : 32768)); + assert(v <= me2v(2047, e)); + size_t m = (v - 32768 + ((size_t)1 << (e + 8)) - 1) >> (e + 8); + m -= m > 0; + assert(m < 2048 && e < 8); + // f e d c b a 9 8 7 6 5 4 3 2 1 0 + // 1 e e e m m m m m m m m m m m 1 + const uint16_t pv = (uint16_t)(0x8001 + (e << 12) + (m << 1)); + assert(pv != 65535); + return pv; +} + +/* Convert 16-bit packed (exponential quantized) value to number of pages */ +pgno_t pv2pages(uint16_t pv) { + if ((pv & 0x8001) != 0x8001) + return pv; + if (pv == 65535) + return 65536; + // f e d c b a 9 8 7 6 5 4 3 2 1 0 + // 1 e e e m m m m m m m m m m m 1 + return me2v((pv >> 1) & 2047, (pv >> 12) & 7); +} + +/* Convert number of pages to 16-bit packed (exponential quantized) value */ +uint16_t pages2pv(size_t pages) { + if (pages < 32769 || (pages < 65536 && (pages & 1) == 0)) + return (uint16_t)pages; + if (pages <= me2v(2047, 0)) + return v2me(pages, 0); + if (pages <= me2v(2047, 1)) + return v2me(pages, 1); + if (pages <= me2v(2047, 2)) + return v2me(pages, 2); + if (pages <= me2v(2047, 3)) + return v2me(pages, 3); + if (pages <= me2v(2047, 4)) + return v2me(pages, 4); + if (pages <= me2v(2047, 5)) + return v2me(pages, 5); + if (pages <= me2v(2047, 6)) + return v2me(pages, 6); + return (pages < me2v(2046, 7)) ? 
v2me(pages, 7) : 65533; +} + +__cold bool pv2pages_verify(void) { + bool ok = true, dump_translation = false; + for (size_t i = 0; i < 65536; ++i) { + size_t pages = pv2pages(i); + size_t x = pages2pv(pages); + size_t xp = pv2pages(x); + if (pages != xp) { + ERROR("%zu => %zu => %zu => %zu\n", i, pages, x, xp); + ok = false; + } else if (dump_translation && !(x == i || (x % 2 == 0 && x < 65536))) { + DEBUG("%zu => %zu => %zu => %zu\n", i, pages, x, xp); + } + } + return ok; +} + +/*----------------------------------------------------------------------------*/ + +MDBX_NOTHROW_PURE_FUNCTION size_t bytes_align2os_bytes(const MDBX_env *env, + size_t bytes) { + return ceil_powerof2( + bytes, (env->ps > globals.sys_pagesize) ? env->ps : globals.sys_pagesize); +} + +MDBX_NOTHROW_PURE_FUNCTION size_t pgno_align2os_bytes(const MDBX_env *env, + size_t pgno) { + return ceil_powerof2(pgno2bytes(env, pgno), globals.sys_pagesize); +} + +MDBX_NOTHROW_PURE_FUNCTION pgno_t pgno_align2os_pgno(const MDBX_env *env, + size_t pgno) { + return bytes2pgno(env, pgno_align2os_bytes(env, pgno)); +} + +/*----------------------------------------------------------------------------*/ + +MDBX_NOTHROW_PURE_FUNCTION static __always_inline int +cmp_int_inline(const size_t expected_alignment, const MDBX_val *a, + const MDBX_val *b) { + if (likely(a->iov_len == b->iov_len)) { + if (sizeof(size_t) > 7 && likely(a->iov_len == 8)) + return CMP2INT(unaligned_peek_u64(expected_alignment, a->iov_base), + unaligned_peek_u64(expected_alignment, b->iov_base)); + if (likely(a->iov_len == 4)) + return CMP2INT(unaligned_peek_u32(expected_alignment, a->iov_base), + unaligned_peek_u32(expected_alignment, b->iov_base)); + if (sizeof(size_t) < 8 && likely(a->iov_len == 8)) + return CMP2INT(unaligned_peek_u64(expected_alignment, a->iov_base), + unaligned_peek_u64(expected_alignment, b->iov_base)); + } + ERROR("mismatch and/or invalid size %p.%zu/%p.%zu for INTEGERKEY/INTEGERDUP", + a->iov_base, a->iov_len, 
b->iov_base, b->iov_len); + return 0; +} + +MDBX_NOTHROW_PURE_FUNCTION __hot int cmp_int_unaligned(const MDBX_val *a, + const MDBX_val *b) { + return cmp_int_inline(1, a, b); +} + +#ifndef cmp_int_align2 +/* Compare two items pointing at 2-byte aligned unsigned int's. */ +MDBX_NOTHROW_PURE_FUNCTION __hot int cmp_int_align2(const MDBX_val *a, + const MDBX_val *b) { + return cmp_int_inline(2, a, b); +} +#endif /* cmp_int_align2 */ + +#ifndef cmp_int_align4 +/* Compare two items pointing at 4-byte aligned unsigned int's. */ +MDBX_NOTHROW_PURE_FUNCTION __hot int cmp_int_align4(const MDBX_val *a, + const MDBX_val *b) { + return cmp_int_inline(4, a, b); +} +#endif /* cmp_int_align4 */ + +/* Compare two items lexically */ +MDBX_NOTHROW_PURE_FUNCTION __hot int cmp_lexical(const MDBX_val *a, + const MDBX_val *b) { + if (a->iov_len == b->iov_len) + return a->iov_len ? memcmp(a->iov_base, b->iov_base, a->iov_len) : 0; + + const int diff_len = (a->iov_len < b->iov_len) ? -1 : 1; + const size_t shortest = (a->iov_len < b->iov_len) ? a->iov_len : b->iov_len; + int diff_data = shortest ? memcmp(a->iov_base, b->iov_base, shortest) : 0; + return likely(diff_data) ? diff_data : diff_len; +} + +MDBX_NOTHROW_PURE_FUNCTION static __always_inline unsigned +tail3le(const uint8_t *p, size_t l) { + STATIC_ASSERT(sizeof(unsigned) > 2); + // 1: 0 0 0 // 2: 0 1 1 // 3: 0 1 2 return p[0] | p[l >> 1] << 8 | p[l - 1] << 16; } /* Compare two items in reverse byte order */ -__hot static int cmp_reverse(const MDBX_val *a, const MDBX_val *b) { +MDBX_NOTHROW_PURE_FUNCTION __hot int cmp_reverse(const MDBX_val *a, + const MDBX_val *b) { size_t left = (a->iov_len < b->iov_len) ? 
a->iov_len : b->iov_len; if (likely(left)) { const uint8_t *pa = ptr_disp(a->iov_base, a->iov_len); @@ -16755,15 +12783,16 @@ __hot static int cmp_reverse(const MDBX_val *a, const MDBX_val *b) { } /* Fast non-lexically comparator */ -__hot static int cmp_lenfast(const MDBX_val *a, const MDBX_val *b) { +MDBX_NOTHROW_PURE_FUNCTION __hot int cmp_lenfast(const MDBX_val *a, + const MDBX_val *b) { int diff = CMP2INT(a->iov_len, b->iov_len); return (likely(diff) || a->iov_len == 0) ? diff : memcmp(a->iov_base, b->iov_base, a->iov_len); } -__hot static bool eq_fast_slowpath(const uint8_t *a, const uint8_t *b, - size_t l) { +MDBX_NOTHROW_PURE_FUNCTION __hot bool +eq_fast_slowpath(const uint8_t *a, const uint8_t *b, size_t l) { if (likely(l > 3)) { if (MDBX_UNALIGNED_OK >= 4 && likely(l < 9)) return ((unaligned_peek_u32(1, a) - unaligned_peek_u32(1, b)) | @@ -16780,22260 +12809,27030 @@ __hot static bool eq_fast_slowpath(const uint8_t *a, const uint8_t *b, return true; } -static __always_inline bool eq_fast(const MDBX_val *a, const MDBX_val *b) { - return unlikely(a->iov_len == b->iov_len) && - eq_fast_slowpath(a->iov_base, b->iov_base, a->iov_len); -} - -static int cmp_equal_or_greater(const MDBX_val *a, const MDBX_val *b) { +int cmp_equal_or_greater(const MDBX_val *a, const MDBX_val *b) { return eq_fast(a, b) ? 0 : 1; } -static int validate_meta(MDBX_env *env, MDBX_meta *const meta, - const MDBX_page *const page, - const unsigned meta_number, unsigned *guess_pagesize) { - const uint64_t magic_and_version = - unaligned_peek_u64(4, &meta->mm_magic_and_version); - if (unlikely(magic_and_version != MDBX_DATA_MAGIC && - magic_and_version != MDBX_DATA_MAGIC_LEGACY_COMPAT && - magic_and_version != MDBX_DATA_MAGIC_LEGACY_DEVEL)) { - ERROR("meta[%u] has invalid magic/version %" PRIx64, meta_number, - magic_and_version); - return ((magic_and_version >> 8) != MDBX_MAGIC) ? 
MDBX_INVALID - : MDBX_VERSION_MISMATCH; - } +int cmp_equal_or_wrong(const MDBX_val *a, const MDBX_val *b) { + return eq_fast(a, b) ? 0 : -1; +} - if (unlikely(page->mp_pgno != meta_number)) { - ERROR("meta[%u] has invalid pageno %" PRIaPGNO, meta_number, page->mp_pgno); - return MDBX_INVALID; - } +/*----------------------------------------------------------------------------*/ - if (unlikely(page->mp_flags != P_META)) { - ERROR("page #%u not a meta-page", meta_number); - return MDBX_INVALID; +__cold void update_mlcnt(const MDBX_env *env, + const pgno_t new_aligned_mlocked_pgno, + const bool lock_not_release) { + for (;;) { + const pgno_t mlock_pgno_before = + atomic_load32(&env->mlocked_pgno, mo_AcquireRelease); + eASSERT(env, + pgno_align2os_pgno(env, mlock_pgno_before) == mlock_pgno_before); + eASSERT(env, pgno_align2os_pgno(env, new_aligned_mlocked_pgno) == + new_aligned_mlocked_pgno); + if (lock_not_release ? (mlock_pgno_before >= new_aligned_mlocked_pgno) + : (mlock_pgno_before <= new_aligned_mlocked_pgno)) + break; + if (likely(atomic_cas32(&((MDBX_env *)env)->mlocked_pgno, mlock_pgno_before, + new_aligned_mlocked_pgno))) + for (;;) { + mdbx_atomic_uint32_t *const mlcnt = env->lck->mlcnt; + const int32_t snap_locked = atomic_load32(mlcnt + 0, mo_Relaxed); + const int32_t snap_unlocked = atomic_load32(mlcnt + 1, mo_Relaxed); + if (mlock_pgno_before == 0 && (snap_locked - snap_unlocked) < INT_MAX) { + eASSERT(env, lock_not_release); + if (unlikely(!atomic_cas32(mlcnt + 0, snap_locked, snap_locked + 1))) + continue; + } + if (new_aligned_mlocked_pgno == 0 && + (snap_locked - snap_unlocked) > 0) { + eASSERT(env, !lock_not_release); + if (unlikely( + !atomic_cas32(mlcnt + 1, snap_unlocked, snap_unlocked + 1))) + continue; + } + NOTICE("%s-pages %u..%u, mlocked-process(es) %u -> %u", + lock_not_release ? "lock" : "unlock", + lock_not_release ? mlock_pgno_before : new_aligned_mlocked_pgno, + lock_not_release ? 
new_aligned_mlocked_pgno : mlock_pgno_before, + snap_locked - snap_unlocked, + atomic_load32(mlcnt + 0, mo_Relaxed) - + atomic_load32(mlcnt + 1, mo_Relaxed)); + return; + } } +} - /* LY: check pagesize */ - if (unlikely(!is_powerof2(meta->mm_psize) || meta->mm_psize < MIN_PAGESIZE || - meta->mm_psize > MAX_PAGESIZE)) { - WARNING("meta[%u] has invalid pagesize (%u), skip it", meta_number, - meta->mm_psize); - return is_powerof2(meta->mm_psize) ? MDBX_VERSION_MISMATCH : MDBX_INVALID; +__cold void munlock_after(const MDBX_env *env, const pgno_t aligned_pgno, + const size_t end_bytes) { + if (atomic_load32(&env->mlocked_pgno, mo_AcquireRelease) > aligned_pgno) { + int err = MDBX_ENOSYS; + const size_t munlock_begin = pgno2bytes(env, aligned_pgno); + const size_t munlock_size = end_bytes - munlock_begin; + eASSERT(env, end_bytes % globals.sys_pagesize == 0 && + munlock_begin % globals.sys_pagesize == 0 && + munlock_size % globals.sys_pagesize == 0); +#if defined(_WIN32) || defined(_WIN64) + err = + VirtualUnlock(ptr_disp(env->dxb_mmap.base, munlock_begin), munlock_size) + ? MDBX_SUCCESS + : (int)GetLastError(); + if (err == ERROR_NOT_LOCKED) + err = MDBX_SUCCESS; +#elif defined(_POSIX_MEMLOCK_RANGE) + err = munlock(ptr_disp(env->dxb_mmap.base, munlock_begin), munlock_size) + ? 
errno + : MDBX_SUCCESS; +#endif + if (likely(err == MDBX_SUCCESS)) + update_mlcnt(env, aligned_pgno, false); + else { +#if defined(_WIN32) || defined(_WIN64) + WARNING("VirtualUnlock(%zu, %zu) error %d", munlock_begin, munlock_size, + err); +#else + WARNING("munlock(%zu, %zu) error %d", munlock_begin, munlock_size, err); +#endif + } } +} - if (guess_pagesize && *guess_pagesize != meta->mm_psize) { - *guess_pagesize = meta->mm_psize; - VERBOSE("meta[%u] took pagesize %u", meta_number, meta->mm_psize); - } +__cold void munlock_all(const MDBX_env *env) { + munlock_after(env, 0, bytes_align2os_bytes(env, env->dxb_mmap.current)); +} - const txnid_t txnid = unaligned_peek_u64(4, &meta->mm_txnid_a); - if (unlikely(txnid != unaligned_peek_u64(4, &meta->mm_txnid_b))) { - WARNING("meta[%u] not completely updated, skip it", meta_number); - return MDBX_RESULT_TRUE; - } +/*----------------------------------------------------------------------------*/ - /* LY: check signature as a checksum */ - if (META_IS_STEADY(meta) && - unlikely(unaligned_peek_u64(4, &meta->mm_sign) != meta_sign(meta))) { - WARNING("meta[%u] has invalid steady-checksum (0x%" PRIx64 " != 0x%" PRIx64 - "), skip it", - meta_number, unaligned_peek_u64(4, &meta->mm_sign), - meta_sign(meta)); - return MDBX_RESULT_TRUE; - } +uint32_t combine_durability_flags(const uint32_t a, const uint32_t b) { + uint32_t r = a | b; - if (unlikely(meta->mm_dbs[FREE_DBI].md_flags != MDBX_INTEGERKEY)) { - WARNING("meta[%u] has invalid %s flags 0x%u, skip it", meta_number, - "GC/FreeDB", meta->mm_dbs[FREE_DBI].md_flags); - return MDBX_INCOMPATIBLE; - } - if (unlikely(!db_check_flags(meta->mm_dbs[MAIN_DBI].md_flags))) { - WARNING("meta[%u] has invalid %s flags 0x%u, skip it", meta_number, - "MainDB", meta->mm_dbs[MAIN_DBI].md_flags); - return MDBX_INCOMPATIBLE; - } + /* avoid false MDBX_UTTERLY_NOSYNC */ + if (F_ISSET(r, MDBX_UTTERLY_NOSYNC) && !F_ISSET(a, MDBX_UTTERLY_NOSYNC) && + !F_ISSET(b, MDBX_UTTERLY_NOSYNC)) + r = (r - 
MDBX_UTTERLY_NOSYNC) | MDBX_SAFE_NOSYNC; - DEBUG("checking meta%" PRIaPGNO " = root %" PRIaPGNO "/%" PRIaPGNO - ", geo %" PRIaPGNO "/%" PRIaPGNO "-%" PRIaPGNO "/%" PRIaPGNO - " +%u -%u, txn_id %" PRIaTXN ", %s", - page->mp_pgno, meta->mm_dbs[MAIN_DBI].md_root, - meta->mm_dbs[FREE_DBI].md_root, meta->mm_geo.lower, meta->mm_geo.next, - meta->mm_geo.now, meta->mm_geo.upper, pv2pages(meta->mm_geo.grow_pv), - pv2pages(meta->mm_geo.shrink_pv), txnid, durable_caption(meta)); + /* convert DEPRECATED_MAPASYNC to MDBX_SAFE_NOSYNC */ + if ((r & (MDBX_WRITEMAP | DEPRECATED_MAPASYNC)) == + (MDBX_WRITEMAP | DEPRECATED_MAPASYNC) && + !F_ISSET(r, MDBX_UTTERLY_NOSYNC)) + r = (r - DEPRECATED_MAPASYNC) | MDBX_SAFE_NOSYNC; - if (unlikely(txnid < MIN_TXNID || txnid > MAX_TXNID)) { - WARNING("meta[%u] has invalid txnid %" PRIaTXN ", skip it", meta_number, - txnid); - return MDBX_RESULT_TRUE; - } + /* force MDBX_NOMETASYNC if NOSYNC enabled */ + if (r & (MDBX_SAFE_NOSYNC | MDBX_UTTERLY_NOSYNC)) + r |= MDBX_NOMETASYNC; - /* LY: check min-pages value */ - if (unlikely(meta->mm_geo.lower < MIN_PAGENO || - meta->mm_geo.lower > MAX_PAGENO + 1)) { - WARNING("meta[%u] has invalid min-pages (%" PRIaPGNO "), skip it", - meta_number, meta->mm_geo.lower); - return MDBX_INVALID; - } + assert(!(F_ISSET(r, MDBX_UTTERLY_NOSYNC) && + !F_ISSET(a, MDBX_UTTERLY_NOSYNC) && + !F_ISSET(b, MDBX_UTTERLY_NOSYNC))); + return r; +} +/// \copyright SPDX-License-Identifier: Apache-2.0 +/// \author Леонид Юрьев aka Leonid Yuriev \date 2015-2024 - /* LY: check max-pages value */ - if (unlikely(meta->mm_geo.upper < MIN_PAGENO || - meta->mm_geo.upper > MAX_PAGENO + 1 || - meta->mm_geo.upper < meta->mm_geo.lower)) { - WARNING("meta[%u] has invalid max-pages (%" PRIaPGNO "), skip it", - meta_number, meta->mm_geo.upper); - return MDBX_INVALID; - } - /* LY: check last_pgno */ - if (unlikely(meta->mm_geo.next < MIN_PAGENO || - meta->mm_geo.next - 1 > MAX_PAGENO)) { - WARNING("meta[%u] has invalid next-pageno (%" PRIaPGNO "), 
skip it", - meta_number, meta->mm_geo.next); - return MDBX_CORRUPTED; - } +/* check against https://libmdbx.dqdkfa.ru/dead-github/issues/269 */ +static bool coherency_check(const MDBX_env *env, const txnid_t txnid, + const volatile tree_t *trees, + const volatile meta_t *meta, bool report) { + const txnid_t freedb_mod_txnid = trees[FREE_DBI].mod_txnid; + const txnid_t maindb_mod_txnid = trees[MAIN_DBI].mod_txnid; + const pgno_t last_pgno = meta->geometry.now; + + const pgno_t freedb_root_pgno = trees[FREE_DBI].root; + const page_t *freedb_root = + (env->dxb_mmap.base && freedb_root_pgno < last_pgno) + ? pgno2page(env, freedb_root_pgno) + : nullptr; + + const pgno_t maindb_root_pgno = trees[MAIN_DBI].root; + const page_t *maindb_root = + (env->dxb_mmap.base && maindb_root_pgno < last_pgno) + ? pgno2page(env, maindb_root_pgno) + : nullptr; + const uint64_t magic_and_version = + unaligned_peek_u64_volatile(4, &meta->magic_and_version); - /* LY: check filesize & used_bytes */ - const uint64_t used_bytes = meta->mm_geo.next * (uint64_t)meta->mm_psize; - if (unlikely(used_bytes > env->me_dxb_mmap.filesize)) { - /* Here could be a race with DB-shrinking performed by other process */ - int err = osal_filesize(env->me_lazy_fd, &env->me_dxb_mmap.filesize); - if (unlikely(err != MDBX_SUCCESS)) - return err; - if (unlikely(used_bytes > env->me_dxb_mmap.filesize)) { - WARNING("meta[%u] used-bytes (%" PRIu64 ") beyond filesize (%" PRIu64 - "), skip it", - meta_number, used_bytes, env->me_dxb_mmap.filesize); - return MDBX_CORRUPTED; - } - } - if (unlikely(meta->mm_geo.next - 1 > MAX_PAGENO || - used_bytes > MAX_MAPSIZE)) { - WARNING("meta[%u] has too large used-space (%" PRIu64 "), skip it", - meta_number, used_bytes); - return MDBX_TOO_LARGE; + bool ok = true; + if (freedb_root_pgno != P_INVALID && + unlikely(freedb_root_pgno >= last_pgno)) { + if (report) + WARNING( + "catch invalid %sdb root %" PRIaPGNO " for meta_txnid %" PRIaTXN + " %s", + "free", freedb_root_pgno, txnid, + 
(env->stuck_meta < 0) + ? "(workaround for incoherent flaw of unified page/buffer cache)" + : "(wagering meta)"); + ok = false; } - - /* LY: check mapsize limits */ - pgno_t geo_lower = meta->mm_geo.lower; - uint64_t mapsize_min = geo_lower * (uint64_t)meta->mm_psize; - STATIC_ASSERT(MAX_MAPSIZE < PTRDIFF_MAX - MAX_PAGESIZE); - STATIC_ASSERT(MIN_MAPSIZE < MAX_MAPSIZE); - STATIC_ASSERT((uint64_t)(MAX_PAGENO + 1) * MIN_PAGESIZE % (4ul << 20) == 0); - if (unlikely(mapsize_min < MIN_MAPSIZE || mapsize_min > MAX_MAPSIZE)) { - if (MAX_MAPSIZE != MAX_MAPSIZE64 && mapsize_min > MAX_MAPSIZE && - mapsize_min <= MAX_MAPSIZE64) { - eASSERT(env, - meta->mm_geo.next - 1 <= MAX_PAGENO && used_bytes <= MAX_MAPSIZE); - WARNING("meta[%u] has too large min-mapsize (%" PRIu64 "), " - "but size of used space still acceptable (%" PRIu64 ")", - meta_number, mapsize_min, used_bytes); - geo_lower = (pgno_t)((mapsize_min = MAX_MAPSIZE) / meta->mm_psize); - if (geo_lower > MAX_PAGENO + 1) { - geo_lower = MAX_PAGENO + 1; - mapsize_min = geo_lower * (uint64_t)meta->mm_psize; - } - WARNING("meta[%u] consider get-%s pageno is %" PRIaPGNO - " instead of wrong %" PRIaPGNO - ", will be corrected on next commit(s)", - meta_number, "lower", geo_lower, meta->mm_geo.lower); - meta->mm_geo.lower = geo_lower; - } else { - WARNING("meta[%u] has invalid min-mapsize (%" PRIu64 "), skip it", - meta_number, mapsize_min); - return MDBX_VERSION_MISMATCH; - } - } - - pgno_t geo_upper = meta->mm_geo.upper; - uint64_t mapsize_max = geo_upper * (uint64_t)meta->mm_psize; - STATIC_ASSERT(MIN_MAPSIZE < MAX_MAPSIZE); - if (unlikely(mapsize_max > MAX_MAPSIZE || - (MAX_PAGENO + 1) < - ceil_powerof2((size_t)mapsize_max, env->me_os_psize) / - (size_t)meta->mm_psize)) { - if (mapsize_max > MAX_MAPSIZE64) { - WARNING("meta[%u] has invalid max-mapsize (%" PRIu64 "), skip it", - meta_number, mapsize_max); - return MDBX_VERSION_MISMATCH; - } - /* allow to open large DB from a 32-bit environment */ - eASSERT(env, - 
meta->mm_geo.next - 1 <= MAX_PAGENO && used_bytes <= MAX_MAPSIZE); - WARNING("meta[%u] has too large max-mapsize (%" PRIu64 "), " - "but size of used space still acceptable (%" PRIu64 ")", - meta_number, mapsize_max, used_bytes); - geo_upper = (pgno_t)((mapsize_max = MAX_MAPSIZE) / meta->mm_psize); - if (geo_upper > MAX_PAGENO + 1) { - geo_upper = MAX_PAGENO + 1; - mapsize_max = geo_upper * (uint64_t)meta->mm_psize; - } - WARNING("meta[%u] consider get-%s pageno is %" PRIaPGNO - " instead of wrong %" PRIaPGNO - ", will be corrected on next commit(s)", - meta_number, "upper", geo_upper, meta->mm_geo.upper); - meta->mm_geo.upper = geo_upper; + if (maindb_root_pgno != P_INVALID && + unlikely(maindb_root_pgno >= last_pgno)) { + if (report) + WARNING( + "catch invalid %sdb root %" PRIaPGNO " for meta_txnid %" PRIaTXN + " %s", + "main", maindb_root_pgno, txnid, + (env->stuck_meta < 0) + ? "(workaround for incoherent flaw of unified page/buffer cache)" + : "(wagering meta)"); + ok = false; } - - /* LY: check and silently put mm_geo.now into [geo.lower...geo.upper]. - * - * Copy-with-compaction by previous version of libmdbx could produce DB-file - * less than meta.geo.lower bound, in case actual filling is low or no data - * at all. This is not a problem as there is no damage or loss of data. - * Therefore it is better not to consider such situation as an error, but - * silently correct it. 
*/ - pgno_t geo_now = meta->mm_geo.now; - if (geo_now < geo_lower) - geo_now = geo_lower; - if (geo_now > geo_upper && meta->mm_geo.next <= geo_upper) - geo_now = geo_upper; - - if (unlikely(meta->mm_geo.next > geo_now)) { - WARNING("meta[%u] next-pageno (%" PRIaPGNO - ") is beyond end-pgno (%" PRIaPGNO "), skip it", - meta_number, meta->mm_geo.next, geo_now); - return MDBX_CORRUPTED; + if (unlikely(txnid < freedb_mod_txnid || + (!freedb_mod_txnid && freedb_root && + likely(magic_and_version == MDBX_DATA_MAGIC)))) { + if (report) + WARNING( + "catch invalid %sdb.mod_txnid %" PRIaTXN " for meta_txnid %" PRIaTXN + " %s", + "free", freedb_mod_txnid, txnid, + (env->stuck_meta < 0) + ? "(workaround for incoherent flaw of unified page/buffer cache)" + : "(wagering meta)"); + ok = false; } - if (meta->mm_geo.now != geo_now) { - WARNING("meta[%u] consider geo-%s pageno is %" PRIaPGNO - " instead of wrong %" PRIaPGNO - ", will be corrected on next commit(s)", - meta_number, "now", geo_now, meta->mm_geo.now); - meta->mm_geo.now = geo_now; + if (unlikely(txnid < maindb_mod_txnid || + (!maindb_mod_txnid && maindb_root && + likely(magic_and_version == MDBX_DATA_MAGIC)))) { + if (report) + WARNING( + "catch invalid %sdb.mod_txnid %" PRIaTXN " for meta_txnid %" PRIaTXN + " %s", + "main", maindb_mod_txnid, txnid, + (env->stuck_meta < 0) + ? 
"(workaround for incoherent flaw of unified page/buffer cache)" + : "(wagering meta)"); + ok = false; } - - /* GC */ - if (meta->mm_dbs[FREE_DBI].md_root == P_INVALID) { - if (unlikely(meta->mm_dbs[FREE_DBI].md_branch_pages || - meta->mm_dbs[FREE_DBI].md_depth || - meta->mm_dbs[FREE_DBI].md_entries || - meta->mm_dbs[FREE_DBI].md_leaf_pages || - meta->mm_dbs[FREE_DBI].md_overflow_pages)) { - WARNING("meta[%u] has false-empty %s, skip it", meta_number, "GC"); - return MDBX_CORRUPTED; + if (likely(freedb_root && freedb_mod_txnid)) { + VALGRIND_MAKE_MEM_DEFINED(freedb_root, sizeof(freedb_root->txnid)); + MDBX_ASAN_UNPOISON_MEMORY_REGION(freedb_root, sizeof(freedb_root->txnid)); + const txnid_t root_txnid = freedb_root->txnid; + if (unlikely(root_txnid != freedb_mod_txnid)) { + if (report) + WARNING("catch invalid root_page %" PRIaPGNO " mod_txnid %" PRIaTXN + " for %sdb.mod_txnid %" PRIaTXN " %s", + freedb_root_pgno, root_txnid, "free", freedb_mod_txnid, + (env->stuck_meta < 0) ? "(workaround for incoherent flaw of " + "unified page/buffer cache)" + : "(wagering meta)"); + ok = false; } - } else if (unlikely(meta->mm_dbs[FREE_DBI].md_root >= meta->mm_geo.next)) { - WARNING("meta[%u] has invalid %s-root %" PRIaPGNO ", skip it", meta_number, - "GC", meta->mm_dbs[FREE_DBI].md_root); - return MDBX_CORRUPTED; } - - /* MainDB */ - if (meta->mm_dbs[MAIN_DBI].md_root == P_INVALID) { - if (unlikely(meta->mm_dbs[MAIN_DBI].md_branch_pages || - meta->mm_dbs[MAIN_DBI].md_depth || - meta->mm_dbs[MAIN_DBI].md_entries || - meta->mm_dbs[MAIN_DBI].md_leaf_pages || - meta->mm_dbs[MAIN_DBI].md_overflow_pages)) { - WARNING("meta[%u] has false-empty %s", meta_number, "MainDB"); - return MDBX_CORRUPTED; + if (likely(maindb_root && maindb_mod_txnid)) { + VALGRIND_MAKE_MEM_DEFINED(maindb_root, sizeof(maindb_root->txnid)); + MDBX_ASAN_UNPOISON_MEMORY_REGION(maindb_root, sizeof(maindb_root->txnid)); + const txnid_t root_txnid = maindb_root->txnid; + if (unlikely(root_txnid != maindb_mod_txnid)) 
{ + if (report) + WARNING("catch invalid root_page %" PRIaPGNO " mod_txnid %" PRIaTXN + " for %sdb.mod_txnid %" PRIaTXN " %s", + maindb_root_pgno, root_txnid, "main", maindb_mod_txnid, + (env->stuck_meta < 0) ? "(workaround for incoherent flaw of " + "unified page/buffer cache)" + : "(wagering meta)"); + ok = false; } - } else if (unlikely(meta->mm_dbs[MAIN_DBI].md_root >= meta->mm_geo.next)) { - WARNING("meta[%u] has invalid %s-root %" PRIaPGNO ", skip it", meta_number, - "MainDB", meta->mm_dbs[MAIN_DBI].md_root); - return MDBX_CORRUPTED; } + if (unlikely(!ok) && report) + env->lck->pgops.incoherence.weak = + (env->lck->pgops.incoherence.weak >= INT32_MAX) + ? INT32_MAX + : env->lck->pgops.incoherence.weak + 1; + return ok; +} - if (unlikely(meta->mm_dbs[FREE_DBI].md_mod_txnid > txnid)) { - WARNING("meta[%u] has wrong md_mod_txnid %" PRIaTXN " for %s, skip it", - meta_number, meta->mm_dbs[FREE_DBI].md_mod_txnid, "GC"); - return MDBX_CORRUPTED; +__cold int coherency_timeout(uint64_t *timestamp, intptr_t pgno, + const MDBX_env *env) { + if (likely(timestamp && *timestamp == 0)) + *timestamp = osal_monotime(); + else if (unlikely(!timestamp || osal_monotime() - *timestamp > + osal_16dot16_to_monotime(65536 / 10))) { + if (pgno >= 0 && pgno != env->stuck_meta) + ERROR("bailout waiting for %" PRIuSIZE " page arrival %s", pgno, + "(workaround for incoherent flaw of unified page/buffer cache)"); + else if (env->stuck_meta < 0) + ERROR("bailout waiting for valid snapshot (%s)", + "workaround for incoherent flaw of unified page/buffer cache"); + return MDBX_PROBLEM; } - if (unlikely(meta->mm_dbs[MAIN_DBI].md_mod_txnid > txnid)) { - WARNING("meta[%u] has wrong md_mod_txnid %" PRIaTXN " for %s, skip it", - meta_number, meta->mm_dbs[MAIN_DBI].md_mod_txnid, "MainDB"); - return MDBX_CORRUPTED; + osal_memory_fence(mo_AcquireRelease, true); +#if defined(_WIN32) || defined(_WIN64) + SwitchToThread(); +#elif defined(__linux__) || defined(__gnu_linux__) || defined(_UNIX03_SOURCE) + 
sched_yield(); +#elif (defined(_GNU_SOURCE) && __GLIBC_PREREQ(2, 1)) || defined(_OPEN_THREADS) + pthread_yield(); +#else + usleep(42); +#endif + return MDBX_RESULT_TRUE; +} + +/* check with timeout as the workaround + * for https://libmdbx.dqdkfa.ru/dead-github/issues/269 */ +__hot int coherency_check_head(MDBX_txn *txn, const meta_ptr_t head, + uint64_t *timestamp) { + /* Copy the DB info and flags */ + txn->geo = head.ptr_v->geometry; + memcpy(txn->dbs, &head.ptr_c->trees, sizeof(head.ptr_c->trees)); + STATIC_ASSERT(sizeof(head.ptr_c->trees) == CORE_DBS * sizeof(tree_t)); + VALGRIND_MAKE_MEM_UNDEFINED(txn->dbs + CORE_DBS, + txn->env->max_dbi - CORE_DBS); + txn->canary = head.ptr_v->canary; + + if (unlikely(!coherency_check(txn->env, head.txnid, txn->dbs, head.ptr_v, + *timestamp == 0))) + return coherency_timeout(timestamp, -1, txn->env); + + tASSERT(txn, txn->dbs[FREE_DBI].flags == MDBX_INTEGERKEY); + tASSERT(txn, check_sdb_flags(txn->dbs[MAIN_DBI].flags)); + return MDBX_SUCCESS; +} + +int coherency_check_written(const MDBX_env *env, const txnid_t txnid, + const volatile meta_t *meta, const intptr_t pgno, + uint64_t *timestamp) { + const bool report = !(timestamp && *timestamp); + const txnid_t head_txnid = meta_txnid(meta); + if (unlikely(head_txnid < MIN_TXNID || head_txnid < txnid)) { + if (report) { + env->lck->pgops.incoherence.weak = + (env->lck->pgops.incoherence.weak >= INT32_MAX) + ? INT32_MAX + : env->lck->pgops.incoherence.weak + 1; + WARNING("catch %s txnid %" PRIaTXN " for meta_%" PRIaPGNO " %s", + (head_txnid < MIN_TXNID) ? 
"invalid" : "unexpected", head_txnid, + bytes2pgno(env, ptr_dist(meta, env->dxb_mmap.base)), + "(workaround for incoherent flaw of unified page/buffer cache)"); + } + return coherency_timeout(timestamp, pgno, env); } + if (unlikely( + !coherency_check(env, head_txnid, &meta->trees.gc, meta, report))) + return coherency_timeout(timestamp, pgno, env); + eASSERT(env, meta->trees.gc.flags == MDBX_INTEGERKEY); + eASSERT(env, check_sdb_flags(meta->trees.main.flags)); return MDBX_SUCCESS; } -static int validate_meta_copy(MDBX_env *env, const MDBX_meta *meta, - MDBX_meta *dest) { - *dest = *meta; - return validate_meta(env, dest, data_page(meta), - bytes2pgno(env, ptr_dist(meta, env->me_map)), nullptr); +bool coherency_check_meta(const MDBX_env *env, const volatile meta_t *meta, + bool report) { + uint64_t timestamp = 0; + return coherency_check_written(env, 0, meta, -1, + report ? ×tamp : nullptr) == MDBX_SUCCESS; } +/// \copyright SPDX-License-Identifier: Apache-2.0 +/// \author Леонид Юрьев aka Leonid Yuriev \date 2015-2024 -/* Read the environment parameters of a DB environment - * before mapping it into memory. */ -__cold static int read_header(MDBX_env *env, MDBX_meta *dest, - const int lck_exclusive, - const mdbx_mode_t mode_bits) { - memset(dest, 0, sizeof(MDBX_meta)); - int rc = osal_filesize(env->me_lazy_fd, &env->me_dxb_mmap.filesize); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; - unaligned_poke_u64(4, dest->mm_sign, MDBX_DATASIGN_WEAK); - rc = MDBX_CORRUPTED; +__cold size_t mdbx_default_pagesize(void) { + size_t pagesize = globals.sys_pagesize; + ENSURE(nullptr, is_powerof2(pagesize)); + pagesize = (pagesize >= MDBX_MIN_PAGESIZE) ? pagesize : MDBX_MIN_PAGESIZE; + pagesize = (pagesize <= MDBX_MAX_PAGESIZE) ? pagesize : MDBX_MAX_PAGESIZE; + return pagesize; +} - /* Read twice all meta pages so we can find the latest one. */ - unsigned loop_limit = NUM_METAS * 2; - /* We don't know the page size on first time. So, just guess it. 
*/ - unsigned guess_pagesize = 0; - for (unsigned loop_count = 0; loop_count < loop_limit; ++loop_count) { - const unsigned meta_number = loop_count % NUM_METAS; - const unsigned offset = (guess_pagesize ? guess_pagesize - : (loop_count > NUM_METAS) ? env->me_psize - : env->me_os_psize) * - meta_number; +__cold intptr_t mdbx_limits_dbsize_min(intptr_t pagesize) { + if (pagesize < 1) + pagesize = (intptr_t)mdbx_default_pagesize(); + else if (unlikely(pagesize < (intptr_t)MDBX_MIN_PAGESIZE || + pagesize > (intptr_t)MDBX_MAX_PAGESIZE || + !is_powerof2((size_t)pagesize))) + return -1; - char buffer[MIN_PAGESIZE]; - unsigned retryleft = 42; - while (1) { - TRACE("reading meta[%d]: offset %u, bytes %u, retry-left %u", meta_number, - offset, MIN_PAGESIZE, retryleft); - int err = osal_pread(env->me_lazy_fd, buffer, MIN_PAGESIZE, offset); - if (err == MDBX_ENODATA && offset == 0 && loop_count == 0 && - env->me_dxb_mmap.filesize == 0 && - mode_bits /* non-zero for DB creation */ != 0) { - NOTICE("read meta: empty file (%d, %s)", err, mdbx_strerror(err)); - return err; - } -#if defined(_WIN32) || defined(_WIN64) - if (err == ERROR_LOCK_VIOLATION) { - SleepEx(0, true); - err = osal_pread(env->me_lazy_fd, buffer, MIN_PAGESIZE, offset); - if (err == ERROR_LOCK_VIOLATION && --retryleft) { - WARNING("read meta[%u,%u]: %i, %s", offset, MIN_PAGESIZE, err, - mdbx_strerror(err)); - continue; - } - } -#endif /* Windows */ - if (err != MDBX_SUCCESS) { - ERROR("read meta[%u,%u]: %i, %s", offset, MIN_PAGESIZE, err, - mdbx_strerror(err)); - return err; - } + return MIN_PAGENO * pagesize; +} - char again[MIN_PAGESIZE]; - err = osal_pread(env->me_lazy_fd, again, MIN_PAGESIZE, offset); -#if defined(_WIN32) || defined(_WIN64) - if (err == ERROR_LOCK_VIOLATION) { - SleepEx(0, true); - err = osal_pread(env->me_lazy_fd, again, MIN_PAGESIZE, offset); - if (err == ERROR_LOCK_VIOLATION && --retryleft) { - WARNING("read meta[%u,%u]: %i, %s", offset, MIN_PAGESIZE, err, - mdbx_strerror(err)); - 
continue; - } - } -#endif /* Windows */ - if (err != MDBX_SUCCESS) { - ERROR("read meta[%u,%u]: %i, %s", offset, MIN_PAGESIZE, err, - mdbx_strerror(err)); - return err; - } +__cold intptr_t mdbx_limits_dbsize_max(intptr_t pagesize) { + if (pagesize < 1) + pagesize = (intptr_t)mdbx_default_pagesize(); + else if (unlikely(pagesize < (intptr_t)MDBX_MIN_PAGESIZE || + pagesize > (intptr_t)MDBX_MAX_PAGESIZE || + !is_powerof2((size_t)pagesize))) + return -1; - if (memcmp(buffer, again, MIN_PAGESIZE) == 0 || --retryleft == 0) - break; + STATIC_ASSERT(MAX_MAPSIZE < INTPTR_MAX); + const uint64_t limit = (1 + (uint64_t)MAX_PAGENO) * pagesize; + return (limit < MAX_MAPSIZE) ? (intptr_t)limit : (intptr_t)MAX_MAPSIZE; +} - VERBOSE("meta[%u] was updated, re-read it", meta_number); - } +__cold intptr_t mdbx_limits_txnsize_max(intptr_t pagesize) { + if (pagesize < 1) + pagesize = (intptr_t)mdbx_default_pagesize(); + else if (unlikely(pagesize < (intptr_t)MDBX_MIN_PAGESIZE || + pagesize > (intptr_t)MDBX_MAX_PAGESIZE || + !is_powerof2((size_t)pagesize))) + return -1; - if (!retryleft) { - ERROR("meta[%u] is too volatile, skip it", meta_number); - continue; - } + STATIC_ASSERT(MAX_MAPSIZE < INTPTR_MAX); + const uint64_t pgl_limit = + pagesize * (uint64_t)(PAGELIST_LIMIT / MDBX_GOLD_RATIO_DBL); + const uint64_t map_limit = (uint64_t)(MAX_MAPSIZE / MDBX_GOLD_RATIO_DBL); + return (pgl_limit < map_limit) ? 
(intptr_t)pgl_limit : (intptr_t)map_limit; +} - MDBX_page *const page = (MDBX_page *)buffer; - MDBX_meta *const meta = page_meta(page); - rc = validate_meta(env, meta, page, meta_number, &guess_pagesize); - if (rc != MDBX_SUCCESS) - continue; +__cold intptr_t mdbx_limits_keysize_max(intptr_t pagesize, + MDBX_db_flags_t flags) { + if (pagesize < 1) + pagesize = (intptr_t)mdbx_default_pagesize(); + if (unlikely(pagesize < (intptr_t)MDBX_MIN_PAGESIZE || + pagesize > (intptr_t)MDBX_MAX_PAGESIZE || + !is_powerof2((size_t)pagesize))) + return -1; - bool latch; - if (env->me_stuck_meta >= 0) - latch = (meta_number == (unsigned)env->me_stuck_meta); - else if (meta_bootid_match(meta)) - latch = meta_choice_recent( - meta->unsafe_txnid, SIGN_IS_STEADY(meta->unsafe_sign), - dest->unsafe_txnid, SIGN_IS_STEADY(dest->unsafe_sign)); - else - latch = meta_choice_steady( - meta->unsafe_txnid, SIGN_IS_STEADY(meta->unsafe_sign), - dest->unsafe_txnid, SIGN_IS_STEADY(dest->unsafe_sign)); - if (latch) { - *dest = *meta; - if (!lck_exclusive && !META_IS_STEADY(dest)) - loop_limit += 1; /* LY: should re-read to hush race with update */ - VERBOSE("latch meta[%u]", meta_number); - } - } + return keysize_max(pagesize, flags); +} - if (dest->mm_psize == 0 || - (env->me_stuck_meta < 0 && - !(META_IS_STEADY(dest) || - meta_weak_acceptable(env, dest, lck_exclusive)))) { - ERROR("%s", "no usable meta-pages, database is corrupted"); - if (rc == MDBX_SUCCESS) { - /* TODO: try to restore the database by fully checking b-tree structure - * for the each meta page, if the corresponding option was given */ - return MDBX_CORRUPTED; - } - return rc; - } +__cold int mdbx_env_get_maxkeysize_ex(const MDBX_env *env, + MDBX_db_flags_t flags) { + if (unlikely(!env || env->signature.weak != env_signature)) + return -1; - return MDBX_SUCCESS; + return (int)mdbx_limits_keysize_max((intptr_t)env->ps, flags); } -__cold static MDBX_page *meta_model(const MDBX_env *env, MDBX_page *model, - size_t num) { - ENSURE(env, 
is_powerof2(env->me_psize)); - ENSURE(env, env->me_psize >= MIN_PAGESIZE); - ENSURE(env, env->me_psize <= MAX_PAGESIZE); - ENSURE(env, env->me_dbgeo.lower >= MIN_MAPSIZE); - ENSURE(env, env->me_dbgeo.upper <= MAX_MAPSIZE); - ENSURE(env, env->me_dbgeo.now >= env->me_dbgeo.lower); - ENSURE(env, env->me_dbgeo.now <= env->me_dbgeo.upper); - - memset(model, 0, env->me_psize); - model->mp_pgno = (pgno_t)num; - model->mp_flags = P_META; - MDBX_meta *const model_meta = page_meta(model); - unaligned_poke_u64(4, model_meta->mm_magic_and_version, MDBX_DATA_MAGIC); - - model_meta->mm_geo.lower = bytes2pgno(env, env->me_dbgeo.lower); - model_meta->mm_geo.upper = bytes2pgno(env, env->me_dbgeo.upper); - model_meta->mm_geo.grow_pv = pages2pv(bytes2pgno(env, env->me_dbgeo.grow)); - model_meta->mm_geo.shrink_pv = - pages2pv(bytes2pgno(env, env->me_dbgeo.shrink)); - model_meta->mm_geo.now = bytes2pgno(env, env->me_dbgeo.now); - model_meta->mm_geo.next = NUM_METAS; - - ENSURE(env, model_meta->mm_geo.lower >= MIN_PAGENO); - ENSURE(env, model_meta->mm_geo.upper <= MAX_PAGENO + 1); - ENSURE(env, model_meta->mm_geo.now >= model_meta->mm_geo.lower); - ENSURE(env, model_meta->mm_geo.now <= model_meta->mm_geo.upper); - ENSURE(env, model_meta->mm_geo.next >= MIN_PAGENO); - ENSURE(env, model_meta->mm_geo.next <= model_meta->mm_geo.now); - ENSURE(env, model_meta->mm_geo.grow_pv == - pages2pv(pv2pages(model_meta->mm_geo.grow_pv))); - ENSURE(env, model_meta->mm_geo.shrink_pv == - pages2pv(pv2pages(model_meta->mm_geo.shrink_pv))); - - model_meta->mm_psize = env->me_psize; - model_meta->mm_dbs[FREE_DBI].md_flags = MDBX_INTEGERKEY; - model_meta->mm_dbs[FREE_DBI].md_root = P_INVALID; - model_meta->mm_dbs[MAIN_DBI].md_root = P_INVALID; - meta_set_txnid(env, model_meta, MIN_TXNID + num); - unaligned_poke_u64(4, model_meta->mm_sign, meta_sign(model_meta)); - eASSERT(env, check_meta_coherency(env, model_meta, true)); - return ptr_disp(model, env->me_psize); +__cold int mdbx_env_get_maxkeysize(const 
MDBX_env *env) { + return mdbx_env_get_maxkeysize_ex(env, MDBX_DUPSORT); } -/* Fill in most of the zeroed meta-pages for an empty database environment. - * Return pointer to recently (head) meta-page. */ -__cold static MDBX_meta *init_metas(const MDBX_env *env, void *buffer) { - MDBX_page *page0 = (MDBX_page *)buffer; - MDBX_page *page1 = meta_model(env, page0, 0); - MDBX_page *page2 = meta_model(env, page1, 1); - meta_model(env, page2, 2); - return page_meta(page2); +__cold intptr_t mdbx_limits_keysize_min(MDBX_db_flags_t flags) { + return keysize_min(flags); } -static int sync_locked(MDBX_env *env, unsigned flags, MDBX_meta *const pending, - meta_troika_t *const troika) { - eASSERT(env, ((env->me_flags ^ flags) & MDBX_WRITEMAP) == 0); - eASSERT(env, pending->mm_dbs[FREE_DBI].md_flags == MDBX_INTEGERKEY); - eASSERT(env, db_check_flags(pending->mm_dbs[MAIN_DBI].md_flags)); - const MDBX_meta *const meta0 = METAPAGE(env, 0); - const MDBX_meta *const meta1 = METAPAGE(env, 1); - const MDBX_meta *const meta2 = METAPAGE(env, 2); - const meta_ptr_t head = meta_recent(env, troika); - int rc; +__cold intptr_t mdbx_limits_valsize_max(intptr_t pagesize, + MDBX_db_flags_t flags) { + if (pagesize < 1) + pagesize = (intptr_t)mdbx_default_pagesize(); + if (unlikely(pagesize < (intptr_t)MDBX_MIN_PAGESIZE || + pagesize > (intptr_t)MDBX_MAX_PAGESIZE || + !is_powerof2((size_t)pagesize))) + return -1; - eASSERT(env, - pending < METAPAGE(env, 0) || pending > METAPAGE(env, NUM_METAS)); - eASSERT(env, (env->me_flags & (MDBX_RDONLY | MDBX_FATAL_ERROR)) == 0); - eASSERT(env, pending->mm_geo.next <= pending->mm_geo.now); + return valsize_max(pagesize, flags); +} - if (flags & MDBX_SAFE_NOSYNC) { - /* Check auto-sync conditions */ - const pgno_t autosync_threshold = - atomic_load32(&env->me_lck->mti_autosync_threshold, mo_Relaxed); - const uint64_t autosync_period = - atomic_load64(&env->me_lck->mti_autosync_period, mo_Relaxed); - uint64_t eoos_timestamp; - if ((autosync_threshold && - 
atomic_load64(&env->me_lck->mti_unsynced_pages, mo_Relaxed) >= - autosync_threshold) || - (autosync_period && - (eoos_timestamp = - atomic_load64(&env->me_lck->mti_eoos_timestamp, mo_Relaxed)) && - osal_monotime() - eoos_timestamp >= autosync_period)) - flags &= MDBX_WRITEMAP | MDBX_SHRINK_ALLOWED; /* force steady */ - } +__cold int mdbx_env_get_maxvalsize_ex(const MDBX_env *env, + MDBX_db_flags_t flags) { + if (unlikely(!env || env->signature.weak != env_signature)) + return -1; - pgno_t shrink = 0; - if (flags & MDBX_SHRINK_ALLOWED) { - const size_t prev_discarded_pgno = - atomic_load32(&env->me_lck->mti_discarded_tail, mo_Relaxed); - if (prev_discarded_pgno < pending->mm_geo.next) - env->me_lck->mti_discarded_tail.weak = pending->mm_geo.next; - else if (prev_discarded_pgno >= - pending->mm_geo.next + env->me_madv_threshold) { - /* LY: check conditions to discard unused pages */ - const pgno_t largest_pgno = find_largest_snapshot( - env, (head.ptr_c->mm_geo.next > pending->mm_geo.next) - ? 
head.ptr_c->mm_geo.next - : pending->mm_geo.next); - eASSERT(env, largest_pgno >= NUM_METAS); + return (int)mdbx_limits_valsize_max((intptr_t)env->ps, flags); +} -#if defined(ENABLE_MEMCHECK) || defined(__SANITIZE_ADDRESS__) - const pgno_t edge = env->me_poison_edge; - if (edge > largest_pgno) { - env->me_poison_edge = largest_pgno; - VALGRIND_MAKE_MEM_NOACCESS( - ptr_disp(env->me_map, pgno2bytes(env, largest_pgno)), - pgno2bytes(env, edge - largest_pgno)); - MDBX_ASAN_POISON_MEMORY_REGION( - ptr_disp(env->me_map, pgno2bytes(env, largest_pgno)), - pgno2bytes(env, edge - largest_pgno)); - } -#endif /* ENABLE_MEMCHECK || __SANITIZE_ADDRESS__ */ +__cold intptr_t mdbx_limits_valsize_min(MDBX_db_flags_t flags) { + return valsize_min(flags); +} -#if MDBX_ENABLE_MADVISE && \ - (defined(MADV_DONTNEED) || defined(POSIX_MADV_DONTNEED)) - const size_t discard_edge_pgno = pgno_align2os_pgno(env, largest_pgno); - if (prev_discarded_pgno >= discard_edge_pgno + env->me_madv_threshold) { - const size_t prev_discarded_bytes = - pgno_align2os_bytes(env, prev_discarded_pgno); - const size_t discard_edge_bytes = pgno2bytes(env, discard_edge_pgno); - /* из-за выравнивания prev_discarded_bytes и discard_edge_bytes - * могут быть равны */ - if (prev_discarded_bytes > discard_edge_bytes) { - NOTICE("shrink-MADV_%s %zu..%zu", "DONTNEED", discard_edge_pgno, - prev_discarded_pgno); - munlock_after(env, discard_edge_pgno, - bytes_align2os_bytes(env, env->me_dxb_mmap.current)); - const uint32_t munlocks_before = - atomic_load32(&env->me_lck->mti_mlcnt[1], mo_Relaxed); -#if defined(MADV_DONTNEED) - int advise = MADV_DONTNEED; -#if defined(MADV_FREE) && \ - 0 /* MADV_FREE works for only anonymous vma at the moment */ - if ((env->me_flags & MDBX_WRITEMAP) && - linux_kernel_version > 0x04050000) - advise = MADV_FREE; -#endif /* MADV_FREE */ - int err = madvise(ptr_disp(env->me_map, discard_edge_bytes), - prev_discarded_bytes - discard_edge_bytes, advise) - ? 
ignore_enosys(errno) - : MDBX_SUCCESS; -#else - int err = ignore_enosys(posix_madvise( - ptr_disp(env->me_map, discard_edge_bytes), - prev_discarded_bytes - discard_edge_bytes, POSIX_MADV_DONTNEED)); -#endif - if (unlikely(MDBX_IS_ERROR(err))) { - const uint32_t mlocks_after = - atomic_load32(&env->me_lck->mti_mlcnt[0], mo_Relaxed); - if (err == MDBX_EINVAL) { - const int severity = (mlocks_after - munlocks_before) - ? MDBX_LOG_NOTICE - : MDBX_LOG_WARN; - if (LOG_ENABLED(severity)) - debug_log( - severity, __func__, __LINE__, - "%s-madvise: ignore EINVAL (%d) since some pages maybe " - "locked (%u/%u mlcnt-processes)", - "shrink", err, mlocks_after, munlocks_before); - } else { - ERROR("%s-madvise(%s, %zu, +%zu), %u/%u mlcnt-processes, err %d", - "shrink", "DONTNEED", discard_edge_bytes, - prev_discarded_bytes - discard_edge_bytes, mlocks_after, - munlocks_before, err); - return err; - } - } else - env->me_lck->mti_discarded_tail.weak = discard_edge_pgno; - } - } -#endif /* MDBX_ENABLE_MADVISE && (MADV_DONTNEED || POSIX_MADV_DONTNEED) */ +__cold intptr_t mdbx_limits_pairsize4page_max(intptr_t pagesize, + MDBX_db_flags_t flags) { + if (pagesize < 1) + pagesize = (intptr_t)mdbx_default_pagesize(); + if (unlikely(pagesize < (intptr_t)MDBX_MIN_PAGESIZE || + pagesize > (intptr_t)MDBX_MAX_PAGESIZE || + !is_powerof2((size_t)pagesize))) + return -1; - /* LY: check conditions to shrink datafile */ - const pgno_t backlog_gap = 3 + pending->mm_dbs[FREE_DBI].md_depth * 3; - pgno_t shrink_step = 0; - if (pending->mm_geo.shrink_pv && - pending->mm_geo.now - pending->mm_geo.next > - (shrink_step = pv2pages(pending->mm_geo.shrink_pv)) + - backlog_gap) { - if (pending->mm_geo.now > largest_pgno && - pending->mm_geo.now - largest_pgno > shrink_step + backlog_gap) { - const pgno_t aligner = - pending->mm_geo.grow_pv - ? 
/* grow_step */ pv2pages(pending->mm_geo.grow_pv) - : shrink_step; - const pgno_t with_backlog_gap = largest_pgno + backlog_gap; - const pgno_t aligned = - pgno_align2os_pgno(env, (size_t)with_backlog_gap + aligner - - with_backlog_gap % aligner); - const pgno_t bottom = (aligned > pending->mm_geo.lower) - ? aligned - : pending->mm_geo.lower; - if (pending->mm_geo.now > bottom) { - if (TROIKA_HAVE_STEADY(troika)) - /* force steady, but only if steady-checkpoint is present */ - flags &= MDBX_WRITEMAP | MDBX_SHRINK_ALLOWED; - shrink = pending->mm_geo.now - bottom; - pending->mm_geo.now = bottom; - if (unlikely(head.txnid == pending->unsafe_txnid)) { - const txnid_t txnid = safe64_txnid_next(pending->unsafe_txnid); - NOTICE("force-forward pending-txn %" PRIaTXN " -> %" PRIaTXN, - pending->unsafe_txnid, txnid); - ENSURE(env, !env->me_txn0 || !env->me_txn); - if (unlikely(txnid > MAX_TXNID)) { - rc = MDBX_TXN_FULL; - ERROR("txnid overflow, raise %d", rc); - goto fail; - } - meta_set_txnid(env, pending, txnid); - eASSERT(env, check_meta_coherency(env, pending, true)); - } - } - } - } - } - } + if (flags & + (MDBX_DUPSORT | MDBX_DUPFIXED | MDBX_INTEGERDUP | MDBX_REVERSEDUP)) + return BRANCH_NODE_MAX(pagesize) - NODESIZE; - /* LY: step#1 - sync previously written/updated data-pages */ - rc = MDBX_RESULT_FALSE /* carry steady */; - if (atomic_load64(&env->me_lck->mti_unsynced_pages, mo_Relaxed)) { - eASSERT(env, ((flags ^ env->me_flags) & MDBX_WRITEMAP) == 0); - enum osal_syncmode_bits mode_bits = MDBX_SYNC_NONE; - unsigned sync_op = 0; - if ((flags & MDBX_SAFE_NOSYNC) == 0) { - sync_op = 1; - mode_bits = MDBX_SYNC_DATA; - if (pending->mm_geo.next > - meta_prefer_steady(env, troika).ptr_c->mm_geo.now) - mode_bits |= MDBX_SYNC_SIZE; - if (flags & MDBX_NOMETASYNC) - mode_bits |= MDBX_SYNC_IODQ; - } else if (unlikely(env->me_incore)) - goto skip_incore_sync; - if (flags & MDBX_WRITEMAP) { -#if MDBX_ENABLE_PGOP_STAT - env->me_lck->mti_pgop_stat.msync.weak += sync_op; -#else - 
(void)sync_op; -#endif /* MDBX_ENABLE_PGOP_STAT */ - rc = - osal_msync(&env->me_dxb_mmap, 0, - pgno_align2os_bytes(env, pending->mm_geo.next), mode_bits); - } else { -#if MDBX_ENABLE_PGOP_STAT - env->me_lck->mti_pgop_stat.fsync.weak += sync_op; -#else - (void)sync_op; -#endif /* MDBX_ENABLE_PGOP_STAT */ - rc = osal_fsync(env->me_lazy_fd, mode_bits); - } - if (unlikely(rc != MDBX_SUCCESS)) - goto fail; - rc = (flags & MDBX_SAFE_NOSYNC) ? MDBX_RESULT_TRUE /* carry non-steady */ - : MDBX_RESULT_FALSE /* carry steady */; - } - eASSERT(env, check_meta_coherency(env, pending, true)); + return LEAF_NODE_MAX(pagesize) - NODESIZE; +} - /* Steady or Weak */ - if (rc == MDBX_RESULT_FALSE /* carry steady */) { - unaligned_poke_u64(4, pending->mm_sign, meta_sign(pending)); - atomic_store64(&env->me_lck->mti_eoos_timestamp, 0, mo_Relaxed); - atomic_store64(&env->me_lck->mti_unsynced_pages, 0, mo_Relaxed); - } else { - assert(rc == MDBX_RESULT_TRUE /* carry non-steady */); - skip_incore_sync: - eASSERT(env, env->me_lck->mti_unsynced_pages.weak > 0); - /* Может быть нулевым если unsynced_pages > 0 в результате спиллинга. 
- * eASSERT(env, env->me_lck->mti_eoos_timestamp.weak != 0); */ - unaligned_poke_u64(4, pending->mm_sign, MDBX_DATASIGN_WEAK); - } +__cold int mdbx_env_get_pairsize4page_max(const MDBX_env *env, + MDBX_db_flags_t flags) { + if (unlikely(!env || env->signature.weak != env_signature)) + return -1; - const bool legal4overwrite = - head.txnid == pending->unsafe_txnid && - memcmp(&head.ptr_c->mm_dbs, &pending->mm_dbs, sizeof(pending->mm_dbs)) == - 0 && - memcmp(&head.ptr_c->mm_canary, &pending->mm_canary, - sizeof(pending->mm_canary)) == 0 && - memcmp(&head.ptr_c->mm_geo, &pending->mm_geo, sizeof(pending->mm_geo)) == - 0; - MDBX_meta *target = nullptr; - if (head.txnid == pending->unsafe_txnid) { - ENSURE(env, legal4overwrite); - if (!head.is_steady && META_IS_STEADY(pending)) - target = (MDBX_meta *)head.ptr_c; - else { - WARNING("%s", "skip update meta"); - return MDBX_SUCCESS; - } - } else { - const unsigned troika_tail = troika->tail_and_flags & 3; - ENSURE(env, troika_tail < NUM_METAS && troika_tail != troika->recent && - troika_tail != troika->prefer_steady); - target = (MDBX_meta *)meta_tail(env, troika).ptr_c; - } + return (int)mdbx_limits_pairsize4page_max((intptr_t)env->ps, flags); +} - /* LY: step#2 - update meta-page. 
*/ - DEBUG("writing meta%" PRIaPGNO " = root %" PRIaPGNO "/%" PRIaPGNO - ", geo %" PRIaPGNO "/%" PRIaPGNO "-%" PRIaPGNO "/%" PRIaPGNO - " +%u -%u, txn_id %" PRIaTXN ", %s", - data_page(target)->mp_pgno, pending->mm_dbs[MAIN_DBI].md_root, - pending->mm_dbs[FREE_DBI].md_root, pending->mm_geo.lower, - pending->mm_geo.next, pending->mm_geo.now, pending->mm_geo.upper, - pv2pages(pending->mm_geo.grow_pv), pv2pages(pending->mm_geo.shrink_pv), - pending->unsafe_txnid, durable_caption(pending)); +__cold intptr_t mdbx_limits_valsize4page_max(intptr_t pagesize, + MDBX_db_flags_t flags) { + if (pagesize < 1) + pagesize = (intptr_t)mdbx_default_pagesize(); + if (unlikely(pagesize < (intptr_t)MDBX_MIN_PAGESIZE || + pagesize > (intptr_t)MDBX_MAX_PAGESIZE || + !is_powerof2((size_t)pagesize))) + return -1; - DEBUG("meta0: %s, %s, txn_id %" PRIaTXN ", root %" PRIaPGNO "/%" PRIaPGNO, - (meta0 == head.ptr_c) ? "head" - : (meta0 == target) ? "tail" - : "stay", - durable_caption(meta0), constmeta_txnid(meta0), - meta0->mm_dbs[MAIN_DBI].md_root, meta0->mm_dbs[FREE_DBI].md_root); - DEBUG("meta1: %s, %s, txn_id %" PRIaTXN ", root %" PRIaPGNO "/%" PRIaPGNO, - (meta1 == head.ptr_c) ? "head" - : (meta1 == target) ? "tail" - : "stay", - durable_caption(meta1), constmeta_txnid(meta1), - meta1->mm_dbs[MAIN_DBI].md_root, meta1->mm_dbs[FREE_DBI].md_root); - DEBUG("meta2: %s, %s, txn_id %" PRIaTXN ", root %" PRIaPGNO "/%" PRIaPGNO, - (meta2 == head.ptr_c) ? "head" - : (meta2 == target) ? 
"tail" - : "stay", - durable_caption(meta2), constmeta_txnid(meta2), - meta2->mm_dbs[MAIN_DBI].md_root, meta2->mm_dbs[FREE_DBI].md_root); + if (flags & + (MDBX_DUPSORT | MDBX_DUPFIXED | MDBX_INTEGERDUP | MDBX_REVERSEDUP)) + return valsize_max(pagesize, flags); - eASSERT(env, pending->unsafe_txnid != constmeta_txnid(meta0) || - (META_IS_STEADY(pending) && !META_IS_STEADY(meta0))); - eASSERT(env, pending->unsafe_txnid != constmeta_txnid(meta1) || - (META_IS_STEADY(pending) && !META_IS_STEADY(meta1))); - eASSERT(env, pending->unsafe_txnid != constmeta_txnid(meta2) || - (META_IS_STEADY(pending) && !META_IS_STEADY(meta2))); + return PAGESPACE(pagesize); +} - eASSERT(env, ((env->me_flags ^ flags) & MDBX_WRITEMAP) == 0); - ENSURE(env, target == head.ptr_c || - constmeta_txnid(target) < pending->unsafe_txnid); - if (flags & MDBX_WRITEMAP) { - jitter4testing(true); - if (likely(target != head.ptr_c)) { - /* LY: 'invalidate' the meta. */ - meta_update_begin(env, target, pending->unsafe_txnid); - unaligned_poke_u64(4, target->mm_sign, MDBX_DATASIGN_WEAK); -#ifndef NDEBUG - /* debug: provoke failure to catch a violators, but don't touch mm_psize - * to allow readers catch actual pagesize. 
*/ - void *provoke_begin = &target->mm_dbs[FREE_DBI].md_root; - void *provoke_end = &target->mm_sign; - memset(provoke_begin, 0xCC, ptr_dist(provoke_end, provoke_begin)); - jitter4testing(false); -#endif +__cold int mdbx_env_get_valsize4page_max(const MDBX_env *env, + MDBX_db_flags_t flags) { + if (unlikely(!env || env->signature.weak != env_signature)) + return -1; - /* LY: update info */ - target->mm_geo = pending->mm_geo; - target->mm_dbs[FREE_DBI] = pending->mm_dbs[FREE_DBI]; - target->mm_dbs[MAIN_DBI] = pending->mm_dbs[MAIN_DBI]; - eASSERT(env, target->mm_dbs[FREE_DBI].md_flags == MDBX_INTEGERKEY); - eASSERT(env, db_check_flags(target->mm_dbs[MAIN_DBI].md_flags)); - target->mm_canary = pending->mm_canary; - memcpy(target->mm_pages_retired, pending->mm_pages_retired, 8); - jitter4testing(true); - - /* LY: 'commit' the meta */ - meta_update_end(env, target, unaligned_peek_u64(4, pending->mm_txnid_b)); - jitter4testing(true); - eASSERT(env, check_meta_coherency(env, target, true)); - } else { - /* dangerous case (target == head), only mm_sign could - * me updated, check assertions once again */ - eASSERT(env, - legal4overwrite && !head.is_steady && META_IS_STEADY(pending)); - } - memcpy(target->mm_sign, pending->mm_sign, 8); - osal_flush_incoherent_cpu_writeback(); - jitter4testing(true); - if (!env->me_incore) { - if (!MDBX_AVOID_MSYNC) { - /* sync meta-pages */ -#if MDBX_ENABLE_PGOP_STAT - env->me_lck->mti_pgop_stat.msync.weak += 1; -#endif /* MDBX_ENABLE_PGOP_STAT */ - rc = osal_msync( - &env->me_dxb_mmap, 0, pgno_align2os_bytes(env, NUM_METAS), - (flags & MDBX_NOMETASYNC) ? 
MDBX_SYNC_NONE - : MDBX_SYNC_DATA | MDBX_SYNC_IODQ); - } else { -#if MDBX_ENABLE_PGOP_STAT - env->me_lck->mti_pgop_stat.wops.weak += 1; -#endif /* MDBX_ENABLE_PGOP_STAT */ - const MDBX_page *page = data_page(target); - rc = osal_pwrite(env->me_fd4meta, page, env->me_psize, - ptr_dist(page, env->me_map)); - if (likely(rc == MDBX_SUCCESS)) { - osal_flush_incoherent_mmap(target, sizeof(MDBX_meta), - env->me_os_psize); - if ((flags & MDBX_NOMETASYNC) == 0 && - env->me_fd4meta == env->me_lazy_fd) { -#if MDBX_ENABLE_PGOP_STAT - env->me_lck->mti_pgop_stat.fsync.weak += 1; -#endif /* MDBX_ENABLE_PGOP_STAT */ - rc = osal_fsync(env->me_lazy_fd, MDBX_SYNC_DATA | MDBX_SYNC_IODQ); - } - } - } - if (unlikely(rc != MDBX_SUCCESS)) - goto fail; - } - } else { -#if MDBX_ENABLE_PGOP_STAT - env->me_lck->mti_pgop_stat.wops.weak += 1; -#endif /* MDBX_ENABLE_PGOP_STAT */ - const MDBX_meta undo_meta = *target; - eASSERT(env, pending->mm_dbs[FREE_DBI].md_flags == MDBX_INTEGERKEY); - eASSERT(env, db_check_flags(pending->mm_dbs[MAIN_DBI].md_flags)); - rc = osal_pwrite(env->me_fd4meta, pending, sizeof(MDBX_meta), - ptr_dist(target, env->me_map)); - if (unlikely(rc != MDBX_SUCCESS)) { - undo: - DEBUG("%s", "write failed, disk error?"); - /* On a failure, the pagecache still contains the new data. - * Try write some old data back, to prevent it from being used. 
*/ - osal_pwrite(env->me_fd4meta, &undo_meta, sizeof(MDBX_meta), - ptr_dist(target, env->me_map)); - goto fail; - } - osal_flush_incoherent_mmap(target, sizeof(MDBX_meta), env->me_os_psize); - /* sync meta-pages */ - if ((flags & MDBX_NOMETASYNC) == 0 && env->me_fd4meta == env->me_lazy_fd && - !env->me_incore) { -#if MDBX_ENABLE_PGOP_STAT - env->me_lck->mti_pgop_stat.fsync.weak += 1; -#endif /* MDBX_ENABLE_PGOP_STAT */ - rc = osal_fsync(env->me_lazy_fd, MDBX_SYNC_DATA | MDBX_SYNC_IODQ); - if (rc != MDBX_SUCCESS) - goto undo; - } - } - - uint64_t timestamp = 0; - while ("workaround for https://libmdbx.dqdkfa.ru/dead-github/issues/269") { - rc = coherency_check_written(env, pending->unsafe_txnid, target, - bytes2pgno(env, ptr_dist(target, env->me_map)), - ×tamp); - if (likely(rc == MDBX_SUCCESS)) - break; - if (unlikely(rc != MDBX_RESULT_TRUE)) - goto fail; - } - - const uint32_t sync_txnid_dist = - ((flags & MDBX_NOMETASYNC) == 0) ? 0 - : ((flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC) - ? 
MDBX_NOMETASYNC_LAZY_FD - : MDBX_NOMETASYNC_LAZY_WRITEMAP; - env->me_lck->mti_meta_sync_txnid.weak = - pending->mm_txnid_a[__BYTE_ORDER__ != __ORDER_LITTLE_ENDIAN__].weak - - sync_txnid_dist; - - *troika = meta_tap(env); - for (MDBX_txn *txn = env->me_txn0; txn; txn = txn->mt_child) - if (troika != &txn->tw.troika) - txn->tw.troika = *troika; - - /* LY: shrink datafile if needed */ - if (unlikely(shrink)) { - VERBOSE("shrink to %" PRIaPGNO " pages (-%" PRIaPGNO ")", - pending->mm_geo.now, shrink); - rc = dxb_resize(env, pending->mm_geo.next, pending->mm_geo.now, - pending->mm_geo.upper, impilict_shrink); - if (rc != MDBX_SUCCESS && rc != MDBX_EPERM) - goto fail; - eASSERT(env, check_meta_coherency(env, target, true)); - } - - MDBX_lockinfo *const lck = env->me_lck_mmap.lck; - if (likely(lck)) - /* toggle oldest refresh */ - atomic_store32(&lck->mti_readers_refresh_flag, false, mo_Relaxed); + return (int)mdbx_limits_valsize4page_max((intptr_t)env->ps, flags); +} - return MDBX_SUCCESS; +/*----------------------------------------------------------------------------*/ -fail: - env->me_flags |= MDBX_FATAL_ERROR; - return rc; +__cold static void stat_add(const tree_t *db, MDBX_stat *const st, + const size_t bytes) { + st->ms_depth += db->height; + st->ms_branch_pages += db->branch_pages; + st->ms_leaf_pages += db->leaf_pages; + st->ms_overflow_pages += db->large_pages; + st->ms_entries += db->items; + if (likely(bytes >= + offsetof(MDBX_stat, ms_mod_txnid) + sizeof(st->ms_mod_txnid))) + st->ms_mod_txnid = + (st->ms_mod_txnid > db->mod_txnid) ? st->ms_mod_txnid : db->mod_txnid; } -static void recalculate_merge_threshold(MDBX_env *env) { - const size_t bytes = page_space(env); - env->me_merge_threshold = - (uint16_t)(bytes - - (bytes * env->me_options.merge_threshold_16dot16_percent >> - 16)); - env->me_merge_threshold_gc = - (uint16_t)(bytes - - ((env->me_options.merge_threshold_16dot16_percent > 19005) - ? 
bytes / 3 /* 33 % */ - : bytes / 4 /* 25 % */)); -} +__cold static int stat_acc(const MDBX_txn *txn, MDBX_stat *st, size_t bytes) { + int err = check_txn(txn, MDBX_TXN_BLOCKED); + if (unlikely(err != MDBX_SUCCESS)) + return err; -__cold static void setup_pagesize(MDBX_env *env, const size_t pagesize) { - STATIC_ASSERT(PTRDIFF_MAX > MAX_MAPSIZE); - STATIC_ASSERT(MIN_PAGESIZE > sizeof(MDBX_page) + sizeof(MDBX_meta)); - ENSURE(env, is_powerof2(pagesize)); - ENSURE(env, pagesize >= MIN_PAGESIZE); - ENSURE(env, pagesize <= MAX_PAGESIZE); - env->me_psize = (unsigned)pagesize; - if (env->me_pbuf) { - osal_memalign_free(env->me_pbuf); - env->me_pbuf = nullptr; + cursor_couple_t cx; + err = cursor_init(&cx.outer, (MDBX_txn *)txn, MAIN_DBI); + if (unlikely(err != MDBX_SUCCESS)) + return err; + + const MDBX_env *const env = txn->env; + st->ms_psize = env->ps; + TXN_FOREACH_DBI_FROM( + txn, dbi, + /* assuming GC is internal and not subject for accounting */ MAIN_DBI) { + if ((txn->dbi_state[dbi] & (DBI_VALID | DBI_STALE)) == DBI_VALID) + stat_add(txn->dbs + dbi, st, bytes); } - STATIC_ASSERT(MAX_GC1OVPAGE(MIN_PAGESIZE) > 4); - STATIC_ASSERT(MAX_GC1OVPAGE(MAX_PAGESIZE) < MDBX_PGL_LIMIT); - const intptr_t maxgc_ov1page = (pagesize - PAGEHDRSZ) / sizeof(pgno_t) - 1; - ENSURE(env, - maxgc_ov1page > 42 && maxgc_ov1page < (intptr_t)MDBX_PGL_LIMIT / 4); - env->me_maxgc_ov1page = (unsigned)maxgc_ov1page; - env->me_maxgc_per_branch = - (unsigned)((pagesize - PAGEHDRSZ) / - (sizeof(indx_t) + sizeof(MDBX_node) + sizeof(txnid_t))); + if (!(txn->dbs[MAIN_DBI].flags & MDBX_DUPSORT) && + txn->dbs[MAIN_DBI].items /* TODO: use `md_subs` field */) { - STATIC_ASSERT(LEAF_NODE_MAX(MIN_PAGESIZE) > sizeof(MDBX_db) + NODESIZE + 42); - STATIC_ASSERT(LEAF_NODE_MAX(MAX_PAGESIZE) < UINT16_MAX); - STATIC_ASSERT(LEAF_NODE_MAX(MIN_PAGESIZE) >= BRANCH_NODE_MAX(MIN_PAGESIZE)); - STATIC_ASSERT(BRANCH_NODE_MAX(MAX_PAGESIZE) > NODESIZE + 42); - STATIC_ASSERT(BRANCH_NODE_MAX(MAX_PAGESIZE) < UINT16_MAX); - const 
intptr_t branch_nodemax = BRANCH_NODE_MAX(pagesize); - const intptr_t leaf_nodemax = LEAF_NODE_MAX(pagesize); - ENSURE(env, branch_nodemax > (intptr_t)(NODESIZE + 42) && - branch_nodemax % 2 == 0 && - leaf_nodemax > (intptr_t)(sizeof(MDBX_db) + NODESIZE + 42) && - leaf_nodemax >= branch_nodemax && - leaf_nodemax < (int)UINT16_MAX && leaf_nodemax % 2 == 0); - env->me_leaf_nodemax = (uint16_t)leaf_nodemax; - env->me_branch_nodemax = (uint16_t)branch_nodemax; - env->me_psize2log = (uint8_t)log2n_powerof2(pagesize); - eASSERT(env, pgno2bytes(env, 1) == pagesize); - eASSERT(env, bytes2pgno(env, pagesize + pagesize) == 2); - recalculate_merge_threshold(env); + /* scan and account not opened named subDBs */ + err = tree_search(&cx.outer, nullptr, Z_FIRST); + while (err == MDBX_SUCCESS) { + const page_t *mp = cx.outer.pg[cx.outer.top]; + for (size_t i = 0; i < page_numkeys(mp); i++) { + const node_t *node = page_node(mp, i); + if (node_flags(node) != N_SUBDATA) + continue; + if (unlikely(node_ds(node) != sizeof(tree_t))) { + ERROR("%s/%d: %s %zu", "MDBX_CORRUPTED", MDBX_CORRUPTED, + "invalid subDb node size", node_ds(node)); + return MDBX_CORRUPTED; + } - /* TODO: recalculate me_subpage_xyz values from MDBX_opt_subpage_xyz. 
*/ - env->me_subpage_limit = env->me_leaf_nodemax - NODESIZE; - env->me_subpage_room_threshold = 0; - env->me_subpage_reserve_prereq = env->me_leaf_nodemax; - env->me_subpage_reserve_limit = env->me_subpage_limit / 42; - eASSERT(env, - env->me_subpage_reserve_prereq > - env->me_subpage_room_threshold + env->me_subpage_reserve_limit); - eASSERT(env, env->me_leaf_nodemax >= env->me_subpage_limit + NODESIZE); + /* skip opened and already accounted */ + const MDBX_val name = {node_key(node), node_ks(node)}; + TXN_FOREACH_DBI_USER(txn, dbi) { + if ((txn->dbi_state[dbi] & (DBI_VALID | DBI_STALE)) == DBI_VALID && + env->kvs[MAIN_DBI].clc.k.cmp(&name, &env->kvs[dbi].name) == 0) { + node = nullptr; + break; + } + } - const pgno_t max_pgno = bytes2pgno(env, MAX_MAPSIZE); - if (!env->me_options.flags.non_auto.dp_limit) { - /* auto-setup dp_limit by "The42" ;-) */ - intptr_t total_ram_pages, avail_ram_pages; - int err = mdbx_get_sysraminfo(nullptr, &total_ram_pages, &avail_ram_pages); - if (unlikely(err != MDBX_SUCCESS)) - ERROR("mdbx_get_sysraminfo(), rc %d", err); - else { - size_t reasonable_dpl_limit = - (size_t)(total_ram_pages + avail_ram_pages) / 42; - if (pagesize > env->me_os_psize) - reasonable_dpl_limit /= pagesize / env->me_os_psize; - else if (pagesize < env->me_os_psize) - reasonable_dpl_limit *= env->me_os_psize / pagesize; - reasonable_dpl_limit = (reasonable_dpl_limit < MDBX_PGL_LIMIT) - ? reasonable_dpl_limit - : MDBX_PGL_LIMIT; - reasonable_dpl_limit = (reasonable_dpl_limit > CURSOR_STACK * 4) - ? 
reasonable_dpl_limit - : CURSOR_STACK * 4; - env->me_options.dp_limit = (unsigned)reasonable_dpl_limit; + if (node) { + tree_t db; + memcpy(&db, node_data(node), sizeof(db)); + stat_add(&db, st, bytes); + } + } + err = cursor_sibling_right(&cx.outer); } + if (unlikely(err != MDBX_NOTFOUND)) + return err; } - if (env->me_options.dp_limit > max_pgno - NUM_METAS) - env->me_options.dp_limit = max_pgno - NUM_METAS; - if (env->me_options.dp_initial > env->me_options.dp_limit) - env->me_options.dp_initial = env->me_options.dp_limit; + + return MDBX_SUCCESS; } -__cold int mdbx_env_create(MDBX_env **penv) { - if (unlikely(!penv)) +__cold int mdbx_env_stat_ex(const MDBX_env *env, const MDBX_txn *txn, + MDBX_stat *dest, size_t bytes) { + if (unlikely(!dest)) + return MDBX_EINVAL; + const size_t size_before_modtxnid = offsetof(MDBX_stat, ms_mod_txnid); + if (unlikely(bytes != sizeof(MDBX_stat)) && bytes != size_before_modtxnid) return MDBX_EINVAL; - *penv = nullptr; - -#ifdef MDBX_HAVE_C11ATOMICS - if (unlikely(!atomic_is_lock_free((const volatile uint32_t *)penv))) { - ERROR("lock-free atomic ops for %u-bit types is required", 32); - return MDBX_INCOMPATIBLE; - } -#if MDBX_64BIT_ATOMIC - if (unlikely(!atomic_is_lock_free((const volatile uint64_t *)penv))) { - ERROR("lock-free atomic ops for %u-bit types is required", 64); - return MDBX_INCOMPATIBLE; - } -#endif /* MDBX_64BIT_ATOMIC */ -#endif /* MDBX_HAVE_C11ATOMICS */ - const size_t os_psize = osal_syspagesize(); - if (unlikely(!is_powerof2(os_psize) || os_psize < MIN_PAGESIZE)) { - ERROR("unsuitable system pagesize %" PRIuPTR, os_psize); - return MDBX_INCOMPATIBLE; + if (likely(txn)) { + if (env && unlikely(txn->env != env)) + return MDBX_EINVAL; + return stat_acc(txn, dest, bytes); } -#if defined(__linux__) || defined(__gnu_linux__) - if (unlikely(linux_kernel_version < 0x04000000)) { - /* 2022-09-01: Прошло уже больше двух после окончания какой-либо поддержки - * самого "долгоиграющего" ядра 3.16.85 ветки 3.x */ - 
ERROR("too old linux kernel %u.%u.%u.%u, the >= 4.0.0 is required", - linux_kernel_version >> 24, (linux_kernel_version >> 16) & 255, - (linux_kernel_version >> 8) & 255, linux_kernel_version & 255); - return MDBX_INCOMPATIBLE; - } -#endif /* Linux */ + int err = check_env(env, true); + if (unlikely(err != MDBX_SUCCESS)) + return err; - MDBX_env *env = osal_calloc(1, sizeof(MDBX_env)); - if (unlikely(!env)) - return MDBX_ENOMEM; + if (env->txn && env_txn0_owned(env)) + /* inside write-txn */ + return stat_acc(env->txn, dest, bytes); - env->me_maxreaders = DEFAULT_READERS; - env->me_maxdbs = env->me_numdbs = CORE_DBS; - env->me_lazy_fd = env->me_dsync_fd = env->me_fd4meta = env->me_lfd = - INVALID_HANDLE_VALUE; - env->me_stuck_meta = -1; + MDBX_txn *tmp_txn; + err = mdbx_txn_begin((MDBX_env *)env, nullptr, MDBX_TXN_RDONLY, &tmp_txn); + if (unlikely(err != MDBX_SUCCESS)) + return err; - env->me_options.rp_augment_limit = MDBX_PNL_INITIAL; - env->me_options.dp_reserve_limit = MDBX_PNL_INITIAL; - env->me_options.dp_initial = MDBX_PNL_INITIAL; - env->me_options.spill_max_denominator = 8; - env->me_options.spill_min_denominator = 8; - env->me_options.spill_parent4child_denominator = 0; - env->me_options.dp_loose_limit = 64; - env->me_options.merge_threshold_16dot16_percent = 65536 / 4 /* 25% */; + const int rc = stat_acc(tmp_txn, dest, bytes); + err = mdbx_txn_abort(tmp_txn); + if (unlikely(err != MDBX_SUCCESS)) + return err; + return rc; +} -#if !(defined(_WIN32) || defined(_WIN64)) - env->me_options.writethrough_threshold = -#if defined(__linux__) || defined(__gnu_linux__) - mdbx_RunningOnWSL1 ? MAX_PAGENO : -#endif /* Linux */ - MDBX_WRITETHROUGH_THRESHOLD_DEFAULT; -#endif /* Windows */ +/*----------------------------------------------------------------------------*/ - env->me_os_psize = (unsigned)os_psize; - setup_pagesize(env, (env->me_os_psize < MAX_PAGESIZE) ? 
env->me_os_psize - : MAX_PAGESIZE); +static size_t estimate_rss(size_t database_bytes) { + return database_bytes + database_bytes / 64 + + (512 + MDBX_WORDBITS * 16) * MEGABYTE; +} - int rc = osal_fastmutex_init(&env->me_dbi_lock); - if (unlikely(rc != MDBX_SUCCESS)) - goto bailout; +__cold int mdbx_env_warmup(const MDBX_env *env, const MDBX_txn *txn, + MDBX_warmup_flags_t flags, + unsigned timeout_seconds_16dot16) { + if (unlikely(env == nullptr && txn == nullptr)) + return MDBX_EINVAL; + if (unlikely(flags > + (MDBX_warmup_force | MDBX_warmup_oomsafe | MDBX_warmup_lock | + MDBX_warmup_touchlimit | MDBX_warmup_release))) + return MDBX_EINVAL; -#if defined(_WIN32) || defined(_WIN64) - osal_srwlock_Init(&env->me_remap_guard); - InitializeCriticalSection(&env->me_windowsbug_lock); -#else - rc = osal_fastmutex_init(&env->me_remap_guard); - if (unlikely(rc != MDBX_SUCCESS)) { - osal_fastmutex_destroy(&env->me_dbi_lock); - goto bailout; + if (txn) { + int err = check_txn(txn, MDBX_TXN_BLOCKED - MDBX_TXN_ERROR); + if (unlikely(err != MDBX_SUCCESS)) + return err; } - -#if MDBX_LOCKING > MDBX_LOCKING_SYSV - MDBX_lockinfo *const stub = lckless_stub(env); - rc = osal_ipclock_stubinit(&stub->mti_wlock); -#endif /* MDBX_LOCKING */ - if (unlikely(rc != MDBX_SUCCESS)) { - osal_fastmutex_destroy(&env->me_remap_guard); - osal_fastmutex_destroy(&env->me_dbi_lock); - goto bailout; + if (env) { + int err = check_env(env, false); + if (unlikely(err != MDBX_SUCCESS)) + return err; + if (txn && unlikely(txn->env != env)) + return MDBX_EINVAL; + } else { + env = txn->env; } -#endif /* Windows */ - VALGRIND_CREATE_MEMPOOL(env, 0, 0); - env->me_signature.weak = MDBX_ME_SIGNATURE; - *penv = env; - return MDBX_SUCCESS; + const uint64_t timeout_monotime = + (timeout_seconds_16dot16 && (flags & MDBX_warmup_force)) + ? 
osal_monotime() + osal_16dot16_to_monotime(timeout_seconds_16dot16) + : 0; -bailout: - osal_free(env); - return rc; -} + if (flags & MDBX_warmup_release) + munlock_all(env); -__cold static intptr_t get_reasonable_db_maxsize(intptr_t *cached_result) { - if (*cached_result == 0) { - intptr_t pagesize, total_ram_pages; - if (unlikely(mdbx_get_sysraminfo(&pagesize, &total_ram_pages, nullptr) != - MDBX_SUCCESS)) - return *cached_result = MAX_MAPSIZE32 /* the 32-bit limit is good enough - for fallback */ - ; + pgno_t used_pgno; + if (txn) { + used_pgno = txn->geo.first_unallocated; + } else { + const troika_t troika = meta_tap(env); + used_pgno = meta_recent(env, &troika).ptr_v->geometry.first_unallocated; + } + const size_t used_range = pgno_align2os_bytes(env, used_pgno); + const pgno_t mlock_pgno = bytes2pgno(env, used_range); - if (unlikely((size_t)total_ram_pages * 2 > MAX_MAPSIZE / (size_t)pagesize)) - return *cached_result = MAX_MAPSIZE; - assert(MAX_MAPSIZE >= (size_t)(total_ram_pages * pagesize * 2)); - - /* Suggesting should not be more than golden ratio of the size of RAM. */ - *cached_result = (intptr_t)((size_t)total_ram_pages * 207 >> 7) * pagesize; + int rc = MDBX_SUCCESS; + if (flags & MDBX_warmup_touchlimit) { + const size_t estimated_rss = estimate_rss(used_range); +#if defined(_WIN32) || defined(_WIN64) + SIZE_T current_ws_lower, current_ws_upper; + if (GetProcessWorkingSetSize(GetCurrentProcess(), ¤t_ws_lower, + ¤t_ws_upper) && + current_ws_lower < estimated_rss) { + const SIZE_T ws_lower = estimated_rss; + const SIZE_T ws_upper = + (MDBX_WORDBITS == 32 && ws_lower > MEGABYTE * 2048) + ? 
ws_lower + : ws_lower + MDBX_WORDBITS * MEGABYTE * 32; + if (!SetProcessWorkingSetSize(GetCurrentProcess(), ws_lower, ws_upper)) { + rc = (int)GetLastError(); + WARNING("SetProcessWorkingSetSize(%zu, %zu) error %d", ws_lower, + ws_upper, rc); + } + } +#endif /* Windows */ +#ifdef RLIMIT_RSS + struct rlimit rss; + if (getrlimit(RLIMIT_RSS, &rss) == 0 && rss.rlim_cur < estimated_rss) { + rss.rlim_cur = estimated_rss; + if (rss.rlim_max < estimated_rss) + rss.rlim_max = estimated_rss; + if (setrlimit(RLIMIT_RSS, &rss)) { + rc = errno; + WARNING("setrlimit(%s, {%zu, %zu}) error %d", "RLIMIT_RSS", + (size_t)rss.rlim_cur, (size_t)rss.rlim_max, rc); + } + } +#endif /* RLIMIT_RSS */ +#ifdef RLIMIT_MEMLOCK + if (flags & MDBX_warmup_lock) { + struct rlimit memlock; + if (getrlimit(RLIMIT_MEMLOCK, &memlock) == 0 && + memlock.rlim_cur < estimated_rss) { + memlock.rlim_cur = estimated_rss; + if (memlock.rlim_max < estimated_rss) + memlock.rlim_max = estimated_rss; + if (setrlimit(RLIMIT_MEMLOCK, &memlock)) { + rc = errno; + WARNING("setrlimit(%s, {%zu, %zu}) error %d", "RLIMIT_MEMLOCK", + (size_t)memlock.rlim_cur, (size_t)memlock.rlim_max, rc); + } + } + } +#endif /* RLIMIT_MEMLOCK */ + (void)estimated_rss; + } - /* Round to the nearest human-readable granulation. */ - for (size_t unit = MEGABYTE; unit; unit <<= 5) { - const size_t floor = floor_powerof2(*cached_result, unit); - const size_t ceil = ceil_powerof2(*cached_result, unit); - const size_t threshold = (size_t)*cached_result >> 4; - const bool down = - *cached_result - floor < ceil - *cached_result || ceil > MAX_MAPSIZE; - if (threshold < (down ? *cached_result - floor : ceil - *cached_result)) - break; - *cached_result = down ? 
floor : ceil; +#if defined(MLOCK_ONFAULT) && \ + ((defined(_GNU_SOURCE) && __GLIBC_PREREQ(2, 27)) || \ + (defined(__ANDROID_API__) && __ANDROID_API__ >= 30)) && \ + (defined(__linux__) || defined(__gnu_linux__)) + if ((flags & MDBX_warmup_lock) != 0 && + globals.linux_kernel_version >= 0x04040000 && + atomic_load32(&env->mlocked_pgno, mo_AcquireRelease) < mlock_pgno) { + if (mlock2(env->dxb_mmap.base, used_range, MLOCK_ONFAULT)) { + rc = errno; + WARNING("mlock2(%zu, %s) error %d", used_range, "MLOCK_ONFAULT", rc); + } else { + update_mlcnt(env, mlock_pgno, true); + rc = MDBX_SUCCESS; } + if (rc != EINVAL) + flags -= MDBX_warmup_lock; } - return *cached_result; -} +#endif /* MLOCK_ONFAULT */ -__cold int mdbx_env_set_geometry(MDBX_env *env, intptr_t size_lower, - intptr_t size_now, intptr_t size_upper, - intptr_t growth_step, - intptr_t shrink_threshold, intptr_t pagesize) { - int rc = check_env(env, false); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; + int err = MDBX_ENOSYS; +#if MDBX_ENABLE_MADVISE + err = dxb_set_readahead(env, used_pgno, true, true); +#else +#if defined(_WIN32) || defined(_WIN64) + if (imports.PrefetchVirtualMemory) { + WIN32_MEMORY_RANGE_ENTRY hint; + hint.VirtualAddress = env->dxb_mmap.base; + hint.NumberOfBytes = used_range; + if (imports.PrefetchVirtualMemory(GetCurrentProcess(), 1, &hint, 0)) + err = MDBX_SUCCESS; + else { + err = (int)GetLastError(); + ERROR("%s(%zu) error %d", "PrefetchVirtualMemory", used_range, err); + } + } +#endif /* Windows */ - const bool txn0_owned = env->me_txn0 && env_txn0_owned(env); - const bool inside_txn = txn0_owned && env->me_txn; - bool should_unlock = false; +#if defined(POSIX_MADV_WILLNEED) + err = posix_madvise(env->dxb_mmap.base, used_range, POSIX_MADV_WILLNEED) + ? ignore_enosys(errno) + : MDBX_SUCCESS; +#elif defined(MADV_WILLNEED) + err = madvise(env->dxb_mmap.base, used_range, MADV_WILLNEED) + ? 
ignore_enosys(errno) + : MDBX_SUCCESS; +#endif -#if MDBX_DEBUG - if (growth_step < 0) { - growth_step = 1; - if (shrink_threshold < 0) - shrink_threshold = 1; +#if defined(F_RDADVISE) + if (err) { + fcntl(env->lazy_fd, F_RDAHEAD, true); + struct radvisory hint; + hint.ra_offset = 0; + hint.ra_count = unlikely(used_range > INT_MAX && + sizeof(used_range) > sizeof(hint.ra_count)) + ? INT_MAX + : (int)used_range; + err = fcntl(env->lazy_fd, F_RDADVISE, &hint) ? ignore_enosys(errno) + : MDBX_SUCCESS; + if (err == ENOTTY) + err = MDBX_SUCCESS /* Ignore ENOTTY for DB on the ram-disk */; } -#endif /* MDBX_DEBUG */ +#endif /* F_RDADVISE */ +#endif /* MDBX_ENABLE_MADVISE */ + if (err != MDBX_SUCCESS && rc == MDBX_SUCCESS) + rc = err; - intptr_t reasonable_maxsize = 0; - if (env->me_map) { - /* env already mapped */ - if (unlikely(env->me_flags & MDBX_RDONLY)) - return MDBX_EACCESS; + if ((flags & MDBX_warmup_force) != 0 && + (rc == MDBX_SUCCESS || rc == MDBX_ENOSYS)) { + const volatile uint8_t *ptr = env->dxb_mmap.base; + size_t offset = 0, unused = 42; +#if !(defined(_WIN32) || defined(_WIN64)) + if (flags & MDBX_warmup_oomsafe) { + const int null_fd = open("/dev/null", O_WRONLY); + if (unlikely(null_fd < 0)) + rc = errno; + else { + struct iovec iov[MDBX_AUXILARY_IOV_MAX]; + for (;;) { + unsigned i; + for (i = 0; i < MDBX_AUXILARY_IOV_MAX && offset < used_range; ++i) { + iov[i].iov_base = (void *)(ptr + offset); + iov[i].iov_len = 1; + offset += globals.sys_pagesize; + } + if (unlikely(writev(null_fd, iov, i) < 0)) { + rc = errno; + if (rc == EFAULT) + rc = ENOMEM; + break; + } + if (offset >= used_range) { + rc = MDBX_SUCCESS; + break; + } + if (timeout_seconds_16dot16 && osal_monotime() > timeout_monotime) { + rc = MDBX_RESULT_TRUE; + break; + } + } + close(null_fd); + } + } else +#endif /* Windows */ + for (;;) { + unused += ptr[offset]; + offset += globals.sys_pagesize; + if (offset >= used_range) { + rc = MDBX_SUCCESS; + break; + } + if (timeout_seconds_16dot16 && 
osal_monotime() > timeout_monotime) { + rc = MDBX_RESULT_TRUE; + break; + } + } + (void)unused; + } - if (!txn0_owned) { - int err = osal_txn_lock(env, false); - if (unlikely(err != MDBX_SUCCESS)) - return err; - should_unlock = true; - env->me_txn0->tw.troika = meta_tap(env); - eASSERT(env, !env->me_txn && !env->me_txn0->mt_child); - env->me_txn0->mt_txnid = - env->me_txn0->tw.troika.txnid[env->me_txn0->tw.troika.recent]; - txn_oldest_reader(env->me_txn0); + if ((flags & MDBX_warmup_lock) != 0 && + (rc == MDBX_SUCCESS || rc == MDBX_ENOSYS) && + atomic_load32(&env->mlocked_pgno, mo_AcquireRelease) < mlock_pgno) { +#if defined(_WIN32) || defined(_WIN64) + if (VirtualLock(env->dxb_mmap.base, used_range)) { + update_mlcnt(env, mlock_pgno, true); + rc = MDBX_SUCCESS; + } else { + rc = (int)GetLastError(); + WARNING("%s(%zu) error %d", "VirtualLock", used_range, rc); + } +#elif defined(_POSIX_MEMLOCK_RANGE) + if (mlock(env->dxb_mmap.base, used_range) == 0) { + update_mlcnt(env, mlock_pgno, true); + rc = MDBX_SUCCESS; + } else { + rc = errno; + WARNING("%s(%zu) error %d", "mlock", used_range, rc); } +#else + rc = MDBX_ENOSYS; +#endif + } - /* get untouched params from current TXN or DB */ - if (pagesize <= 0 || pagesize >= INT_MAX) - pagesize = env->me_psize; - const MDBX_geo *const geo = - inside_txn ? 
&env->me_txn->mt_geo - : &meta_recent(env, &env->me_txn0->tw.troika).ptr_c->mm_geo; - if (size_lower < 0) - size_lower = pgno2bytes(env, geo->lower); - if (size_now < 0) - size_now = pgno2bytes(env, geo->now); - if (size_upper < 0) - size_upper = pgno2bytes(env, geo->upper); - if (growth_step < 0) - growth_step = pgno2bytes(env, pv2pages(geo->grow_pv)); - if (shrink_threshold < 0) - shrink_threshold = pgno2bytes(env, pv2pages(geo->shrink_pv)); + return rc; +} - if (pagesize != (intptr_t)env->me_psize) { - rc = MDBX_EINVAL; - goto bailout; - } - const size_t usedbytes = - pgno2bytes(env, find_largest_snapshot(env, geo->next)); - if ((size_t)size_upper < usedbytes) { - rc = MDBX_MAP_FULL; - goto bailout; - } - if ((size_t)size_now < usedbytes) - size_now = usedbytes; - } else { - /* env NOT yet mapped */ - if (unlikely(inside_txn)) - return MDBX_PANIC; +/*----------------------------------------------------------------------------*/ - /* is requested some auto-value for pagesize ? */ - if (pagesize >= INT_MAX /* maximal */) - pagesize = MAX_PAGESIZE; - else if (pagesize <= 0) { - if (pagesize < 0 /* default */) { - pagesize = env->me_os_psize; - if ((uintptr_t)pagesize > MAX_PAGESIZE) - pagesize = MAX_PAGESIZE; - eASSERT(env, (uintptr_t)pagesize >= MIN_PAGESIZE); - } else if (pagesize == 0 /* minimal */) - pagesize = MIN_PAGESIZE; +__cold int mdbx_env_get_fd(const MDBX_env *env, mdbx_filehandle_t *arg) { + int rc = check_env(env, true); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; - /* choose pagesize */ - intptr_t max_size = (size_now > size_lower) ? size_now : size_lower; - max_size = (size_upper > max_size) ? 
size_upper : max_size; - if (max_size < 0 /* default */) - max_size = DEFAULT_MAPSIZE; - else if (max_size == 0 /* minimal */) - max_size = MIN_MAPSIZE; - else if (max_size >= (intptr_t)MAX_MAPSIZE /* maximal */) - max_size = get_reasonable_db_maxsize(&reasonable_maxsize); - - while (max_size > pagesize * (int64_t)(MAX_PAGENO + 1) && - pagesize < MAX_PAGESIZE) - pagesize <<= 1; - } - } + if (unlikely(!arg)) + return MDBX_EINVAL; - if (pagesize < (intptr_t)MIN_PAGESIZE || pagesize > (intptr_t)MAX_PAGESIZE || - !is_powerof2(pagesize)) { - rc = MDBX_EINVAL; - goto bailout; - } + *arg = env->lazy_fd; + return MDBX_SUCCESS; +} - if (size_lower <= 0) { - size_lower = MIN_MAPSIZE; - if (MIN_MAPSIZE / pagesize < MIN_PAGENO) - size_lower = MIN_PAGENO * pagesize; - } - if (size_lower >= INTPTR_MAX) { - size_lower = get_reasonable_db_maxsize(&reasonable_maxsize); - if ((size_t)size_lower / pagesize > MAX_PAGENO + 1) - size_lower = pagesize * (MAX_PAGENO + 1); - } +__cold int mdbx_env_set_flags(MDBX_env *env, MDBX_env_flags_t flags, + bool onoff) { + int rc = check_env(env, false); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; - if (size_now <= 0) { - size_now = size_lower; - if (size_upper >= size_lower && size_now > size_upper) - size_now = size_upper; - } - if (size_now >= INTPTR_MAX) { - size_now = get_reasonable_db_maxsize(&reasonable_maxsize); - if ((size_t)size_now / pagesize > MAX_PAGENO + 1) - size_now = pagesize * (MAX_PAGENO + 1); - } - - if (size_upper <= 0) { - if (size_now >= get_reasonable_db_maxsize(&reasonable_maxsize) / 2) - size_upper = get_reasonable_db_maxsize(&reasonable_maxsize); - else if (MAX_MAPSIZE != MAX_MAPSIZE32 && - (size_t)size_now >= MAX_MAPSIZE32 / 2 && - (size_t)size_now <= MAX_MAPSIZE32 / 4 * 3) - size_upper = MAX_MAPSIZE32; - else { - size_upper = size_now + size_now; - if ((size_t)size_upper < DEFAULT_MAPSIZE * 2) - size_upper = DEFAULT_MAPSIZE * 2; - } - if ((size_t)size_upper / pagesize > (MAX_PAGENO + 1)) - size_upper = pagesize * 
(MAX_PAGENO + 1); - } else if (size_upper >= INTPTR_MAX) { - size_upper = get_reasonable_db_maxsize(&reasonable_maxsize); - if ((size_t)size_upper / pagesize > MAX_PAGENO + 1) - size_upper = pagesize * (MAX_PAGENO + 1); - } + if (unlikely(flags & ((env->flags & ENV_ACTIVE) ? ~ENV_CHANGEABLE_FLAGS + : ~ENV_USABLE_FLAGS))) + return MDBX_EPERM; - if (unlikely(size_lower < (intptr_t)MIN_MAPSIZE || size_lower > size_upper)) { - rc = MDBX_EINVAL; - goto bailout; - } + if (unlikely(env->flags & MDBX_RDONLY)) + return MDBX_EACCESS; - if ((uint64_t)size_lower / pagesize < MIN_PAGENO) { - size_lower = pagesize * MIN_PAGENO; - if (unlikely(size_lower > size_upper)) { - rc = MDBX_EINVAL; - goto bailout; - } - if (size_now < size_lower) - size_now = size_lower; + const bool lock_needed = (env->flags & ENV_ACTIVE) && !env_txn0_owned(env); + bool should_unlock = false; + if (lock_needed) { + rc = lck_txn_lock(env, false); + if (unlikely(rc)) + return rc; + should_unlock = true; } - if (unlikely((size_t)size_upper > MAX_MAPSIZE || - (uint64_t)size_upper / pagesize > MAX_PAGENO + 1)) { - rc = MDBX_TOO_LARGE; - goto bailout; - } + if (onoff) + env->flags = combine_durability_flags(env->flags, flags); + else + env->flags &= ~flags; - const size_t unit = (env->me_os_psize > (size_t)pagesize) ? 
env->me_os_psize - : (size_t)pagesize; - size_lower = ceil_powerof2(size_lower, unit); - size_upper = ceil_powerof2(size_upper, unit); - size_now = ceil_powerof2(size_now, unit); + if (should_unlock) + lck_txn_unlock(env); + return MDBX_SUCCESS; +} - /* LY: подбираем значение size_upper: - * - кратное размеру страницы - * - без нарушения MAX_MAPSIZE и MAX_PAGENO */ - while (unlikely((size_t)size_upper > MAX_MAPSIZE || - (uint64_t)size_upper / pagesize > MAX_PAGENO + 1)) { - if ((size_t)size_upper < unit + MIN_MAPSIZE || - (size_t)size_upper < (size_t)pagesize * (MIN_PAGENO + 1)) { - /* паранойа на случай переполнения при невероятных значениях */ - rc = MDBX_EINVAL; - goto bailout; - } - size_upper -= unit; - if ((size_t)size_upper < (size_t)size_lower) - size_lower = size_upper; - } - eASSERT(env, (size_upper - size_lower) % env->me_os_psize == 0); +__cold int mdbx_env_get_flags(const MDBX_env *env, unsigned *arg) { + int rc = check_env(env, false); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; - if (size_now < size_lower) - size_now = size_lower; - if (size_now > size_upper) - size_now = size_upper; + if (unlikely(!arg)) + return MDBX_EINVAL; - if (growth_step < 0) { - growth_step = ((size_t)(size_upper - size_lower)) / 42; - if (growth_step > size_lower && size_lower < (intptr_t)MEGABYTE) - growth_step = size_lower; - if (growth_step < 65536) - growth_step = 65536; - if ((size_t)growth_step > MAX_MAPSIZE / 64) - growth_step = MAX_MAPSIZE / 64; - } - if (growth_step == 0 && shrink_threshold > 0) - growth_step = 1; - growth_step = ceil_powerof2(growth_step, unit); + *arg = env->flags & ENV_USABLE_FLAGS; + return MDBX_SUCCESS; +} - if (shrink_threshold < 0) - shrink_threshold = growth_step + growth_step; - shrink_threshold = ceil_powerof2(shrink_threshold, unit); +__cold int mdbx_env_set_userctx(MDBX_env *env, void *ctx) { + int rc = check_env(env, false); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; - 
//---------------------------------------------------------------------------- + env->userctx = ctx; + return MDBX_SUCCESS; +} - if (!env->me_map) { - /* save user's geo-params for future open/create */ - if (pagesize != (intptr_t)env->me_psize) - setup_pagesize(env, pagesize); - env->me_dbgeo.lower = size_lower; - env->me_dbgeo.now = size_now; - env->me_dbgeo.upper = size_upper; - env->me_dbgeo.grow = - pgno2bytes(env, pv2pages(pages2pv(bytes2pgno(env, growth_step)))); - env->me_dbgeo.shrink = - pgno2bytes(env, pv2pages(pages2pv(bytes2pgno(env, shrink_threshold)))); - adjust_defaults(env); +__cold void *mdbx_env_get_userctx(const MDBX_env *env) { + return env ? env->userctx : nullptr; +} - ENSURE(env, env->me_dbgeo.lower >= MIN_MAPSIZE); - ENSURE(env, env->me_dbgeo.lower / (unsigned)pagesize >= MIN_PAGENO); - ENSURE(env, env->me_dbgeo.lower % (unsigned)pagesize == 0); - ENSURE(env, env->me_dbgeo.lower % env->me_os_psize == 0); +__cold int mdbx_env_set_assert(MDBX_env *env, MDBX_assert_func *func) { + int rc = check_env(env, false); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; - ENSURE(env, env->me_dbgeo.upper <= MAX_MAPSIZE); - ENSURE(env, env->me_dbgeo.upper / (unsigned)pagesize <= MAX_PAGENO + 1); - ENSURE(env, env->me_dbgeo.upper % (unsigned)pagesize == 0); - ENSURE(env, env->me_dbgeo.upper % env->me_os_psize == 0); +#if MDBX_DEBUG + env->assert_func = func; + return MDBX_SUCCESS; +#else + (void)func; + return MDBX_ENOSYS; +#endif +} - ENSURE(env, env->me_dbgeo.now >= env->me_dbgeo.lower); - ENSURE(env, env->me_dbgeo.now <= env->me_dbgeo.upper); - ENSURE(env, env->me_dbgeo.now % (unsigned)pagesize == 0); - ENSURE(env, env->me_dbgeo.now % env->me_os_psize == 0); +__cold int mdbx_env_set_hsr(MDBX_env *env, MDBX_hsr_func *hsr) { + int rc = check_env(env, false); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; - ENSURE(env, env->me_dbgeo.grow % (unsigned)pagesize == 0); - ENSURE(env, env->me_dbgeo.grow % env->me_os_psize == 0); - ENSURE(env, 
env->me_dbgeo.shrink % (unsigned)pagesize == 0); - ENSURE(env, env->me_dbgeo.shrink % env->me_os_psize == 0); + env->hsr_callback = hsr; + return MDBX_SUCCESS; +} - rc = MDBX_SUCCESS; - } else { - /* apply new params to opened environment */ - ENSURE(env, pagesize == (intptr_t)env->me_psize); - MDBX_meta meta; - memset(&meta, 0, sizeof(meta)); - if (!inside_txn) { - eASSERT(env, should_unlock); - const meta_ptr_t head = meta_recent(env, &env->me_txn0->tw.troika); +__cold MDBX_hsr_func *mdbx_env_get_hsr(const MDBX_env *env) { + return likely(env && env->signature.weak == env_signature) ? env->hsr_callback + : nullptr; +} - uint64_t timestamp = 0; - while ("workaround for " - "https://libmdbx.dqdkfa.ru/dead-github/issues/269") { - rc = coherency_check_head(env->me_txn0, head, ×tamp); - if (likely(rc == MDBX_SUCCESS)) - break; - if (unlikely(rc != MDBX_RESULT_TRUE)) - goto bailout; - } - meta = *head.ptr_c; - const txnid_t txnid = safe64_txnid_next(head.txnid); - if (unlikely(txnid > MAX_TXNID)) { - rc = MDBX_TXN_FULL; - ERROR("txnid overflow, raise %d", rc); - goto bailout; - } - meta_set_txnid(env, &meta, txnid); - } +#if defined(_WIN32) || defined(_WIN64) +__cold int mdbx_env_get_pathW(const MDBX_env *env, const wchar_t **arg) { + int rc = check_env(env, true); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; - const MDBX_geo *const current_geo = - &(env->me_txn ? 
env->me_txn : env->me_txn0)->mt_geo; - /* update env-geo to avoid influences */ - env->me_dbgeo.now = pgno2bytes(env, current_geo->now); - env->me_dbgeo.lower = pgno2bytes(env, current_geo->lower); - env->me_dbgeo.upper = pgno2bytes(env, current_geo->upper); - env->me_dbgeo.grow = pgno2bytes(env, pv2pages(current_geo->grow_pv)); - env->me_dbgeo.shrink = pgno2bytes(env, pv2pages(current_geo->shrink_pv)); + if (unlikely(!arg)) + return MDBX_EINVAL; - MDBX_geo new_geo; - new_geo.lower = bytes2pgno(env, size_lower); - new_geo.now = bytes2pgno(env, size_now); - new_geo.upper = bytes2pgno(env, size_upper); - new_geo.grow_pv = pages2pv(bytes2pgno(env, growth_step)); - new_geo.shrink_pv = pages2pv(bytes2pgno(env, shrink_threshold)); - new_geo.next = current_geo->next; + *arg = env->pathname.specified; + return MDBX_SUCCESS; +} +#endif /* Windows */ - ENSURE(env, pgno_align2os_bytes(env, new_geo.lower) == (size_t)size_lower); - ENSURE(env, pgno_align2os_bytes(env, new_geo.upper) == (size_t)size_upper); - ENSURE(env, pgno_align2os_bytes(env, new_geo.now) == (size_t)size_now); - ENSURE(env, new_geo.grow_pv == pages2pv(pv2pages(new_geo.grow_pv))); - ENSURE(env, new_geo.shrink_pv == pages2pv(pv2pages(new_geo.shrink_pv))); +__cold int mdbx_env_get_path(const MDBX_env *env, const char **arg) { + int rc = check_env(env, true); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; - ENSURE(env, (size_t)size_lower >= MIN_MAPSIZE); - ENSURE(env, new_geo.lower >= MIN_PAGENO); - ENSURE(env, (size_t)size_upper <= MAX_MAPSIZE); - ENSURE(env, new_geo.upper <= MAX_PAGENO + 1); - ENSURE(env, new_geo.now >= new_geo.next); - ENSURE(env, new_geo.upper >= new_geo.now); - ENSURE(env, new_geo.now >= new_geo.lower); + if (unlikely(!arg)) + return MDBX_EINVAL; - if (memcmp(current_geo, &new_geo, sizeof(MDBX_geo)) != 0) { #if defined(_WIN32) || defined(_WIN64) - /* Was DB shrinking disabled before and now it will be enabled? 
*/ - if (new_geo.lower < new_geo.upper && new_geo.shrink_pv && - !(current_geo->lower < current_geo->upper && - current_geo->shrink_pv)) { - if (!env->me_lck_mmap.lck) { - rc = MDBX_EPERM; - goto bailout; - } - int err = osal_rdt_lock(env); - if (unlikely(MDBX_IS_ERROR(err))) { - rc = err; - goto bailout; - } - - /* Check if there are any reading threads that do not use the SRWL */ - const size_t CurrentTid = GetCurrentThreadId(); - const MDBX_reader *const begin = env->me_lck_mmap.lck->mti_readers; - const MDBX_reader *const end = - begin + atomic_load32(&env->me_lck_mmap.lck->mti_numreaders, - mo_AcquireRelease); - for (const MDBX_reader *reader = begin; reader < end; ++reader) { - if (reader->mr_pid.weak == env->me_pid && reader->mr_tid.weak && - reader->mr_tid.weak != CurrentTid) { - /* At least one thread may don't use SRWL */ - rc = MDBX_EPERM; - break; - } - } - - osal_rdt_unlock(env); - if (unlikely(rc != MDBX_SUCCESS)) - goto bailout; - } -#endif /* Windows */ - - if (new_geo.now != current_geo->now || - new_geo.upper != current_geo->upper) { - rc = dxb_resize(env, current_geo->next, new_geo.now, new_geo.upper, - explicit_resize); - if (unlikely(rc != MDBX_SUCCESS)) - goto bailout; - } - if (inside_txn) { - env->me_txn->mt_geo = new_geo; - env->me_txn->mt_flags |= MDBX_TXN_DIRTY; - } else { - meta.mm_geo = new_geo; - rc = sync_locked(env, env->me_flags, &meta, &env->me_txn0->tw.troika); - if (likely(rc == MDBX_SUCCESS)) { - env->me_dbgeo.now = pgno2bytes(env, new_geo.now = meta.mm_geo.now); - env->me_dbgeo.upper = - pgno2bytes(env, new_geo.upper = meta.mm_geo.upper); - } - } + if (!env->pathname_char) { + *arg = nullptr; + DWORD flags = /* WC_ERR_INVALID_CHARS */ 0x80; + size_t mb_len = + WideCharToMultiByte(CP_THREAD_ACP, flags, env->pathname.specified, -1, + nullptr, 0, nullptr, nullptr); + rc = mb_len ? 
MDBX_SUCCESS : (int)GetLastError(); + if (rc == ERROR_INVALID_FLAGS) { + mb_len = + WideCharToMultiByte(CP_THREAD_ACP, flags = 0, env->pathname.specified, + -1, nullptr, 0, nullptr, nullptr); + rc = mb_len ? MDBX_SUCCESS : (int)GetLastError(); } - if (likely(rc == MDBX_SUCCESS)) { - /* update env-geo to avoid influences */ - eASSERT(env, env->me_dbgeo.now == pgno2bytes(env, new_geo.now)); - env->me_dbgeo.lower = pgno2bytes(env, new_geo.lower); - eASSERT(env, env->me_dbgeo.upper == pgno2bytes(env, new_geo.upper)); - env->me_dbgeo.grow = pgno2bytes(env, pv2pages(new_geo.grow_pv)); - env->me_dbgeo.shrink = pgno2bytes(env, pv2pages(new_geo.shrink_pv)); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; + + char *const mb_pathname = osal_malloc(mb_len); + if (!mb_pathname) + return MDBX_ENOMEM; + if (mb_len != (size_t)WideCharToMultiByte( + CP_THREAD_ACP, flags, env->pathname.specified, -1, + mb_pathname, (int)mb_len, nullptr, nullptr)) { + rc = (int)GetLastError(); + osal_free(mb_pathname); + return rc; } + if (env->pathname_char || + InterlockedCompareExchangePointer((PVOID volatile *)&env->pathname_char, + mb_pathname, nullptr)) + osal_free(mb_pathname); } + *arg = env->pathname_char; +#else + *arg = env->pathname.specified; +#endif /* Windows */ + return MDBX_SUCCESS; +} -bailout: - if (should_unlock) - osal_txn_unlock(env); - return rc; +/*------------------------------------------------------------------------------ + * Legacy API */ + +#ifndef LIBMDBX_NO_EXPORTS_LEGACY_API + +LIBMDBX_API int mdbx_txn_begin(MDBX_env *env, MDBX_txn *parent, + MDBX_txn_flags_t flags, MDBX_txn **ret) { + return __inline_mdbx_txn_begin(env, parent, flags, ret); } -__cold static int alloc_page_buf(MDBX_env *env) { - return env->me_pbuf ? 
MDBX_SUCCESS - : osal_memalign_alloc(env->me_os_psize, - env->me_psize * (size_t)NUM_METAS, - &env->me_pbuf); +LIBMDBX_API int mdbx_txn_commit(MDBX_txn *txn) { + return __inline_mdbx_txn_commit(txn); } -/* Further setup required for opening an MDBX environment */ -__cold static int setup_dxb(MDBX_env *env, const int lck_rc, - const mdbx_mode_t mode_bits) { - MDBX_meta header; - eASSERT(env, !(env->me_flags & MDBX_ENV_ACTIVE)); - int rc = MDBX_RESULT_FALSE; - int err = read_header(env, &header, lck_rc, mode_bits); - if (unlikely(err != MDBX_SUCCESS)) { - if (lck_rc != /* lck exclusive */ MDBX_RESULT_TRUE || err != MDBX_ENODATA || - (env->me_flags & MDBX_RDONLY) != 0 || - /* recovery mode */ env->me_stuck_meta >= 0) - return err; +LIBMDBX_API __cold int mdbx_env_stat(const MDBX_env *env, MDBX_stat *stat, + size_t bytes) { + return __inline_mdbx_env_stat(env, stat, bytes); +} - DEBUG("%s", "create new database"); - rc = /* new database */ MDBX_RESULT_TRUE; +LIBMDBX_API __cold int mdbx_env_info(const MDBX_env *env, MDBX_envinfo *info, + size_t bytes) { + return __inline_mdbx_env_info(env, info, bytes); +} - if (!env->me_dbgeo.now) { - /* set defaults if not configured */ - err = mdbx_env_set_geometry(env, 0, -1, DEFAULT_MAPSIZE, -1, -1, -1); - if (unlikely(err != MDBX_SUCCESS)) - return err; - } +LIBMDBX_API int mdbx_dbi_flags(const MDBX_txn *txn, MDBX_dbi dbi, + unsigned *flags) { + return __inline_mdbx_dbi_flags(txn, dbi, flags); +} - err = alloc_page_buf(env); - if (unlikely(err != MDBX_SUCCESS)) - return err; +LIBMDBX_API __cold int mdbx_env_sync(MDBX_env *env) { + return __inline_mdbx_env_sync(env); +} - header = *init_metas(env, env->me_pbuf); - err = osal_pwrite(env->me_lazy_fd, env->me_pbuf, - env->me_psize * (size_t)NUM_METAS, 0); - if (unlikely(err != MDBX_SUCCESS)) - return err; +LIBMDBX_API __cold int mdbx_env_sync_poll(MDBX_env *env) { + return __inline_mdbx_env_sync_poll(env); +} - err = osal_ftruncate(env->me_lazy_fd, env->me_dxb_mmap.filesize = - 
env->me_dxb_mmap.current = - env->me_dbgeo.now); - if (unlikely(err != MDBX_SUCCESS)) - return err; +LIBMDBX_API __cold int mdbx_env_close(MDBX_env *env) { + return __inline_mdbx_env_close(env); +} -#ifndef NDEBUG /* just for checking */ - err = read_header(env, &header, lck_rc, mode_bits); - if (unlikely(err != MDBX_SUCCESS)) - return err; -#endif - } +LIBMDBX_API __cold int mdbx_env_set_mapsize(MDBX_env *env, size_t size) { + return __inline_mdbx_env_set_mapsize(env, size); +} - VERBOSE("header: root %" PRIaPGNO "/%" PRIaPGNO ", geo %" PRIaPGNO - "/%" PRIaPGNO "-%" PRIaPGNO "/%" PRIaPGNO " +%u -%u, txn_id %" PRIaTXN - ", %s", - header.mm_dbs[MAIN_DBI].md_root, header.mm_dbs[FREE_DBI].md_root, - header.mm_geo.lower, header.mm_geo.next, header.mm_geo.now, - header.mm_geo.upper, pv2pages(header.mm_geo.grow_pv), - pv2pages(header.mm_geo.shrink_pv), - unaligned_peek_u64(4, header.mm_txnid_a), durable_caption(&header)); +LIBMDBX_API __cold int mdbx_env_set_maxdbs(MDBX_env *env, MDBX_dbi dbs) { + return __inline_mdbx_env_set_maxdbs(env, dbs); +} - if (unlikely(header.mm_dbs[FREE_DBI].md_flags != MDBX_INTEGERKEY)) { - ERROR("unexpected/invalid db-flags 0x%u for GC/FreeDB", - header.mm_dbs[FREE_DBI].md_flags); - return MDBX_INCOMPATIBLE; - } - env->me_db_flags[FREE_DBI] = DB_VALID | MDBX_INTEGERKEY; - env->me_dbxs[FREE_DBI].md_cmp = cmp_int_align4; /* aligned MDBX_INTEGERKEY */ - env->me_dbxs[FREE_DBI].md_dcmp = cmp_lenfast; - env->me_dbxs[FREE_DBI].md_klen_max = env->me_dbxs[FREE_DBI].md_klen_min = 8; - env->me_dbxs[FREE_DBI].md_vlen_min = 4; - env->me_dbxs[FREE_DBI].md_vlen_max = - mdbx_env_get_maxvalsize_ex(env, MDBX_INTEGERKEY); +LIBMDBX_API __cold int mdbx_env_get_maxdbs(const MDBX_env *env, MDBX_dbi *dbs) { + return __inline_mdbx_env_get_maxdbs(env, dbs); +} - if (env->me_psize != header.mm_psize) - setup_pagesize(env, header.mm_psize); - const size_t used_bytes = pgno2bytes(env, header.mm_geo.next); - const size_t used_aligned2os_bytes = - ceil_powerof2(used_bytes, 
env->me_os_psize); - if ((env->me_flags & MDBX_RDONLY) /* readonly */ - || lck_rc != MDBX_RESULT_TRUE /* not exclusive */ - || /* recovery mode */ env->me_stuck_meta >= 0) { - /* use present params from db */ - const size_t pagesize = header.mm_psize; - err = mdbx_env_set_geometry( - env, header.mm_geo.lower * pagesize, header.mm_geo.now * pagesize, - header.mm_geo.upper * pagesize, - pv2pages(header.mm_geo.grow_pv) * pagesize, - pv2pages(header.mm_geo.shrink_pv) * pagesize, header.mm_psize); - if (unlikely(err != MDBX_SUCCESS)) { - ERROR("%s: err %d", "could not apply geometry from db", err); - return (err == MDBX_EINVAL) ? MDBX_INCOMPATIBLE : err; - } - } else if (env->me_dbgeo.now) { - /* silently growth to last used page */ - if (env->me_dbgeo.now < used_aligned2os_bytes) - env->me_dbgeo.now = used_aligned2os_bytes; - if (env->me_dbgeo.upper < used_aligned2os_bytes) - env->me_dbgeo.upper = used_aligned2os_bytes; +LIBMDBX_API __cold int mdbx_env_set_maxreaders(MDBX_env *env, + unsigned readers) { + return __inline_mdbx_env_set_maxreaders(env, readers); +} - /* apply preconfigured params, but only if substantial changes: - * - upper or lower limit changes - * - shrink threshold or growth step - * But ignore change just a 'now/current' size. 
*/ - if (bytes_align2os_bytes(env, env->me_dbgeo.upper) != - pgno2bytes(env, header.mm_geo.upper) || - bytes_align2os_bytes(env, env->me_dbgeo.lower) != - pgno2bytes(env, header.mm_geo.lower) || - bytes_align2os_bytes(env, env->me_dbgeo.shrink) != - pgno2bytes(env, pv2pages(header.mm_geo.shrink_pv)) || - bytes_align2os_bytes(env, env->me_dbgeo.grow) != - pgno2bytes(env, pv2pages(header.mm_geo.grow_pv))) { - - if (env->me_dbgeo.shrink && env->me_dbgeo.now > used_bytes) - /* pre-shrink if enabled */ - env->me_dbgeo.now = used_bytes + env->me_dbgeo.shrink - - used_bytes % env->me_dbgeo.shrink; +LIBMDBX_API __cold int mdbx_env_get_maxreaders(const MDBX_env *env, + unsigned *readers) { + return __inline_mdbx_env_get_maxreaders(env, readers); +} - err = mdbx_env_set_geometry(env, env->me_dbgeo.lower, env->me_dbgeo.now, - env->me_dbgeo.upper, env->me_dbgeo.grow, - env->me_dbgeo.shrink, header.mm_psize); - if (unlikely(err != MDBX_SUCCESS)) { - ERROR("%s: err %d", "could not apply preconfigured db-geometry", err); - return (err == MDBX_EINVAL) ? 
MDBX_INCOMPATIBLE : err; - } +LIBMDBX_API __cold int mdbx_env_set_syncbytes(MDBX_env *env, size_t threshold) { + return __inline_mdbx_env_set_syncbytes(env, threshold); +} - /* update meta fields */ - header.mm_geo.now = bytes2pgno(env, env->me_dbgeo.now); - header.mm_geo.lower = bytes2pgno(env, env->me_dbgeo.lower); - header.mm_geo.upper = bytes2pgno(env, env->me_dbgeo.upper); - header.mm_geo.grow_pv = pages2pv(bytes2pgno(env, env->me_dbgeo.grow)); - header.mm_geo.shrink_pv = pages2pv(bytes2pgno(env, env->me_dbgeo.shrink)); +LIBMDBX_API __cold int mdbx_env_get_syncbytes(const MDBX_env *env, + size_t *threshold) { + return __inline_mdbx_env_get_syncbytes(env, threshold); +} - VERBOSE("amended: root %" PRIaPGNO "/%" PRIaPGNO ", geo %" PRIaPGNO - "/%" PRIaPGNO "-%" PRIaPGNO "/%" PRIaPGNO - " +%u -%u, txn_id %" PRIaTXN ", %s", - header.mm_dbs[MAIN_DBI].md_root, header.mm_dbs[FREE_DBI].md_root, - header.mm_geo.lower, header.mm_geo.next, header.mm_geo.now, - header.mm_geo.upper, pv2pages(header.mm_geo.grow_pv), - pv2pages(header.mm_geo.shrink_pv), - unaligned_peek_u64(4, header.mm_txnid_a), - durable_caption(&header)); - } else { - /* fetch back 'now/current' size, since it was ignored during comparison - * and may differ. */ - env->me_dbgeo.now = pgno_align2os_bytes(env, header.mm_geo.now); - } - ENSURE(env, header.mm_geo.now >= header.mm_geo.next); - } else { - /* geo-params are not pre-configured by user, - * get current values from the meta. 
*/ - env->me_dbgeo.now = pgno2bytes(env, header.mm_geo.now); - env->me_dbgeo.lower = pgno2bytes(env, header.mm_geo.lower); - env->me_dbgeo.upper = pgno2bytes(env, header.mm_geo.upper); - env->me_dbgeo.grow = pgno2bytes(env, pv2pages(header.mm_geo.grow_pv)); - env->me_dbgeo.shrink = pgno2bytes(env, pv2pages(header.mm_geo.shrink_pv)); - } +LIBMDBX_API __cold int mdbx_env_set_syncperiod(MDBX_env *env, + unsigned seconds_16dot16) { + return __inline_mdbx_env_set_syncperiod(env, seconds_16dot16); +} - ENSURE(env, pgno_align2os_bytes(env, header.mm_geo.now) == env->me_dbgeo.now); - ENSURE(env, env->me_dbgeo.now >= used_bytes); - const uint64_t filesize_before = env->me_dxb_mmap.filesize; - if (unlikely(filesize_before != env->me_dbgeo.now)) { - if (lck_rc != /* lck exclusive */ MDBX_RESULT_TRUE) { - VERBOSE("filesize mismatch (expect %" PRIuPTR "b/%" PRIaPGNO - "p, have %" PRIu64 "b/%" PRIaPGNO "p), " - "assume other process working", - env->me_dbgeo.now, bytes2pgno(env, env->me_dbgeo.now), - filesize_before, bytes2pgno(env, (size_t)filesize_before)); - } else { - WARNING("filesize mismatch (expect %" PRIuSIZE "b/%" PRIaPGNO - "p, have %" PRIu64 "b/%" PRIaPGNO "p)", - env->me_dbgeo.now, bytes2pgno(env, env->me_dbgeo.now), - filesize_before, bytes2pgno(env, (size_t)filesize_before)); - if (filesize_before < used_bytes) { - ERROR("last-page beyond end-of-file (last %" PRIaPGNO - ", have %" PRIaPGNO ")", - header.mm_geo.next, bytes2pgno(env, (size_t)filesize_before)); - return MDBX_CORRUPTED; - } +LIBMDBX_API __cold int mdbx_env_get_syncperiod(const MDBX_env *env, + unsigned *seconds_16dot16) { + return __inline_mdbx_env_get_syncperiod(env, seconds_16dot16); +} - if (env->me_flags & MDBX_RDONLY) { - if (filesize_before & (env->me_os_psize - 1)) { - ERROR("%s", "filesize should be rounded-up to system page"); - return MDBX_WANNA_RECOVERY; - } - WARNING("%s", "ignore filesize mismatch in readonly-mode"); - } else { - VERBOSE("will resize datafile to %" PRIuSIZE " bytes, %" 
PRIaPGNO - " pages", - env->me_dbgeo.now, bytes2pgno(env, env->me_dbgeo.now)); - } - } - } +LIBMDBX_API __cold uint64_t mdbx_key_from_int64(const int64_t i64) { + return __inline_mdbx_key_from_int64(i64); +} - VERBOSE("current boot-id %" PRIx64 "-%" PRIx64 " (%savailable)", bootid.x, - bootid.y, (bootid.x | bootid.y) ? "" : "not-"); +LIBMDBX_API __cold uint32_t mdbx_key_from_int32(const int32_t i32) { + return __inline_mdbx_key_from_int32(i32); +} -#if MDBX_ENABLE_MADVISE - /* calculate readahead hint before mmap with zero redundant pages */ - const bool readahead = - !(env->me_flags & MDBX_NORDAHEAD) && - mdbx_is_readahead_reasonable(used_bytes, 0) == MDBX_RESULT_TRUE; -#endif /* MDBX_ENABLE_MADVISE */ +LIBMDBX_API __cold intptr_t mdbx_limits_pgsize_min(void) { + return __inline_mdbx_limits_pgsize_min(); +} - err = osal_mmap( - env->me_flags, &env->me_dxb_mmap, env->me_dbgeo.now, env->me_dbgeo.upper, - (lck_rc && env->me_stuck_meta < 0) ? MMAP_OPTION_TRUNCATE : 0); - if (unlikely(err != MDBX_SUCCESS)) - return err; +LIBMDBX_API __cold intptr_t mdbx_limits_pgsize_max(void) { + return __inline_mdbx_limits_pgsize_max(); +} -#if MDBX_ENABLE_MADVISE -#if defined(MADV_DONTDUMP) - err = madvise(env->me_map, env->me_dxb_mmap.limit, MADV_DONTDUMP) - ? ignore_enosys(errno) - : MDBX_SUCCESS; - if (unlikely(MDBX_IS_ERROR(err))) - return err; -#endif /* MADV_DONTDUMP */ -#if defined(MADV_DODUMP) - if (mdbx_static.flags & MDBX_DBG_DUMP) { - const size_t meta_length_aligned2os = pgno_align2os_bytes(env, NUM_METAS); - err = madvise(env->me_map, meta_length_aligned2os, MADV_DODUMP) - ? ignore_enosys(errno) - : MDBX_SUCCESS; - if (unlikely(MDBX_IS_ERROR(err))) - return err; - } -#endif /* MADV_DODUMP */ -#endif /* MDBX_ENABLE_MADVISE */ +#endif /* LIBMDBX_NO_EXPORTS_LEGACY_API */ +/// \copyright SPDX-License-Identifier: Apache-2.0 +/// \note Please refer to the COPYRIGHT file for explanations license change, +/// credits and acknowledgments. 
+/// \author Леонид Юрьев aka Leonid Yuriev \date 2015-2024 -#ifdef ENABLE_MEMCHECK - env->me_valgrind_handle = - VALGRIND_CREATE_BLOCK(env->me_map, env->me_dxb_mmap.limit, "mdbx"); -#endif /* ENABLE_MEMCHECK */ - eASSERT(env, used_bytes >= pgno2bytes(env, NUM_METAS) && - used_bytes <= env->me_dxb_mmap.limit); -#if defined(ENABLE_MEMCHECK) || defined(__SANITIZE_ADDRESS__) - if (env->me_dxb_mmap.filesize > used_bytes && - env->me_dxb_mmap.filesize < env->me_dxb_mmap.limit) { - VALGRIND_MAKE_MEM_NOACCESS(ptr_disp(env->me_map, used_bytes), - env->me_dxb_mmap.filesize - used_bytes); - MDBX_ASAN_POISON_MEMORY_REGION(ptr_disp(env->me_map, used_bytes), - env->me_dxb_mmap.filesize - used_bytes); - } - env->me_poison_edge = - bytes2pgno(env, (env->me_dxb_mmap.filesize < env->me_dxb_mmap.limit) - ? env->me_dxb_mmap.filesize - : env->me_dxb_mmap.limit); -#endif /* ENABLE_MEMCHECK || __SANITIZE_ADDRESS__ */ +typedef struct compacting_context { + MDBX_env *env; + MDBX_txn *txn; + pgno_t first_unallocated; + osal_condpair_t condpair; + volatile unsigned head; + volatile unsigned tail; + uint8_t *write_buf[2]; + size_t write_len[2]; + /* Error code. Never cleared if set. Both threads can set nonzero + * to fail the copy. Not mutex-protected, expects atomic int. 
*/ + volatile int error; + mdbx_filehandle_t fd; +} ctx_t; - meta_troika_t troika = meta_tap(env); -#if MDBX_DEBUG - meta_troika_dump(env, &troika); -#endif - //-------------------------------- validate/rollback head & steady meta-pages - if (unlikely(env->me_stuck_meta >= 0)) { - /* recovery mode */ - MDBX_meta clone; - MDBX_meta const *const target = METAPAGE(env, env->me_stuck_meta); - err = validate_meta_copy(env, target, &clone); - if (unlikely(err != MDBX_SUCCESS)) { - ERROR("target meta[%u] is corrupted", - bytes2pgno(env, ptr_dist(data_page(target), env->me_map))); - meta_troika_dump(env, &troika); - return MDBX_CORRUPTED; - } - } else /* not recovery mode */ - while (1) { - const unsigned meta_clash_mask = meta_eq_mask(&troika); - if (unlikely(meta_clash_mask)) { - ERROR("meta-pages are clashed: mask 0x%d", meta_clash_mask); - meta_troika_dump(env, &troika); - return MDBX_CORRUPTED; - } +__cold static int compacting_walk_tree(ctx_t *ctx, tree_t *tree); - if (lck_rc != /* lck exclusive */ MDBX_RESULT_TRUE) { - /* non-exclusive mode, - * meta-pages should be validated by a first process opened the DB */ - if (troika.recent == troika.prefer_steady) - break; +/* Dedicated writer thread for compacting copy. */ +__cold static THREAD_RESULT THREAD_CALL compacting_write_thread(void *arg) { + ctx_t *const ctx = arg; - if (!env->me_lck_mmap.lck) { - /* LY: without-lck (read-only) mode, so it is impossible that other - * process made weak checkpoint. 
*/ - ERROR("%s", "without-lck, unable recovery/rollback"); - meta_troika_dump(env, &troika); - return MDBX_WANNA_RECOVERY; - } +#if defined(EPIPE) && !(defined(_WIN32) || defined(_WIN64)) + sigset_t sigset; + sigemptyset(&sigset); + sigaddset(&sigset, SIGPIPE); + ctx->error = pthread_sigmask(SIG_BLOCK, &sigset, nullptr); +#endif /* EPIPE */ - /* LY: assume just have a collision with other running process, - * or someone make a weak checkpoint */ - VERBOSE("%s", "assume collision or online weak checkpoint"); - break; + osal_condpair_lock(&ctx->condpair); + while (!ctx->error) { + while (ctx->tail == ctx->head && !ctx->error) { + int err = osal_condpair_wait(&ctx->condpair, true); + if (err != MDBX_SUCCESS) { + ctx->error = err; + goto bailout; } - eASSERT(env, lck_rc == MDBX_RESULT_TRUE); - /* exclusive mode */ - - const meta_ptr_t recent = meta_recent(env, &troika); - const meta_ptr_t prefer_steady = meta_prefer_steady(env, &troika); - MDBX_meta clone; - if (prefer_steady.is_steady) { - err = validate_meta_copy(env, prefer_steady.ptr_c, &clone); - if (unlikely(err != MDBX_SUCCESS)) { - ERROR("meta[%u] with %s txnid %" PRIaTXN " is corrupted, %s needed", - bytes2pgno(env, ptr_dist(prefer_steady.ptr_c, env->me_map)), - "steady", prefer_steady.txnid, "manual recovery"); - meta_troika_dump(env, &troika); - return MDBX_CORRUPTED; + } + const unsigned toggle = ctx->tail & 1; + size_t wsize = ctx->write_len[toggle]; + if (wsize == 0) { + ctx->tail += 1; + break /* EOF */; + } + ctx->write_len[toggle] = 0; + uint8_t *ptr = ctx->write_buf[toggle]; + if (!ctx->error) { + int err = osal_write(ctx->fd, ptr, wsize); + if (err != MDBX_SUCCESS) { +#if defined(EPIPE) && !(defined(_WIN32) || defined(_WIN64)) + if (err == EPIPE) { + /* Collect the pending SIGPIPE, + * otherwise at least OS X gives it to the process on thread-exit. 
*/ + int unused; + sigwait(&sigset, &unused); } - if (prefer_steady.ptr_c == recent.ptr_c) - break; +#endif /* EPIPE */ + ctx->error = err; + goto bailout; } + } + ctx->tail += 1; + osal_condpair_signal(&ctx->condpair, false); + } +bailout: + osal_condpair_unlock(&ctx->condpair); + return (THREAD_RESULT)0; +} - const pgno_t pgno = bytes2pgno(env, ptr_dist(recent.ptr_c, env->me_map)); - const bool last_valid = - validate_meta_copy(env, recent.ptr_c, &clone) == MDBX_SUCCESS; - eASSERT(env, - !prefer_steady.is_steady || recent.txnid != prefer_steady.txnid); - if (unlikely(!last_valid)) { - if (unlikely(!prefer_steady.is_steady)) { - ERROR("%s for open or automatic rollback, %s", - "there are no suitable meta-pages", - "manual recovery is required"); - meta_troika_dump(env, &troika); - return MDBX_CORRUPTED; - } - WARNING("meta[%u] with last txnid %" PRIaTXN - " is corrupted, rollback needed", - pgno, recent.txnid); - meta_troika_dump(env, &troika); - goto purge_meta_head; - } +/* Give buffer and/or MDBX_EOF to writer thread, await unused buffer. 
*/ +__cold static int compacting_toggle_write_buffers(ctx_t *ctx) { + osal_condpair_lock(&ctx->condpair); + eASSERT(ctx->env, ctx->head - ctx->tail < 2 || ctx->error); + ctx->head += 1; + osal_condpair_signal(&ctx->condpair, true); + while (!ctx->error && ctx->head - ctx->tail == 2 /* both buffers in use */) { + int err = osal_condpair_wait(&ctx->condpair, false); + if (err != MDBX_SUCCESS) + ctx->error = err; + } + osal_condpair_unlock(&ctx->condpair); + return ctx->error; +} - if (meta_bootid_match(recent.ptr_c)) { - if (env->me_flags & MDBX_RDONLY) { - ERROR("%s, but boot-id(%016" PRIx64 "-%016" PRIx64 ") is MATCH: " - "rollback NOT needed, steady-sync NEEDED%s", - "opening after an unclean shutdown", bootid.x, bootid.y, - ", but unable in read-only mode"); - meta_troika_dump(env, &troika); - return MDBX_WANNA_RECOVERY; +static int compacting_put_bytes(ctx_t *ctx, const void *src, size_t bytes, + pgno_t pgno, pgno_t npages) { + assert(pgno == 0 || bytes > PAGEHDRSZ); + while (bytes > 0) { + const size_t side = ctx->head & 1; + const size_t left = MDBX_ENVCOPY_WRITEBUF - ctx->write_len[side]; + if (left < (pgno ? PAGEHDRSZ : 1)) { + int err = compacting_toggle_write_buffers(ctx); + if (unlikely(err != MDBX_SUCCESS)) + return err; + continue; + } + const size_t chunk = (bytes < left) ? 
bytes : left; + void *const dst = ctx->write_buf[side] + ctx->write_len[side]; + if (src) { + memcpy(dst, src, chunk); + if (pgno) { + assert(chunk > PAGEHDRSZ); + page_t *mp = dst; + mp->pgno = pgno; + if (mp->txnid == 0) + mp->txnid = ctx->txn->txnid; + if (mp->flags == P_LARGE) { + assert(bytes <= pgno2bytes(ctx->env, npages)); + mp->pages = npages; } - WARNING("%s, but boot-id(%016" PRIx64 "-%016" PRIx64 ") is MATCH: " - "rollback NOT needed, steady-sync NEEDED%s", - "opening after an unclean shutdown", bootid.x, bootid.y, ""); - header = clone; - env->me_lck->mti_unsynced_pages.weak = header.mm_geo.next; - if (!env->me_lck->mti_eoos_timestamp.weak) - env->me_lck->mti_eoos_timestamp.weak = osal_monotime(); - break; - } - if (unlikely(!prefer_steady.is_steady)) { - ERROR("%s, but %s for automatic rollback: %s", - "opening after an unclean shutdown", - "there are no suitable meta-pages", - "manual recovery is required"); - meta_troika_dump(env, &troika); - return MDBX_CORRUPTED; - } - if (env->me_flags & MDBX_RDONLY) { - ERROR("%s and rollback needed: (from head %" PRIaTXN - " to steady %" PRIaTXN ")%s", - "opening after an unclean shutdown", recent.txnid, - prefer_steady.txnid, ", but unable in read-only mode"); - meta_troika_dump(env, &troika); - return MDBX_WANNA_RECOVERY; + pgno = 0; } + src = ptr_disp(src, chunk); + } else + memset(dst, 0, chunk); + bytes -= chunk; + ctx->write_len[side] += chunk; + } + return MDBX_SUCCESS; +} - purge_meta_head: - NOTICE("%s and doing automatic rollback: " - "purge%s meta[%u] with%s txnid %" PRIaTXN, - "opening after an unclean shutdown", last_valid ? "" : " invalid", - pgno, last_valid ? " weak" : "", recent.txnid); - meta_troika_dump(env, &troika); - ENSURE(env, prefer_steady.is_steady); - err = override_meta(env, pgno, 0, - last_valid ? 
recent.ptr_c : prefer_steady.ptr_c); - if (err) { - ERROR("rollback: overwrite meta[%u] with txnid %" PRIaTXN ", error %d", - pgno, recent.txnid, err); - return err; - } - troika = meta_tap(env); - ENSURE(env, 0 == meta_txnid(recent.ptr_v)); - ENSURE(env, 0 == meta_eq_mask(&troika)); - } +static int compacting_put_page(ctx_t *ctx, const page_t *mp, + const size_t head_bytes, const size_t tail_bytes, + const pgno_t npages) { + if (tail_bytes) { + assert(head_bytes + tail_bytes <= ctx->env->ps); + assert(npages == 1 && + (page_type(mp) == P_BRANCH || page_type(mp) == P_LEAF)); + } else { + assert(head_bytes <= pgno2bytes(ctx->env, npages)); + assert((npages == 1 && page_type(mp) == (P_LEAF | P_DUPFIX)) || + page_type(mp) == P_LARGE); + } - if (lck_rc == /* lck exclusive */ MDBX_RESULT_TRUE) { - //-------------------------------------------------- shrink DB & update geo - /* re-check size after mmap */ - if ((env->me_dxb_mmap.current & (env->me_os_psize - 1)) != 0 || - env->me_dxb_mmap.current < used_bytes) { - ERROR("unacceptable/unexpected datafile size %" PRIuPTR, - env->me_dxb_mmap.current); - return MDBX_PROBLEM; - } - if (env->me_dxb_mmap.current != env->me_dbgeo.now) { - header.mm_geo.now = bytes2pgno(env, env->me_dxb_mmap.current); - NOTICE("need update meta-geo to filesize %" PRIuPTR " bytes, %" PRIaPGNO - " pages", - env->me_dxb_mmap.current, header.mm_geo.now); - } + const pgno_t pgno = ctx->first_unallocated; + ctx->first_unallocated += npages; + int err = compacting_put_bytes(ctx, mp, head_bytes, pgno, npages); + if (unlikely(err != MDBX_SUCCESS)) + return err; + err = compacting_put_bytes( + ctx, nullptr, pgno2bytes(ctx->env, npages) - (head_bytes + tail_bytes), 0, + 0); + if (unlikely(err != MDBX_SUCCESS)) + return err; + return compacting_put_bytes(ctx, ptr_disp(mp, ctx->env->ps - tail_bytes), + tail_bytes, 0, 0); +} - const meta_ptr_t recent = meta_recent(env, &troika); - if (/* не учитываем различия в geo.next */ - header.mm_geo.grow_pv != 
recent.ptr_c->mm_geo.grow_pv || - header.mm_geo.shrink_pv != recent.ptr_c->mm_geo.shrink_pv || - header.mm_geo.lower != recent.ptr_c->mm_geo.lower || - header.mm_geo.upper != recent.ptr_c->mm_geo.upper || - header.mm_geo.now != recent.ptr_c->mm_geo.now) { - if ((env->me_flags & MDBX_RDONLY) != 0 || - /* recovery mode */ env->me_stuck_meta >= 0) { - WARNING("skipped update meta.geo in %s mode: from l%" PRIaPGNO - "-n%" PRIaPGNO "-u%" PRIaPGNO "/s%u-g%u, to l%" PRIaPGNO - "-n%" PRIaPGNO "-u%" PRIaPGNO "/s%u-g%u", - (env->me_stuck_meta < 0) ? "read-only" : "recovery", - recent.ptr_c->mm_geo.lower, recent.ptr_c->mm_geo.now, - recent.ptr_c->mm_geo.upper, - pv2pages(recent.ptr_c->mm_geo.shrink_pv), - pv2pages(recent.ptr_c->mm_geo.grow_pv), header.mm_geo.lower, - header.mm_geo.now, header.mm_geo.upper, - pv2pages(header.mm_geo.shrink_pv), - pv2pages(header.mm_geo.grow_pv)); - } else { - const txnid_t next_txnid = safe64_txnid_next(recent.txnid); - if (unlikely(next_txnid > MAX_TXNID)) { - ERROR("txnid overflow, raise %d", MDBX_TXN_FULL); - return MDBX_TXN_FULL; - } - NOTICE("updating meta.geo: " - "from l%" PRIaPGNO "-n%" PRIaPGNO "-u%" PRIaPGNO - "/s%u-g%u (txn#%" PRIaTXN "), " - "to l%" PRIaPGNO "-n%" PRIaPGNO "-u%" PRIaPGNO - "/s%u-g%u (txn#%" PRIaTXN ")", - recent.ptr_c->mm_geo.lower, recent.ptr_c->mm_geo.now, - recent.ptr_c->mm_geo.upper, - pv2pages(recent.ptr_c->mm_geo.shrink_pv), - pv2pages(recent.ptr_c->mm_geo.grow_pv), recent.txnid, - header.mm_geo.lower, header.mm_geo.now, header.mm_geo.upper, - pv2pages(header.mm_geo.shrink_pv), - pv2pages(header.mm_geo.grow_pv), next_txnid); +__cold static int compacting_walk(ctx_t *ctx, MDBX_cursor *mc, + pgno_t *const parent_pgno, + txnid_t parent_txnid) { + mc->top = 0; + mc->ki[0] = 0; + int rc = page_get(mc, *parent_pgno, &mc->pg[0], parent_txnid); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; - ENSURE(env, header.unsafe_txnid == recent.txnid); - meta_set_txnid(env, &header, next_txnid); - err = sync_locked(env, 
env->me_flags | MDBX_SHRINK_ALLOWED, &header, - &troika); - if (err) { - ERROR("error %d, while updating meta.geo: " - "from l%" PRIaPGNO "-n%" PRIaPGNO "-u%" PRIaPGNO - "/s%u-g%u (txn#%" PRIaTXN "), " - "to l%" PRIaPGNO "-n%" PRIaPGNO "-u%" PRIaPGNO - "/s%u-g%u (txn#%" PRIaTXN ")", - err, recent.ptr_c->mm_geo.lower, recent.ptr_c->mm_geo.now, - recent.ptr_c->mm_geo.upper, - pv2pages(recent.ptr_c->mm_geo.shrink_pv), - pv2pages(recent.ptr_c->mm_geo.grow_pv), recent.txnid, - header.mm_geo.lower, header.mm_geo.now, header.mm_geo.upper, - pv2pages(header.mm_geo.shrink_pv), - pv2pages(header.mm_geo.grow_pv), header.unsafe_txnid); - return err; - } - } - } + rc = tree_search_finalize(mc, nullptr, Z_FIRST); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; - atomic_store32(&env->me_lck->mti_discarded_tail, - bytes2pgno(env, used_aligned2os_bytes), mo_Relaxed); + /* Make cursor pages writable */ + const intptr_t deep_limit = mc->top + 1; + void *const buf = osal_malloc(pgno2bytes(ctx->env, deep_limit + 1)); + if (buf == nullptr) + return MDBX_ENOMEM; - if ((env->me_flags & MDBX_RDONLY) == 0 && env->me_stuck_meta < 0 && - (mdbx_static.flags & MDBX_DBG_DONT_UPGRADE) == 0) { - for (int n = 0; n < NUM_METAS; ++n) { - MDBX_meta *const meta = METAPAGE(env, n); - if (unlikely(unaligned_peek_u64(4, &meta->mm_magic_and_version) != - MDBX_DATA_MAGIC)) { - const txnid_t txnid = constmeta_txnid(meta); - NOTICE("%s %s" - "meta[%u], txnid %" PRIaTXN, - "updating db-format signature for", - META_IS_STEADY(meta) ? "stead-" : "weak-", n, txnid); - err = override_meta(env, n, txnid, meta); - if (unlikely(err != MDBX_SUCCESS) && - /* Just ignore the MDBX_PROBLEM error, since here it is - * returned only in case of the attempt to upgrade an obsolete - * meta-page that is invalid for current state of a DB, - * e.g. 
after shrinking DB file */ - err != MDBX_PROBLEM) { - ERROR("%s meta[%u], txnid %" PRIaTXN ", error %d", - "updating db-format signature for", n, txnid, err); - return err; + void *ptr = buf; + for (intptr_t i = 0; i <= mc->top; i++) { + page_copy(ptr, mc->pg[i], ctx->env->ps); + mc->pg[i] = ptr; + ptr = ptr_disp(ptr, ctx->env->ps); + } + /* This is writable space for a leaf page. Usually not needed. */ + page_t *const leaf = ptr; + + while (mc->top >= 0) { + page_t *mp = mc->pg[mc->top]; + const size_t nkeys = page_numkeys(mp); + if (is_leaf(mp)) { + if (!(mc->flags & + z_inner) /* may have nested N_SUBDATA or N_BIGDATA nodes */) { + for (size_t i = 0; i < nkeys; i++) { + node_t *node = page_node(mp, i); + if (node_flags(node) == N_BIGDATA) { + /* Need writable leaf */ + if (mp != leaf) { + mc->pg[mc->top] = leaf; + page_copy(leaf, mp, ctx->env->ps); + mp = leaf; + node = page_node(mp, i); + } + + const pgr_t lp = + page_get_large(mc, node_largedata_pgno(node), mp->txnid); + if (unlikely((rc = lp.err) != MDBX_SUCCESS)) + goto bailout; + const size_t datasize = node_ds(node); + const pgno_t npages = largechunk_npages(ctx->env, datasize); + poke_pgno(node_data(node), ctx->first_unallocated); + rc = compacting_put_page(ctx, lp.page, PAGEHDRSZ + datasize, 0, + npages); + if (unlikely(rc != MDBX_SUCCESS)) + goto bailout; + } else if (node_flags(node) & N_SUBDATA) { + if (!MDBX_DISABLE_VALIDATION && + unlikely(node_ds(node) != sizeof(tree_t))) { + ERROR("%s/%d: %s %u", "MDBX_CORRUPTED", MDBX_CORRUPTED, + "invalid dupsort sub-tree node size", + (unsigned)node_ds(node)); + rc = MDBX_CORRUPTED; + goto bailout; + } + + /* Need writable leaf */ + if (mp != leaf) { + mc->pg[mc->top] = leaf; + page_copy(leaf, mp, ctx->env->ps); + mp = leaf; + node = page_node(mp, i); + } + + tree_t *nested = nullptr; + if (node_flags(node) & N_DUPDATA) { + rc = cursor_dupsort_setup(mc, node, mp); + if (likely(rc == MDBX_SUCCESS)) { + nested = &mc->subcur->nested_tree; + rc = 
compacting_walk(ctx, &mc->subcur->cursor, &nested->root, + mp->txnid); + } + } else { + cASSERT(mc, (mc->flags & z_inner) == 0 && mc->subcur == 0); + cursor_couple_t *couple = + container_of(mc, cursor_couple_t, outer); + nested = &couple->inner.nested_tree; + memcpy(nested, node_data(node), sizeof(tree_t)); + rc = compacting_walk_tree(ctx, nested); + } + if (unlikely(rc != MDBX_SUCCESS)) + goto bailout; + memcpy(node_data(node), nested, sizeof(tree_t)); } - troika = meta_tap(env); } } + } else { + mc->ki[mc->top]++; + if (mc->ki[mc->top] < nkeys) { + for (;;) { + const node_t *node = page_node(mp, mc->ki[mc->top]); + rc = page_get(mc, node_pgno(node), &mp, mp->txnid); + if (unlikely(rc != MDBX_SUCCESS)) + goto bailout; + mc->top += 1; + if (unlikely(mc->top >= deep_limit)) { + rc = MDBX_CURSOR_FULL; + goto bailout; + } + mc->ki[mc->top] = 0; + if (!is_branch(mp)) { + mc->pg[mc->top] = mp; + break; + } + /* Whenever we advance to a sibling branch page, + * we must proceed all the way down to its first leaf. */ + page_copy(mc->pg[mc->top], mp, ctx->env->ps); + } + continue; + } } - } /* lck exclusive, lck_rc == MDBX_RESULT_TRUE */ - //---------------------------------------------------- setup madvise/readahead -#if MDBX_ENABLE_MADVISE - if (used_aligned2os_bytes < env->me_dxb_mmap.current) { -#if defined(MADV_REMOVE) - if (lck_rc && (env->me_flags & MDBX_WRITEMAP) != 0 && - /* not recovery mode */ env->me_stuck_meta < 0) { - NOTICE("open-MADV_%s %u..%u", "REMOVE (deallocate file space)", - env->me_lck->mti_discarded_tail.weak, - bytes2pgno(env, env->me_dxb_mmap.current)); - err = - madvise(ptr_disp(env->me_map, used_aligned2os_bytes), - env->me_dxb_mmap.current - used_aligned2os_bytes, MADV_REMOVE) - ? 
ignore_enosys(errno) - : MDBX_SUCCESS; - if (unlikely(MDBX_IS_ERROR(err))) - return err; + const pgno_t pgno = ctx->first_unallocated; + if (likely(!is_dupfix_leaf(mp))) { + rc = compacting_put_page(ctx, mp, PAGEHDRSZ + mp->lower, + ctx->env->ps - (PAGEHDRSZ + mp->upper), 1); + } else { + rc = compacting_put_page( + ctx, mp, PAGEHDRSZ + page_numkeys(mp) * mp->dupfix_ksize, 0, 1); } -#endif /* MADV_REMOVE */ -#if defined(MADV_DONTNEED) - NOTICE("open-MADV_%s %u..%u", "DONTNEED", - env->me_lck->mti_discarded_tail.weak, - bytes2pgno(env, env->me_dxb_mmap.current)); - err = - madvise(ptr_disp(env->me_map, used_aligned2os_bytes), - env->me_dxb_mmap.current - used_aligned2os_bytes, MADV_DONTNEED) - ? ignore_enosys(errno) - : MDBX_SUCCESS; - if (unlikely(MDBX_IS_ERROR(err))) - return err; -#elif defined(POSIX_MADV_DONTNEED) - err = ignore_enosys(posix_madvise( - ptr_disp(env->me_map, used_aligned2os_bytes), - env->me_dxb_mmap.current - used_aligned2os_bytes, POSIX_MADV_DONTNEED)); - if (unlikely(MDBX_IS_ERROR(err))) - return err; -#elif defined(POSIX_FADV_DONTNEED) - err = ignore_enosys(posix_fadvise( - env->me_lazy_fd, used_aligned2os_bytes, - env->me_dxb_mmap.current - used_aligned2os_bytes, POSIX_FADV_DONTNEED)); - if (unlikely(MDBX_IS_ERROR(err))) - return err; -#endif /* MADV_DONTNEED */ - } + if (unlikely(rc != MDBX_SUCCESS)) + goto bailout; - err = set_readahead(env, bytes2pgno(env, used_bytes), readahead, true); - if (unlikely(err != MDBX_SUCCESS)) - return err; -#endif /* MDBX_ENABLE_MADVISE */ + if (mc->top) { + /* Update parent if there is one */ + node_set_pgno(page_node(mc->pg[mc->top - 1], mc->ki[mc->top - 1]), pgno); + cursor_pop(mc); + } else { + /* Otherwise we're done */ + *parent_pgno = pgno; + break; + } + } +bailout: + osal_free(buf); return rc; } -/******************************************************************************/ +__cold static int compacting_walk_tree(ctx_t *ctx, tree_t *tree) { + if (unlikely(tree->root == P_INVALID)) + return 
MDBX_SUCCESS; /* empty db */ -__cold static int setup_lck_locked(MDBX_env *env) { - int err = rthc_register(env); - if (unlikely(err != MDBX_SUCCESS)) - return err; + cursor_couple_t couple; + memset(&couple, 0, sizeof(couple)); + couple.inner.cursor.signature = ~cur_signature_live; + kvx_t kvx = {.clc = {.k = {.lmin = INT_MAX}, .v = {.lmin = INT_MAX}}}; + int rc = cursor_init4walk(&couple, ctx->txn, tree, &kvx); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; - int lck_seize_rc = osal_lck_seize(env); - if (unlikely(MDBX_IS_ERROR(lck_seize_rc))) - return lck_seize_rc; + couple.outer.checking |= z_ignord | z_pagecheck; + couple.inner.cursor.checking |= z_ignord | z_pagecheck; + if (!tree->mod_txnid) + tree->mod_txnid = ctx->txn->txnid; + return compacting_walk(ctx, &couple.outer, &tree->root, tree->mod_txnid); +} - if (env->me_lfd == INVALID_HANDLE_VALUE) { - env->me_lck = lckless_stub(env); - env->me_maxreaders = UINT_MAX; - DEBUG("lck-setup:%s%s%s", " lck-less", - (env->me_flags & MDBX_RDONLY) ? " readonly" : "", - (lck_seize_rc == MDBX_RESULT_TRUE) ? " exclusive" : " cooperative"); - return lck_seize_rc; - } +__cold static void compacting_fixup_meta(MDBX_env *env, meta_t *meta) { + eASSERT(env, meta->trees.gc.mod_txnid || meta->trees.gc.root == P_INVALID); + eASSERT(env, + meta->trees.main.mod_txnid || meta->trees.main.root == P_INVALID); - DEBUG("lck-setup:%s%s%s", " with-lck", - (env->me_flags & MDBX_RDONLY) ? " readonly" : "", - (lck_seize_rc == MDBX_RESULT_TRUE) ? 
" exclusive" : " cooperative"); - - MDBX_env *inprocess_neighbor = nullptr; - err = rthc_uniq_check(&env->me_lck_mmap, &inprocess_neighbor); - if (unlikely(MDBX_IS_ERROR(err))) - return err; - if (inprocess_neighbor) { - if ((mdbx_static.flags & MDBX_DBG_LEGACY_MULTIOPEN) == 0 || - (inprocess_neighbor->me_flags & MDBX_EXCLUSIVE) != 0) - return MDBX_BUSY; - if (lck_seize_rc == MDBX_RESULT_TRUE) { - err = osal_lck_downgrade(env); - if (unlikely(err != MDBX_SUCCESS)) - return err; - lck_seize_rc = MDBX_RESULT_FALSE; + /* Calculate filesize taking in account shrink/growing thresholds */ + if (meta->geometry.first_unallocated != meta->geometry.now) { + meta->geometry.now = meta->geometry.first_unallocated; + const size_t aligner = + pv2pages(meta->geometry.grow_pv ? meta->geometry.grow_pv + : meta->geometry.shrink_pv); + if (aligner) { + const pgno_t aligned = pgno_align2os_pgno( + env, meta->geometry.first_unallocated + aligner - + meta->geometry.first_unallocated % aligner); + meta->geometry.now = aligned; } } - uint64_t size = 0; - err = osal_filesize(env->me_lfd, &size); - if (unlikely(err != MDBX_SUCCESS)) - return err; + if (meta->geometry.now < meta->geometry.lower) + meta->geometry.now = meta->geometry.lower; + if (meta->geometry.now > meta->geometry.upper) + meta->geometry.now = meta->geometry.upper; - if (lck_seize_rc == MDBX_RESULT_TRUE) { - size = ceil_powerof2(env->me_maxreaders * sizeof(MDBX_reader) + - sizeof(MDBX_lockinfo), - env->me_os_psize); - jitter4testing(false); - } else { - if (env->me_flags & MDBX_EXCLUSIVE) - return MDBX_BUSY; - if (size > INT_MAX || (size & (env->me_os_psize - 1)) != 0 || - size < env->me_os_psize) { - ERROR("lck-file has invalid size %" PRIu64 " bytes", size); - return MDBX_PROBLEM; - } - } + /* Update signature */ + assert(meta->geometry.now >= meta->geometry.first_unallocated); + meta_sign_as_steady(meta); +} - const size_t maxreaders = - ((size_t)size - sizeof(MDBX_lockinfo)) / sizeof(MDBX_reader); - if (maxreaders < 4) { 
- ERROR("lck-size too small (up to %" PRIuPTR " readers)", maxreaders); - return MDBX_PROBLEM; +/* Make resizable */ +__cold static void meta_make_sizeable(meta_t *meta) { + meta->geometry.lower = MIN_PAGENO; + if (meta->geometry.grow_pv == 0) { + const pgno_t step = 1 + (meta->geometry.upper - meta->geometry.lower) / 42; + meta->geometry.grow_pv = pages2pv(step); + } + if (meta->geometry.shrink_pv == 0) { + const pgno_t step = pv2pages(meta->geometry.grow_pv) << 1; + meta->geometry.shrink_pv = pages2pv(step); } - env->me_maxreaders = (maxreaders <= MDBX_READERS_LIMIT) - ? (unsigned)maxreaders - : (unsigned)MDBX_READERS_LIMIT; +} - err = osal_mmap((env->me_flags & MDBX_EXCLUSIVE) | MDBX_WRITEMAP, - &env->me_lck_mmap, (size_t)size, (size_t)size, - lck_seize_rc ? MMAP_OPTION_TRUNCATE | MMAP_OPTION_SEMAPHORE - : MMAP_OPTION_SEMAPHORE); - if (unlikely(err != MDBX_SUCCESS)) - return err; +__cold static int copy_with_compacting(MDBX_env *env, MDBX_txn *read_txn, + mdbx_filehandle_t fd, uint8_t *buffer, + const bool dest_is_pipe, + const MDBX_copy_flags_t flags) { + const size_t meta_bytes = pgno2bytes(env, NUM_METAS); + uint8_t *const data_buffer = + buffer + ceil_powerof2(meta_bytes, globals.sys_pagesize); + meta_t *const meta = meta_init_triplet(env, buffer); + meta_set_txnid(env, meta, read_txn->txnid); -#if MDBX_ENABLE_MADVISE -#ifdef MADV_DODUMP - err = madvise(env->me_lck_mmap.lck, size, MADV_DODUMP) ? ignore_enosys(errno) - : MDBX_SUCCESS; - if (unlikely(MDBX_IS_ERROR(err))) - return err; -#endif /* MADV_DODUMP */ + if (flags & MDBX_CP_FORCE_DYNAMIC_SIZE) + meta_make_sizeable(meta); -#ifdef MADV_WILLNEED - err = madvise(env->me_lck_mmap.lck, size, MADV_WILLNEED) - ? 
ignore_enosys(errno) - : MDBX_SUCCESS; - if (unlikely(MDBX_IS_ERROR(err))) - return err; -#elif defined(POSIX_MADV_WILLNEED) - err = ignore_enosys( - posix_madvise(env->me_lck_mmap.lck, size, POSIX_MADV_WILLNEED)); - if (unlikely(MDBX_IS_ERROR(err))) - return err; -#endif /* MADV_WILLNEED */ -#endif /* MDBX_ENABLE_MADVISE */ + /* copy canary sequences if present */ + if (read_txn->canary.v) { + meta->canary = read_txn->canary; + meta->canary.v = constmeta_txnid(meta); + } - struct MDBX_lockinfo *lck = env->me_lck_mmap.lck; - if (lck_seize_rc == MDBX_RESULT_TRUE) { - /* If we succeed got exclusive lock, then nobody is using the lock region - * and we should initialize it. */ - memset(lck, 0, (size_t)size); - jitter4testing(false); - lck->mti_magic_and_version = MDBX_LOCK_MAGIC; - lck->mti_os_and_format = MDBX_LOCK_FORMAT; -#if MDBX_ENABLE_PGOP_STAT - lck->mti_pgop_stat.wops.weak = 1; -#endif /* MDBX_ENABLE_PGOP_STAT */ - err = osal_msync(&env->me_lck_mmap, 0, (size_t)size, - MDBX_SYNC_DATA | MDBX_SYNC_SIZE); - if (unlikely(err != MDBX_SUCCESS)) { - ERROR("initial-%s for lck-file failed, err %d", "msync/fsync", err); - eASSERT(env, MDBX_IS_ERROR(err)); - return err; + if (read_txn->dbs[MAIN_DBI].root == P_INVALID) { + /* When the DB is empty, handle it specially to + * fix any breakage like page leaks from ITS#8174. */ + meta->trees.main.flags = read_txn->dbs[MAIN_DBI].flags; + compacting_fixup_meta(env, meta); + if (dest_is_pipe) { + int rc = osal_write(fd, buffer, meta_bytes); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; } } else { - if (lck->mti_magic_and_version != MDBX_LOCK_MAGIC) { - const bool invalid = (lck->mti_magic_and_version >> 8) != MDBX_MAGIC; - ERROR("lock region has %s", - invalid - ? "invalid magic" - : "incompatible version (only applications with nearly or the " - "same versions of libmdbx can share the same database)"); - return invalid ? 
MDBX_INVALID : MDBX_VERSION_MISMATCH; - } - if (lck->mti_os_and_format != MDBX_LOCK_FORMAT) { - ERROR("lock region has os/format signature 0x%" PRIx32 - ", expected 0x%" PRIx32, - lck->mti_os_and_format, MDBX_LOCK_FORMAT); - return MDBX_VERSION_MISMATCH; + /* Count free pages + GC pages. */ + cursor_couple_t couple; + int rc = cursor_init(&couple.outer, read_txn, FREE_DBI); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; + pgno_t gc_npages = read_txn->dbs[FREE_DBI].branch_pages + + read_txn->dbs[FREE_DBI].leaf_pages + + read_txn->dbs[FREE_DBI].large_pages; + MDBX_val key, data; + rc = outer_first(&couple.outer, &key, &data); + while (rc == MDBX_SUCCESS) { + const pnl_t pnl = data.iov_base; + if (unlikely(data.iov_len % sizeof(pgno_t) || + data.iov_len < MDBX_PNL_SIZEOF(pnl))) { + ERROR("%s/%d: %s %zu", "MDBX_CORRUPTED", MDBX_CORRUPTED, + "invalid GC-record length", data.iov_len); + return MDBX_CORRUPTED; + } + if (unlikely(!pnl_check(pnl, read_txn->geo.first_unallocated))) { + ERROR("%s/%d: %s", "MDBX_CORRUPTED", MDBX_CORRUPTED, + "invalid GC-record content"); + return MDBX_CORRUPTED; + } + gc_npages += MDBX_PNL_GETSIZE(pnl); + rc = outer_next(&couple.outer, &key, &data, MDBX_NEXT); } - } + if (unlikely(rc != MDBX_NOTFOUND)) + return rc; - err = osal_lck_init(env, inprocess_neighbor, lck_seize_rc); - if (unlikely(err != MDBX_SUCCESS)) { - eASSERT(env, MDBX_IS_ERROR(err)); - return err; - } + meta->geometry.first_unallocated = + read_txn->geo.first_unallocated - gc_npages; + meta->trees.main = read_txn->dbs[MAIN_DBI]; - env->me_lck = lck; - eASSERT(env, !MDBX_IS_ERROR(lck_seize_rc)); - return lck_seize_rc; -} + ctx_t ctx; + memset(&ctx, 0, sizeof(ctx)); + rc = osal_condpair_init(&ctx.condpair); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; -/* Open and/or initialize the lock region for the environment. 
*/ -__cold static int setup_lck(MDBX_env *env, mdbx_mode_t mode) { - eASSERT(env, env->me_lazy_fd != INVALID_HANDLE_VALUE); - eASSERT(env, env->me_lfd == INVALID_HANDLE_VALUE); + memset(data_buffer, 0, 2 * (size_t)MDBX_ENVCOPY_WRITEBUF); + ctx.write_buf[0] = data_buffer; + ctx.write_buf[1] = data_buffer + (size_t)MDBX_ENVCOPY_WRITEBUF; + ctx.first_unallocated = NUM_METAS; + ctx.env = env; + ctx.fd = fd; + ctx.txn = read_txn; - int err = osal_openfile(MDBX_OPEN_LCK, env, env->me_pathname.lck, - &env->me_lfd, mode); - if (err != MDBX_SUCCESS) { - switch (err) { - default: - return err; - case MDBX_ENOFILE: - case MDBX_EACCESS: - case MDBX_EPERM: - if (!F_ISSET(env->me_flags, MDBX_RDONLY | MDBX_EXCLUSIVE)) - return err; - break; - case MDBX_EROFS: - if ((env->me_flags & MDBX_RDONLY) == 0) - return err; - break; - } + osal_thread_t thread; + int thread_err = osal_thread_create(&thread, compacting_write_thread, &ctx); + if (likely(thread_err == MDBX_SUCCESS)) { + if (dest_is_pipe) { + if (!meta->trees.main.mod_txnid) + meta->trees.main.mod_txnid = read_txn->txnid; + compacting_fixup_meta(env, meta); + rc = osal_write(fd, buffer, meta_bytes); + } + if (likely(rc == MDBX_SUCCESS)) + rc = compacting_walk_tree(&ctx, &meta->trees.main); + if (ctx.write_len[ctx.head & 1]) + /* toggle to flush non-empty buffers */ + compacting_toggle_write_buffers(&ctx); - if (err != MDBX_ENOFILE) { - /* ENSURE the file system is read-only */ - err = osal_check_fs_rdonly(env->me_lazy_fd, env->me_pathname.lck, err); - if (err != MDBX_SUCCESS && - /* ignore ERROR_NOT_SUPPORTED for exclusive mode */ - !(err == MDBX_ENOSYS && (env->me_flags & MDBX_EXCLUSIVE))) - return err; - } + if (likely(rc == MDBX_SUCCESS) && + unlikely(meta->geometry.first_unallocated != ctx.first_unallocated)) { + if (ctx.first_unallocated > meta->geometry.first_unallocated) { + ERROR("the source DB %s: post-compactification used pages %" PRIaPGNO + " %c expected %" PRIaPGNO, + "has double-used pages or other corruption", + 
ctx.first_unallocated, '>', meta->geometry.first_unallocated); + rc = MDBX_CORRUPTED; /* corrupted DB */ + } + if (ctx.first_unallocated < meta->geometry.first_unallocated) { + WARNING( + "the source DB %s: post-compactification used pages %" PRIaPGNO + " %c expected %" PRIaPGNO, + "has page leak(s)", ctx.first_unallocated, '<', + meta->geometry.first_unallocated); + if (dest_is_pipe) + /* the root within already written meta-pages is wrong */ + rc = MDBX_CORRUPTED; + } + /* fixup meta */ + meta->geometry.first_unallocated = ctx.first_unallocated; + } - /* LY: without-lck mode (e.g. exclusive or on read-only filesystem) */ - env->me_lfd = INVALID_HANDLE_VALUE; + /* toggle with empty buffers to exit thread's loop */ + eASSERT(env, (ctx.write_len[ctx.head & 1]) == 0); + compacting_toggle_write_buffers(&ctx); + thread_err = osal_thread_join(thread); + eASSERT(env, (ctx.tail == ctx.head && ctx.write_len[ctx.head & 1] == 0) || + ctx.error); + osal_condpair_destroy(&ctx.condpair); + } + if (unlikely(thread_err != MDBX_SUCCESS)) + return thread_err; + if (unlikely(rc != MDBX_SUCCESS)) + return rc; + if (unlikely(ctx.error != MDBX_SUCCESS)) + return ctx.error; + if (!dest_is_pipe) + compacting_fixup_meta(env, meta); } - rthc_lock(); - err = setup_lck_locked(env); - rthc_unlock(); - return err; -} - -__cold int mdbx_is_readahead_reasonable(size_t volume, intptr_t redundancy) { - if (volume <= 1024 * 1024 * 4ul) - return MDBX_RESULT_TRUE; - - intptr_t pagesize, total_ram_pages; - int err = mdbx_get_sysraminfo(&pagesize, &total_ram_pages, nullptr); - if (unlikely(err != MDBX_SUCCESS)) - return err; + /* Extend file if required */ + if (meta->geometry.now != meta->geometry.first_unallocated) { + const size_t whole_size = pgno2bytes(env, meta->geometry.now); + if (!dest_is_pipe) + return osal_ftruncate(fd, whole_size); - const int log2page = log2n_powerof2(pagesize); - const intptr_t volume_pages = (volume + pagesize - 1) >> log2page; - const intptr_t redundancy_pages = - 
(redundancy < 0) ? -(intptr_t)((-redundancy + pagesize - 1) >> log2page) - : (intptr_t)(redundancy + pagesize - 1) >> log2page; - if (volume_pages >= total_ram_pages || - volume_pages + redundancy_pages >= total_ram_pages) - return MDBX_RESULT_FALSE; + const size_t used_size = pgno2bytes(env, meta->geometry.first_unallocated); + memset(data_buffer, 0, (size_t)MDBX_ENVCOPY_WRITEBUF); + for (size_t offset = used_size; offset < whole_size;) { + const size_t chunk = ((size_t)MDBX_ENVCOPY_WRITEBUF < whole_size - offset) + ? (size_t)MDBX_ENVCOPY_WRITEBUF + : whole_size - offset; + int rc = osal_write(fd, data_buffer, chunk); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; + offset += chunk; + } + } + return MDBX_SUCCESS; +} - intptr_t avail_ram_pages; - err = mdbx_get_sysraminfo(nullptr, nullptr, &avail_ram_pages); - if (unlikely(err != MDBX_SUCCESS)) - return err; +__cold static int copy_asis(MDBX_env *env, MDBX_txn *read_txn, + mdbx_filehandle_t fd, uint8_t *buffer, + const bool dest_is_pipe, + const MDBX_copy_flags_t flags) { + int rc = txn_end(read_txn, TXN_END_RESET_TMP); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; - return (volume_pages + redundancy_pages >= avail_ram_pages) - ? 
MDBX_RESULT_FALSE - : MDBX_RESULT_TRUE; -} + /* Temporarily block writers until we snapshot the meta pages */ + rc = lck_txn_lock(env, false); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; -/* Merge sync flags */ -static uint32_t merge_sync_flags(const uint32_t a, const uint32_t b) { - uint32_t r = a | b; + rc = txn_renew(read_txn, MDBX_TXN_RDONLY); + if (unlikely(rc != MDBX_SUCCESS)) { + lck_txn_unlock(env); + return rc; + } - /* avoid false MDBX_UTTERLY_NOSYNC */ - if (F_ISSET(r, MDBX_UTTERLY_NOSYNC) && !F_ISSET(a, MDBX_UTTERLY_NOSYNC) && - !F_ISSET(b, MDBX_UTTERLY_NOSYNC)) - r = (r - MDBX_UTTERLY_NOSYNC) | MDBX_SAFE_NOSYNC; + jitter4testing(false); + const size_t meta_bytes = pgno2bytes(env, NUM_METAS); + const troika_t troika = meta_tap(env); + /* Make a snapshot of meta-pages, + * but writing ones after the data was flushed */ + memcpy(buffer, env->dxb_mmap.base, meta_bytes); + meta_t *const headcopy = /* LY: get pointer to the snapshot copy */ + ptr_disp(buffer, + ptr_dist(meta_recent(env, &troika).ptr_c, env->dxb_mmap.base)); + lck_txn_unlock(env); - /* convert MDBX_DEPRECATED_MAPASYNC to MDBX_SAFE_NOSYNC */ - if ((r & (MDBX_WRITEMAP | MDBX_DEPRECATED_MAPASYNC)) == - (MDBX_WRITEMAP | MDBX_DEPRECATED_MAPASYNC) && - !F_ISSET(r, MDBX_UTTERLY_NOSYNC)) - r = (r - MDBX_DEPRECATED_MAPASYNC) | MDBX_SAFE_NOSYNC; + if (flags & MDBX_CP_FORCE_DYNAMIC_SIZE) + meta_make_sizeable(headcopy); + /* Update signature to steady */ + meta_sign_as_steady(headcopy); - /* force MDBX_NOMETASYNC if NOSYNC enabled */ - if (r & (MDBX_SAFE_NOSYNC | MDBX_UTTERLY_NOSYNC)) - r |= MDBX_NOMETASYNC; + /* Copy the data */ + const size_t whole_size = pgno_align2os_bytes(env, read_txn->geo.end_pgno); + const size_t used_size = pgno2bytes(env, read_txn->geo.first_unallocated); + jitter4testing(false); - assert(!(F_ISSET(r, MDBX_UTTERLY_NOSYNC) && - !F_ISSET(a, MDBX_UTTERLY_NOSYNC) && - !F_ISSET(b, MDBX_UTTERLY_NOSYNC))); - return r; -} + if (dest_is_pipe) + rc = osal_write(fd, buffer, 
meta_bytes); -__cold static int __must_check_result override_meta(MDBX_env *env, - size_t target, - txnid_t txnid, - const MDBX_meta *shape) { - int rc = alloc_page_buf(env); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; - MDBX_page *const page = env->me_pbuf; - meta_model(env, page, target); - MDBX_meta *const model = page_meta(page); - meta_set_txnid(env, model, txnid); - if (txnid) - eASSERT(env, check_meta_coherency(env, model, true)); - if (shape) { - if (txnid && unlikely(!check_meta_coherency(env, shape, false))) { - ERROR("bailout overriding meta-%zu since model failed " - "FreeDB/MainDB %s-check for txnid #%" PRIaTXN, - target, "pre", constmeta_txnid(shape)); - return MDBX_PROBLEM; + uint8_t *const data_buffer = + buffer + ceil_powerof2(meta_bytes, globals.sys_pagesize); +#if MDBX_USE_COPYFILERANGE + static bool copyfilerange_unavailable; + bool not_the_same_filesystem = false; + struct statfs statfs_info; + if (fstatfs(fd, &statfs_info) || + statfs_info.f_type == /* ECRYPTFS_SUPER_MAGIC */ 0xf15f) + /* avoid use copyfilerange_unavailable() to ecryptfs due bugs */ + not_the_same_filesystem = true; +#endif /* MDBX_USE_COPYFILERANGE */ + for (size_t offset = meta_bytes; rc == MDBX_SUCCESS && offset < used_size;) { +#if MDBX_USE_SENDFILE + static bool sendfile_unavailable; + if (dest_is_pipe && likely(!sendfile_unavailable)) { + off_t in_offset = offset; + const ssize_t written = + sendfile(fd, env->lazy_fd, &in_offset, used_size - offset); + if (likely(written > 0)) { + offset = in_offset; + continue; + } + rc = MDBX_ENODATA; + if (written == 0 || ignore_enosys(rc = errno) != MDBX_RESULT_TRUE) + break; + sendfile_unavailable = true; } - if (mdbx_static.flags & MDBX_DBG_DONT_UPGRADE) - memcpy(&model->mm_magic_and_version, &shape->mm_magic_and_version, - sizeof(model->mm_magic_and_version)); - model->mm_extra_flags = shape->mm_extra_flags; - model->mm_validator_id = shape->mm_validator_id; - model->mm_extra_pagehdr = shape->mm_extra_pagehdr; - 
memcpy(&model->mm_geo, &shape->mm_geo, sizeof(model->mm_geo)); - memcpy(&model->mm_dbs, &shape->mm_dbs, sizeof(model->mm_dbs)); - memcpy(&model->mm_canary, &shape->mm_canary, sizeof(model->mm_canary)); - memcpy(&model->mm_pages_retired, &shape->mm_pages_retired, - sizeof(model->mm_pages_retired)); - if (txnid) { - if ((!model->mm_dbs[FREE_DBI].md_mod_txnid && - model->mm_dbs[FREE_DBI].md_root != P_INVALID) || - (!model->mm_dbs[MAIN_DBI].md_mod_txnid && - model->mm_dbs[MAIN_DBI].md_root != P_INVALID)) - memcpy(&model->mm_magic_and_version, &shape->mm_magic_and_version, - sizeof(model->mm_magic_and_version)); - if (unlikely(!check_meta_coherency(env, model, false))) { - ERROR("bailout overriding meta-%zu since model failed " - "FreeDB/MainDB %s-check for txnid #%" PRIaTXN, - target, "post", txnid); - return MDBX_PROBLEM; +#endif /* MDBX_USE_SENDFILE */ + +#if MDBX_USE_COPYFILERANGE + if (!dest_is_pipe && !not_the_same_filesystem && + likely(!copyfilerange_unavailable)) { + off_t in_offset = offset, out_offset = offset; + ssize_t bytes_copied = copy_file_range( + env->lazy_fd, &in_offset, fd, &out_offset, used_size - offset, 0); + if (likely(bytes_copied > 0)) { + offset = in_offset; + continue; } + rc = MDBX_ENODATA; + if (bytes_copied == 0) + break; + rc = errno; + if (rc == EXDEV || rc == /* workaround for ecryptfs bug(s), + maybe useful for others FS */ + EINVAL) + not_the_same_filesystem = true; + else if (ignore_enosys(rc) == MDBX_RESULT_TRUE) + copyfilerange_unavailable = true; + else + break; } - } - unaligned_poke_u64(4, model->mm_sign, meta_sign(model)); - rc = validate_meta(env, model, page, (pgno_t)target, nullptr); - if (unlikely(MDBX_IS_ERROR(rc))) - return MDBX_PROBLEM; +#endif /* MDBX_USE_COPYFILERANGE */ - if (shape && memcmp(model, shape, sizeof(MDBX_meta)) == 0) { - NOTICE("skip overriding meta-%zu since no changes " - "for txnid #%" PRIaTXN, - target, txnid); - return MDBX_SUCCESS; + /* fallback to portable */ + const size_t chunk = 
((size_t)MDBX_ENVCOPY_WRITEBUF < used_size - offset) + ? (size_t)MDBX_ENVCOPY_WRITEBUF + : used_size - offset; + /* copy to avoid EFAULT in case swapped-out */ + memcpy(data_buffer, ptr_disp(env->dxb_mmap.base, offset), chunk); + rc = osal_write(fd, data_buffer, chunk); + offset += chunk; } - if (env->me_flags & MDBX_WRITEMAP) { -#if MDBX_ENABLE_PGOP_STAT - env->me_lck->mti_pgop_stat.msync.weak += 1; -#endif /* MDBX_ENABLE_PGOP_STAT */ - rc = osal_msync(&env->me_dxb_mmap, 0, - pgno_align2os_bytes(env, model->mm_geo.next), - MDBX_SYNC_DATA | MDBX_SYNC_IODQ); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; - /* override_meta() called only while current process have exclusive - * lock of a DB file. So meta-page could be updated directly without - * clearing consistency flag by mdbx_meta_update_begin() */ - memcpy(pgno2page(env, target), page, env->me_psize); - osal_flush_incoherent_cpu_writeback(); -#if MDBX_ENABLE_PGOP_STAT - env->me_lck->mti_pgop_stat.msync.weak += 1; -#endif /* MDBX_ENABLE_PGOP_STAT */ - rc = osal_msync(&env->me_dxb_mmap, 0, pgno_align2os_bytes(env, target + 1), - MDBX_SYNC_DATA | MDBX_SYNC_IODQ); - } else { -#if MDBX_ENABLE_PGOP_STAT - env->me_lck->mti_pgop_stat.wops.weak += 1; -#endif /* MDBX_ENABLE_PGOP_STAT */ - rc = osal_pwrite(env->me_fd4meta, page, env->me_psize, - pgno2bytes(env, target)); - if (rc == MDBX_SUCCESS && env->me_fd4meta == env->me_lazy_fd) { -#if MDBX_ENABLE_PGOP_STAT - env->me_lck->mti_pgop_stat.fsync.weak += 1; -#endif /* MDBX_ENABLE_PGOP_STAT */ - rc = osal_fsync(env->me_lazy_fd, MDBX_SYNC_DATA | MDBX_SYNC_IODQ); + /* Extend file if required */ + if (likely(rc == MDBX_SUCCESS) && whole_size != used_size) { + if (!dest_is_pipe) + rc = osal_ftruncate(fd, whole_size); + else { + memset(data_buffer, 0, (size_t)MDBX_ENVCOPY_WRITEBUF); + for (size_t offset = used_size; + rc == MDBX_SUCCESS && offset < whole_size;) { + const size_t chunk = + ((size_t)MDBX_ENVCOPY_WRITEBUF < whole_size - offset) + ? 
(size_t)MDBX_ENVCOPY_WRITEBUF + : whole_size - offset; + rc = osal_write(fd, data_buffer, chunk); + offset += chunk; + } } - osal_flush_incoherent_mmap(env->me_map, pgno2bytes(env, NUM_METAS), - env->me_os_psize); } - eASSERT(env, (!env->me_txn && !env->me_txn0) || - (env->me_stuck_meta == (int)target && - (env->me_flags & (MDBX_EXCLUSIVE | MDBX_RDONLY)) == - MDBX_EXCLUSIVE)); + return rc; } -__cold int mdbx_env_turn_for_recovery(MDBX_env *env, unsigned target) { - if (unlikely(target >= NUM_METAS)) - return MDBX_EINVAL; +__cold int mdbx_env_copy2fd(MDBX_env *env, mdbx_filehandle_t fd, + MDBX_copy_flags_t flags) { int rc = check_env(env, true); if (unlikely(rc != MDBX_SUCCESS)) return rc; - if (unlikely((env->me_flags & (MDBX_EXCLUSIVE | MDBX_RDONLY)) != - MDBX_EXCLUSIVE)) - return MDBX_EPERM; + const int dest_is_pipe = osal_is_pipe(fd); + if (MDBX_IS_ERROR(dest_is_pipe)) + return dest_is_pipe; - const MDBX_meta *const target_meta = METAPAGE(env, target); - txnid_t new_txnid = constmeta_txnid(target_meta); - if (new_txnid < MIN_TXNID) - new_txnid = MIN_TXNID; - for (unsigned n = 0; n < NUM_METAS; ++n) { - if (n == target) - continue; - MDBX_page *const page = pgno2page(env, n); - MDBX_meta meta = *page_meta(page); - if (validate_meta(env, &meta, page, n, nullptr) != MDBX_SUCCESS) { - int err = override_meta(env, n, 0, nullptr); - if (unlikely(err != MDBX_SUCCESS)) - return err; - } else { - txnid_t txnid = constmeta_txnid(&meta); - if (new_txnid <= txnid) - new_txnid = safe64_txnid_next(txnid); - } + if (!dest_is_pipe) { + rc = osal_fseek(fd, 0); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; } - if (unlikely(new_txnid > MAX_TXNID)) { - ERROR("txnid overflow, raise %d", MDBX_TXN_FULL); - return MDBX_TXN_FULL; + const size_t buffer_size = + pgno_align2os_bytes(env, NUM_METAS) + + ceil_powerof2(((flags & MDBX_CP_COMPACT) + ? 
2 * (size_t)MDBX_ENVCOPY_WRITEBUF + : (size_t)MDBX_ENVCOPY_WRITEBUF), + globals.sys_pagesize); + + uint8_t *buffer = nullptr; + rc = osal_memalign_alloc(globals.sys_pagesize, buffer_size, (void **)&buffer); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; + + MDBX_txn *read_txn = nullptr; + /* Do the lock/unlock of the reader mutex before starting the + * write txn. Otherwise other read txns could block writers. */ + rc = mdbx_txn_begin(env, nullptr, MDBX_TXN_RDONLY, &read_txn); + if (unlikely(rc != MDBX_SUCCESS)) { + osal_memalign_free(buffer); + return rc; + } + + if (!dest_is_pipe) { + /* Firstly write a stub to meta-pages. + * Now we sure to incomplete copy will not be used. */ + memset(buffer, -1, pgno2bytes(env, NUM_METAS)); + rc = osal_write(fd, buffer, pgno2bytes(env, NUM_METAS)); + } + + if (likely(rc == MDBX_SUCCESS)) { + memset(buffer, 0, pgno2bytes(env, NUM_METAS)); + rc = ((flags & MDBX_CP_COMPACT) ? copy_with_compacting : copy_asis)( + env, read_txn, fd, buffer, dest_is_pipe, flags); + } + mdbx_txn_abort(read_txn); + + if (!dest_is_pipe) { + if (likely(rc == MDBX_SUCCESS)) + rc = osal_fsync(fd, MDBX_SYNC_DATA | MDBX_SYNC_SIZE); + + /* Write actual meta */ + if (likely(rc == MDBX_SUCCESS)) + rc = osal_pwrite(fd, buffer, pgno2bytes(env, NUM_METAS), 0); + + if (likely(rc == MDBX_SUCCESS)) + rc = osal_fsync(fd, MDBX_SYNC_DATA | MDBX_SYNC_IODQ); } - return override_meta(env, target, new_txnid, target_meta); + + osal_memalign_free(buffer); + return rc; } -__cold int mdbx_env_open_for_recovery(MDBX_env *env, const char *pathname, - unsigned target_meta, bool writeable) { +__cold int mdbx_env_copy(MDBX_env *env, const char *dest_path, + MDBX_copy_flags_t flags) { #if defined(_WIN32) || defined(_WIN64) - wchar_t *pathnameW = nullptr; - int rc = osal_mb2w(pathname, &pathnameW); + wchar_t *dest_pathW = nullptr; + int rc = osal_mb2w(dest_path, &dest_pathW); if (likely(rc == MDBX_SUCCESS)) { - rc = mdbx_env_open_for_recoveryW(env, pathnameW, target_meta, 
writeable); - osal_free(pathnameW); + rc = mdbx_env_copyW(env, dest_pathW, flags); + osal_free(dest_pathW); } return rc; } -__cold int mdbx_env_open_for_recoveryW(MDBX_env *env, const wchar_t *pathname, - unsigned target_meta, bool writeable) { +__cold int mdbx_env_copyW(MDBX_env *env, const wchar_t *dest_path, + MDBX_copy_flags_t flags) { #endif /* Windows */ - if (unlikely(target_meta >= NUM_METAS)) - return MDBX_EINVAL; - int rc = check_env(env, false); + int rc = check_env(env, true); if (unlikely(rc != MDBX_SUCCESS)) return rc; - if (unlikely(env->me_map)) - return MDBX_EPERM; - env->me_stuck_meta = (int8_t)target_meta; - return + if (unlikely(!dest_path)) + return MDBX_EINVAL; + + /* The destination path must exist, but the destination file must not. + * We don't want the OS to cache the writes, since the source data is + * already in the OS cache. */ + mdbx_filehandle_t newfd; + rc = osal_openfile(MDBX_OPEN_COPY, env, dest_path, &newfd, #if defined(_WIN32) || defined(_WIN64) - mdbx_env_openW + (mdbx_mode_t)-1 #else - mdbx_env_open -#endif /* Windows */ - (env, pathname, writeable ? 
MDBX_EXCLUSIVE : MDBX_EXCLUSIVE | MDBX_RDONLY, - 0); -} + S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP +#endif + ); -__cold static int check_alternative_lck_absent(const pathchar_t *lck_pathname) { - int err = osal_fileexists(lck_pathname); - if (unlikely(err != MDBX_RESULT_FALSE)) { - if (err == MDBX_RESULT_TRUE) - err = MDBX_DUPLICATED_CLK; - ERROR("Alternative/Duplicate LCK-file '%" MDBX_PRIsPATH "' error %d", - lck_pathname, err); +#if defined(_WIN32) || defined(_WIN64) + /* no locking required since the file opened with ShareMode == 0 */ +#else + if (rc == MDBX_SUCCESS) { + MDBX_STRUCT_FLOCK lock_op; + memset(&lock_op, 0, sizeof(lock_op)); + lock_op.l_type = F_WRLCK; + lock_op.l_whence = SEEK_SET; + lock_op.l_start = 0; + lock_op.l_len = OFF_T_MAX; + if (MDBX_FCNTL(newfd, MDBX_F_SETLK, &lock_op) +#if (defined(__linux__) || defined(__gnu_linux__)) && defined(LOCK_EX) && \ + (!defined(__ANDROID_API__) || __ANDROID_API__ >= 24) + || flock(newfd, LOCK_EX | LOCK_NB) +#endif /* Linux */ + ) + rc = errno; } - return err; -} +#endif /* Windows / POSIX */ -__cold static int env_handle_pathname(MDBX_env *env, const pathchar_t *pathname, - const mdbx_mode_t mode) { - memset(&env->me_pathname, 0, sizeof(env->me_pathname)); - if (unlikely(!pathname || !*pathname)) - return MDBX_EINVAL; + if (rc == MDBX_SUCCESS) + rc = mdbx_env_copy2fd(env, newfd, flags); - int rc; -#if defined(_WIN32) || defined(_WIN64) - const DWORD dwAttrib = GetFileAttributesW(pathname); - if (dwAttrib == INVALID_FILE_ATTRIBUTES) { - rc = GetLastError(); - if (rc != MDBX_ENOFILE) - return rc; - if (mode == 0 || (env->me_flags & MDBX_RDONLY) != 0) - /* can't open existing */ - return rc; - - /* auto-create directory if requested */ - if ((env->me_flags & MDBX_NOSUBDIR) == 0 && - !CreateDirectoryW(pathname, nullptr)) { - rc = GetLastError(); - if (rc != ERROR_ALREADY_EXISTS) - return rc; - } - } else { - /* ignore passed MDBX_NOSUBDIR flag and set it automatically */ - env->me_flags |= MDBX_NOSUBDIR; - if 
(dwAttrib & FILE_ATTRIBUTE_DIRECTORY) - env->me_flags -= MDBX_NOSUBDIR; + if (newfd != INVALID_HANDLE_VALUE) { + int err = osal_closefile(newfd); + if (rc == MDBX_SUCCESS && err != rc) + rc = err; + if (rc != MDBX_SUCCESS) + (void)osal_removefile(dest_path); } -#else - struct stat st; - if (stat(pathname, &st) != 0) { - rc = errno; - if (rc != MDBX_ENOFILE) - return rc; - if (mode == 0 || (env->me_flags & MDBX_RDONLY) != 0) - /* can't open non-existing */ - return rc /* MDBX_ENOFILE */; - /* auto-create directory if requested */ - const mdbx_mode_t dir_mode = - (/* inherit read/write permissions for group and others */ mode & - (S_IRGRP | S_IWGRP | S_IROTH | S_IWOTH)) | - /* always add read/write/search for owner */ S_IRWXU | - ((mode & S_IRGRP) ? /* +search if readable by group */ S_IXGRP : 0) | - ((mode & S_IROTH) ? /* +search if readable by others */ S_IXOTH : 0); - if ((env->me_flags & MDBX_NOSUBDIR) == 0 && mkdir(pathname, dir_mode)) { - rc = errno; - if (rc != EEXIST) - return rc; - } - } else { - /* ignore passed MDBX_NOSUBDIR flag and set it automatically */ - env->me_flags |= MDBX_NOSUBDIR; - if (S_ISDIR(st.st_mode)) - env->me_flags -= MDBX_NOSUBDIR; - } -#endif + return rc; +} +/// \copyright SPDX-License-Identifier: Apache-2.0 +/// \note Please refer to the COPYRIGHT file for explanations license change, +/// credits and acknowledgments. 
+/// \author Леонид Юрьев aka Leonid Yuriev \date 2015-2024 - static const pathchar_t dxb_name[] = MDBX_DATANAME; - static const pathchar_t lck_name[] = MDBX_LOCKNAME; - static const pathchar_t lock_suffix[] = MDBX_LOCK_SUFFIX; -#if defined(_WIN32) || defined(_WIN64) - assert(dxb_name[0] == '\\' && lck_name[0] == '\\'); - const size_t pathname_len = wcslen(pathname); -#else - assert(dxb_name[0] == '/' && lck_name[0] == '/'); - const size_t pathname_len = strlen(pathname); -#endif - assert(!osal_isdirsep(lock_suffix[0])); - size_t base_len = pathname_len; - static const size_t dxb_name_len = ARRAY_LENGTH(dxb_name) - 1; - if (env->me_flags & MDBX_NOSUBDIR) { - if (base_len > dxb_name_len && - osal_pathequal(pathname + base_len - dxb_name_len, dxb_name, - dxb_name_len)) { - env->me_flags -= MDBX_NOSUBDIR; - base_len -= dxb_name_len; - } else if (base_len == dxb_name_len - 1 && osal_isdirsep(dxb_name[0]) && - osal_isdirsep(lck_name[0]) && - osal_pathequal(pathname + base_len - dxb_name_len + 1, - dxb_name + 1, dxb_name_len - 1)) { - env->me_flags -= MDBX_NOSUBDIR; - base_len -= dxb_name_len - 1; - } +__cold int cursor_check(const MDBX_cursor *mc) { + if (!mc->txn->tw.dirtylist) { + cASSERT(mc, (mc->txn->flags & MDBX_WRITEMAP) != 0 && !MDBX_AVOID_MSYNC); + } else { + cASSERT(mc, (mc->txn->flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC); + cASSERT(mc, mc->txn->tw.dirtyroom + mc->txn->tw.dirtylist->length == + (mc->txn->parent ? mc->txn->parent->tw.dirtyroom + : mc->txn->env->options.dp_limit)); } - const size_t suflen_with_NOSUBDIR = sizeof(lock_suffix) + sizeof(pathchar_t); - const size_t suflen_without_NOSUBDIR = sizeof(lck_name) + sizeof(dxb_name); - const size_t enough4any = (suflen_with_NOSUBDIR > suflen_without_NOSUBDIR) - ? 
suflen_with_NOSUBDIR - : suflen_without_NOSUBDIR; - const size_t bytes_needed = - sizeof(pathchar_t) * (base_len * 2 + pathname_len + 1) + enough4any; - env->me_pathname.buffer = osal_malloc(bytes_needed); - if (!env->me_pathname.buffer) - return MDBX_ENOMEM; + cASSERT(mc, (mc->checking & z_updating) ? mc->top + 1 <= mc->tree->height + : mc->top + 1 == mc->tree->height); + if (unlikely((mc->checking & z_updating) ? mc->top + 1 > mc->tree->height + : mc->top + 1 != mc->tree->height)) + return MDBX_CURSOR_FULL; - env->me_pathname.specified = env->me_pathname.buffer; - env->me_pathname.dxb = env->me_pathname.specified + pathname_len + 1; - env->me_pathname.lck = env->me_pathname.dxb + base_len + dxb_name_len + 1; - rc = MDBX_SUCCESS; - pathchar_t *const buf = env->me_pathname.buffer; - if (base_len) { - memcpy(buf, pathname, sizeof(pathchar_t) * pathname_len); - if (env->me_flags & MDBX_NOSUBDIR) { - const pathchar_t *const lck_ext = - osal_fileext(lck_name, ARRAY_LENGTH(lck_name)); - if (lck_ext) { - pathchar_t *pathname_ext = osal_fileext(buf, pathname_len); - memcpy(pathname_ext ? 
pathname_ext : buf + pathname_len, lck_ext, - sizeof(pathchar_t) * (ARRAY_END(lck_name) - lck_ext)); - rc = check_alternative_lck_absent(buf); - } + if (is_pointed(mc) && (mc->checking & z_updating) == 0) { + const page_t *mp = mc->pg[mc->top]; + const size_t nkeys = page_numkeys(mp); + if (mc->flags & z_hollow) { + cASSERT(mc, mc->ki[mc->top] <= nkeys); + if (mc->ki[mc->top] > nkeys) + return MDBX_CURSOR_FULL; } else { - memcpy(buf + base_len, dxb_name, sizeof(dxb_name)); - memcpy(buf + base_len + dxb_name_len, lock_suffix, sizeof(lock_suffix)); - rc = check_alternative_lck_absent(buf); + cASSERT(mc, mc->ki[mc->top] < nkeys); + if (mc->ki[mc->top] >= nkeys) + return MDBX_CURSOR_FULL; + } + if (inner_pointed(mc)) { + cASSERT(mc, is_filled(mc)); + if (!is_filled(mc)) + return MDBX_CURSOR_FULL; } + } - memcpy(env->me_pathname.dxb, pathname, sizeof(pathchar_t) * (base_len + 1)); - memcpy(env->me_pathname.lck, pathname, sizeof(pathchar_t) * base_len); - if (env->me_flags & MDBX_NOSUBDIR) { - memcpy(env->me_pathname.lck + base_len, lock_suffix, sizeof(lock_suffix)); + for (intptr_t n = 0; n <= mc->top; ++n) { + page_t *mp = mc->pg[n]; + const size_t nkeys = page_numkeys(mp); + const bool expect_branch = (n < mc->tree->height - 1) ? true : false; + const bool expect_nested_leaf = + (n + 1 == mc->tree->height - 1) ? true : false; + const bool branch = is_branch(mp) ? 
true : false; + cASSERT(mc, branch == expect_branch); + if (unlikely(branch != expect_branch)) + return MDBX_CURSOR_FULL; + if ((mc->checking & z_updating) == 0) { + cASSERT(mc, nkeys > mc->ki[n] || (!branch && nkeys == mc->ki[n] && + (mc->flags & z_hollow) != 0)); + if (unlikely(nkeys <= mc->ki[n] && !(!branch && nkeys == mc->ki[n] && + (mc->flags & z_hollow) != 0))) + return MDBX_CURSOR_FULL; } else { - memcpy(env->me_pathname.dxb + base_len, dxb_name, sizeof(dxb_name)); - memcpy(env->me_pathname.lck + base_len, lck_name, sizeof(lck_name)); + cASSERT(mc, nkeys + 1 >= mc->ki[n]); + if (unlikely(nkeys + 1 < mc->ki[n])) + return MDBX_CURSOR_FULL; } - } else { - assert(!(env->me_flags & MDBX_NOSUBDIR)); - memcpy(buf, dxb_name + 1, sizeof(dxb_name) - sizeof(pathchar_t)); - memcpy(buf + dxb_name_len - 1, lock_suffix, sizeof(lock_suffix)); - rc = check_alternative_lck_absent(buf); - memcpy(env->me_pathname.dxb, dxb_name + 1, - sizeof(dxb_name) - sizeof(pathchar_t)); - memcpy(env->me_pathname.lck, lck_name + 1, - sizeof(lck_name) - sizeof(pathchar_t)); + int err = page_check(mc, mp); + if (unlikely(err != MDBX_SUCCESS)) + return err; + + for (size_t i = 0; i < nkeys; ++i) { + if (branch) { + node_t *node = page_node(mp, i); + cASSERT(mc, node_flags(node) == 0); + if (unlikely(node_flags(node) != 0)) + return MDBX_CURSOR_FULL; + pgno_t pgno = node_pgno(node); + page_t *np; + err = page_get(mc, pgno, &np, mp->txnid); + cASSERT(mc, err == MDBX_SUCCESS); + if (unlikely(err != MDBX_SUCCESS)) + return err; + const bool nested_leaf = is_leaf(np) ? 
true : false; + cASSERT(mc, nested_leaf == expect_nested_leaf); + if (unlikely(nested_leaf != expect_nested_leaf)) + return MDBX_CURSOR_FULL; + err = page_check(mc, np); + if (unlikely(err != MDBX_SUCCESS)) + return err; + } + } } + return MDBX_SUCCESS; +} - memcpy(env->me_pathname.specified, pathname, - sizeof(pathchar_t) * (pathname_len + 1)); +__cold int cursor_check_updating(MDBX_cursor *mc) { + const uint8_t checking = mc->checking; + mc->checking |= z_updating; + const int rc = cursor_check(mc); + mc->checking = checking; return rc; } -__cold int mdbx_env_delete(const char *pathname, MDBX_env_delete_mode_t mode) { -#if defined(_WIN32) || defined(_WIN64) - wchar_t *pathnameW = nullptr; - int rc = osal_mb2w(pathname, &pathnameW); - if (likely(rc == MDBX_SUCCESS)) { - rc = mdbx_env_deleteW(pathnameW, mode); - osal_free(pathnameW); - } - return rc; +bool cursor_is_tracked(const MDBX_cursor *mc) { + for (MDBX_cursor *scan = mc->txn->cursors[cursor_dbi(mc)]; scan; + scan = scan->next) + if (mc == ((mc->flags & z_inner) ? &scan->subcur->cursor : scan)) + return true; + return false; } -__cold int mdbx_env_deleteW(const wchar_t *pathname, - MDBX_env_delete_mode_t mode) { -#endif /* Windows */ +/*----------------------------------------------------------------------------*/ - switch (mode) { - default: - return MDBX_EINVAL; - case MDBX_ENV_JUST_DELETE: - case MDBX_ENV_ENSURE_UNUSED: - case MDBX_ENV_WAIT_FOR_UNUSED: - break; - } +static int touch_dbi(MDBX_cursor *mc) { + cASSERT(mc, (mc->flags & z_inner) == 0); + cASSERT(mc, (*cursor_dbi_state(mc) & DBI_DIRTY) == 0); + *cursor_dbi_state(mc) |= DBI_DIRTY; + mc->txn->flags |= MDBX_TXN_DIRTY; -#ifdef __e2k__ /* https://bugs.mcst.ru/bugzilla/show_bug.cgi?id=6011 */ - MDBX_env *const dummy_env = alloca(sizeof(MDBX_env)); -#else - MDBX_env dummy_env_silo, *const dummy_env = &dummy_env_silo; -#endif - memset(dummy_env, 0, sizeof(*dummy_env)); - dummy_env->me_flags = - (mode == MDBX_ENV_ENSURE_UNUSED) ? 
MDBX_EXCLUSIVE : MDBX_ENV_DEFAULTS; - dummy_env->me_os_psize = (unsigned)osal_syspagesize(); - dummy_env->me_psize = (unsigned)mdbx_default_pagesize(); + if (!cursor_is_core(mc)) { + /* Touch DB record of named DB */ + cursor_couple_t cx; + int rc = dbi_check(mc->txn, MAIN_DBI); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; + rc = cursor_init(&cx.outer, mc->txn, MAIN_DBI); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; + mc->txn->dbi_state[MAIN_DBI] |= DBI_DIRTY; + rc = tree_search(&cx.outer, &container_of(mc->clc, kvx_t, clc)->name, + Z_MODIFY); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; + } + return MDBX_SUCCESS; +} - STATIC_ASSERT(sizeof(dummy_env->me_flags) == sizeof(MDBX_env_flags_t)); - int rc = MDBX_RESULT_TRUE, err = env_handle_pathname(dummy_env, pathname, 0); - if (likely(err == MDBX_SUCCESS)) { - mdbx_filehandle_t clk_handle = INVALID_HANDLE_VALUE, - dxb_handle = INVALID_HANDLE_VALUE; - if (mode > MDBX_ENV_JUST_DELETE) { - err = osal_openfile(MDBX_OPEN_DELETE, dummy_env, - dummy_env->me_pathname.dxb, &dxb_handle, 0); - err = (err == MDBX_ENOFILE) ? MDBX_SUCCESS : err; - if (err == MDBX_SUCCESS) { - err = osal_openfile(MDBX_OPEN_DELETE, dummy_env, - dummy_env->me_pathname.lck, &clk_handle, 0); - err = (err == MDBX_ENOFILE) ? 
MDBX_SUCCESS : err; - } - if (err == MDBX_SUCCESS && clk_handle != INVALID_HANDLE_VALUE) - err = osal_lockfile(clk_handle, mode == MDBX_ENV_WAIT_FOR_UNUSED); - if (err == MDBX_SUCCESS && dxb_handle != INVALID_HANDLE_VALUE) - err = osal_lockfile(dxb_handle, mode == MDBX_ENV_WAIT_FOR_UNUSED); - } +__hot int cursor_touch(MDBX_cursor *const mc, const MDBX_val *key, + const MDBX_val *data) { + cASSERT(mc, (mc->txn->flags & MDBX_TXN_RDONLY) == 0); + cASSERT(mc, is_pointed(mc) || mc->tree->height == 0); + cASSERT(mc, cursor_is_tracked(mc)); - if (err == MDBX_SUCCESS) { - err = osal_removefile(dummy_env->me_pathname.dxb); - if (err == MDBX_SUCCESS) - rc = MDBX_SUCCESS; - else if (err == MDBX_ENOFILE) - err = MDBX_SUCCESS; - } + cASSERT(mc, F_ISSET(dbi_state(mc->txn, FREE_DBI), DBI_LINDO | DBI_VALID)); + cASSERT(mc, F_ISSET(dbi_state(mc->txn, MAIN_DBI), DBI_LINDO | DBI_VALID)); + if ((mc->flags & z_inner) == 0) { + MDBX_txn *const txn = mc->txn; + dpl_lru_turn(txn); - if (err == MDBX_SUCCESS) { - err = osal_removefile(dummy_env->me_pathname.lck); - if (err == MDBX_SUCCESS) - rc = MDBX_SUCCESS; - else if (err == MDBX_ENOFILE) - err = MDBX_SUCCESS; + if (unlikely((*cursor_dbi_state(mc) & DBI_DIRTY) == 0)) { + int err = touch_dbi(mc); + if (unlikely(err != MDBX_SUCCESS)) + return err; } - if (err == MDBX_SUCCESS && !(dummy_env->me_flags & MDBX_NOSUBDIR) && - (/* pathname != "." */ pathname[0] != '.' || pathname[1] != 0) && - (/* pathname != ".." */ pathname[0] != '.' || pathname[1] != '.' 
|| - pathname[2] != 0)) { - err = osal_removedirectory(pathname); - if (err == MDBX_SUCCESS) - rc = MDBX_SUCCESS; - else if (err == MDBX_ENOFILE) - err = MDBX_SUCCESS; + /* Estimate how much space this operation will take: */ + /* 1) Max b-tree height, reasonable enough with including dups' sub-tree */ + size_t need = CURSOR_STACK_SIZE + 3; + /* 2) GC/FreeDB for any payload */ + if (!cursor_is_gc(mc)) { + need += txn->dbs[FREE_DBI].height + (size_t)3; + /* 3) Named DBs also dirty the main DB */ + if (cursor_is_main(mc)) + need += txn->dbs[MAIN_DBI].height + (size_t)3; } +#if xMDBX_DEBUG_SPILLING != 2 + /* production mode */ + /* 4) Double the page chain estimation + * for extensively splitting, rebalance and merging */ + need += need; + /* 5) Factor the key+data which to be put in */ + need += bytes2pgno(txn->env, node_size(key, data)) + (size_t)1; +#else + /* debug mode */ + (void)key; + (void)data; + txn->env->debug_dirtied_est = ++need; + txn->env->debug_dirtied_act = 0; +#endif /* xMDBX_DEBUG_SPILLING == 2 */ - if (dxb_handle != INVALID_HANDLE_VALUE) - osal_closefile(dxb_handle); - if (clk_handle != INVALID_HANDLE_VALUE) - osal_closefile(clk_handle); - } else if (err == MDBX_ENOFILE) - err = MDBX_SUCCESS; + int err = txn_spill(txn, mc, need); + if (unlikely(err != MDBX_SUCCESS)) + return err; + } - osal_free(dummy_env->me_pathname.buffer); - return (err == MDBX_SUCCESS) ? rc : err; + if (likely(mc->top >= 0) && !is_modifable(mc->txn, mc->pg[mc->top])) { + const int8_t top = mc->top; + mc->top = 0; + do { + int err = page_touch(mc); + if (unlikely(err != MDBX_SUCCESS)) + return err; + mc->top += 1; + } while (mc->top <= top); + mc->top = top; + } + return MDBX_SUCCESS; } -__cold static int env_open(MDBX_env *env, mdbx_mode_t mode) { - /* Использование O_DSYNC или FILE_FLAG_WRITE_THROUGH: - * - * 0) Если размер страниц БД меньше системной страницы ОЗУ, то ядру ОС - * придется чаще обновлять страницы в unified page cache. 
- * - * Однако, O_DSYNC не предполагает отключение unified page cache, - * поэтому подобные затруднения будем считать проблемой ОС и/или - * ожидаемым пенальти из-за использования мелких страниц БД. - * - * 1) В режиме MDBX_SYNC_DURABLE - O_DSYNC для записи как данных, - * так и мета-страниц. Однако, на Linux отказ от O_DSYNC с последующим - * fdatasync() может быть выгоднее при использовании HDD, так как - * позволяет io-scheduler переупорядочить запись с учетом актуального - * расположения файла БД на носителе. - * - * 2) В режиме MDBX_NOMETASYNC - O_DSYNC можно использовать для данных, - * но в этом может не быть смысла, так как fdatasync() всё равно - * требуется для гарантии фиксации мета после предыдущей транзакции. - * - * В итоге на нормальных системах (не Windows) есть два варианта: - * - при возможности O_DIRECT и/или io_ring для данных, скорее всего, - * есть смысл вызвать fdatasync() перед записью данных, а затем - * использовать O_DSYNC; - * - не использовать O_DSYNC и вызывать fdatasync() после записи данных. - * - * На Windows же следует минимизировать использование FlushFileBuffers() - * из-за проблем с производительностью. Поэтому на Windows в режиме - * MDBX_NOMETASYNC: - * - мета обновляется через дескриптор без FILE_FLAG_WRITE_THROUGH; - * - перед началом записи данных вызывается FlushFileBuffers(), если - * mti_meta_sync_txnid не совпадает с последней записанной мета; - * - данные записываются через дескриптор с FILE_FLAG_WRITE_THROUGH. - * - * 3) В режиме MDBX_SAFE_NOSYNC - O_DSYNC нет смысла использовать, пока не - * будет реализована возможность полностью асинхронной "догоняющей" - * записи в выделенном процессе-сервере с io-ring очередями внутри. - * - * ----- - * - * Использование O_DIRECT или FILE_FLAG_NO_BUFFERING: - * - * Назначение этих флагов в отключении файлового дескриптора от - * unified page cache, т.е. от отображенных в память данных в случае - * libmdbx. 
- * - * Поэтому, использование direct i/o в libmdbx без MDBX_WRITEMAP лишено - * смысла и контр-продуктивно, ибо так мы провоцируем ядро ОС на - * не-когерентность отображения в память с содержимым файла на носителе, - * либо требуем дополнительных проверок и действий направленных на - * фактическое отключение O_DIRECT для отображенных в память данных. - * - * В режиме MDBX_WRITEMAP когерентность отображенных данных обеспечивается - * физически. Поэтому использование direct i/o может иметь смысл, если у - * ядра ОС есть какие-то проблемы с msync(), в том числе с - * производительностью: - * - использование io_ring или gather-write может быть дешевле, чем - * просмотр PTE ядром и запись измененных/грязных; - * - но проблема в том, что записываемые из user mode страницы либо не - * будут помечены чистыми (и соответственно будут записаны ядром - * еще раз), либо ядру необходимо искать и чистить PTE при получении - * запроса на запись. - * - * Поэтому O_DIRECT или FILE_FLAG_NO_BUFFERING используется: - * - только в режиме MDBX_SYNC_DURABLE с MDBX_WRITEMAP; - * - когда me_psize >= me_os_psize; - * - опция сборки MDBX_AVOID_MSYNC != 0, которая по-умолчанию включена - * только на Windows (см ниже). - * - * ----- - * - * Использование FILE_FLAG_OVERLAPPED на Windows: - * - * У Windows очень плохо с I/O (за исключением прямых постраничных - * scatter/gather, которые работают в обход проблемного unified page - * cache и поэтому почти бесполезны в libmdbx). - * - * При этом всё еще хуже при использовании FlushFileBuffers(), что также - * требуется после FlushViewOfFile() в режиме MDBX_WRITEMAP. Поэтому - * на Windows вместо FlushViewOfFile() и FlushFileBuffers() следует - * использовать запись через дескриптор с FILE_FLAG_WRITE_THROUGH. - * - * В свою очередь, запись с FILE_FLAG_WRITE_THROUGH дешевле/быстрее - * при использовании FILE_FLAG_OVERLAPPED. 
В результате, на Windows - * в durable-режимах запись данных всегда в overlapped-режиме, - * при этом для записи мета требуется отдельный не-overlapped дескриптор. - */ +/*----------------------------------------------------------------------------*/ - env->me_pid = osal_getpid(); - int rc = osal_openfile((env->me_flags & MDBX_RDONLY) ? MDBX_OPEN_DXB_READ - : MDBX_OPEN_DXB_LAZY, - env, env->me_pathname.dxb, &env->me_lazy_fd, mode); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; +int cursor_shadow(MDBX_cursor *parent_cursor, MDBX_txn *nested_txn, + const size_t dbi) { -#if MDBX_LOCKING == MDBX_LOCKING_SYSV - env->me_sysv_ipc.key = ftok(env->me_pathname.dxb, 42); - if (unlikely(env->me_sysv_ipc.key == -1)) - return errno; -#endif /* MDBX_LOCKING */ + tASSERT(nested_txn, dbi > FREE_DBI && dbi < nested_txn->n_dbi); + const size_t size = parent_cursor->subcur + ? sizeof(MDBX_cursor) + sizeof(subcur_t) + : sizeof(MDBX_cursor); + for (MDBX_cursor *bk; parent_cursor; parent_cursor = bk->next) { + cASSERT(parent_cursor, parent_cursor != parent_cursor->next); + bk = parent_cursor; + if (parent_cursor->signature != cur_signature_live) + continue; + bk = osal_malloc(size); + if (unlikely(!bk)) + return MDBX_ENOMEM; +#if MDBX_DEBUG + memset(bk, 0xCD, size); + VALGRIND_MAKE_MEM_UNDEFINED(bk, size); +#endif /* MDBX_DEBUG */ + *bk = *parent_cursor; + parent_cursor->backup = bk; + /* Kill pointers into src to reduce abuse: The + * user may not use mc until dst ends. But we need a valid + * txn pointer here for cursor fixups to keep working. 
*/ + parent_cursor->txn = nested_txn; + parent_cursor->tree = &nested_txn->dbs[dbi]; + parent_cursor->dbi_state = &nested_txn->dbi_state[dbi]; + subcur_t *mx = parent_cursor->subcur; + if (mx != nullptr) { + *(subcur_t *)(bk + 1) = *mx; + mx->cursor.txn = nested_txn; + mx->cursor.dbi_state = parent_cursor->dbi_state; + } + parent_cursor->next = nested_txn->cursors[dbi]; + nested_txn->cursors[dbi] = parent_cursor; + } + return MDBX_SUCCESS; +} - /* Set the position in files outside of the data to avoid corruption - * due to erroneous use of file descriptors in the application code. */ - const uint64_t safe_parking_lot_offset = UINT64_C(0x7fffFFFF80000000); - osal_fseek(env->me_lazy_fd, safe_parking_lot_offset); +void cursor_eot(MDBX_cursor *mc, const bool merge) { + const unsigned stage = mc->signature; + MDBX_cursor *const bk = mc->backup; + ENSURE(mc->txn->env, stage == cur_signature_live || + (stage == cur_signature_wait4eot && bk)); + if (bk) { + subcur_t *mx = mc->subcur; + cASSERT(mc, mc->txn->parent != nullptr); + /* Zap: Using uninitialized memory '*mc->backup'. */ + MDBX_SUPPRESS_GOOFY_MSVC_ANALYZER(6001); + ENSURE(mc->txn->env, bk->signature == cur_signature_live); + cASSERT(mc, mx == bk->subcur); + if (merge) { + /* Update pointers to parent txn */ + mc->next = bk->next; + mc->backup = bk->backup; + mc->txn = bk->txn; + mc->tree = bk->tree; + mc->dbi_state = bk->dbi_state; + if (mx) { + mx->cursor.txn = mc->txn; + mx->cursor.dbi_state = mc->dbi_state; + } + } else { + /* Restore from backup, i.e. 
rollback/abort nested txn */ + *mc = *bk; + if (mx) + *mx = *(subcur_t *)(bk + 1); + } + if (stage == cur_signature_wait4eot /* Cursor was closed by user */) + mc->signature = stage /* Promote closed state to parent txn */; + bk->signature = 0; + osal_free(bk); + } else { + ENSURE(mc->txn->env, stage == cur_signature_live); + mc->signature = cur_signature_ready4dispose /* Cursor may be reused */; + mc->next = mc; + } +} - env->me_fd4meta = env->me_lazy_fd; -#if defined(_WIN32) || defined(_WIN64) - eASSERT(env, env->me_overlapped_fd == 0); - bool ior_direct = false; - if (!(env->me_flags & - (MDBX_RDONLY | MDBX_SAFE_NOSYNC | MDBX_NOMETASYNC | MDBX_EXCLUSIVE))) { - if (MDBX_AVOID_MSYNC && (env->me_flags & MDBX_WRITEMAP)) { - /* Запрошен режим MDBX_SYNC_DURABLE | MDBX_WRITEMAP при активной опции - * MDBX_AVOID_MSYNC. - * - * 1) В этой комбинации наиболее выгодно использовать WriteFileGather(), - * но для этого необходимо открыть файл с флагом FILE_FLAG_NO_BUFFERING и - * после обеспечивать выравнивание адресов и размера данных на границу - * системной страницы, что в свою очередь возможно если размер страницы БД - * не меньше размера системной страницы ОЗУ. Поэтому для открытия файла в - * нужном режиме требуется знать размер страницы БД. - * - * 2) Кроме этого, в Windows запись в заблокированный регион файла - * возможно только через тот-же дескриптор. Поэтому изначальный захват - * блокировок посредством osal_lck_seize(), захват/освобождение блокировок - * во время пишущих транзакций и запись данных должны выполнятся через - * один дескриптор. - * - * Таким образом, требуется прочитать волатильный заголовок БД, чтобы - * узнать размер страницы, чтобы открыть дескриптор файла в режиме нужном - * для записи данных, чтобы использовать именно этот дескриптор для - * изначального захвата блокировок. 
*/ - MDBX_meta header; - uint64_t dxb_filesize; - int err = read_header(env, &header, MDBX_SUCCESS, true); - if ((err == MDBX_SUCCESS && header.mm_psize >= env->me_os_psize) || - (err == MDBX_ENODATA && mode && env->me_psize >= env->me_os_psize && - osal_filesize(env->me_lazy_fd, &dxb_filesize) == MDBX_SUCCESS && - dxb_filesize == 0)) - /* Может быть коллизия, если два процесса пытаются одновременно создать - * БД с разным размером страницы, который у одного меньше системной - * страницы, а у другого НЕ меньше. Эта допустимая, но очень странная - * ситуация. Поэтому считаем её ошибочной и не пытаемся разрешить. */ - ior_direct = true; +/*----------------------------------------------------------------------------*/ + +static __always_inline int couple_init(cursor_couple_t *couple, + const MDBX_txn *const txn, + tree_t *const tree, kvx_t *const kvx, + uint8_t *const dbi_state) { + + VALGRIND_MAKE_MEM_UNDEFINED(couple, sizeof(cursor_couple_t)); + tASSERT(txn, F_ISSET(*dbi_state, DBI_VALID | DBI_LINDO)); + + couple->outer.signature = cur_signature_live; + couple->outer.next = &couple->outer; + couple->outer.backup = nullptr; + couple->outer.txn = (MDBX_txn *)txn; + couple->outer.tree = tree; + couple->outer.clc = &kvx->clc; + couple->outer.dbi_state = dbi_state; + couple->outer.top_and_flags = z_fresh_mark; + STATIC_ASSERT((int)z_branch == P_BRANCH && (int)z_leaf == P_LEAF && + (int)z_largepage == P_LARGE && (int)z_dupfix == P_DUPFIX); + couple->outer.checking = + (AUDIT_ENABLED() || (txn->env->flags & MDBX_VALIDATION)) + ? 
z_pagecheck | z_leaf + : z_leaf; + couple->outer.subcur = nullptr; + + if (tree->flags & MDBX_DUPSORT) { + couple->inner.cursor.signature = cur_signature_live; + subcur_t *const mx = couple->outer.subcur = &couple->inner; + mx->cursor.subcur = nullptr; + mx->cursor.next = &mx->cursor; + mx->cursor.txn = (MDBX_txn *)txn; + mx->cursor.tree = &mx->nested_tree; + mx->cursor.clc = ptr_disp(couple->outer.clc, sizeof(clc_t)); + tASSERT(txn, &mx->cursor.clc->k == &kvx->clc.v); + mx->cursor.dbi_state = dbi_state; + mx->cursor.top_and_flags = z_fresh_mark | z_inner; + STATIC_ASSERT(MDBX_DUPFIXED * 2 == P_DUPFIX); + mx->cursor.checking = + couple->outer.checking + ((tree->flags & MDBX_DUPFIXED) << 1); + } + + if (unlikely(*dbi_state & DBI_STALE)) + return sdb_fetch(couple->outer.txn, cursor_dbi(&couple->outer)); + + if (unlikely(kvx->clc.k.lmax == 0)) + return sdb_setup(txn->env, kvx, tree); + + return MDBX_SUCCESS; +} + +__cold int cursor_init4walk(cursor_couple_t *couple, const MDBX_txn *const txn, + tree_t *const tree, kvx_t *const kvx) { + return couple_init(couple, txn, tree, kvx, txn->dbi_state); +} + +int cursor_init(MDBX_cursor *mc, const MDBX_txn *txn, size_t dbi) { + STATIC_ASSERT(offsetof(cursor_couple_t, outer) == 0); + int rc = dbi_check(txn, dbi); + if (likely(rc == MDBX_SUCCESS)) + rc = couple_init(container_of(mc, cursor_couple_t, outer), txn, + &txn->dbs[dbi], &txn->env->kvs[dbi], &txn->dbi_state[dbi]); + return rc; +} + +__cold static int unexpected_dupsort(MDBX_cursor *mc) { + ERROR("unexpected dupsort-page/node for non-dupsort db/cursor (dbi %zu)", + cursor_dbi(mc)); + mc->txn->flags |= MDBX_TXN_ERROR; + be_poor(mc); + return MDBX_CORRUPTED; +} + +int cursor_dupsort_setup(MDBX_cursor *mc, const node_t *node, + const page_t *mp) { + cASSERT(mc, is_pointed(mc)); + subcur_t *mx = mc->subcur; + if (!MDBX_DISABLE_VALIDATION && unlikely(mx == nullptr)) + return unexpected_dupsort(mc); + + const uint8_t flags = node_flags(node); + switch (flags) { + default: + 
ERROR("invalid node flags %u", flags); + goto bailout; + case N_DUPDATA | N_SUBDATA: + if (!MDBX_DISABLE_VALIDATION && unlikely(node_ds(node) != sizeof(tree_t))) { + ERROR("invalid nested-db record size (%zu, expect %zu)", node_ds(node), + sizeof(tree_t)); + goto bailout; + } + memcpy(&mx->nested_tree, node_data(node), sizeof(tree_t)); + const txnid_t pp_txnid = mp->txnid; + if (!MDBX_DISABLE_VALIDATION && + unlikely(mx->nested_tree.mod_txnid > pp_txnid)) { + ERROR("nested-db.mod_txnid (%" PRIaTXN ") > page-txnid (%" PRIaTXN ")", + mx->nested_tree.mod_txnid, pp_txnid); + goto bailout; + } + mx->cursor.top_and_flags = z_fresh_mark | z_inner; + break; + case N_DUPDATA: + if (!MDBX_DISABLE_VALIDATION && unlikely(node_ds(node) <= PAGEHDRSZ)) { + ERROR("invalid nested-page size %zu", node_ds(node)); + goto bailout; } + page_t *sp = node_data(node); + mx->nested_tree.height = 1; + mx->nested_tree.branch_pages = 0; + mx->nested_tree.leaf_pages = 1; + mx->nested_tree.large_pages = 0; + mx->nested_tree.items = page_numkeys(sp); + mx->nested_tree.root = 0; + mx->nested_tree.mod_txnid = mp->txnid; + mx->cursor.top_and_flags = z_inner; + /* mc->flags &= ~z_hollow; */ + mx->cursor.pg[0] = sp; + mx->cursor.ki[0] = 0; + mx->nested_tree.flags = flags_db2sub(mc->tree->flags); + mx->nested_tree.dupfix_size = + (mc->tree->flags & MDBX_DUPFIXED) ? sp->dupfix_ksize : 0; + break; + } - rc = osal_openfile(ior_direct ? 
MDBX_OPEN_DXB_OVERLAPPED_DIRECT - : MDBX_OPEN_DXB_OVERLAPPED, - env, env->me_pathname.dxb, &env->me_overlapped_fd, 0); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; - env->me_data_lock_event = CreateEventW(nullptr, true, false, nullptr); - if (unlikely(!env->me_data_lock_event)) - return (int)GetLastError(); - osal_fseek(env->me_overlapped_fd, safe_parking_lot_offset); + if (unlikely(mx->nested_tree.dupfix_size != mc->tree->dupfix_size)) { + if (!MDBX_DISABLE_VALIDATION && unlikely(mc->tree->dupfix_size != 0)) { + ERROR("cursor mismatched nested-db dupfix_size %u", + mc->tree->dupfix_size); + goto bailout; + } + if (!MDBX_DISABLE_VALIDATION && + unlikely((mc->tree->flags & MDBX_DUPFIXED) == 0)) { + ERROR("mismatched nested-db flags %u", mc->tree->flags); + goto bailout; + } + if (!MDBX_DISABLE_VALIDATION && + unlikely(mx->nested_tree.dupfix_size < mc->clc->v.lmin || + mx->nested_tree.dupfix_size > mc->clc->v.lmax)) { + ERROR("mismatched nested-db.dupfix_size (%u) <> min/max value-length " + "(%zu/%zu)", + mx->nested_tree.dupfix_size, mc->clc->v.lmin, mc->clc->v.lmax); + goto bailout; + } + mc->tree->dupfix_size = mx->nested_tree.dupfix_size; + mc->clc->v.lmin = mc->clc->v.lmax = mx->nested_tree.dupfix_size; } -#else - if (mode == 0) { - /* pickup mode for lck-file */ - struct stat st; - if (unlikely(fstat(env->me_lazy_fd, &st))) - return errno; - mode = st.st_mode; + + DEBUG("Sub-db dbi -%zu root page %" PRIaPGNO, cursor_dbi(&mx->cursor), + mx->nested_tree.root); + return MDBX_SUCCESS; + +bailout: + mx->cursor.top_and_flags = z_poor_mark | z_inner; + return MDBX_CORRUPTED; +} + +/*----------------------------------------------------------------------------*/ + +MDBX_cursor *cursor_cpstk(const MDBX_cursor *csrc, MDBX_cursor *cdst) { + cASSERT(cdst, cdst->txn == csrc->txn); + cASSERT(cdst, cdst->tree == csrc->tree); + cASSERT(cdst, cdst->clc == csrc->clc); + cASSERT(cdst, cdst->dbi_state == csrc->dbi_state); + cdst->top_and_flags = csrc->top_and_flags; + + for 
(intptr_t i = 0; i <= csrc->top; i++) { + cdst->pg[i] = csrc->pg[i]; + cdst->ki[i] = csrc->ki[i]; } - mode = (/* inherit read permissions for group and others */ mode & - (S_IRGRP | S_IWGRP | S_IROTH | S_IWOTH)) | - /* always add read/write for owner */ S_IRUSR | S_IWUSR | - ((mode & S_IRGRP) ? /* +write if readable by group */ S_IWGRP : 0) | - ((mode & S_IROTH) ? /* +write if readable by others */ S_IWOTH : 0); -#endif /* !Windows */ - const int lck_rc = setup_lck(env, mode); - if (unlikely(MDBX_IS_ERROR(lck_rc))) - return lck_rc; - if (env->me_lfd != INVALID_HANDLE_VALUE) - osal_fseek(env->me_lfd, safe_parking_lot_offset); + return cdst; +} - eASSERT(env, env->me_dsync_fd == INVALID_HANDLE_VALUE); - if (!(env->me_flags & - (MDBX_RDONLY | MDBX_SAFE_NOSYNC | MDBX_DEPRECATED_MAPASYNC -#if defined(_WIN32) || defined(_WIN64) - | MDBX_EXCLUSIVE -#endif /* !Windows */ - ))) { - rc = osal_openfile(MDBX_OPEN_DXB_DSYNC, env, env->me_pathname.dxb, - &env->me_dsync_fd, 0); - if (unlikely(MDBX_IS_ERROR(rc))) - return rc; - if (env->me_dsync_fd != INVALID_HANDLE_VALUE) { - if ((env->me_flags & MDBX_NOMETASYNC) == 0) - env->me_fd4meta = env->me_dsync_fd; - osal_fseek(env->me_dsync_fd, safe_parking_lot_offset); - } +#define SIBLING_LEFT 0 +#define SIBLING_RIGHT 2 +static __always_inline int sibling(MDBX_cursor *mc, int dir) { + assert(dir == SIBLING_LEFT || dir == SIBLING_RIGHT); + if (mc->top < 1) { + /* root has no siblings */ + return MDBX_NOTFOUND; } - const MDBX_env_flags_t lazy_flags = - MDBX_SAFE_NOSYNC | MDBX_UTTERLY_NOSYNC | MDBX_NOMETASYNC; - const MDBX_env_flags_t mode_flags = lazy_flags | MDBX_LIFORECLAIM | - MDBX_NORDAHEAD | MDBX_RDONLY | - MDBX_WRITEMAP; + cursor_pop(mc); + DEBUG("parent page is page %" PRIaPGNO ", index %u", mc->pg[mc->top]->pgno, + mc->ki[mc->top]); - MDBX_lockinfo *const lck = env->me_lck_mmap.lck; - if (lck && lck_rc != MDBX_RESULT_TRUE && (env->me_flags & MDBX_RDONLY) == 0) { - MDBX_env_flags_t snap_flags; - while ((snap_flags = 
atomic_load32(&lck->mti_envmode, mo_AcquireRelease)) == - MDBX_RDONLY) { - if (atomic_cas32(&lck->mti_envmode, MDBX_RDONLY, - (snap_flags = (env->me_flags & mode_flags)))) { - /* The case: - * - let's assume that for some reason the DB file is smaller - * than it should be according to the geometry, - * but not smaller than the last page used; - * - the first process that opens the database (lck_rc == RESULT_TRUE) - * does this in readonly mode and therefore cannot bring - * the file size back to normal; - * - some next process (lck_rc != RESULT_TRUE) opens the DB in - * read-write mode and now is here. - * - * FIXME: Should we re-check and set the size of DB-file right here? */ - break; - } - atomic_yield(); + int err; + if ((dir == SIBLING_RIGHT) + ? (mc->ki[mc->top] + (size_t)1 >= page_numkeys(mc->pg[mc->top])) + : (mc->ki[mc->top] == 0)) { + DEBUG("no more keys aside, moving to next %s sibling", + dir ? "right" : "left"); + err = (dir == SIBLING_LEFT) ? cursor_sibling_left(mc) + : cursor_sibling_right(mc); + if (err != MDBX_SUCCESS) { + if (likely(err == MDBX_NOTFOUND)) + /* undo cursor_pop before returning */ + mc->top += 1; + return err; } + } else { + assert((dir - 1) == -1 || (dir - 1) == 1); + mc->ki[mc->top] += (indx_t)(dir - 1); + DEBUG("just moving to %s index key %u", + (dir == SIBLING_RIGHT) ? "right" : "left", mc->ki[mc->top]); + } + cASSERT(mc, is_branch(mc->pg[mc->top])); - if (env->me_flags & MDBX_ACCEDE) { - /* Pickup current mode-flags (MDBX_LIFORECLAIM, MDBX_NORDAHEAD, etc). */ - const MDBX_env_flags_t diff = - (snap_flags ^ env->me_flags) & - ((snap_flags & lazy_flags) ? mode_flags - : mode_flags & ~MDBX_WRITEMAP); - env->me_flags ^= diff; - NOTICE("accede mode-flags: 0x%X, 0x%X -> 0x%X", diff, - env->me_flags ^ diff, env->me_flags); - } - - /* Ранее упущенный не очевидный момент: При работе БД в режимах - * не-синхронной/отложенной фиксации на диске, все процессы-писатели должны - * иметь одинаковый режим MDBX_WRITEMAP. 
- * - * В противном случае, сброс на диск следует выполнять дважды: сначала - * msync(), затем fdatasync(). При этом msync() не обязан отрабатывать - * в процессах без MDBX_WRITEMAP, так как файл в память отображен только - * для чтения. Поэтому, в общем случае, различия по MDBX_WRITEMAP не - * позволяют выполнить фиксацию данных на диск, после их изменения в другом - * процессе. - * - * В режиме MDBX_UTTERLY_NOSYNC позволять совместную работу с MDBX_WRITEMAP - * также не следует, поскольку никакой процесс (в том числе последний) не - * может гарантированно сбросить данные на диск, а следовательно не должен - * помечать какую-либо транзакцию как steady. - * - * В результате, требуется либо запретить совместную работу процессам с - * разным MDBX_WRITEMAP в режиме отложенной записи, либо отслеживать такое - * смешивание и блокировать steady-пометки - что контрпродуктивно. */ - const MDBX_env_flags_t rigorous_flags = - (snap_flags & lazy_flags) - ? MDBX_SAFE_NOSYNC | MDBX_UTTERLY_NOSYNC | MDBX_WRITEMAP - : MDBX_SAFE_NOSYNC | MDBX_UTTERLY_NOSYNC; - const MDBX_env_flags_t rigorous_diff = - (snap_flags ^ env->me_flags) & rigorous_flags; - if (rigorous_diff) { - ERROR("current mode/flags 0x%X incompatible with requested 0x%X, " - "rigorous diff 0x%X", - env->me_flags, snap_flags, rigorous_diff); - return MDBX_INCOMPATIBLE; - } + page_t *mp = mc->pg[mc->top]; + const node_t *node = page_node(mp, mc->ki[mc->top]); + err = page_get(mc, node_pgno(node), &mp, mp->txnid); + if (likely(err == MDBX_SUCCESS)) { + err = cursor_push(mc, mp, + (dir == SIBLING_LEFT) ? 
(indx_t)page_numkeys(mp) - 1 : 0); + if (likely(err == MDBX_SUCCESS)) + return err; } - mincore_clean_cache(env); - const int dxb_rc = setup_dxb(env, lck_rc, mode); - if (MDBX_IS_ERROR(dxb_rc)) - return dxb_rc; + be_poor(mc); + return err; +} - rc = osal_check_fs_incore(env->me_lazy_fd); - env->me_incore = false; - if (rc == MDBX_RESULT_TRUE) { - env->me_incore = true; - NOTICE("%s", "in-core database"); - rc = MDBX_SUCCESS; - } else if (unlikely(rc != MDBX_SUCCESS)) { - ERROR("check_fs_incore(), err %d", rc); - return rc; +__hot int cursor_sibling_left(MDBX_cursor *mc) { + int err = sibling(mc, SIBLING_LEFT); + if (likely(err != MDBX_NOTFOUND)) + return err; + + cASSERT(mc, mc->top >= 0); + size_t nkeys = page_numkeys(mc->pg[mc->top]); + cASSERT(mc, nkeys > 0); + mc->ki[mc->top] = 0; + return MDBX_NOTFOUND; +} + +__hot int cursor_sibling_right(MDBX_cursor *mc) { + int err = sibling(mc, SIBLING_RIGHT); + if (likely(err != MDBX_NOTFOUND)) + return err; + + cASSERT(mc, mc->top >= 0); + size_t nkeys = page_numkeys(mc->pg[mc->top]); + cASSERT(mc, nkeys > 0); + mc->ki[mc->top] = (indx_t)nkeys - 1; + be_hollow(mc); + return MDBX_NOTFOUND; +} + +/*----------------------------------------------------------------------------*/ + +/* Функция-шаблон: Приземляет курсор на данные в текущей позиции. + * В том числе, загружает данные во вложенный курсор при его наличии. 
*/ +static __always_inline int cursor_bring(const bool inner, const bool tend2first, + MDBX_cursor *__restrict mc, + MDBX_val *__restrict key, + MDBX_val *__restrict data) { + if (inner) { + cASSERT(mc, !data && !mc->subcur && (mc->flags & z_inner) != 0); + } else { + cASSERT(mc, (mc->flags & z_inner) == 0); } - if (unlikely(/* recovery mode */ env->me_stuck_meta >= 0) && - (lck_rc != /* exclusive */ MDBX_RESULT_TRUE || - (env->me_flags & MDBX_EXCLUSIVE) == 0)) { - ERROR("%s", "recovery requires exclusive mode"); - return MDBX_BUSY; + const page_t *mp = mc->pg[mc->top]; + if (!MDBX_DISABLE_VALIDATION && unlikely(!check_leaf_type(mc, mp))) { + ERROR("unexpected leaf-page #%" PRIaPGNO " type 0x%x seen by cursor", + mp->pgno, mp->flags); + return MDBX_CORRUPTED; } - DEBUG("opened dbenv %p", (void *)env); - env->me_flags |= MDBX_ENV_ACTIVE; - if (!lck || lck_rc == MDBX_RESULT_TRUE) { - env->me_lck->mti_envmode.weak = env->me_flags & mode_flags; - env->me_lck->mti_meta_sync_txnid.weak = - (uint32_t)recent_committed_txnid(env); - env->me_lck->mti_reader_check_timestamp.weak = osal_monotime(); + const size_t nkeys = page_numkeys(mp); + cASSERT(mc, nkeys > 0); + const size_t ki = mc->ki[mc->top]; + cASSERT(mc, nkeys > ki); + + if (inner && is_dupfix_leaf(mp)) { + be_filled(mc); + if (likely(key)) + *key = page_dupfix_key(mp, ki, mc->tree->dupfix_size); + return MDBX_SUCCESS; } - if (lck) { - if (lck_rc == MDBX_RESULT_TRUE) { - rc = osal_lck_downgrade(env); - DEBUG("lck-downgrade-%s: rc %i", - (env->me_flags & MDBX_EXCLUSIVE) ? "partial" : "full", rc); - if (rc != MDBX_SUCCESS) - return rc; + + const node_t *__restrict node = page_node(mp, ki); + if (!inner && (node_flags(node) & N_DUPDATA)) { + int err = cursor_dupsort_setup(mc, node, mp); + if (unlikely(err != MDBX_SUCCESS)) + return err; + MDBX_ANALYSIS_ASSUME(mc->subcur != nullptr); + if (node_flags(node) & N_SUBDATA) { + err = tend2first ? 
inner_first(&mc->subcur->cursor, data) + : inner_last(&mc->subcur->cursor, data); + if (unlikely(err != MDBX_SUCCESS)) + return err; } else { - rc = cleanup_dead_readers(env, false, NULL); - if (MDBX_IS_ERROR(rc)) - return rc; + if (!tend2first) + mc->subcur->cursor.ki[0] = (indx_t)mc->subcur->nested_tree.items - 1; + if (data) { + const page_t *inner_mp = mc->subcur->cursor.pg[0]; + cASSERT(mc, is_subpage(inner_mp) && is_leaf(inner_mp)); + const size_t inner_ki = mc->subcur->cursor.ki[0]; + if (is_dupfix_leaf(inner_mp)) + *data = page_dupfix_key(inner_mp, inner_ki, mc->tree->dupfix_size); + else + *data = get_key(page_node(inner_mp, inner_ki)); + } + } + } else { + if (!inner) + inner_gone(mc); + if (data) { + int err = node_read(mc, node, data, mp); + if (unlikely(err != MDBX_SUCCESS)) + return err; } } - rc = (env->me_flags & MDBX_RDONLY) - ? MDBX_SUCCESS - : osal_ioring_create(&env->me_ioring -#if defined(_WIN32) || defined(_WIN64) - , - ior_direct, env->me_overlapped_fd -#endif /* Windows */ - ); - return rc; + be_filled(mc); + get_key_optional(node, key); + return MDBX_SUCCESS; } -__cold int mdbx_env_open(MDBX_env *env, const char *pathname, - MDBX_env_flags_t flags, mdbx_mode_t mode) { -#if defined(_WIN32) || defined(_WIN64) - wchar_t *pathnameW = nullptr; - int rc = osal_mb2w(pathname, &pathnameW); - if (likely(rc == MDBX_SUCCESS)) { - rc = mdbx_env_openW(env, pathnameW, flags, mode); - osal_free(pathnameW); - if (rc == MDBX_SUCCESS) - /* force to make cache of the multi-byte pathname representation */ - mdbx_env_get_path(env, &pathname); +/* Функция-шаблон: Устанавливает курсор в начало или конец. */ +static __always_inline int cursor_brim(const bool inner, const bool tend2first, + MDBX_cursor *__restrict mc, + MDBX_val *__restrict key, + MDBX_val *__restrict data) { + if (mc->top != 0) { + int err = tree_search(mc, nullptr, tend2first ? 
Z_FIRST : Z_LAST); + if (unlikely(err != MDBX_SUCCESS)) + return err; } - return rc; + const size_t nkeys = page_numkeys(mc->pg[mc->top]); + cASSERT(mc, nkeys > 0); + mc->ki[mc->top] = tend2first ? 0 : nkeys - 1; + return cursor_bring(inner, tend2first, mc, key, data); } -__cold int mdbx_env_openW(MDBX_env *env, const wchar_t *pathname, - MDBX_env_flags_t flags, mdbx_mode_t mode) { -#endif /* Windows */ +__hot int inner_first(MDBX_cursor *mc, MDBX_val *data) { + return cursor_brim(true, true, mc, data, nullptr); +} - int rc = check_env(env, false); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; +__hot int inner_last(MDBX_cursor *mc, MDBX_val *data) { + return cursor_brim(true, false, mc, data, nullptr); +} - if (unlikely(flags & ~ENV_USABLE_FLAGS)) - return MDBX_EINVAL; +__hot int outer_first(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data) { + return cursor_brim(false, true, mc, key, data); +} - if (unlikely(env->me_lazy_fd != INVALID_HANDLE_VALUE || - (env->me_flags & MDBX_ENV_ACTIVE) != 0 || env->me_map)) - return MDBX_EPERM; +__hot int outer_last(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data) { + return cursor_brim(false, false, mc, key, data); +} - /* Pickup previously mdbx_env_set_flags(), - * but avoid MDBX_UTTERLY_NOSYNC by disjunction */ - const uint32_t saved_me_flags = env->me_flags; - flags = merge_sync_flags(flags | MDBX_DEPRECATED_COALESCE, env->me_flags); +/*----------------------------------------------------------------------------*/ - if (flags & MDBX_RDONLY) { - /* Silently ignore irrelevant flags when we're only getting read access */ - flags &= ~(MDBX_WRITEMAP | MDBX_DEPRECATED_MAPASYNC | MDBX_SAFE_NOSYNC | - MDBX_NOMETASYNC | MDBX_DEPRECATED_COALESCE | MDBX_LIFORECLAIM | - MDBX_NOMEMINIT | MDBX_ACCEDE); - mode = 0; +/* Функция-шаблон: Передвигает курсор на одну позицию. + * При необходимости управляет вложенным курсором. 
*/ +static __always_inline int cursor_step(const bool inner, const bool forward, + MDBX_cursor *__restrict mc, + MDBX_val *__restrict key, + MDBX_val *__restrict data, + MDBX_cursor_op op) { + if (forward) { + if (inner) + cASSERT(mc, op == MDBX_NEXT); + else + cASSERT(mc, + op == MDBX_NEXT || op == MDBX_NEXT_DUP || op == MDBX_NEXT_NODUP); } else { -#if MDBX_MMAP_INCOHERENT_FILE_WRITE - /* Temporary `workaround` for OpenBSD kernel's flaw. - * See https://libmdbx.dqdkfa.ru/dead-github/issues/67 */ - if ((flags & MDBX_WRITEMAP) == 0) { - if (flags & MDBX_ACCEDE) - flags |= MDBX_WRITEMAP; - else { - debug_log(MDBX_LOG_ERROR, __func__, __LINE__, - "System (i.e. OpenBSD) requires MDBX_WRITEMAP because " - "of an internal flaw(s) in a file/buffer/page cache.\n"); - return 42 /* ENOPROTOOPT */; - } - } -#endif /* MDBX_MMAP_INCOHERENT_FILE_WRITE */ + if (inner) + cASSERT(mc, op == MDBX_PREV); + else + cASSERT(mc, + op == MDBX_PREV || op == MDBX_PREV_DUP || op == MDBX_PREV_NODUP); } - - env->me_flags = (flags & ~MDBX_FATAL_ERROR); - rc = env_handle_pathname(env, pathname, mode); - if (unlikely(rc != MDBX_SUCCESS)) - goto bailout; - - env->me_dbxs = osal_calloc(env->me_maxdbs, sizeof(env->me_dbxs[0])); - env->me_db_flags = osal_calloc(env->me_maxdbs, sizeof(env->me_db_flags[0])); - env->me_dbi_seqs = osal_calloc(env->me_maxdbs, sizeof(env->me_dbi_seqs[0])); - if (unlikely(!(env->me_dbxs && env->me_db_flags && env->me_dbi_seqs))) { - rc = MDBX_ENOMEM; - goto bailout; + if (inner) { + cASSERT(mc, !data && !mc->subcur && (mc->flags & z_inner) != 0); + } else { + cASSERT(mc, (mc->flags & z_inner) == 0); } - if ((flags & MDBX_RDONLY) == 0) { - MDBX_txn *txn = nullptr; - const intptr_t bitmap_bytes = -#if MDBX_ENABLE_DBI_SPARSE - ceil_powerof2(env->me_maxdbs, - CHAR_BIT * sizeof(txn->mt_dbi_sparse[0])) / - CHAR_BIT; -#else - 0; -#endif /* MDBX_ENABLE_DBI_SPARSE */ - const size_t base = sizeof(MDBX_txn) + sizeof(MDBX_cursor); - const size_t size = - base + bitmap_bytes + - 
env->me_maxdbs * - (sizeof(txn->mt_dbs[0]) + sizeof(txn->mt_cursors[0]) + - sizeof(txn->mt_dbi_seqs[0]) + sizeof(txn->mt_dbi_state[0])); - rc = alloc_page_buf(env); - if (unlikely(rc != MDBX_SUCCESS)) - goto bailout; - - memset(env->me_pbuf, -1, env->me_psize * (size_t)2); - memset(ptr_disp(env->me_pbuf, env->me_psize * (size_t)2), 0, env->me_psize); - txn = osal_calloc(1, size); - if (unlikely(!txn)) { - rc = MDBX_ENOMEM; - goto bailout; + if (unlikely(is_poor(mc))) { + if (mc->flags & z_fresh) { + if (forward) + return inner ? inner_first(mc, key) : outer_first(mc, key, data); + else + return inner ? inner_last(mc, key) : outer_last(mc, key, data); } - txn->mt_dbs = ptr_disp(txn, base); - txn->mt_cursors = - ptr_disp(txn->mt_dbs, env->me_maxdbs * sizeof(txn->mt_dbs[0])); - txn->mt_dbi_seqs = - ptr_disp(txn->mt_cursors, env->me_maxdbs * sizeof(txn->mt_cursors[0])); - txn->mt_dbi_state = - ptr_disp(txn, size - env->me_maxdbs * sizeof(txn->mt_dbi_state[0])); -#if MDBX_ENABLE_DBI_SPARSE - txn->mt_dbi_sparse = ptr_disp(txn->mt_dbi_state, -bitmap_bytes); -#endif /* MDBX_ENABLE_DBI_SPARSE */ - txn->mt_env = env; - txn->mt_flags = MDBX_TXN_FINISHED; - env->me_txn0 = txn; - txn->tw.retired_pages = pnl_alloc(MDBX_PNL_INITIAL); - txn->tw.relist = pnl_alloc(MDBX_PNL_INITIAL); - if (unlikely(!txn->tw.retired_pages || !txn->tw.relist)) { - rc = MDBX_ENOMEM; - goto bailout; + if (mc->flags & z_after_delete) { + mc->flags -= z_after_delete; + return MDBX_NOTFOUND; } - adjust_defaults(env); + return MDBX_ENODATA; } - rc = env_open(env, mode); - if (unlikely(rc != MDBX_SUCCESS)) - goto bailout; - -#if MDBX_DEBUG - const meta_troika_t troika = meta_tap(env); - const meta_ptr_t head = meta_recent(env, &troika); - const MDBX_db *db = &head.ptr_c->mm_dbs[MAIN_DBI]; + const page_t *mp = mc->pg[mc->top]; + const intptr_t nkeys = page_numkeys(mp); + cASSERT(mc, nkeys > 0); + + intptr_t ki = mc->ki[mc->top]; + const uint8_t state = mc->flags & (z_after_delete | z_hollow); + if 
(likely(state == 0)) { + cASSERT(mc, ki < nkeys); + if (!inner && op != (forward ? MDBX_NEXT_NODUP : MDBX_PREV_NODUP)) { + int err = MDBX_NOTFOUND; + if (inner_pointed(mc)) { + err = forward ? inner_next(&mc->subcur->cursor, data) + : inner_prev(&mc->subcur->cursor, data); + if (likely(err == MDBX_SUCCESS)) { + get_key_optional(page_node(mp, ki), key); + mc->flags &= z_clear_mask; + return MDBX_SUCCESS; + } + if (unlikely(err != MDBX_NOTFOUND)) { + cASSERT(mc, !inner_pointed(mc)); + return err; + } + cASSERT(mc, !forward || (mc->subcur->cursor.flags & z_eof)); + } + if (op == (forward ? MDBX_NEXT_DUP : MDBX_PREV_DUP)) + return err; + } + if (!inner) + inner_gone(mc); + } else { + if (mc->flags & z_hollow) + cASSERT(mc, !inner_pointed(mc)); - DEBUG("opened database version %u, pagesize %u", - (uint8_t)unaligned_peek_u64(4, head.ptr_c->mm_magic_and_version), - env->me_psize); - DEBUG("using meta page %" PRIaPGNO ", txn %" PRIaTXN, - data_page(head.ptr_c)->mp_pgno, head.txnid); - DEBUG("depth: %u", db->md_depth); - DEBUG("entries: %" PRIu64, db->md_entries); - DEBUG("branch pages: %" PRIaPGNO, db->md_branch_pages); - DEBUG("leaf pages: %" PRIaPGNO, db->md_leaf_pages); - DEBUG("large/overflow pages: %" PRIaPGNO, db->md_overflow_pages); - DEBUG("root: %" PRIaPGNO, db->md_root); - DEBUG("schema_altered: %" PRIaTXN, db->md_mod_txnid); -#endif /* MDBX_DEBUG */ + if (!inner && op == (forward ? 
MDBX_NEXT_DUP : MDBX_PREV_DUP)) + return MDBX_NOTFOUND; - if (likely(rc == MDBX_SUCCESS)) { -#if defined(ENABLE_MEMCHECK) || defined(__SANITIZE_ADDRESS__) - txn_valgrind(env, nullptr); -#endif /* ENABLE_MEMCHECK || __SANITIZE_ADDRESS__ */ - } else { - bailout: - if (likely(env_close(env, false) == MDBX_SUCCESS)) { - env->me_flags = saved_me_flags; + if (state & z_hollow) { + if (forward) + return MDBX_NOTFOUND; + ki = nkeys - forward; + } else if (ki < nkeys) { + if (forward && (state & z_after_delete)) + goto bring; } else { - rc = MDBX_PANIC; - env->me_flags = saved_me_flags | MDBX_FATAL_ERROR; + ki = nkeys - forward; } } - return rc; -} -/* Destroy resources from mdbx_env_open(), clear our readers & DBIs */ -__cold static int env_close(MDBX_env *env, bool resurrect_after_fork) { - const unsigned flags = env->me_flags; - env->me_flags &= ~ENV_INTERNAL_FLAGS; - if (flags & MDBX_ENV_TXKEY) { - thread_key_delete(env->me_txkey); - env->me_txkey = 0; + DEBUG("turn-%s: top page was %" PRIaPGNO " in cursor %p, ki %zi of %zi", + forward ? 
"next" : "prev", mp->pgno, __Wpedantic_format_voidptr(mc), ki, + nkeys); + if (forward) { + if (likely(++ki < nkeys)) + mc->ki[mc->top] = (indx_t)ki; + else { + DEBUG("%s", "=====> move to next sibling page"); + int err = cursor_sibling_right(mc); + if (unlikely(err != MDBX_SUCCESS)) + return err; + mp = mc->pg[mc->top]; + DEBUG("next page is %" PRIaPGNO ", key index %u", mp->pgno, + mc->ki[mc->top]); + } + } else { + if (likely(--ki >= 0)) + mc->ki[mc->top] = (indx_t)ki; + else { + DEBUG("%s", "=====> move to prev sibling page"); + int err = cursor_sibling_left(mc); + if (unlikely(err != MDBX_SUCCESS)) + return err; + mp = mc->pg[mc->top]; + DEBUG("prev page is %" PRIaPGNO ", key index %u", mp->pgno, + mc->ki[mc->top]); + } } + DEBUG("==> cursor points to page %" PRIaPGNO " with %zu keys, key index %u", + mp->pgno, page_numkeys(mp), mc->ki[mc->top]); - if (env->me_lck) - munlock_all(env); +bring: + return cursor_bring(inner, forward, mc, key, data); +} - rthc_lock(); - int rc = rthc_remove(env); - rthc_unlock(); +__hot int inner_next(MDBX_cursor *mc, MDBX_val *data) { + return cursor_step(true, true, mc, data, nullptr, MDBX_NEXT); +} -#if MDBX_ENABLE_DBI_LOCKFREE - for (struct mdbx_defer_free_item *next, *ptr = env->me_defer_free; ptr; - ptr = next) { - next = ptr->next; - osal_free(ptr); - } - env->me_defer_free = nullptr; -#endif /* MDBX_ENABLE_DBI_LOCKFREE */ +__hot int inner_prev(MDBX_cursor *mc, MDBX_val *data) { + return cursor_step(true, false, mc, data, nullptr, MDBX_PREV); +} - if (!(env->me_flags & MDBX_RDONLY)) - osal_ioring_destroy(&env->me_ioring); +__hot int outer_next(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, + MDBX_cursor_op op) { + return cursor_step(false, true, mc, key, data, op); +} - env->me_lck = nullptr; - if (env->me_lck_mmap.lck) - osal_munmap(&env->me_lck_mmap); +__hot int outer_prev(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, + MDBX_cursor_op op) { + return cursor_step(false, false, mc, key, data, op); +} - if (env->me_map) { - 
osal_munmap(&env->me_dxb_mmap); -#ifdef ENABLE_MEMCHECK - VALGRIND_DISCARD(env->me_valgrind_handle); - env->me_valgrind_handle = -1; -#endif /* ENABLE_MEMCHECK */ - } +/*----------------------------------------------------------------------------*/ -#if defined(_WIN32) || defined(_WIN64) - eASSERT(env, !env->me_overlapped_fd || - env->me_overlapped_fd == INVALID_HANDLE_VALUE); - if (env->me_data_lock_event != INVALID_HANDLE_VALUE) { - CloseHandle(env->me_data_lock_event); - env->me_data_lock_event = INVALID_HANDLE_VALUE; - } - eASSERT(env, !resurrect_after_fork); - if (env->me_pathname_char) { - osal_free(env->me_pathname_char); - env->me_pathname_char = nullptr; - } -#endif /* Windows */ +__hot int cursor_put(MDBX_cursor *mc, const MDBX_val *key, MDBX_val *data, + unsigned flags) { + int err; + DKBUF_DEBUG; + MDBX_env *const env = mc->txn->env; + DEBUG("==> put db %d key [%s], size %" PRIuPTR ", data [%s] size %" PRIuPTR, + cursor_dbi_dbg(mc), DKEY_DEBUG(key), key->iov_len, + DVAL_DEBUG((flags & MDBX_RESERVE) ? nullptr : data), data->iov_len); - if (env->me_dsync_fd != INVALID_HANDLE_VALUE) { - (void)osal_closefile(env->me_dsync_fd); - env->me_dsync_fd = INVALID_HANDLE_VALUE; - } + if ((flags & MDBX_CURRENT) != 0 && (mc->flags & z_inner) == 0) { + if (unlikely(flags & (MDBX_APPEND | MDBX_NOOVERWRITE))) + return MDBX_EINVAL; + /* Запрошено обновление текущей записи, на которой сейчас стоит курсор. + * Проверяем что переданный ключ совпадает со значением в текущей позиции + * курсора. Здесь проще вызвать cursor_ops(), так как для обслуживания + * таблиц с MDBX_DUPSORT также требуется текущий размер данных. 
*/ + MDBX_val current_key, current_data; + err = cursor_ops(mc, ¤t_key, ¤t_data, MDBX_GET_CURRENT); + if (unlikely(err != MDBX_SUCCESS)) + return err; + if (mc->clc->k.cmp(key, ¤t_key) != 0) + return MDBX_EKEYMISMATCH; - if (env->me_lazy_fd != INVALID_HANDLE_VALUE) { - (void)osal_closefile(env->me_lazy_fd); - env->me_lazy_fd = INVALID_HANDLE_VALUE; - } - - if (env->me_lfd != INVALID_HANDLE_VALUE) { - (void)osal_closefile(env->me_lfd); - env->me_lfd = INVALID_HANDLE_VALUE; - } - - if (!resurrect_after_fork) { - if (env->me_dbxs) { - for (size_t i = CORE_DBS; i < env->me_numdbs; ++i) - if (env->me_dbxs[i].md_name.iov_len) - osal_free(env->me_dbxs[i].md_name.iov_base); - osal_free(env->me_dbxs); - env->me_numdbs = CORE_DBS; - env->me_dbxs = nullptr; - } - if (env->me_pbuf) { - osal_memalign_free(env->me_pbuf); - env->me_pbuf = nullptr; - } - if (env->me_dbi_seqs) { - osal_free(env->me_dbi_seqs); - env->me_dbi_seqs = nullptr; - } - if (env->me_db_flags) { - osal_free(env->me_db_flags); - env->me_db_flags = nullptr; - } - if (env->me_pathname.buffer) { - osal_free(env->me_pathname.buffer); - env->me_pathname.buffer = nullptr; - } - if (env->me_txn0) { - dpl_free(env->me_txn0); - txl_free(env->me_txn0->tw.lifo_reclaimed); - pnl_free(env->me_txn0->tw.retired_pages); - pnl_free(env->me_txn0->tw.spilled.list); - pnl_free(env->me_txn0->tw.relist); - osal_free(env->me_txn0); - env->me_txn0 = nullptr; - } - } - env->me_stuck_meta = -1; - return rc; -} - -#if !(defined(_WIN32) || defined(_WIN64)) -__cold int mdbx_env_resurrect_after_fork(MDBX_env *env) { - if (unlikely(!env)) - return MDBX_EINVAL; - - if (unlikely(env->me_signature.weak != MDBX_ME_SIGNATURE)) - return MDBX_EBADSIGN; - - if (unlikely(env->me_flags & MDBX_FATAL_ERROR)) - return MDBX_PANIC; - - if (unlikely((env->me_flags & MDBX_ENV_ACTIVE) == 0)) - return MDBX_SUCCESS; - - const uint32_t new_pid = osal_getpid(); - if (unlikely(env->me_pid == new_pid)) - return MDBX_SUCCESS; - - if 
(!atomic_cas32(&env->me_signature, MDBX_ME_SIGNATURE, ~MDBX_ME_SIGNATURE)) - return MDBX_EBADSIGN; + if (unlikely((flags & MDBX_MULTIPLE))) + goto drop_current; - if (env->me_txn) - txn_abort(env->me_txn0); - env->me_live_reader = 0; - int rc = env_close(env, true); - env->me_signature.weak = MDBX_ME_SIGNATURE; - if (likely(rc == MDBX_SUCCESS)) { - rc = (env->me_flags & MDBX_EXCLUSIVE) ? MDBX_BUSY : env_open(env, 0); - if (unlikely(rc != MDBX_SUCCESS && env_close(env, false) != MDBX_SUCCESS)) { - rc = MDBX_PANIC; - env->me_flags |= MDBX_FATAL_ERROR; + if (mc->subcur) { + node_t *node = page_node(mc->pg[mc->top], mc->ki[mc->top]); + if (node_flags(node) & N_DUPDATA) { + cASSERT(mc, inner_pointed(mc)); + /* Если за ключом более одного значения, либо если размер данных + * отличается, то вместо обновления требуется удаление и + * последующая вставка. */ + if (mc->subcur->nested_tree.items > 1 || + current_data.iov_len != data->iov_len) { + drop_current: + err = cursor_del(mc, flags & MDBX_ALLDUPS); + if (unlikely(err != MDBX_SUCCESS)) + return err; + flags -= MDBX_CURRENT; + goto skip_check_samedata; + } + } else if (unlikely(node_size(key, data) > env->leaf_nodemax)) { + /* Уже есть пара key-value хранящаяся в обычном узле. Новые данные + * слишком большие для размещения в обычном узле вместе с ключом, но + * могут быть размещены в вложенном дереве. Удаляем узел со старыми + * данными, чтобы при помещении новых создать вложенное дерево. 
*/ + err = cursor_del(mc, 0); + if (unlikely(err != MDBX_SUCCESS)) + return err; + flags -= MDBX_CURRENT; + goto skip_check_samedata; + } } + if (!(flags & MDBX_RESERVE) && + unlikely(cmp_lenfast(¤t_data, data) == 0)) + return MDBX_SUCCESS /* the same data, nothing to update */; + skip_check_samedata:; } - return rc; -} -#endif /* Windows */ -__cold int mdbx_env_close_ex(MDBX_env *env, bool dont_sync) { - MDBX_page *dp; int rc = MDBX_SUCCESS; + if (mc->tree->height == 0) { + /* new database, cursor has nothing to point to */ + cASSERT(mc, is_poor(mc)); + rc = MDBX_NO_ROOT; + } else if ((flags & MDBX_CURRENT) == 0) { + bool exact = false; + MDBX_val last_key, old_data; + if ((flags & MDBX_APPEND) && mc->tree->items > 0) { + old_data.iov_base = nullptr; + old_data.iov_len = 0; + rc = (mc->flags & z_inner) ? inner_last(mc, &last_key) + : outer_last(mc, &last_key, &old_data); + if (likely(rc == MDBX_SUCCESS)) { + const int cmp = mc->clc->k.cmp(key, &last_key); + if (likely(cmp > 0)) { + mc->ki[mc->top]++; /* step forward for appending */ + rc = MDBX_NOTFOUND; + } else if (unlikely(cmp != 0)) { + /* new-key < last-key */ + return MDBX_EKEYMISMATCH; + } else { + rc = MDBX_SUCCESS; + exact = true; + } + } + } else { + csr_t csr = + /* olddata may not be updated in case DUPFIX-page of dupfix-subDB */ + cursor_seek(mc, (MDBX_val *)key, &old_data, MDBX_SET); + rc = csr.err; + exact = csr.exact; + } + if (likely(rc == MDBX_SUCCESS)) { + if (exact) { + if (unlikely(flags & MDBX_NOOVERWRITE)) { + DEBUG("duplicate key [%s]", DKEY_DEBUG(key)); + *data = old_data; + return MDBX_KEYEXIST; + } + if (unlikely(mc->flags & z_inner)) { + /* nested subtree of DUPSORT-database with the same key, + * nothing to update */ + eASSERT(env, + data->iov_len == 0 && (old_data.iov_len == 0 || + /* olddata may not be updated in case + DUPFIX-page of dupfix-subDB */ + (mc->tree->flags & MDBX_DUPFIXED))); + return MDBX_SUCCESS; + } + if (unlikely(flags & MDBX_ALLDUPS) && inner_pointed(mc)) { + err = 
cursor_del(mc, MDBX_ALLDUPS); + if (unlikely(err != MDBX_SUCCESS)) + return err; + flags -= MDBX_ALLDUPS; + cASSERT(mc, mc->top + 1 == mc->tree->height); + rc = (mc->top >= 0) ? MDBX_NOTFOUND : MDBX_NO_ROOT; + exact = false; + } else if (!(flags & (MDBX_RESERVE | MDBX_MULTIPLE))) { + /* checking for early exit without dirtying pages */ + if (unlikely(eq_fast(data, &old_data))) { + cASSERT(mc, mc->clc->v.cmp(data, &old_data) == 0); + if (mc->subcur) { + if (flags & MDBX_NODUPDATA) + return MDBX_KEYEXIST; + if (flags & MDBX_APPENDDUP) + return MDBX_EKEYMISMATCH; + } + /* the same data, nothing to update */ + return MDBX_SUCCESS; + } + cASSERT(mc, mc->clc->v.cmp(data, &old_data) != 0); + } + } + } else if (unlikely(rc != MDBX_NOTFOUND)) + return rc; + } - if (unlikely(!env)) - return MDBX_EINVAL; + mc->flags &= ~z_after_delete; + MDBX_val xdata, *ref_data = data; + size_t *batch_dupfix_done = nullptr, batch_dupfix_given = 0; + if (unlikely(flags & MDBX_MULTIPLE)) { + batch_dupfix_given = data[1].iov_len; + batch_dupfix_done = &data[1].iov_len; + *batch_dupfix_done = 0; + } - if (unlikely(env->me_signature.weak != MDBX_ME_SIGNATURE)) - return MDBX_EBADSIGN; + /* Cursor is positioned, check for room in the dirty list */ + err = cursor_touch(mc, key, ref_data); + if (unlikely(err)) + return err; -#if MDBX_ENV_CHECKPID || !(defined(_WIN32) || defined(_WIN64)) - /* Check the PID even if MDBX_ENV_CHECKPID=0 on non-Windows - * platforms (i.e. where fork() is available). - * This is required to legitimize a call after fork() - * from a child process, that should be allowed to free resources. 
*/ - if (unlikely(env->me_pid != osal_getpid())) - env->me_flags |= MDBX_FATAL_ERROR; -#endif /* MDBX_ENV_CHECKPID */ + if (unlikely(rc == MDBX_NO_ROOT)) { + /* new database, write a root leaf page */ + DEBUG("%s", "allocating new root leaf page"); + pgr_t npr = page_new(mc, P_LEAF); + if (unlikely(npr.err != MDBX_SUCCESS)) + return npr.err; + npr.err = cursor_push(mc, npr.page, 0); + if (unlikely(npr.err != MDBX_SUCCESS)) + return npr.err; + mc->tree->root = npr.page->pgno; + mc->tree->height++; + if (mc->tree->flags & MDBX_INTEGERKEY) { + assert(key->iov_len >= mc->clc->k.lmin && + key->iov_len <= mc->clc->k.lmax); + mc->clc->k.lmin = mc->clc->k.lmax = key->iov_len; + } + if (mc->tree->flags & (MDBX_INTEGERDUP | MDBX_DUPFIXED)) { + assert(data->iov_len >= mc->clc->v.lmin && + data->iov_len <= mc->clc->v.lmax); + assert(mc->subcur != nullptr); + mc->tree->dupfix_size = /* mc->subcur->nested_tree.dupfix_size = */ + (unsigned)(mc->clc->v.lmin = mc->clc->v.lmax = data->iov_len); + cASSERT(mc, mc->clc->v.lmin == mc->subcur->cursor.clc->k.lmin); + cASSERT(mc, mc->clc->v.lmax == mc->subcur->cursor.clc->k.lmax); + if (mc->flags & z_inner) + npr.page->flags |= P_DUPFIX; + } + } - if (env->me_map && (env->me_flags & (MDBX_RDONLY | MDBX_FATAL_ERROR)) == 0 && - env->me_txn0) { - if (env->me_txn0->mt_owner && env->me_txn0->mt_owner != osal_thread_self()) - return MDBX_BUSY; - } else - dont_sync = true; + MDBX_val old_singledup, old_data; + tree_t nested_dupdb; + page_t *sub_root = nullptr; + bool insert_key, insert_data; + uint16_t fp_flags = P_LEAF; + page_t *fp = env->page_auxbuf; + fp->txnid = mc->txn->front_txnid; + insert_key = insert_data = (rc != MDBX_SUCCESS); + old_singledup.iov_base = nullptr; + old_singledup.iov_len = 0; + if (insert_key) { + /* The key does not exist */ + DEBUG("inserting key at index %i", mc->ki[mc->top]); + if ((mc->tree->flags & MDBX_DUPSORT) && + node_size(key, data) > env->leaf_nodemax) { + /* Too big for a node, insert in sub-DB. 
Set up an empty + * "old sub-page" for convert_to_subtree to expand to a full page. */ + fp->dupfix_ksize = + (mc->tree->flags & MDBX_DUPFIXED) ? (uint16_t)data->iov_len : 0; + fp->lower = fp->upper = 0; + old_data.iov_len = PAGEHDRSZ; + goto convert_to_subtree; + } + } else { + /* there's only a key anyway, so this is a no-op */ + if (is_dupfix_leaf(mc->pg[mc->top])) { + size_t ksize = mc->tree->dupfix_size; + if (unlikely(key->iov_len != ksize)) + return MDBX_BAD_VALSIZE; + void *ptr = page_dupfix_ptr(mc->pg[mc->top], mc->ki[mc->top], ksize); + memcpy(ptr, key->iov_base, ksize); + fix_parent: + /* if overwriting slot 0 of leaf, need to + * update branch key if there is a parent page */ + if (mc->top && !mc->ki[mc->top]) { + size_t dtop = 1; + mc->top--; + /* slot 0 is always an empty key, find real slot */ + while (mc->top && !mc->ki[mc->top]) { + mc->top--; + dtop++; + } + err = MDBX_SUCCESS; + if (mc->ki[mc->top]) + err = tree_propagate_key(mc, key); + cASSERT(mc, mc->top + dtop < UINT16_MAX); + mc->top += (uint8_t)dtop; + if (unlikely(err != MDBX_SUCCESS)) + return err; + } - if (!atomic_cas32(&env->me_signature, MDBX_ME_SIGNATURE, 0)) - return MDBX_EBADSIGN; + if (AUDIT_ENABLED()) { + err = cursor_check(mc); + if (unlikely(err != MDBX_SUCCESS)) + return err; + } + return MDBX_SUCCESS; + } - if (!dont_sync) { -#if defined(_WIN32) || defined(_WIN64) - /* On windows, without blocking is impossible to determine whether another - * process is running a writing transaction or not. - * Because in the "owner died" condition kernel don't release - * file lock immediately. */ - rc = env_sync(env, true, false); - rc = (rc == MDBX_RESULT_TRUE) ? MDBX_SUCCESS : rc; -#else - struct stat st; - if (unlikely(fstat(env->me_lazy_fd, &st))) - rc = errno; - else if (st.st_nlink > 0 /* don't sync deleted files */) { - rc = env_sync(env, true, true); - rc = (rc == MDBX_BUSY || rc == EAGAIN || rc == EACCES || rc == EBUSY || - rc == EWOULDBLOCK || rc == MDBX_RESULT_TRUE) - ? 
MDBX_SUCCESS - : rc; + more: + if (AUDIT_ENABLED()) { + err = cursor_check(mc); + if (unlikely(err != MDBX_SUCCESS)) + return err; } -#endif /* Windows */ - } + node_t *const node = page_node(mc->pg[mc->top], mc->ki[mc->top]); - if (env->me_txn0 && env->me_txn0->mt_owner == osal_thread_self()) - osal_txn_unlock(env); + /* Large/Overflow page overwrites need special handling */ + if (unlikely(node_flags(node) & N_BIGDATA)) { + const size_t dpages = (node_size(key, data) > env->leaf_nodemax) + ? largechunk_npages(env, data->iov_len) + : 0; - eASSERT(env, env->me_signature.weak == 0); - rc = env_close(env, false) ? MDBX_PANIC : rc; - ENSURE(env, osal_fastmutex_destroy(&env->me_dbi_lock) == MDBX_SUCCESS); -#if defined(_WIN32) || defined(_WIN64) - /* me_remap_guard don't have destructor (Slim Reader/Writer Lock) */ - DeleteCriticalSection(&env->me_windowsbug_lock); -#else - ENSURE(env, osal_fastmutex_destroy(&env->me_remap_guard) == MDBX_SUCCESS); -#endif /* Windows */ + const pgno_t pgno = node_largedata_pgno(node); + pgr_t lp = page_get_large(mc, pgno, mc->pg[mc->top]->txnid); + if (unlikely(lp.err != MDBX_SUCCESS)) + return lp.err; + cASSERT(mc, page_type(lp.page) == P_LARGE); -#if MDBX_LOCKING > MDBX_LOCKING_SYSV - MDBX_lockinfo *const stub = lckless_stub(env); - /* может вернуть ошибку в дочернем процессе после fork() */ - osal_ipclock_destroy(&stub->mti_wlock); -#endif /* MDBX_LOCKING */ + /* Is the ov page from this txn (or a parent) and big enough? */ + const size_t ovpages = lp.page->pages; + const size_t extra_threshold = + (mc->tree == &mc->txn->dbs[FREE_DBI]) + ? 1 + : /* LY: add configurable threshold to keep reserve space */ 0; + if (!is_frozen(mc->txn, lp.page) && ovpages >= dpages && + ovpages <= dpages + extra_threshold) { + /* yes, overwrite it. 
*/ + if (!is_modifable(mc->txn, lp.page)) { + if (is_spilled(mc->txn, lp.page)) { + lp = /* TODO: avoid search and get txn & spill-index from + page_result */ + page_unspill(mc->txn, lp.page); + if (unlikely(lp.err)) + return lp.err; + } else { + if (unlikely(!mc->txn->parent)) { + ERROR("Unexpected not frozen/modifiable/spilled but shadowed %s " + "page %" PRIaPGNO " mod-txnid %" PRIaTXN "," + " without parent transaction, current txn %" PRIaTXN + " front %" PRIaTXN, + "large/overflow", pgno, lp.page->txnid, mc->txn->txnid, + mc->txn->front_txnid); + return MDBX_PROBLEM; + } - while ((dp = env->me_dp_reserve) != NULL) { - MDBX_ASAN_UNPOISON_MEMORY_REGION(dp, env->me_psize); - VALGRIND_MAKE_MEM_DEFINED(&mp_next(dp), sizeof(MDBX_page *)); - env->me_dp_reserve = mp_next(dp); - void *const ptr = ptr_disp(dp, -(ptrdiff_t)sizeof(size_t)); - osal_free(ptr); - } - VALGRIND_DESTROY_MEMPOOL(env); - osal_free(env); + /* It is writable only in a parent txn */ + page_t *np = page_shadow_alloc(mc->txn, ovpages); + if (unlikely(!np)) + return MDBX_ENOMEM; - return rc; -} + memcpy(np, lp.page, PAGEHDRSZ); /* Copy header of page */ + err = page_dirty(mc->txn, lp.page = np, ovpages); + if (unlikely(err != MDBX_SUCCESS)) + return err; -/* Search for key within a page, using binary search. - * Returns the smallest entry larger or equal to the key. - * Updates the cursor index with the index of the found entry. - * If no entry larger or equal to the key is found, returns NULL. 
*/ -__hot static struct node_result node_search(MDBX_cursor *mc, - const MDBX_val *key) { - MDBX_page *mp = mc->mc_pg[mc->mc_top]; - const intptr_t nkeys = page_numkeys(mp); - DKBUF_DEBUG; +#if MDBX_ENABLE_PGOP_STAT + mc->txn->env->lck->pgops.clone.weak += ovpages; +#endif /* MDBX_ENABLE_PGOP_STAT */ + cASSERT(mc, dpl_check(mc->txn)); + } + } + node_set_ds(node, data->iov_len); + if (flags & MDBX_RESERVE) + data->iov_base = page_data(lp.page); + else + memcpy(page_data(lp.page), data->iov_base, data->iov_len); - DEBUG("searching %zu keys in %s %spage %" PRIaPGNO, nkeys, - IS_LEAF(mp) ? "leaf" : "branch", IS_SUBP(mp) ? "sub-" : "", - mp->mp_pgno); + if (AUDIT_ENABLED()) { + err = cursor_check(mc); + if (unlikely(err != MDBX_SUCCESS)) + return err; + } + return MDBX_SUCCESS; + } - struct node_result ret; - ret.exact = false; - STATIC_ASSERT(P_BRANCH == 1); - intptr_t low = mp->mp_flags & P_BRANCH; - intptr_t high = nkeys - 1; - if (unlikely(high < low)) { - mc->mc_ki[mc->mc_top] = 0; - ret.node = NULL; - return ret; - } + if ((err = page_retire(mc, lp.page)) != MDBX_SUCCESS) + return err; + } else { + old_data.iov_len = node_ds(node); + old_data.iov_base = node_data(node); + cASSERT(mc, ptr_disp(old_data.iov_base, old_data.iov_len) <= + ptr_disp(mc->pg[mc->top], env->ps)); - intptr_t i; - MDBX_cmp_func *cmp = mc->mc_dbx->md_cmp; - MDBX_val nodekey; - if (unlikely(IS_LEAF2(mp))) { - cASSERT(mc, mp->mp_leaf2_ksize == mc->mc_db->md_xsize); - nodekey.iov_len = mp->mp_leaf2_ksize; - do { - i = (low + high) >> 1; - nodekey.iov_base = page_leaf2key(mp, i, nodekey.iov_len); - cASSERT(mc, ptr_disp(mp, mc->mc_txn->mt_env->me_psize) >= - ptr_disp(nodekey.iov_base, nodekey.iov_len)); - int cr = cmp(key, &nodekey); - DEBUG("found leaf index %zu [%s], rc = %i", i, DKEY_DEBUG(&nodekey), cr); - if (cr > 0) - /* Found entry is less than the key. */ - /* Skip to get the smallest entry larger than key. 
*/ - low = ++i; - else if (cr < 0) - high = i - 1; - else { - ret.exact = true; - break; - } - } while (likely(low <= high)); + /* DB has dups? */ + if (mc->tree->flags & MDBX_DUPSORT) { + /* Prepare (sub-)page/sub-DB to accept the new item, if needed. + * fp: old sub-page or a header faking it. + * mp: new (sub-)page. + * xdata: node data with new sub-page or sub-DB. */ + size_t growth = 0; /* growth in page size.*/ + page_t *mp = fp = xdata.iov_base = env->page_auxbuf; + mp->pgno = mc->pg[mc->top]->pgno; - /* store the key index */ - mc->mc_ki[mc->mc_top] = (indx_t)i; - ret.node = (i < nkeys) - ? /* fake for LEAF2 */ (MDBX_node *)(intptr_t)-1 - : /* There is no entry larger or equal to the key. */ NULL; - return ret; - } + /* Was a single item before, must convert now */ + if (!(node_flags(node) & N_DUPDATA)) { + /* does data match? */ + if (flags & MDBX_APPENDDUP) { + const int cmp = mc->clc->v.cmp(data, &old_data); + cASSERT(mc, cmp != 0 || eq_fast(data, &old_data)); + if (unlikely(cmp <= 0)) + return MDBX_EKEYMISMATCH; + } else if (eq_fast(data, &old_data)) { + cASSERT(mc, mc->clc->v.cmp(data, &old_data) == 0); + if (flags & MDBX_NODUPDATA) + return MDBX_KEYEXIST; + /* data is match exactly byte-to-byte, nothing to update */ + rc = MDBX_SUCCESS; + if (unlikely(batch_dupfix_done)) + goto batch_dupfix_continue; + return rc; + } - if (IS_BRANCH(mp) && cmp == cmp_int_align2) - /* Branch pages have no data, so if using integer keys, - * alignment is guaranteed. Use faster cmp_int_align4(). 
*/ - cmp = cmp_int_align4; + /* Just overwrite the current item */ + if (flags & MDBX_CURRENT) { + cASSERT(mc, node_size(key, data) <= env->leaf_nodemax); + goto current; + } - MDBX_node *node; - do { - i = (low + high) >> 1; - node = page_node(mp, i); - nodekey.iov_len = node_ks(node); - nodekey.iov_base = node_key(node); - cASSERT(mc, ptr_disp(mp, mc->mc_txn->mt_env->me_psize) >= - ptr_disp(nodekey.iov_base, nodekey.iov_len)); - int cr = cmp(key, &nodekey); - if (IS_LEAF(mp)) - DEBUG("found leaf index %zu [%s], rc = %i", i, DKEY_DEBUG(&nodekey), cr); - else - DEBUG("found branch index %zu [%s -> %" PRIaPGNO "], rc = %i", i, - DKEY_DEBUG(&nodekey), node_pgno(node), cr); - if (cr > 0) - /* Found entry is less than the key. */ - /* Skip to get the smallest entry larger than key. */ - low = ++i; - else if (cr < 0) - high = i - 1; - else { - ret.exact = true; - break; - } - } while (likely(low <= high)); + /* Back up original data item */ + memcpy(old_singledup.iov_base = fp + 1, old_data.iov_base, + old_singledup.iov_len = old_data.iov_len); - /* store the key index */ - mc->mc_ki[mc->mc_top] = (indx_t)i; - ret.node = (i < nkeys) - ? page_node(mp, i) - : /* There is no entry larger or equal to the key. */ NULL; - return ret; -} + /* Make sub-page header for the dup items, with dummy body */ + fp->flags = P_LEAF | P_SUBP; + fp->lower = 0; + xdata.iov_len = PAGEHDRSZ + old_data.iov_len + data->iov_len; + if (mc->tree->flags & MDBX_DUPFIXED) { + fp->flags |= P_DUPFIX; + fp->dupfix_ksize = (uint16_t)data->iov_len; + /* Будем создавать DUPFIX-страницу, как минимум с двумя элементами. + * При коротких значениях и наличии свободного места можно сделать + * некоторое резервирование места, чтобы при последующих добавлениях + * не сразу расширять созданную под-страницу. + * Резервирование в целом сомнительно (см ниже), но может сработать + * в плюс (а если в минус то несущественный) при коротких ключах. 
*/ + xdata.iov_len += page_subleaf2_reserve( + env, page_room(mc->pg[mc->top]) + old_data.iov_len, + xdata.iov_len, data->iov_len); + cASSERT(mc, (xdata.iov_len & 1) == 0); + } else { + xdata.iov_len += 2 * (sizeof(indx_t) + NODESIZE) + + (old_data.iov_len & 1) + (data->iov_len & 1); + } + cASSERT(mc, (xdata.iov_len & 1) == 0); + fp->upper = (uint16_t)(xdata.iov_len - PAGEHDRSZ); + old_data.iov_len = xdata.iov_len; /* pretend olddata is fp */ + } else if (node_flags(node) & N_SUBDATA) { + /* Data is on sub-DB, just store it */ + flags |= N_DUPDATA | N_SUBDATA; + goto dupsort_put; + } else { + /* Data is on sub-page */ + fp = old_data.iov_base; + switch (flags) { + default: + growth = is_dupfix_leaf(fp) + ? fp->dupfix_ksize + : (node_size(data, nullptr) + sizeof(indx_t)); + if (page_room(fp) >= growth) { + /* На текущей под-странице есть место для добавления элемента. + * Оптимальнее продолжить использовать эту страницу, ибо + * добавление вложенного дерева увеличит WAF на одну страницу. */ + goto continue_subpage; + } + /* На текущей под-странице нет места для еще одного элемента. + * Можно либо увеличить эту под-страницу, либо вынести куст + * значений во вложенное дерево. + * + * Продолжать использовать текущую под-страницу возможно + * только пока и если размер после добавления элемента будет + * меньше leaf_nodemax. Соответственно, при превышении + * просто сразу переходим на вложенное дерево. */ + xdata.iov_len = old_data.iov_len + (growth += growth & 1); + if (xdata.iov_len > env->subpage_limit) + goto convert_to_subtree; -/* Pop a page off the top of the cursor's stack. 
*/ -static __inline void cursor_pop(MDBX_cursor *mc) { - if (likely(mc->mc_snum)) { - DEBUG("popped page %" PRIaPGNO " off db %d cursor %p", - mc->mc_pg[mc->mc_top]->mp_pgno, DDBI(mc), (void *)mc); - if (likely(--mc->mc_snum)) { - mc->mc_top--; - } else { - mc->mc_flags &= ~C_INITIALIZED; - } - } -} + /* Можно либо увеличить под-страницу, в том числе с некоторым + * запасом, либо перейти на вложенное поддерево. + * + * Резервирование места на под-странице представляется сомнительным: + * - Резервирование увеличит рыхлость страниц, в том числе + * вероятность разделения основной/гнездовой страницы; + * - Сложно предсказать полезный размер резервирования, + * особенно для не-MDBX_DUPFIXED; + * - Наличие резерва позволяет съекономить только на перемещении + * части элементов основной/гнездовой страницы при последующих + * добавлениях в нее элементов. Причем после первого изменения + * размера под-страницы, её тело будет примыкать + * к неиспользуемому месту на основной/гнездовой странице, + * поэтому последующие последовательные добавления потребуют + * только передвижения в entries[]. + * + * Соответственно, более важным/определяющим представляется + * своевременный переход к вложеному дереву, но тут достаточно + * сложный конфликт интересов: + * - При склонности к переходу к вложенным деревьям, суммарно + * в БД будет большее кол-во более рыхлых страниц. Это увеличит + * WAF, а также RAF при последовательных чтениях большой БД. + * Однако, при коротких ключах и большом кол-ве + * дубликатов/мультизначений, плотность ключей в листовых + * страницах основного дерева будет выше. Соответственно, будет + * пропорционально меньше branch-страниц. Поэтому будет выше + * вероятность оседания/не-вымывания страниц основного дерева из + * LRU-кэша, а также попадания в write-back кэш при записи. + * - Наоботот, при склонности к использованию под-страниц, будут + * наблюдаться обратные эффекты. 
Плюс некоторые накладные расходы + * на лишнее копирование данных под-страниц в сценариях + * нескольких обонвлений дубликатов одного куста в одной + * транзакции. + * + * Суммарно наиболее рациональным представляется такая тактика: + * - Вводим три порога subpage_limit, subpage_room_threshold + * и subpage_reserve_prereq, которые могут быть + * заданы/скорректированы пользователем в ‰ от leaf_nodemax; + * - Используем под-страницу пока её размер меньше subpage_limit + * и на основной/гнездовой странице не-менее + * subpage_room_threshold свободного места; + * - Резервируем место только для 1-3 коротких dupfix-элементов, + * расширяя размер под-страницы на размер кэш-линии ЦПУ, но + * только если на странице не менее subpage_reserve_prereq + * свободного места. + * - По-умолчанию устанавливаем: + * subpage_limit = leaf_nodemax (1000‰); + * subpage_room_threshold = 0; + * subpage_reserve_prereq = leaf_nodemax (1000‰). + */ + if (is_dupfix_leaf(fp)) + growth += page_subleaf2_reserve( + env, page_room(mc->pg[mc->top]) + old_data.iov_len, + xdata.iov_len, data->iov_len); + else { + /* TODO: Если добавить возможность для пользователя задавать + * min/max размеров ключей/данных, то здесь разумно реализовать + * тактику резервирования подобную dupfixed. */ + } + break; -/* Push a page onto the top of the cursor's stack. - * Set MDBX_TXN_ERROR on failure. 
*/ -static __inline int cursor_push(MDBX_cursor *mc, MDBX_page *mp) { - DEBUG("pushing page %" PRIaPGNO " on db %d cursor %p", mp->mp_pgno, DDBI(mc), - (void *)mc); + case MDBX_CURRENT | MDBX_NODUPDATA: + case MDBX_CURRENT: + continue_subpage: + fp->txnid = mc->txn->front_txnid; + fp->pgno = mp->pgno; + mc->subcur->cursor.pg[0] = fp; + flags |= N_DUPDATA; + goto dupsort_put; + } + xdata.iov_len = old_data.iov_len + growth; + cASSERT(mc, (xdata.iov_len & 1) == 0); + } - if (unlikely(mc->mc_snum >= CURSOR_STACK)) { - mc->mc_txn->mt_flags |= MDBX_TXN_ERROR; - return MDBX_CURSOR_FULL; - } + fp_flags = fp->flags; + if (xdata.iov_len > env->subpage_limit || + node_size_len(node_ks(node), xdata.iov_len) > env->leaf_nodemax || + (env->subpage_room_threshold && + page_room(mc->pg[mc->top]) + + node_size_len(node_ks(node), old_data.iov_len) < + env->subpage_room_threshold + + node_size_len(node_ks(node), xdata.iov_len))) { + /* Too big for a sub-page, convert to sub-DB */ + convert_to_subtree: + fp_flags &= ~P_SUBP; + nested_dupdb.dupfix_size = 0; + nested_dupdb.flags = flags_db2sub(mc->tree->flags); + if (mc->tree->flags & MDBX_DUPFIXED) { + fp_flags |= P_DUPFIX; + nested_dupdb.dupfix_size = fp->dupfix_ksize; + } + nested_dupdb.height = 1; + nested_dupdb.branch_pages = 0; + nested_dupdb.leaf_pages = 1; + nested_dupdb.large_pages = 0; + nested_dupdb.items = page_numkeys(fp); + xdata.iov_len = sizeof(nested_dupdb); + xdata.iov_base = &nested_dupdb; + const pgr_t par = gc_alloc_single(mc); + mp = par.page; + if (unlikely(par.err != MDBX_SUCCESS)) + return par.err; + mc->tree->leaf_pages += 1; + cASSERT(mc, env->ps > old_data.iov_len); + growth = env->ps - (unsigned)old_data.iov_len; + cASSERT(mc, (growth & 1) == 0); + flags |= N_DUPDATA | N_SUBDATA; + nested_dupdb.root = mp->pgno; + nested_dupdb.sequence = 0; + nested_dupdb.mod_txnid = mc->txn->txnid; + sub_root = mp; + } + if (mp != fp) { + mp->flags = fp_flags; + mp->txnid = mc->txn->front_txnid; + mp->dupfix_ksize = 
fp->dupfix_ksize; + mp->lower = fp->lower; + cASSERT(mc, fp->upper + growth < UINT16_MAX); + mp->upper = fp->upper + (indx_t)growth; + if (unlikely(fp_flags & P_DUPFIX)) { + memcpy(page_data(mp), page_data(fp), + page_numkeys(fp) * fp->dupfix_ksize); + cASSERT(mc, (((mp->dupfix_ksize & page_numkeys(mp)) ^ mp->upper) & + 1) == 0); + } else { + cASSERT(mc, (mp->upper & 1) == 0); + memcpy(ptr_disp(mp, mp->upper + PAGEHDRSZ), + ptr_disp(fp, fp->upper + PAGEHDRSZ), + old_data.iov_len - fp->upper - PAGEHDRSZ); + memcpy(mp->entries, fp->entries, + page_numkeys(fp) * sizeof(mp->entries[0])); + for (size_t i = 0; i < page_numkeys(fp); i++) { + cASSERT(mc, mp->entries[i] + growth <= UINT16_MAX); + mp->entries[i] += (indx_t)growth; + } + } + } - mc->mc_top = mc->mc_snum++; - mc->mc_pg[mc->mc_top] = mp; - mc->mc_ki[mc->mc_top] = 0; - return MDBX_SUCCESS; -} + if (!insert_key) + node_del(mc, 0); + ref_data = &xdata; + flags |= N_DUPDATA; + goto insert_node; + } -__hot static __always_inline int page_get_checker_lite(const uint16_t ILL, - const MDBX_page *page, - MDBX_txn *const txn, - const txnid_t front) { - if (unlikely(page->mp_flags & ILL)) { - if (ILL == P_ILL_BITS || (page->mp_flags & P_ILL_BITS)) - return bad_page(page, "invalid page's flags (%u)\n", page->mp_flags); - else if (ILL & P_OVERFLOW) { - assert((ILL & (P_BRANCH | P_LEAF | P_LEAF2)) == 0); - assert(page->mp_flags & (P_BRANCH | P_LEAF | P_LEAF2)); - return bad_page(page, "unexpected %s instead of %s (%u)\n", - "large/overflow", "branch/leaf/leaf2", page->mp_flags); - } else if (ILL & (P_BRANCH | P_LEAF | P_LEAF2)) { - assert((ILL & P_BRANCH) && (ILL & P_LEAF) && (ILL & P_LEAF2)); - assert(page->mp_flags & (P_BRANCH | P_LEAF | P_LEAF2)); - return bad_page(page, "unexpected %s instead of %s (%u)\n", - "branch/leaf/leaf2", "large/overflow", page->mp_flags); - } else { - assert(false); + /* MDBX passes N_SUBDATA in 'flags' to write a DB record */ + if (unlikely((node_flags(node) ^ flags) & N_SUBDATA)) + return 
MDBX_INCOMPATIBLE; + + current: + if (data->iov_len == old_data.iov_len) { + cASSERT(mc, EVEN_CEIL(key->iov_len) == EVEN_CEIL(node_ks(node))); + /* same size, just replace it. Note that we could + * also reuse this node if the new data is smaller, + * but instead we opt to shrink the node in that case. */ + if (flags & MDBX_RESERVE) + data->iov_base = old_data.iov_base; + else if (!(mc->flags & z_inner)) + memcpy(old_data.iov_base, data->iov_base, data->iov_len); + else { + cASSERT(mc, page_numkeys(mc->pg[mc->top]) == 1); + cASSERT(mc, page_type_compat(mc->pg[mc->top]) == P_LEAF); + cASSERT(mc, node_ds(node) == 0); + cASSERT(mc, node_flags(node) == 0); + cASSERT(mc, key->iov_len < UINT16_MAX); + node_set_ks(node, key->iov_len); + memcpy(node_key(node), key->iov_base, key->iov_len); + cASSERT(mc, ptr_disp(node_key(node), node_ds(node)) < + ptr_disp(mc->pg[mc->top], env->ps)); + goto fix_parent; + } + + if (AUDIT_ENABLED()) { + err = cursor_check(mc); + if (unlikely(err != MDBX_SUCCESS)) + return err; + } + return MDBX_SUCCESS; + } } + node_del(mc, 0); } - if (unlikely(page->mp_txnid > front) && - unlikely(page->mp_txnid > txn->mt_front || front < txn->mt_txnid)) - return bad_page( - page, - "invalid page' txnid (%" PRIaTXN ") for %s' txnid (%" PRIaTXN ")\n", - page->mp_txnid, - (front == txn->mt_front && front != txn->mt_txnid) ? "front-txn" - : "parent-page", - front); - - if (((ILL & P_OVERFLOW) || !IS_OVERFLOW(page)) && - (ILL & (P_BRANCH | P_LEAF | P_LEAF2)) == 0) { - /* Контроль четности page->mp_upper тут либо приводит к ложным ошибкам, - * либо слишком дорог по количеству операций. Заковырка в том, что mp_upper - * может быть нечетным на LEAF2-страницах, при нечетном количестве элементов - * нечетной длины. Поэтому четность page->mp_upper здесь не проверяется, но - * соответствующие полные проверки есть в page_check(). 
*/ - if (unlikely(page->mp_upper < page->mp_lower || (page->mp_lower & 1) || - PAGEHDRSZ + page->mp_upper > txn->mt_env->me_psize)) - return bad_page(page, - "invalid page' lower(%u)/upper(%u) with limit %zu\n", - page->mp_lower, page->mp_upper, page_space(txn->mt_env)); + ref_data = data; - } else if ((ILL & P_OVERFLOW) == 0) { - const pgno_t npages = page->mp_pages; - if (unlikely(npages < 1) || unlikely(npages >= MAX_PAGENO / 2)) - return bad_page(page, "invalid n-pages (%u) for large-page\n", npages); - if (unlikely(page->mp_pgno + npages > txn->mt_next_pgno)) - return bad_page( - page, - "end of large-page beyond (%u) allocated space (%u next-pgno)\n", - page->mp_pgno + npages, txn->mt_next_pgno); +insert_node:; + const unsigned naf = flags & NODE_ADD_FLAGS; + size_t nsize = is_dupfix_leaf(mc->pg[mc->top]) + ? key->iov_len + : leaf_size(env, key, ref_data); + if (page_room(mc->pg[mc->top]) < nsize) { + rc = page_split(mc, key, ref_data, P_INVALID, + insert_key ? naf : naf | MDBX_SPLIT_REPLACE); + if (rc == MDBX_SUCCESS && AUDIT_ENABLED()) + rc = insert_key ? cursor_check(mc) : cursor_check_updating(mc); } else { - assert(false); + /* There is room already in this leaf page. */ + if (is_dupfix_leaf(mc->pg[mc->top])) { + cASSERT(mc, !(naf & (N_BIGDATA | N_SUBDATA | N_DUPDATA)) && + ref_data->iov_len == 0); + rc = node_add_dupfix(mc, mc->ki[mc->top], key); + } else + rc = node_add_leaf(mc, mc->ki[mc->top], key, ref_data, naf); + if (likely(rc == 0)) { + /* Adjust other cursors pointing to mp */ + page_t *const mp = mc->pg[mc->top]; + const size_t dbi = cursor_dbi(mc); + for (MDBX_cursor *m2 = mc->txn->cursors[dbi]; m2; m2 = m2->next) { + MDBX_cursor *m3 = (mc->flags & z_inner) ? 
&m2->subcur->cursor : m2; + if (!is_related(mc, m3) || m3->pg[mc->top] != mp) + continue; + if (m3->ki[mc->top] >= mc->ki[mc->top]) + m3->ki[mc->top] += insert_key; + if (inner_pointed(m3)) + cursor_inner_refresh(m3, mp, m3->ki[mc->top]); + } + } } - return MDBX_SUCCESS; -} -__cold static __noinline pgr_t -page_get_checker_full(const uint16_t ILL, MDBX_page *page, - const MDBX_cursor *const mc, const txnid_t front) { - pgr_t r = {page, page_get_checker_lite(ILL, page, mc->mc_txn, front)}; - if (likely(r.err == MDBX_SUCCESS)) - r.err = page_check(mc, page); - if (unlikely(r.err != MDBX_SUCCESS)) - mc->mc_txn->mt_flags |= MDBX_TXN_ERROR; - return r; -} + if (likely(rc == MDBX_SUCCESS)) { + /* Now store the actual data in the child DB. Note that we're + * storing the user data in the keys field, so there are strict + * size limits on dupdata. The actual data fields of the child + * DB are all zero size. */ + if (flags & N_DUPDATA) { + MDBX_val empty; + dupsort_put: + empty.iov_len = 0; + empty.iov_base = nullptr; + node_t *node = page_node(mc->pg[mc->top], mc->ki[mc->top]); +#define SHIFT_MDBX_NODUPDATA_TO_MDBX_NOOVERWRITE 1 + STATIC_ASSERT( + (MDBX_NODUPDATA >> SHIFT_MDBX_NODUPDATA_TO_MDBX_NOOVERWRITE) == + MDBX_NOOVERWRITE); + unsigned inner_flags = + MDBX_CURRENT | ((flags & MDBX_NODUPDATA) >> + SHIFT_MDBX_NODUPDATA_TO_MDBX_NOOVERWRITE); + if ((flags & MDBX_CURRENT) == 0) { + inner_flags -= MDBX_CURRENT; + rc = cursor_dupsort_setup(mc, node, mc->pg[mc->top]); + if (unlikely(rc != MDBX_SUCCESS)) + goto dupsort_error; + } + subcur_t *const mx = mc->subcur; + if (sub_root) { + cASSERT(mc, mx->nested_tree.height == 1 && + mx->nested_tree.root == sub_root->pgno); + mx->cursor.flags = z_inner; + mx->cursor.top = 0; + mx->cursor.pg[0] = sub_root; + mx->cursor.ki[0] = 0; + } + if (old_singledup.iov_base) { + /* converted, write the original data first */ + if (is_dupfix_leaf(mx->cursor.pg[0])) + rc = node_add_dupfix(&mx->cursor, 0, &old_singledup); + else + rc = 
node_add_leaf(&mx->cursor, 0, &old_singledup, &empty, 0); + if (unlikely(rc != MDBX_SUCCESS)) + goto dupsort_error; + mx->cursor.tree->items = 1; + } + if (!(node_flags(node) & N_SUBDATA) || sub_root) { + page_t *const mp = mc->pg[mc->top]; + const intptr_t nkeys = page_numkeys(mp); + const size_t dbi = cursor_dbi(mc); -__hot static __always_inline pgr_t page_get_inline(const uint16_t ILL, - const MDBX_cursor *const mc, - const pgno_t pgno, - const txnid_t front) { - MDBX_txn *const txn = mc->mc_txn; - tASSERT(txn, front <= txn->mt_front); + for (MDBX_cursor *m2 = mc->txn->cursors[dbi]; m2; m2 = m2->next) { + if (!is_related(mc, m2) || m2->pg[mc->top] != mp) + continue; + if (/* пропускаем незаполненные курсоры, иначе получится что у такого + курсора будет инициализирован вложенный, + что антилогично и бесполезно. */ + is_filled(m2) && m2->ki[mc->top] == mc->ki[mc->top]) { + cASSERT(m2, m2->subcur->cursor.clc == mx->cursor.clc); + m2->subcur->nested_tree = mx->nested_tree; + m2->subcur->cursor.pg[0] = mx->cursor.pg[0]; + if (old_singledup.iov_base) { + m2->subcur->cursor.top_and_flags = z_inner; + m2->subcur->cursor.ki[0] = 0; + } + DEBUG("Sub-dbi -%zu root page %" PRIaPGNO, + cursor_dbi(&m2->subcur->cursor), + m2->subcur->nested_tree.root); + } else if (!insert_key && m2->ki[mc->top] < nkeys) + cursor_inner_refresh(m2, mp, m2->ki[mc->top]); + } + } + cASSERT(mc, mc->subcur->nested_tree.items < PTRDIFF_MAX); + const size_t probe = (size_t)mc->subcur->nested_tree.items; +#define SHIFT_MDBX_APPENDDUP_TO_MDBX_APPEND 1 + STATIC_ASSERT((MDBX_APPENDDUP >> SHIFT_MDBX_APPENDDUP_TO_MDBX_APPEND) == + MDBX_APPEND); + inner_flags |= + (flags & MDBX_APPENDDUP) >> SHIFT_MDBX_APPENDDUP_TO_MDBX_APPEND; + rc = cursor_put(&mc->subcur->cursor, data, &empty, inner_flags); + if (flags & N_SUBDATA) { + void *db = node_data(node); + mc->subcur->nested_tree.mod_txnid = mc->txn->txnid; + memcpy(db, &mc->subcur->nested_tree, sizeof(tree_t)); + } + insert_data = (probe != 
(size_t)mc->subcur->nested_tree.items); + } + /* Increment count unless we just replaced an existing item. */ + if (insert_data) + mc->tree->items++; + if (insert_key) { + if (unlikely(rc != MDBX_SUCCESS)) + goto dupsort_error; + /* If we succeeded and the key didn't exist before, + * make sure the cursor is marked valid. */ + be_filled(mc); + } + if (likely(rc == MDBX_SUCCESS)) { + cASSERT(mc, is_filled(mc)); + if (unlikely(batch_dupfix_done)) { + batch_dupfix_continue: + /* let caller know how many succeeded, if any */ + if ((*batch_dupfix_done += 1) < batch_dupfix_given) { + data[0].iov_base = ptr_disp(data[0].iov_base, data[0].iov_len); + insert_key = insert_data = false; + old_singledup.iov_base = nullptr; + goto more; + } + } + if (AUDIT_ENABLED()) + rc = cursor_check(mc); + } + return rc; - pgr_t r; - if (unlikely(pgno >= txn->mt_next_pgno)) { - ERROR("page #%" PRIaPGNO " beyond next-pgno", pgno); - r.page = nullptr; - r.err = MDBX_PAGE_NOTFOUND; - bailout: - txn->mt_flags |= MDBX_TXN_ERROR; - return r; + dupsort_error: + if (unlikely(rc == MDBX_KEYEXIST)) { + /* should not happen, we deleted that item */ + ERROR("Unexpected %i error while put to nested dupsort's hive", rc); + rc = MDBX_PROBLEM; + } } + mc->txn->flags |= MDBX_TXN_ERROR; + return rc; +} - eASSERT(txn->mt_env, - ((txn->mt_flags ^ txn->mt_env->me_flags) & MDBX_WRITEMAP) == 0); - r.page = pgno2page(txn->mt_env, pgno); - if ((txn->mt_flags & (MDBX_TXN_RDONLY | MDBX_WRITEMAP)) == 0) { - const MDBX_txn *spiller = txn; - do { - /* Spilled pages were dirtied in this txn and flushed - * because the dirty list got full. Bring this page - * back in from the map (but don't unspill it here, - * leave that unless page_touch happens again). 
*/ - if (unlikely(spiller->mt_flags & MDBX_TXN_SPILLS) && - search_spilled(spiller, pgno)) - break; +__hot int cursor_put_checklen(MDBX_cursor *mc, const MDBX_val *key, + MDBX_val *data, unsigned flags) { + cASSERT(mc, (mc->flags & z_inner) == 0); + if (unlikely(key->iov_len > mc->clc->k.lmax || + key->iov_len < mc->clc->k.lmin)) { + cASSERT(mc, !"Invalid key-size"); + return MDBX_BAD_VALSIZE; + } + if (unlikely(data->iov_len > mc->clc->v.lmax || + data->iov_len < mc->clc->v.lmin)) { + cASSERT(mc, !"Invalid data-size"); + return MDBX_BAD_VALSIZE; + } - const size_t i = dpl_search(spiller, pgno); - tASSERT(txn, (intptr_t)i > 0); - if (spiller->tw.dirtylist->items[i].pgno == pgno) { - r.page = spiller->tw.dirtylist->items[i].ptr; - break; + uint64_t aligned_keybytes, aligned_databytes; + MDBX_val aligned_key, aligned_data; + if (mc->tree->flags & MDBX_INTEGERKEY) { + if (key->iov_len == 8) { + if (unlikely(7 & (uintptr_t)key->iov_base)) { + /* copy instead of return error to avoid break compatibility */ + aligned_key.iov_base = bcopy_8(&aligned_keybytes, key->iov_base); + aligned_key.iov_len = key->iov_len; + key = &aligned_key; } - - spiller = spiller->mt_parent; - } while (spiller); + } else if (key->iov_len == 4) { + if (unlikely(3 & (uintptr_t)key->iov_base)) { + /* copy instead of return error to avoid break compatibility */ + aligned_key.iov_base = bcopy_4(&aligned_keybytes, key->iov_base); + aligned_key.iov_len = key->iov_len; + key = &aligned_key; + } + } else { + cASSERT(mc, !"key-size is invalid for MDBX_INTEGERKEY"); + return MDBX_BAD_VALSIZE; + } } - - if (unlikely(r.page->mp_pgno != pgno)) { - r.err = bad_page( - r.page, "pgno mismatch (%" PRIaPGNO ") != expected (%" PRIaPGNO ")\n", - r.page->mp_pgno, pgno); - goto bailout; + if (mc->tree->flags & MDBX_INTEGERDUP) { + if (data->iov_len == 8) { + if (unlikely(7 & (uintptr_t)data->iov_base)) { + if (unlikely(flags & MDBX_MULTIPLE)) + return MDBX_BAD_VALSIZE; + /* copy instead of return error to avoid break 
compatibility */ + aligned_data.iov_base = bcopy_8(&aligned_databytes, data->iov_base); + aligned_data.iov_len = data->iov_len; + data = &aligned_data; + } + } else if (data->iov_len == 4) { + if (unlikely(3 & (uintptr_t)data->iov_base)) { + if (unlikely(flags & MDBX_MULTIPLE)) + return MDBX_BAD_VALSIZE; + /* copy instead of return error to avoid break compatibility */ + aligned_data.iov_base = bcopy_4(&aligned_databytes, data->iov_base); + aligned_data.iov_len = data->iov_len; + data = &aligned_data; + } + } else { + cASSERT(mc, !"data-size is invalid for MDBX_INTEGERKEY"); + return MDBX_BAD_VALSIZE; + } } - - if (unlikely(mc->mc_checking & CC_PAGECHECK)) - return page_get_checker_full(ILL, r.page, mc, front); - -#if MDBX_DISABLE_VALIDATION - r.err = MDBX_SUCCESS; -#else - r.err = page_get_checker_lite(ILL, r.page, txn, front); - if (unlikely(r.err != MDBX_SUCCESS)) - goto bailout; -#endif /* MDBX_DISABLE_VALIDATION */ - return r; + return cursor_put(mc, key, data, flags); } -/* Finish mdbx_page_search() / mdbx_page_search_lowest(). - * The cursor is at the root page, set up the rest of it. */ -__hot __noinline static int page_search_root(MDBX_cursor *mc, - const MDBX_val *key, int flags) { - MDBX_page *mp = mc->mc_pg[mc->mc_top]; - int rc; - DKBUF_DEBUG; +__hot int cursor_del(MDBX_cursor *mc, unsigned flags) { + if (unlikely(!is_filled(mc))) + return MDBX_ENODATA; - while (IS_BRANCH(mp)) { - MDBX_node *node; - intptr_t i; + int rc = cursor_touch(mc, nullptr, nullptr); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; - DEBUG("branch page %" PRIaPGNO " has %zu keys", mp->mp_pgno, - page_numkeys(mp)); - /* Don't assert on branch pages in the GC. We can get here - * while in the process of rebalancing a GC branch page; we must - * let that proceed. 
ITS#8336 */ - cASSERT(mc, !mc->mc_dbi || page_numkeys(mp) > 1); - DEBUG("found index 0 to page %" PRIaPGNO, node_pgno(page_node(mp, 0))); + page_t *mp = mc->pg[mc->top]; + cASSERT(mc, is_modifable(mc->txn, mp)); + if (!MDBX_DISABLE_VALIDATION && unlikely(!check_leaf_type(mc, mp))) { + ERROR("unexpected leaf-page #%" PRIaPGNO " type 0x%x seen by cursor", + mp->pgno, mp->flags); + return MDBX_CORRUPTED; + } + if (is_dupfix_leaf(mp)) + goto del_key; - if (flags & (MDBX_PS_FIRST | MDBX_PS_LAST)) { - i = 0; - if (flags & MDBX_PS_LAST) { - i = page_numkeys(mp) - 1; - /* if already init'd, see if we're already in right place */ - if (mc->mc_flags & C_INITIALIZED) { - if (mc->mc_ki[mc->mc_top] == i) { - mc->mc_top = mc->mc_snum++; - mp = mc->mc_pg[mc->mc_top]; - goto ready; + node_t *node = page_node(mp, mc->ki[mc->top]); + if (node_flags(node) & N_DUPDATA) { + if (flags & (MDBX_ALLDUPS | /* for compatibility */ MDBX_NODUPDATA)) { + /* will subtract the final entry later */ + mc->tree->items -= mc->subcur->nested_tree.items - 1; + } else { + if (!(node_flags(node) & N_SUBDATA)) { + page_t *sp = node_data(node); + cASSERT(mc, is_subpage(sp)); + sp->txnid = mp->txnid; + mc->subcur->cursor.pg[0] = sp; + } + rc = cursor_del(&mc->subcur->cursor, 0); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; + /* If sub-DB still has entries, we're done */ + if (mc->subcur->nested_tree.items) { + if (node_flags(node) & N_SUBDATA) { + /* update subDB info */ + mc->subcur->nested_tree.mod_txnid = mc->txn->txnid; + memcpy(node_data(node), &mc->subcur->nested_tree, sizeof(tree_t)); + } else { + /* shrink sub-page */ + node = node_shrink(mp, mc->ki[mc->top], node); + mc->subcur->cursor.pg[0] = node_data(node); + /* fix other sub-DB cursors pointed at sub-pages on this page */ + for (MDBX_cursor *m2 = mc->txn->cursors[cursor_dbi(mc)]; m2; + m2 = m2->next) { + if (!is_related(mc, m2) || m2->pg[mc->top] != mp) + continue; + const node_t *inner = node; + if (unlikely(m2->ki[mc->top] >= 
page_numkeys(mp))) { + be_hollow(m2); + continue; + } + if (m2->ki[mc->top] != mc->ki[mc->top]) { + inner = page_node(mp, m2->ki[mc->top]); + if (node_flags(inner) & N_SUBDATA) + continue; + } + m2->subcur->cursor.pg[0] = node_data(inner); } } + mc->tree->items -= 1; + cASSERT(mc, mc->tree->items > 0 && mc->tree->height > 0 && + mc->tree->root != P_INVALID); + return rc; } - } else { - const struct node_result nsr = node_search(mc, key); - if (likely(nsr.node)) - i = mc->mc_ki[mc->mc_top] + (intptr_t)nsr.exact - 1; - else - i = page_numkeys(mp) - 1; - DEBUG("following index %zu for key [%s]", i, DKEY_DEBUG(key)); + /* otherwise fall thru and delete the sub-DB */ } - cASSERT(mc, i >= 0 && i < (int)page_numkeys(mp)); - node = page_node(mp, i); - - rc = page_get(mc, node_pgno(node), &mp, mp->mp_txnid); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; - - mc->mc_ki[mc->mc_top] = (indx_t)i; - if (unlikely(rc = cursor_push(mc, mp))) - return rc; - - ready: - if (flags & MDBX_PS_MODIFY) { - rc = page_touch(mc); + if ((node_flags(node) & N_SUBDATA) && mc->subcur->cursor.tree->height) { + /* add all the child DB's pages to the free list */ + rc = tree_drop(&mc->subcur->cursor, false); if (unlikely(rc != MDBX_SUCCESS)) - return rc; - mp = mc->mc_pg[mc->mc_top]; + goto fail; } + inner_gone(mc); + } else { + cASSERT(mc, !inner_pointed(mc)); + /* MDBX passes N_SUBDATA in 'flags' to delete a DB record */ + if (unlikely((node_flags(node) ^ flags) & N_SUBDATA)) + return MDBX_INCOMPATIBLE; } - if (!MDBX_DISABLE_VALIDATION && unlikely(!CHECK_LEAF_TYPE(mc, mp))) { - ERROR("unexpected leaf-page #%" PRIaPGNO " type 0x%x seen by cursor", - mp->mp_pgno, mp->mp_flags); - return MDBX_CORRUPTED; - } - - DEBUG("found leaf page %" PRIaPGNO " for key [%s]", mp->mp_pgno, - DKEY_DEBUG(key)); - mc->mc_flags |= C_INITIALIZED; - mc->mc_flags &= ~C_EOF; - - return MDBX_SUCCESS; -} - -static int setup_sdb(MDBX_dbx *const dbx, const MDBX_db *const db, - const unsigned pagesize) { - if 
(unlikely(!db_check_flags(db->md_flags))) { - ERROR("incompatible or invalid db.md_flags (%u) ", db->md_flags); - return MDBX_INCOMPATIBLE; - } - if (unlikely(!dbx->md_cmp)) { - dbx->md_cmp = get_default_keycmp(db->md_flags); - dbx->md_dcmp = get_default_datacmp(db->md_flags); + /* add large/overflow pages to free list */ + if (node_flags(node) & N_BIGDATA) { + pgr_t lp = page_get_large(mc, node_largedata_pgno(node), mp->txnid); + if (unlikely((rc = lp.err) || (rc = page_retire(mc, lp.page)))) + goto fail; } - dbx->md_klen_min = keysize_min(db->md_flags); - dbx->md_klen_max = keysize_max(pagesize, db->md_flags); - assert(dbx->md_klen_max != (unsigned)-1); - - dbx->md_vlen_min = valsize_min(db->md_flags); - dbx->md_vlen_max = valsize_max(pagesize, db->md_flags); - assert(dbx->md_vlen_max != (size_t)-1); +del_key: + mc->tree->items -= 1; + const MDBX_dbi dbi = cursor_dbi(mc); + indx_t ki = mc->ki[mc->top]; + mp = mc->pg[mc->top]; + cASSERT(mc, is_leaf(mp)); + node_del(mc, mc->tree->dupfix_size); - if ((db->md_flags & (MDBX_DUPFIXED | MDBX_INTEGERDUP)) != 0 && db->md_xsize) { - if (!MDBX_DISABLE_VALIDATION && unlikely(db->md_xsize < dbx->md_vlen_min || - db->md_xsize > dbx->md_vlen_max)) { - ERROR("db.md_xsize (%u) <> min/max value-length (%zu/%zu)", db->md_xsize, - dbx->md_vlen_min, dbx->md_vlen_max); - return MDBX_CORRUPTED; + /* Adjust other cursors pointing to mp */ + for (MDBX_cursor *m2 = mc->txn->cursors[dbi]; m2; m2 = m2->next) { + MDBX_cursor *m3 = (mc->flags & z_inner) ? 
&m2->subcur->cursor : m2; + if (!is_related(mc, m3) || m3->pg[mc->top] != mp) + continue; + if (m3->ki[mc->top] == ki) { + m3->flags |= z_after_delete; + inner_gone(m3); + } else { + m3->ki[mc->top] -= m3->ki[mc->top] > ki; + if (inner_pointed(m3)) + cursor_inner_refresh(m3, m3->pg[mc->top], m3->ki[mc->top]); } - dbx->md_vlen_min = dbx->md_vlen_max = db->md_xsize; } - return MDBX_SUCCESS; -} -static int fetch_sdb(MDBX_txn *txn, size_t dbi) { - MDBX_cursor_couple couple; - int rc = cursor_init(&couple.outer, txn, MAIN_DBI); + rc = tree_rebalance(mc); if (unlikely(rc != MDBX_SUCCESS)) - return rc; - - MDBX_dbx *const dbx = &txn->mt_env->me_dbxs[dbi]; - rc = page_search(&couple.outer, &dbx->md_name, 0); - if (unlikely(rc != MDBX_SUCCESS)) { - bailout: - NOTICE("dbi %zu refs to inaccessible subDB `%*s` for txn %" PRIaTXN - " (err %d)", - dbi, (int)dbx->md_name.iov_len, (const char *)dbx->md_name.iov_base, - txn->mt_txnid, rc); - return (rc == MDBX_NOTFOUND) ? MDBX_BAD_DBI : rc; - } + goto fail; - MDBX_val data; - struct node_result nsr = node_search(&couple.outer, &dbx->md_name); - if (unlikely(!nsr.exact)) { - rc = MDBX_NOTFOUND; - goto bailout; - } - if (unlikely((node_flags(nsr.node) & (F_DUPDATA | F_SUBDATA)) != F_SUBDATA)) { - NOTICE("dbi %zu refs to not a named subDB `%*s` for txn %" PRIaTXN " (%s)", - dbi, (int)dbx->md_name.iov_len, (const char *)dbx->md_name.iov_base, - txn->mt_txnid, "wrong flags"); - return MDBX_INCOMPATIBLE; /* not a named DB */ + mc->flags |= z_after_delete; + inner_gone(mc); + if (unlikely(mc->top < 0)) { + /* DB is totally empty now, just bail out. + * Other cursors adjustments were already done + * by rebalance and aren't needed here. 
*/ + cASSERT(mc, mc->tree->items == 0 && + (mc->tree->root == P_INVALID || + (is_inner(mc) && !mc->tree->root)) && + mc->flags < 0); + return MDBX_SUCCESS; } - rc = node_read(&couple.outer, nsr.node, &data, - couple.outer.mc_pg[couple.outer.mc_top]); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; - - if (unlikely(data.iov_len != sizeof(MDBX_db))) { - NOTICE("dbi %zu refs to not a named subDB `%*s` for txn %" PRIaTXN " (%s)", - dbi, (int)dbx->md_name.iov_len, (const char *)dbx->md_name.iov_base, - txn->mt_txnid, "wrong rec-size"); - return MDBX_INCOMPATIBLE; /* not a named DB */ - } + ki = mc->ki[mc->top]; + mp = mc->pg[mc->top]; + cASSERT(mc, is_leaf(mc->pg[mc->top])); + size_t nkeys = page_numkeys(mp); + cASSERT(mc, + (mc->tree->items > 0 && nkeys > 0) || + ((mc->flags & z_inner) && mc->tree->items == 0 && nkeys == 0)); - uint16_t md_flags = UNALIGNED_PEEK_16(data.iov_base, MDBX_db, md_flags); - /* The txn may not know this DBI, or another process may - * have dropped and recreated the DB with other flags. */ - MDBX_db *const db = &txn->mt_dbs[dbi]; - if (unlikely((db->md_flags & DB_PERSISTENT_FLAGS) != md_flags)) { - NOTICE("dbi %zu refs to the re-created subDB `%*s` for txn %" PRIaTXN - " with different flags (present 0x%X != wanna 0x%X)", - dbi, (int)dbx->md_name.iov_len, (const char *)dbx->md_name.iov_base, - txn->mt_txnid, db->md_flags & DB_PERSISTENT_FLAGS, md_flags); - return MDBX_INCOMPATIBLE; + /* Adjust this and other cursors pointing to mp */ + const intptr_t top = /* может быть сброшен в -1 */ mc->top; + for (MDBX_cursor *m2 = mc->txn->cursors[dbi]; m2; m2 = m2->next) { + MDBX_cursor *m3 = (mc->flags & z_inner) ? 
&m2->subcur->cursor : m2; + if (top > m3->top || m3->pg[top] != mp) + continue; + /* if m3 points past last node in page, find next sibling */ + if (m3->ki[top] >= nkeys) { + rc = cursor_sibling_right(m3); + if (rc == MDBX_NOTFOUND) { + rc = MDBX_SUCCESS; + continue; + } + if (unlikely(rc != MDBX_SUCCESS)) + goto fail; + } + if (/* пропускаем незаполненные курсоры, иначе получится что у такого + курсора будет инициализирован вложенный, + что антилогично и бесполезно. */ + is_filled(m3) && m3->subcur && + (m3->ki[top] >= ki || + /* уже переместились вправо */ m3->pg[top] != mp)) { + node = page_node(m3->pg[m3->top], m3->ki[m3->top]); + /* Если это dupsort-узел, то должен быть валидный вложенный курсор. */ + if (node_flags(node) & N_DUPDATA) { + /* Тут три варианта событий: + * 1) Вложенный курсор уже инициализирован, у узла есть флаг N_SUBDATA, + * соответственно дубликаты вынесены в отдельное дерево с корнем + * в отдельной странице = ничего корректировать не требуется. + * 2) Вложенный курсор уже инициализирован, у узла нет флага N_SUBDATA, + * соответственно дубликаты размещены на вложенной sub-странице. + * 3) Курсор стоял на удалённом элементе, который имел одно значение, + * а после удаления переместился на следующий элемент с дубликатами. + * В этом случае вложенный курсор не инициализирован и тепеь его + * нужно установить на первый дубликат. 
*/ + if (is_pointed(&m3->subcur->cursor)) { + if ((node_flags(node) & N_SUBDATA) == 0) { + cASSERT(m3, m3->subcur->cursor.top == 0 && + m3->subcur->nested_tree.height == 1); + m3->subcur->cursor.pg[0] = node_data(node); + } + } else { + rc = cursor_dupsort_setup(m3, node, m3->pg[m3->top]); + if (unlikely(rc != MDBX_SUCCESS)) + goto fail; + if (node_flags(node) & N_SUBDATA) { + rc = inner_first(&m3->subcur->cursor, nullptr); + if (unlikely(rc != MDBX_SUCCESS)) + goto fail; + } + } + } else + inner_gone(m3); + } } - memcpy(db, data.iov_base, sizeof(MDBX_db)); -#if !MDBX_DISABLE_VALIDATION - const txnid_t pp_txnid = couple.outer.mc_pg[couple.outer.mc_top]->mp_txnid; - tASSERT(txn, txn->mt_front >= pp_txnid); - if (unlikely(db->md_mod_txnid > pp_txnid)) { - ERROR("db.md_mod_txnid (%" PRIaTXN ") > page-txnid (%" PRIaTXN ")", - db->md_mod_txnid, pp_txnid); - return MDBX_CORRUPTED; - } -#endif /* !MDBX_DISABLE_VALIDATION */ - rc = setup_sdb(dbx, db, txn->mt_env->me_psize); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; + cASSERT(mc, rc == MDBX_SUCCESS); + if (AUDIT_ENABLED()) + rc = cursor_check(mc); + return rc; - txn->mt_dbi_state[dbi] &= ~DBI_STALE; - return MDBX_SUCCESS; +fail: + mc->txn->flags |= MDBX_TXN_ERROR; + return rc; } -/* Search for the lowest key under the current branch page. - * This just bypasses a numkeys check in the current page - * before calling mdbx_page_search_root(), because the callers - * are all in situations where the current page is known to - * be underfilled. 
*/ -__hot static int page_search_lowest(MDBX_cursor *mc) { - MDBX_page *mp = mc->mc_pg[mc->mc_top]; - cASSERT(mc, IS_BRANCH(mp)); - MDBX_node *node = page_node(mp, 0); - - int rc = page_get(mc, node_pgno(node), &mp, mp->mp_txnid); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; - - mc->mc_ki[mc->mc_top] = 0; - if (unlikely(rc = cursor_push(mc, mp))) - return rc; - return page_search_root(mc, NULL, MDBX_PS_FIRST); -} +/*----------------------------------------------------------------------------*/ -/* Search for the page a given key should be in. - * Push it and its parent pages on the cursor stack. - * - * [in,out] mc the cursor for this operation. - * [in] key the key to search for, or NULL for first/last page. - * [in] flags If MDBX_PS_MODIFY is set, visited pages in the DB - * are touched (updated with new page numbers). - * If MDBX_PS_FIRST or MDBX_PS_LAST is set, - * find first or last leaf. - * This is used by mdbx_cursor_first() and mdbx_cursor_last(). - * If MDBX_PS_ROOTONLY set, just fetch root node, no further - * lookups. - * - * Returns 0 on success, non-zero on failure. */ -__hot static int page_search(MDBX_cursor *mc, const MDBX_val *key, int flags) { - int rc; - pgno_t root; +__hot csr_t cursor_seek(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, + MDBX_cursor_op op) { + DKBUF_DEBUG; - /* Make sure the txn is still viable, then find the root from - * the txn's db table and set it as the root of the cursor's stack. 
*/ - if (unlikely(mc->mc_txn->mt_flags & MDBX_TXN_BLOCKED)) { - DEBUG("%s", "transaction has failed, must abort"); - return MDBX_BAD_TXN; + csr_t ret; + ret.exact = false; + if (unlikely(key->iov_len < mc->clc->k.lmin || + key->iov_len > mc->clc->k.lmax)) { + cASSERT(mc, !"Invalid key-size"); + ret.err = MDBX_BAD_VALSIZE; + return ret; } - /* Make sure we're using an up-to-date root */ - if (unlikely(*mc->mc_dbi_state & DBI_STALE)) { - rc = fetch_sdb(mc->mc_txn, mc->mc_dbi); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; + MDBX_val aligned_key = *key; + uint64_t aligned_key_buf; + if (mc->tree->flags & MDBX_INTEGERKEY) { + if (aligned_key.iov_len == 8) { + if (unlikely(7 & (uintptr_t)aligned_key.iov_base)) + /* copy instead of return error to avoid break compatibility */ + aligned_key.iov_base = bcopy_8(&aligned_key_buf, aligned_key.iov_base); + } else if (aligned_key.iov_len == 4) { + if (unlikely(3 & (uintptr_t)aligned_key.iov_base)) + /* copy instead of return error to avoid break compatibility */ + aligned_key.iov_base = bcopy_4(&aligned_key_buf, aligned_key.iov_base); + } else { + cASSERT(mc, !"key-size is invalid for MDBX_INTEGERKEY"); + ret.err = MDBX_BAD_VALSIZE; + return ret; + } } - root = mc->mc_db->md_root; - if (unlikely(root == P_INVALID)) { /* Tree is empty. 
*/ - DEBUG("%s", "tree is empty"); - return MDBX_NOTFOUND; - } + page_t *mp; + node_t *node = nullptr; + /* See if we're already on the right page */ + if (is_pointed(mc)) { + mp = mc->pg[mc->top]; + cASSERT(mc, is_leaf(mp)); + const size_t nkeys = page_numkeys(mp); + if (unlikely(nkeys == 0)) { + /* при создании первой листовой страницы */ + cASSERT(mc, mc->top == 0 && mc->tree->height == 1 && + mc->tree->branch_pages == 0 && + mc->tree->leaf_pages == 1 && mc->ki[0] == 0); + /* Логически верно, но нет смысла, ибо это мимолетная/временная + * ситуация до добавления элемента выше по стеку вызовов: + mc->flags |= z_eof | z_hollow; */ + ret.err = MDBX_NOTFOUND; + return ret; + } - cASSERT(mc, root >= NUM_METAS); - if (!mc->mc_snum || !(mc->mc_flags & C_INITIALIZED) || - mc->mc_pg[0]->mp_pgno != root) { - txnid_t pp_txnid = mc->mc_db->md_mod_txnid; - pp_txnid = /* mc->mc_db->md_mod_txnid maybe zero in a legacy DB */ pp_txnid - ? pp_txnid - : mc->mc_txn->mt_txnid; - if ((mc->mc_txn->mt_flags & MDBX_TXN_RDONLY) == 0) { - MDBX_txn *scan = mc->mc_txn; - do - if ((scan->mt_flags & MDBX_TXN_DIRTY) && - (mc->mc_dbi == MAIN_DBI || - (scan->mt_dbi_state[mc->mc_dbi] & DBI_DIRTY))) { - /* После коммита вложенных тразакций может быть mod_txnid > front */ - pp_txnid = scan->mt_front; - break; - } - while (unlikely((scan = scan->mt_parent) != nullptr)); + mc->flags &= ~z_after_delete; + + MDBX_val nodekey; + if (is_dupfix_leaf(mp)) + nodekey = page_dupfix_key(mp, 0, mc->tree->dupfix_size); + else { + node = page_node(mp, 0); + nodekey = get_key(node); + inner_gone(mc); } - if (unlikely((rc = page_get(mc, root, &mc->mc_pg[0], pp_txnid)) != 0)) - return rc; - } + int cmp = mc->clc->k.cmp(&aligned_key, &nodekey); + if (unlikely(cmp == 0)) { + /* Probably happens rarely, but first node on the page + * was the one we wanted. 
*/ + mc->ki[mc->top] = 0; + ret.exact = true; + goto got_node; + } + + if (cmp > 0) { + /* Искомый ключ больше первого на этой странице, + * целевая позиция на этой странице либо правее (ближе к концу). */ + if (likely(nkeys > 1)) { + if (is_dupfix_leaf(mp)) { + nodekey.iov_base = page_dupfix_ptr(mp, nkeys - 1, nodekey.iov_len); + } else { + node = page_node(mp, nkeys - 1); + nodekey = get_key(node); + } + cmp = mc->clc->k.cmp(&aligned_key, &nodekey); + if (cmp == 0) { + /* last node was the one we wanted */ + mc->ki[mc->top] = (indx_t)(nkeys - 1); + ret.exact = true; + goto got_node; + } + if (cmp < 0) { + /* Искомый ключ между первым и последним на этой страницы, + * поэтому пропускаем поиск по дереву и продолжаем только на текущей + * странице. */ + /* Сравниваем с текущей позицией, ибо частным сценарием является такое + * совпадение, но не делаем проверку если текущая позиция является + * первой/последний и соответственно такое сравнение было выше. */ + if (mc->ki[mc->top] > 0 && mc->ki[mc->top] < nkeys - 1) { + if (is_dupfix_leaf(mp)) { + nodekey.iov_base = + page_dupfix_ptr(mp, mc->ki[mc->top], nodekey.iov_len); + } else { + node = page_node(mp, mc->ki[mc->top]); + nodekey = get_key(node); + } + cmp = mc->clc->k.cmp(&aligned_key, &nodekey); + if (cmp == 0) { + /* current node was the one we wanted */ + ret.exact = true; + goto got_node; + } + } + goto search_node; + } + } - mc->mc_snum = 1; - mc->mc_top = 0; + /* Если в стеке курсора есть страницы справа, то продолжим искать там. */ + cASSERT(mc, mc->tree->height > mc->top); + for (intptr_t i = 0; i < mc->top; i++) + if ((size_t)mc->ki[i] + 1 < page_numkeys(mc->pg[i])) + goto continue_other_pages; - DEBUG("db %d root page %" PRIaPGNO " has flags 0x%X", DDBI(mc), root, - mc->mc_pg[0]->mp_flags); + /* Ключ больше последнего. 
*/ + mc->ki[mc->top] = (indx_t)nkeys; + if (op < MDBX_SET_RANGE) { + target_not_found: + cASSERT(mc, op == MDBX_SET || op == MDBX_SET_KEY || + op == MDBX_GET_BOTH || op == MDBX_GET_BOTH_RANGE); + /* Операция предполагает поиск конкретного ключа, который не найден. + * Поэтому переводим курсор в неустановленное состояние, но без сброса + * top, что позволяет работать fastpath при последующем поиске по дереву + * страниц. */ + be_hollow(mc); + ret.err = MDBX_NOTFOUND; + return ret; + } + cASSERT(mc, op == MDBX_SET_RANGE); + mc->flags |= z_eof | z_hollow; + ret.err = MDBX_NOTFOUND; + return ret; + } - if (flags & MDBX_PS_MODIFY) { - if (unlikely(rc = page_touch(mc))) - return rc; + if (mc->top == 0) { + /* There are no other pages */ + mc->ki[mc->top] = 0; + if (op >= MDBX_SET_RANGE) + goto got_node; + else + goto target_not_found; + } } + cASSERT(mc, !inner_pointed(mc)); - if (flags & MDBX_PS_ROOTONLY) - return MDBX_SUCCESS; +continue_other_pages: + ret.err = tree_search(mc, &aligned_key, 0); + if (unlikely(ret.err != MDBX_SUCCESS)) + return ret; - return page_search_root(mc, key, flags); -} + cASSERT(mc, is_pointed(mc) && !inner_pointed(mc)); + mp = mc->pg[mc->top]; + MDBX_ANALYSIS_ASSUME(mp != nullptr); + cASSERT(mc, is_leaf(mp)); -/* Read large/overflow node data. 
*/ -static __noinline int node_read_bigdata(MDBX_cursor *mc, const MDBX_node *node, - MDBX_val *data, const MDBX_page *mp) { - cASSERT(mc, node_flags(node) == F_BIGDATA && data->iov_len == node_ds(node)); +search_node: + cASSERT(mc, is_pointed(mc) && !inner_pointed(mc)); + struct node_search_result nsr = node_search(mc, &aligned_key); + node = nsr.node; + ret.exact = nsr.exact; + if (!ret.exact) { + if (op < MDBX_SET_RANGE) + goto target_not_found; - pgr_t lp = page_get_large(mc, node_largedata_pgno(node), mp->mp_txnid); - if (unlikely((lp.err != MDBX_SUCCESS))) { - DEBUG("read large/overflow page %" PRIaPGNO " failed", - node_largedata_pgno(node)); - return lp.err; + if (node == nullptr) { + DEBUG("%s", "===> inexact leaf not found, goto sibling"); + ret.err = cursor_sibling_right(mc); + if (unlikely(ret.err != MDBX_SUCCESS)) + return ret; /* no entries matched */ + mp = mc->pg[mc->top]; + cASSERT(mc, is_leaf(mp)); + if (!is_dupfix_leaf(mp)) + node = page_node(mp, 0); + } } - cASSERT(mc, PAGETYPE_WHOLE(lp.page) == P_OVERFLOW); - data->iov_base = page_data(lp.page); - if (!MDBX_DISABLE_VALIDATION) { - const MDBX_env *env = mc->mc_txn->mt_env; - const size_t dsize = data->iov_len; - const unsigned npages = number_of_ovpages(env, dsize); - if (unlikely(lp.page->mp_pages < npages)) - return bad_page(lp.page, - "too less n-pages %u for bigdata-node (%zu bytes)", - lp.page->mp_pages, dsize); +got_node: + cASSERT(mc, is_pointed(mc) && !inner_pointed(mc)); + cASSERT(mc, mc->ki[mc->top] < page_numkeys(mc->pg[mc->top])); + if (!MDBX_DISABLE_VALIDATION && unlikely(!check_leaf_type(mc, mp))) { + ERROR("unexpected leaf-page #%" PRIaPGNO " type 0x%x seen by cursor", + mp->pgno, mp->flags); + ret.err = MDBX_CORRUPTED; + return ret; } - return MDBX_SUCCESS; -} - -/* Return the data associated with a given node. 
*/ -static __always_inline int node_read(MDBX_cursor *mc, const MDBX_node *node, - MDBX_val *data, const MDBX_page *mp) { - data->iov_len = node_ds(node); - data->iov_base = node_data(node); - if (likely(node_flags(node) != F_BIGDATA)) - return MDBX_SUCCESS; - return node_read_bigdata(mc, node, data, mp); -} - -int mdbx_get(const MDBX_txn *txn, MDBX_dbi dbi, const MDBX_val *key, - MDBX_val *data) { - DKBUF_DEBUG; - DEBUG("===> get db %u key [%s]", dbi, DKEY_DEBUG(key)); - int rc = check_txn(txn, MDBX_TXN_BLOCKED); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; + if (is_dupfix_leaf(mp)) { + if (op >= MDBX_SET_KEY) + *key = page_dupfix_key(mp, mc->ki[mc->top], mc->tree->dupfix_size); + be_filled(mc); + ret.err = MDBX_SUCCESS; + return ret; + } - if (unlikely(!key || !data)) - return MDBX_EINVAL; + if (node_flags(node) & N_DUPDATA) { + ret.err = cursor_dupsort_setup(mc, node, mp); + if (unlikely(ret.err != MDBX_SUCCESS)) + return ret; + if (op >= MDBX_SET) { + MDBX_ANALYSIS_ASSUME(mc->subcur != nullptr); + if (node_flags(node) & N_SUBDATA) { + ret.err = inner_first(&mc->subcur->cursor, data); + if (unlikely(ret.err != MDBX_SUCCESS)) + return ret; + } else if (data) { + const page_t *inner_mp = mc->subcur->cursor.pg[0]; + cASSERT(mc, is_subpage(inner_mp) && is_leaf(inner_mp)); + const size_t inner_ki = mc->subcur->cursor.ki[0]; + if (is_dupfix_leaf(inner_mp)) + *data = page_dupfix_key(inner_mp, inner_ki, mc->tree->dupfix_size); + else + *data = get_key(page_node(inner_mp, inner_ki)); + } + } else { + MDBX_ANALYSIS_ASSUME(mc->subcur != nullptr); + ret = cursor_seek(&mc->subcur->cursor, data, nullptr, MDBX_SET_RANGE); + if (unlikely(ret.err != MDBX_SUCCESS)) { + if (ret.err == MDBX_NOTFOUND && op < MDBX_SET_RANGE) + goto target_not_found; + return ret; + } + if (op == MDBX_GET_BOTH && !ret.exact) + goto target_not_found; + } + } else if (likely(data)) { + if (op <= MDBX_GET_BOTH_RANGE) { + if (unlikely(data->iov_len < mc->clc->v.lmin || + data->iov_len > 
mc->clc->v.lmax)) { + cASSERT(mc, !"Invalid data-size"); + ret.err = MDBX_BAD_VALSIZE; + return ret; + } + MDBX_val aligned_data = *data; + uint64_t aligned_databytes; + if (mc->tree->flags & MDBX_INTEGERDUP) { + if (aligned_data.iov_len == 8) { + if (unlikely(7 & (uintptr_t)aligned_data.iov_base)) + /* copy instead of return error to avoid break compatibility */ + aligned_data.iov_base = + bcopy_8(&aligned_databytes, aligned_data.iov_base); + } else if (aligned_data.iov_len == 4) { + if (unlikely(3 & (uintptr_t)aligned_data.iov_base)) + /* copy instead of return error to avoid break compatibility */ + aligned_data.iov_base = + bcopy_4(&aligned_databytes, aligned_data.iov_base); + } else { + cASSERT(mc, !"data-size is invalid for MDBX_INTEGERDUP"); + ret.err = MDBX_BAD_VALSIZE; + return ret; + } + } + MDBX_val actual_data; + ret.err = node_read(mc, node, &actual_data, mc->pg[mc->top]); + if (unlikely(ret.err != MDBX_SUCCESS)) + return ret; + const int cmp = mc->clc->v.cmp(&aligned_data, &actual_data); + if (cmp) { + if (op != MDBX_GET_BOTH_RANGE) { + cASSERT(mc, op == MDBX_GET_BOTH); + goto target_not_found; + } + if (cmp > 0) { + ret.err = MDBX_NOTFOUND; + return ret; + } + } + *data = actual_data; + } else { + ret.err = node_read(mc, node, data, mc->pg[mc->top]); + if (unlikely(ret.err != MDBX_SUCCESS)) + return ret; + } + } - MDBX_cursor_couple cx; - rc = cursor_init(&cx.outer, txn, dbi); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; + /* The key already matches in all other cases */ + if (op >= MDBX_SET_KEY) + get_key_optional(node, key); - return cursor_set(&cx.outer, (MDBX_val *)key, data, MDBX_SET).err; + DEBUG("==> cursor placed on key [%s], data [%s]", DKEY_DEBUG(key), + DVAL_DEBUG(data)); + ret.err = MDBX_SUCCESS; + be_filled(mc); + return ret; } -int mdbx_get_equal_or_great(const MDBX_txn *txn, MDBX_dbi dbi, MDBX_val *key, - MDBX_val *data) { - int rc = check_txn(txn, MDBX_TXN_BLOCKED); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; - - if 
(unlikely(!key || !data)) - return MDBX_EINVAL; - - if (unlikely(txn->mt_flags & MDBX_TXN_BLOCKED)) - return MDBX_BAD_TXN; +__hot int cursor_ops(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, + const MDBX_cursor_op op) { + if (op != MDBX_GET_CURRENT) + DEBUG(">> cursor %p(0x%x), ops %u, key %p, value %p", + __Wpedantic_format_voidptr(mc), mc->flags, op, + __Wpedantic_format_voidptr(key), __Wpedantic_format_voidptr(data)); + int rc; - MDBX_cursor_couple cx; - rc = cursor_init(&cx.outer, txn, dbi); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; + switch (op) { + case MDBX_GET_CURRENT: + cASSERT(mc, (mc->flags & z_inner) == 0); + if (unlikely(is_hollow(mc))) + return MDBX_ENODATA; + else if (mc->flags & z_after_delete) + return outer_next(mc, key, data, MDBX_NEXT_NODUP); + else if (inner_pointed(mc) && (mc->subcur->cursor.flags & z_after_delete)) + return outer_next(mc, key, data, MDBX_NEXT_DUP); + else { + const page_t *mp = mc->pg[mc->top]; + const node_t *node = page_node(mp, mc->ki[mc->top]); + get_key_optional(node, key); + if (!data) + return MDBX_SUCCESS; + if (node_flags(node) & N_DUPDATA) { + if (!MDBX_DISABLE_VALIDATION && unlikely(!mc->subcur)) + return unexpected_dupsort(mc); + mc = &mc->subcur->cursor; + if (unlikely(is_hollow(mc))) + return MDBX_ENODATA; + mp = mc->pg[mc->top]; + if (is_dupfix_leaf(mp)) + *data = page_dupfix_key(mp, mc->ki[mc->top], mc->tree->dupfix_size); + else + *data = get_key(page_node(mp, mc->ki[mc->top])); + return MDBX_SUCCESS; + } else { + cASSERT(mc, !inner_pointed(mc)); + return node_read(mc, node, data, mc->pg[mc->top]); + } + } - return cursor_get(&cx.outer, key, data, MDBX_SET_LOWERBOUND); -} - -int mdbx_get_ex(const MDBX_txn *txn, MDBX_dbi dbi, MDBX_val *key, - MDBX_val *data, size_t *values_count) { - DKBUF_DEBUG; - DEBUG("===> get db %u key [%s]", dbi, DKEY_DEBUG(key)); - - int rc = check_txn(txn, MDBX_TXN_BLOCKED); - if (unlikely(rc != MDBX_SUCCESS)) + case MDBX_GET_BOTH: + case MDBX_GET_BOTH_RANGE: + if 
(unlikely(data == nullptr)) + return MDBX_EINVAL; + if (unlikely(mc->subcur == nullptr)) + return MDBX_INCOMPATIBLE; + /* fall through */ + __fallthrough; + case MDBX_SET: + case MDBX_SET_KEY: + case MDBX_SET_RANGE: + if (unlikely(key == nullptr)) + return MDBX_EINVAL; + rc = cursor_seek(mc, key, data, op).err; + if (rc == MDBX_SUCCESS) + cASSERT(mc, is_filled(mc)); + else if (rc == MDBX_NOTFOUND && mc->tree->items) { + cASSERT(mc, is_pointed(mc)); + cASSERT(mc, op == MDBX_SET_RANGE || op == MDBX_GET_BOTH_RANGE || + !is_filled(mc)); + cASSERT(mc, op == MDBX_GET_BOTH_RANGE || !inner_filled(mc)); + if (op == MDBX_SET_RANGE) { + mc->ki[mc->top] = page_numkeys(mc->pg[mc->top]) - 1; + mc->flags &= ~z_hollow; + } + if (op == MDBX_GET_BOTH_RANGE && mc->subcur->cursor.top >= 0) { + MDBX_cursor *mx = &mc->subcur->cursor; + mx->ki[mx->top] = page_numkeys(mx->pg[mx->top]) - 1; + mx->flags &= ~z_hollow; + } + } else + cASSERT(mc, is_poor(mc) && !is_filled(mc)); return rc; - if (unlikely(!key || !data)) - return MDBX_EINVAL; + case MDBX_GET_MULTIPLE: + if (unlikely(!data)) + return MDBX_EINVAL; + if (unlikely((mc->tree->flags & MDBX_DUPFIXED) == 0)) + return MDBX_INCOMPATIBLE; + if (unlikely(!is_pointed(mc))) { + if (unlikely(!key)) + return MDBX_EINVAL; + rc = cursor_seek(mc, key, data, MDBX_SET).err; + if (unlikely(rc != MDBX_SUCCESS)) + return rc; + } + if (unlikely(is_eof(mc) || !inner_filled(mc))) + return MDBX_ENODATA; + goto fetch_multiple; - MDBX_cursor_couple cx; - rc = cursor_init(&cx.outer, txn, dbi); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; + case MDBX_NEXT_MULTIPLE: + if (unlikely(!data)) + return MDBX_EINVAL; + if (unlikely(mc->subcur == nullptr)) + return MDBX_INCOMPATIBLE; + rc = outer_next(mc, key, data, MDBX_NEXT_DUP); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; + else { + fetch_multiple: + cASSERT(mc, is_filled(mc) && !inner_filled(mc)); + MDBX_cursor *mx = &mc->subcur->cursor; + data->iov_len = page_numkeys(mx->pg[mx->top]) * 
mx->tree->dupfix_size; + data->iov_base = page_data(mx->pg[mx->top]); + mx->ki[mx->top] = (indx_t)page_numkeys(mx->pg[mx->top]) - 1; + return MDBX_SUCCESS; + } - rc = cursor_set(&cx.outer, key, data, MDBX_SET_KEY).err; - if (unlikely(rc != MDBX_SUCCESS)) { - if (rc == MDBX_NOTFOUND && values_count) - *values_count = 0; + case MDBX_PREV_MULTIPLE: + if (unlikely(!data)) + return MDBX_EINVAL; + if (unlikely(mc->subcur == nullptr)) + return MDBX_INCOMPATIBLE; + if (unlikely(!is_pointed(mc))) { + rc = outer_last(mc, key, data); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; + mc->subcur->cursor.ki[mc->subcur->cursor.top] = 0; + goto fetch_multiple; + } + if (unlikely(is_eof(mc) || !inner_filled(mc))) + return MDBX_ENODATA; + rc = cursor_sibling_left(&mc->subcur->cursor); + if (likely(rc == MDBX_SUCCESS)) + goto fetch_multiple; return rc; - } - if (values_count) { - *values_count = 1; - if (cx.outer.mc_xcursor != NULL) { - MDBX_node *node = page_node(cx.outer.mc_pg[cx.outer.mc_top], - cx.outer.mc_ki[cx.outer.mc_top]); - if (node_flags(node) & F_DUPDATA) { - // coverity[uninit_use : FALSE] - tASSERT(txn, cx.outer.mc_xcursor == &cx.inner && - (cx.inner.mx_cursor.mc_flags & C_INITIALIZED)); - // coverity[uninit_use : FALSE] - *values_count = - (sizeof(*values_count) >= sizeof(cx.inner.mx_db.md_entries) || - cx.inner.mx_db.md_entries <= PTRDIFF_MAX) - ? 
(size_t)cx.inner.mx_db.md_entries - : PTRDIFF_MAX; + case MDBX_NEXT_DUP: + case MDBX_NEXT: + case MDBX_NEXT_NODUP: + rc = outer_next(mc, key, data, op); + if (rc == MDBX_NOTFOUND) { + cASSERT(mc, is_pointed(mc) || mc->tree->items == 0); + if (is_pointed(mc)) { + if (unlikely(mc->ki[mc->top] >= page_numkeys(mc->pg[mc->top]))) + return MDBX_ENODATA; + if (inner_pointed(mc)) { + MDBX_cursor *const mx = &mc->subcur->cursor; + if (unlikely(mx->ki[mx->top] >= page_numkeys(mx->pg[mx->top]))) + return MDBX_ENODATA; + mc->subcur->cursor.flags &= ~z_hollow; + } + mc->flags &= ~z_hollow; } } - } - return MDBX_SUCCESS; -} - -/* Find a sibling for a page. - * Replaces the page at the top of the cursor's stack with the specified - * sibling, if one exists. - * - * [in] mc The cursor for this operation. - * [in] dir SIBLING_LEFT or SIBLING_RIGHT. - * - * Returns 0 on success, non-zero on failure. */ -static int cursor_sibling(MDBX_cursor *mc, int dir) { - int rc; - MDBX_node *node; - MDBX_page *mp; - assert(dir == SIBLING_LEFT || dir == SIBLING_RIGHT); + return rc; - if (unlikely(mc->mc_snum < 2)) - return MDBX_NOTFOUND; /* root has no siblings */ + case MDBX_PREV_DUP: + case MDBX_PREV: + case MDBX_PREV_NODUP: + return outer_prev(mc, key, data, op); - cursor_pop(mc); - DEBUG("parent page is page %" PRIaPGNO ", index %u", - mc->mc_pg[mc->mc_top]->mp_pgno, mc->mc_ki[mc->mc_top]); + case MDBX_FIRST: + return outer_first(mc, key, data); + case MDBX_LAST: + return outer_last(mc, key, data); - if ((dir == SIBLING_RIGHT) ? (mc->mc_ki[mc->mc_top] + (size_t)1 >= - page_numkeys(mc->mc_pg[mc->mc_top])) - : (mc->mc_ki[mc->mc_top] == 0)) { - DEBUG("no more keys aside, moving to next %s sibling", - dir ? 
"right" : "left"); - if (unlikely((rc = cursor_sibling(mc, dir)) != MDBX_SUCCESS)) { - /* undo cursor_pop before returning */ - mc->mc_top++; - mc->mc_snum++; - return rc; + case MDBX_LAST_DUP: + case MDBX_FIRST_DUP: + if (unlikely(data == nullptr)) + return MDBX_EINVAL; + if (unlikely(!is_filled(mc))) + return MDBX_ENODATA; + else { + node_t *node = page_node(mc->pg[mc->top], mc->ki[mc->top]); + get_key_optional(node, key); + if ((node_flags(node) & N_DUPDATA) == 0) + return node_read(mc, node, data, mc->pg[mc->top]); + else if (MDBX_DISABLE_VALIDATION || likely(mc->subcur)) + return ((op == MDBX_FIRST_DUP) ? inner_first + : inner_last)(&mc->subcur->cursor, data); + else + return unexpected_dupsort(mc); } - } else { - assert((dir - 1) == -1 || (dir - 1) == 1); - mc->mc_ki[mc->mc_top] += (indx_t)(dir - 1); - DEBUG("just moving to %s index key %u", - (dir == SIBLING_RIGHT) ? "right" : "left", mc->mc_ki[mc->mc_top]); - } - cASSERT(mc, IS_BRANCH(mc->mc_pg[mc->mc_top])); + break; - node = page_node(mp = mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); - rc = page_get(mc, node_pgno(node), &mp, mp->mp_txnid); - if (unlikely(rc != MDBX_SUCCESS)) { - /* mc will be inconsistent if caller does mc_snum++ as above */ - mc->mc_flags &= ~(C_INITIALIZED | C_EOF); + case MDBX_SET_UPPERBOUND: + case MDBX_SET_LOWERBOUND: + if (unlikely(key == nullptr || data == nullptr)) + return MDBX_EINVAL; + else { + MDBX_val save_data = *data; + csr_t csr = cursor_seek(mc, key, data, MDBX_SET_RANGE); + rc = csr.err; + if (rc == MDBX_SUCCESS && csr.exact && mc->subcur) { + csr.exact = false; + if (!save_data.iov_base) { + /* Avoiding search nested dupfix hive if no data provided. + * This is changes the semantic of MDBX_SET_LOWERBOUND but avoid + * returning MDBX_BAD_VALSIZE. 
*/ + } else if (is_pointed(&mc->subcur->cursor)) { + *data = save_data; + csr = cursor_seek(&mc->subcur->cursor, data, nullptr, MDBX_SET_RANGE); + rc = csr.err; + if (rc == MDBX_NOTFOUND) { + cASSERT(mc, !csr.exact); + rc = outer_next(mc, key, data, MDBX_NEXT_NODUP); + } + } else { + int cmp = mc->clc->v.cmp(&save_data, data); + csr.exact = (cmp == 0); + if (cmp > 0) + rc = outer_next(mc, key, data, MDBX_NEXT_NODUP); + } + } + if (rc == MDBX_SUCCESS && !csr.exact) + rc = MDBX_RESULT_TRUE; + else if (rc == MDBX_NOTFOUND) { + cASSERT(mc, F_ISSET(mc->flags, z_eof | z_hollow)); + cASSERT(mc, !inner_pointed(mc) || + F_ISSET(mc->subcur->cursor.flags, z_eof | z_hollow)); + } + if (unlikely(op == MDBX_SET_UPPERBOUND)) { + /* minor fixups for MDBX_SET_UPPERBOUND */ + if (rc == MDBX_RESULT_TRUE) + /* already at great-than by MDBX_SET_LOWERBOUND */ + rc = MDBX_SUCCESS; + else if (rc == MDBX_SUCCESS) + /* exactly match, going next */ + rc = outer_next(mc, key, data, MDBX_NEXT); + } + } return rc; - } - rc = cursor_push(mc, mp); - if (unlikely(rc != MDBX_SUCCESS)) + /* Doubtless API to positioning of the cursor at a specified key. */ + case MDBX_TO_KEY_LESSER_THAN: + case MDBX_TO_KEY_LESSER_OR_EQUAL: + case MDBX_TO_KEY_EQUAL: + case MDBX_TO_KEY_GREATER_OR_EQUAL: + case MDBX_TO_KEY_GREATER_THAN: + if (unlikely(key == nullptr)) + return MDBX_EINVAL; + else { + csr_t csr = cursor_seek(mc, key, data, MDBX_SET_RANGE); + rc = csr.err; + if (csr.exact) { + cASSERT(mc, csr.err == MDBX_SUCCESS); + if (op == MDBX_TO_KEY_LESSER_THAN) + rc = outer_prev(mc, key, data, MDBX_PREV_NODUP); + else if (op == MDBX_TO_KEY_GREATER_THAN) + rc = outer_next(mc, key, data, MDBX_NEXT_NODUP); + } else if (op < MDBX_TO_KEY_EQUAL && + (rc == MDBX_NOTFOUND || rc == MDBX_SUCCESS)) + rc = outer_prev(mc, key, data, MDBX_PREV_NODUP); + else if (op == MDBX_TO_KEY_EQUAL && rc == MDBX_SUCCESS) + rc = MDBX_NOTFOUND; + } return rc; - mc->mc_ki[mc->mc_top] = - (dir == SIBLING_LEFT) ? 
(indx_t)page_numkeys(mp) - 1 : 0; - return MDBX_SUCCESS; -} - -/* Move the cursor to the next data item. */ -static int cursor_next(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, - MDBX_cursor_op op) { - assert(op == MDBX_NEXT || op == MDBX_NEXT_DUP || op == MDBX_NEXT_NODUP); - int rc; - - if (unlikely(mc->mc_flags & C_DEL) && op == MDBX_NEXT_DUP) - return MDBX_NOTFOUND; - - if (unlikely(!(mc->mc_flags & C_INITIALIZED))) { - if (unlikely(mc->mc_flags & C_SUB)) - return MDBX_NOTFOUND; - return cursor_first(mc, key, data); - } - - const MDBX_page *mp = mc->mc_pg[mc->mc_top]; - if (unlikely(mc->mc_flags & C_EOF)) { - if (mc->mc_ki[mc->mc_top] + (size_t)1 >= page_numkeys(mp)) - return MDBX_NOTFOUND; - mc->mc_flags ^= C_EOF; - } - - if (mc->mc_xcursor) { - if (op != MDBX_NEXT_NODUP) { - const MDBX_node *node = page_node(mp, mc->mc_ki[mc->mc_top]); - if (node_flags(node) & F_DUPDATA) { - rc = cursor_next(&mc->mc_xcursor->mx_cursor, data, NULL, MDBX_NEXT); - if (likely(rc == MDBX_SUCCESS)) { - get_key_optional(node, key); - return MDBX_SUCCESS; + /* Doubtless API to positioning of the cursor at a specified key-value pair + * for multi-value hives. 
*/ + case MDBX_TO_EXACT_KEY_VALUE_LESSER_THAN: + case MDBX_TO_EXACT_KEY_VALUE_LESSER_OR_EQUAL: + case MDBX_TO_EXACT_KEY_VALUE_EQUAL: + case MDBX_TO_EXACT_KEY_VALUE_GREATER_OR_EQUAL: + case MDBX_TO_EXACT_KEY_VALUE_GREATER_THAN: + if (unlikely(key == nullptr || data == nullptr)) + return MDBX_EINVAL; + else { + MDBX_val save_data = *data; + csr_t csr = cursor_seek(mc, key, data, MDBX_SET_KEY); + rc = csr.err; + if (rc == MDBX_SUCCESS) { + cASSERT(mc, csr.exact); + if (inner_pointed(mc)) { + MDBX_cursor *const mx = &mc->subcur->cursor; + csr = cursor_seek(mx, &save_data, nullptr, MDBX_SET_RANGE); + rc = csr.err; + if (csr.exact) { + cASSERT(mc, csr.err == MDBX_SUCCESS); + if (op == MDBX_TO_EXACT_KEY_VALUE_LESSER_THAN) + rc = inner_prev(mx, data); + else if (op == MDBX_TO_EXACT_KEY_VALUE_GREATER_THAN) + rc = inner_next(mx, data); + } else if (op < MDBX_TO_EXACT_KEY_VALUE_EQUAL && + (rc == MDBX_NOTFOUND || rc == MDBX_SUCCESS)) + rc = inner_prev(mx, data); + else if (op == MDBX_TO_EXACT_KEY_VALUE_EQUAL && rc == MDBX_SUCCESS) + rc = MDBX_NOTFOUND; + } else { + int cmp = mc->clc->v.cmp(data, &save_data); + switch (op) { + default: + __unreachable(); + case MDBX_TO_EXACT_KEY_VALUE_LESSER_THAN: + rc = (cmp < 0) ? MDBX_SUCCESS : MDBX_NOTFOUND; + break; + case MDBX_TO_EXACT_KEY_VALUE_LESSER_OR_EQUAL: + rc = (cmp <= 0) ? MDBX_SUCCESS : MDBX_NOTFOUND; + break; + case MDBX_TO_EXACT_KEY_VALUE_EQUAL: + rc = (cmp == 0) ? MDBX_SUCCESS : MDBX_NOTFOUND; + break; + case MDBX_TO_EXACT_KEY_VALUE_GREATER_OR_EQUAL: + rc = (cmp >= 0) ? MDBX_SUCCESS : MDBX_NOTFOUND; + break; + case MDBX_TO_EXACT_KEY_VALUE_GREATER_THAN: + rc = (cmp > 0) ? 
MDBX_SUCCESS : MDBX_NOTFOUND; + break; + } } - if (unlikely(rc != MDBX_NOTFOUND)) - return rc; } - if (op != MDBX_NEXT) - return MDBX_NOTFOUND; } - mc->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED | C_EOF); - } - - DEBUG("cursor_next: top page is %" PRIaPGNO " in cursor %p", mp->mp_pgno, - (void *)mc); - if (mc->mc_flags & C_DEL) { - mc->mc_flags ^= C_DEL; - goto skip; - } + return rc; - intptr_t ki = mc->mc_ki[mc->mc_top]; - mc->mc_ki[mc->mc_top] = (indx_t)++ki; - const intptr_t numkeys = page_numkeys(mp); - if (unlikely(ki >= numkeys)) { - DEBUG("%s", "=====> move to next sibling page"); - mc->mc_ki[mc->mc_top] = (indx_t)(numkeys - 1); - rc = cursor_sibling(mc, SIBLING_RIGHT); - if (unlikely(rc != MDBX_SUCCESS)) { - mc->mc_flags |= C_EOF; - return rc; + case MDBX_TO_PAIR_LESSER_THAN: + case MDBX_TO_PAIR_LESSER_OR_EQUAL: + case MDBX_TO_PAIR_EQUAL: + case MDBX_TO_PAIR_GREATER_OR_EQUAL: + case MDBX_TO_PAIR_GREATER_THAN: + if (unlikely(key == nullptr || data == nullptr)) + return MDBX_EINVAL; + else { + MDBX_val save_data = *data; + csr_t csr = cursor_seek(mc, key, data, MDBX_SET_RANGE); + rc = csr.err; + if (csr.exact) { + cASSERT(mc, csr.err == MDBX_SUCCESS); + if (inner_pointed(mc)) { + MDBX_cursor *const mx = &mc->subcur->cursor; + csr = cursor_seek(mx, &save_data, nullptr, MDBX_SET_RANGE); + rc = csr.err; + if (csr.exact) { + cASSERT(mc, csr.err == MDBX_SUCCESS); + if (op == MDBX_TO_PAIR_LESSER_THAN) + rc = outer_prev(mc, key, data, MDBX_PREV); + else if (op == MDBX_TO_PAIR_GREATER_THAN) + rc = outer_next(mc, key, data, MDBX_NEXT); + } else if (op < MDBX_TO_PAIR_EQUAL && + (rc == MDBX_NOTFOUND || rc == MDBX_SUCCESS)) + rc = outer_prev(mc, key, data, MDBX_PREV); + else if (op == MDBX_TO_PAIR_EQUAL && rc == MDBX_SUCCESS) + rc = MDBX_NOTFOUND; + else if (op > MDBX_TO_PAIR_EQUAL && rc == MDBX_NOTFOUND) + rc = outer_next(mc, key, data, MDBX_NEXT); + } else { + int cmp = mc->clc->v.cmp(data, &save_data); + switch (op) { + default: + __unreachable(); + case 
MDBX_TO_PAIR_LESSER_THAN: + if (cmp >= 0) + rc = outer_prev(mc, key, data, MDBX_PREV); + break; + case MDBX_TO_PAIR_LESSER_OR_EQUAL: + if (cmp > 0) + rc = outer_prev(mc, key, data, MDBX_PREV); + break; + case MDBX_TO_PAIR_EQUAL: + rc = (cmp == 0) ? MDBX_SUCCESS : MDBX_NOTFOUND; + break; + case MDBX_TO_PAIR_GREATER_OR_EQUAL: + if (cmp < 0) + rc = outer_next(mc, key, data, MDBX_NEXT); + break; + case MDBX_TO_PAIR_GREATER_THAN: + if (cmp <= 0) + rc = outer_next(mc, key, data, MDBX_NEXT); + break; + } + } + } else if (op < MDBX_TO_PAIR_EQUAL && + (rc == MDBX_NOTFOUND || rc == MDBX_SUCCESS)) + rc = outer_prev(mc, key, data, MDBX_PREV_NODUP); + else if (op == MDBX_TO_PAIR_EQUAL && rc == MDBX_SUCCESS) + rc = MDBX_NOTFOUND; } - mp = mc->mc_pg[mc->mc_top]; - DEBUG("next page is %" PRIaPGNO ", key index %u", mp->mp_pgno, - mc->mc_ki[mc->mc_top]); - } - -skip: - DEBUG("==> cursor points to page %" PRIaPGNO " with %zu keys, key index %u", - mp->mp_pgno, page_numkeys(mp), mc->mc_ki[mc->mc_top]); + return rc; - if (!MDBX_DISABLE_VALIDATION && unlikely(!CHECK_LEAF_TYPE(mc, mp))) { - ERROR("unexpected leaf-page #%" PRIaPGNO " type 0x%x seen by cursor", - mp->mp_pgno, mp->mp_flags); - return MDBX_CORRUPTED; + default: + DEBUG("unhandled/unimplemented cursor operation %u", op); + return MDBX_EINVAL; } +} +/// \copyright SPDX-License-Identifier: Apache-2.0 +/// \author Леонид Юрьев aka Leonid Yuriev \date 2015-2024 - if (IS_LEAF2(mp)) { - if (likely(key)) { - key->iov_len = mc->mc_db->md_xsize; - key->iov_base = page_leaf2key(mp, mc->mc_ki[mc->mc_top], key->iov_len); - } - return MDBX_SUCCESS; - } - const MDBX_node *node = page_node(mp, mc->mc_ki[mc->mc_top]); - if (node_flags(node) & F_DUPDATA) { - rc = cursor_xinit1(mc, node, mp); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; - rc = cursor_first(&mc->mc_xcursor->mx_cursor, data, NULL); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; - } else if (likely(data)) { - rc = node_read(mc, node, data, mp); - if (unlikely(rc != 
MDBX_SUCCESS)) - return rc; +size_t dbi_bitmap_ctz_fallback(const MDBX_txn *txn, intptr_t bmi) { + tASSERT(txn, bmi > 0); + bmi &= -bmi; + if (sizeof(txn->dbi_sparse[0]) > 4) { + static const uint8_t debruijn_ctz64[64] = { + 0, 1, 2, 53, 3, 7, 54, 27, 4, 38, 41, 8, 34, 55, 48, 28, + 62, 5, 39, 46, 44, 42, 22, 9, 24, 35, 59, 56, 49, 18, 29, 11, + 63, 52, 6, 26, 37, 40, 33, 47, 61, 45, 43, 21, 23, 58, 17, 10, + 51, 25, 36, 32, 60, 20, 57, 16, 50, 31, 19, 15, 30, 14, 13, 12}; + return debruijn_ctz64[(UINT64_C(0x022FDD63CC95386D) * (uint64_t)bmi) >> 58]; + } else { + static const uint8_t debruijn_ctz32[32] = { + 0, 1, 28, 2, 29, 14, 24, 3, 30, 22, 20, 15, 25, 17, 4, 8, + 31, 27, 13, 23, 21, 19, 16, 7, 26, 12, 18, 6, 11, 5, 10, 9}; + return debruijn_ctz32[(UINT32_C(0x077CB531) * (uint32_t)bmi) >> 27]; } - - get_key_optional(node, key); - return MDBX_SUCCESS; } -/* Move the cursor to the previous data item. */ -static int cursor_prev(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, - MDBX_cursor_op op) { - assert(op == MDBX_PREV || op == MDBX_PREV_DUP || op == MDBX_PREV_NODUP); - int rc; +struct dbi_snap_result dbi_snap(const MDBX_env *env, const size_t dbi) { + eASSERT(env, dbi < env->n_dbi); + struct dbi_snap_result r; + uint32_t snap = atomic_load32(&env->dbi_seqs[dbi], mo_AcquireRelease); + do { + r.sequence = snap; + r.flags = env->dbs_flags[dbi]; + snap = atomic_load32(&env->dbi_seqs[dbi], mo_AcquireRelease); + } while (unlikely(snap != r.sequence)); + return r; +} - if (unlikely(mc->mc_flags & C_DEL) && op == MDBX_PREV_DUP) - return MDBX_NOTFOUND; +__noinline int dbi_import(MDBX_txn *txn, const size_t dbi) { + const MDBX_env *const env = txn->env; + if (dbi >= env->n_dbi || !env->dbs_flags[dbi]) + return MDBX_BAD_DBI; - if (unlikely(!(mc->mc_flags & C_INITIALIZED))) { - if (unlikely(mc->mc_flags & C_SUB)) - return MDBX_NOTFOUND; - rc = cursor_last(mc, key, data); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; - mc->mc_ki[mc->mc_top]++; +#if MDBX_ENABLE_DBI_SPARSE 
+ const size_t bitmap_chunk = CHAR_BIT * sizeof(txn->dbi_sparse[0]); + const size_t bitmap_indx = dbi / bitmap_chunk; + const size_t bitmap_mask = (size_t)1 << dbi % bitmap_chunk; + if (dbi >= txn->n_dbi) { + for (size_t i = (txn->n_dbi + bitmap_chunk - 1) / bitmap_chunk; + bitmap_indx >= i; ++i) + txn->dbi_sparse[i] = 0; + eASSERT(env, (txn->dbi_sparse[bitmap_indx] & bitmap_mask) == 0); + MDBX_txn *scan = txn; + do { + eASSERT(env, scan->dbi_sparse == txn->dbi_sparse); + eASSERT(env, scan->n_dbi < dbi + 1); + scan->n_dbi = (unsigned)dbi + 1; + scan->dbi_state[dbi] = 0; + scan = scan->parent; + } while (scan /* && scan->dbi_sparse == txn->dbi_sparse */); + txn->dbi_sparse[bitmap_indx] |= bitmap_mask; + goto lindo; + } + if ((txn->dbi_sparse[bitmap_indx] & bitmap_mask) == 0) { + MDBX_txn *scan = txn; + do { + eASSERT(env, scan->dbi_sparse == txn->dbi_sparse); + eASSERT(env, scan->n_dbi == txn->n_dbi); + scan->dbi_state[dbi] = 0; + scan = scan->parent; + } while (scan /* && scan->dbi_sparse == txn->dbi_sparse */); + txn->dbi_sparse[bitmap_indx] |= bitmap_mask; + goto lindo; + } +#else + if (dbi >= txn->n_dbi) { + size_t i = txn->n_dbi; + do + txn->dbi_state[i] = 0; + while (dbi >= ++i); + txn->n_dbi = i; + goto lindo; } +#endif /* MDBX_ENABLE_DBI_SPARSE */ - const MDBX_page *mp = mc->mc_pg[mc->mc_top]; - if (mc->mc_xcursor) { - if (op != MDBX_PREV_NODUP) { - if (likely(mc->mc_ki[mc->mc_top] < page_numkeys(mp))) { - const MDBX_node *node = page_node(mp, mc->mc_ki[mc->mc_top]); - if (node_flags(node) & F_DUPDATA) { - rc = cursor_prev(&mc->mc_xcursor->mx_cursor, data, NULL, MDBX_PREV); - if (likely(rc == MDBX_SUCCESS)) { - get_key_optional(node, key); - mc->mc_flags &= ~C_EOF; - return MDBX_SUCCESS; + if (!txn->dbi_state[dbi]) { + lindo: + /* dbi-слот еще не инициализирован в транзакции, а хендл не использовался */ + txn->cursors[dbi] = nullptr; + MDBX_txn *const parent = txn->parent; + if (parent) { + /* вложенная пишущая транзакция */ + int rc = dbi_check(parent, 
dbi); + /* копируем состояние subDB очищая new-флаги. */ + eASSERT(env, txn->dbi_seqs == parent->dbi_seqs); + txn->dbi_state[dbi] = + parent->dbi_state[dbi] & ~(DBI_FRESH | DBI_CREAT | DBI_DIRTY); + if (likely(rc == MDBX_SUCCESS)) { + txn->dbs[dbi] = parent->dbs[dbi]; + if (parent->cursors[dbi]) { + rc = cursor_shadow(parent->cursors[dbi], txn, dbi); + if (unlikely(rc != MDBX_SUCCESS)) { + /* не получилось забекапить курсоры */ + txn->dbi_state[dbi] = DBI_OLDEN | DBI_LINDO | DBI_STALE; + txn->flags |= MDBX_TXN_ERROR; } - if (unlikely(rc != MDBX_NOTFOUND)) - return rc; } } - if (op != MDBX_PREV) - return MDBX_NOTFOUND; + return rc; + } + txn->dbi_seqs[dbi] = 0; + txn->dbi_state[dbi] = DBI_LINDO; + } else { + eASSERT(env, txn->dbi_seqs[dbi] != env->dbi_seqs[dbi].weak); + if (unlikely((txn->dbi_state[dbi] & (DBI_VALID | DBI_OLDEN)) || + txn->cursors[dbi])) { + /* хендл уже использовался в транзакции, но был закрыт или переоткрыт, + * либо при явном пере-открытии хендла есть висячие курсоры */ + eASSERT(env, (txn->dbi_state[dbi] & DBI_STALE) == 0); + txn->dbi_seqs[dbi] = env->dbi_seqs[dbi].weak; + txn->dbi_state[dbi] = DBI_OLDEN | DBI_LINDO; + return txn->cursors[dbi] ? 
MDBX_DANGLING_DBI : MDBX_BAD_DBI; } - mc->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED | C_EOF); } - DEBUG("cursor_prev: top page is %" PRIaPGNO " in cursor %p", mp->mp_pgno, - (void *)mc); - - mc->mc_flags &= ~(C_EOF | C_DEL); + /* хендл не использовался в транзакции, либо явно пере-отрывается при + * отсутствии висячих курсоров */ + eASSERT(env, (txn->dbi_state[dbi] & DBI_LINDO) && !txn->cursors[dbi]); - int ki = mc->mc_ki[mc->mc_top]; - mc->mc_ki[mc->mc_top] = (indx_t)--ki; - if (unlikely(ki < 0)) { - mc->mc_ki[mc->mc_top] = 0; - DEBUG("%s", "=====> move to prev sibling page"); - if ((rc = cursor_sibling(mc, SIBLING_LEFT)) != MDBX_SUCCESS) - return rc; - mp = mc->mc_pg[mc->mc_top]; - DEBUG("prev page is %" PRIaPGNO ", key index %u", mp->mp_pgno, - mc->mc_ki[mc->mc_top]); + /* читаем актуальные флаги и sequence */ + struct dbi_snap_result snap = dbi_snap(env, dbi); + txn->dbi_seqs[dbi] = snap.sequence; + if (snap.flags & DB_VALID) { + txn->dbs[dbi].flags = snap.flags & DB_PERSISTENT_FLAGS; + txn->dbi_state[dbi] = DBI_LINDO | DBI_VALID | DBI_STALE; + return MDBX_SUCCESS; } - DEBUG("==> cursor points to page %" PRIaPGNO " with %zu keys, key index %u", - mp->mp_pgno, page_numkeys(mp), mc->mc_ki[mc->mc_top]); + return MDBX_BAD_DBI; +} - if (!MDBX_DISABLE_VALIDATION && unlikely(!CHECK_LEAF_TYPE(mc, mp))) { - ERROR("unexpected leaf-page #%" PRIaPGNO " type 0x%x seen by cursor", - mp->mp_pgno, mp->mp_flags); - return MDBX_CORRUPTED; +static int defer_and_release(MDBX_env *const env, + defer_free_item_t *const chain) { + size_t length = 0; + defer_free_item_t *obsolete_chain = nullptr; +#if MDBX_ENABLE_DBI_LOCKFREE + const uint64_t now = osal_monotime(); + defer_free_item_t **scan = &env->defer_free; + if (env->defer_free) { + const uint64_t threshold_1second = osal_16dot16_to_monotime(1 * 65536); + do { + defer_free_item_t *item = *scan; + if (now - item->timestamp < threshold_1second) { + scan = &item->next; + length += 1; + } else { + *scan = item->next; + 
item->next = obsolete_chain; + obsolete_chain = item; + } + } while (*scan); } - if (IS_LEAF2(mp)) { - if (likely(key)) { - key->iov_len = mc->mc_db->md_xsize; - key->iov_base = page_leaf2key(mp, mc->mc_ki[mc->mc_top], key->iov_len); - } - return MDBX_SUCCESS; + eASSERT(env, *scan == nullptr); + if (chain) { + defer_free_item_t *item = chain; + do { + item->timestamp = now; + item = item->next; + } while (item); + *scan = chain; } +#else /* MDBX_ENABLE_DBI_LOCKFREE */ + obsolete_chain = chain; +#endif /* MDBX_ENABLE_DBI_LOCKFREE */ - const MDBX_node *node = page_node(mp, mc->mc_ki[mc->mc_top]); - if (node_flags(node) & F_DUPDATA) { - rc = cursor_xinit1(mc, node, mp); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; - rc = cursor_last(&mc->mc_xcursor->mx_cursor, data, NULL); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; - } else if (likely(data)) { - rc = node_read(mc, node, data, mp); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; + ENSURE(env, osal_fastmutex_release(&env->dbi_lock) == MDBX_SUCCESS); + if (length > 42) { +#if defined(_WIN32) || defined(_WIN64) + SwitchToThread(); +#else + sched_yield(); +#endif /* Windows */ } - - get_key_optional(node, key); - return MDBX_SUCCESS; + while (obsolete_chain) { + defer_free_item_t *item = obsolete_chain; + obsolete_chain = obsolete_chain->next; + osal_free(item); + } + return chain ? MDBX_SUCCESS : MDBX_BAD_DBI; } -/* Set the cursor on a specific data item. */ -__hot static struct cursor_set_result -cursor_set(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, MDBX_cursor_op op) { - MDBX_page *mp; - MDBX_node *node = NULL; - DKBUF_DEBUG; - - struct cursor_set_result ret; - ret.exact = false; - if (unlikely(key->iov_len < mc->mc_dbx->md_klen_min || - key->iov_len > mc->mc_dbx->md_klen_max)) { - cASSERT(mc, !"Invalid key-size"); - ret.err = MDBX_BAD_VALSIZE; - return ret; +/* Export or close DBI handles opened in this txn. 
*/ +int dbi_update(MDBX_txn *txn, int keep) { + MDBX_env *const env = txn->env; + tASSERT(txn, !txn->parent && txn == env->basal_txn); + bool locked = false; + defer_free_item_t *defer_chain = nullptr; + TXN_FOREACH_DBI_USER(txn, dbi) { + if (likely((txn->dbi_state[dbi] & DBI_CREAT) == 0)) + continue; + if (!locked) { + int err = osal_fastmutex_acquire(&env->dbi_lock); + if (unlikely(err != MDBX_SUCCESS)) + return err; + locked = true; + if (dbi >= env->n_dbi) + /* хендл был закрыт из другого потока пока захватывали блокировку */ + continue; + } + tASSERT(txn, dbi < env->n_dbi); + if (keep) { + env->dbs_flags[dbi] = txn->dbs[dbi].flags | DB_VALID; + } else { + uint32_t seq = dbi_seq_next(env, dbi); + defer_free_item_t *item = env->kvs[dbi].name.iov_base; + if (item) { + env->dbs_flags[dbi] = 0; + env->kvs[dbi].name.iov_len = 0; + env->kvs[dbi].name.iov_base = nullptr; + atomic_store32(&env->dbi_seqs[dbi], seq, mo_AcquireRelease); + osal_flush_incoherent_cpu_writeback(); + item->next = defer_chain; + defer_chain = item; + } else { + eASSERT(env, env->kvs[dbi].name.iov_len == 0); + eASSERT(env, env->dbs_flags[dbi] == 0); + } + } } - MDBX_val aligned_key = *key; - uint64_t aligned_key_buf; - if (mc->mc_db->md_flags & MDBX_INTEGERKEY) { - switch (aligned_key.iov_len) { - default: - cASSERT(mc, !"key-size is invalid for MDBX_INTEGERKEY"); - ret.err = MDBX_BAD_VALSIZE; - return ret; - case 4: - if (unlikely(3 & (uintptr_t)aligned_key.iov_base)) - /* copy instead of return error to avoid break compatibility */ - aligned_key.iov_base = - memcpy(&aligned_key_buf, aligned_key.iov_base, 4); - break; - case 8: - if (unlikely(7 & (uintptr_t)aligned_key.iov_base)) - /* copy instead of return error to avoid break compatibility */ - aligned_key.iov_base = - memcpy(&aligned_key_buf, aligned_key.iov_base, 8); - break; + if (locked) { + size_t i = env->n_dbi; + while ((env->dbs_flags[i - 1] & DB_VALID) == 0) { + --i; + eASSERT(env, i >= CORE_DBS); + eASSERT(env, !env->dbs_flags[i] && 
!env->kvs[i].name.iov_len && + !env->kvs[i].name.iov_base); } + env->n_dbi = (unsigned)i; + defer_and_release(env, defer_chain); } + return MDBX_SUCCESS; +} - if (mc->mc_xcursor) - mc->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED | C_EOF); +int dbi_bind(MDBX_txn *txn, const size_t dbi, unsigned user_flags, + MDBX_cmp_func *keycmp, MDBX_cmp_func *datacmp) { + const MDBX_env *const env = txn->env; + eASSERT(env, dbi < txn->n_dbi && dbi < env->n_dbi); + eASSERT(env, dbi_state(txn, dbi) & DBI_LINDO); + eASSERT(env, env->dbs_flags[dbi] != DB_POISON); + if ((env->dbs_flags[dbi] & DB_VALID) == 0) { + eASSERT(env, !env->kvs[dbi].clc.k.cmp && !env->kvs[dbi].clc.v.cmp && + !env->kvs[dbi].name.iov_len && + !env->kvs[dbi].name.iov_base && + !env->kvs[dbi].clc.k.lmax && !env->kvs[dbi].clc.k.lmin && + !env->kvs[dbi].clc.v.lmax && !env->kvs[dbi].clc.v.lmin); + } else { + eASSERT(env, !(txn->dbi_state[dbi] & DBI_VALID) || + (txn->dbs[dbi].flags | DB_VALID) == env->dbs_flags[dbi]); + eASSERT(env, env->kvs[dbi].name.iov_base || dbi < CORE_DBS); + } - /* See if we're already on the right page */ - if (mc->mc_flags & C_INITIALIZED) { - MDBX_val nodekey; + /* Если dbi уже использовался, то корректными считаем четыре варианта: + * 1) user_flags равны MDBX_DB_ACCEDE + * = предполагаем что пользователь открывает существующую subDb, + * при этом код проверки не позволит установить другие компараторы. + * 2) user_flags нулевые, а оба компаратора пустые/нулевые или равны текущим + * = предполагаем что пользователь открывает существующую subDb + * старым способом с нулевыми с флагами по-умолчанию. 
+ * 3) user_flags совпадают, а компараторы не заданы или те же + * = предполагаем что пользователь открывает subDb указывая все параметры; + * 4) user_flags отличаются, но subDb пустая и задан флаг MDBX_CREATE + * = предполагаем что пользователь пересоздает subDb; + */ + if ((user_flags & ~MDBX_CREATE) != + (unsigned)(env->dbs_flags[dbi] & DB_PERSISTENT_FLAGS)) { + /* flags are differs, check other conditions */ + if ((!user_flags && (!keycmp || keycmp == env->kvs[dbi].clc.k.cmp) && + (!datacmp || datacmp == env->kvs[dbi].clc.v.cmp)) || + user_flags == MDBX_DB_ACCEDE) { + user_flags = env->dbs_flags[dbi] & DB_PERSISTENT_FLAGS; + } else if ((user_flags & MDBX_CREATE) == 0) + return /* FIXME: return extended info */ MDBX_INCOMPATIBLE; + else { + eASSERT(env, env->dbs_flags[dbi] & DB_VALID); + if (txn->dbi_state[dbi] & DBI_STALE) { + int err = sdb_fetch(txn, dbi); + if (unlikely(err == MDBX_SUCCESS)) + return err; + } + eASSERT(env, + (txn->dbi_state[dbi] & (DBI_LINDO | DBI_VALID | DBI_STALE)) == + (DBI_LINDO | DBI_VALID)); + if (unlikely(txn->dbs[dbi].leaf_pages)) + return /* FIXME: return extended info */ MDBX_INCOMPATIBLE; - cASSERT(mc, IS_LEAF(mc->mc_pg[mc->mc_top])); - mp = mc->mc_pg[mc->mc_top]; - if (unlikely(!page_numkeys(mp))) { - mc->mc_ki[mc->mc_top] = 0; - mc->mc_flags |= C_EOF; - ret.err = MDBX_NOTFOUND; - return ret; - } - if (IS_LEAF2(mp)) { - nodekey.iov_len = mc->mc_db->md_xsize; - nodekey.iov_base = page_leaf2key(mp, 0, nodekey.iov_len); - } else { - node = page_node(mp, 0); - get_key(node, &nodekey); - } - int cmp = mc->mc_dbx->md_cmp(&aligned_key, &nodekey); - if (unlikely(cmp == 0)) { - /* Probably happens rarely, but first node on the page - * was the one we wanted. 
*/ - mc->mc_ki[mc->mc_top] = 0; - ret.exact = true; - cASSERT(mc, mc->mc_ki[mc->mc_top] < page_numkeys(mc->mc_pg[mc->mc_top]) || - (mc->mc_flags & C_EOF)); - goto got_node; - } - if (cmp > 0) { - const size_t nkeys = page_numkeys(mp); - if (likely(nkeys > 1)) { - if (IS_LEAF2(mp)) { - nodekey.iov_base = page_leaf2key(mp, nkeys - 1, nodekey.iov_len); - } else { - node = page_node(mp, nkeys - 1); - get_key(node, &nodekey); - } - cmp = mc->mc_dbx->md_cmp(&aligned_key, &nodekey); - if (cmp == 0) { - /* last node was the one we wanted */ - cASSERT(mc, nkeys >= 1 && nkeys <= UINT16_MAX + 1); - mc->mc_ki[mc->mc_top] = (indx_t)(nkeys - 1); - ret.exact = true; - cASSERT(mc, - mc->mc_ki[mc->mc_top] < page_numkeys(mc->mc_pg[mc->mc_top]) || - (mc->mc_flags & C_EOF)); - goto got_node; - } - if (cmp < 0) { - /* This is definitely the right page, skip search_page */ - if (mc->mc_ki[mc->mc_top] != 0 /* уже проверяли выше */ && - mc->mc_ki[mc->mc_top] < page_numkeys(mp)) { - if (IS_LEAF2(mp)) { - nodekey.iov_base = - page_leaf2key(mp, mc->mc_ki[mc->mc_top], nodekey.iov_len); - } else { - node = page_node(mp, mc->mc_ki[mc->mc_top]); - get_key(node, &nodekey); - } - cmp = mc->mc_dbx->md_cmp(&aligned_key, &nodekey); - if (cmp == 0) { - /* current node was the one we wanted */ - ret.exact = true; - cASSERT(mc, mc->mc_ki[mc->mc_top] < - page_numkeys(mc->mc_pg[mc->mc_top]) || - (mc->mc_flags & C_EOF)); - goto got_node; - } - } - mc->mc_flags &= ~C_EOF; - goto search_node; - } + /* Пересоздаём subDB если там пусто */ + if (unlikely(txn->cursors[dbi])) + return MDBX_DANGLING_DBI; + env->dbs_flags[dbi] = DB_POISON; + atomic_store32(&env->dbi_seqs[dbi], dbi_seq_next(env, MAIN_DBI), + mo_AcquireRelease); + + const uint32_t seq = dbi_seq_next(env, dbi); + const uint16_t db_flags = user_flags & DB_PERSISTENT_FLAGS; + eASSERT(env, txn->dbs[dbi].height == 0 && txn->dbs[dbi].items == 0 && + txn->dbs[dbi].root == P_INVALID); + env->kvs[dbi].clc.k.cmp = keycmp ? 
keycmp : builtin_keycmp(user_flags); + env->kvs[dbi].clc.v.cmp = datacmp ? datacmp : builtin_datacmp(user_flags); + txn->dbs[dbi].flags = db_flags; + txn->dbs[dbi].dupfix_size = 0; + if (unlikely(sdb_setup(env, &env->kvs[dbi], &txn->dbs[dbi]))) { + txn->dbi_state[dbi] = DBI_LINDO; + txn->flags |= MDBX_TXN_ERROR; + return MDBX_PROBLEM; } - /* If any parents have right-sibs, search. - * Otherwise, there's nothing further. */ - for (size_t i = 0; i < mc->mc_top; i++) - if (mc->mc_ki[i] < page_numkeys(mc->mc_pg[i]) - 1) - goto continue_other_pages; - /* There are no other pages */ - cASSERT(mc, nkeys <= UINT16_MAX); - mc->mc_ki[mc->mc_top] = (uint16_t)nkeys; - mc->mc_flags |= C_EOF; - ret.err = MDBX_NOTFOUND; - return ret; + env->dbs_flags[dbi] = db_flags | DB_VALID; + atomic_store32(&env->dbi_seqs[dbi], seq, mo_AcquireRelease); + txn->dbi_seqs[dbi] = seq; + txn->dbi_state[dbi] = DBI_LINDO | DBI_VALID | DBI_CREAT | DBI_DIRTY; + txn->flags |= MDBX_TXN_DIRTY; } - continue_other_pages: - if (!mc->mc_top) { - /* There are no other pages */ - mc->mc_ki[mc->mc_top] = 0; - if (op >= MDBX_SET_RANGE) - goto got_node; + } - cASSERT(mc, mc->mc_ki[mc->mc_top] < page_numkeys(mc->mc_pg[mc->mc_top]) || - (mc->mc_flags & C_EOF)); - ret.err = MDBX_NOTFOUND; - return ret; - } - } else { - mc->mc_pg[0] = nullptr; + if (!keycmp) + keycmp = (env->dbs_flags[dbi] & DB_VALID) ? env->kvs[dbi].clc.k.cmp + : builtin_keycmp(user_flags); + if (env->kvs[dbi].clc.k.cmp != keycmp) { + if (env->dbs_flags[dbi] & DB_VALID) + return MDBX_EINVAL; + env->kvs[dbi].clc.k.cmp = keycmp; } - ret.err = page_search(mc, &aligned_key, 0); - if (unlikely(ret.err != MDBX_SUCCESS)) - return ret; + if (!datacmp) + datacmp = (env->dbs_flags[dbi] & DB_VALID) ? 
env->kvs[dbi].clc.v.cmp + : builtin_datacmp(user_flags); + if (env->kvs[dbi].clc.v.cmp != datacmp) { + if (env->dbs_flags[dbi] & DB_VALID) + return MDBX_EINVAL; + env->kvs[dbi].clc.v.cmp = datacmp; + } - mp = mc->mc_pg[mc->mc_top]; - MDBX_ANALYSIS_ASSUME(mp != nullptr); - cASSERT(mc, IS_LEAF(mp)); + return MDBX_SUCCESS; +} -search_node:; - struct node_result nsr = node_search(mc, &aligned_key); - node = nsr.node; - ret.exact = nsr.exact; - if (!ret.exact) { - if (op < MDBX_SET_RANGE) { - /* MDBX_SET specified and not an exact match. */ - if (unlikely(mc->mc_ki[mc->mc_top] >= - page_numkeys(mc->mc_pg[mc->mc_top]))) - mc->mc_flags |= C_EOF; - ret.err = MDBX_NOTFOUND; - return ret; - } +static inline size_t dbi_namelen(const MDBX_val name) { + return (name.iov_len > sizeof(defer_free_item_t)) ? name.iov_len + : sizeof(defer_free_item_t); +} - if (node == NULL) { - DEBUG("%s", "===> inexact leaf not found, goto sibling"); - ret.err = cursor_sibling(mc, SIBLING_RIGHT); - if (unlikely(ret.err != MDBX_SUCCESS)) { - mc->mc_flags |= C_EOF; - return ret; /* no entries matched */ - } - mp = mc->mc_pg[mc->mc_top]; - cASSERT(mc, IS_LEAF(mp)); - if (!IS_LEAF2(mp)) - node = page_node(mp, 0); - } - } - cASSERT(mc, mc->mc_ki[mc->mc_top] < page_numkeys(mc->mc_pg[mc->mc_top]) || - (mc->mc_flags & C_EOF)); +static int dbi_open_locked(MDBX_txn *txn, unsigned user_flags, MDBX_dbi *dbi, + MDBX_cmp_func *keycmp, MDBX_cmp_func *datacmp, + MDBX_val name) { + MDBX_env *const env = txn->env; -got_node: - mc->mc_flags |= C_INITIALIZED; - mc->mc_flags &= ~C_EOF; + /* Cannot mix named table(s) with DUPSORT flags */ + tASSERT(txn, + (txn->dbi_state[MAIN_DBI] & (DBI_LINDO | DBI_VALID | DBI_STALE)) == + (DBI_LINDO | DBI_VALID)); + if (unlikely(txn->dbs[MAIN_DBI].flags & MDBX_DUPSORT)) { + if (unlikely((user_flags & MDBX_CREATE) == 0)) + return MDBX_NOTFOUND; + if (unlikely(txn->dbs[MAIN_DBI].leaf_pages)) + /* В MainDB есть записи, либо она уже использовалась. 
*/ + return MDBX_INCOMPATIBLE; - if (!MDBX_DISABLE_VALIDATION && unlikely(!CHECK_LEAF_TYPE(mc, mp))) { - ERROR("unexpected leaf-page #%" PRIaPGNO " type 0x%x seen by cursor", - mp->mp_pgno, mp->mp_flags); - ret.err = MDBX_CORRUPTED; - return ret; - } + /* Пересоздаём MainDB когда там пусто. */ + tASSERT(txn, txn->dbs[MAIN_DBI].height == 0 && + txn->dbs[MAIN_DBI].items == 0 && + txn->dbs[MAIN_DBI].root == P_INVALID); + if (unlikely(txn->cursors[MAIN_DBI])) + return MDBX_DANGLING_DBI; + env->dbs_flags[MAIN_DBI] = DB_POISON; + atomic_store32(&env->dbi_seqs[MAIN_DBI], dbi_seq_next(env, MAIN_DBI), + mo_AcquireRelease); - if (IS_LEAF2(mp)) { - if (op >= MDBX_SET_KEY) { - key->iov_len = mc->mc_db->md_xsize; - key->iov_base = page_leaf2key(mp, mc->mc_ki[mc->mc_top], key->iov_len); + const uint32_t seq = dbi_seq_next(env, MAIN_DBI); + const uint16_t main_flags = + txn->dbs[MAIN_DBI].flags & (MDBX_REVERSEKEY | MDBX_INTEGERKEY); + env->kvs[MAIN_DBI].clc.k.cmp = builtin_keycmp(main_flags); + env->kvs[MAIN_DBI].clc.v.cmp = builtin_datacmp(main_flags); + txn->dbs[MAIN_DBI].flags = main_flags; + txn->dbs[MAIN_DBI].dupfix_size = 0; + int err = sdb_setup(env, &env->kvs[MAIN_DBI], &txn->dbs[MAIN_DBI]); + if (unlikely(err != MDBX_SUCCESS)) { + txn->dbi_state[MAIN_DBI] = DBI_LINDO; + txn->flags |= MDBX_TXN_ERROR; + env->flags |= ENV_FATAL_ERROR; + return err; } - ret.err = MDBX_SUCCESS; - return ret; + env->dbs_flags[MAIN_DBI] = main_flags | DB_VALID; + txn->dbi_seqs[MAIN_DBI] = + atomic_store32(&env->dbi_seqs[MAIN_DBI], seq, mo_AcquireRelease); + txn->dbi_state[MAIN_DBI] |= DBI_DIRTY; + txn->flags |= MDBX_TXN_DIRTY; } - if (node_flags(node) & F_DUPDATA) { - ret.err = cursor_xinit1(mc, node, mp); - if (unlikely(ret.err != MDBX_SUCCESS)) - return ret; - if (op >= MDBX_SET) { - MDBX_ANALYSIS_ASSUME(mc->mc_xcursor != nullptr); - ret.err = cursor_first(&mc->mc_xcursor->mx_cursor, data, NULL); - if (unlikely(ret.err != MDBX_SUCCESS)) - return ret; - } else { - 
MDBX_ANALYSIS_ASSUME(mc->mc_xcursor != nullptr); - ret = cursor_set(&mc->mc_xcursor->mx_cursor, data, NULL, MDBX_SET_RANGE); - if (unlikely(ret.err != MDBX_SUCCESS)) - return ret; - if (op == MDBX_GET_BOTH && !ret.exact) { - ret.err = MDBX_NOTFOUND; - return ret; - } + tASSERT(txn, env->kvs[MAIN_DBI].clc.k.cmp); + + /* Is the DB already open? */ + size_t slot = env->n_dbi; + for (size_t scan = CORE_DBS; scan < env->n_dbi; ++scan) { + if ((env->dbs_flags[scan] & DB_VALID) == 0) { + /* Remember this free slot */ + slot = (slot < scan) ? slot : scan; + continue; } - } else if (likely(data)) { - if (op <= MDBX_GET_BOTH_RANGE) { - if (unlikely(data->iov_len < mc->mc_dbx->md_vlen_min || - data->iov_len > mc->mc_dbx->md_vlen_max)) { - cASSERT(mc, !"Invalid data-size"); - ret.err = MDBX_BAD_VALSIZE; - return ret; - } - MDBX_val aligned_data = *data; - uint64_t aligned_databytes; - if (mc->mc_db->md_flags & MDBX_INTEGERDUP) { - switch (aligned_data.iov_len) { - default: - cASSERT(mc, !"data-size is invalid for MDBX_INTEGERDUP"); - ret.err = MDBX_BAD_VALSIZE; - return ret; - case 4: - if (unlikely(3 & (uintptr_t)aligned_data.iov_base)) - /* copy instead of return error to avoid break compatibility */ - aligned_data.iov_base = - memcpy(&aligned_databytes, aligned_data.iov_base, 4); - break; - case 8: - if (unlikely(7 & (uintptr_t)aligned_data.iov_base)) - /* copy instead of return error to avoid break compatibility */ - aligned_data.iov_base = - memcpy(&aligned_databytes, aligned_data.iov_base, 8); - break; - } + if (!env->kvs[MAIN_DBI].clc.k.cmp(&name, &env->kvs[scan].name)) { + slot = scan; + int err = dbi_check(txn, slot); + if (err == MDBX_BAD_DBI && + txn->dbi_state[slot] == (DBI_OLDEN | DBI_LINDO)) { + /* хендл использовался, стал невалидным, + * но теперь явно пере-открывается в этой транзакци */ + eASSERT(env, !txn->cursors[slot]); + txn->dbi_state[slot] = DBI_LINDO; + err = dbi_check(txn, slot); } - MDBX_val actual_data; - ret.err = node_read(mc, node, &actual_data, 
mc->mc_pg[mc->mc_top]); - if (unlikely(ret.err != MDBX_SUCCESS)) - return ret; - const int cmp = mc->mc_dbx->md_dcmp(&aligned_data, &actual_data); - if (cmp) { - cASSERT(mc, - mc->mc_ki[mc->mc_top] < page_numkeys(mc->mc_pg[mc->mc_top]) || - (mc->mc_flags & C_EOF)); - if (op != MDBX_GET_BOTH_RANGE || cmp > 0) { - ret.err = MDBX_NOTFOUND; - return ret; + if (err == MDBX_SUCCESS) { + err = dbi_bind(txn, slot, user_flags, keycmp, datacmp); + if (likely(err == MDBX_SUCCESS)) { + goto done; } } - *data = actual_data; - } else { - ret.err = node_read(mc, node, data, mc->mc_pg[mc->mc_top]); - if (unlikely(ret.err != MDBX_SUCCESS)) - return ret; + return err; } } - /* The key already matches in all other cases */ - if (op >= MDBX_SET_KEY) - get_key_optional(node, key); + /* Fail, if no free slot and max hit */ + if (unlikely(slot >= env->max_dbi)) + return MDBX_DBS_FULL; - DEBUG("==> cursor placed on key [%s], data [%s]", DKEY_DEBUG(key), - DVAL_DEBUG(data)); - ret.err = MDBX_SUCCESS; - return ret; -} + if (env->n_dbi == slot) + eASSERT(env, !env->dbs_flags[slot] && !env->kvs[slot].name.iov_len && + !env->kvs[slot].name.iov_base); -/* Move the cursor to the first item in the database. 
*/ -static int cursor_first(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data) { - int rc; + env->dbs_flags[slot] = DB_POISON; + atomic_store32(&env->dbi_seqs[slot], dbi_seq_next(env, slot), + mo_AcquireRelease); + memset(&env->kvs[slot], 0, sizeof(env->kvs[slot])); + if (env->n_dbi == slot) + env->n_dbi = (unsigned)slot + 1; + eASSERT(env, slot < env->n_dbi); - if (mc->mc_xcursor) - mc->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED | C_EOF); + int err = dbi_check(txn, slot); + eASSERT(env, err == MDBX_BAD_DBI); + if (err != MDBX_BAD_DBI) + return MDBX_PROBLEM; - if (!(mc->mc_flags & C_INITIALIZED) || mc->mc_top) { - rc = page_search(mc, NULL, MDBX_PS_FIRST); - if (unlikely(rc != MDBX_SUCCESS)) + /* Find the DB info */ + MDBX_val body; + cursor_couple_t cx; + int rc = cursor_init(&cx.outer, txn, MAIN_DBI); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; + rc = cursor_seek(&cx.outer, &name, &body, MDBX_SET).err; + if (unlikely(rc != MDBX_SUCCESS)) { + if (rc != MDBX_NOTFOUND || !(user_flags & MDBX_CREATE)) return rc; + } else { + /* make sure this is actually a table */ + node_t *node = + page_node(cx.outer.pg[cx.outer.top], cx.outer.ki[cx.outer.top]); + if (unlikely((node_flags(node) & (N_DUPDATA | N_SUBDATA)) != N_SUBDATA)) + return MDBX_INCOMPATIBLE; + if (!MDBX_DISABLE_VALIDATION && unlikely(body.iov_len != sizeof(tree_t))) { + ERROR("%s/%d: %s %zu", "MDBX_CORRUPTED", MDBX_CORRUPTED, + "invalid subDb node size", body.iov_len); + return MDBX_CORRUPTED; + } + memcpy(&txn->dbs[slot], body.iov_base, sizeof(tree_t)); } - const MDBX_page *mp = mc->mc_pg[mc->mc_top]; - if (!MDBX_DISABLE_VALIDATION && unlikely(!CHECK_LEAF_TYPE(mc, mp))) { - ERROR("unexpected leaf-page #%" PRIaPGNO " type 0x%x seen by cursor", - mp->mp_pgno, mp->mp_flags); - return MDBX_CORRUPTED; - } + /* Done here so we cannot fail after creating a new DB */ + void *clone = nullptr; + if (name.iov_len) { + clone = osal_malloc(dbi_namelen(name)); + if (unlikely(!clone)) + return MDBX_ENOMEM; + 
name.iov_base = memcpy(clone, name.iov_base, name.iov_len); + } else + name.iov_base = ""; - mc->mc_flags |= C_INITIALIZED; - mc->mc_flags &= ~C_EOF; - mc->mc_ki[mc->mc_top] = 0; + uint8_t dbi_state = DBI_LINDO | DBI_VALID | DBI_FRESH; + if (unlikely(rc)) { + /* MDBX_NOTFOUND and MDBX_CREATE: Create new DB */ + tASSERT(txn, rc == MDBX_NOTFOUND); + body.iov_base = memset(&txn->dbs[slot], 0, body.iov_len = sizeof(tree_t)); + txn->dbs[slot].root = P_INVALID; + txn->dbs[slot].mod_txnid = txn->txnid; + txn->dbs[slot].flags = user_flags & DB_PERSISTENT_FLAGS; + cx.outer.next = txn->cursors[MAIN_DBI]; + txn->cursors[MAIN_DBI] = &cx.outer; + rc = cursor_put_checklen(&cx.outer, &name, &body, + N_SUBDATA | MDBX_NOOVERWRITE); + txn->cursors[MAIN_DBI] = cx.outer.next; + if (unlikely(rc != MDBX_SUCCESS)) + goto bailout; - if (IS_LEAF2(mp)) { - if (likely(key)) { - key->iov_len = mc->mc_db->md_xsize; - key->iov_base = page_leaf2key(mp, 0, key->iov_len); - } - return MDBX_SUCCESS; + dbi_state |= DBI_DIRTY | DBI_CREAT; + txn->flags |= MDBX_TXN_DIRTY; + tASSERT(txn, (txn->dbi_state[MAIN_DBI] & DBI_DIRTY) != 0); } - MDBX_node *node = page_node(mp, 0); - if (node_flags(node) & F_DUPDATA) { - rc = cursor_xinit1(mc, node, mp); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; - MDBX_ANALYSIS_ASSUME(mc->mc_xcursor != nullptr); - rc = cursor_first(&mc->mc_xcursor->mx_cursor, data, NULL); - if (unlikely(rc)) - return rc; - } else if (likely(data)) { - rc = node_read(mc, node, data, mp); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; - } + /* Got info, register DBI in this txn */ + const uint32_t seq = dbi_seq_next(env, slot); + eASSERT(env, + env->dbs_flags[slot] == DB_POISON && !txn->cursors[slot] && + (txn->dbi_state[slot] & (DBI_LINDO | DBI_VALID)) == DBI_LINDO); + txn->dbi_state[slot] = dbi_state; + memcpy(&txn->dbs[slot], body.iov_base, sizeof(txn->dbs[slot])); + env->dbs_flags[slot] = txn->dbs[slot].flags; + rc = dbi_bind(txn, slot, user_flags, keycmp, datacmp); + if (unlikely(rc 
!= MDBX_SUCCESS)) + goto bailout; - get_key_optional(node, key); + env->kvs[slot].name = name; + env->dbs_flags[slot] = txn->dbs[slot].flags | DB_VALID; + txn->dbi_seqs[slot] = + atomic_store32(&env->dbi_seqs[slot], seq, mo_AcquireRelease); + +done: + *dbi = (MDBX_dbi)slot; + tASSERT(txn, slot < txn->n_dbi && (env->dbs_flags[slot] & DB_VALID) != 0); + eASSERT(env, dbi_check(txn, slot) == MDBX_SUCCESS); return MDBX_SUCCESS; + +bailout: + eASSERT(env, !txn->cursors[slot] && !env->kvs[slot].name.iov_len && + !env->kvs[slot].name.iov_base); + txn->dbi_state[slot] &= DBI_LINDO | DBI_OLDEN; + env->dbs_flags[slot] = 0; + osal_free(clone); + if (slot + 1 == env->n_dbi) + txn->n_dbi = env->n_dbi = (unsigned)slot; + return rc; } -/* Move the cursor to the last item in the database. */ -static int cursor_last(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data) { - int rc; +int dbi_open(MDBX_txn *txn, const MDBX_val *const name, unsigned user_flags, + MDBX_dbi *dbi, MDBX_cmp_func *keycmp, MDBX_cmp_func *datacmp) { + if (unlikely(!dbi)) + return MDBX_EINVAL; + *dbi = 0; - if (mc->mc_xcursor) - mc->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED | C_EOF); + if (user_flags != MDBX_ACCEDE && + unlikely(!check_sdb_flags(user_flags & ~MDBX_CREATE))) + return MDBX_EINVAL; - if (!(mc->mc_flags & C_INITIALIZED) || mc->mc_top) { - rc = page_search(mc, NULL, MDBX_PS_LAST); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; - } + int rc = check_txn(txn, MDBX_TXN_BLOCKED); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; - const MDBX_page *mp = mc->mc_pg[mc->mc_top]; - if (!MDBX_DISABLE_VALIDATION && unlikely(!CHECK_LEAF_TYPE(mc, mp))) { - ERROR("unexpected leaf-page #%" PRIaPGNO " type 0x%x seen by cursor", - mp->mp_pgno, mp->mp_flags); - return MDBX_CORRUPTED; + if ((user_flags & MDBX_CREATE) && unlikely(txn->flags & MDBX_TXN_RDONLY)) + return MDBX_EACCESS; + + /* main table? 
*/ + if (unlikely(name == MDBX_CHK_MAIN || name->iov_base == MDBX_CHK_MAIN)) { + rc = dbi_bind(txn, MAIN_DBI, user_flags, keycmp, datacmp); + if (likely(rc == MDBX_SUCCESS)) + *dbi = MAIN_DBI; + return rc; + } + if (unlikely(name == MDBX_CHK_GC || name->iov_base == MDBX_CHK_GC)) { + rc = dbi_bind(txn, FREE_DBI, user_flags, keycmp, datacmp); + if (likely(rc == MDBX_SUCCESS)) + *dbi = FREE_DBI; + return rc; } + if (unlikely(name == MDBX_CHK_META || name->iov_base == MDBX_CHK_META)) + return MDBX_EINVAL; + if (unlikely(name->iov_len > + txn->env->leaf_nodemax - NODESIZE - sizeof(tree_t))) + return MDBX_EINVAL; + +#if MDBX_ENABLE_DBI_LOCKFREE + /* Is the DB already open? */ + const MDBX_env *const env = txn->env; + size_t free_slot = env->n_dbi; + for (size_t i = CORE_DBS; i < env->n_dbi; ++i) { + retry: + if ((env->dbs_flags[i] & DB_VALID) == 0) { + free_slot = i; + continue; + } - mc->mc_ki[mc->mc_top] = (indx_t)page_numkeys(mp) - 1; - mc->mc_flags |= C_INITIALIZED | C_EOF; + const uint32_t snap_seq = + atomic_load32(&env->dbi_seqs[i], mo_AcquireRelease); + const uint16_t snap_flags = env->dbs_flags[i]; + const MDBX_val snap_name = env->kvs[i].name; + if (user_flags != MDBX_ACCEDE && + (((user_flags ^ snap_flags) & DB_PERSISTENT_FLAGS) || + (keycmp && keycmp != env->kvs[i].clc.k.cmp) || + (datacmp && datacmp != env->kvs[i].clc.v.cmp))) + continue; + const uint32_t main_seq = + atomic_load32(&env->dbi_seqs[MAIN_DBI], mo_AcquireRelease); + MDBX_cmp_func *const snap_cmp = env->kvs[MAIN_DBI].clc.k.cmp; + if (unlikely(!(snap_flags & DB_VALID) || !snap_name.iov_base || + !snap_name.iov_len || !snap_cmp)) + continue; - if (IS_LEAF2(mp)) { - if (likely(key)) { - key->iov_len = mc->mc_db->md_xsize; - key->iov_base = page_leaf2key(mp, mc->mc_ki[mc->mc_top], key->iov_len); + const bool name_match = snap_cmp(&snap_name, name) == 0; + osal_flush_incoherent_cpu_writeback(); + if (unlikely( + snap_seq != atomic_load32(&env->dbi_seqs[i], mo_AcquireRelease) || + main_seq != + 
atomic_load32(&env->dbi_seqs[MAIN_DBI], mo_AcquireRelease) || + snap_flags != env->dbs_flags[i] || + snap_name.iov_base != env->kvs[i].name.iov_base || + snap_name.iov_len != env->kvs[i].name.iov_len)) + goto retry; + if (name_match) { + rc = dbi_check(txn, i); + if (rc == MDBX_BAD_DBI && txn->dbi_state[i] == (DBI_OLDEN | DBI_LINDO)) { + /* хендл использовался, стал невалидным, + * но теперь явно пере-открывается в этой транзакци */ + eASSERT(env, !txn->cursors[i]); + txn->dbi_state[i] = DBI_LINDO; + rc = dbi_check(txn, i); + } + if (likely(rc == MDBX_SUCCESS)) { + rc = dbi_bind(txn, i, user_flags, keycmp, datacmp); + if (likely(rc == MDBX_SUCCESS)) + *dbi = (MDBX_dbi)i; + } + return rc; } - return MDBX_SUCCESS; } - MDBX_node *node = page_node(mp, mc->mc_ki[mc->mc_top]); - if (node_flags(node) & F_DUPDATA) { - rc = cursor_xinit1(mc, node, mp); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; - MDBX_ANALYSIS_ASSUME(mc->mc_xcursor != nullptr); - rc = cursor_last(&mc->mc_xcursor->mx_cursor, data, NULL); - if (unlikely(rc)) - return rc; - } else if (likely(data)) { - rc = node_read(mc, node, data, mp); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; + /* Fail, if no free slot and max hit */ + if (unlikely(free_slot >= env->max_dbi)) + return MDBX_DBS_FULL; +#endif /* MDBX_ENABLE_DBI_LOCKFREE */ + + rc = osal_fastmutex_acquire(&txn->env->dbi_lock); + if (likely(rc == MDBX_SUCCESS)) { + rc = dbi_open_locked(txn, user_flags, dbi, keycmp, datacmp, *name); + ENSURE(txn->env, + osal_fastmutex_release(&txn->env->dbi_lock) == MDBX_SUCCESS); } + return rc; +} - get_key_optional(node, key); - return MDBX_SUCCESS; +static int dbi_open_cstr(MDBX_txn *txn, const char *name_cstr, + MDBX_db_flags_t flags, MDBX_dbi *dbi, + MDBX_cmp_func *keycmp, MDBX_cmp_func *datacmp) { + MDBX_val thunk, *name; + if (name_cstr == MDBX_CHK_MAIN || name_cstr == MDBX_CHK_GC || + name_cstr == MDBX_CHK_META) + name = (void *)name_cstr; + else { + thunk.iov_len = strlen(name_cstr); + thunk.iov_base = 
(void *)name_cstr; + name = &thunk; + } + return dbi_open(txn, name, flags, dbi, keycmp, datacmp); } -static __hot int cursor_get(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, - MDBX_cursor_op op) { - int (*mfunc)(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data); - int rc; +struct dbi_rename_result { + defer_free_item_t *defer; + int err; +}; - switch (op) { - case MDBX_GET_CURRENT: { - if (unlikely(!(mc->mc_flags & C_INITIALIZED))) - return MDBX_ENODATA; - const MDBX_page *mp = mc->mc_pg[mc->mc_top]; - if (!MDBX_DISABLE_VALIDATION && unlikely(!CHECK_LEAF_TYPE(mc, mp))) { - ERROR("unexpected leaf-page #%" PRIaPGNO " type 0x%x seen by cursor", - mp->mp_pgno, mp->mp_flags); - return MDBX_CORRUPTED; - } - const size_t nkeys = page_numkeys(mp); - if (unlikely(mc->mc_ki[mc->mc_top] >= nkeys)) { - cASSERT(mc, nkeys <= UINT16_MAX); - if (mc->mc_flags & C_EOF) - return MDBX_ENODATA; - mc->mc_ki[mc->mc_top] = (uint16_t)nkeys; - mc->mc_flags |= C_EOF; - return MDBX_NOTFOUND; - } - cASSERT(mc, nkeys > 0); +__cold static struct dbi_rename_result +dbi_rename_locked(MDBX_txn *txn, MDBX_dbi dbi, MDBX_val new_name) { + struct dbi_rename_result pair; + pair.defer = nullptr; + pair.err = dbi_check(txn, dbi); + if (unlikely(pair.err != MDBX_SUCCESS)) + return pair; - rc = MDBX_SUCCESS; - if (IS_LEAF2(mp)) { - key->iov_len = mc->mc_db->md_xsize; - key->iov_base = page_leaf2key(mp, mc->mc_ki[mc->mc_top], key->iov_len); - } else { - MDBX_node *node = page_node(mp, mc->mc_ki[mc->mc_top]); - get_key_optional(node, key); - if (data) { - if (node_flags(node) & F_DUPDATA) { - if (unlikely(!(mc->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED))) { - rc = cursor_xinit1(mc, node, mp); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; - rc = cursor_first(&mc->mc_xcursor->mx_cursor, data, NULL); - if (unlikely(rc)) - return rc; - } else { - rc = cursor_get(&mc->mc_xcursor->mx_cursor, data, NULL, - MDBX_GET_CURRENT); - if (unlikely(rc)) - return rc; - } - } else { - cASSERT(mc, !mc->mc_xcursor || 
!(mc->mc_xcursor->mx_cursor.mc_flags & - C_INITIALIZED)); - rc = node_read(mc, node, data, mp); - if (unlikely(rc)) - return rc; - } - } - } - break; + MDBX_env *const env = txn->env; + MDBX_val old_name = env->kvs[dbi].name; + if (env->kvs[MAIN_DBI].clc.k.cmp(&new_name, &old_name) == 0 && + MDBX_DEBUG == 0) + return pair; + + cursor_couple_t cx; + pair.err = cursor_init(&cx.outer, txn, MAIN_DBI); + if (unlikely(pair.err != MDBX_SUCCESS)) + return pair; + pair.err = cursor_seek(&cx.outer, &new_name, nullptr, MDBX_SET).err; + if (unlikely(pair.err != MDBX_NOTFOUND)) { + pair.err = (pair.err == MDBX_SUCCESS) ? MDBX_KEYEXIST : pair.err; + return pair; } - case MDBX_GET_BOTH: - case MDBX_GET_BOTH_RANGE: - if (unlikely(data == NULL)) - return MDBX_EINVAL; - if (unlikely(mc->mc_xcursor == NULL)) - return MDBX_INCOMPATIBLE; - /* fall through */ - __fallthrough; - case MDBX_SET: - case MDBX_SET_KEY: - case MDBX_SET_RANGE: - if (unlikely(key == NULL)) - return MDBX_EINVAL; - rc = cursor_set(mc, key, data, op).err; - if (mc->mc_flags & C_INITIALIZED) { - cASSERT(mc, mc->mc_snum > 0 && mc->mc_top < mc->mc_snum); - cASSERT(mc, mc->mc_ki[mc->mc_top] < page_numkeys(mc->mc_pg[mc->mc_top]) || - (mc->mc_flags & C_EOF)); - } - break; - case MDBX_GET_MULTIPLE: - if (unlikely(!data)) - return MDBX_EINVAL; - if (unlikely((mc->mc_db->md_flags & MDBX_DUPFIXED) == 0)) - return MDBX_INCOMPATIBLE; - if ((mc->mc_flags & C_INITIALIZED) == 0) { - if (unlikely(!key)) - return MDBX_EINVAL; - rc = cursor_set(mc, key, data, MDBX_SET).err; - if (unlikely(rc != MDBX_SUCCESS)) - break; - } - rc = MDBX_SUCCESS; - if (unlikely(C_INITIALIZED != (mc->mc_xcursor->mx_cursor.mc_flags & - (C_INITIALIZED | C_EOF)))) { - rc = MDBX_NOTFOUND; - break; - } - goto fetch_multiple; - case MDBX_NEXT_MULTIPLE: - if (unlikely(!data)) - return MDBX_EINVAL; - if (unlikely(!(mc->mc_db->md_flags & MDBX_DUPFIXED))) - return MDBX_INCOMPATIBLE; - rc = cursor_next(mc, key, data, MDBX_NEXT_DUP); - if (rc == MDBX_SUCCESS) { - if 
(mc->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED) { - fetch_multiple:; - MDBX_cursor *mx = &mc->mc_xcursor->mx_cursor; - data->iov_len = - page_numkeys(mx->mc_pg[mx->mc_top]) * mx->mc_db->md_xsize; - data->iov_base = page_data(mx->mc_pg[mx->mc_top]); - mx->mc_ki[mx->mc_top] = (indx_t)page_numkeys(mx->mc_pg[mx->mc_top]) - 1; - } else { - rc = MDBX_NOTFOUND; - } - } - break; - case MDBX_PREV_MULTIPLE: - if (unlikely(!data)) - return MDBX_EINVAL; - if (!(mc->mc_db->md_flags & MDBX_DUPFIXED)) - return MDBX_INCOMPATIBLE; - rc = MDBX_SUCCESS; - if ((mc->mc_flags & C_INITIALIZED) == 0) - rc = cursor_last(mc, key, data); - if (rc == MDBX_SUCCESS) { - MDBX_cursor *mx = &mc->mc_xcursor->mx_cursor; - rc = MDBX_NOTFOUND; - if (mx->mc_flags & C_INITIALIZED) { - rc = cursor_sibling(mx, SIBLING_LEFT); - if (rc == MDBX_SUCCESS) - goto fetch_multiple; - } - } - break; - case MDBX_NEXT: - case MDBX_NEXT_DUP: - case MDBX_NEXT_NODUP: - rc = cursor_next(mc, key, data, op); - break; - case MDBX_PREV: - case MDBX_PREV_DUP: - case MDBX_PREV_NODUP: - rc = cursor_prev(mc, key, data, op); - break; - case MDBX_FIRST: - rc = cursor_first(mc, key, data); - break; - case MDBX_FIRST_DUP: - mfunc = cursor_first; - move: - if (unlikely(data == NULL || !(mc->mc_flags & C_INITIALIZED))) - return MDBX_EINVAL; - if (unlikely(mc->mc_xcursor == NULL)) - return MDBX_INCOMPATIBLE; - if (mc->mc_ki[mc->mc_top] >= page_numkeys(mc->mc_pg[mc->mc_top])) { - mc->mc_ki[mc->mc_top] = (indx_t)page_numkeys(mc->mc_pg[mc->mc_top]); - mc->mc_flags |= C_EOF; - return MDBX_NOTFOUND; - } else { - MDBX_node *node = page_node(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); - if (!(node_flags(node) & F_DUPDATA)) { - get_key_optional(node, key); - rc = node_read(mc, node, data, mc->mc_pg[mc->mc_top]); - break; - } - } - if (unlikely(!(mc->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED))) - return MDBX_EINVAL; - rc = mfunc(&mc->mc_xcursor->mx_cursor, data, NULL); - break; - case MDBX_LAST: - rc = cursor_last(mc, key, data); - 
break; - case MDBX_LAST_DUP: - mfunc = cursor_last; - goto move; - case MDBX_SET_UPPERBOUND: /* mostly same as MDBX_SET_LOWERBOUND */ - case MDBX_SET_LOWERBOUND: { - if (unlikely(key == NULL || data == NULL)) - return MDBX_EINVAL; - MDBX_val save_data = *data; - struct cursor_set_result csr = cursor_set(mc, key, data, MDBX_SET_RANGE); - rc = csr.err; - if (rc == MDBX_SUCCESS && csr.exact && mc->mc_xcursor) { - mc->mc_flags &= ~C_DEL; - csr.exact = false; - if (!save_data.iov_base && (mc->mc_db->md_flags & MDBX_DUPFIXED)) { - /* Avoiding search nested dupfixed hive if no data provided. - * This is changes the semantic of MDBX_SET_LOWERBOUND but avoid - * returning MDBX_BAD_VALSIZE. */ - } else if (mc->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED) { - *data = save_data; - csr = - cursor_set(&mc->mc_xcursor->mx_cursor, data, NULL, MDBX_SET_RANGE); - rc = csr.err; - if (rc == MDBX_NOTFOUND) { - cASSERT(mc, !csr.exact); - rc = cursor_next(mc, key, data, MDBX_NEXT_NODUP); - } - } else { - int cmp = mc->mc_dbx->md_dcmp(&save_data, data); - csr.exact = (cmp == 0); - if (cmp > 0) - rc = cursor_next(mc, key, data, MDBX_NEXT_NODUP); - } - } - if (rc == MDBX_SUCCESS && !csr.exact) - rc = MDBX_RESULT_TRUE; - if (unlikely(op == MDBX_SET_UPPERBOUND)) { - /* minor fixups for MDBX_SET_UPPERBOUND */ - if (rc == MDBX_RESULT_TRUE) - /* already at great-than by MDBX_SET_LOWERBOUND */ - rc = MDBX_SUCCESS; - else if (rc == MDBX_SUCCESS) - /* exactly match, going next */ - rc = cursor_next(mc, key, data, MDBX_NEXT); - } - break; + pair.defer = osal_malloc(dbi_namelen(new_name)); + if (unlikely(!pair.defer)) { + pair.err = MDBX_ENOMEM; + return pair; } + new_name.iov_base = memcpy(pair.defer, new_name.iov_base, new_name.iov_len); - /* Doubtless API to positioning of the cursor at a specified key. 
*/ - case MDBX_TO_KEY_LESSER_THAN: - case MDBX_TO_KEY_LESSER_OR_EQUAL: - case MDBX_TO_KEY_EQUAL: - case MDBX_TO_KEY_GREATER_OR_EQUAL: - case MDBX_TO_KEY_GREATER_THAN: { - if (unlikely(key == NULL)) - return MDBX_EINVAL; - struct cursor_set_result csr = cursor_set(mc, key, data, MDBX_SET_RANGE); - rc = csr.err; - if (csr.exact) { - cASSERT(mc, csr.err == MDBX_SUCCESS); - if (op == MDBX_TO_KEY_LESSER_THAN) - rc = cursor_prev(mc, key, data, MDBX_PREV_NODUP); - else if (op == MDBX_TO_KEY_GREATER_THAN) - rc = cursor_next(mc, key, data, MDBX_NEXT_NODUP); - } else if (op < MDBX_TO_KEY_EQUAL && - (rc == MDBX_NOTFOUND || rc == MDBX_SUCCESS)) - rc = cursor_prev(mc, key, data, MDBX_PREV_NODUP); - else if (op == MDBX_TO_KEY_EQUAL && rc == MDBX_SUCCESS) - rc = MDBX_NOTFOUND; - break; - } + cx.outer.next = txn->cursors[MAIN_DBI]; + txn->cursors[MAIN_DBI] = &cx.outer; - /* Doubtless API to positioning of the cursor at a specified key-value pair - * for multi-value hives. */ - case MDBX_TO_EXACT_KEY_VALUE_LESSER_THAN: - case MDBX_TO_EXACT_KEY_VALUE_LESSER_OR_EQUAL: - case MDBX_TO_EXACT_KEY_VALUE_EQUAL: - case MDBX_TO_EXACT_KEY_VALUE_GREATER_OR_EQUAL: - case MDBX_TO_EXACT_KEY_VALUE_GREATER_THAN: { - if (unlikely(key == NULL || data == NULL)) - return MDBX_EINVAL; - MDBX_val save_data = *data; - struct cursor_set_result csr = cursor_set(mc, key, data, MDBX_SET_KEY); - rc = csr.err; - if (rc == MDBX_SUCCESS) { - cASSERT(mc, csr.exact); - MDBX_cursor *const mx = - (mc->mc_xcursor && - (mc->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED)) - ? 
&mc->mc_xcursor->mx_cursor - : nullptr; - if (mx) { - csr = cursor_set(mx, &save_data, NULL, MDBX_SET_RANGE); - rc = csr.err; - if (csr.exact) { - cASSERT(mc, csr.err == MDBX_SUCCESS); - if (op == MDBX_TO_EXACT_KEY_VALUE_LESSER_THAN) - rc = cursor_prev(mx, data, NULL, MDBX_PREV); - else if (op == MDBX_TO_EXACT_KEY_VALUE_GREATER_THAN) - rc = cursor_next(mx, data, NULL, MDBX_NEXT); - } else if (op < MDBX_TO_EXACT_KEY_VALUE_EQUAL && - (rc == MDBX_NOTFOUND || rc == MDBX_SUCCESS)) - rc = cursor_prev(mx, data, NULL, MDBX_PREV); - else if (op == MDBX_TO_EXACT_KEY_VALUE_EQUAL && rc == MDBX_SUCCESS) - rc = MDBX_NOTFOUND; - } else { - int cmp = mc->mc_dbx->md_dcmp(data, &save_data); - switch (op) { - default: - __unreachable(); - case MDBX_TO_EXACT_KEY_VALUE_LESSER_THAN: - rc = (cmp < 0) ? MDBX_SUCCESS : MDBX_NOTFOUND; - break; - case MDBX_TO_EXACT_KEY_VALUE_LESSER_OR_EQUAL: - rc = (cmp <= 0) ? MDBX_SUCCESS : MDBX_NOTFOUND; - break; - case MDBX_TO_EXACT_KEY_VALUE_EQUAL: - rc = (cmp == 0) ? MDBX_SUCCESS : MDBX_NOTFOUND; - break; - case MDBX_TO_EXACT_KEY_VALUE_GREATER_OR_EQUAL: - rc = (cmp >= 0) ? MDBX_SUCCESS : MDBX_NOTFOUND; - break; - case MDBX_TO_EXACT_KEY_VALUE_GREATER_THAN: - rc = (cmp > 0) ? MDBX_SUCCESS : MDBX_NOTFOUND; - break; - } - } - } - break; - } - case MDBX_TO_PAIR_LESSER_THAN: - case MDBX_TO_PAIR_LESSER_OR_EQUAL: - case MDBX_TO_PAIR_EQUAL: - case MDBX_TO_PAIR_GREATER_OR_EQUAL: - case MDBX_TO_PAIR_GREATER_THAN: { - if (unlikely(key == NULL || data == NULL)) - return MDBX_EINVAL; - MDBX_val save_data = *data; - struct cursor_set_result csr = cursor_set(mc, key, data, MDBX_SET_RANGE); - rc = csr.err; - if (csr.exact) { - cASSERT(mc, csr.err == MDBX_SUCCESS); - MDBX_cursor *const mx = - (mc->mc_xcursor && - (mc->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED)) - ? 
&mc->mc_xcursor->mx_cursor - : nullptr; - if (mx) { - csr = cursor_set(mx, &save_data, NULL, MDBX_SET_RANGE); - rc = csr.err; - if (csr.exact) { - cASSERT(mc, csr.err == MDBX_SUCCESS); - if (op == MDBX_TO_PAIR_LESSER_THAN) - rc = cursor_prev(mc, key, data, MDBX_PREV); - else if (op == MDBX_TO_PAIR_GREATER_THAN) - rc = cursor_next(mc, key, data, MDBX_NEXT); - } else if (op < MDBX_TO_PAIR_EQUAL && - (rc == MDBX_NOTFOUND || rc == MDBX_SUCCESS)) - rc = cursor_prev(mc, key, data, MDBX_PREV); - else if (op == MDBX_TO_PAIR_EQUAL && rc == MDBX_SUCCESS) - rc = MDBX_NOTFOUND; - else if (op > MDBX_TO_PAIR_EQUAL && rc == MDBX_NOTFOUND) - rc = cursor_next(mc, key, data, MDBX_NEXT); - } else { - int cmp = mc->mc_dbx->md_dcmp(data, &save_data); - switch (op) { - default: - __unreachable(); - case MDBX_TO_PAIR_LESSER_THAN: - rc = (cmp < 0) ? MDBX_SUCCESS : cursor_prev(mc, key, data, MDBX_PREV); - break; - case MDBX_TO_PAIR_LESSER_OR_EQUAL: - rc = - (cmp <= 0) ? MDBX_SUCCESS : cursor_prev(mc, key, data, MDBX_PREV); - break; - case MDBX_TO_PAIR_EQUAL: - rc = (cmp == 0) ? MDBX_SUCCESS : MDBX_NOTFOUND; - break; - case MDBX_TO_PAIR_GREATER_OR_EQUAL: - rc = - (cmp >= 0) ? MDBX_SUCCESS : cursor_next(mc, key, data, MDBX_NEXT); - break; - case MDBX_TO_PAIR_GREATER_THAN: - rc = (cmp > 0) ? 
MDBX_SUCCESS : cursor_next(mc, key, data, MDBX_NEXT); - break; - } - } - } else if (op < MDBX_TO_PAIR_EQUAL && - (rc == MDBX_NOTFOUND || rc == MDBX_SUCCESS)) - rc = cursor_prev(mc, key, data, MDBX_PREV_NODUP); - else if (op == MDBX_TO_PAIR_EQUAL && rc == MDBX_SUCCESS) - rc = MDBX_NOTFOUND; - break; - } - default: - DEBUG("unhandled/unimplemented cursor operation %u", op); - return MDBX_EINVAL; + MDBX_val data = {&txn->dbs[dbi], sizeof(tree_t)}; + pair.err = cursor_put_checklen(&cx.outer, &new_name, &data, + N_SUBDATA | MDBX_NOOVERWRITE); + if (likely(pair.err == MDBX_SUCCESS)) { + pair.err = cursor_seek(&cx.outer, &old_name, nullptr, MDBX_SET).err; + if (likely(pair.err == MDBX_SUCCESS)) + pair.err = cursor_del(&cx.outer, N_SUBDATA); + if (likely(pair.err == MDBX_SUCCESS)) { + pair.defer = env->kvs[dbi].name.iov_base; + env->kvs[dbi].name = new_name; + } else + txn->flags |= MDBX_TXN_ERROR; } - mc->mc_flags &= ~C_DEL; - return rc; + txn->cursors[MAIN_DBI] = cx.outer.next; + return pair; } -int mdbx_cursor_get(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, - MDBX_cursor_op op) { - if (unlikely(mc == NULL)) - return MDBX_EINVAL; +static defer_free_item_t *dbi_close_locked(MDBX_env *env, MDBX_dbi dbi) { + eASSERT(env, dbi >= CORE_DBS); + if (unlikely(dbi >= env->n_dbi)) + return nullptr; - if (unlikely(mc->mc_signature != MDBX_MC_LIVE)) - return (mc->mc_signature == MDBX_MC_READY4CLOSE) ? 
MDBX_EINVAL - : MDBX_EBADSIGN; + const uint32_t seq = dbi_seq_next(env, dbi); + defer_free_item_t *defer_item = env->kvs[dbi].name.iov_base; + if (likely(defer_item)) { + env->dbs_flags[dbi] = 0; + env->kvs[dbi].name.iov_len = 0; + env->kvs[dbi].name.iov_base = nullptr; + atomic_store32(&env->dbi_seqs[dbi], seq, mo_AcquireRelease); + osal_flush_incoherent_cpu_writeback(); + defer_item->next = nullptr; - int rc = check_txn(mc->mc_txn, MDBX_TXN_BLOCKED); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; + if (env->n_dbi == dbi + 1) { + size_t i = env->n_dbi; + do { + --i; + eASSERT(env, i >= CORE_DBS); + eASSERT(env, !env->dbs_flags[i] && !env->kvs[i].name.iov_len && + !env->kvs[i].name.iov_base); + } while (i > CORE_DBS && !env->kvs[i - 1].name.iov_base); + env->n_dbi = (unsigned)i; + } + } - return cursor_get(mc, key, data, op); + return defer_item; } -int mdbx_cursor_scan(MDBX_cursor *mc, MDBX_predicate_func *predicate, - void *context, MDBX_cursor_op start_op, - MDBX_cursor_op turn_op, void *arg) { - if (unlikely(!predicate)) - return MDBX_EINVAL; - - const unsigned valid_start_mask = - 1 << MDBX_FIRST | 1 << MDBX_FIRST_DUP | 1 << MDBX_LAST | - 1 << MDBX_LAST_DUP | 1 << MDBX_GET_CURRENT | 1 << MDBX_GET_MULTIPLE; - if (unlikely(start_op > 30 || ((1 << start_op) & valid_start_mask) == 0)) - return MDBX_EINVAL; +/*----------------------------------------------------------------------------*/ +/* API */ - const unsigned valid_turn_mask = - 1 << MDBX_NEXT | 1 << MDBX_NEXT_DUP | 1 << MDBX_NEXT_NODUP | - 1 << MDBX_PREV | 1 << MDBX_PREV_DUP | 1 << MDBX_PREV_NODUP | - 1 << MDBX_NEXT_MULTIPLE | 1 << MDBX_PREV_MULTIPLE; - if (unlikely(turn_op > 30 || ((1 << turn_op) & valid_turn_mask) == 0)) - return MDBX_EINVAL; +int mdbx_dbi_open(MDBX_txn *txn, const char *name, MDBX_db_flags_t flags, + MDBX_dbi *dbi) { + return dbi_open_cstr(txn, name, flags, dbi, nullptr, nullptr); +} - MDBX_val key = {nullptr, 0}, data = {nullptr, 0}; - int rc = mdbx_cursor_get(mc, &key, &data, 
start_op); - while (likely(rc == MDBX_SUCCESS)) { - rc = predicate(context, &key, &data, arg); - if (rc != MDBX_RESULT_FALSE) - return rc; - rc = cursor_get(mc, &key, &data, turn_op); - } - return (rc == MDBX_NOTFOUND) ? MDBX_RESULT_FALSE : rc; +int mdbx_dbi_open2(MDBX_txn *txn, const MDBX_val *name, MDBX_db_flags_t flags, + MDBX_dbi *dbi) { + return dbi_open(txn, name, flags, dbi, nullptr, nullptr); } -int mdbx_cursor_scan_from(MDBX_cursor *mc, MDBX_predicate_func *predicate, - void *context, MDBX_cursor_op from_op, MDBX_val *key, - MDBX_val *value, MDBX_cursor_op turn_op, void *arg) { - if (unlikely(!predicate)) - return MDBX_EINVAL; +int mdbx_dbi_open_ex(MDBX_txn *txn, const char *name, MDBX_db_flags_t flags, + MDBX_dbi *dbi, MDBX_cmp_func *keycmp, + MDBX_cmp_func *datacmp) { + return dbi_open_cstr(txn, name, flags, dbi, keycmp, datacmp); +} - const unsigned valid_start_mask = - 1 << MDBX_GET_BOTH | 1 << MDBX_GET_BOTH_RANGE | 1 << MDBX_SET_KEY | - 1 << MDBX_GET_MULTIPLE | 1 << MDBX_SET_LOWERBOUND | - 1 << MDBX_SET_UPPERBOUND; - ; - if (unlikely(from_op < MDBX_TO_KEY_LESSER_THAN && - ((1 << from_op) & valid_start_mask) == 0)) - return MDBX_EINVAL; +int mdbx_dbi_open_ex2(MDBX_txn *txn, const MDBX_val *name, + MDBX_db_flags_t flags, MDBX_dbi *dbi, + MDBX_cmp_func *keycmp, MDBX_cmp_func *datacmp) { + return dbi_open(txn, name, flags, dbi, keycmp, datacmp); +} - const unsigned valid_turn_mask = - 1 << MDBX_NEXT | 1 << MDBX_NEXT_DUP | 1 << MDBX_NEXT_NODUP | - 1 << MDBX_PREV | 1 << MDBX_PREV_DUP | 1 << MDBX_PREV_NODUP | - 1 << MDBX_NEXT_MULTIPLE | 1 << MDBX_PREV_MULTIPLE; - if (unlikely(turn_op > 30 || ((1 << turn_op) & valid_turn_mask) == 0)) - return MDBX_EINVAL; +__cold int mdbx_drop(MDBX_txn *txn, MDBX_dbi dbi, bool del) { + int rc = check_txn_rw(txn, MDBX_TXN_BLOCKED); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; - int rc = mdbx_cursor_get(mc, key, value, from_op); - if (unlikely(MDBX_IS_ERROR(rc))) + cursor_couple_t cx; + rc = cursor_init(&cx.outer, txn, 
dbi); + if (unlikely(rc != MDBX_SUCCESS)) return rc; - cASSERT(mc, key != nullptr); - MDBX_val stub; - if (!value) { - value = &stub; - rc = cursor_get(mc, key, value, MDBX_GET_CURRENT); + if (txn->dbs[dbi].height) { + cx.outer.next = txn->cursors[dbi]; + txn->cursors[dbi] = &cx.outer; + rc = tree_drop(&cx.outer, + dbi == MAIN_DBI || (cx.outer.tree->flags & MDBX_DUPSORT)); + txn->cursors[dbi] = cx.outer.next; if (unlikely(rc != MDBX_SUCCESS)) return rc; } - for (;;) { - rc = predicate(context, key, value, arg); - if (rc != MDBX_RESULT_FALSE) - return rc; - rc = cursor_get(mc, key, value, turn_op); - if (rc != MDBX_SUCCESS) - return (rc == MDBX_NOTFOUND) ? MDBX_RESULT_FALSE : rc; - } -} - -static int cursor_first_batch(MDBX_cursor *mc) { - if (!(mc->mc_flags & C_INITIALIZED) || mc->mc_top) { - int err = page_search(mc, NULL, MDBX_PS_FIRST); - if (unlikely(err != MDBX_SUCCESS)) - return err; - } - cASSERT(mc, IS_LEAF(mc->mc_pg[mc->mc_top])); - - mc->mc_flags |= C_INITIALIZED; - mc->mc_flags &= ~C_EOF; - mc->mc_ki[mc->mc_top] = 0; - return MDBX_SUCCESS; -} -static int cursor_next_batch(MDBX_cursor *mc) { - if (unlikely(!(mc->mc_flags & C_INITIALIZED))) - return cursor_first_batch(mc); + /* Invalidate the dropped DB's cursors */ + for (MDBX_cursor *mc = txn->cursors[dbi]; mc; mc = mc->next) + be_poor(mc); - MDBX_page *mp = mc->mc_pg[mc->mc_top]; - if (unlikely(mc->mc_flags & C_EOF)) { - if ((size_t)mc->mc_ki[mc->mc_top] + 1 >= page_numkeys(mp)) - return MDBX_NOTFOUND; - mc->mc_flags ^= C_EOF; + if (!del || dbi < CORE_DBS) { + /* reset the DB record, mark it dirty */ + txn->dbi_state[dbi] |= DBI_DIRTY; + txn->dbs[dbi].height = 0; + txn->dbs[dbi].branch_pages = 0; + txn->dbs[dbi].leaf_pages = 0; + txn->dbs[dbi].large_pages = 0; + txn->dbs[dbi].items = 0; + txn->dbs[dbi].root = P_INVALID; + txn->dbs[dbi].sequence = 0; + /* txn->dbs[dbi].mod_txnid = txn->txnid; */ + txn->flags |= MDBX_TXN_DIRTY; + return MDBX_SUCCESS; } - intptr_t ki = mc->mc_ki[mc->mc_top]; - 
mc->mc_ki[mc->mc_top] = (indx_t)++ki; - const intptr_t numkeys = page_numkeys(mp); - if (likely(ki >= numkeys)) { - DEBUG("%s", "=====> move to next sibling page"); - mc->mc_ki[mc->mc_top] = (indx_t)(numkeys - 1); - int err = cursor_sibling(mc, SIBLING_RIGHT); - if (unlikely(err != MDBX_SUCCESS)) { - mc->mc_flags |= C_EOF; - return err; - } - mp = mc->mc_pg[mc->mc_top]; - DEBUG("next page is %" PRIaPGNO ", key index %u", mp->mp_pgno, - mc->mc_ki[mc->mc_top]); - if (!MDBX_DISABLE_VALIDATION && unlikely(!CHECK_LEAF_TYPE(mc, mp))) { - ERROR("unexpected leaf-page #%" PRIaPGNO " type 0x%x seen by cursor", - mp->mp_pgno, mp->mp_flags); - return MDBX_CORRUPTED; + MDBX_env *const env = txn->env; + MDBX_val name = env->kvs[dbi].name; + rc = cursor_init(&cx.outer, txn, MAIN_DBI); + if (likely(rc == MDBX_SUCCESS)) { + rc = cursor_seek(&cx.outer, &name, nullptr, MDBX_SET).err; + if (likely(rc == MDBX_SUCCESS)) { + cx.outer.next = txn->cursors[MAIN_DBI]; + txn->cursors[MAIN_DBI] = &cx.outer; + rc = cursor_del(&cx.outer, N_SUBDATA); + txn->cursors[MAIN_DBI] = cx.outer.next; + if (likely(rc == MDBX_SUCCESS)) { + tASSERT(txn, txn->dbi_state[MAIN_DBI] & DBI_DIRTY); + tASSERT(txn, txn->flags & MDBX_TXN_DIRTY); + txn->dbi_state[dbi] = DBI_LINDO | DBI_OLDEN; + rc = osal_fastmutex_acquire(&env->dbi_lock); + if (likely(rc == MDBX_SUCCESS)) + return defer_and_release(env, dbi_close_locked(env, dbi)); + } } } - return MDBX_SUCCESS; + txn->flags |= MDBX_TXN_ERROR; + return rc; } -int mdbx_cursor_get_batch(MDBX_cursor *mc, size_t *count, MDBX_val *pairs, - size_t limit, MDBX_cursor_op op) { - if (unlikely(mc == NULL || count == NULL || limit < 4)) - return MDBX_EINVAL; - - if (unlikely(mc->mc_signature != MDBX_MC_LIVE)) - return (mc->mc_signature == MDBX_MC_READY4CLOSE) ? 
MDBX_EINVAL - : MDBX_EBADSIGN; - - int rc = check_txn(mc->mc_txn, MDBX_TXN_BLOCKED); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; - - if (unlikely(mc->mc_db->md_flags & MDBX_DUPSORT)) - return MDBX_INCOMPATIBLE /* must be a non-dupsort subDB */; - - switch (op) { - case MDBX_FIRST: - rc = cursor_first_batch(mc); - break; - case MDBX_NEXT: - rc = cursor_next_batch(mc); - break; - case MDBX_GET_CURRENT: - rc = likely(mc->mc_flags & C_INITIALIZED) ? MDBX_SUCCESS : MDBX_ENODATA; - break; - default: - DEBUG("unhandled/unimplemented cursor operation %u", op); - rc = MDBX_EINVAL; - break; +__cold int mdbx_dbi_rename(MDBX_txn *txn, MDBX_dbi dbi, const char *name_cstr) { + MDBX_val thunk, *name; + if (name_cstr == MDBX_CHK_MAIN || name_cstr == MDBX_CHK_GC || + name_cstr == MDBX_CHK_META) + name = (void *)name_cstr; + else { + thunk.iov_len = strlen(name_cstr); + thunk.iov_base = (void *)name_cstr; + name = &thunk; } + return mdbx_dbi_rename2(txn, dbi, name); +} - if (unlikely(rc != MDBX_SUCCESS)) { - *count = 0; +int mdbx_dbi_close(MDBX_env *env, MDBX_dbi dbi) { + int rc = check_env(env, true); + if (unlikely(rc != MDBX_SUCCESS)) return rc; - } - const MDBX_page *const mp = mc->mc_pg[mc->mc_top]; - if (!MDBX_DISABLE_VALIDATION && unlikely(!CHECK_LEAF_TYPE(mc, mp))) { - ERROR("unexpected leaf-page #%" PRIaPGNO " type 0x%x seen by cursor", - mp->mp_pgno, mp->mp_flags); - return MDBX_CORRUPTED; - } - const size_t nkeys = page_numkeys(mp); - size_t i = mc->mc_ki[mc->mc_top], n = 0; - if (unlikely(i >= nkeys)) { - cASSERT(mc, op == MDBX_GET_CURRENT); - cASSERT(mc, mdbx_cursor_on_last(mc) == MDBX_RESULT_TRUE); - *count = 0; - if (mc->mc_flags & C_EOF) { - cASSERT(mc, mdbx_cursor_on_last(mc) == MDBX_RESULT_TRUE); - return MDBX_ENODATA; - } - if (mdbx_cursor_on_last(mc) != MDBX_RESULT_TRUE) - return MDBX_EINVAL /* again MDBX_GET_CURRENT after MDBX_GET_CURRENT */; - mc->mc_flags |= C_EOF; - return MDBX_NOTFOUND; - } + if (unlikely(dbi < CORE_DBS)) + return (dbi == MAIN_DBI) ? 
MDBX_SUCCESS : MDBX_BAD_DBI; - do { - if (unlikely(n + 2 > limit)) { - rc = MDBX_RESULT_TRUE; - break; - } - const MDBX_node *leaf = page_node(mp, i); - get_key(leaf, &pairs[n]); - rc = node_read(mc, leaf, &pairs[n + 1], mp); - if (unlikely(rc != MDBX_SUCCESS)) - break; - n += 2; - } while (++i < nkeys); + if (unlikely(dbi >= env->max_dbi)) + return MDBX_BAD_DBI; - mc->mc_ki[mc->mc_top] = (indx_t)i; - *count = n; + if (unlikely(dbi < CORE_DBS || dbi >= env->max_dbi)) + return MDBX_BAD_DBI; + + rc = osal_fastmutex_acquire(&env->dbi_lock); + if (likely(rc == MDBX_SUCCESS)) + rc = defer_and_release(env, dbi_close_locked(env, dbi)); return rc; } -static int touch_dbi(MDBX_cursor *mc) { - cASSERT(mc, (*mc->mc_dbi_state & DBI_DIRTY) == 0); - *mc->mc_dbi_state |= DBI_DIRTY; - mc->mc_txn->mt_flags |= MDBX_TXN_DIRTY; - if (mc->mc_dbi >= CORE_DBS) { - /* Touch DB record of named DB */ - MDBX_cursor_couple cx; - int rc = dbi_check(mc->mc_txn, MAIN_DBI); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; - rc = cursor_init(&cx.outer, mc->mc_txn, MAIN_DBI); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; - mc->mc_txn->mt_dbi_state[MAIN_DBI] |= DBI_DIRTY; - rc = page_search(&cx.outer, &mc->mc_dbx->md_name, MDBX_PS_MODIFY); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; - } - return MDBX_SUCCESS; -} +int mdbx_dbi_flags_ex(const MDBX_txn *txn, MDBX_dbi dbi, unsigned *flags, + unsigned *state) { + int rc = check_txn(txn, MDBX_TXN_BLOCKED - MDBX_TXN_ERROR); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; -static __hot int cursor_touch(MDBX_cursor *const mc, const MDBX_val *key, - const MDBX_val *data) { - cASSERT(mc, (mc->mc_txn->mt_flags & MDBX_TXN_RDONLY) == 0); - cASSERT(mc, (mc->mc_flags & C_INITIALIZED) || mc->mc_snum == 0); - cASSERT(mc, cursor_is_tracked(mc)); + if (unlikely(!flags || !state)) + return MDBX_EINVAL; - cASSERT(mc, F_ISSET(dbi_state(mc->mc_txn, FREE_DBI), DBI_LINDO | DBI_VALID)); - cASSERT(mc, F_ISSET(dbi_state(mc->mc_txn, MAIN_DBI), DBI_LINDO | DBI_VALID)); - 
if ((mc->mc_flags & C_SUB) == 0) { - MDBX_txn *const txn = mc->mc_txn; - txn_lru_turn(txn); + rc = dbi_check(txn, dbi); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; - if (unlikely((*mc->mc_dbi_state & DBI_DIRTY) == 0)) { - int err = touch_dbi(mc); - if (unlikely(err != MDBX_SUCCESS)) - return err; - } + *flags = txn->dbs[dbi].flags & DB_PERSISTENT_FLAGS; + *state = + txn->dbi_state[dbi] & (DBI_FRESH | DBI_CREAT | DBI_DIRTY | DBI_STALE); - /* Estimate how much space this operation will take: */ - /* 1) Max b-tree height, reasonable enough with including dups' sub-tree */ - size_t need = CURSOR_STACK + 3; - /* 2) GC/FreeDB for any payload */ - if (mc->mc_dbi > FREE_DBI) { - need += txn->mt_dbs[FREE_DBI].md_depth + (size_t)3; - /* 3) Named DBs also dirty the main DB */ - if (mc->mc_dbi > MAIN_DBI) - need += txn->mt_dbs[MAIN_DBI].md_depth + (size_t)3; - } -#if xMDBX_DEBUG_SPILLING != 2 - /* production mode */ - /* 4) Double the page chain estimation - * for extensively splitting, rebalance and merging */ - need += need; - /* 5) Factor the key+data which to be put in */ - need += bytes2pgno(txn->mt_env, node_size(key, data)) + (size_t)1; -#else - /* debug mode */ - (void)key; - (void)data; - txn->mt_env->debug_dirtied_est = ++need; - txn->mt_env->debug_dirtied_act = 0; -#endif /* xMDBX_DEBUG_SPILLING == 2 */ + return MDBX_SUCCESS; +} - int err = txn_spill(txn, mc, need); - if (unlikely(err != MDBX_SUCCESS)) - return err; - } +__cold int mdbx_dbi_rename2(MDBX_txn *txn, MDBX_dbi dbi, + const MDBX_val *new_name) { + int rc = check_txn_rw(txn, MDBX_TXN_BLOCKED); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; - int rc = MDBX_SUCCESS; - if (likely(mc->mc_snum) && - !IS_MODIFIABLE(mc->mc_txn, mc->mc_pg[mc->mc_snum - 1])) { - mc->mc_top = 0; - do { - rc = page_touch(mc); - if (unlikely(rc != MDBX_SUCCESS)) - break; - mc->mc_top += 1; - } while (mc->mc_top < mc->mc_snum); - mc->mc_top = mc->mc_snum - 1; + if (unlikely(new_name == MDBX_CHK_MAIN || + new_name->iov_base == 
MDBX_CHK_MAIN || new_name == MDBX_CHK_GC || + new_name->iov_base == MDBX_CHK_GC || new_name == MDBX_CHK_META || + new_name->iov_base == MDBX_CHK_META)) + return MDBX_EINVAL; + + if (unlikely(dbi < CORE_DBS)) + return MDBX_EINVAL; + rc = dbi_check(txn, dbi); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; + + rc = osal_fastmutex_acquire(&txn->env->dbi_lock); + if (likely(rc == MDBX_SUCCESS)) { + struct dbi_rename_result pair = dbi_rename_locked(txn, dbi, *new_name); + if (pair.defer) + pair.defer->next = nullptr; + defer_and_release(txn->env, pair.defer); + rc = pair.err; } return rc; } -static size_t leaf2_reserve(const MDBX_env *const env, size_t host_page_room, - size_t subpage_len, size_t item_len) { - eASSERT(env, (subpage_len & 1) == 0); - eASSERT(env, - env->me_subpage_reserve_prereq > env->me_subpage_room_threshold + - env->me_subpage_reserve_limit && - env->me_leaf_nodemax >= env->me_subpage_limit + NODESIZE); - size_t reserve = 0; - for (size_t n = 0; - n < 5 && reserve + item_len <= env->me_subpage_reserve_limit && - EVEN(subpage_len + item_len) <= env->me_subpage_limit && - host_page_room >= - env->me_subpage_reserve_prereq + EVEN(subpage_len + item_len); - ++n) { - subpage_len += item_len; - reserve += item_len; - } - return reserve + (subpage_len & 1); +static void stat_get(const tree_t *db, MDBX_stat *st, size_t bytes) { + st->ms_depth = db->height; + st->ms_branch_pages = db->branch_pages; + st->ms_leaf_pages = db->leaf_pages; + st->ms_overflow_pages = db->large_pages; + st->ms_entries = db->items; + if (likely(bytes >= + offsetof(MDBX_stat, ms_mod_txnid) + sizeof(st->ms_mod_txnid))) + st->ms_mod_txnid = db->mod_txnid; } -static __hot int cursor_put_nochecklen(MDBX_cursor *mc, const MDBX_val *key, - MDBX_val *data, unsigned flags) { - int err; - DKBUF_DEBUG; - MDBX_env *const env = mc->mc_txn->mt_env; - DEBUG("==> put db %d key [%s], size %" PRIuPTR ", data [%s] size %" PRIuPTR, - DDBI(mc), DKEY_DEBUG(key), key->iov_len, - DVAL_DEBUG((flags & 
MDBX_RESERVE) ? nullptr : data), data->iov_len); +__cold int mdbx_dbi_stat(const MDBX_txn *txn, MDBX_dbi dbi, MDBX_stat *dest, + size_t bytes) { + int rc = check_txn(txn, MDBX_TXN_BLOCKED); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; - if ((flags & MDBX_CURRENT) != 0 && (mc->mc_flags & C_SUB) == 0) { - if (unlikely(flags & (MDBX_APPEND | MDBX_NOOVERWRITE))) - return MDBX_EINVAL; - /* Опция MDBX_CURRENT означает, что запрошено обновление текущей записи, - * на которой сейчас стоит курсор. Проверяем что переданный ключ совпадает - * со значением в текущей позиции курсора. - * Здесь проще вызвать cursor_get(), так как для обслуживания таблиц - * с MDBX_DUPSORT также требуется текущий размер данных. */ - MDBX_val current_key, current_data; - err = cursor_get(mc, ¤t_key, ¤t_data, MDBX_GET_CURRENT); - if (unlikely(err != MDBX_SUCCESS)) - return err; - if (mc->mc_dbx->md_cmp(key, ¤t_key) != 0) - return MDBX_EKEYMISMATCH; + if (unlikely(!dest)) + return MDBX_EINVAL; - if (unlikely((flags & MDBX_MULTIPLE))) - goto drop_current; + rc = dbi_check(txn, dbi); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; - if (mc->mc_db->md_flags & MDBX_DUPSORT) { - MDBX_node *node = page_node(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); - if (node_flags(node) & F_DUPDATA) { - cASSERT(mc, mc->mc_xcursor != NULL && - (mc->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED)); - /* Если за ключом более одного значения, либо если размер данных - * отличается, то вместо обновления требуется удаление и - * последующая вставка. 
*/ - if (mc->mc_xcursor->mx_db.md_entries > 1 || - current_data.iov_len != data->iov_len) { - drop_current: - err = cursor_del(mc, flags & MDBX_ALLDUPS); - if (unlikely(err != MDBX_SUCCESS)) - return err; - flags -= MDBX_CURRENT; - goto skip_check_samedata; - } - } else if (unlikely(node_size(key, data) > env->me_leaf_nodemax)) { - err = cursor_del(mc, 0); - if (unlikely(err != MDBX_SUCCESS)) - return err; - flags -= MDBX_CURRENT; - goto skip_check_samedata; - } - } - if (!(flags & MDBX_RESERVE) && - unlikely(cmp_lenfast(¤t_data, data) == 0)) - return MDBX_SUCCESS /* the same data, nothing to update */; - skip_check_samedata:; - } + const size_t size_before_modtxnid = offsetof(MDBX_stat, ms_mod_txnid); + if (unlikely(bytes != sizeof(MDBX_stat)) && bytes != size_before_modtxnid) + return MDBX_EINVAL; - int rc = MDBX_SUCCESS; - if (mc->mc_db->md_root == P_INVALID) { - /* new database, cursor has nothing to point to */ - mc->mc_snum = 0; - mc->mc_top = 0; - mc->mc_flags &= ~C_INITIALIZED; - rc = MDBX_NO_ROOT; - } else if ((flags & MDBX_CURRENT) == 0) { - bool exact = false; - MDBX_val last_key, old_data; - if ((flags & MDBX_APPEND) && mc->mc_db->md_entries > 0) { - rc = cursor_last(mc, &last_key, &old_data); - if (likely(rc == MDBX_SUCCESS)) { - const int cmp = mc->mc_dbx->md_cmp(key, &last_key); - if (likely(cmp > 0)) { - mc->mc_ki[mc->mc_top]++; /* step forward for appending */ - rc = MDBX_NOTFOUND; - } else if (unlikely(cmp != 0)) { - /* new-key < last-key */ - return MDBX_EKEYMISMATCH; - } else { - rc = MDBX_SUCCESS; - exact = true; - } - } - } else { - struct cursor_set_result csr = - /* olddata may not be updated in case LEAF2-page of dupfixed-subDB */ - cursor_set(mc, (MDBX_val *)key, &old_data, MDBX_SET); - rc = csr.err; - exact = csr.exact; - } - if (likely(rc == MDBX_SUCCESS)) { - if (exact) { - if (unlikely(flags & MDBX_NOOVERWRITE)) { - DEBUG("duplicate key [%s]", DKEY_DEBUG(key)); - *data = old_data; - return MDBX_KEYEXIST; - } - if (unlikely(mc->mc_flags 
& C_SUB)) { - /* nested subtree of DUPSORT-database with the same key, - * nothing to update */ - eASSERT(env, data->iov_len == 0 && - (old_data.iov_len == 0 || - /* olddata may not be updated in case LEAF2-page - of dupfixed-subDB */ - (mc->mc_db->md_flags & MDBX_DUPFIXED))); - return MDBX_SUCCESS; - } - if (unlikely(flags & MDBX_ALLDUPS) && mc->mc_xcursor && - (mc->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED)) { - err = cursor_del(mc, MDBX_ALLDUPS); - if (unlikely(err != MDBX_SUCCESS)) - return err; - flags -= MDBX_ALLDUPS; - rc = mc->mc_snum ? MDBX_NOTFOUND : MDBX_NO_ROOT; - exact = false; - } else if (!(flags & (MDBX_RESERVE | MDBX_MULTIPLE))) { - /* checking for early exit without dirtying pages */ - if (unlikely(eq_fast(data, &old_data))) { - cASSERT(mc, mc->mc_dbx->md_dcmp(data, &old_data) == 0); - if (mc->mc_xcursor) { - if (flags & MDBX_NODUPDATA) - return MDBX_KEYEXIST; - if (flags & MDBX_APPENDDUP) - return MDBX_EKEYMISMATCH; - } - /* the same data, nothing to update */ - return MDBX_SUCCESS; - } - cASSERT(mc, mc->mc_dbx->md_dcmp(data, &old_data) != 0); - } - } - } else if (unlikely(rc != MDBX_NOTFOUND)) + if (unlikely(txn->flags & MDBX_TXN_BLOCKED)) + return MDBX_BAD_TXN; + + if (unlikely(txn->dbi_state[dbi] & DBI_STALE)) { + rc = sdb_fetch((MDBX_txn *)txn, dbi); + if (unlikely(rc != MDBX_SUCCESS)) return rc; } - mc->mc_flags &= ~C_DEL; - MDBX_val xdata, *ref_data = data; - size_t *batch_dupfixed_done = nullptr, batch_dupfixed_given = 0; - if (unlikely(flags & MDBX_MULTIPLE)) { - batch_dupfixed_given = data[1].iov_len; - batch_dupfixed_done = &data[1].iov_len; - *batch_dupfixed_done = 0; - } + dest->ms_psize = txn->env->ps; + stat_get(&txn->dbs[dbi], dest, bytes); + return MDBX_SUCCESS; +} +/// \copyright SPDX-License-Identifier: Apache-2.0 +/// \author Леонид Юрьев aka Leonid Yuriev \date 2015-2024 - /* Cursor is positioned, check for room in the dirty list */ - err = cursor_touch(mc, key, ref_data); - if (unlikely(err)) - return err; - if 
(unlikely(rc == MDBX_NO_ROOT)) { - /* new database, write a root leaf page */ - DEBUG("%s", "allocating new root leaf page"); - pgr_t npr = page_new(mc, P_LEAF); - if (unlikely(npr.err != MDBX_SUCCESS)) - return npr.err; - npr.err = cursor_push(mc, npr.page); - if (unlikely(npr.err != MDBX_SUCCESS)) - return npr.err; - mc->mc_db->md_root = npr.page->mp_pgno; - mc->mc_db->md_depth++; - if (mc->mc_db->md_flags & MDBX_INTEGERKEY) { - assert(key->iov_len >= mc->mc_dbx->md_klen_min && - key->iov_len <= mc->mc_dbx->md_klen_max); - mc->mc_dbx->md_klen_min = mc->mc_dbx->md_klen_max = key->iov_len; - } - if (mc->mc_db->md_flags & (MDBX_INTEGERDUP | MDBX_DUPFIXED)) { - assert(data->iov_len >= mc->mc_dbx->md_vlen_min && - data->iov_len <= mc->mc_dbx->md_vlen_max); - assert(mc->mc_xcursor != NULL); - mc->mc_db->md_xsize = mc->mc_xcursor->mx_db.md_xsize = - (unsigned)(mc->mc_dbx->md_vlen_min = mc->mc_dbx->md_vlen_max = - mc->mc_xcursor->mx_dbx.md_klen_min = - mc->mc_xcursor->mx_dbx.md_klen_max = - data->iov_len); - if (mc->mc_flags & C_SUB) - npr.page->mp_flags |= P_LEAF2; - } - mc->mc_flags |= C_INITIALIZED; - } +static inline size_t dpl_size2bytes(ptrdiff_t size) { + assert(size > CURSOR_STACK_SIZE && (size_t)size <= PAGELIST_LIMIT); +#if MDBX_DPL_PREALLOC_FOR_RADIXSORT + size += size; +#endif /* MDBX_DPL_PREALLOC_FOR_RADIXSORT */ + STATIC_ASSERT(MDBX_ASSUME_MALLOC_OVERHEAD + sizeof(dpl_t) + + (PAGELIST_LIMIT * (MDBX_DPL_PREALLOC_FOR_RADIXSORT + 1)) * + sizeof(dp_t) + + MDBX_PNL_GRANULATE * sizeof(void *) * 2 < + SIZE_MAX / 4 * 3); + size_t bytes = ceil_powerof2(MDBX_ASSUME_MALLOC_OVERHEAD + sizeof(dpl_t) + + size * sizeof(dp_t), + MDBX_PNL_GRANULATE * sizeof(void *) * 2) - + MDBX_ASSUME_MALLOC_OVERHEAD; + return bytes; +} - MDBX_val old_singledup, old_data; - MDBX_db nested_dupdb; - MDBX_page *sub_root = nullptr; - bool insert_key, insert_data; - uint16_t fp_flags = P_LEAF; - MDBX_page *fp = env->me_pbuf; - fp->mp_txnid = mc->mc_txn->mt_front; - insert_key = insert_data = 
(rc != MDBX_SUCCESS); - old_singledup.iov_base = nullptr; - if (insert_key) { - /* The key does not exist */ - DEBUG("inserting key at index %i", mc->mc_ki[mc->mc_top]); - if ((mc->mc_db->md_flags & MDBX_DUPSORT) && - node_size(key, data) > env->me_leaf_nodemax) { - /* Too big for a node, insert in sub-DB. Set up an empty - * "old sub-page" for convert_to_subtree to expand to a full page. */ - fp->mp_leaf2_ksize = - (mc->mc_db->md_flags & MDBX_DUPFIXED) ? (uint16_t)data->iov_len : 0; - fp->mp_lower = fp->mp_upper = 0; - old_data.iov_len = PAGEHDRSZ; - goto convert_to_subtree; - } - } else { - /* there's only a key anyway, so this is a no-op */ - if (IS_LEAF2(mc->mc_pg[mc->mc_top])) { - size_t ksize = mc->mc_db->md_xsize; - if (unlikely(key->iov_len != ksize)) - return MDBX_BAD_VALSIZE; - void *ptr = - page_leaf2key(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top], ksize); - memcpy(ptr, key->iov_base, ksize); - fix_parent: - /* if overwriting slot 0 of leaf, need to - * update branch key if there is a parent page */ - if (mc->mc_top && !mc->mc_ki[mc->mc_top]) { - size_t dtop = 1; - mc->mc_top--; - /* slot 0 is always an empty key, find real slot */ - while (mc->mc_top && !mc->mc_ki[mc->mc_top]) { - mc->mc_top--; - dtop++; - } - err = MDBX_SUCCESS; - if (mc->mc_ki[mc->mc_top]) - err = update_key(mc, key); - cASSERT(mc, mc->mc_top + dtop < UINT16_MAX); - mc->mc_top += (uint8_t)dtop; - if (unlikely(err != MDBX_SUCCESS)) - return err; - } +static inline size_t dpl_bytes2size(const ptrdiff_t bytes) { + size_t size = (bytes - sizeof(dpl_t)) / sizeof(dp_t); + assert(size > CURSOR_STACK_SIZE && + size <= PAGELIST_LIMIT + MDBX_PNL_GRANULATE); +#if MDBX_DPL_PREALLOC_FOR_RADIXSORT + size >>= 1; +#endif /* MDBX_DPL_PREALLOC_FOR_RADIXSORT */ + return size; +} - if (AUDIT_ENABLED()) { - err = cursor_check(mc); - if (unlikely(err != MDBX_SUCCESS)) - return err; - } - return MDBX_SUCCESS; - } +void dpl_free(MDBX_txn *txn) { + if (likely(txn->tw.dirtylist)) { + 
osal_free(txn->tw.dirtylist); + txn->tw.dirtylist = nullptr; + } +} - more:; - if (AUDIT_ENABLED()) { - err = cursor_check(mc); - if (unlikely(err != MDBX_SUCCESS)) - return err; - } - MDBX_node *const node = - page_node(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); +dpl_t *dpl_reserve(MDBX_txn *txn, size_t size) { + tASSERT(txn, (txn->flags & MDBX_TXN_RDONLY) == 0); + tASSERT(txn, (txn->flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC); - /* Large/Overflow page overwrites need special handling */ - if (unlikely(node_flags(node) & F_BIGDATA)) { - const size_t dpages = (node_size(key, data) > env->me_leaf_nodemax) - ? number_of_ovpages(env, data->iov_len) - : 0; + size_t bytes = + dpl_size2bytes((size < PAGELIST_LIMIT) ? size : PAGELIST_LIMIT); + dpl_t *const dl = osal_realloc(txn->tw.dirtylist, bytes); + if (likely(dl)) { +#if __GLIBC_PREREQ(2, 12) || defined(__FreeBSD__) || defined(malloc_usable_size) + bytes = malloc_usable_size(dl); +#endif /* malloc_usable_size */ + dl->detent = dpl_bytes2size(bytes); + tASSERT(txn, txn->tw.dirtylist == nullptr || dl->length <= dl->detent); + txn->tw.dirtylist = dl; + } + return dl; +} - const pgno_t pgno = node_largedata_pgno(node); - pgr_t lp = page_get_large(mc, pgno, mc->mc_pg[mc->mc_top]->mp_txnid); - if (unlikely(lp.err != MDBX_SUCCESS)) - return lp.err; - cASSERT(mc, PAGETYPE_WHOLE(lp.page) == P_OVERFLOW); +int dpl_alloc(MDBX_txn *txn) { + tASSERT(txn, (txn->flags & MDBX_TXN_RDONLY) == 0); + tASSERT(txn, (txn->flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC); - /* Is the ov page from this txn (or a parent) and big enough? */ - const size_t ovpages = lp.page->mp_pages; - const size_t extra_threshold = - (mc->mc_dbi == FREE_DBI) - ? 1 - : /* LY: add configurable threshold to keep reserve space */ 0; - if (!IS_FROZEN(mc->mc_txn, lp.page) && ovpages >= dpages && - ovpages <= dpages + extra_threshold) { - /* yes, overwrite it. 
*/ - if (!IS_MODIFIABLE(mc->mc_txn, lp.page)) { - if (IS_SPILLED(mc->mc_txn, lp.page)) { - lp = /* TODO: avoid search and get txn & spill-index from - page_result */ - page_unspill(mc->mc_txn, lp.page); - if (unlikely(lp.err)) - return lp.err; - } else { - if (unlikely(!mc->mc_txn->mt_parent)) { - ERROR("Unexpected not frozen/modifiable/spilled but shadowed %s " - "page %" PRIaPGNO " mod-txnid %" PRIaTXN "," - " without parent transaction, current txn %" PRIaTXN - " front %" PRIaTXN, - "overflow/large", pgno, lp.page->mp_txnid, - mc->mc_txn->mt_txnid, mc->mc_txn->mt_front); - return MDBX_PROBLEM; - } + const size_t wanna = (txn->env->options.dp_initial < txn->geo.upper) + ? txn->env->options.dp_initial + : txn->geo.upper; +#if MDBX_FORCE_ASSERTIONS || MDBX_DEBUG + if (txn->tw.dirtylist) + /* обнуляем чтобы не сработал ассерт внутри dpl_reserve() */ + txn->tw.dirtylist->sorted = txn->tw.dirtylist->length = 0; +#endif /* asertions enabled */ + if (unlikely(!txn->tw.dirtylist || txn->tw.dirtylist->detent < wanna || + txn->tw.dirtylist->detent > wanna + wanna) && + unlikely(!dpl_reserve(txn, wanna))) + return MDBX_ENOMEM; - /* It is writable only in a parent txn */ - MDBX_page *np = page_malloc(mc->mc_txn, ovpages); - if (unlikely(!np)) - return MDBX_ENOMEM; + dpl_clear(txn->tw.dirtylist); + return MDBX_SUCCESS; +} - memcpy(np, lp.page, PAGEHDRSZ); /* Copy header of page */ - err = page_dirty(mc->mc_txn, lp.page = np, ovpages); - if (unlikely(err != MDBX_SUCCESS)) - return err; +#define MDBX_DPL_EXTRACT_KEY(ptr) ((ptr)->pgno) +RADIXSORT_IMPL(dp, dp_t, MDBX_DPL_EXTRACT_KEY, MDBX_DPL_PREALLOC_FOR_RADIXSORT, + 1) -#if MDBX_ENABLE_PGOP_STAT - mc->mc_txn->mt_env->me_lck->mti_pgop_stat.clone.weak += ovpages; -#endif /* MDBX_ENABLE_PGOP_STAT */ - cASSERT(mc, dirtylist_check(mc->mc_txn)); - } - } - node_set_ds(node, data->iov_len); - if (flags & MDBX_RESERVE) - data->iov_base = page_data(lp.page); - else - memcpy(page_data(lp.page), data->iov_base, data->iov_len); +#define 
DP_SORT_CMP(first, last) ((first).pgno < (last).pgno) +SORT_IMPL(dp_sort, false, dp_t, DP_SORT_CMP) - if (AUDIT_ENABLED()) { - err = cursor_check(mc); - if (unlikely(err != MDBX_SUCCESS)) - return err; - } - return MDBX_SUCCESS; - } +__hot __noinline dpl_t *dpl_sort_slowpath(const MDBX_txn *txn) { + tASSERT(txn, (txn->flags & MDBX_TXN_RDONLY) == 0); + tASSERT(txn, (txn->flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC); - if ((err = page_retire(mc, lp.page)) != MDBX_SUCCESS) - return err; + dpl_t *dl = txn->tw.dirtylist; + assert(dl->items[0].pgno == 0 && dl->items[dl->length + 1].pgno == P_INVALID); + const size_t unsorted = dl->length - dl->sorted; + if (likely(unsorted < MDBX_RADIXSORT_THRESHOLD) || + unlikely(!dp_radixsort(dl->items + 1, dl->length))) { + if (dl->sorted > unsorted / 4 + 4 && + (MDBX_DPL_PREALLOC_FOR_RADIXSORT || + dl->length + unsorted < dl->detent + dpl_gap_mergesort)) { + dp_t *const sorted_begin = dl->items + 1; + dp_t *const sorted_end = sorted_begin + dl->sorted; + dp_t *const end = dl->items + (MDBX_DPL_PREALLOC_FOR_RADIXSORT + ? dl->length + dl->length + 1 + : dl->detent + dpl_reserve_gap); + dp_t *const tmp = end - unsorted; + assert(dl->items + dl->length + 1 < tmp); + /* copy unsorted to the end of allocated space and sort it */ + memcpy(tmp, sorted_end, unsorted * sizeof(dp_t)); + dp_sort(tmp, tmp + unsorted); + /* merge two parts from end to begin */ + dp_t *__restrict w = dl->items + dl->length; + dp_t *__restrict l = dl->items + dl->sorted; + dp_t *__restrict r = end - 1; + do { + const bool cmp = expect_with_probability(l->pgno > r->pgno, 0, .5); +#if defined(__LCC__) || __CLANG_PREREQ(13, 0) || !MDBX_HAVE_CMOV + *w = cmp ? *l-- : *r--; +#else + *w = cmp ? 
*l : *r; + l -= cmp; + r += (ptrdiff_t)cmp - 1; +#endif + } while (likely(--w > l)); + assert(r == tmp - 1); + assert(dl->items[0].pgno == 0 && + dl->items[dl->length + 1].pgno == P_INVALID); + if (ASSERT_ENABLED()) + for (size_t i = 0; i <= dl->length; ++i) + assert(dl->items[i].pgno < dl->items[i + 1].pgno); } else { - old_data.iov_len = node_ds(node); - old_data.iov_base = node_data(node); - cASSERT(mc, ptr_disp(old_data.iov_base, old_data.iov_len) <= - ptr_disp(mc->mc_pg[mc->mc_top], env->me_psize)); + dp_sort(dl->items + 1, dl->items + dl->length + 1); + assert(dl->items[0].pgno == 0 && + dl->items[dl->length + 1].pgno == P_INVALID); + } + } else { + assert(dl->items[0].pgno == 0 && + dl->items[dl->length + 1].pgno == P_INVALID); + } + dl->sorted = dl->length; + return dl; +} - /* DB has dups? */ - if (mc->mc_db->md_flags & MDBX_DUPSORT) { - /* Prepare (sub-)page/sub-DB to accept the new item, if needed. - * fp: old sub-page or a header faking it. - * mp: new (sub-)page. - * xdata: node data with new sub-page or sub-DB. */ - size_t growth = 0; /* growth in page size.*/ - MDBX_page *mp = fp = xdata.iov_base = env->me_pbuf; - mp->mp_pgno = mc->mc_pg[mc->mc_top]->mp_pgno; +/* Returns the index of the first dirty-page whose pgno + * member is greater than or equal to id. */ +#define DP_SEARCH_CMP(dp, id) ((dp).pgno < (id)) +SEARCH_IMPL(dp_bsearch, dp_t, pgno_t, DP_SEARCH_CMP) - /* Was a single item before, must convert now */ - if (!(node_flags(node) & F_DUPDATA)) { - /* does data match? 
*/ - if (flags & MDBX_APPENDDUP) { - const int cmp = mc->mc_dbx->md_dcmp(data, &old_data); - cASSERT(mc, cmp != 0 || eq_fast(data, &old_data)); - if (unlikely(cmp <= 0)) - return MDBX_EKEYMISMATCH; - } else if (eq_fast(data, &old_data)) { - cASSERT(mc, mc->mc_dbx->md_dcmp(data, &old_data) == 0); - if (flags & MDBX_NODUPDATA) - return MDBX_KEYEXIST; - /* data is match exactly byte-to-byte, nothing to update */ - rc = MDBX_SUCCESS; - if (unlikely(batch_dupfixed_done)) - goto batch_dupfixed_continue; - return rc; - } +__hot __noinline MDBX_INTERNAL size_t dpl_search(const MDBX_txn *txn, + pgno_t pgno) { + tASSERT(txn, (txn->flags & MDBX_TXN_RDONLY) == 0); + tASSERT(txn, (txn->flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC); - /* Just overwrite the current item */ - if (flags & MDBX_CURRENT) { - cASSERT(mc, node_size(key, data) <= env->me_leaf_nodemax); - goto current; - } + dpl_t *dl = txn->tw.dirtylist; + assert(dl->items[0].pgno == 0 && dl->items[dl->length + 1].pgno == P_INVALID); + if (AUDIT_ENABLED()) { + for (const dp_t *ptr = dl->items + dl->sorted; --ptr > dl->items;) { + assert(ptr[0].pgno < ptr[1].pgno); + assert(ptr[0].pgno >= NUM_METAS); + } + } - /* Back up original data item */ - memcpy(old_singledup.iov_base = fp + 1, old_data.iov_base, - old_singledup.iov_len = old_data.iov_len); + switch (dl->length - dl->sorted) { + default: + /* sort a whole */ + dpl_sort_slowpath(txn); + break; + case 0: + /* whole sorted cases */ + break; - /* Make sub-page header for the dup items, with dummy body */ - fp->mp_flags = P_LEAF | P_SUBP; - fp->mp_lower = 0; - xdata.iov_len = PAGEHDRSZ + old_data.iov_len + data->iov_len; - if (mc->mc_db->md_flags & MDBX_DUPFIXED) { - fp->mp_flags |= P_LEAF2; - fp->mp_leaf2_ksize = (uint16_t)data->iov_len; - /* Будем создавать LEAF2-страницу, как минимум с двумя элементами. 
- * При коротких значениях и наличии свободного места можно сделать - * некоторое резервирование места, чтобы при последующих добавлениях - * не сразу расширять созданную под-страницу. - * Резервирование в целом сомнительно (см ниже), но может сработать - * в плюс (а если в минус то несущественный) при коротких ключах. */ - xdata.iov_len += leaf2_reserve( - env, page_room(mc->mc_pg[mc->mc_top]) + old_data.iov_len, - xdata.iov_len, data->iov_len); - cASSERT(mc, (xdata.iov_len & 1) == 0); - } else { - xdata.iov_len += 2 * (sizeof(indx_t) + NODESIZE) + - (old_data.iov_len & 1) + (data->iov_len & 1); - } - cASSERT(mc, (xdata.iov_len & 1) == 0); - fp->mp_upper = (uint16_t)(xdata.iov_len - PAGEHDRSZ); - old_data.iov_len = xdata.iov_len; /* pretend olddata is fp */ - } else if (node_flags(node) & F_SUBDATA) { - /* Data is on sub-DB, just store it */ - flags |= F_DUPDATA | F_SUBDATA; - goto dupsort_put; - } else { - /* Data is on sub-page */ - fp = old_data.iov_base; - switch (flags) { - default: - growth = IS_LEAF2(fp) ? fp->mp_leaf2_ksize - : (node_size(data, nullptr) + sizeof(indx_t)); - if (page_room(fp) >= growth) { - /* На текущей под-странице есть место для добавления элемента. - * Оптимальнее продолжить использовать эту страницу, ибо - * добавление вложенного дерева увеличит WAF на одну страницу. */ - goto continue_subpage; - } - /* На текущей под-странице нет места для еще одного элемента. - * Можно либо увеличить эту под-страницу, либо вынести куст - * значений во вложенное дерево. - * - * Продолжать использовать текущую под-страницу возможно - * только пока и если размер после добавления элемента будет - * меньше me_leaf_nodemax. Соответственно, при превышении - * просто сразу переходим на вложенное дерево. 
*/ - xdata.iov_len = old_data.iov_len + (growth += growth & 1); - if (xdata.iov_len > env->me_subpage_limit) - goto convert_to_subtree; +#define LINEAR_SEARCH_CASE(N) \ + case N: \ + if (dl->items[dl->length - N + 1].pgno == pgno) \ + return dl->length - N + 1; \ + __fallthrough - /* Можно либо увеличить под-страницу, в том числе с некоторым - * запасом, либо перейти на вложенное поддерево. - * - * Резервирование места на под-странице представляется сомнительным: - * - Резервирование увеличит рыхлость страниц, в том числе - * вероятность разделения основной/гнездовой страницы; - * - Сложно предсказать полезный размер резервирования, - * особенно для не-MDBX_DUPFIXED; - * - Наличие резерва позволяет съекономить только на перемещении - * части элементов основной/гнездовой страницы при последующих - * добавлениях в нее элементов. Причем после первого изменения - * размера под-страницы, её тело будет примыкать - * к неиспользуемому месту на основной/гнездовой странице, - * поэтому последующие последовательные добавления потребуют - * только передвижения в mp_ptrs[]. - * - * Соответственно, более важным/определяющим представляется - * своевременный переход к вложеному дереву, но тут достаточно - * сложный конфликт интересов: - * - При склонности к переходу к вложенным деревьям, суммарно - * в БД будет большее кол-во более рыхлых страниц. Это увеличит - * WAF, а также RAF при последовательных чтениях большой БД. - * Однако, при коротких ключах и большом кол-ве - * дубликатов/мультизначений, плотность ключей в листовых - * страницах основного дерева будет выше. Соответственно, будет - * пропорционально меньше branch-страниц. Поэтому будет выше - * вероятность оседания/не-вымывания страниц основного дерева из - * LRU-кэша, а также попадания в write-back кэш при записи. - * - Наоботот, при склонности к использованию под-страниц, будут - * наблюдаться обратные эффекты. 
Плюс некоторые накладные расходы - * на лишнее копирование данных под-страниц в сценариях - * нескольких обонвлений дубликатов одного куста в одной - * транзакции. - * - * Суммарно наиболее рациональным представляется такая тактика: - * - Вводим три порога subpage_limit, subpage_room_threshold - * и subpage_reserve_prereq, которые могут быть - * заданы/скорректированы пользователем в ‰ от me_leaf_nodemax; - * - Используем под-страницу пока её размер меньше subpage_limit - * и на основной/гнездовой странице не-менее - * subpage_room_threshold свободного места; - * - Резервируем место только для 1-3 коротких dupfixed-элементов, - * расширяя размер под-страницы на размер кэш-линии ЦПУ, но - * только если на странице не менее subpage_reserve_prereq - * свободного места. - * - По-умолчанию устанавливаем: - * subpage_limit = me_leaf_nodemax (1000‰); - * subpage_room_threshold = 0; - * subpage_reserve_prereq = me_leaf_nodemax (1000‰). - */ - if (IS_LEAF2(fp)) - growth += leaf2_reserve( - env, page_room(mc->mc_pg[mc->mc_top]) + old_data.iov_len, - xdata.iov_len, data->iov_len); - break; + /* use linear scan until the threshold */ + LINEAR_SEARCH_CASE(7); /* fall through */ + LINEAR_SEARCH_CASE(6); /* fall through */ + LINEAR_SEARCH_CASE(5); /* fall through */ + LINEAR_SEARCH_CASE(4); /* fall through */ + LINEAR_SEARCH_CASE(3); /* fall through */ + LINEAR_SEARCH_CASE(2); /* fall through */ + case 1: + if (dl->items[dl->length].pgno == pgno) + return dl->length; + /* continue bsearch on the sorted part */ + break; + } + return dp_bsearch(dl->items + 1, dl->sorted, pgno) - dl->items; +} - case MDBX_CURRENT | MDBX_NODUPDATA: - case MDBX_CURRENT: - continue_subpage: - fp->mp_txnid = mc->mc_txn->mt_front; - fp->mp_pgno = mp->mp_pgno; - mc->mc_xcursor->mx_cursor.mc_pg[0] = fp; - flags |= F_DUPDATA; - goto dupsort_put; - } - xdata.iov_len = old_data.iov_len + growth; - cASSERT(mc, (xdata.iov_len & 1) == 0); - } +const page_t *debug_dpl_find(const MDBX_txn *txn, const pgno_t pgno) 
{ + tASSERT(txn, (txn->flags & MDBX_TXN_RDONLY) == 0); + const dpl_t *dl = txn->tw.dirtylist; + if (dl) { + tASSERT(txn, (txn->flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC); + assert(dl->items[0].pgno == 0 && + dl->items[dl->length + 1].pgno == P_INVALID); + for (size_t i = dl->length; i > dl->sorted; --i) + if (dl->items[i].pgno == pgno) + return dl->items[i].ptr; - fp_flags = fp->mp_flags; - if (xdata.iov_len > env->me_subpage_limit || - node_size_len(node_ks(node), xdata.iov_len) > - env->me_leaf_nodemax || - (env->me_subpage_room_threshold && - page_room(mc->mc_pg[mc->mc_top]) + - node_size_len(node_ks(node), old_data.iov_len) < - env->me_subpage_room_threshold + - node_size_len(node_ks(node), xdata.iov_len))) { - /* Too big for a sub-page, convert to sub-DB */ - convert_to_subtree: - fp_flags &= ~P_SUBP; - nested_dupdb.md_xsize = 0; - nested_dupdb.md_flags = flags_db2sub(mc->mc_db->md_flags); - if (mc->mc_db->md_flags & MDBX_DUPFIXED) { - fp_flags |= P_LEAF2; - nested_dupdb.md_xsize = fp->mp_leaf2_ksize; - } - nested_dupdb.md_depth = 1; - nested_dupdb.md_branch_pages = 0; - nested_dupdb.md_leaf_pages = 1; - nested_dupdb.md_overflow_pages = 0; - nested_dupdb.md_entries = page_numkeys(fp); - xdata.iov_len = sizeof(nested_dupdb); - xdata.iov_base = &nested_dupdb; - const pgr_t par = page_alloc(mc); - mp = par.page; - if (unlikely(par.err != MDBX_SUCCESS)) - return par.err; - mc->mc_db->md_leaf_pages += 1; - cASSERT(mc, env->me_psize > old_data.iov_len); - growth = env->me_psize - (unsigned)old_data.iov_len; - cASSERT(mc, (growth & 1) == 0); - flags |= F_DUPDATA | F_SUBDATA; - nested_dupdb.md_root = mp->mp_pgno; - nested_dupdb.md_seq = 0; - nested_dupdb.md_mod_txnid = mc->mc_txn->mt_txnid; - sub_root = mp; - } - if (mp != fp) { - mp->mp_flags = fp_flags; - mp->mp_txnid = mc->mc_txn->mt_front; - mp->mp_leaf2_ksize = fp->mp_leaf2_ksize; - mp->mp_lower = fp->mp_lower; - cASSERT(mc, fp->mp_upper + growth < UINT16_MAX); - mp->mp_upper = fp->mp_upper + 
(indx_t)growth; - if (unlikely(fp_flags & P_LEAF2)) { - memcpy(page_data(mp), page_data(fp), - page_numkeys(fp) * fp->mp_leaf2_ksize); - cASSERT(mc, - (((mp->mp_leaf2_ksize & page_numkeys(mp)) ^ mp->mp_upper) & - 1) == 0); - } else { - cASSERT(mc, (mp->mp_upper & 1) == 0); - memcpy(ptr_disp(mp, mp->mp_upper + PAGEHDRSZ), - ptr_disp(fp, fp->mp_upper + PAGEHDRSZ), - old_data.iov_len - fp->mp_upper - PAGEHDRSZ); - memcpy(mp->mp_ptrs, fp->mp_ptrs, - page_numkeys(fp) * sizeof(mp->mp_ptrs[0])); - for (size_t i = 0; i < page_numkeys(fp); i++) { - cASSERT(mc, mp->mp_ptrs[i] + growth <= UINT16_MAX); - mp->mp_ptrs[i] += (indx_t)growth; - } - } - } + if (dl->sorted) { + const size_t i = dp_bsearch(dl->items + 1, dl->sorted, pgno) - dl->items; + if (dl->items[i].pgno == pgno) + return dl->items[i].ptr; + } + } else { + tASSERT(txn, (txn->flags & MDBX_WRITEMAP) != 0 && !MDBX_AVOID_MSYNC); + } + return nullptr; +} - if (!insert_key) - node_del(mc, 0); - ref_data = &xdata; - flags |= F_DUPDATA; - goto insert_node; - } +void dpl_remove_ex(const MDBX_txn *txn, size_t i, size_t npages) { + tASSERT(txn, (txn->flags & MDBX_TXN_RDONLY) == 0); + tASSERT(txn, (txn->flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC); - /* MDBX passes F_SUBDATA in 'flags' to write a DB record */ - if (unlikely((node_flags(node) ^ flags) & F_SUBDATA)) - return MDBX_INCOMPATIBLE; + dpl_t *dl = txn->tw.dirtylist; + assert((intptr_t)i > 0 && i <= dl->length); + assert(dl->items[0].pgno == 0 && dl->items[dl->length + 1].pgno == P_INVALID); + dl->pages_including_loose -= npages; + dl->sorted -= dl->sorted >= i; + dl->length -= 1; + memmove(dl->items + i, dl->items + i + 1, + (dl->length - i + 2) * sizeof(dl->items[0])); + assert(dl->items[0].pgno == 0 && dl->items[dl->length + 1].pgno == P_INVALID); +} - current: - if (data->iov_len == old_data.iov_len) { - cASSERT(mc, EVEN(key->iov_len) == EVEN(node_ks(node))); - /* same size, just replace it. 
Note that we could - * also reuse this node if the new data is smaller, - * but instead we opt to shrink the node in that case. */ - if (flags & MDBX_RESERVE) - data->iov_base = old_data.iov_base; - else if (!(mc->mc_flags & C_SUB)) - memcpy(old_data.iov_base, data->iov_base, data->iov_len); - else { - cASSERT(mc, page_numkeys(mc->mc_pg[mc->mc_top]) == 1); - cASSERT(mc, PAGETYPE_COMPAT(mc->mc_pg[mc->mc_top]) == P_LEAF); - cASSERT(mc, node_ds(node) == 0); - cASSERT(mc, node_flags(node) == 0); - cASSERT(mc, key->iov_len < UINT16_MAX); - node_set_ks(node, key->iov_len); - memcpy(node_key(node), key->iov_base, key->iov_len); - cASSERT(mc, ptr_disp(node_key(node), node_ds(node)) < - ptr_disp(mc->mc_pg[mc->mc_top], env->me_psize)); - goto fix_parent; - } +int __must_check_result dpl_append(MDBX_txn *txn, pgno_t pgno, page_t *page, + size_t npages) { + tASSERT(txn, (txn->flags & MDBX_TXN_RDONLY) == 0); + tASSERT(txn, (txn->flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC); + const dp_t dp = {page, pgno, (pgno_t)npages}; + if ((txn->flags & MDBX_WRITEMAP) == 0) { + size_t *const ptr = ptr_disp(page, -(ptrdiff_t)sizeof(size_t)); + *ptr = txn->tw.dirtylru; + } - if (AUDIT_ENABLED()) { - err = cursor_check(mc); - if (unlikely(err != MDBX_SUCCESS)) - return err; - } - return MDBX_SUCCESS; + dpl_t *dl = txn->tw.dirtylist; + tASSERT(txn, dl->length <= PAGELIST_LIMIT + MDBX_PNL_GRANULATE); + tASSERT(txn, dl->items[0].pgno == 0 && + dl->items[dl->length + 1].pgno == P_INVALID); + if (AUDIT_ENABLED()) { + for (size_t i = dl->length; i > 0; --i) { + assert(dl->items[i].pgno != dp.pgno); + if (unlikely(dl->items[i].pgno == dp.pgno)) { + ERROR("Page %u already exist in the DPL at %zu", dp.pgno, i); + return MDBX_PROBLEM; } } - node_del(mc, 0); } - ref_data = data; - -insert_node:; - const unsigned naf = flags & NODE_ADD_FLAGS; - size_t nsize = IS_LEAF2(mc->mc_pg[mc->mc_top]) - ? 
key->iov_len - : leaf_size(env, key, ref_data); - if (page_room(mc->mc_pg[mc->mc_top]) < nsize) { - rc = page_split(mc, key, ref_data, P_INVALID, - insert_key ? naf : naf | MDBX_SPLIT_REPLACE); - if (rc == MDBX_SUCCESS && AUDIT_ENABLED()) - rc = insert_key ? cursor_check(mc) : cursor_check_updating(mc); - } else { - /* There is room already in this leaf page. */ - if (IS_LEAF2(mc->mc_pg[mc->mc_top])) { - cASSERT(mc, !(naf & (F_BIGDATA | F_SUBDATA | F_DUPDATA)) && - ref_data->iov_len == 0); - rc = node_add_leaf2(mc, mc->mc_ki[mc->mc_top], key); - } else - rc = node_add_leaf(mc, mc->mc_ki[mc->mc_top], key, ref_data, naf); - if (likely(rc == 0)) { - /* Adjust other cursors pointing to mp */ - const MDBX_dbi dbi = mc->mc_dbi; - const size_t top = mc->mc_top; - MDBX_page *const mp = mc->mc_pg[top]; - for (MDBX_cursor *m2 = mc->mc_txn->mt_cursors[dbi]; m2; - m2 = m2->mc_next) { - MDBX_cursor *m3 = - (mc->mc_flags & C_SUB) ? &m2->mc_xcursor->mx_cursor : m2; - if (m3 == mc || m3->mc_snum < mc->mc_snum || m3->mc_pg[top] != mp) - continue; - if (m3->mc_ki[top] >= mc->mc_ki[top]) - m3->mc_ki[top] += insert_key; - if (XCURSOR_INITED(m3)) - XCURSOR_REFRESH(m3, mp, m3->mc_ki[top]); - } + if (unlikely(dl->length == dl->detent)) { + if (unlikely(dl->detent >= PAGELIST_LIMIT)) { + ERROR("DPL is full (PAGELIST_LIMIT %zu)", PAGELIST_LIMIT); + return MDBX_TXN_FULL; } + const size_t size = (dl->detent < MDBX_PNL_INITIAL * 42) + ? dl->detent + dl->detent + : dl->detent + dl->detent / 2; + dl = dpl_reserve(txn, size); + if (unlikely(!dl)) + return MDBX_ENOMEM; + tASSERT(txn, dl->length < dl->detent); } - if (likely(rc == MDBX_SUCCESS)) { - /* Now store the actual data in the child DB. Note that we're - * storing the user data in the keys field, so there are strict - * size limits on dupdata. The actual data fields of the child - * DB are all zero size. 
*/ - if (flags & F_DUPDATA) { - MDBX_val empty; - dupsort_put: - empty.iov_len = 0; - empty.iov_base = nullptr; - MDBX_node *node = page_node(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); -#define SHIFT_MDBX_NODUPDATA_TO_MDBX_NOOVERWRITE 1 - STATIC_ASSERT( - (MDBX_NODUPDATA >> SHIFT_MDBX_NODUPDATA_TO_MDBX_NOOVERWRITE) == - MDBX_NOOVERWRITE); - unsigned xflags = - MDBX_CURRENT | ((flags & MDBX_NODUPDATA) >> - SHIFT_MDBX_NODUPDATA_TO_MDBX_NOOVERWRITE); - if ((flags & MDBX_CURRENT) == 0) { - xflags -= MDBX_CURRENT; - err = cursor_xinit1(mc, node, mc->mc_pg[mc->mc_top]); - if (unlikely(err != MDBX_SUCCESS)) - return err; - } - if (sub_root) - mc->mc_xcursor->mx_cursor.mc_pg[0] = sub_root; - /* converted, write the original data first */ - if (old_singledup.iov_base) { - rc = cursor_put_nochecklen(&mc->mc_xcursor->mx_cursor, &old_singledup, - &empty, xflags); - if (unlikely(rc)) - goto dupsort_error; - } - if (!(node_flags(node) & F_SUBDATA) || sub_root) { - /* Adjust other cursors pointing to mp */ - MDBX_xcursor *const mx = mc->mc_xcursor; - const size_t top = mc->mc_top; - MDBX_page *const mp = mc->mc_pg[top]; - const intptr_t nkeys = page_numkeys(mp); + /* Сортировка нужна для быстрого поиска, используем несколько тактик: + * 1) Сохраняем упорядоченность при естественной вставке в нужном порядке. + * 2) Добавляем в не-сортированный хвост, который сортируем и сливаем + * с отсортированной головой по необходимости, а пока хвост короткий + * ищем в нём сканированием, избегая большой пересортировки. + * 3) Если не-сортированный хвост короткий, а добавляемый элемент близок + * к концу отсортированной головы, то выгоднее сразу вставить элемент + * в нужное место. + * + * Алгоритмически: + * - добавлять в не-сортированный хвост следует только если вставка сильно + * дорогая, т.е. 
если целевая позиция элемента сильно далека от конца; + * - для быстрой проверки достаточно сравнить добавляемый элемент с отстоящим + * от конца на максимально-приемлемое расстояние; + * - если список короче, либо элемент в этой позиции меньше вставляемого, + * то следует перемещать элементы и вставлять в отсортированную голову; + * - если не-сортированный хвост длиннее, либо элемент в этой позиции больше, + * то следует добавлять в не-сортированный хвост. */ - for (MDBX_cursor *m2 = mc->mc_txn->mt_cursors[mc->mc_dbi]; m2; - m2 = m2->mc_next) { - if (m2 == mc || m2->mc_snum < mc->mc_snum) - continue; - if (!(m2->mc_flags & C_INITIALIZED)) - continue; - if (m2->mc_pg[top] == mp) { - if (m2->mc_ki[top] == mc->mc_ki[top]) { - err = cursor_xinit2(m2, mx, old_singledup.iov_base != nullptr); - if (unlikely(err != MDBX_SUCCESS)) - return err; - } else if (!insert_key && m2->mc_ki[top] < nkeys) { - XCURSOR_REFRESH(m2, mp, m2->mc_ki[top]); - } - } - } - } - cASSERT(mc, mc->mc_xcursor->mx_db.md_entries < PTRDIFF_MAX); - const size_t probe = (size_t)mc->mc_xcursor->mx_db.md_entries; -#define SHIFT_MDBX_APPENDDUP_TO_MDBX_APPEND 1 - STATIC_ASSERT((MDBX_APPENDDUP >> SHIFT_MDBX_APPENDDUP_TO_MDBX_APPEND) == - MDBX_APPEND); - xflags |= (flags & MDBX_APPENDDUP) >> SHIFT_MDBX_APPENDDUP_TO_MDBX_APPEND; - rc = cursor_put_nochecklen(&mc->mc_xcursor->mx_cursor, data, &empty, - xflags); - if (flags & F_SUBDATA) { - void *db = node_data(node); - mc->mc_xcursor->mx_db.md_mod_txnid = mc->mc_txn->mt_txnid; - memcpy(db, &mc->mc_xcursor->mx_db, sizeof(MDBX_db)); - } - insert_data = (probe != (size_t)mc->mc_xcursor->mx_db.md_entries); - } - /* Increment count unless we just replaced an existing item. */ - if (insert_data) - mc->mc_db->md_entries++; - if (insert_key) { - if (unlikely(rc != MDBX_SUCCESS)) - goto dupsort_error; - /* If we succeeded and the key didn't exist before, - * make sure the cursor is marked valid. 
*/ - mc->mc_flags |= C_INITIALIZED; - } - if (likely(rc == MDBX_SUCCESS)) { - if (unlikely(batch_dupfixed_done)) { - batch_dupfixed_continue: - /* let caller know how many succeeded, if any */ - if ((*batch_dupfixed_done += 1) < batch_dupfixed_given) { - data[0].iov_base = ptr_disp(data[0].iov_base, data[0].iov_len); - insert_key = insert_data = false; - old_singledup.iov_base = nullptr; - goto more; - } - } - if (AUDIT_ENABLED()) - rc = cursor_check(mc); - } - return rc; + dl->pages_including_loose += npages; + dp_t *i = dl->items + dl->length; - dupsort_error: - if (unlikely(rc == MDBX_KEYEXIST)) { - /* should not happen, we deleted that item */ - ERROR("Unexpected %i error while put to nested dupsort's hive", rc); - rc = MDBX_PROBLEM; - } - } - mc->mc_txn->mt_flags |= MDBX_TXN_ERROR; - return rc; -} + const ptrdiff_t pivot = (ptrdiff_t)dl->length - dpl_insertion_threshold; +#if MDBX_HAVE_CMOV + const pgno_t pivot_pgno = + dl->items[(dl->length < dpl_insertion_threshold) + ? 0 + : dl->length - dpl_insertion_threshold] + .pgno; +#endif /* MDBX_HAVE_CMOV */ -static __hot int cursor_put_checklen(MDBX_cursor *mc, const MDBX_val *key, - MDBX_val *data, unsigned flags) { - cASSERT(mc, (mc->mc_flags & C_SUB) == 0); - uint64_t aligned_keybytes, aligned_databytes; - MDBX_val aligned_key, aligned_data; - if (unlikely(key->iov_len < mc->mc_dbx->md_klen_min || - key->iov_len > mc->mc_dbx->md_klen_max)) { - cASSERT(mc, !"Invalid key-size"); - return MDBX_BAD_VALSIZE; - } - if (unlikely(data->iov_len < mc->mc_dbx->md_vlen_min || - data->iov_len > mc->mc_dbx->md_vlen_max)) { - cASSERT(mc, !"Invalid data-size"); - return MDBX_BAD_VALSIZE; - } + /* copy the stub beyond the end */ + i[2] = i[1]; + dl->length += 1; - if (mc->mc_db->md_flags & MDBX_INTEGERKEY) { - switch (key->iov_len) { - default: - cASSERT(mc, !"key-size is invalid for MDBX_INTEGERKEY"); - return MDBX_BAD_VALSIZE; - case 4: - if (unlikely(3 & (uintptr_t)key->iov_base)) { - /* copy instead of return error to avoid 
break compatibility */ - aligned_key.iov_base = - memcpy(&aligned_keybytes, key->iov_base, aligned_key.iov_len = 4); - key = &aligned_key; - } - break; - case 8: - if (unlikely(7 & (uintptr_t)key->iov_base)) { - /* copy instead of return error to avoid break compatibility */ - aligned_key.iov_base = - memcpy(&aligned_keybytes, key->iov_base, aligned_key.iov_len = 8); - key = &aligned_key; - } - break; + if (likely(pivot <= (ptrdiff_t)dl->sorted) && +#if MDBX_HAVE_CMOV + pivot_pgno < dp.pgno) { +#else + (pivot <= 0 || dl->items[pivot].pgno < dp.pgno)) { +#endif /* MDBX_HAVE_CMOV */ + dl->sorted += 1; + + /* сдвигаем несортированный хвост */ + while (i >= dl->items + dl->sorted) { +#if !defined(__GNUC__) /* пытаемся избежать вызова memmove() */ + i[1] = *i; +#elif MDBX_WORDBITS == 64 && \ + (defined(__SIZEOF_INT128__) || \ + (defined(_INTEGRAL_MAX_BITS) && _INTEGRAL_MAX_BITS >= 128)) + STATIC_ASSERT(sizeof(dp) == sizeof(__uint128_t)); + ((__uint128_t *)i)[1] = *(volatile __uint128_t *)i; +#else + i[1].ptr = i->ptr; + i[1].pgno = i->pgno; + i[1].npages = i->npages; +#endif + --i; } - } - if (mc->mc_db->md_flags & MDBX_INTEGERDUP) { - switch (data->iov_len) { - default: - cASSERT(mc, !"data-size is invalid for MDBX_INTEGERKEY"); - return MDBX_BAD_VALSIZE; - case 4: - if (unlikely(3 & (uintptr_t)data->iov_base)) { - if (unlikely(flags & MDBX_MULTIPLE)) - return MDBX_BAD_VALSIZE; - /* copy instead of return error to avoid break compatibility */ - aligned_data.iov_base = memcpy(&aligned_databytes, data->iov_base, - aligned_data.iov_len = 4); - data = &aligned_data; - } - break; - case 8: - if (unlikely(7 & (uintptr_t)data->iov_base)) { - if (unlikely(flags & MDBX_MULTIPLE)) - return MDBX_BAD_VALSIZE; - /* copy instead of return error to avoid break compatibility */ - aligned_data.iov_base = memcpy(&aligned_databytes, data->iov_base, - aligned_data.iov_len = 8); - data = &aligned_data; - } - break; + /* ищем нужную позицию сдвигая отсортированные элементы */ + while 
(i->pgno > pgno) { + tASSERT(txn, i > dl->items); + i[1] = *i; + --i; } + tASSERT(txn, i->pgno < dp.pgno); } - return cursor_put_nochecklen(mc, key, data, flags); + + i[1] = dp; + assert(dl->items[0].pgno == 0 && dl->items[dl->length + 1].pgno == P_INVALID); + assert(dl->sorted <= dl->length); + return MDBX_SUCCESS; } -int mdbx_cursor_put(MDBX_cursor *mc, const MDBX_val *key, MDBX_val *data, - MDBX_put_flags_t flags) { - if (unlikely(mc == NULL || key == NULL || data == NULL)) - return MDBX_EINVAL; +__cold bool dpl_check(MDBX_txn *txn) { + tASSERT(txn, (txn->flags & MDBX_TXN_RDONLY) == 0); + const dpl_t *const dl = txn->tw.dirtylist; + if (!dl) { + tASSERT(txn, (txn->flags & MDBX_WRITEMAP) != 0 && !MDBX_AVOID_MSYNC); + return true; + } + tASSERT(txn, (txn->flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC); - if (unlikely(mc->mc_signature != MDBX_MC_LIVE)) - return (mc->mc_signature == MDBX_MC_READY4CLOSE) ? MDBX_EINVAL - : MDBX_EBADSIGN; + assert(dl->items[0].pgno == 0 && dl->items[dl->length + 1].pgno == P_INVALID); + tASSERT(txn, txn->tw.dirtyroom + dl->length == + (txn->parent ? txn->parent->tw.dirtyroom + : txn->env->options.dp_limit)); - int rc = check_txn_rw(mc->mc_txn, MDBX_TXN_BLOCKED); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; + if (!AUDIT_ENABLED()) + return true; - if (unlikely(dbi_changed(mc->mc_txn, mc->mc_dbi))) - return MDBX_BAD_DBI; + size_t loose = 0, pages = 0; + for (size_t i = dl->length; i > 0; --i) { + const page_t *const dp = dl->items[i].ptr; + if (!dp) + continue; - cASSERT(mc, cursor_is_tracked(mc)); + tASSERT(txn, dp->pgno == dl->items[i].pgno); + if (unlikely(dp->pgno != dl->items[i].pgno)) + return false; - /* Check this first so counter will always be zero on any early failures. 
*/ - if (unlikely(flags & MDBX_MULTIPLE)) { - if (unlikely(flags & MDBX_RESERVE)) - return MDBX_EINVAL; - if (unlikely(!(mc->mc_db->md_flags & MDBX_DUPFIXED))) - return MDBX_INCOMPATIBLE; - const size_t dcount = data[1].iov_len; - if (unlikely(dcount < 2 || data->iov_len == 0)) - return MDBX_BAD_VALSIZE; - if (unlikely(mc->mc_db->md_xsize != data->iov_len) && mc->mc_db->md_xsize) - return MDBX_BAD_VALSIZE; - if (unlikely(dcount > MAX_MAPSIZE / 2 / - (BRANCH_NODE_MAX(MAX_PAGESIZE) - NODESIZE))) { - /* checking for multiplication overflow */ - if (unlikely(dcount > MAX_MAPSIZE / 2 / data->iov_len)) - return MDBX_TOO_LARGE; + if ((txn->flags & MDBX_WRITEMAP) == 0) { + const uint32_t age = dpl_age(txn, i); + tASSERT(txn, age < UINT32_MAX / 3); + if (unlikely(age > UINT32_MAX / 3)) + return false; } - } - if (flags & MDBX_RESERVE) { - if (unlikely(mc->mc_db->md_flags & (MDBX_DUPSORT | MDBX_REVERSEDUP | - MDBX_INTEGERDUP | MDBX_DUPFIXED))) - return MDBX_INCOMPATIBLE; - data->iov_base = nullptr; - } + tASSERT(txn, dp->flags == P_LOOSE || is_modifable(txn, dp)); + if (dp->flags == P_LOOSE) { + loose += 1; + } else if (unlikely(!is_modifable(txn, dp))) + return false; - if (unlikely(mc->mc_txn->mt_flags & (MDBX_TXN_RDONLY | MDBX_TXN_BLOCKED))) - return (mc->mc_txn->mt_flags & MDBX_TXN_RDONLY) ? 
MDBX_EACCESS - : MDBX_BAD_TXN; + const unsigned num = dpl_npages(dl, i); + pages += num; + tASSERT(txn, txn->geo.first_unallocated >= dp->pgno + num); + if (unlikely(txn->geo.first_unallocated < dp->pgno + num)) + return false; - return cursor_put_checklen(mc, key, data, flags); -} + if (i < dl->sorted) { + tASSERT(txn, dl->items[i + 1].pgno >= dp->pgno + num); + if (unlikely(dl->items[i + 1].pgno < dp->pgno + num)) + return false; + } -int mdbx_cursor_del(MDBX_cursor *mc, MDBX_put_flags_t flags) { - if (unlikely(!mc)) - return MDBX_EINVAL; + const size_t rpa = + pnl_search(txn->tw.relist, dp->pgno, txn->geo.first_unallocated); + tASSERT(txn, rpa > MDBX_PNL_GETSIZE(txn->tw.relist) || + txn->tw.relist[rpa] != dp->pgno); + if (rpa <= MDBX_PNL_GETSIZE(txn->tw.relist) && + unlikely(txn->tw.relist[rpa] == dp->pgno)) + return false; + if (num > 1) { + const size_t rpb = pnl_search(txn->tw.relist, dp->pgno + num - 1, + txn->geo.first_unallocated); + tASSERT(txn, rpa == rpb); + if (unlikely(rpa != rpb)) + return false; + } + } - if (unlikely(mc->mc_signature != MDBX_MC_LIVE)) - return (mc->mc_signature == MDBX_MC_READY4CLOSE) ? 
MDBX_EINVAL - : MDBX_EBADSIGN; + tASSERT(txn, loose == txn->tw.loose_count); + if (unlikely(loose != txn->tw.loose_count)) + return false; - int rc = check_txn_rw(mc->mc_txn, MDBX_TXN_BLOCKED); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; + tASSERT(txn, pages == dl->pages_including_loose); + if (unlikely(pages != dl->pages_including_loose)) + return false; - if (unlikely(dbi_changed(mc->mc_txn, mc->mc_dbi))) - return MDBX_BAD_DBI; + for (size_t i = 1; i <= MDBX_PNL_GETSIZE(txn->tw.retired_pages); ++i) { + const page_t *const dp = debug_dpl_find(txn, txn->tw.retired_pages[i]); + tASSERT(txn, !dp); + if (unlikely(dp)) + return false; + } - if (unlikely(!(mc->mc_flags & C_INITIALIZED))) - return MDBX_ENODATA; + return true; +} - if (unlikely(mc->mc_ki[mc->mc_top] >= page_numkeys(mc->mc_pg[mc->mc_top]))) - return MDBX_NOTFOUND; +/*----------------------------------------------------------------------------*/ - return cursor_del(mc, flags); +__noinline void dpl_lru_reduce(MDBX_txn *txn) { + NOTICE("lru-reduce %u -> %u", txn->tw.dirtylru, txn->tw.dirtylru >> 1); + tASSERT(txn, (txn->flags & (MDBX_TXN_RDONLY | MDBX_WRITEMAP)) == 0); + do { + txn->tw.dirtylru >>= 1; + dpl_t *dl = txn->tw.dirtylist; + for (size_t i = 1; i <= dl->length; ++i) { + size_t *const ptr = + ptr_disp(dl->items[i].ptr, -(ptrdiff_t)sizeof(size_t)); + *ptr >>= 1; + } + txn = txn->parent; + } while (txn); } -static __hot int cursor_del(MDBX_cursor *mc, MDBX_put_flags_t flags) { - cASSERT(mc, mc->mc_flags & C_INITIALIZED); - cASSERT(mc, mc->mc_ki[mc->mc_top] < page_numkeys(mc->mc_pg[mc->mc_top])); +void dpl_sift(MDBX_txn *const txn, pnl_t pl, const bool spilled) { + tASSERT(txn, (txn->flags & MDBX_TXN_RDONLY) == 0); + tASSERT(txn, (txn->flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC); + if (MDBX_PNL_GETSIZE(pl) && txn->tw.dirtylist->length) { + tASSERT(txn, pnl_check_allocated(pl, (size_t)txn->geo.first_unallocated + << spilled)); + dpl_t *dl = dpl_sort(txn); - int rc = cursor_touch(mc, nullptr, 
nullptr); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; + /* Scanning in ascend order */ + const intptr_t step = MDBX_PNL_ASCENDING ? 1 : -1; + const intptr_t begin = MDBX_PNL_ASCENDING ? 1 : MDBX_PNL_GETSIZE(pl); + const intptr_t end = MDBX_PNL_ASCENDING ? MDBX_PNL_GETSIZE(pl) + 1 : 0; + tASSERT(txn, pl[begin] <= pl[end - step]); - MDBX_page *mp = mc->mc_pg[mc->mc_top]; - cASSERT(mc, IS_MODIFIABLE(mc->mc_txn, mp)); - if (!MDBX_DISABLE_VALIDATION && unlikely(!CHECK_LEAF_TYPE(mc, mp))) { - ERROR("unexpected leaf-page #%" PRIaPGNO " type 0x%x seen by cursor", - mp->mp_pgno, mp->mp_flags); - return MDBX_CORRUPTED; - } - if (IS_LEAF2(mp)) - goto del_key; + size_t w, r = dpl_search(txn, pl[begin] >> spilled); + tASSERT(txn, dl->sorted == dl->length); + for (intptr_t i = begin; r <= dl->length;) { /* scan loop */ + assert(i != end); + tASSERT(txn, !spilled || (pl[i] & 1) == 0); + pgno_t pl_pgno = pl[i] >> spilled; + pgno_t dp_pgno = dl->items[r].pgno; + if (likely(dp_pgno != pl_pgno)) { + const bool cmp = dp_pgno < pl_pgno; + r += cmp; + i += cmp ? 
0 : step; + if (likely(i != end)) + continue; + return; + } - MDBX_node *node = page_node(mp, mc->mc_ki[mc->mc_top]); - if (node_flags(node) & F_DUPDATA) { - if (flags & (MDBX_ALLDUPS | /* for compatibility */ MDBX_NODUPDATA)) { - /* will subtract the final entry later */ - mc->mc_db->md_entries -= mc->mc_xcursor->mx_db.md_entries - 1; - mc->mc_xcursor->mx_cursor.mc_flags &= ~C_INITIALIZED; - } else { - if (!(node_flags(node) & F_SUBDATA)) - mc->mc_xcursor->mx_cursor.mc_pg[0] = node_data(node); - rc = cursor_del(&mc->mc_xcursor->mx_cursor, 0); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; - /* If sub-DB still has entries, we're done */ - if (mc->mc_xcursor->mx_db.md_entries) { - if (node_flags(node) & F_SUBDATA) { - /* update subDB info */ - mc->mc_xcursor->mx_db.md_mod_txnid = mc->mc_txn->mt_txnid; - memcpy(node_data(node), &mc->mc_xcursor->mx_db, sizeof(MDBX_db)); - } else { - /* shrink sub-page */ - node = node_shrink(mp, mc->mc_ki[mc->mc_top], node); - mc->mc_xcursor->mx_cursor.mc_pg[0] = node_data(node); - /* fix other sub-DB cursors pointed at sub-pages on this page */ - for (MDBX_cursor *m2 = mc->mc_txn->mt_cursors[mc->mc_dbi]; m2; - m2 = m2->mc_next) { - if (m2 == mc || m2->mc_snum < mc->mc_snum) - continue; - if (!(m2->mc_flags & C_INITIALIZED)) - continue; - if (m2->mc_pg[mc->mc_top] == mp) { - MDBX_node *inner = node; - if (m2->mc_ki[mc->mc_top] >= page_numkeys(mp)) - continue; - if (m2->mc_ki[mc->mc_top] != mc->mc_ki[mc->mc_top]) { - inner = page_node(mp, m2->mc_ki[mc->mc_top]); - if (node_flags(inner) & F_SUBDATA) - continue; - } - m2->mc_xcursor->mx_cursor.mc_pg[0] = node_data(inner); - } - } - } - mc->mc_db->md_entries--; - cASSERT(mc, mc->mc_db->md_entries > 0 && mc->mc_db->md_depth > 0 && - mc->mc_db->md_root != P_INVALID); - return rc; + /* update loop */ + unsigned npages; + w = r; + remove_dl: + npages = dpl_npages(dl, r); + dl->pages_including_loose -= npages; + if (!MDBX_AVOID_MSYNC || !(txn->flags & MDBX_WRITEMAP)) + 
page_shadow_release(txn->env, dl->items[r].ptr, npages); + ++r; + next_i: + i += step; + if (unlikely(i == end)) { + while (r <= dl->length) + dl->items[w++] = dl->items[r++]; } else { - mc->mc_xcursor->mx_cursor.mc_flags &= ~C_INITIALIZED; + while (r <= dl->length) { + assert(i != end); + tASSERT(txn, !spilled || (pl[i] & 1) == 0); + pl_pgno = pl[i] >> spilled; + dp_pgno = dl->items[r].pgno; + if (dp_pgno < pl_pgno) + dl->items[w++] = dl->items[r++]; + else if (dp_pgno > pl_pgno) + goto next_i; + else + goto remove_dl; + } } - /* otherwise fall thru and delete the sub-DB */ - } - - if (node_flags(node) & F_SUBDATA) { - /* add all the child DB's pages to the free list */ - rc = drop_tree(&mc->mc_xcursor->mx_cursor, false); - if (unlikely(rc)) - goto fail; + dl->sorted = dpl_setlen(dl, w - 1); + txn->tw.dirtyroom += r - w; + tASSERT(txn, txn->tw.dirtyroom + txn->tw.dirtylist->length == + (txn->parent ? txn->parent->tw.dirtyroom + : txn->env->options.dp_limit)); + return; } } - /* MDBX passes F_SUBDATA in 'flags' to delete a DB record */ - else if (unlikely((node_flags(node) ^ flags) & F_SUBDATA)) - return MDBX_INCOMPATIBLE; +} - /* add large/overflow pages to free list */ - if (node_flags(node) & F_BIGDATA) { - pgr_t lp = page_get_large(mc, node_largedata_pgno(node), mp->mp_txnid); - if (unlikely((rc = lp.err) || (rc = page_retire(mc, lp.page)))) - goto fail; - } +void dpl_release_shadows(MDBX_txn *txn) { + tASSERT(txn, (txn->flags & (MDBX_TXN_RDONLY | MDBX_WRITEMAP)) == 0); + MDBX_env *env = txn->env; + dpl_t *const dl = txn->tw.dirtylist; -del_key: - mc->mc_db->md_entries--; - const MDBX_dbi dbi = mc->mc_dbi; - indx_t ki = mc->mc_ki[mc->mc_top]; - mp = mc->mc_pg[mc->mc_top]; - cASSERT(mc, IS_LEAF(mp)); - node_del(mc, mc->mc_db->md_xsize); + for (size_t i = 1; i <= dl->length; i++) + page_shadow_release(env, dl->items[i].ptr, dpl_npages(dl, i)); - /* Adjust other cursors pointing to mp */ - for (MDBX_cursor *m2 = mc->mc_txn->mt_cursors[dbi]; m2; m2 = m2->mc_next) { 
- MDBX_cursor *m3 = (mc->mc_flags & C_SUB) ? &m2->mc_xcursor->mx_cursor : m2; - if (m3 == mc || !(m2->mc_flags & m3->mc_flags & C_INITIALIZED)) - continue; - if (m3->mc_snum < mc->mc_snum) - continue; - if (m3->mc_pg[mc->mc_top] == mp) { - if (m3->mc_ki[mc->mc_top] == ki) { - m3->mc_flags |= C_DEL; - if (mc->mc_db->md_flags & MDBX_DUPSORT) { - /* Sub-cursor referred into dataset which is gone */ - m3->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED | C_EOF); - } - continue; - } else if (m3->mc_ki[mc->mc_top] > ki) { - m3->mc_ki[mc->mc_top]--; - } - if (XCURSOR_INITED(m3)) - XCURSOR_REFRESH(m3, m3->mc_pg[mc->mc_top], m3->mc_ki[mc->mc_top]); - } - } + dpl_clear(dl); +} +/// \copyright SPDX-License-Identifier: Apache-2.0 +/// \author Леонид Юрьев aka Leonid Yuriev \date 2015-2024 - rc = rebalance(mc); + +__cold int dxb_read_header(MDBX_env *env, meta_t *dest, const int lck_exclusive, + const mdbx_mode_t mode_bits) { + memset(dest, 0, sizeof(meta_t)); + int rc = osal_filesize(env->lazy_fd, &env->dxb_mmap.filesize); if (unlikely(rc != MDBX_SUCCESS)) - goto fail; + return rc; - if (unlikely(!mc->mc_snum)) { - /* DB is totally empty now, just bail out. - * Other cursors adjustments were already done - * by rebalance and aren't needed here. */ - cASSERT(mc, mc->mc_db->md_entries == 0 && mc->mc_db->md_depth == 0 && - mc->mc_db->md_root == P_INVALID); - mc->mc_flags |= C_EOF; - return MDBX_SUCCESS; - } + unaligned_poke_u64(4, dest->sign, DATASIGN_WEAK); + rc = MDBX_CORRUPTED; - ki = mc->mc_ki[mc->mc_top]; - mp = mc->mc_pg[mc->mc_top]; - cASSERT(mc, IS_LEAF(mc->mc_pg[mc->mc_top])); - size_t nkeys = page_numkeys(mp); - cASSERT(mc, (mc->mc_db->md_entries > 0 && nkeys > 0) || - ((mc->mc_flags & C_SUB) && mc->mc_db->md_entries == 0 && - nkeys == 0)); + /* Read twice all meta pages so we can find the latest one. */ + unsigned loop_limit = NUM_METAS * 2; + /* We don't know the page size on first time. So, just guess it. 
*/ + unsigned guess_pagesize = 0; + for (unsigned loop_count = 0; loop_count < loop_limit; ++loop_count) { + const unsigned meta_number = loop_count % NUM_METAS; + const unsigned offset = + (guess_pagesize ? guess_pagesize + : (loop_count > NUM_METAS) ? env->ps + : globals.sys_pagesize) * + meta_number; - /* Adjust this and other cursors pointing to mp */ - for (MDBX_cursor *m2 = mc->mc_txn->mt_cursors[dbi]; m2; m2 = m2->mc_next) { - MDBX_cursor *m3 = (mc->mc_flags & C_SUB) ? &m2->mc_xcursor->mx_cursor : m2; - if (!(m2->mc_flags & m3->mc_flags & C_INITIALIZED)) - continue; - if (m3->mc_snum < mc->mc_snum) - continue; - if (m3->mc_pg[mc->mc_top] == mp) { - /* if m3 points past last node in page, find next sibling */ - if (m3->mc_ki[mc->mc_top] >= nkeys) { - rc = cursor_sibling(m3, SIBLING_RIGHT); - if (rc == MDBX_NOTFOUND) { - m3->mc_flags |= C_EOF; - rc = MDBX_SUCCESS; + char buffer[MDBX_MIN_PAGESIZE]; + unsigned retryleft = 42; + while (1) { + TRACE("reading meta[%d]: offset %u, bytes %u, retry-left %u", meta_number, + offset, MDBX_MIN_PAGESIZE, retryleft); + int err = osal_pread(env->lazy_fd, buffer, MDBX_MIN_PAGESIZE, offset); + if (err == MDBX_ENODATA && offset == 0 && loop_count == 0 && + env->dxb_mmap.filesize == 0 && + mode_bits /* non-zero for DB creation */ != 0) { + NOTICE("read meta: empty file (%d, %s)", err, mdbx_strerror(err)); + return err; + } +#if defined(_WIN32) || defined(_WIN64) + if (err == ERROR_LOCK_VIOLATION) { + SleepEx(0, true); + err = osal_pread(env->lazy_fd, buffer, MDBX_MIN_PAGESIZE, offset); + if (err == ERROR_LOCK_VIOLATION && --retryleft) { + WARNING("read meta[%u,%u]: %i, %s", offset, MDBX_MIN_PAGESIZE, err, + mdbx_strerror(err)); continue; } - if (unlikely(rc != MDBX_SUCCESS)) - goto fail; } - if (m3->mc_ki[mc->mc_top] >= ki || - /* moved to right sibling */ m3->mc_pg[mc->mc_top] != mp) { - if (m3->mc_xcursor && !(m3->mc_flags & C_EOF)) { - node = page_node(m3->mc_pg[m3->mc_top], m3->mc_ki[m3->mc_top]); - /* If this node has 
dupdata, it may need to be reinited - * because its data has moved. - * If the xcursor was not inited it must be reinited. - * Else if node points to a subDB, nothing is needed. */ - if (node_flags(node) & F_DUPDATA) { - if (m3->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED) { - if (!(node_flags(node) & F_SUBDATA)) - m3->mc_xcursor->mx_cursor.mc_pg[0] = node_data(node); - } else { - rc = cursor_xinit1(m3, node, m3->mc_pg[m3->mc_top]); - if (unlikely(rc != MDBX_SUCCESS)) - goto fail; - rc = cursor_first(&m3->mc_xcursor->mx_cursor, NULL, NULL); - if (unlikely(rc != MDBX_SUCCESS)) - goto fail; - } - } - m3->mc_xcursor->mx_cursor.mc_flags |= C_DEL; - } - m3->mc_flags |= C_DEL; +#endif /* Windows */ + if (err != MDBX_SUCCESS) { + ERROR("read meta[%u,%u]: %i, %s", offset, MDBX_MIN_PAGESIZE, err, + mdbx_strerror(err)); + return err; } - } - } - cASSERT(mc, rc == MDBX_SUCCESS); - if (AUDIT_ENABLED()) - rc = cursor_check(mc); - return rc; + char again[MDBX_MIN_PAGESIZE]; + err = osal_pread(env->lazy_fd, again, MDBX_MIN_PAGESIZE, offset); +#if defined(_WIN32) || defined(_WIN64) + if (err == ERROR_LOCK_VIOLATION) { + SleepEx(0, true); + err = osal_pread(env->lazy_fd, again, MDBX_MIN_PAGESIZE, offset); + if (err == ERROR_LOCK_VIOLATION && --retryleft) { + WARNING("read meta[%u,%u]: %i, %s", offset, MDBX_MIN_PAGESIZE, err, + mdbx_strerror(err)); + continue; + } + } +#endif /* Windows */ + if (err != MDBX_SUCCESS) { + ERROR("read meta[%u,%u]: %i, %s", offset, MDBX_MIN_PAGESIZE, err, + mdbx_strerror(err)); + return err; + } -fail: - mc->mc_txn->mt_flags |= MDBX_TXN_ERROR; - return rc; -} + if (memcmp(buffer, again, MDBX_MIN_PAGESIZE) == 0 || --retryleft == 0) + break; -/* Allocate and initialize new pages for a database. - * Set MDBX_TXN_ERROR on failure. 
*/ -static pgr_t page_new(MDBX_cursor *mc, const unsigned flags) { - cASSERT(mc, (flags & P_OVERFLOW) == 0); - pgr_t ret = page_alloc(mc); - if (unlikely(ret.err != MDBX_SUCCESS)) - return ret; + VERBOSE("meta[%u] was updated, re-read it", meta_number); + } - DEBUG("db %u allocated new page %" PRIaPGNO, mc->mc_dbi, ret.page->mp_pgno); - ret.page->mp_flags = (uint16_t)flags; - cASSERT(mc, *mc->mc_dbi_state & DBI_DIRTY); - cASSERT(mc, mc->mc_txn->mt_flags & MDBX_TXN_DIRTY); -#if MDBX_ENABLE_PGOP_STAT - mc->mc_txn->mt_env->me_lck->mti_pgop_stat.newly.weak += 1; -#endif /* MDBX_ENABLE_PGOP_STAT */ + if (!retryleft) { + ERROR("meta[%u] is too volatile, skip it", meta_number); + continue; + } - STATIC_ASSERT(P_BRANCH == 1); - const unsigned is_branch = flags & P_BRANCH; + page_t *const page = (page_t *)buffer; + meta_t *const meta = page_meta(page); + rc = meta_validate(env, meta, page, meta_number, &guess_pagesize); + if (rc != MDBX_SUCCESS) + continue; - ret.page->mp_lower = 0; - ret.page->mp_upper = (indx_t)(mc->mc_txn->mt_env->me_psize - PAGEHDRSZ); - mc->mc_db->md_branch_pages += is_branch; - mc->mc_db->md_leaf_pages += 1 - is_branch; - if (unlikely(mc->mc_flags & C_SUB)) { - MDBX_db *outer = outer_db(mc); - outer->md_branch_pages += is_branch; - outer->md_leaf_pages += 1 - is_branch; + bool latch; + if (env->stuck_meta >= 0) + latch = (meta_number == (unsigned)env->stuck_meta); + else if (meta_bootid_match(meta)) + latch = meta_choice_recent( + meta->unsafe_txnid, SIGN_IS_STEADY(meta->unsafe_sign), + dest->unsafe_txnid, SIGN_IS_STEADY(dest->unsafe_sign)); + else + latch = meta_choice_steady( + meta->unsafe_txnid, SIGN_IS_STEADY(meta->unsafe_sign), + dest->unsafe_txnid, SIGN_IS_STEADY(dest->unsafe_sign)); + if (latch) { + *dest = *meta; + if (!lck_exclusive && !meta_is_steady(dest)) + loop_limit += 1; /* LY: should re-read to hush race with update */ + VERBOSE("latch meta[%u]", meta_number); + } } - return ret; -} - -static pgr_t page_new_large(MDBX_cursor *mc, 
const size_t npages) { - pgr_t ret = likely(npages == 1) - ? page_alloc(mc) - : page_alloc_slowpath(mc, npages, MDBX_ALLOC_DEFAULT); - if (unlikely(ret.err != MDBX_SUCCESS)) - return ret; - DEBUG("db %u allocated new large-page %" PRIaPGNO ", num %zu", mc->mc_dbi, - ret.page->mp_pgno, npages); - ret.page->mp_flags = P_OVERFLOW; - cASSERT(mc, *mc->mc_dbi_state & DBI_DIRTY); - cASSERT(mc, mc->mc_txn->mt_flags & MDBX_TXN_DIRTY); -#if MDBX_ENABLE_PGOP_STAT - mc->mc_txn->mt_env->me_lck->mti_pgop_stat.newly.weak += npages; -#endif /* MDBX_ENABLE_PGOP_STAT */ + if (dest->pagesize == 0 || + (env->stuck_meta < 0 && + !(meta_is_steady(dest) || + meta_weak_acceptable(env, dest, lck_exclusive)))) { + ERROR("%s", "no usable meta-pages, database is corrupted"); + if (rc == MDBX_SUCCESS) { + /* TODO: try to restore the database by fully checking b-tree structure + * for the each meta page, if the corresponding option was given */ + return MDBX_CORRUPTED; + } + return rc; + } - mc->mc_db->md_overflow_pages += (pgno_t)npages; - ret.page->mp_pages = (pgno_t)npages; - cASSERT(mc, !(mc->mc_flags & C_SUB)); - return ret; + return MDBX_SUCCESS; } -__hot static int __must_check_result node_add_leaf2(MDBX_cursor *mc, - size_t indx, - const MDBX_val *key) { - MDBX_page *mp = mc->mc_pg[mc->mc_top]; - MDBX_ANALYSIS_ASSUME(key != nullptr); - DKBUF_DEBUG; - DEBUG("add to leaf2-%spage %" PRIaPGNO " index %zi, " - " key size %" PRIuPTR " [%s]", - IS_SUBP(mp) ? "sub-" : "", mp->mp_pgno, indx, key ? 
key->iov_len : 0, - DKEY_DEBUG(key)); - - cASSERT(mc, key); - cASSERT(mc, PAGETYPE_COMPAT(mp) == (P_LEAF | P_LEAF2)); - const size_t ksize = mc->mc_db->md_xsize; - cASSERT(mc, ksize == key->iov_len); - const size_t nkeys = page_numkeys(mp); - cASSERT(mc, (((ksize & page_numkeys(mp)) ^ mp->mp_upper) & 1) == 0); +__cold int dxb_resize(MDBX_env *const env, const pgno_t used_pgno, + const pgno_t size_pgno, pgno_t limit_pgno, + const enum resize_mode mode) { + /* Acquire guard to avoid collision between read and write txns + * around geo_in_bytes and dxb_mmap */ +#if defined(_WIN32) || defined(_WIN64) + imports.srwl_AcquireExclusive(&env->remap_guard); + int rc = MDBX_SUCCESS; + mdbx_handle_array_t *suspended = nullptr; + mdbx_handle_array_t array_onstack; +#else + int rc = osal_fastmutex_acquire(&env->remap_guard); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; +#endif - /* Just using these for counting */ - const intptr_t lower = mp->mp_lower + sizeof(indx_t); - const intptr_t upper = mp->mp_upper - (ksize - sizeof(indx_t)); - if (unlikely(lower > upper)) { - mc->mc_txn->mt_flags |= MDBX_TXN_ERROR; - return MDBX_PAGE_FULL; + const size_t prev_size = env->dxb_mmap.current; + const size_t prev_limit = env->dxb_mmap.limit; + const pgno_t prev_limit_pgno = bytes2pgno(env, prev_limit); + eASSERT(env, limit_pgno >= size_pgno); + eASSERT(env, size_pgno >= used_pgno); + if (mode < explicit_resize && size_pgno <= prev_limit_pgno) { + /* The actual mapsize may be less since the geo.upper may be changed + * by other process. Avoids remapping until it necessary. 
*/ + limit_pgno = prev_limit_pgno; } - mp->mp_lower = (indx_t)lower; - mp->mp_upper = (indx_t)upper; + const size_t limit_bytes = pgno_align2os_bytes(env, limit_pgno); + const size_t size_bytes = pgno_align2os_bytes(env, size_pgno); +#if MDBX_ENABLE_MADVISE || defined(ENABLE_MEMCHECK) + const void *const prev_map = env->dxb_mmap.base; +#endif /* MDBX_ENABLE_MADVISE || ENABLE_MEMCHECK */ - void *const ptr = page_leaf2key(mp, indx, ksize); - cASSERT(mc, nkeys >= indx); - const size_t diff = nkeys - indx; - if (likely(diff > 0)) - /* Move higher keys up one slot. */ - memmove(ptr_disp(ptr, ksize), ptr, diff * ksize); - /* insert new key */ - memcpy(ptr, key->iov_base, ksize); + VERBOSE("resize/%d datafile/mapping: " + "present %" PRIuPTR " -> %" PRIuPTR ", " + "limit %" PRIuPTR " -> %" PRIuPTR, + mode, prev_size, size_bytes, prev_limit, limit_bytes); - cASSERT(mc, (((ksize & page_numkeys(mp)) ^ mp->mp_upper) & 1) == 0); - return MDBX_SUCCESS; -} + eASSERT(env, limit_bytes >= size_bytes); + eASSERT(env, bytes2pgno(env, size_bytes) >= size_pgno); + eASSERT(env, bytes2pgno(env, limit_bytes) >= limit_pgno); -static int __must_check_result node_add_branch(MDBX_cursor *mc, size_t indx, - const MDBX_val *key, - pgno_t pgno) { - MDBX_page *mp = mc->mc_pg[mc->mc_top]; - DKBUF_DEBUG; - DEBUG("add to branch-%spage %" PRIaPGNO " index %zi, node-pgno %" PRIaPGNO - " key size %" PRIuPTR " [%s]", - IS_SUBP(mp) ? "sub-" : "", mp->mp_pgno, indx, pgno, - key ? key->iov_len : 0, DKEY_DEBUG(key)); + unsigned mresize_flags = + env->flags & (MDBX_RDONLY | MDBX_WRITEMAP | MDBX_UTTERLY_NOSYNC); + if (mode >= impilict_shrink) + mresize_flags |= txn_shrink_allowed; - cASSERT(mc, PAGETYPE_WHOLE(mp) == P_BRANCH); - STATIC_ASSERT(NODESIZE % 2 == 0); + if (limit_bytes == env->dxb_mmap.limit && + size_bytes == env->dxb_mmap.current && + size_bytes == env->dxb_mmap.filesize) + goto bailout; - /* Move higher pointers up one slot. 
*/ - const size_t nkeys = page_numkeys(mp); - cASSERT(mc, nkeys >= indx); - for (size_t i = nkeys; i > indx; --i) - mp->mp_ptrs[i] = mp->mp_ptrs[i - 1]; + /* При использовании MDBX_NOSTICKYTHREADS с транзакциями могут работать любые + * потоки и у нас нет информации о том, какие именно. Поэтому нет возможности + * выполнить remap-действия требующие приостановки работающих с БД потоков. */ + if ((env->flags & MDBX_NOSTICKYTHREADS) == 0) { +#if defined(_WIN32) || defined(_WIN64) + if ((size_bytes < env->dxb_mmap.current && mode > implicit_grow) || + limit_bytes != env->dxb_mmap.limit) { + /* 1) Windows allows only extending a read-write section, but not a + * corresponding mapped view. Therefore in other cases we must suspend + * the local threads for safe remap. + * 2) At least on Windows 10 1803 the entire mapped section is unavailable + * for short time during NtExtendSection() or VirtualAlloc() execution. + * 3) Under Wine runtime environment on Linux a section extending is not + * supported. + * + * THEREFORE LOCAL THREADS SUSPENDING IS ALWAYS REQUIRED! */ + array_onstack.limit = ARRAY_LENGTH(array_onstack.handles); + array_onstack.count = 0; + suspended = &array_onstack; + rc = osal_suspend_threads_before_remap(env, &suspended); + if (rc != MDBX_SUCCESS) { + ERROR("failed suspend-for-remap: errcode %d", rc); + goto bailout; + } + mresize_flags |= (mode < explicit_resize) + ? MDBX_MRESIZE_MAY_UNMAP + : MDBX_MRESIZE_MAY_UNMAP | MDBX_MRESIZE_MAY_MOVE; + } +#else /* Windows */ + lck_t *const lck = env->lck_mmap.lck; + if (mode == explicit_resize && limit_bytes != env->dxb_mmap.limit) { + mresize_flags |= MDBX_MRESIZE_MAY_UNMAP | MDBX_MRESIZE_MAY_MOVE; + if (lck) { + int err = lck_rdt_lock(env) /* lock readers table until remap done */; + if (unlikely(MDBX_IS_ERROR(err))) { + rc = err; + goto bailout; + } - /* Adjust free space offsets. 
*/ - const size_t branch_bytes = branch_size(mc->mc_txn->mt_env, key); - const intptr_t lower = mp->mp_lower + sizeof(indx_t); - const intptr_t upper = mp->mp_upper - (branch_bytes - sizeof(indx_t)); - if (unlikely(lower > upper)) { - mc->mc_txn->mt_flags |= MDBX_TXN_ERROR; - return MDBX_PAGE_FULL; + /* looking for readers from this process */ + const size_t snap_nreaders = + atomic_load32(&lck->rdt_length, mo_AcquireRelease); + eASSERT(env, mode == explicit_resize); + for (size_t i = 0; i < snap_nreaders; ++i) { + if (lck->rdt[i].pid.weak == env->pid && + lck->rdt[i].tid.weak != osal_thread_self()) { + /* the base address of the mapping can't be changed since + * the other reader thread from this process exists. */ + lck_rdt_unlock(env); + mresize_flags &= ~(MDBX_MRESIZE_MAY_UNMAP | MDBX_MRESIZE_MAY_MOVE); + break; + } + } + } + } +#endif /* ! Windows */ } - mp->mp_lower = (indx_t)lower; - mp->mp_ptrs[indx] = mp->mp_upper = (indx_t)upper; - /* Write the node data. */ - MDBX_node *node = page_node(mp, indx); - node_set_pgno(node, pgno); - node_set_flags(node, 0); - UNALIGNED_POKE_8(node, MDBX_node, mn_extra, 0); - node_set_ks(node, 0); - if (likely(key != NULL)) { - node_set_ks(node, key->iov_len); - memcpy(node_key(node), key->iov_base, key->iov_len); + const pgno_t aligned_munlock_pgno = + (mresize_flags & (MDBX_MRESIZE_MAY_UNMAP | MDBX_MRESIZE_MAY_MOVE)) + ? 
0 + : bytes2pgno(env, size_bytes); + if (mresize_flags & (MDBX_MRESIZE_MAY_UNMAP | MDBX_MRESIZE_MAY_MOVE)) { + mincore_clean_cache(env); + if ((env->flags & MDBX_WRITEMAP) && env->lck->unsynced_pages.weak) { +#if MDBX_ENABLE_PGOP_STAT + env->lck->pgops.msync.weak += 1; +#endif /* MDBX_ENABLE_PGOP_STAT */ + rc = osal_msync(&env->dxb_mmap, 0, pgno_align2os_bytes(env, used_pgno), + MDBX_SYNC_NONE); + if (unlikely(rc != MDBX_SUCCESS)) + goto bailout; + } } - return MDBX_SUCCESS; -} - -__hot static int __must_check_result node_add_leaf(MDBX_cursor *mc, size_t indx, - const MDBX_val *key, - MDBX_val *data, - unsigned flags) { - MDBX_ANALYSIS_ASSUME(key != nullptr); - MDBX_ANALYSIS_ASSUME(data != nullptr); - MDBX_page *mp = mc->mc_pg[mc->mc_top]; - DKBUF_DEBUG; - DEBUG("add to leaf-%spage %" PRIaPGNO " index %zi, data size %" PRIuPTR - " key size %" PRIuPTR " [%s]", - IS_SUBP(mp) ? "sub-" : "", mp->mp_pgno, indx, data ? data->iov_len : 0, - key ? key->iov_len : 0, DKEY_DEBUG(key)); - cASSERT(mc, key != NULL && data != NULL); - cASSERT(mc, PAGETYPE_COMPAT(mp) == P_LEAF); - MDBX_page *largepage = NULL; + munlock_after(env, aligned_munlock_pgno, size_bytes); - size_t node_bytes; - if (unlikely(flags & F_BIGDATA)) { - /* Data already on large/overflow page. */ - STATIC_ASSERT(sizeof(pgno_t) % 2 == 0); - node_bytes = - node_size_len(key->iov_len, 0) + sizeof(pgno_t) + sizeof(indx_t); - cASSERT(mc, page_room(mp) >= node_bytes); - } else if (unlikely(node_size(key, data) > - mc->mc_txn->mt_env->me_leaf_nodemax)) { - /* Put data on large/overflow page. 
*/ - if (unlikely(mc->mc_db->md_flags & MDBX_DUPSORT)) { - ERROR("Unexpected target %s flags 0x%x for large data-item", "dupsort-db", - mc->mc_db->md_flags); - return MDBX_PROBLEM; - } - if (unlikely(flags & (F_DUPDATA | F_SUBDATA))) { - ERROR("Unexpected target %s flags 0x%x for large data-item", "node", - flags); - return MDBX_PROBLEM; - } - cASSERT(mc, page_room(mp) >= leaf_size(mc->mc_txn->mt_env, key, data)); - const pgno_t ovpages = number_of_ovpages(mc->mc_txn->mt_env, data->iov_len); - const pgr_t npr = page_new_large(mc, ovpages); - if (unlikely(npr.err != MDBX_SUCCESS)) - return npr.err; - largepage = npr.page; - DEBUG("allocated %u large/overflow page(s) %" PRIaPGNO "for %" PRIuPTR - " data bytes", - largepage->mp_pages, largepage->mp_pgno, data->iov_len); - flags |= F_BIGDATA; - node_bytes = - node_size_len(key->iov_len, 0) + sizeof(pgno_t) + sizeof(indx_t); - cASSERT(mc, node_bytes == leaf_size(mc->mc_txn->mt_env, key, data)); - } else { - cASSERT(mc, page_room(mp) >= leaf_size(mc->mc_txn->mt_env, key, data)); - node_bytes = node_size(key, data) + sizeof(indx_t); - cASSERT(mc, node_bytes == leaf_size(mc->mc_txn->mt_env, key, data)); +#if MDBX_ENABLE_MADVISE + if (size_bytes < prev_size && mode > implicit_grow) { + NOTICE("resize-MADV_%s %u..%u", + (env->flags & MDBX_WRITEMAP) ? "REMOVE" : "DONTNEED", size_pgno, + bytes2pgno(env, prev_size)); + const uint32_t munlocks_before = + atomic_load32(&env->lck->mlcnt[1], mo_Relaxed); + rc = MDBX_RESULT_TRUE; +#if defined(MADV_REMOVE) + if (env->flags & MDBX_WRITEMAP) + rc = madvise(ptr_disp(env->dxb_mmap.base, size_bytes), + prev_size - size_bytes, MADV_REMOVE) + ? ignore_enosys(errno) + : MDBX_SUCCESS; +#endif /* MADV_REMOVE */ +#if defined(MADV_DONTNEED) + if (rc == MDBX_RESULT_TRUE) + rc = madvise(ptr_disp(env->dxb_mmap.base, size_bytes), + prev_size - size_bytes, MADV_DONTNEED) + ? 
ignore_enosys(errno) + : MDBX_SUCCESS; +#elif defined(POSIX_MADV_DONTNEED) + if (rc == MDBX_RESULT_TRUE) + rc = ignore_enosys(posix_madvise(ptr_disp(env->dxb_mmap.base, size_bytes), + prev_size - size_bytes, + POSIX_MADV_DONTNEED)); +#elif defined(POSIX_FADV_DONTNEED) + if (rc == MDBX_RESULT_TRUE) + rc = ignore_enosys(posix_fadvise(env->lazy_fd, size_bytes, + prev_size - size_bytes, + POSIX_FADV_DONTNEED)); +#endif /* MADV_DONTNEED */ + if (unlikely(MDBX_IS_ERROR(rc))) { + const uint32_t mlocks_after = + atomic_load32(&env->lck->mlcnt[0], mo_Relaxed); + if (rc == MDBX_EINVAL) { + const int severity = + (mlocks_after - munlocks_before) ? MDBX_LOG_NOTICE : MDBX_LOG_WARN; + if (LOG_ENABLED(severity)) + debug_log(severity, __func__, __LINE__, + "%s-madvise: ignore EINVAL (%d) since some pages maybe " + "locked (%u/%u mlcnt-processes)", + "resize", rc, mlocks_after, munlocks_before); + } else { + ERROR("%s-madvise(%s, %zu, +%zu), %u/%u mlcnt-processes, err %d", + "mresize", "DONTNEED", size_bytes, prev_size - size_bytes, + mlocks_after, munlocks_before, rc); + goto bailout; + } + } else + env->lck->discarded_tail.weak = size_pgno; } +#endif /* MDBX_ENABLE_MADVISE */ - /* Move higher pointers up one slot. */ - const size_t nkeys = page_numkeys(mp); - cASSERT(mc, nkeys >= indx); - for (size_t i = nkeys; i > indx; --i) - mp->mp_ptrs[i] = mp->mp_ptrs[i - 1]; + rc = osal_mresize(mresize_flags, &env->dxb_mmap, size_bytes, limit_bytes); + eASSERT(env, env->dxb_mmap.limit >= env->dxb_mmap.current); - /* Adjust free space offsets. 
*/ - const intptr_t lower = mp->mp_lower + sizeof(indx_t); - const intptr_t upper = mp->mp_upper - (node_bytes - sizeof(indx_t)); - if (unlikely(lower > upper)) { - mc->mc_txn->mt_flags |= MDBX_TXN_ERROR; - return MDBX_PAGE_FULL; +#if MDBX_ENABLE_MADVISE + if (rc == MDBX_SUCCESS) { + eASSERT(env, limit_bytes == env->dxb_mmap.limit); + eASSERT(env, size_bytes <= env->dxb_mmap.filesize); + if (mode == explicit_resize) + eASSERT(env, size_bytes == env->dxb_mmap.current); + else + eASSERT(env, size_bytes <= env->dxb_mmap.current); + env->lck->discarded_tail.weak = size_pgno; + const bool readahead = + !(env->flags & MDBX_NORDAHEAD) && + mdbx_is_readahead_reasonable(size_bytes, -(intptr_t)prev_size); + const bool force = limit_bytes != prev_limit || + env->dxb_mmap.base != prev_map +#if defined(_WIN32) || defined(_WIN64) + || prev_size > size_bytes +#endif /* Windows */ + ; + rc = dxb_set_readahead(env, size_pgno, readahead, force); } - mp->mp_lower = (indx_t)lower; - mp->mp_ptrs[indx] = mp->mp_upper = (indx_t)upper; - - /* Write the node data. 
*/ - MDBX_node *node = page_node(mp, indx); - node_set_ks(node, key->iov_len); - node_set_flags(node, (uint8_t)flags); - UNALIGNED_POKE_8(node, MDBX_node, mn_extra, 0); - node_set_ds(node, data->iov_len); - memcpy(node_key(node), key->iov_base, key->iov_len); +#endif /* MDBX_ENABLE_MADVISE */ - void *nodedata = node_data(node); - if (likely(largepage == NULL)) { - if (unlikely(flags & F_BIGDATA)) { - memcpy(nodedata, data->iov_base, sizeof(pgno_t)); - return MDBX_SUCCESS; +bailout: + if (rc == MDBX_SUCCESS) { + eASSERT(env, env->dxb_mmap.limit >= env->dxb_mmap.current); + eASSERT(env, limit_bytes == env->dxb_mmap.limit); + eASSERT(env, size_bytes <= env->dxb_mmap.filesize); + if (mode == explicit_resize) + eASSERT(env, size_bytes == env->dxb_mmap.current); + else + eASSERT(env, size_bytes <= env->dxb_mmap.current); + /* update env-geo to avoid influences */ + env->geo_in_bytes.now = env->dxb_mmap.current; + env->geo_in_bytes.upper = env->dxb_mmap.limit; + env_options_adjust_defaults(env); +#ifdef ENABLE_MEMCHECK + if (prev_limit != env->dxb_mmap.limit || prev_map != env->dxb_mmap.base) { + VALGRIND_DISCARD(env->valgrind_handle); + env->valgrind_handle = 0; + if (env->dxb_mmap.limit) + env->valgrind_handle = VALGRIND_CREATE_BLOCK( + env->dxb_mmap.base, env->dxb_mmap.limit, "mdbx"); } +#endif /* ENABLE_MEMCHECK */ } else { - poke_pgno(nodedata, largepage->mp_pgno); - nodedata = page_data(largepage); + if (rc != MDBX_UNABLE_EXTEND_MAPSIZE && rc != MDBX_EPERM) { + ERROR("failed resize datafile/mapping: " + "present %" PRIuPTR " -> %" PRIuPTR ", " + "limit %" PRIuPTR " -> %" PRIuPTR ", errcode %d", + prev_size, size_bytes, prev_limit, limit_bytes, rc); + } else { + WARNING("unable resize datafile/mapping: " + "present %" PRIuPTR " -> %" PRIuPTR ", " + "limit %" PRIuPTR " -> %" PRIuPTR ", errcode %d", + prev_size, size_bytes, prev_limit, limit_bytes, rc); + eASSERT(env, env->dxb_mmap.limit >= env->dxb_mmap.current); + } + if (!env->dxb_mmap.base) { + env->flags |= 
ENV_FATAL_ERROR; + if (env->txn) + env->txn->flags |= MDBX_TXN_ERROR; + rc = MDBX_PANIC; + } } - if (unlikely(flags & MDBX_RESERVE)) - data->iov_base = nodedata; - else if (likely(nodedata != data->iov_base && - data->iov_len /* to avoid UBSAN traps*/ != 0)) - memcpy(nodedata, data->iov_base, data->iov_len); - return MDBX_SUCCESS; -} - -/* Delete the specified node from a page. - * [in] mc Cursor pointing to the node to delete. - * [in] ksize The size of a node. Only used if the page is - * part of a MDBX_DUPFIXED database. */ -__hot static void node_del(MDBX_cursor *mc, size_t ksize) { - MDBX_page *mp = mc->mc_pg[mc->mc_top]; - const size_t hole = mc->mc_ki[mc->mc_top]; - const size_t nkeys = page_numkeys(mp); - DEBUG("delete node %zu on %s page %" PRIaPGNO, hole, - IS_LEAF(mp) ? "leaf" : "branch", mp->mp_pgno); - cASSERT(mc, hole < nkeys); - - if (IS_LEAF2(mp)) { - cASSERT(mc, ksize >= sizeof(indx_t)); - size_t diff = nkeys - 1 - hole; - void *const base = page_leaf2key(mp, hole, ksize); - if (diff) - memmove(base, ptr_disp(base, ksize), diff * ksize); - cASSERT(mc, mp->mp_lower >= sizeof(indx_t)); - mp->mp_lower -= sizeof(indx_t); - cASSERT(mc, (size_t)UINT16_MAX - mp->mp_upper >= ksize - sizeof(indx_t)); - mp->mp_upper += (indx_t)(ksize - sizeof(indx_t)); - cASSERT(mc, (((ksize & page_numkeys(mp)) ^ mp->mp_upper) & 1) == 0); - return; +#if defined(_WIN32) || defined(_WIN64) + int err = MDBX_SUCCESS; + imports.srwl_ReleaseExclusive(&env->remap_guard); + if (suspended) { + err = osal_resume_threads_after_remap(suspended); + if (suspended != &array_onstack) + osal_free(suspended); } - - MDBX_node *node = page_node(mp, hole); - cASSERT(mc, !IS_BRANCH(mp) || hole || node_ks(node) == 0); - size_t hole_size = NODESIZE + node_ks(node); - if (IS_LEAF(mp)) - hole_size += - (node_flags(node) & F_BIGDATA) ? 
sizeof(pgno_t) : node_ds(node); - hole_size = EVEN(hole_size); - - const indx_t hole_offset = mp->mp_ptrs[hole]; - size_t r, w; - for (r = w = 0; r < nkeys; r++) - if (r != hole) - mp->mp_ptrs[w++] = (mp->mp_ptrs[r] < hole_offset) - ? mp->mp_ptrs[r] + (indx_t)hole_size - : mp->mp_ptrs[r]; - - void *const base = ptr_disp(mp, mp->mp_upper + PAGEHDRSZ); - memmove(ptr_disp(base, hole_size), base, hole_offset - mp->mp_upper); - - cASSERT(mc, mp->mp_lower >= sizeof(indx_t)); - mp->mp_lower -= sizeof(indx_t); - cASSERT(mc, (size_t)UINT16_MAX - mp->mp_upper >= hole_size); - mp->mp_upper += (indx_t)hole_size; - - if (AUDIT_ENABLED()) { - const uint8_t checking = mc->mc_checking; - mc->mc_checking |= CC_UPDATING; - const int page_check_err = page_check(mc, mp); - mc->mc_checking = checking; - cASSERT(mc, page_check_err == MDBX_SUCCESS); +#else + if (env->lck_mmap.lck && + (mresize_flags & (MDBX_MRESIZE_MAY_UNMAP | MDBX_MRESIZE_MAY_MOVE)) != 0) + lck_rdt_unlock(env); + int err = osal_fastmutex_release(&env->remap_guard); +#endif /* Windows */ + if (err != MDBX_SUCCESS) { + FATAL("failed resume-after-remap: errcode %d", err); + return MDBX_PANIC; } + return rc; } - -/* Compact the main page after deleting a node on a subpage. - * [in] mp The main page to operate on. - * [in] indx The index of the subpage on the main page. 
*/ -static MDBX_node *node_shrink(MDBX_page *mp, size_t indx, MDBX_node *node) { - assert(node = page_node(mp, indx)); - MDBX_page *sp = (MDBX_page *)node_data(node); - assert(IS_SUBP(sp) && page_numkeys(sp) > 0); - const size_t delta = - EVEN_FLOOR(page_room(sp) /* avoid the node uneven-sized */); - if (unlikely(delta) == 0) - return node; - - /* Prepare to shift upward, set len = length(subpage part to shift) */ - size_t nsize = node_ds(node) - delta, len = nsize; - assert(nsize % 1 == 0); - if (!IS_LEAF2(sp)) { - len = PAGEHDRSZ; - MDBX_page *xp = ptr_disp(sp, delta); /* destination subpage */ - for (intptr_t i = page_numkeys(sp); --i >= 0;) { - assert(sp->mp_ptrs[i] >= delta); - xp->mp_ptrs[i] = (indx_t)(sp->mp_ptrs[i] - delta); +#if defined(ENABLE_MEMCHECK) || defined(__SANITIZE_ADDRESS__) +void dxb_sanitize_tail(MDBX_env *env, MDBX_txn *txn) { +#if !defined(__SANITIZE_ADDRESS__) + if (!RUNNING_ON_VALGRIND) + return; +#endif + if (txn) { /* transaction start */ + if (env->poison_edge < txn->geo.first_unallocated) + env->poison_edge = txn->geo.first_unallocated; + VALGRIND_MAKE_MEM_DEFINED(env->dxb_mmap.base, + pgno2bytes(env, txn->geo.first_unallocated)); + MDBX_ASAN_UNPOISON_MEMORY_REGION( + env->dxb_mmap.base, pgno2bytes(env, txn->geo.first_unallocated)); + /* don't touch more, it should be already poisoned */ + } else { /* transaction end */ + bool should_unlock = false; + pgno_t last = MAX_PAGENO + 1; + if (env->pid != osal_getpid()) { + /* resurrect after fork */ + return; + } else if (env->txn && env_txn0_owned(env)) { + /* inside write-txn */ + last = meta_recent(env, &env->basal_txn->tw.troika) + .ptr_v->geometry.first_unallocated; + } else if (env->flags & MDBX_RDONLY) { + /* read-only mode, no write-txn, no wlock mutex */ + last = NUM_METAS; + } else if (lck_txn_lock(env, true) == MDBX_SUCCESS) { + /* no write-txn */ + last = NUM_METAS; + should_unlock = true; + } else { + /* write txn is running, therefore shouldn't poison any memory range */ + 
return; } - } - assert(sp->mp_upper >= sp->mp_lower + delta); - sp->mp_upper -= (indx_t)delta; - sp->mp_pgno = mp->mp_pgno; - node_set_ds(node, nsize); - - /* Shift upward */ - void *const base = ptr_disp(mp, mp->mp_upper + PAGEHDRSZ); - memmove(ptr_disp(base, delta), base, ptr_dist(sp, base) + len); - const size_t pivot = mp->mp_ptrs[indx]; - for (intptr_t i = page_numkeys(mp); --i >= 0;) { - if (mp->mp_ptrs[i] <= pivot) { - assert((size_t)UINT16_MAX - mp->mp_ptrs[i] >= delta); - mp->mp_ptrs[i] += (indx_t)delta; + last = mvcc_largest_this(env, last); + const pgno_t edge = env->poison_edge; + if (edge > last) { + eASSERT(env, last >= NUM_METAS); + env->poison_edge = last; + VALGRIND_MAKE_MEM_NOACCESS( + ptr_disp(env->dxb_mmap.base, pgno2bytes(env, last)), + pgno2bytes(env, edge - last)); + MDBX_ASAN_POISON_MEMORY_REGION( + ptr_disp(env->dxb_mmap.base, pgno2bytes(env, last)), + pgno2bytes(env, edge - last)); } + if (should_unlock) + lck_txn_unlock(env); } - assert((size_t)UINT16_MAX - mp->mp_upper >= delta); - mp->mp_upper += (indx_t)delta; - - return ptr_disp(node, delta); } +#endif /* ENABLE_MEMCHECK || __SANITIZE_ADDRESS__ */ -/* Initial setup of a sorted-dups cursor. - * - * Sorted duplicates are implemented as a sub-database for the given key. - * The duplicate data items are actually keys of the sub-database. - * Operations on the duplicate data items are performed using a sub-cursor - * initialized when the sub-database is first accessed. This function does - * the preliminary setup of the sub-cursor, filling in the fields that - * depend only on the parent DB. - * - * [in] mc The main cursor whose sorted-dups cursor is to be initialized. */ -static int cursor_xinit0(MDBX_cursor *mc) { - MDBX_xcursor *mx = mc->mc_xcursor; - if (!MDBX_DISABLE_VALIDATION && unlikely(mx == nullptr)) { - ERROR("unexpected dupsort-page for non-dupsort db/cursor (dbi %u)", - mc->mc_dbi); - return MDBX_CORRUPTED; - } +#if MDBX_ENABLE_MADVISE +/* Turn on/off readahead. 
It's harmful when the DB is larger than RAM. */ +__cold int dxb_set_readahead(const MDBX_env *env, const pgno_t edge, + const bool enable, const bool force_whole) { + eASSERT(env, edge >= NUM_METAS && edge <= MAX_PAGENO + 1); + eASSERT(env, (enable & 1) == (enable != 0)); + const bool toggle = force_whole || + ((enable ^ env->lck->readahead_anchor) & 1) || + !env->lck->readahead_anchor; + const pgno_t prev_edge = env->lck->readahead_anchor >> 1; + const size_t limit = env->dxb_mmap.limit; + size_t offset = + toggle ? 0 + : pgno_align2os_bytes(env, (prev_edge < edge) ? prev_edge : edge); + offset = (offset < limit) ? offset : limit; - mx->mx_cursor.mc_xcursor = NULL; - mx->mx_cursor.mc_next = NULL; - mx->mx_cursor.mc_txn = mc->mc_txn; - mx->mx_cursor.mc_db = &mx->mx_db; - mx->mx_cursor.mc_dbx = &mx->mx_dbx; - mx->mx_cursor.mc_dbi = mc->mc_dbi; - mx->mx_cursor.mc_dbi_state = mc->mc_dbi_state; - mx->mx_cursor.mc_snum = 0; - mx->mx_cursor.mc_top = 0; - mx->mx_cursor.mc_flags = C_SUB; - STATIC_ASSERT(MDBX_DUPFIXED * 2 == P_LEAF2); - cASSERT(mc, (mc->mc_checking & (P_BRANCH | P_LEAF | P_LEAF2)) == P_LEAF); - mx->mx_cursor.mc_checking = - mc->mc_checking + ((mc->mc_db->md_flags & MDBX_DUPFIXED) << 1); - mx->mx_dbx.md_name.iov_len = 0; - mx->mx_dbx.md_name.iov_base = NULL; - mx->mx_dbx.md_cmp = mc->mc_dbx->md_dcmp; - mx->mx_dbx.md_dcmp = NULL; - mx->mx_dbx.md_klen_min = INT_MAX; - mx->mx_dbx.md_vlen_min = mx->mx_dbx.md_klen_max = mx->mx_dbx.md_vlen_max = 0; - return MDBX_SUCCESS; -} + size_t length = + pgno_align2os_bytes(env, (prev_edge < edge) ? edge : prev_edge); + length = (length < limit) ? length : limit; + length -= offset; -/* Final setup of a sorted-dups cursor. - * Sets up the fields that depend on the data from the main cursor. - * [in] mc The main cursor whose sorted-dups cursor is to be initialized. - * [in] node The data containing the MDBX_db record for the sorted-dup database. 
- */ -static int cursor_xinit1(MDBX_cursor *mc, const MDBX_node *node, - const MDBX_page *mp) { - MDBX_xcursor *mx = mc->mc_xcursor; - if (!MDBX_DISABLE_VALIDATION && unlikely(mx == nullptr)) { - ERROR("unexpected dupsort-page for non-dupsort db/cursor (dbi %u)", - mc->mc_dbi); - return MDBX_CORRUPTED; - } + eASSERT(env, 0 <= (intptr_t)length); + if (length == 0) + return MDBX_SUCCESS; - const uint8_t flags = node_flags(node); - switch (flags) { - default: - ERROR("invalid node flags %u", flags); - return MDBX_CORRUPTED; - case F_DUPDATA | F_SUBDATA: - if (!MDBX_DISABLE_VALIDATION && - unlikely(node_ds(node) != sizeof(MDBX_db))) { - ERROR("invalid nested-db record size (%zu, expect %zu)", node_ds(node), - sizeof(MDBX_db)); - return MDBX_CORRUPTED; - } - memcpy(&mx->mx_db, node_data(node), sizeof(MDBX_db)); - const txnid_t pp_txnid = mp->mp_txnid; - if (!MDBX_DISABLE_VALIDATION && - unlikely(mx->mx_db.md_mod_txnid > pp_txnid)) { - ERROR("nested-db.md_mod_txnid (%" PRIaTXN ") > page-txnid (%" PRIaTXN ")", - mx->mx_db.md_mod_txnid, pp_txnid); - return MDBX_CORRUPTED; - } - mx->mx_cursor.mc_pg[0] = 0; - mx->mx_cursor.mc_snum = 0; - mx->mx_cursor.mc_top = 0; - mx->mx_cursor.mc_flags = C_SUB; - break; - case F_DUPDATA: - if (!MDBX_DISABLE_VALIDATION && unlikely(node_ds(node) <= PAGEHDRSZ)) { - ERROR("invalid nested-page size %zu", node_ds(node)); - return MDBX_CORRUPTED; - } - MDBX_page *fp = node_data(node); - mx->mx_db.md_depth = 1; - mx->mx_db.md_branch_pages = 0; - mx->mx_db.md_leaf_pages = 1; - mx->mx_db.md_overflow_pages = 0; - mx->mx_db.md_entries = page_numkeys(fp); - mx->mx_db.md_root = fp->mp_pgno; - mx->mx_db.md_mod_txnid = mp->mp_txnid; - mx->mx_cursor.mc_snum = 1; - mx->mx_cursor.mc_top = 0; - mx->mx_cursor.mc_flags = C_SUB | C_INITIALIZED; - mx->mx_cursor.mc_pg[0] = fp; - mx->mx_cursor.mc_ki[0] = 0; - mx->mx_db.md_flags = flags_db2sub(mc->mc_db->md_flags); - mx->mx_db.md_xsize = - (mc->mc_db->md_flags & MDBX_DUPFIXED) ? 
fp->mp_leaf2_ksize : 0; - break; - } + NOTICE("readahead %s %u..%u", enable ? "ON" : "OFF", bytes2pgno(env, offset), + bytes2pgno(env, offset + length)); - if (unlikely(mx->mx_db.md_xsize != mc->mc_db->md_xsize)) { - if (!MDBX_DISABLE_VALIDATION && unlikely(mc->mc_db->md_xsize != 0)) { - ERROR("cursor mismatched nested-db md_xsize %u", mc->mc_db->md_xsize); - return MDBX_CORRUPTED; - } - if (!MDBX_DISABLE_VALIDATION && - unlikely((mc->mc_db->md_flags & MDBX_DUPFIXED) == 0)) { - ERROR("mismatched nested-db md_flags %u", mc->mc_db->md_flags); - return MDBX_CORRUPTED; - } - if (!MDBX_DISABLE_VALIDATION && - unlikely(mx->mx_db.md_xsize < mc->mc_dbx->md_vlen_min || - mx->mx_db.md_xsize > mc->mc_dbx->md_vlen_max)) { - ERROR("mismatched nested-db.md_xsize (%u) <> min/max value-length " - "(%zu/%zu)", - mx->mx_db.md_xsize, mc->mc_dbx->md_vlen_min, - mc->mc_dbx->md_vlen_max); - return MDBX_CORRUPTED; +#if defined(F_RDAHEAD) + if (toggle && unlikely(fcntl(env->lazy_fd, F_RDAHEAD, enable) == -1)) + return errno; +#endif /* F_RDAHEAD */ + + int err; + void *const ptr = ptr_disp(env->dxb_mmap.base, offset); + if (enable) { +#if defined(MADV_NORMAL) + err = + madvise(ptr, length, MADV_NORMAL) ? ignore_enosys(errno) : MDBX_SUCCESS; + if (unlikely(MDBX_IS_ERROR(err))) + return err; +#elif defined(POSIX_MADV_NORMAL) + err = ignore_enosys(posix_madvise(ptr, length, POSIX_MADV_NORMAL)); + if (unlikely(MDBX_IS_ERROR(err))) + return err; +#elif defined(POSIX_FADV_NORMAL) && defined(POSIX_FADV_WILLNEED) + err = ignore_enosys( + posix_fadvise(env->lazy_fd, offset, length, POSIX_FADV_NORMAL)); + if (unlikely(MDBX_IS_ERROR(err))) + return err; +#elif defined(_WIN32) || defined(_WIN64) + /* no madvise on Windows */ +#else +#warning "FIXME" +#endif + if (toggle) { + /* NOTE: Seems there is a bug in the Mach/Darwin/OSX kernel, + * because MADV_WILLNEED with offset != 0 may cause SIGBUS + * on following access to the hinted region. 
+ * 19.6.0 Darwin Kernel Version 19.6.0: Tue Jan 12 22:13:05 PST 2021; + * root:xnu-6153.141.16~1/RELEASE_X86_64 x86_64 */ +#if defined(F_RDADVISE) + struct radvisory hint; + hint.ra_offset = offset; + hint.ra_count = + unlikely(length > INT_MAX && sizeof(length) > sizeof(hint.ra_count)) + ? INT_MAX + : (int)length; + (void)/* Ignore ENOTTY for DB on the ram-disk and so on */ fcntl( + env->lazy_fd, F_RDADVISE, &hint); +#elif defined(MADV_WILLNEED) + err = madvise(ptr, length, MADV_WILLNEED) ? ignore_enosys(errno) + : MDBX_SUCCESS; + if (unlikely(MDBX_IS_ERROR(err))) + return err; +#elif defined(POSIX_MADV_WILLNEED) + err = ignore_enosys(posix_madvise(ptr, length, POSIX_MADV_WILLNEED)); + if (unlikely(MDBX_IS_ERROR(err))) + return err; +#elif defined(_WIN32) || defined(_WIN64) + if (imports.PrefetchVirtualMemory) { + WIN32_MEMORY_RANGE_ENTRY hint; + hint.VirtualAddress = ptr; + hint.NumberOfBytes = length; + (void)imports.PrefetchVirtualMemory(GetCurrentProcess(), 1, &hint, 0); + } +#elif defined(POSIX_FADV_WILLNEED) + err = ignore_enosys( + posix_fadvise(env->lazy_fd, offset, length, POSIX_FADV_WILLNEED)); + if (unlikely(MDBX_IS_ERROR(err))) + return err; +#else +#warning "FIXME" +#endif } - mc->mc_db->md_xsize = mx->mx_db.md_xsize; - mc->mc_dbx->md_vlen_min = mc->mc_dbx->md_vlen_max = mx->mx_db.md_xsize; + } else { + mincore_clean_cache(env); +#if defined(MADV_RANDOM) + err = + madvise(ptr, length, MADV_RANDOM) ? 
ignore_enosys(errno) : MDBX_SUCCESS; + if (unlikely(MDBX_IS_ERROR(err))) + return err; +#elif defined(POSIX_MADV_RANDOM) + err = ignore_enosys(posix_madvise(ptr, length, POSIX_MADV_RANDOM)); + if (unlikely(MDBX_IS_ERROR(err))) + return err; +#elif defined(POSIX_FADV_RANDOM) + err = ignore_enosys( + posix_fadvise(env->lazy_fd, offset, length, POSIX_FADV_RANDOM)); + if (unlikely(MDBX_IS_ERROR(err))) + return err; +#elif defined(_WIN32) || defined(_WIN64) + /* no madvise on Windows */ +#else +#warning "FIXME" +#endif /* MADV_RANDOM */ } - mx->mx_dbx.md_klen_min = mc->mc_dbx->md_vlen_min; - mx->mx_dbx.md_klen_max = mc->mc_dbx->md_vlen_max; - DEBUG("Sub-db -%u root page %" PRIaPGNO, mx->mx_cursor.mc_dbi, - mx->mx_db.md_root); - return MDBX_SUCCESS; + env->lck->readahead_anchor = (enable & 1) + (edge << 1); + err = MDBX_SUCCESS; + return err; } +#endif /* MDBX_ENABLE_MADVISE */ -/* Fixup a sorted-dups cursor due to underlying update. - * Sets up some fields that depend on the data from the main cursor. - * Almost the same as init1, but skips initialization steps if the - * xcursor had already been used. - * [in] mc The main cursor whose sorted-dups cursor is to be fixed up. - * [in] src_mx The xcursor of an up-to-date cursor. - * [in] new_dupdata True if converting from a non-F_DUPDATA item. 
*/ -static int cursor_xinit2(MDBX_cursor *mc, MDBX_xcursor *src_mx, - bool new_dupdata) { - MDBX_xcursor *mx = mc->mc_xcursor; - if (!MDBX_DISABLE_VALIDATION && unlikely(mx == nullptr)) { - ERROR("unexpected dupsort-page for non-dupsort db/cursor (dbi %u)", - mc->mc_dbi); - return MDBX_CORRUPTED; - } +__cold int dxb_setup(MDBX_env *env, const int lck_rc, + const mdbx_mode_t mode_bits) { + meta_t header; + eASSERT(env, !(env->flags & ENV_ACTIVE)); + int rc = MDBX_RESULT_FALSE; + int err = dxb_read_header(env, &header, lck_rc, mode_bits); + if (unlikely(err != MDBX_SUCCESS)) { + if (lck_rc != /* lck exclusive */ MDBX_RESULT_TRUE || err != MDBX_ENODATA || + (env->flags & MDBX_RDONLY) != 0 || + /* recovery mode */ env->stuck_meta >= 0) + return err; - if (new_dupdata) { - mx->mx_cursor.mc_snum = 1; - mx->mx_cursor.mc_top = 0; - mx->mx_cursor.mc_flags = C_SUB | C_INITIALIZED; - mx->mx_cursor.mc_ki[0] = 0; - } + DEBUG("%s", "create new database"); + rc = /* new database */ MDBX_RESULT_TRUE; - mx->mx_dbx.md_klen_min = src_mx->mx_dbx.md_klen_min; - mx->mx_dbx.md_klen_max = src_mx->mx_dbx.md_klen_max; - mx->mx_dbx.md_cmp = src_mx->mx_dbx.md_cmp; - mx->mx_db = src_mx->mx_db; - mx->mx_cursor.mc_pg[0] = src_mx->mx_cursor.mc_pg[0]; - if (mx->mx_cursor.mc_flags & C_INITIALIZED) { - DEBUG("Sub-db -%u root page %" PRIaPGNO, mx->mx_cursor.mc_dbi, - mx->mx_db.md_root); - } - return MDBX_SUCCESS; -} + if (!env->geo_in_bytes.now) { + /* set defaults if not configured */ + err = mdbx_env_set_geometry(env, 0, -1, -1, -1, -1, -1); + if (unlikely(err != MDBX_SUCCESS)) + return err; + } -static __inline int couple_init(MDBX_cursor_couple *couple, const size_t dbi, - const MDBX_txn *const txn, MDBX_db *const db, - MDBX_dbx *const dbx, uint8_t *const dbi_state) { - tASSERT(txn, F_ISSET(*dbi_state, DBI_VALID | DBI_LINDO)); - couple->outer.mc_signature = MDBX_MC_LIVE; - couple->outer.mc_next = NULL; - couple->outer.mc_backup = NULL; - couple->outer.mc_dbi = (MDBX_dbi)dbi; - 
couple->outer.mc_txn = (MDBX_txn *)txn; - couple->outer.mc_db = db; - couple->outer.mc_dbx = dbx; - couple->outer.mc_dbi_state = dbi_state; - couple->outer.mc_snum = 0; - couple->outer.mc_top = 0; - couple->outer.mc_pg[0] = 0; - couple->outer.mc_flags = 0; - STATIC_ASSERT(CC_BRANCH == P_BRANCH && CC_LEAF == P_LEAF && - CC_OVERFLOW == P_OVERFLOW && CC_LEAF2 == P_LEAF2); - couple->outer.mc_checking = - (AUDIT_ENABLED() || (txn->mt_env->me_flags & MDBX_VALIDATION)) - ? CC_PAGECHECK | CC_LEAF - : CC_LEAF; - couple->outer.mc_ki[0] = 0; - couple->outer.mc_xcursor = NULL; + err = env_page_auxbuffer(env); + if (unlikely(err != MDBX_SUCCESS)) + return err; - int rc = MDBX_SUCCESS; - if (unlikely(*couple->outer.mc_dbi_state & DBI_STALE)) { - rc = page_search(&couple->outer, NULL, MDBX_PS_ROOTONLY); - rc = (rc != MDBX_NOTFOUND) ? rc : MDBX_SUCCESS; - } else if (unlikely(dbx->md_klen_max == 0)) { - rc = setup_sdb(dbx, db, txn->mt_env->me_psize); - } + header = *meta_init_triplet(env, env->page_auxbuf); + err = osal_pwrite(env->lazy_fd, env->page_auxbuf, + env->ps * (size_t)NUM_METAS, 0); + if (unlikely(err != MDBX_SUCCESS)) + return err; - if (couple->outer.mc_db->md_flags & MDBX_DUPSORT) { - couple->inner.mx_cursor.mc_signature = MDBX_MC_LIVE; - couple->outer.mc_xcursor = &couple->inner; - rc = cursor_xinit0(&couple->outer); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; - couple->inner.mx_dbx.md_klen_min = couple->outer.mc_dbx->md_vlen_min; - couple->inner.mx_dbx.md_klen_max = couple->outer.mc_dbx->md_vlen_max; - } - return rc; -} + err = osal_ftruncate(env->lazy_fd, env->dxb_mmap.filesize = + env->dxb_mmap.current = + env->geo_in_bytes.now); + if (unlikely(err != MDBX_SUCCESS)) + return err; -/* Initialize a cursor for a given transaction and database. 
*/ -static int cursor_init(MDBX_cursor *mc, const MDBX_txn *txn, size_t dbi) { - STATIC_ASSERT(offsetof(MDBX_cursor_couple, outer) == 0); - int rc = dbi_check(txn, dbi); - if (likely(rc == MDBX_SUCCESS)) - rc = couple_init(container_of(mc, MDBX_cursor_couple, outer), dbi, txn, - &txn->mt_dbs[dbi], &txn->mt_env->me_dbxs[dbi], - &txn->mt_dbi_state[dbi]); - return rc; -} +#ifndef NDEBUG /* just for checking */ + err = dxb_read_header(env, &header, lck_rc, mode_bits); + if (unlikely(err != MDBX_SUCCESS)) + return err; +#endif + } -MDBX_cursor *mdbx_cursor_create(void *context) { - MDBX_cursor_couple *couple = osal_calloc(1, sizeof(MDBX_cursor_couple)); - if (unlikely(!couple)) - return nullptr; + VERBOSE("header: root %" PRIaPGNO "/%" PRIaPGNO ", geo %" PRIaPGNO + "/%" PRIaPGNO "-%" PRIaPGNO "/%" PRIaPGNO " +%u -%u, txn_id %" PRIaTXN + ", %s", + header.trees.main.root, header.trees.gc.root, header.geometry.lower, + header.geometry.first_unallocated, header.geometry.now, + header.geometry.upper, pv2pages(header.geometry.grow_pv), + pv2pages(header.geometry.shrink_pv), + unaligned_peek_u64(4, header.txnid_a), durable_caption(&header)); - couple->outer.mc_signature = MDBX_MC_READY4CLOSE; - couple->outer.mc_dbi = UINT_MAX; - couple->mc_userctx = context; - return &couple->outer; -} + if (unlikely(header.trees.gc.flags != MDBX_INTEGERKEY)) { + ERROR("unexpected/invalid db-flags 0x%u for GC/FreeDB", + header.trees.gc.flags); + return MDBX_INCOMPATIBLE; + } + env->dbs_flags[FREE_DBI] = DB_VALID | MDBX_INTEGERKEY; + env->kvs[FREE_DBI].clc.k.cmp = cmp_int_align4; /* aligned MDBX_INTEGERKEY */ + env->kvs[FREE_DBI].clc.k.lmax = env->kvs[FREE_DBI].clc.k.lmin = 8; + env->kvs[FREE_DBI].clc.v.cmp = cmp_lenfast; + env->kvs[FREE_DBI].clc.v.lmin = 4; + env->kvs[FREE_DBI].clc.v.lmax = + mdbx_env_get_maxvalsize_ex(env, MDBX_INTEGERKEY); -int mdbx_cursor_set_userctx(MDBX_cursor *mc, void *ctx) { - if (unlikely(!mc)) - return MDBX_EINVAL; + if (env->ps != header.pagesize) + 
env_setup_pagesize(env, header.pagesize); + const size_t used_bytes = pgno2bytes(env, header.geometry.first_unallocated); + const size_t used_aligned2os_bytes = + ceil_powerof2(used_bytes, globals.sys_pagesize); + if ((env->flags & MDBX_RDONLY) /* readonly */ + || lck_rc != MDBX_RESULT_TRUE /* not exclusive */ + || /* recovery mode */ env->stuck_meta >= 0) { + /* use present params from db */ + const size_t pagesize = header.pagesize; + err = mdbx_env_set_geometry( + env, header.geometry.lower * pagesize, header.geometry.now * pagesize, + header.geometry.upper * pagesize, + pv2pages(header.geometry.grow_pv) * pagesize, + pv2pages(header.geometry.shrink_pv) * pagesize, header.pagesize); + if (unlikely(err != MDBX_SUCCESS)) { + ERROR("%s: err %d", "could not apply geometry from db", err); + return (err == MDBX_EINVAL) ? MDBX_INCOMPATIBLE : err; + } + } else if (env->geo_in_bytes.now) { + /* silently growth to last used page */ + if (env->geo_in_bytes.now < used_aligned2os_bytes) + env->geo_in_bytes.now = used_aligned2os_bytes; + if (env->geo_in_bytes.upper < used_aligned2os_bytes) + env->geo_in_bytes.upper = used_aligned2os_bytes; - if (unlikely(mc->mc_signature != MDBX_MC_READY4CLOSE && - mc->mc_signature != MDBX_MC_LIVE)) - return MDBX_EBADSIGN; + /* apply preconfigured params, but only if substantial changes: + * - upper or lower limit changes + * - shrink threshold or growth step + * But ignore change just a 'now/current' size. 
*/ + if (bytes_align2os_bytes(env, env->geo_in_bytes.upper) != + pgno2bytes(env, header.geometry.upper) || + bytes_align2os_bytes(env, env->geo_in_bytes.lower) != + pgno2bytes(env, header.geometry.lower) || + bytes_align2os_bytes(env, env->geo_in_bytes.shrink) != + pgno2bytes(env, pv2pages(header.geometry.shrink_pv)) || + bytes_align2os_bytes(env, env->geo_in_bytes.grow) != + pgno2bytes(env, pv2pages(header.geometry.grow_pv))) { + + if (env->geo_in_bytes.shrink && env->geo_in_bytes.now > used_bytes) + /* pre-shrink if enabled */ + env->geo_in_bytes.now = used_bytes + env->geo_in_bytes.shrink - + used_bytes % env->geo_in_bytes.shrink; - MDBX_cursor_couple *couple = container_of(mc, MDBX_cursor_couple, outer); - couple->mc_userctx = ctx; - return MDBX_SUCCESS; -} + err = mdbx_env_set_geometry( + env, env->geo_in_bytes.lower, env->geo_in_bytes.now, + env->geo_in_bytes.upper, env->geo_in_bytes.grow, + env->geo_in_bytes.shrink, header.pagesize); + if (unlikely(err != MDBX_SUCCESS)) { + ERROR("%s: err %d", "could not apply preconfigured db-geometry", err); + return (err == MDBX_EINVAL) ? 
MDBX_INCOMPATIBLE : err; + } -void *mdbx_cursor_get_userctx(const MDBX_cursor *mc) { - if (unlikely(!mc)) - return nullptr; + /* update meta fields */ + header.geometry.now = bytes2pgno(env, env->geo_in_bytes.now); + header.geometry.lower = bytes2pgno(env, env->geo_in_bytes.lower); + header.geometry.upper = bytes2pgno(env, env->geo_in_bytes.upper); + header.geometry.grow_pv = + pages2pv(bytes2pgno(env, env->geo_in_bytes.grow)); + header.geometry.shrink_pv = + pages2pv(bytes2pgno(env, env->geo_in_bytes.shrink)); - if (unlikely(mc->mc_signature != MDBX_MC_READY4CLOSE && - mc->mc_signature != MDBX_MC_LIVE)) - return nullptr; + VERBOSE("amended: root %" PRIaPGNO "/%" PRIaPGNO ", geo %" PRIaPGNO + "/%" PRIaPGNO "-%" PRIaPGNO "/%" PRIaPGNO + " +%u -%u, txn_id %" PRIaTXN ", %s", + header.trees.main.root, header.trees.gc.root, + header.geometry.lower, header.geometry.first_unallocated, + header.geometry.now, header.geometry.upper, + pv2pages(header.geometry.grow_pv), + pv2pages(header.geometry.shrink_pv), + unaligned_peek_u64(4, header.txnid_a), durable_caption(&header)); + } else { + /* fetch back 'now/current' size, since it was ignored during comparison + * and may differ. */ + env->geo_in_bytes.now = pgno_align2os_bytes(env, header.geometry.now); + } + ENSURE(env, header.geometry.now >= header.geometry.first_unallocated); + } else { + /* geo-params are not pre-configured by user, + * get current values from the meta. 
*/ + env->geo_in_bytes.now = pgno2bytes(env, header.geometry.now); + env->geo_in_bytes.lower = pgno2bytes(env, header.geometry.lower); + env->geo_in_bytes.upper = pgno2bytes(env, header.geometry.upper); + env->geo_in_bytes.grow = pgno2bytes(env, pv2pages(header.geometry.grow_pv)); + env->geo_in_bytes.shrink = + pgno2bytes(env, pv2pages(header.geometry.shrink_pv)); + } + + ENSURE(env, pgno_align2os_bytes(env, header.geometry.now) == + env->geo_in_bytes.now); + ENSURE(env, env->geo_in_bytes.now >= used_bytes); + const uint64_t filesize_before = env->dxb_mmap.filesize; + if (unlikely(filesize_before != env->geo_in_bytes.now)) { + if (lck_rc != /* lck exclusive */ MDBX_RESULT_TRUE) { + VERBOSE("filesize mismatch (expect %" PRIuPTR "b/%" PRIaPGNO + "p, have %" PRIu64 "b/%" PRIaPGNO "p), " + "assume other process working", + env->geo_in_bytes.now, bytes2pgno(env, env->geo_in_bytes.now), + filesize_before, bytes2pgno(env, (size_t)filesize_before)); + } else { + WARNING("filesize mismatch (expect %" PRIuSIZE "b/%" PRIaPGNO + "p, have %" PRIu64 "b/%" PRIaPGNO "p)", + env->geo_in_bytes.now, bytes2pgno(env, env->geo_in_bytes.now), + filesize_before, bytes2pgno(env, (size_t)filesize_before)); + if (filesize_before < used_bytes) { + ERROR("last-page beyond end-of-file (last %" PRIaPGNO + ", have %" PRIaPGNO ")", + header.geometry.first_unallocated, + bytes2pgno(env, (size_t)filesize_before)); + return MDBX_CORRUPTED; + } - MDBX_cursor_couple *couple = container_of(mc, MDBX_cursor_couple, outer); - return couple->mc_userctx; -} + if (env->flags & MDBX_RDONLY) { + if (filesize_before & (globals.sys_pagesize - 1)) { + ERROR("%s", "filesize should be rounded-up to system page"); + return MDBX_WANNA_RECOVERY; + } + WARNING("%s", "ignore filesize mismatch in readonly-mode"); + } else { + VERBOSE("will resize datafile to %" PRIuSIZE " bytes, %" PRIaPGNO + " pages", + env->geo_in_bytes.now, bytes2pgno(env, env->geo_in_bytes.now)); + } + } + } -int mdbx_cursor_unbind(MDBX_cursor *mc) { 
- if (unlikely(!mc)) - return MDBX_EINVAL; + VERBOSE("current boot-id %" PRIx64 "-%" PRIx64 " (%savailable)", + globals.bootid.x, globals.bootid.y, + (globals.bootid.x | globals.bootid.y) ? "" : "not-"); - if (unlikely(mc->mc_signature != MDBX_MC_LIVE)) - return (mc->mc_signature == MDBX_MC_READY4CLOSE) ? MDBX_SUCCESS - : MDBX_EBADSIGN; +#if MDBX_ENABLE_MADVISE + /* calculate readahead hint before mmap with zero redundant pages */ + const bool readahead = + !(env->flags & MDBX_NORDAHEAD) && + mdbx_is_readahead_reasonable(used_bytes, 0) == MDBX_RESULT_TRUE; +#endif /* MDBX_ENABLE_MADVISE */ - if (unlikely(mc->mc_backup)) /* Cursor from parent transaction */ - return MDBX_EINVAL; + err = osal_mmap(env->flags, &env->dxb_mmap, env->geo_in_bytes.now, + env->geo_in_bytes.upper, + (lck_rc && env->stuck_meta < 0) ? MMAP_OPTION_TRUNCATE : 0); + if (unlikely(err != MDBX_SUCCESS)) + return err; - eASSERT(nullptr, mc->mc_txn && mc->mc_txn->mt_signature == MDBX_MT_SIGNATURE); - cASSERT(mc, mc->mc_signature == MDBX_MC_LIVE); - cASSERT(mc, !mc->mc_backup); - if (unlikely(!mc->mc_txn || mc->mc_txn->mt_signature != MDBX_MT_SIGNATURE)) { - ERROR("Wrong cursor's transaction %p 0x%x", - __Wpedantic_format_voidptr(mc->mc_txn), - mc->mc_txn ? mc->mc_txn->mt_signature : 0); - return MDBX_PROBLEM; - } - if (mc->mc_flags & C_UNTRACK) { - MDBX_cursor **prev = &mc->mc_txn->mt_cursors[mc->mc_dbi]; - while (*prev && *prev != mc) - prev = &(*prev)->mc_next; - cASSERT(mc, *prev == mc); - *prev = mc->mc_next; +#if MDBX_ENABLE_MADVISE +#if defined(MADV_DONTDUMP) + err = madvise(env->dxb_mmap.base, env->dxb_mmap.limit, MADV_DONTDUMP) + ? ignore_enosys(errno) + : MDBX_SUCCESS; + if (unlikely(MDBX_IS_ERROR(err))) + return err; +#endif /* MADV_DONTDUMP */ +#if defined(MADV_DODUMP) + if (globals.runtime_flags & MDBX_DBG_DUMP) { + const size_t meta_length_aligned2os = pgno_align2os_bytes(env, NUM_METAS); + err = madvise(env->dxb_mmap.base, meta_length_aligned2os, MADV_DODUMP) + ? 
ignore_enosys(errno) + : MDBX_SUCCESS; + if (unlikely(MDBX_IS_ERROR(err))) + return err; } - mc->mc_signature = MDBX_MC_READY4CLOSE; - mc->mc_flags = 0; - return MDBX_SUCCESS; -} - -int mdbx_cursor_bind(const MDBX_txn *txn, MDBX_cursor *mc, MDBX_dbi dbi) { - if (unlikely(!mc)) - return MDBX_EINVAL; +#endif /* MADV_DODUMP */ +#endif /* MDBX_ENABLE_MADVISE */ - if (unlikely(mc->mc_signature != MDBX_MC_READY4CLOSE && - mc->mc_signature != MDBX_MC_LIVE)) - return MDBX_EBADSIGN; +#ifdef ENABLE_MEMCHECK + env->valgrind_handle = + VALGRIND_CREATE_BLOCK(env->dxb_mmap.base, env->dxb_mmap.limit, "mdbx"); +#endif /* ENABLE_MEMCHECK */ - int rc = check_txn(txn, MDBX_TXN_BLOCKED); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; + eASSERT(env, used_bytes >= pgno2bytes(env, NUM_METAS) && + used_bytes <= env->dxb_mmap.limit); +#if defined(ENABLE_MEMCHECK) || defined(__SANITIZE_ADDRESS__) + if (env->dxb_mmap.filesize > used_bytes && + env->dxb_mmap.filesize < env->dxb_mmap.limit) { + VALGRIND_MAKE_MEM_NOACCESS(ptr_disp(env->dxb_mmap.base, used_bytes), + env->dxb_mmap.filesize - used_bytes); + MDBX_ASAN_POISON_MEMORY_REGION(ptr_disp(env->dxb_mmap.base, used_bytes), + env->dxb_mmap.filesize - used_bytes); + } + env->poison_edge = + bytes2pgno(env, (env->dxb_mmap.filesize < env->dxb_mmap.limit) + ? 
env->dxb_mmap.filesize + : env->dxb_mmap.limit); +#endif /* ENABLE_MEMCHECK || __SANITIZE_ADDRESS__ */ - rc = dbi_check(txn, dbi); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; + troika_t troika = meta_tap(env); +#if MDBX_DEBUG + meta_troika_dump(env, &troika); +#endif + //-------------------------------- validate/rollback head & steady meta-pages + if (unlikely(env->stuck_meta >= 0)) { + /* recovery mode */ + meta_t clone; + meta_t const *const target = METAPAGE(env, env->stuck_meta); + err = meta_validate_copy(env, target, &clone); + if (unlikely(err != MDBX_SUCCESS)) { + ERROR("target meta[%u] is corrupted", + bytes2pgno(env, ptr_dist(data_page(target), env->dxb_mmap.base))); + meta_troika_dump(env, &troika); + return MDBX_CORRUPTED; + } + } else /* not recovery mode */ + while (1) { + const unsigned meta_clash_mask = meta_eq_mask(&troika); + if (unlikely(meta_clash_mask)) { + ERROR("meta-pages are clashed: mask 0x%d", meta_clash_mask); + meta_troika_dump(env, &troika); + return MDBX_CORRUPTED; + } - if (unlikely(dbi == FREE_DBI && !(txn->mt_flags & MDBX_TXN_RDONLY))) - return MDBX_EACCESS; + if (lck_rc != /* lck exclusive */ MDBX_RESULT_TRUE) { + /* non-exclusive mode, + * meta-pages should be validated by a first process opened the DB */ + if (troika.recent == troika.prefer_steady) + break; - if (unlikely(mc->mc_backup)) /* Cursor from parent transaction */ { - cASSERT(mc, mc->mc_signature == MDBX_MC_LIVE); - if (unlikely(mc->mc_dbi != dbi || - /* paranoia */ mc->mc_signature != MDBX_MC_LIVE || - mc->mc_txn != txn)) - return MDBX_EINVAL; + if (!env->lck_mmap.lck) { + /* LY: without-lck (read-only) mode, so it is impossible that other + * process made weak checkpoint. 
*/ + ERROR("%s", "without-lck, unable recovery/rollback"); + meta_troika_dump(env, &troika); + return MDBX_WANNA_RECOVERY; + } - cASSERT(mc, mc->mc_db == &txn->mt_dbs[dbi]); - cASSERT(mc, mc->mc_dbx == &txn->mt_env->me_dbxs[dbi]); - cASSERT(mc, mc->mc_dbi == dbi); - cASSERT(mc, mc->mc_dbi_state == &txn->mt_dbi_state[dbi]); - return likely(mc->mc_dbi == dbi && - /* paranoia */ mc->mc_signature == MDBX_MC_LIVE && - mc->mc_txn == txn) - ? MDBX_SUCCESS - : MDBX_EINVAL /* Disallow change DBI in nested transactions */; - } + /* LY: assume just have a collision with other running process, + * or someone make a weak checkpoint */ + VERBOSE("%s", "assume collision or online weak checkpoint"); + break; + } + eASSERT(env, lck_rc == MDBX_RESULT_TRUE); + /* exclusive mode */ - if (mc->mc_signature == MDBX_MC_LIVE) { - rc = mdbx_cursor_unbind(mc); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; - } - cASSERT(mc, !(mc->mc_flags & C_UNTRACK)); + const meta_ptr_t recent = meta_recent(env, &troika); + const meta_ptr_t prefer_steady = meta_prefer_steady(env, &troika); + meta_t clone; + if (prefer_steady.is_steady) { + err = meta_validate_copy(env, prefer_steady.ptr_c, &clone); + if (unlikely(err != MDBX_SUCCESS)) { + ERROR("meta[%u] with %s txnid %" PRIaTXN " is corrupted, %s needed", + bytes2pgno(env, + ptr_dist(prefer_steady.ptr_c, env->dxb_mmap.base)), + "steady", prefer_steady.txnid, "manual recovery"); + meta_troika_dump(env, &troika); + return MDBX_CORRUPTED; + } + if (prefer_steady.ptr_c == recent.ptr_c) + break; + } - rc = cursor_init(mc, txn, dbi); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; + const pgno_t pgno = + bytes2pgno(env, ptr_dist(recent.ptr_c, env->dxb_mmap.base)); + const bool last_valid = + meta_validate_copy(env, recent.ptr_c, &clone) == MDBX_SUCCESS; + eASSERT(env, + !prefer_steady.is_steady || recent.txnid != prefer_steady.txnid); + if (unlikely(!last_valid)) { + if (unlikely(!prefer_steady.is_steady)) { + ERROR("%s for open or automatic rollback, %s", + 
"there are no suitable meta-pages", + "manual recovery is required"); + meta_troika_dump(env, &troika); + return MDBX_CORRUPTED; + } + WARNING("meta[%u] with last txnid %" PRIaTXN + " is corrupted, rollback needed", + pgno, recent.txnid); + meta_troika_dump(env, &troika); + goto purge_meta_head; + } - mc->mc_next = txn->mt_cursors[dbi]; - txn->mt_cursors[dbi] = mc; - mc->mc_flags |= C_UNTRACK; + if (meta_bootid_match(recent.ptr_c)) { + if (env->flags & MDBX_RDONLY) { + ERROR("%s, but boot-id(%016" PRIx64 "-%016" PRIx64 ") is MATCH: " + "rollback NOT needed, steady-sync NEEDED%s", + "opening after an unclean shutdown", globals.bootid.x, + globals.bootid.y, ", but unable in read-only mode"); + meta_troika_dump(env, &troika); + return MDBX_WANNA_RECOVERY; + } + WARNING("%s, but boot-id(%016" PRIx64 "-%016" PRIx64 ") is MATCH: " + "rollback NOT needed, steady-sync NEEDED%s", + "opening after an unclean shutdown", globals.bootid.x, + globals.bootid.y, ""); + header = clone; + env->lck->unsynced_pages.weak = header.geometry.first_unallocated; + if (!env->lck->eoos_timestamp.weak) + env->lck->eoos_timestamp.weak = osal_monotime(); + break; + } + if (unlikely(!prefer_steady.is_steady)) { + ERROR("%s, but %s for automatic rollback: %s", + "opening after an unclean shutdown", + "there are no suitable meta-pages", + "manual recovery is required"); + meta_troika_dump(env, &troika); + return MDBX_CORRUPTED; + } + if (env->flags & MDBX_RDONLY) { + ERROR("%s and rollback needed: (from head %" PRIaTXN + " to steady %" PRIaTXN ")%s", + "opening after an unclean shutdown", recent.txnid, + prefer_steady.txnid, ", but unable in read-only mode"); + meta_troika_dump(env, &troika); + return MDBX_WANNA_RECOVERY; + } - return MDBX_SUCCESS; -} + purge_meta_head: + NOTICE("%s and doing automatic rollback: " + "purge%s meta[%u] with%s txnid %" PRIaTXN, + "opening after an unclean shutdown", last_valid ? "" : " invalid", + pgno, last_valid ? 
" weak" : "", recent.txnid); + meta_troika_dump(env, &troika); + ENSURE(env, prefer_steady.is_steady); + err = meta_override(env, pgno, 0, + last_valid ? recent.ptr_c : prefer_steady.ptr_c); + if (err) { + ERROR("rollback: overwrite meta[%u] with txnid %" PRIaTXN ", error %d", + pgno, recent.txnid, err); + return err; + } + troika = meta_tap(env); + ENSURE(env, 0 == meta_txnid(recent.ptr_v)); + ENSURE(env, 0 == meta_eq_mask(&troika)); + } -int mdbx_cursor_open(const MDBX_txn *txn, MDBX_dbi dbi, MDBX_cursor **ret) { - if (unlikely(!ret)) - return MDBX_EINVAL; - *ret = NULL; + if (lck_rc == /* lck exclusive */ MDBX_RESULT_TRUE) { + //-------------------------------------------------- shrink DB & update geo + /* re-check size after mmap */ + if ((env->dxb_mmap.current & (globals.sys_pagesize - 1)) != 0 || + env->dxb_mmap.current < used_bytes) { + ERROR("unacceptable/unexpected datafile size %" PRIuPTR, + env->dxb_mmap.current); + return MDBX_PROBLEM; + } + if (env->dxb_mmap.current != env->geo_in_bytes.now) { + header.geometry.now = bytes2pgno(env, env->dxb_mmap.current); + NOTICE("need update meta-geo to filesize %" PRIuPTR " bytes, %" PRIaPGNO + " pages", + env->dxb_mmap.current, header.geometry.now); + } - MDBX_cursor *const mc = mdbx_cursor_create(nullptr); - if (unlikely(!mc)) - return MDBX_ENOMEM; + const meta_ptr_t recent = meta_recent(env, &troika); + if (/* не учитываем различия в geo.first_unallocated */ + header.geometry.grow_pv != recent.ptr_c->geometry.grow_pv || + header.geometry.shrink_pv != recent.ptr_c->geometry.shrink_pv || + header.geometry.lower != recent.ptr_c->geometry.lower || + header.geometry.upper != recent.ptr_c->geometry.upper || + header.geometry.now != recent.ptr_c->geometry.now) { + if ((env->flags & MDBX_RDONLY) != 0 || + /* recovery mode */ env->stuck_meta >= 0) { + WARNING("skipped update meta.geo in %s mode: from l%" PRIaPGNO + "-n%" PRIaPGNO "-u%" PRIaPGNO "/s%u-g%u, to l%" PRIaPGNO + "-n%" PRIaPGNO "-u%" PRIaPGNO "/s%u-g%u", + 
(env->stuck_meta < 0) ? "read-only" : "recovery", + recent.ptr_c->geometry.lower, recent.ptr_c->geometry.now, + recent.ptr_c->geometry.upper, + pv2pages(recent.ptr_c->geometry.shrink_pv), + pv2pages(recent.ptr_c->geometry.grow_pv), header.geometry.lower, + header.geometry.now, header.geometry.upper, + pv2pages(header.geometry.shrink_pv), + pv2pages(header.geometry.grow_pv)); + } else { + const txnid_t next_txnid = safe64_txnid_next(recent.txnid); + if (unlikely(next_txnid > MAX_TXNID)) { + ERROR("txnid overflow, raise %d", MDBX_TXN_FULL); + return MDBX_TXN_FULL; + } + NOTICE("updating meta.geo: " + "from l%" PRIaPGNO "-n%" PRIaPGNO "-u%" PRIaPGNO + "/s%u-g%u (txn#%" PRIaTXN "), " + "to l%" PRIaPGNO "-n%" PRIaPGNO "-u%" PRIaPGNO + "/s%u-g%u (txn#%" PRIaTXN ")", + recent.ptr_c->geometry.lower, recent.ptr_c->geometry.now, + recent.ptr_c->geometry.upper, + pv2pages(recent.ptr_c->geometry.shrink_pv), + pv2pages(recent.ptr_c->geometry.grow_pv), recent.txnid, + header.geometry.lower, header.geometry.now, + header.geometry.upper, pv2pages(header.geometry.shrink_pv), + pv2pages(header.geometry.grow_pv), next_txnid); - int rc = mdbx_cursor_bind(txn, mc, dbi); - if (unlikely(rc != MDBX_SUCCESS)) { - mdbx_cursor_close(mc); - return rc; + ENSURE(env, header.unsafe_txnid == recent.txnid); + meta_set_txnid(env, &header, next_txnid); + err = dxb_sync_locked(env, env->flags | txn_shrink_allowed, &header, + &troika); + if (err) { + ERROR("error %d, while updating meta.geo: " + "from l%" PRIaPGNO "-n%" PRIaPGNO "-u%" PRIaPGNO + "/s%u-g%u (txn#%" PRIaTXN "), " + "to l%" PRIaPGNO "-n%" PRIaPGNO "-u%" PRIaPGNO + "/s%u-g%u (txn#%" PRIaTXN ")", + err, recent.ptr_c->geometry.lower, recent.ptr_c->geometry.now, + recent.ptr_c->geometry.upper, + pv2pages(recent.ptr_c->geometry.shrink_pv), + pv2pages(recent.ptr_c->geometry.grow_pv), recent.txnid, + header.geometry.lower, header.geometry.now, + header.geometry.upper, pv2pages(header.geometry.shrink_pv), + pv2pages(header.geometry.grow_pv), 
header.unsafe_txnid); + return err; + } + } + } + + atomic_store32(&env->lck->discarded_tail, + bytes2pgno(env, used_aligned2os_bytes), mo_Relaxed); + + if ((env->flags & MDBX_RDONLY) == 0 && env->stuck_meta < 0 && + (globals.runtime_flags & MDBX_DBG_DONT_UPGRADE) == 0) { + for (int n = 0; n < NUM_METAS; ++n) { + meta_t *const meta = METAPAGE(env, n); + if (unlikely(unaligned_peek_u64(4, &meta->magic_and_version) != + MDBX_DATA_MAGIC)) { + const txnid_t txnid = constmeta_txnid(meta); + NOTICE("%s %s" + "meta[%u], txnid %" PRIaTXN, + "updating db-format signature for", + meta_is_steady(meta) ? "stead-" : "weak-", n, txnid); + err = meta_override(env, n, txnid, meta); + if (unlikely(err != MDBX_SUCCESS) && + /* Just ignore the MDBX_PROBLEM error, since here it is + * returned only in case of the attempt to upgrade an obsolete + * meta-page that is invalid for current state of a DB, + * e.g. after shrinking DB file */ + err != MDBX_PROBLEM) { + ERROR("%s meta[%u], txnid %" PRIaTXN ", error %d", + "updating db-format signature for", n, txnid, err); + return err; + } + troika = meta_tap(env); + } + } + } + } /* lck exclusive, lck_rc == MDBX_RESULT_TRUE */ + + //---------------------------------------------------- setup madvise/readahead +#if MDBX_ENABLE_MADVISE + if (used_aligned2os_bytes < env->dxb_mmap.current) { +#if defined(MADV_REMOVE) + if (lck_rc && (env->flags & MDBX_WRITEMAP) != 0 && + /* not recovery mode */ env->stuck_meta < 0) { + NOTICE("open-MADV_%s %u..%u", "REMOVE (deallocate file space)", + env->lck->discarded_tail.weak, + bytes2pgno(env, env->dxb_mmap.current)); + err = madvise(ptr_disp(env->dxb_mmap.base, used_aligned2os_bytes), + env->dxb_mmap.current - used_aligned2os_bytes, MADV_REMOVE) + ? 
ignore_enosys(errno) + : MDBX_SUCCESS; + if (unlikely(MDBX_IS_ERROR(err))) + return err; + } +#endif /* MADV_REMOVE */ +#if defined(MADV_DONTNEED) + NOTICE("open-MADV_%s %u..%u", "DONTNEED", env->lck->discarded_tail.weak, + bytes2pgno(env, env->dxb_mmap.current)); + err = madvise(ptr_disp(env->dxb_mmap.base, used_aligned2os_bytes), + env->dxb_mmap.current - used_aligned2os_bytes, MADV_DONTNEED) + ? ignore_enosys(errno) + : MDBX_SUCCESS; + if (unlikely(MDBX_IS_ERROR(err))) + return err; +#elif defined(POSIX_MADV_DONTNEED) + err = ignore_enosys(posix_madvise( + ptr_disp(env->dxb_mmap.base, used_aligned2os_bytes), + env->dxb_mmap.current - used_aligned2os_bytes, POSIX_MADV_DONTNEED)); + if (unlikely(MDBX_IS_ERROR(err))) + return err; +#elif defined(POSIX_FADV_DONTNEED) + err = ignore_enosys(posix_fadvise( + env->lazy_fd, used_aligned2os_bytes, + env->dxb_mmap.current - used_aligned2os_bytes, POSIX_FADV_DONTNEED)); + if (unlikely(MDBX_IS_ERROR(err))) + return err; +#endif /* MADV_DONTNEED */ } - *ret = mc; - return MDBX_SUCCESS; -} + err = dxb_set_readahead(env, bytes2pgno(env, used_bytes), readahead, true); + if (unlikely(err != MDBX_SUCCESS)) + return err; +#endif /* MDBX_ENABLE_MADVISE */ -int mdbx_cursor_renew(const MDBX_txn *txn, MDBX_cursor *mc) { - return likely(mc) ? mdbx_cursor_bind(txn, mc, mc->mc_dbi) : MDBX_EINVAL; + return rc; } -int mdbx_cursor_compare(const MDBX_cursor *l, const MDBX_cursor *r, - bool ignore_multival) { - const int incomparable = INT16_MAX + 1; - if (unlikely(!l)) - return r ? 
-incomparable * 9 : 0; - else if (unlikely(!r)) - return incomparable * 9; +int dxb_sync_locked(MDBX_env *env, unsigned flags, meta_t *const pending, + troika_t *const troika) { + eASSERT(env, ((env->flags ^ flags) & MDBX_WRITEMAP) == 0); + eASSERT(env, pending->trees.gc.flags == MDBX_INTEGERKEY); + eASSERT(env, check_sdb_flags(pending->trees.main.flags)); + const meta_t *const meta0 = METAPAGE(env, 0); + const meta_t *const meta1 = METAPAGE(env, 1); + const meta_t *const meta2 = METAPAGE(env, 2); + const meta_ptr_t head = meta_recent(env, troika); + int rc; - if (unlikely(l->mc_signature != MDBX_MC_LIVE)) - return (r->mc_signature == MDBX_MC_LIVE) ? -incomparable * 8 : 0; - if (unlikely(r->mc_signature != MDBX_MC_LIVE)) - return (l->mc_signature == MDBX_MC_LIVE) ? incomparable * 8 : 0; + eASSERT(env, + pending < METAPAGE(env, 0) || pending > METAPAGE(env, NUM_METAS)); + eASSERT(env, (env->flags & (MDBX_RDONLY | ENV_FATAL_ERROR)) == 0); + eASSERT(env, pending->geometry.first_unallocated <= pending->geometry.now); - if (unlikely(l->mc_dbx != r->mc_dbx)) { - if (l->mc_txn->mt_env != r->mc_txn->mt_env) - return (l->mc_txn->mt_env > r->mc_txn->mt_env) ? incomparable * 7 - : -incomparable * 7; - if (l->mc_txn->mt_txnid != r->mc_txn->mt_txnid) - return (l->mc_txn->mt_txnid > r->mc_txn->mt_txnid) ? incomparable * 6 - : -incomparable * 6; - return (l->mc_dbx > r->mc_dbx) ? 
incomparable * 5 : -incomparable * 5; + if (flags & MDBX_SAFE_NOSYNC) { + /* Check auto-sync conditions */ + const pgno_t autosync_threshold = + atomic_load32(&env->lck->autosync_threshold, mo_Relaxed); + const uint64_t autosync_period = + atomic_load64(&env->lck->autosync_period, mo_Relaxed); + uint64_t eoos_timestamp; + if ((autosync_threshold && + atomic_load64(&env->lck->unsynced_pages, mo_Relaxed) >= + autosync_threshold) || + (autosync_period && + (eoos_timestamp = + atomic_load64(&env->lck->eoos_timestamp, mo_Relaxed)) && + osal_monotime() - eoos_timestamp >= autosync_period)) + flags &= MDBX_WRITEMAP | txn_shrink_allowed; /* force steady */ } - assert(l->mc_dbi == r->mc_dbi); - int diff = (l->mc_flags & C_INITIALIZED) - (l->mc_flags & C_INITIALIZED); - if (unlikely(diff)) - return (diff > 0) ? incomparable * 4 : -incomparable * 4; - if (unlikely((l->mc_flags & C_INITIALIZED) == 0)) - return 0; + pgno_t shrink = 0; + if (flags & txn_shrink_allowed) { + const size_t prev_discarded_pgno = + atomic_load32(&env->lck->discarded_tail, mo_Relaxed); + if (prev_discarded_pgno < pending->geometry.first_unallocated) + env->lck->discarded_tail.weak = pending->geometry.first_unallocated; + else if (prev_discarded_pgno >= + pending->geometry.first_unallocated + env->madv_threshold) { + /* LY: check conditions to discard unused pages */ + const pgno_t largest_pgno = mvcc_snapshot_largest( + env, (head.ptr_c->geometry.first_unallocated > + pending->geometry.first_unallocated) + ? head.ptr_c->geometry.first_unallocated + : pending->geometry.first_unallocated); + eASSERT(env, largest_pgno >= NUM_METAS); - size_t detent = (l->mc_snum <= r->mc_snum) ? l->mc_snum : r->mc_snum; - for (size_t i = 0; i < detent; ++i) { - diff = l->mc_ki[i] - r->mc_ki[i]; - if (diff) - return diff; - } - if (unlikely(l->mc_snum != r->mc_snum)) - return (l->mc_snum > r->mc_snum) ? 
incomparable * 3 : -incomparable * 3; +#if defined(ENABLE_MEMCHECK) || defined(__SANITIZE_ADDRESS__) + const pgno_t edge = env->poison_edge; + if (edge > largest_pgno) { + env->poison_edge = largest_pgno; + VALGRIND_MAKE_MEM_NOACCESS( + ptr_disp(env->dxb_mmap.base, pgno2bytes(env, largest_pgno)), + pgno2bytes(env, edge - largest_pgno)); + MDBX_ASAN_POISON_MEMORY_REGION( + ptr_disp(env->dxb_mmap.base, pgno2bytes(env, largest_pgno)), + pgno2bytes(env, edge - largest_pgno)); + } +#endif /* ENABLE_MEMCHECK || __SANITIZE_ADDRESS__ */ - assert((l->mc_xcursor != nullptr) == (r->mc_xcursor != nullptr)); - if (unlikely((l->mc_xcursor != nullptr) != (r->mc_xcursor != nullptr))) - return l->mc_xcursor ? incomparable * 2 : -incomparable * 2; - if (ignore_multival || !l->mc_xcursor) - return 0; +#if MDBX_ENABLE_MADVISE && \ + (defined(MADV_DONTNEED) || defined(POSIX_MADV_DONTNEED)) + const size_t discard_edge_pgno = pgno_align2os_pgno(env, largest_pgno); + if (prev_discarded_pgno >= discard_edge_pgno + env->madv_threshold) { + const size_t prev_discarded_bytes = + pgno_align2os_bytes(env, prev_discarded_pgno); + const size_t discard_edge_bytes = pgno2bytes(env, discard_edge_pgno); + /* из-за выравнивания prev_discarded_bytes и discard_edge_bytes + * могут быть равны */ + if (prev_discarded_bytes > discard_edge_bytes) { + NOTICE("shrink-MADV_%s %zu..%zu", "DONTNEED", discard_edge_pgno, + prev_discarded_pgno); + munlock_after(env, discard_edge_pgno, + bytes_align2os_bytes(env, env->dxb_mmap.current)); + const uint32_t munlocks_before = + atomic_load32(&env->lck->mlcnt[1], mo_Relaxed); +#if defined(MADV_DONTNEED) + int advise = MADV_DONTNEED; +#if defined(MADV_FREE) && \ + 0 /* MADV_FREE works for only anonymous vma at the moment */ + if ((env->flags & MDBX_WRITEMAP) && + global.linux_kernel_version > 0x04050000) + advise = MADV_FREE; +#endif /* MADV_FREE */ + int err = madvise(ptr_disp(env->dxb_mmap.base, discard_edge_bytes), + prev_discarded_bytes - discard_edge_bytes, advise) + 
? ignore_enosys(errno) + : MDBX_SUCCESS; +#else + int err = ignore_enosys(posix_madvise( + ptr_disp(env->dxb_mmap.base, discard_edge_bytes), + prev_discarded_bytes - discard_edge_bytes, POSIX_MADV_DONTNEED)); +#endif + if (unlikely(MDBX_IS_ERROR(err))) { + const uint32_t mlocks_after = + atomic_load32(&env->lck->mlcnt[0], mo_Relaxed); + if (err == MDBX_EINVAL) { + const int severity = (mlocks_after - munlocks_before) + ? MDBX_LOG_NOTICE + : MDBX_LOG_WARN; + if (LOG_ENABLED(severity)) + debug_log( + severity, __func__, __LINE__, + "%s-madvise: ignore EINVAL (%d) since some pages maybe " + "locked (%u/%u mlcnt-processes)", + "shrink", err, mlocks_after, munlocks_before); + } else { + ERROR("%s-madvise(%s, %zu, +%zu), %u/%u mlcnt-processes, err %d", + "shrink", "DONTNEED", discard_edge_bytes, + prev_discarded_bytes - discard_edge_bytes, mlocks_after, + munlocks_before, err); + return err; + } + } else + env->lck->discarded_tail.weak = discard_edge_pgno; + } + } +#endif /* MDBX_ENABLE_MADVISE && (MADV_DONTNEED || POSIX_MADV_DONTNEED) */ -#if MDBX_DEBUG - if (l->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED) { - const MDBX_page *mp = l->mc_pg[l->mc_top]; - const MDBX_node *node = page_node(mp, l->mc_ki[l->mc_top]); - assert(node_flags(node) & F_DUPDATA); + /* LY: check conditions to shrink datafile */ + const pgno_t backlog_gap = 3 + pending->trees.gc.height * 3; + pgno_t shrink_step = 0; + if (pending->geometry.shrink_pv && + pending->geometry.now - pending->geometry.first_unallocated > + (shrink_step = pv2pages(pending->geometry.shrink_pv)) + + backlog_gap) { + if (pending->geometry.now > largest_pgno && + pending->geometry.now - largest_pgno > shrink_step + backlog_gap) { + const pgno_t aligner = + pending->geometry.grow_pv + ? 
/* grow_step */ pv2pages(pending->geometry.grow_pv) + : shrink_step; + const pgno_t with_backlog_gap = largest_pgno + backlog_gap; + const pgno_t aligned = + pgno_align2os_pgno(env, (size_t)with_backlog_gap + aligner - + with_backlog_gap % aligner); + const pgno_t bottom = (aligned > pending->geometry.lower) + ? aligned + : pending->geometry.lower; + if (pending->geometry.now > bottom) { + if (TROIKA_HAVE_STEADY(troika)) + /* force steady, but only if steady-checkpoint is present */ + flags &= MDBX_WRITEMAP | txn_shrink_allowed; + shrink = pending->geometry.now - bottom; + pending->geometry.now = bottom; + if (unlikely(head.txnid == pending->unsafe_txnid)) { + const txnid_t txnid = safe64_txnid_next(pending->unsafe_txnid); + NOTICE("force-forward pending-txn %" PRIaTXN " -> %" PRIaTXN, + pending->unsafe_txnid, txnid); + ENSURE(env, !env->basal_txn || !env->txn); + if (unlikely(txnid > MAX_TXNID)) { + rc = MDBX_TXN_FULL; + ERROR("txnid overflow, raise %d", rc); + goto fail; + } + meta_set_txnid(env, pending, txnid); + eASSERT(env, coherency_check_meta(env, pending, true)); + } + } + } + } + } } - if (l->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED) { - const MDBX_page *mp = r->mc_pg[r->mc_top]; - const MDBX_node *node = page_node(mp, r->mc_ki[r->mc_top]); - assert(node_flags(node) & F_DUPDATA); + + /* LY: step#1 - sync previously written/updated data-pages */ + rc = MDBX_RESULT_FALSE /* carry steady */; + if (atomic_load64(&env->lck->unsynced_pages, mo_Relaxed)) { + eASSERT(env, ((flags ^ env->flags) & MDBX_WRITEMAP) == 0); + enum osal_syncmode_bits mode_bits = MDBX_SYNC_NONE; + unsigned sync_op = 0; + if ((flags & MDBX_SAFE_NOSYNC) == 0) { + sync_op = 1; + mode_bits = MDBX_SYNC_DATA; + if (pending->geometry.first_unallocated > + meta_prefer_steady(env, troika).ptr_c->geometry.now) + mode_bits |= MDBX_SYNC_SIZE; + if (flags & MDBX_NOMETASYNC) + mode_bits |= MDBX_SYNC_IODQ; + } else if (unlikely(env->incore)) + goto skip_incore_sync; + if (flags & MDBX_WRITEMAP) { 
+#if MDBX_ENABLE_PGOP_STAT + env->lck->pgops.msync.weak += sync_op; +#else + (void)sync_op; +#endif /* MDBX_ENABLE_PGOP_STAT */ + rc = osal_msync( + &env->dxb_mmap, 0, + pgno_align2os_bytes(env, pending->geometry.first_unallocated), + mode_bits); + } else { +#if MDBX_ENABLE_PGOP_STAT + env->lck->pgops.fsync.weak += sync_op; +#else + (void)sync_op; +#endif /* MDBX_ENABLE_PGOP_STAT */ + rc = osal_fsync(env->lazy_fd, mode_bits); + } + if (unlikely(rc != MDBX_SUCCESS)) + goto fail; + rc = (flags & MDBX_SAFE_NOSYNC) ? MDBX_RESULT_TRUE /* carry non-steady */ + : MDBX_RESULT_FALSE /* carry steady */; } -#endif /* MDBX_DEBUG */ + eASSERT(env, coherency_check_meta(env, pending, true)); - l = &l->mc_xcursor->mx_cursor; - r = &r->mc_xcursor->mx_cursor; - diff = (l->mc_flags & C_INITIALIZED) - (l->mc_flags & C_INITIALIZED); - if (unlikely(diff)) - return (diff > 0) ? incomparable * 2 : -incomparable * 2; - if (unlikely((l->mc_flags & C_INITIALIZED) == 0)) - return 0; + /* Steady or Weak */ + if (rc == MDBX_RESULT_FALSE /* carry steady */) { + meta_sign_as_steady(pending); + atomic_store64(&env->lck->eoos_timestamp, 0, mo_Relaxed); + atomic_store64(&env->lck->unsynced_pages, 0, mo_Relaxed); + } else { + assert(rc == MDBX_RESULT_TRUE /* carry non-steady */); + skip_incore_sync: + eASSERT(env, env->lck->unsynced_pages.weak > 0); + /* Может быть нулевым если unsynced_pages > 0 в результате спиллинга. + * eASSERT(env, env->lck->eoos_timestamp.weak != 0); */ + unaligned_poke_u64(4, pending->sign, DATASIGN_WEAK); + } - detent = (l->mc_snum <= r->mc_snum) ? 
l->mc_snum : r->mc_snum; - for (size_t i = 0; i < detent; ++i) { - diff = l->mc_ki[i] - r->mc_ki[i]; - if (diff) - return diff; + const bool legal4overwrite = + head.txnid == pending->unsafe_txnid && + !memcmp(&head.ptr_c->trees, &pending->trees, sizeof(pending->trees)) && + !memcmp(&head.ptr_c->canary, &pending->canary, sizeof(pending->canary)) && + !memcmp(&head.ptr_c->geometry, &pending->geometry, + sizeof(pending->geometry)); + meta_t *target = nullptr; + if (head.txnid == pending->unsafe_txnid) { + ENSURE(env, legal4overwrite); + if (!head.is_steady && meta_is_steady(pending)) + target = (meta_t *)head.ptr_c; + else { + WARNING("%s", "skip update meta"); + return MDBX_SUCCESS; + } + } else { + const unsigned troika_tail = troika->tail_and_flags & 3; + ENSURE(env, troika_tail < NUM_METAS && troika_tail != troika->recent && + troika_tail != troika->prefer_steady); + target = (meta_t *)meta_tail(env, troika).ptr_c; } - if (unlikely(l->mc_snum != r->mc_snum)) - return (l->mc_snum > r->mc_snum) ? incomparable : -incomparable; - return 0; -} -int mdbx_cursor_copy(const MDBX_cursor *src, MDBX_cursor *dest) { - if (unlikely(!src)) - return MDBX_EINVAL; - if (unlikely(src->mc_signature != MDBX_MC_LIVE)) - return (src->mc_signature == MDBX_MC_READY4CLOSE) ? MDBX_EINVAL - : MDBX_EBADSIGN; + /* LY: step#2 - update meta-page. 
*/ + DEBUG("writing meta%" PRIaPGNO " = root %" PRIaPGNO "/%" PRIaPGNO + ", geo %" PRIaPGNO "/%" PRIaPGNO "-%" PRIaPGNO "/%" PRIaPGNO + " +%u -%u, txn_id %" PRIaTXN ", %s", + data_page(target)->pgno, pending->trees.main.root, + pending->trees.gc.root, pending->geometry.lower, + pending->geometry.first_unallocated, pending->geometry.now, + pending->geometry.upper, pv2pages(pending->geometry.grow_pv), + pv2pages(pending->geometry.shrink_pv), pending->unsafe_txnid, + durable_caption(pending)); - int rc = mdbx_cursor_bind(src->mc_txn, dest, src->mc_dbi); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; + DEBUG("meta0: %s, %s, txn_id %" PRIaTXN ", root %" PRIaPGNO "/%" PRIaPGNO, + (meta0 == head.ptr_c) ? "head" + : (meta0 == target) ? "tail" + : "stay", + durable_caption(meta0), constmeta_txnid(meta0), meta0->trees.main.root, + meta0->trees.gc.root); + DEBUG("meta1: %s, %s, txn_id %" PRIaTXN ", root %" PRIaPGNO "/%" PRIaPGNO, + (meta1 == head.ptr_c) ? "head" + : (meta1 == target) ? "tail" + : "stay", + durable_caption(meta1), constmeta_txnid(meta1), meta1->trees.main.root, + meta1->trees.gc.root); + DEBUG("meta2: %s, %s, txn_id %" PRIaTXN ", root %" PRIaPGNO "/%" PRIaPGNO, + (meta2 == head.ptr_c) ? "head" + : (meta2 == target) ? 
"tail" + : "stay", + durable_caption(meta2), constmeta_txnid(meta2), meta2->trees.main.root, + meta2->trees.gc.root); - assert(dest->mc_db == src->mc_db); - assert(dest->mc_dbi == src->mc_dbi); - assert(dest->mc_dbx == src->mc_dbx); - assert(dest->mc_dbi_state == src->mc_dbi_state); -again: - assert(dest->mc_txn == src->mc_txn); - dest->mc_flags ^= (dest->mc_flags ^ src->mc_flags) & ~C_UNTRACK; - dest->mc_top = src->mc_top; - dest->mc_snum = src->mc_snum; - for (size_t i = 0; i < src->mc_snum; ++i) { - dest->mc_ki[i] = src->mc_ki[i]; - dest->mc_pg[i] = src->mc_pg[i]; - } - - if (src->mc_xcursor) { - dest->mc_xcursor->mx_db = src->mc_xcursor->mx_db; - dest->mc_xcursor->mx_dbx = src->mc_xcursor->mx_dbx; - src = &src->mc_xcursor->mx_cursor; - dest = &dest->mc_xcursor->mx_cursor; - goto again; - } + eASSERT(env, pending->unsafe_txnid != constmeta_txnid(meta0) || + (meta_is_steady(pending) && !meta_is_steady(meta0))); + eASSERT(env, pending->unsafe_txnid != constmeta_txnid(meta1) || + (meta_is_steady(pending) && !meta_is_steady(meta1))); + eASSERT(env, pending->unsafe_txnid != constmeta_txnid(meta2) || + (meta_is_steady(pending) && !meta_is_steady(meta2))); - return MDBX_SUCCESS; -} + eASSERT(env, ((env->flags ^ flags) & MDBX_WRITEMAP) == 0); + ENSURE(env, target == head.ptr_c || + constmeta_txnid(target) < pending->unsafe_txnid); + if (flags & MDBX_WRITEMAP) { + jitter4testing(true); + if (likely(target != head.ptr_c)) { + /* LY: 'invalidate' the meta. */ + meta_update_begin(env, target, pending->unsafe_txnid); + unaligned_poke_u64(4, target->sign, DATASIGN_WEAK); +#ifndef NDEBUG + /* debug: provoke failure to catch a violators, but don't touch pagesize + * to allow readers catch actual pagesize. 
*/ + void *provoke_begin = &target->trees.gc.root; + void *provoke_end = &target->sign; + memset(provoke_begin, 0xCC, ptr_dist(provoke_end, provoke_begin)); + jitter4testing(false); +#endif -void mdbx_cursor_close(MDBX_cursor *mc) { - if (likely(mc)) { - ENSURE(NULL, mc->mc_signature == MDBX_MC_LIVE || - mc->mc_signature == MDBX_MC_READY4CLOSE); - MDBX_txn *const txn = mc->mc_txn; - if (!mc->mc_backup) { - mc->mc_txn = NULL; - /* Unlink from txn, if tracked. */ - if (mc->mc_flags & C_UNTRACK) { - ENSURE(txn->mt_env, check_txn(txn, 0) == MDBX_SUCCESS); - MDBX_cursor **prev = &txn->mt_cursors[mc->mc_dbi]; - while (*prev && *prev != mc) - prev = &(*prev)->mc_next; - tASSERT(txn, *prev == mc); - *prev = mc->mc_next; - } - mc->mc_signature = 0; - mc->mc_next = mc; - osal_free(mc); + /* LY: update info */ + target->geometry = pending->geometry; + target->trees.gc = pending->trees.gc; + target->trees.main = pending->trees.main; + eASSERT(env, target->trees.gc.flags == MDBX_INTEGERKEY); + eASSERT(env, check_sdb_flags(target->trees.main.flags)); + target->canary = pending->canary; + memcpy(target->pages_retired, pending->pages_retired, 8); + jitter4testing(true); + + /* LY: 'commit' the meta */ + meta_update_end(env, target, unaligned_peek_u64(4, pending->txnid_b)); + jitter4testing(true); + eASSERT(env, coherency_check_meta(env, target, true)); } else { - /* Cursor closed before nested txn ends */ - tASSERT(txn, mc->mc_signature == MDBX_MC_LIVE); - ENSURE(txn->mt_env, check_txn_rw(txn, 0) == MDBX_SUCCESS); - mc->mc_signature = MDBX_MC_WAIT4EOT; + /* dangerous case (target == head), only sign could + * me updated, check assertions once again */ + eASSERT(env, + legal4overwrite && !head.is_steady && meta_is_steady(pending)); } - } -} - -int mdbx_txn_release_all_cursors(const MDBX_txn *txn, bool unbind) { - int rc = check_txn(txn, MDBX_TXN_FINISHED | MDBX_TXN_HAS_CHILD); - if (likely(rc == MDBX_SUCCESS)) { - TXN_FOREACH_DBI_FROM(txn, i, MAIN_DBI) { - while 
(txn->mt_cursors[i]) { - MDBX_cursor *mc = txn->mt_cursors[i]; - ENSURE(NULL, mc->mc_signature == MDBX_MC_LIVE && - (mc->mc_flags & C_UNTRACK) && !mc->mc_backup); - rc = likely(rc < INT_MAX) ? rc + 1 : rc; - txn->mt_cursors[i] = mc->mc_next; - if (unbind) { - mc->mc_signature = MDBX_MC_READY4CLOSE; - mc->mc_flags = 0; - } else { - mc->mc_signature = 0; - mc->mc_next = mc; - osal_free(mc); + memcpy(target->sign, pending->sign, 8); + osal_flush_incoherent_cpu_writeback(); + jitter4testing(true); + if (!env->incore) { + if (!MDBX_AVOID_MSYNC) { + /* sync meta-pages */ +#if MDBX_ENABLE_PGOP_STAT + env->lck->pgops.msync.weak += 1; +#endif /* MDBX_ENABLE_PGOP_STAT */ + rc = osal_msync(&env->dxb_mmap, 0, pgno_align2os_bytes(env, NUM_METAS), + (flags & MDBX_NOMETASYNC) + ? MDBX_SYNC_NONE + : MDBX_SYNC_DATA | MDBX_SYNC_IODQ); + } else { +#if MDBX_ENABLE_PGOP_STAT + env->lck->pgops.wops.weak += 1; +#endif /* MDBX_ENABLE_PGOP_STAT */ + const page_t *page = data_page(target); + rc = osal_pwrite(env->fd4meta, page, env->ps, + ptr_dist(page, env->dxb_mmap.base)); + if (likely(rc == MDBX_SUCCESS)) { + osal_flush_incoherent_mmap(target, sizeof(meta_t), + globals.sys_pagesize); + if ((flags & MDBX_NOMETASYNC) == 0 && env->fd4meta == env->lazy_fd) { +#if MDBX_ENABLE_PGOP_STAT + env->lck->pgops.fsync.weak += 1; +#endif /* MDBX_ENABLE_PGOP_STAT */ + rc = osal_fsync(env->lazy_fd, MDBX_SYNC_DATA | MDBX_SYNC_IODQ); + } } } + if (unlikely(rc != MDBX_SUCCESS)) + goto fail; } } else { - eASSERT(nullptr, rc < 0); +#if MDBX_ENABLE_PGOP_STAT + env->lck->pgops.wops.weak += 1; +#endif /* MDBX_ENABLE_PGOP_STAT */ + const meta_t undo_meta = *target; + eASSERT(env, pending->trees.gc.flags == MDBX_INTEGERKEY); + eASSERT(env, check_sdb_flags(pending->trees.main.flags)); + rc = osal_pwrite(env->fd4meta, pending, sizeof(meta_t), + ptr_dist(target, env->dxb_mmap.base)); + if (unlikely(rc != MDBX_SUCCESS)) { + undo: + DEBUG("%s", "write failed, disk error?"); + /* On a failure, the pagecache still 
contains the new data. + * Try write some old data back, to prevent it from being used. */ + osal_pwrite(env->fd4meta, &undo_meta, sizeof(meta_t), + ptr_dist(target, env->dxb_mmap.base)); + goto fail; + } + osal_flush_incoherent_mmap(target, sizeof(meta_t), globals.sys_pagesize); + /* sync meta-pages */ + if ((flags & MDBX_NOMETASYNC) == 0 && env->fd4meta == env->lazy_fd && + !env->incore) { +#if MDBX_ENABLE_PGOP_STAT + env->lck->pgops.fsync.weak += 1; +#endif /* MDBX_ENABLE_PGOP_STAT */ + rc = osal_fsync(env->lazy_fd, MDBX_SYNC_DATA | MDBX_SYNC_IODQ); + if (rc != MDBX_SUCCESS) + goto undo; + } } - return rc; -} -MDBX_txn *mdbx_cursor_txn(const MDBX_cursor *mc) { - if (unlikely(!mc || mc->mc_signature != MDBX_MC_LIVE)) - return NULL; - MDBX_txn *txn = mc->mc_txn; - if (unlikely(!txn || txn->mt_signature != MDBX_MT_SIGNATURE)) - return NULL; - if (unlikely(txn->mt_flags & MDBX_TXN_FINISHED)) - return NULL; - return txn; -} + uint64_t timestamp = 0; + while ("workaround for https://libmdbx.dqdkfa.ru/dead-github/issues/269") { + rc = coherency_check_written( + env, pending->unsafe_txnid, target, + bytes2pgno(env, ptr_dist(target, env->dxb_mmap.base)), &timestamp); + if (likely(rc == MDBX_SUCCESS)) + break; + if (unlikely(rc != MDBX_RESULT_TRUE)) + goto fail; + } -MDBX_dbi mdbx_cursor_dbi(const MDBX_cursor *mc) { - if (unlikely(!mc || mc->mc_signature != MDBX_MC_LIVE)) - return UINT_MAX; - return mc->mc_dbi; -} + const uint32_t sync_txnid_dist = + ((flags & MDBX_NOMETASYNC) == 0) ? 0 + : ((flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC) + ? 
MDBX_NOMETASYNC_LAZY_FD + : MDBX_NOMETASYNC_LAZY_WRITEMAP; + env->lck->meta_sync_txnid.weak = + pending->txnid_a[__BYTE_ORDER__ != __ORDER_LITTLE_ENDIAN__].weak - + sync_txnid_dist; -/* Return the count of duplicate data items for the current key */ -int mdbx_cursor_count(const MDBX_cursor *mc, size_t *countp) { - if (unlikely(mc == NULL)) - return MDBX_EINVAL; + *troika = meta_tap(env); + for (MDBX_txn *txn = env->basal_txn; txn; txn = txn->nested) + if (troika != &txn->tw.troika) + txn->tw.troika = *troika; - if (unlikely(mc->mc_signature != MDBX_MC_LIVE)) - return (mc->mc_signature == MDBX_MC_READY4CLOSE) ? MDBX_EINVAL - : MDBX_EBADSIGN; + /* LY: shrink datafile if needed */ + if (unlikely(shrink)) { + VERBOSE("shrink to %" PRIaPGNO " pages (-%" PRIaPGNO ")", + pending->geometry.now, shrink); + rc = dxb_resize(env, pending->geometry.first_unallocated, + pending->geometry.now, pending->geometry.upper, + impilict_shrink); + if (rc != MDBX_SUCCESS && rc != MDBX_EPERM) + goto fail; + eASSERT(env, coherency_check_meta(env, target, true)); + } - int rc = check_txn(mc->mc_txn, MDBX_TXN_BLOCKED); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; + lck_t *const lck = env->lck_mmap.lck; + if (likely(lck)) + /* toggle oldest refresh */ + atomic_store32(&lck->rdt_refresh_flag, false, mo_Relaxed); - if (unlikely(countp == NULL || !(mc->mc_flags & C_INITIALIZED))) - return MDBX_EINVAL; + return MDBX_SUCCESS; - if (!mc->mc_snum) { - *countp = 0; - return MDBX_NOTFOUND; - } +fail: + env->flags |= ENV_FATAL_ERROR; + return rc; +} +/// \copyright SPDX-License-Identifier: Apache-2.0 +/// \author Леонид Юрьев aka Leonid Yuriev \date 2015-2024 - MDBX_page *mp = mc->mc_pg[mc->mc_top]; - if ((mc->mc_flags & C_EOF) && mc->mc_ki[mc->mc_top] >= page_numkeys(mp)) { - *countp = 0; - return MDBX_NOTFOUND; - } - *countp = 1; - if (mc->mc_xcursor != NULL) { - MDBX_node *node = page_node(mp, mc->mc_ki[mc->mc_top]); - if (node_flags(node) & F_DUPDATA) { - cASSERT(mc, mc->mc_xcursor && - 
(mc->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED)); - *countp = unlikely(mc->mc_xcursor->mx_db.md_entries > PTRDIFF_MAX) - ? PTRDIFF_MAX - : (size_t)mc->mc_xcursor->mx_db.md_entries; - } - } - return MDBX_SUCCESS; +__cold static unsigned default_rp_augment_limit(const MDBX_env *env) { + const size_t timeframe = /* 16 секунд */ 16 << 16; + const size_t remain_1sec = + (env->options.gc_time_limit < timeframe) + ? timeframe - (size_t)env->options.gc_time_limit + : 0; + const size_t minimum = (env->maxgc_large1page * 2 > MDBX_PNL_INITIAL) + ? env->maxgc_large1page * 2 + : MDBX_PNL_INITIAL; + const size_t one_third = env->geo_in_bytes.now / 3 >> env->ps2ln; + const size_t augment_limit = + (one_third > minimum) + ? minimum + (one_third - minimum) / timeframe * remain_1sec + : minimum; + eASSERT(env, augment_limit < PAGELIST_LIMIT); + return pnl_bytes2size(pnl_size2bytes(augment_limit)); } -/* Replace the key for a branch node with a new key. - * Set MDBX_TXN_ERROR on failure. - * [in] mc Cursor pointing to the node to operate on. - * [in] key The new key to use. - * Returns 0 on success, non-zero on failure. 
*/ -static int update_key(MDBX_cursor *mc, const MDBX_val *key) { - MDBX_page *mp; - MDBX_node *node; - size_t len; - ptrdiff_t delta, ksize, oksize; - intptr_t ptr, i, nkeys, indx; - DKBUF_DEBUG; +static bool default_prefault_write(const MDBX_env *env) { + return !MDBX_MMAP_INCOHERENT_FILE_WRITE && !env->incore && + (env->flags & (MDBX_WRITEMAP | MDBX_RDONLY)) == MDBX_WRITEMAP; +} - cASSERT(mc, cursor_is_tracked(mc)); - indx = mc->mc_ki[mc->mc_top]; - mp = mc->mc_pg[mc->mc_top]; - node = page_node(mp, indx); - ptr = mp->mp_ptrs[indx]; -#if MDBX_DEBUG - MDBX_val k2; - k2.iov_base = node_key(node); - k2.iov_len = node_ks(node); - DEBUG("update key %zi (offset %zu) [%s] to [%s] on page %" PRIaPGNO, indx, - ptr, DVAL_DEBUG(&k2), DKEY_DEBUG(key), mp->mp_pgno); -#endif /* MDBX_DEBUG */ +static bool default_prefer_waf_insteadof_balance(const MDBX_env *env) { + (void)env; + return false; +} - /* Sizes must be 2-byte aligned. */ - ksize = EVEN(key->iov_len); - oksize = EVEN(node_ks(node)); - delta = ksize - oksize; +static uint16_t default_subpage_limit(const MDBX_env *env) { + (void)env; + return 65535 /* 100% */; +} - /* Shift node contents if EVEN(key length) changed. 
*/ - if (delta) { - if (delta > (int)page_room(mp)) { - /* not enough space left, do a delete and split */ - DEBUG("Not enough room, delta = %zd, splitting...", delta); - pgno_t pgno = node_pgno(node); - node_del(mc, 0); - int err = page_split(mc, key, NULL, pgno, MDBX_SPLIT_REPLACE); - if (err == MDBX_SUCCESS && AUDIT_ENABLED()) - err = cursor_check_updating(mc); - return err; - } +static uint16_t default_subpage_room_threshold(const MDBX_env *env) { + (void)env; + return 0 /* 0% */; +} - nkeys = page_numkeys(mp); - for (i = 0; i < nkeys; i++) { - if (mp->mp_ptrs[i] <= ptr) { - cASSERT(mc, mp->mp_ptrs[i] >= delta); - mp->mp_ptrs[i] -= (indx_t)delta; - } - } +static uint16_t default_subpage_reserve_prereq(const MDBX_env *env) { + (void)env; + return 27525 /* 42% */; +} - void *const base = ptr_disp(mp, mp->mp_upper + PAGEHDRSZ); - len = ptr - mp->mp_upper + NODESIZE; - memmove(ptr_disp(base, -delta), base, len); - cASSERT(mc, mp->mp_upper >= delta); - mp->mp_upper -= (indx_t)delta; +static uint16_t default_subpage_reserve_limit(const MDBX_env *env) { + (void)env; + return 2753 /* 4.2% */; +} - node = page_node(mp, indx); - } +void env_options_init(MDBX_env *env) { + env->options.rp_augment_limit = MDBX_PNL_INITIAL; + env->options.dp_reserve_limit = MDBX_PNL_INITIAL; + env->options.dp_initial = MDBX_PNL_INITIAL; + env->options.spill_max_denominator = 8; + env->options.spill_min_denominator = 8; + env->options.spill_parent4child_denominator = 0; + env->options.dp_loose_limit = 64; + env->options.merge_threshold_16dot16_percent = 65536 / 4 /* 25% */; + if (default_prefer_waf_insteadof_balance(env)) + env->options.prefer_waf_insteadof_balance = true; - /* But even if no shift was needed, update ksize */ - node_set_ks(node, key->iov_len); +#if !(defined(_WIN32) || defined(_WIN64)) + env->options.writethrough_threshold = +#if defined(__linux__) || defined(__gnu_linux__) + globals.running_on_WSL1 ? 
MAX_PAGENO : +#endif /* Linux */ + MDBX_WRITETHROUGH_THRESHOLD_DEFAULT; +#endif /* Windows */ - if (likely(key->iov_len /* to avoid UBSAN traps*/ != 0)) - memcpy(node_key(node), key->iov_base, key->iov_len); - return MDBX_SUCCESS; + env->options.subpage.limit = default_subpage_limit(env); + env->options.subpage.room_threshold = default_subpage_room_threshold(env); + env->options.subpage.reserve_prereq = default_subpage_reserve_prereq(env); + env->options.subpage.reserve_limit = default_subpage_reserve_limit(env); } -/* Move a node from csrc to cdst. */ -static int node_move(MDBX_cursor *csrc, MDBX_cursor *cdst, bool fromleft) { - int rc; - DKBUF_DEBUG; +void env_options_adjust_defaults(MDBX_env *env) { + if (!env->options.flags.non_auto.rp_augment_limit) + env->options.rp_augment_limit = default_rp_augment_limit(env); + if (!env->options.flags.non_auto.prefault_write) + env->options.prefault_write = default_prefault_write(env); - MDBX_page *psrc = csrc->mc_pg[csrc->mc_top]; - MDBX_page *pdst = cdst->mc_pg[cdst->mc_top]; - cASSERT(csrc, PAGETYPE_WHOLE(psrc) == PAGETYPE_WHOLE(pdst)); - cASSERT(csrc, csrc->mc_dbi == cdst->mc_dbi); - cASSERT(csrc, csrc->mc_top == cdst->mc_top); - if (unlikely(PAGETYPE_WHOLE(psrc) != PAGETYPE_WHOLE(pdst))) { - bailout: - ERROR("Wrong or mismatch pages's types (src %d, dst %d) to move node", - PAGETYPE_WHOLE(psrc), PAGETYPE_WHOLE(pdst)); - csrc->mc_txn->mt_flags |= MDBX_TXN_ERROR; - return MDBX_PROBLEM; - } + const size_t basis = env->geo_in_bytes.now; + /* TODO: use options? */ + const unsigned factor = 9; + size_t threshold = (basis < ((size_t)65536 << factor)) + ? 65536 /* minimal threshold */ + : (basis > (MEGABYTE * 4 << factor)) + ? MEGABYTE * 4 /* maximal threshold */ + : basis >> factor; + threshold = + (threshold < env->geo_in_bytes.shrink || !env->geo_in_bytes.shrink) + ? 
threshold + : env->geo_in_bytes.shrink; - MDBX_val key4move; - switch (PAGETYPE_WHOLE(psrc)) { - case P_BRANCH: { - const MDBX_node *srcnode = page_node(psrc, csrc->mc_ki[csrc->mc_top]); - cASSERT(csrc, node_flags(srcnode) == 0); - const pgno_t srcpg = node_pgno(srcnode); - key4move.iov_len = node_ks(srcnode); - key4move.iov_base = node_key(srcnode); + env->madv_threshold = bytes2pgno(env, bytes_align2os_bytes(env, threshold)); +} - if (csrc->mc_ki[csrc->mc_top] == 0) { - const size_t snum = csrc->mc_snum; - cASSERT(csrc, snum > 0); - /* must find the lowest key below src */ - rc = page_search_lowest(csrc); - MDBX_page *lowest_page = csrc->mc_pg[csrc->mc_top]; - if (unlikely(rc)) - return rc; - cASSERT(csrc, IS_LEAF(lowest_page)); - if (unlikely(!IS_LEAF(lowest_page))) - goto bailout; - if (IS_LEAF2(lowest_page)) { - key4move.iov_len = csrc->mc_db->md_xsize; - key4move.iov_base = page_leaf2key(lowest_page, 0, key4move.iov_len); - } else { - const MDBX_node *lowest_node = page_node(lowest_page, 0); - key4move.iov_len = node_ks(lowest_node); - key4move.iov_base = node_key(lowest_node); - } +//------------------------------------------------------------------------------ - /* restore cursor after mdbx_page_search_lowest() */ - csrc->mc_snum = (uint8_t)snum; - csrc->mc_top = (uint8_t)snum - 1; - csrc->mc_ki[csrc->mc_top] = 0; +__cold int mdbx_env_set_option(MDBX_env *env, const MDBX_option_t option, + uint64_t value) { + int err = check_env(env, false); + if (unlikely(err != MDBX_SUCCESS)) + return err; - /* paranoia */ - cASSERT(csrc, psrc == csrc->mc_pg[csrc->mc_top]); - cASSERT(csrc, IS_BRANCH(psrc)); - if (unlikely(!IS_BRANCH(psrc))) - goto bailout; + const bool lock_needed = + ((env->flags & ENV_ACTIVE) && env->basal_txn && !env_txn0_owned(env)); + bool should_unlock = false; + switch (option) { + case MDBX_opt_sync_bytes: + if (value == /* default */ UINT64_MAX) + value = MAX_WRITE; + if (unlikely(env->flags & MDBX_RDONLY)) + return MDBX_EACCESS; + if 
(unlikely(!(env->flags & ENV_ACTIVE))) + return MDBX_EPERM; + if (unlikely(value > SIZE_MAX - 65536)) + return MDBX_EINVAL; + value = bytes2pgno(env, (size_t)value + env->ps - 1); + if ((uint32_t)value != + atomic_load32(&env->lck->autosync_threshold, mo_AcquireRelease) && + atomic_store32(&env->lck->autosync_threshold, (uint32_t)value, + mo_Relaxed) + /* Дергаем sync(force=off) только если задано новое не-нулевое значение + * и мы вне транзакции */ + && lock_needed) { + err = env_sync(env, false, false); + if (err == /* нечего сбрасывать на диск */ MDBX_RESULT_TRUE) + err = MDBX_SUCCESS; } + break; - if (cdst->mc_ki[cdst->mc_top] == 0) { - const size_t snum = cdst->mc_snum; - cASSERT(csrc, snum > 0); - MDBX_cursor mn; - cursor_copy(cdst, &mn); - /* must find the lowest key below dst */ - rc = page_search_lowest(&mn); - if (unlikely(rc)) - return rc; - MDBX_page *const lowest_page = mn.mc_pg[mn.mc_top]; - cASSERT(cdst, IS_LEAF(lowest_page)); - if (unlikely(!IS_LEAF(lowest_page))) - goto bailout; - MDBX_val key; - if (IS_LEAF2(lowest_page)) { - key.iov_len = mn.mc_db->md_xsize; - key.iov_base = page_leaf2key(lowest_page, 0, key.iov_len); - } else { - MDBX_node *lowest_node = page_node(lowest_page, 0); - key.iov_len = node_ks(lowest_node); - key.iov_base = node_key(lowest_node); - } - - /* restore cursor after mdbx_page_search_lowest() */ - mn.mc_snum = (uint8_t)snum; - mn.mc_top = (uint8_t)snum - 1; - mn.mc_ki[mn.mc_top] = 0; - - const intptr_t delta = - EVEN(key.iov_len) - EVEN(node_ks(page_node(mn.mc_pg[mn.mc_top], 0))); - const intptr_t needed = - branch_size(cdst->mc_txn->mt_env, &key4move) + delta; - const intptr_t have = page_room(pdst); - if (unlikely(needed > have)) - return MDBX_RESULT_TRUE; + case MDBX_opt_sync_period: + if (value == /* default */ UINT64_MAX) + value = 2780315 /* 42.42424 секунды */; + if (unlikely(env->flags & MDBX_RDONLY)) + return MDBX_EACCESS; + if (unlikely(!(env->flags & ENV_ACTIVE))) + return MDBX_EPERM; + if (unlikely(value > 
UINT32_MAX)) + return MDBX_EINVAL; + value = osal_16dot16_to_monotime((uint32_t)value); + if (value != atomic_load64(&env->lck->autosync_period, mo_AcquireRelease) && + atomic_store64(&env->lck->autosync_period, value, mo_Relaxed) + /* Дергаем sync(force=off) только если задано новое не-нулевое значение + * и мы вне транзакции */ + && lock_needed) { + err = env_sync(env, false, false); + if (err == /* нечего сбрасывать на диск */ MDBX_RESULT_TRUE) + err = MDBX_SUCCESS; + } + break; - if (unlikely((rc = page_touch(csrc)) || (rc = page_touch(cdst)))) - return rc; - psrc = csrc->mc_pg[csrc->mc_top]; - pdst = cdst->mc_pg[cdst->mc_top]; + case MDBX_opt_max_db: + if (value == /* default */ UINT64_MAX) + value = 42; + if (unlikely(value > MDBX_MAX_DBI)) + return MDBX_EINVAL; + if (unlikely(env->dxb_mmap.base)) + return MDBX_EPERM; + env->max_dbi = (unsigned)value + CORE_DBS; + break; - WITH_CURSOR_TRACKING(mn, rc = update_key(&mn, &key)); - if (unlikely(rc)) - return rc; - } else { - const size_t needed = branch_size(cdst->mc_txn->mt_env, &key4move); - const size_t have = page_room(pdst); - if (unlikely(needed > have)) - return MDBX_RESULT_TRUE; + case MDBX_opt_max_readers: + if (value == /* default */ UINT64_MAX) + value = MDBX_READERS_LIMIT; + if (unlikely(value < 1 || value > MDBX_READERS_LIMIT)) + return MDBX_EINVAL; + if (unlikely(env->dxb_mmap.base)) + return MDBX_EPERM; + env->max_readers = (unsigned)value; + break; - if (unlikely((rc = page_touch(csrc)) || (rc = page_touch(cdst)))) - return rc; - psrc = csrc->mc_pg[csrc->mc_top]; - pdst = cdst->mc_pg[cdst->mc_top]; + case MDBX_opt_dp_reserve_limit: + if (value == /* default */ UINT64_MAX) + value = INT_MAX; + if (unlikely(value > INT_MAX)) + return MDBX_EINVAL; + if (env->options.dp_reserve_limit != (unsigned)value) { + if (lock_needed) { + err = lck_txn_lock(env, false); + if (unlikely(err != MDBX_SUCCESS)) + return err; + should_unlock = true; + } + env->options.dp_reserve_limit = (unsigned)value; + while 
(env->shadow_reserve_len > env->options.dp_reserve_limit) { + eASSERT(env, env->shadow_reserve != nullptr); + page_t *dp = env->shadow_reserve; + MDBX_ASAN_UNPOISON_MEMORY_REGION(dp, env->ps); + VALGRIND_MAKE_MEM_DEFINED(&page_next(dp), sizeof(page_t *)); + env->shadow_reserve = page_next(dp); + void *const ptr = ptr_disp(dp, -(ptrdiff_t)sizeof(size_t)); + osal_free(ptr); + env->shadow_reserve_len -= 1; + } } + break; - DEBUG("moving %s-node %u [%s] on page %" PRIaPGNO - " to node %u on page %" PRIaPGNO, - "branch", csrc->mc_ki[csrc->mc_top], DKEY_DEBUG(&key4move), - psrc->mp_pgno, cdst->mc_ki[cdst->mc_top], pdst->mp_pgno); - /* Add the node to the destination page. */ - rc = node_add_branch(cdst, cdst->mc_ki[cdst->mc_top], &key4move, srcpg); - } break; - - case P_LEAF: { - /* Mark src and dst as dirty. */ - if (unlikely((rc = page_touch(csrc)) || (rc = page_touch(cdst)))) - return rc; - psrc = csrc->mc_pg[csrc->mc_top]; - pdst = cdst->mc_pg[cdst->mc_top]; - const MDBX_node *srcnode = page_node(psrc, csrc->mc_ki[csrc->mc_top]); - MDBX_val data; - data.iov_len = node_ds(srcnode); - data.iov_base = node_data(srcnode); - key4move.iov_len = node_ks(srcnode); - key4move.iov_base = node_key(srcnode); - DEBUG("moving %s-node %u [%s] on page %" PRIaPGNO - " to node %u on page %" PRIaPGNO, - "leaf", csrc->mc_ki[csrc->mc_top], DKEY_DEBUG(&key4move), - psrc->mp_pgno, cdst->mc_ki[cdst->mc_top], pdst->mp_pgno); - /* Add the node to the destination page. 
*/ - rc = node_add_leaf(cdst, cdst->mc_ki[cdst->mc_top], &key4move, &data, - node_flags(srcnode)); - } break; + case MDBX_opt_rp_augment_limit: + if (value == /* default */ UINT64_MAX) { + env->options.flags.non_auto.rp_augment_limit = 0; + env->options.rp_augment_limit = default_rp_augment_limit(env); + } else if (unlikely(value > PAGELIST_LIMIT)) + return MDBX_EINVAL; + else { + env->options.flags.non_auto.rp_augment_limit = 1; + env->options.rp_augment_limit = (unsigned)value; + } + break; - case P_LEAF | P_LEAF2: { - /* Mark src and dst as dirty. */ - if (unlikely((rc = page_touch(csrc)) || (rc = page_touch(cdst)))) - return rc; - psrc = csrc->mc_pg[csrc->mc_top]; - pdst = cdst->mc_pg[cdst->mc_top]; - key4move.iov_len = csrc->mc_db->md_xsize; - key4move.iov_base = - page_leaf2key(psrc, csrc->mc_ki[csrc->mc_top], key4move.iov_len); - DEBUG("moving %s-node %u [%s] on page %" PRIaPGNO - " to node %u on page %" PRIaPGNO, - "leaf2", csrc->mc_ki[csrc->mc_top], DKEY_DEBUG(&key4move), - psrc->mp_pgno, cdst->mc_ki[cdst->mc_top], pdst->mp_pgno); - /* Add the node to the destination page. 
*/ - rc = node_add_leaf2(cdst, cdst->mc_ki[cdst->mc_top], &key4move); - } break; + case MDBX_opt_gc_time_limit: + if (value == /* default */ UINT64_MAX) + value = 0; + if (unlikely(value > UINT32_MAX)) + return MDBX_EINVAL; + if (unlikely(env->flags & MDBX_RDONLY)) + return MDBX_EACCESS; + value = osal_16dot16_to_monotime((uint32_t)value); + if (value != env->options.gc_time_limit) { + if (env->txn && lock_needed) + return MDBX_EPERM; + env->options.gc_time_limit = value; + if (!env->options.flags.non_auto.rp_augment_limit) + env->options.rp_augment_limit = default_rp_augment_limit(env); + } + break; - default: - assert(false); - goto bailout; - } + case MDBX_opt_txn_dp_limit: + case MDBX_opt_txn_dp_initial: + if (value == /* default */ UINT64_MAX) + value = PAGELIST_LIMIT; + if (unlikely(value > PAGELIST_LIMIT || value < CURSOR_STACK_SIZE * 4)) + return MDBX_EINVAL; + if (unlikely(env->flags & MDBX_RDONLY)) + return MDBX_EACCESS; + if (lock_needed) { + err = lck_txn_lock(env, false); + if (unlikely(err != MDBX_SUCCESS)) + return err; + should_unlock = true; + } + if (env->txn) + err = MDBX_EPERM /* unable change during transaction */; + else { + const pgno_t value32 = (pgno_t)value; + if (option == MDBX_opt_txn_dp_initial && + env->options.dp_initial != value32) { + env->options.dp_initial = value32; + if (env->options.dp_limit < value32) { + env->options.dp_limit = value32; + env->options.flags.non_auto.dp_limit = 1; + } + } + if (option == MDBX_opt_txn_dp_limit && env->options.dp_limit != value32) { + env->options.dp_limit = value32; + env->options.flags.non_auto.dp_limit = 1; + if (env->options.dp_initial > value32) + env->options.dp_initial = value32; + } + } + break; - if (unlikely(rc != MDBX_SUCCESS)) - return rc; + case MDBX_opt_spill_max_denominator: + if (value == /* default */ UINT64_MAX) + value = 8; + if (unlikely(value > 255)) + return MDBX_EINVAL; + env->options.spill_max_denominator = (uint8_t)value; + break; + case MDBX_opt_spill_min_denominator: + 
if (value == /* default */ UINT64_MAX) + value = 8; + if (unlikely(value > 255)) + return MDBX_EINVAL; + env->options.spill_min_denominator = (uint8_t)value; + break; + case MDBX_opt_spill_parent4child_denominator: + if (value == /* default */ UINT64_MAX) + value = 0; + if (unlikely(value > 255)) + return MDBX_EINVAL; + env->options.spill_parent4child_denominator = (uint8_t)value; + break; - /* Delete the node from the source page. */ - node_del(csrc, key4move.iov_len); + case MDBX_opt_loose_limit: + if (value == /* default */ UINT64_MAX) + value = 64; + if (unlikely(value > 255)) + return MDBX_EINVAL; + env->options.dp_loose_limit = (uint8_t)value; + break; - cASSERT(csrc, psrc == csrc->mc_pg[csrc->mc_top]); - cASSERT(cdst, pdst == cdst->mc_pg[cdst->mc_top]); - cASSERT(csrc, PAGETYPE_WHOLE(psrc) == PAGETYPE_WHOLE(pdst)); + case MDBX_opt_merge_threshold_16dot16_percent: + if (value == /* default */ UINT64_MAX) + value = 65536 / 4 /* 25% */; + if (unlikely(value < 8192 || value > 32768)) + return MDBX_EINVAL; + env->options.merge_threshold_16dot16_percent = (unsigned)value; + recalculate_merge_thresholds(env); + break; - { - /* Adjust other cursors pointing to mp */ - MDBX_cursor *m2, *m3; - const MDBX_dbi dbi = csrc->mc_dbi; - cASSERT(csrc, csrc->mc_top == cdst->mc_top); - if (fromleft) { - /* If we're adding on the left, bump others up */ - for (m2 = csrc->mc_txn->mt_cursors[dbi]; m2; m2 = m2->mc_next) { - m3 = (csrc->mc_flags & C_SUB) ? 
&m2->mc_xcursor->mx_cursor : m2; - if (!(m3->mc_flags & C_INITIALIZED) || m3->mc_top < csrc->mc_top) - continue; - if (m3 != cdst && m3->mc_pg[csrc->mc_top] == pdst && - m3->mc_ki[csrc->mc_top] >= cdst->mc_ki[csrc->mc_top]) { - m3->mc_ki[csrc->mc_top]++; - } - if (m3 != csrc && m3->mc_pg[csrc->mc_top] == psrc && - m3->mc_ki[csrc->mc_top] == csrc->mc_ki[csrc->mc_top]) { - m3->mc_pg[csrc->mc_top] = pdst; - m3->mc_ki[csrc->mc_top] = cdst->mc_ki[cdst->mc_top]; - cASSERT(csrc, csrc->mc_top > 0); - m3->mc_ki[csrc->mc_top - 1]++; - } - if (XCURSOR_INITED(m3) && IS_LEAF(psrc)) - XCURSOR_REFRESH(m3, m3->mc_pg[csrc->mc_top], m3->mc_ki[csrc->mc_top]); - } - } else { - /* Adding on the right, bump others down */ - for (m2 = csrc->mc_txn->mt_cursors[dbi]; m2; m2 = m2->mc_next) { - m3 = (csrc->mc_flags & C_SUB) ? &m2->mc_xcursor->mx_cursor : m2; - if (m3 == csrc) - continue; - if (!(m3->mc_flags & C_INITIALIZED) || m3->mc_top < csrc->mc_top) - continue; - if (m3->mc_pg[csrc->mc_top] == psrc) { - if (!m3->mc_ki[csrc->mc_top]) { - m3->mc_pg[csrc->mc_top] = pdst; - m3->mc_ki[csrc->mc_top] = cdst->mc_ki[cdst->mc_top]; - cASSERT(csrc, csrc->mc_top > 0); - m3->mc_ki[csrc->mc_top - 1]--; - } else { - m3->mc_ki[csrc->mc_top]--; - } - if (XCURSOR_INITED(m3) && IS_LEAF(psrc)) - XCURSOR_REFRESH(m3, m3->mc_pg[csrc->mc_top], - m3->mc_ki[csrc->mc_top]); - } - } + case MDBX_opt_writethrough_threshold: +#if defined(_WIN32) || defined(_WIN64) + /* позволяем "установить" значение по-умолчанию и совпадающее + * с поведением соответствующим текущей установке MDBX_NOMETASYNC */ + if (value == /* default */ UINT64_MAX && + value != ((env->flags & MDBX_NOMETASYNC) ? 
0 : UINT_MAX)) + err = MDBX_EINVAL; +#else + if (value == /* default */ UINT64_MAX) + value = MDBX_WRITETHROUGH_THRESHOLD_DEFAULT; + if (value != (unsigned)value) + err = MDBX_EINVAL; + else + env->options.writethrough_threshold = (unsigned)value; +#endif + break; + + case MDBX_opt_prefault_write_enable: + if (value == /* default */ UINT64_MAX) { + env->options.prefault_write = default_prefault_write(env); + env->options.flags.non_auto.prefault_write = false; + } else if (value > 1) + err = MDBX_EINVAL; + else { + env->options.prefault_write = value != 0; + env->options.flags.non_auto.prefault_write = true; } - } + break; - /* Update the parent separators. */ - if (csrc->mc_ki[csrc->mc_top] == 0) { - cASSERT(csrc, csrc->mc_top > 0); - if (csrc->mc_ki[csrc->mc_top - 1] != 0) { - MDBX_val key; - if (IS_LEAF2(psrc)) { - key.iov_len = psrc->mp_leaf2_ksize; - key.iov_base = page_leaf2key(psrc, 0, key.iov_len); - } else { - MDBX_node *srcnode = page_node(psrc, 0); - key.iov_len = node_ks(srcnode); - key.iov_base = node_key(srcnode); - } - DEBUG("update separator for source page %" PRIaPGNO " to [%s]", - psrc->mp_pgno, DKEY_DEBUG(&key)); - MDBX_cursor mn; - cursor_copy(csrc, &mn); - cASSERT(csrc, mn.mc_snum > 0); - mn.mc_snum--; - mn.mc_top--; - /* We want rebalance to find mn when doing fixups */ - WITH_CURSOR_TRACKING(mn, rc = update_key(&mn, &key)); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; + case MDBX_opt_prefer_waf_insteadof_balance: + if (value == /* default */ UINT64_MAX) + env->options.prefer_waf_insteadof_balance = + default_prefer_waf_insteadof_balance(env); + else if (value > 1) + err = MDBX_EINVAL; + else + env->options.prefer_waf_insteadof_balance = value != 0; + break; + + case MDBX_opt_subpage_limit: + if (value == /* default */ UINT64_MAX) { + env->options.subpage.limit = default_subpage_limit(env); + recalculate_subpage_thresholds(env); + } else if (value > 65535) + err = MDBX_EINVAL; + else { + env->options.subpage.limit = (uint16_t)value; + 
recalculate_subpage_thresholds(env); } - if (IS_BRANCH(psrc)) { - const MDBX_val nullkey = {0, 0}; - const indx_t ix = csrc->mc_ki[csrc->mc_top]; - csrc->mc_ki[csrc->mc_top] = 0; - rc = update_key(csrc, &nullkey); - csrc->mc_ki[csrc->mc_top] = ix; - cASSERT(csrc, rc == MDBX_SUCCESS); + break; + + case MDBX_opt_subpage_room_threshold: + if (value == /* default */ UINT64_MAX) { + env->options.subpage.room_threshold = default_subpage_room_threshold(env); + recalculate_subpage_thresholds(env); + } else if (value > 65535) + err = MDBX_EINVAL; + else { + env->options.subpage.room_threshold = (uint16_t)value; + recalculate_subpage_thresholds(env); } - } + break; - if (cdst->mc_ki[cdst->mc_top] == 0) { - cASSERT(cdst, cdst->mc_top > 0); - if (cdst->mc_ki[cdst->mc_top - 1] != 0) { - MDBX_val key; - if (IS_LEAF2(pdst)) { - key.iov_len = pdst->mp_leaf2_ksize; - key.iov_base = page_leaf2key(pdst, 0, key.iov_len); - } else { - MDBX_node *srcnode = page_node(pdst, 0); - key.iov_len = node_ks(srcnode); - key.iov_base = node_key(srcnode); - } - DEBUG("update separator for destination page %" PRIaPGNO " to [%s]", - pdst->mp_pgno, DKEY_DEBUG(&key)); - MDBX_cursor mn; - cursor_copy(cdst, &mn); - cASSERT(cdst, mn.mc_snum > 0); - mn.mc_snum--; - mn.mc_top--; - /* We want rebalance to find mn when doing fixups */ - WITH_CURSOR_TRACKING(mn, rc = update_key(&mn, &key)); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; + case MDBX_opt_subpage_reserve_prereq: + if (value == /* default */ UINT64_MAX) { + env->options.subpage.reserve_prereq = default_subpage_reserve_prereq(env); + recalculate_subpage_thresholds(env); + } else if (value > 65535) + err = MDBX_EINVAL; + else { + env->options.subpage.reserve_prereq = (uint16_t)value; + recalculate_subpage_thresholds(env); } - if (IS_BRANCH(pdst)) { - const MDBX_val nullkey = {0, 0}; - const indx_t ix = cdst->mc_ki[cdst->mc_top]; - cdst->mc_ki[cdst->mc_top] = 0; - rc = update_key(cdst, &nullkey); - cdst->mc_ki[cdst->mc_top] = ix; - cASSERT(cdst, 
rc == MDBX_SUCCESS); + break; + + case MDBX_opt_subpage_reserve_limit: + if (value == /* default */ UINT64_MAX) { + env->options.subpage.reserve_limit = default_subpage_reserve_limit(env); + recalculate_subpage_thresholds(env); + } else if (value > 65535) + err = MDBX_EINVAL; + else { + env->options.subpage.reserve_limit = (uint16_t)value; + recalculate_subpage_thresholds(env); } + break; + + default: + return MDBX_EINVAL; } - return MDBX_SUCCESS; + if (should_unlock) + lck_txn_unlock(env); + return err; } -/* Merge one page into another. - * - * The nodes from the page pointed to by csrc will be copied to the page - * pointed to by cdst and then the csrc page will be freed. - * - * [in] csrc Cursor pointing to the source page. - * [in] cdst Cursor pointing to the destination page. - * - * Returns 0 on success, non-zero on failure. */ -static int page_merge(MDBX_cursor *csrc, MDBX_cursor *cdst) { - MDBX_val key; - int rc; +__cold int mdbx_env_get_option(const MDBX_env *env, const MDBX_option_t option, + uint64_t *pvalue) { + int err = check_env(env, false); + if (unlikely(err != MDBX_SUCCESS)) + return err; + if (unlikely(!pvalue)) + return MDBX_EINVAL; - cASSERT(csrc, csrc != cdst); - cASSERT(csrc, cursor_is_tracked(csrc)); - cASSERT(cdst, cursor_is_tracked(cdst)); - const MDBX_page *const psrc = csrc->mc_pg[csrc->mc_top]; - MDBX_page *pdst = cdst->mc_pg[cdst->mc_top]; - DEBUG("merging page %" PRIaPGNO " into %" PRIaPGNO, psrc->mp_pgno, - pdst->mp_pgno); - - cASSERT(csrc, PAGETYPE_WHOLE(psrc) == PAGETYPE_WHOLE(pdst)); - cASSERT(csrc, csrc->mc_dbi == cdst->mc_dbi && csrc->mc_db == cdst->mc_db); - cASSERT(csrc, csrc->mc_snum > 1); /* can't merge root page */ - cASSERT(cdst, cdst->mc_snum > 1); - cASSERT(cdst, cdst->mc_snum < cdst->mc_db->md_depth || - IS_LEAF(cdst->mc_pg[cdst->mc_db->md_depth - 1])); - cASSERT(csrc, csrc->mc_snum < csrc->mc_db->md_depth || - IS_LEAF(csrc->mc_pg[csrc->mc_db->md_depth - 1])); - cASSERT(cdst, 
csrc->mc_txn->mt_env->me_options.prefer_waf_insteadof_balance || - page_room(pdst) >= page_used(cdst->mc_txn->mt_env, psrc)); - const int pagetype = PAGETYPE_WHOLE(psrc); + switch (option) { + case MDBX_opt_sync_bytes: + if (unlikely(!(env->flags & ENV_ACTIVE))) + return MDBX_EPERM; + *pvalue = pgno2bytes( + env, atomic_load32(&env->lck->autosync_threshold, mo_Relaxed)); + break; - /* Move all nodes from src to dst */ - const size_t dst_nkeys = page_numkeys(pdst); - const size_t src_nkeys = page_numkeys(psrc); - cASSERT(cdst, dst_nkeys + src_nkeys >= (IS_LEAF(psrc) ? 1u : 2u)); - if (likely(src_nkeys)) { - size_t j = dst_nkeys; - if (unlikely(pagetype & P_LEAF2)) { - /* Mark dst as dirty. */ - rc = page_touch(cdst); - cASSERT(cdst, rc != MDBX_RESULT_TRUE); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; + case MDBX_opt_sync_period: + if (unlikely(!(env->flags & ENV_ACTIVE))) + return MDBX_EPERM; + *pvalue = osal_monotime_to_16dot16( + atomic_load64(&env->lck->autosync_period, mo_Relaxed)); + break; - key.iov_len = csrc->mc_db->md_xsize; - key.iov_base = page_data(psrc); - size_t i = 0; - do { - rc = node_add_leaf2(cdst, j++, &key); - cASSERT(cdst, rc != MDBX_RESULT_TRUE); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; - key.iov_base = ptr_disp(key.iov_base, key.iov_len); - } while (++i != src_nkeys); - } else { - MDBX_node *srcnode = page_node(psrc, 0); - key.iov_len = node_ks(srcnode); - key.iov_base = node_key(srcnode); - if (pagetype & P_BRANCH) { - MDBX_cursor mn; - cursor_copy(csrc, &mn); - /* must find the lowest key below src */ - rc = page_search_lowest(&mn); - cASSERT(csrc, rc != MDBX_RESULT_TRUE); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; + case MDBX_opt_max_db: + *pvalue = env->max_dbi - CORE_DBS; + break; - const MDBX_page *mp = mn.mc_pg[mn.mc_top]; - if (likely(!IS_LEAF2(mp))) { - cASSERT(&mn, IS_LEAF(mp)); - const MDBX_node *lowest = page_node(mp, 0); - key.iov_len = node_ks(lowest); - key.iov_base = node_key(lowest); - } else { - 
cASSERT(&mn, mn.mc_top > csrc->mc_top); - key.iov_len = mp->mp_leaf2_ksize; - key.iov_base = page_leaf2key(mp, mn.mc_ki[mn.mc_top], key.iov_len); - } - cASSERT(&mn, key.iov_len >= csrc->mc_dbx->md_klen_min); - cASSERT(&mn, key.iov_len <= csrc->mc_dbx->md_klen_max); + case MDBX_opt_max_readers: + *pvalue = env->max_readers; + break; - const size_t dst_room = page_room(pdst); - const size_t src_used = page_used(cdst->mc_txn->mt_env, psrc); - const size_t space_needed = src_used - node_ks(srcnode) + key.iov_len; - if (unlikely(space_needed > dst_room)) - return MDBX_RESULT_TRUE; - } + case MDBX_opt_dp_reserve_limit: + *pvalue = env->options.dp_reserve_limit; + break; - /* Mark dst as dirty. */ - rc = page_touch(cdst); - cASSERT(cdst, rc != MDBX_RESULT_TRUE); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; + case MDBX_opt_rp_augment_limit: + *pvalue = env->options.rp_augment_limit; + break; - size_t i = 0; - while (true) { - if (pagetype & P_LEAF) { - MDBX_val data; - data.iov_len = node_ds(srcnode); - data.iov_base = node_data(srcnode); - rc = node_add_leaf(cdst, j++, &key, &data, node_flags(srcnode)); - } else { - cASSERT(csrc, node_flags(srcnode) == 0); - rc = node_add_branch(cdst, j++, &key, node_pgno(srcnode)); - } - cASSERT(cdst, rc != MDBX_RESULT_TRUE); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; + case MDBX_opt_gc_time_limit: + *pvalue = osal_monotime_to_16dot16(env->options.gc_time_limit); + break; - if (++i == src_nkeys) - break; - srcnode = page_node(psrc, i); - key.iov_len = node_ks(srcnode); - key.iov_base = node_key(srcnode); - } - } + case MDBX_opt_txn_dp_limit: + *pvalue = env->options.dp_limit; + break; + case MDBX_opt_txn_dp_initial: + *pvalue = env->options.dp_initial; + break; - pdst = cdst->mc_pg[cdst->mc_top]; - DEBUG("dst page %" PRIaPGNO " now has %zu keys (%.1f%% filled)", - pdst->mp_pgno, page_numkeys(pdst), - page_fill(cdst->mc_txn->mt_env, pdst)); + case MDBX_opt_spill_max_denominator: + *pvalue = env->options.spill_max_denominator; + 
break; + case MDBX_opt_spill_min_denominator: + *pvalue = env->options.spill_min_denominator; + break; + case MDBX_opt_spill_parent4child_denominator: + *pvalue = env->options.spill_parent4child_denominator; + break; - cASSERT(csrc, psrc == csrc->mc_pg[csrc->mc_top]); - cASSERT(cdst, pdst == cdst->mc_pg[cdst->mc_top]); - } + case MDBX_opt_loose_limit: + *pvalue = env->options.dp_loose_limit; + break; - /* Unlink the src page from parent and add to free list. */ - csrc->mc_top--; - node_del(csrc, 0); - if (csrc->mc_ki[csrc->mc_top] == 0) { - const MDBX_val nullkey = {0, 0}; - rc = update_key(csrc, &nullkey); - cASSERT(csrc, rc != MDBX_RESULT_TRUE); - if (unlikely(rc != MDBX_SUCCESS)) { - csrc->mc_top++; - return rc; - } - } - csrc->mc_top++; + case MDBX_opt_merge_threshold_16dot16_percent: + *pvalue = env->options.merge_threshold_16dot16_percent; + break; - cASSERT(csrc, psrc == csrc->mc_pg[csrc->mc_top]); - cASSERT(cdst, pdst == cdst->mc_pg[cdst->mc_top]); + case MDBX_opt_writethrough_threshold: +#if defined(_WIN32) || defined(_WIN64) + *pvalue = (env->flags & MDBX_NOMETASYNC) ? 0 : INT_MAX; +#else + *pvalue = env->options.writethrough_threshold; +#endif + break; - { - /* Adjust other cursors pointing to mp */ - MDBX_cursor *m2, *m3; - const MDBX_dbi dbi = csrc->mc_dbi; - const size_t top = csrc->mc_top; + case MDBX_opt_prefault_write_enable: + *pvalue = env->options.prefault_write; + break; - for (m2 = csrc->mc_txn->mt_cursors[dbi]; m2; m2 = m2->mc_next) { - m3 = (csrc->mc_flags & C_SUB) ? 
&m2->mc_xcursor->mx_cursor : m2; - if (m3 == csrc || top >= m3->mc_snum) - continue; - if (m3->mc_pg[top] == psrc) { - m3->mc_pg[top] = pdst; - cASSERT(m3, dst_nkeys + m3->mc_ki[top] <= UINT16_MAX); - m3->mc_ki[top] += (indx_t)dst_nkeys; - m3->mc_ki[top - 1] = cdst->mc_ki[top - 1]; - } else if (m3->mc_pg[top - 1] == csrc->mc_pg[top - 1] && - m3->mc_ki[top - 1] > csrc->mc_ki[top - 1]) { - m3->mc_ki[top - 1]--; - } - if (XCURSOR_INITED(m3) && IS_LEAF(psrc)) - XCURSOR_REFRESH(m3, m3->mc_pg[top], m3->mc_ki[top]); - } - } + case MDBX_opt_prefer_waf_insteadof_balance: + *pvalue = env->options.prefer_waf_insteadof_balance; + break; - rc = page_retire(csrc, (MDBX_page *)psrc); - cASSERT(csrc, rc != MDBX_RESULT_TRUE); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; + case MDBX_opt_subpage_limit: + *pvalue = env->options.subpage.limit; + break; - cASSERT(cdst, cdst->mc_db->md_entries > 0); - cASSERT(cdst, cdst->mc_snum <= cdst->mc_db->md_depth); - cASSERT(cdst, cdst->mc_top > 0); - cASSERT(cdst, cdst->mc_snum == cdst->mc_top + 1); - MDBX_page *const top_page = cdst->mc_pg[cdst->mc_top]; - const indx_t top_indx = cdst->mc_ki[cdst->mc_top]; - const unsigned save_snum = cdst->mc_snum; - const uint16_t save_depth = cdst->mc_db->md_depth; - cursor_pop(cdst); - rc = rebalance(cdst); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; + case MDBX_opt_subpage_room_threshold: + *pvalue = env->options.subpage.room_threshold; + break; - cASSERT(cdst, cdst->mc_db->md_entries > 0); - cASSERT(cdst, cdst->mc_snum <= cdst->mc_db->md_depth); - cASSERT(cdst, cdst->mc_snum == cdst->mc_top + 1); + case MDBX_opt_subpage_reserve_prereq: + *pvalue = env->options.subpage.reserve_prereq; + break; -#if MDBX_ENABLE_PGOP_STAT - cdst->mc_txn->mt_env->me_lck->mti_pgop_stat.merge.weak += 1; -#endif /* MDBX_ENABLE_PGOP_STAT */ + case MDBX_opt_subpage_reserve_limit: + *pvalue = env->options.subpage.reserve_limit; + break; - if (IS_LEAF(cdst->mc_pg[cdst->mc_top])) { - /* LY: don't touch cursor if top-page is a 
LEAF */ - cASSERT(cdst, IS_LEAF(cdst->mc_pg[cdst->mc_top]) || - PAGETYPE_WHOLE(cdst->mc_pg[cdst->mc_top]) == pagetype); - return MDBX_SUCCESS; + default: + return MDBX_EINVAL; } - cASSERT(cdst, page_numkeys(top_page) == dst_nkeys + src_nkeys); + return MDBX_SUCCESS; +} +/// \copyright SPDX-License-Identifier: Apache-2.0 +/// \author Леонид Юрьев aka Leonid Yuriev \date 2015-2024 - if (unlikely(pagetype != PAGETYPE_WHOLE(top_page))) { - /* LY: LEAF-page becomes BRANCH, unable restore cursor's stack */ - goto bailout; - } - if (top_page == cdst->mc_pg[cdst->mc_top]) { - /* LY: don't touch cursor if prev top-page already on the top */ - cASSERT(cdst, cdst->mc_ki[cdst->mc_top] == top_indx); - cASSERT(cdst, IS_LEAF(cdst->mc_pg[cdst->mc_top]) || - PAGETYPE_WHOLE(cdst->mc_pg[cdst->mc_top]) == pagetype); - return MDBX_SUCCESS; - } +bool env_txn0_owned(const MDBX_env *env) { + return (env->flags & MDBX_NOSTICKYTHREADS) + ? (env->basal_txn->owner != 0) + : (env->basal_txn->owner == osal_thread_self()); +} - const int new_snum = save_snum - save_depth + cdst->mc_db->md_depth; - if (unlikely(new_snum < 1 || new_snum > cdst->mc_db->md_depth)) { - /* LY: out of range, unable restore cursor's stack */ - goto bailout; - } +int env_page_auxbuffer(MDBX_env *env) { + return env->page_auxbuf ? 
MDBX_SUCCESS + : osal_memalign_alloc(globals.sys_pagesize, + env->ps * (size_t)NUM_METAS, + &env->page_auxbuf); +} - if (top_page == cdst->mc_pg[new_snum - 1]) { - cASSERT(cdst, cdst->mc_ki[new_snum - 1] == top_indx); - /* LY: restore cursor stack */ - cdst->mc_snum = (uint8_t)new_snum; - cdst->mc_top = (uint8_t)new_snum - 1; - cASSERT(cdst, cdst->mc_snum < cdst->mc_db->md_depth || - IS_LEAF(cdst->mc_pg[cdst->mc_db->md_depth - 1])); - cASSERT(cdst, IS_LEAF(cdst->mc_pg[cdst->mc_top]) || - PAGETYPE_WHOLE(cdst->mc_pg[cdst->mc_top]) == pagetype); - return MDBX_SUCCESS; +__cold unsigned env_setup_pagesize(MDBX_env *env, const size_t pagesize) { + STATIC_ASSERT(PTRDIFF_MAX > MAX_MAPSIZE); + STATIC_ASSERT(MDBX_MIN_PAGESIZE > sizeof(page_t) + sizeof(meta_t)); + ENSURE(env, is_powerof2(pagesize)); + ENSURE(env, pagesize >= MDBX_MIN_PAGESIZE); + ENSURE(env, pagesize <= MDBX_MAX_PAGESIZE); + env->ps = (unsigned)pagesize; + if (env->page_auxbuf) { + osal_memalign_free(env->page_auxbuf); + env->page_auxbuf = nullptr; } - MDBX_page *const stub_page = (MDBX_page *)(~(uintptr_t)top_page); - const indx_t stub_indx = top_indx; - if (save_depth > cdst->mc_db->md_depth && - ((cdst->mc_pg[save_snum - 1] == top_page && - cdst->mc_ki[save_snum - 1] == top_indx) || - (cdst->mc_pg[save_snum - 1] == stub_page && - cdst->mc_ki[save_snum - 1] == stub_indx))) { - /* LY: restore cursor stack */ - cdst->mc_pg[new_snum - 1] = top_page; - cdst->mc_ki[new_snum - 1] = top_indx; - cdst->mc_pg[new_snum] = (MDBX_page *)(~(uintptr_t)cdst->mc_pg[new_snum]); - cdst->mc_ki[new_snum] = ~cdst->mc_ki[new_snum]; - cdst->mc_snum = (uint8_t)new_snum; - cdst->mc_top = (uint8_t)new_snum - 1; - cASSERT(cdst, cdst->mc_snum < cdst->mc_db->md_depth || - IS_LEAF(cdst->mc_pg[cdst->mc_db->md_depth - 1])); - cASSERT(cdst, IS_LEAF(cdst->mc_pg[cdst->mc_top]) || - PAGETYPE_WHOLE(cdst->mc_pg[cdst->mc_top]) == pagetype); - return MDBX_SUCCESS; - } + STATIC_ASSERT(MAX_GC1OVPAGE(MDBX_MIN_PAGESIZE) > 4); + 
STATIC_ASSERT(MAX_GC1OVPAGE(MDBX_MAX_PAGESIZE) < PAGELIST_LIMIT); + const intptr_t maxgc_ov1page = (pagesize - PAGEHDRSZ) / sizeof(pgno_t) - 1; + ENSURE(env, + maxgc_ov1page > 42 && maxgc_ov1page < (intptr_t)PAGELIST_LIMIT / 4); + env->maxgc_large1page = (unsigned)maxgc_ov1page; + env->maxgc_per_branch = + (unsigned)((pagesize - PAGEHDRSZ) / + (sizeof(indx_t) + sizeof(node_t) + sizeof(txnid_t))); + + STATIC_ASSERT(LEAF_NODE_MAX(MDBX_MIN_PAGESIZE) > + sizeof(tree_t) + NODESIZE + 42); + STATIC_ASSERT(LEAF_NODE_MAX(MDBX_MAX_PAGESIZE) < UINT16_MAX); + STATIC_ASSERT(LEAF_NODE_MAX(MDBX_MIN_PAGESIZE) >= + BRANCH_NODE_MAX(MDBX_MIN_PAGESIZE)); + STATIC_ASSERT(BRANCH_NODE_MAX(MDBX_MAX_PAGESIZE) > NODESIZE + 42); + STATIC_ASSERT(BRANCH_NODE_MAX(MDBX_MAX_PAGESIZE) < UINT16_MAX); + const intptr_t branch_nodemax = BRANCH_NODE_MAX(pagesize); + const intptr_t leaf_nodemax = LEAF_NODE_MAX(pagesize); + ENSURE(env, branch_nodemax > (intptr_t)(NODESIZE + 42) && + branch_nodemax % 2 == 0 && + leaf_nodemax > (intptr_t)(sizeof(tree_t) + NODESIZE + 42) && + leaf_nodemax >= branch_nodemax && + leaf_nodemax < (int)UINT16_MAX && leaf_nodemax % 2 == 0); + env->leaf_nodemax = (uint16_t)leaf_nodemax; + env->branch_nodemax = (uint16_t)branch_nodemax; + env->ps2ln = (uint8_t)log2n_powerof2(pagesize); + eASSERT(env, pgno2bytes(env, 1) == pagesize); + eASSERT(env, bytes2pgno(env, pagesize + pagesize) == 2); + recalculate_merge_thresholds(env); + recalculate_subpage_thresholds(env); -bailout: - /* LY: unable restore cursor's stack */ - cdst->mc_flags &= ~C_INITIALIZED; - return MDBX_CURSOR_FULL; + const pgno_t max_pgno = bytes2pgno(env, MAX_MAPSIZE); + if (!env->options.flags.non_auto.dp_limit) { + /* auto-setup dp_limit by "The42" ;-) */ + intptr_t total_ram_pages, avail_ram_pages; + int err = mdbx_get_sysraminfo(nullptr, &total_ram_pages, &avail_ram_pages); + if (unlikely(err != MDBX_SUCCESS)) + ERROR("mdbx_get_sysraminfo(), rc %d", err); + else { + size_t reasonable_dpl_limit = + 
(size_t)(total_ram_pages + avail_ram_pages) / 42; + if (pagesize > globals.sys_pagesize) + reasonable_dpl_limit /= pagesize / globals.sys_pagesize; + else if (pagesize < globals.sys_pagesize) + reasonable_dpl_limit *= globals.sys_pagesize / pagesize; + reasonable_dpl_limit = (reasonable_dpl_limit < PAGELIST_LIMIT) + ? reasonable_dpl_limit + : PAGELIST_LIMIT; + reasonable_dpl_limit = (reasonable_dpl_limit > CURSOR_STACK_SIZE * 4) + ? reasonable_dpl_limit + : CURSOR_STACK_SIZE * 4; + env->options.dp_limit = (unsigned)reasonable_dpl_limit; + } + } + if (env->options.dp_limit > max_pgno - NUM_METAS) + env->options.dp_limit = max_pgno - NUM_METAS; + if (env->options.dp_initial > env->options.dp_limit) + env->options.dp_initial = env->options.dp_limit; + return env->ps; } -static void cursor_restore(const MDBX_cursor *csrc, MDBX_cursor *cdst) { - cASSERT(cdst, cdst->mc_dbi == csrc->mc_dbi); - cASSERT(cdst, cdst->mc_txn == csrc->mc_txn); - cASSERT(cdst, cdst->mc_db == csrc->mc_db); - cASSERT(cdst, cdst->mc_dbx == csrc->mc_dbx); - cASSERT(cdst, cdst->mc_dbi_state == csrc->mc_dbi_state); - cdst->mc_snum = csrc->mc_snum; - cdst->mc_top = csrc->mc_top; - cdst->mc_flags = csrc->mc_flags; - cdst->mc_checking = csrc->mc_checking; - - for (size_t i = 0; i < csrc->mc_snum; i++) { - cdst->mc_pg[i] = csrc->mc_pg[i]; - cdst->mc_ki[i] = csrc->mc_ki[i]; - } -} - -/* Copy the contents of a cursor. - * [in] csrc The cursor to copy from. - * [out] cdst The cursor to copy to. */ -static void cursor_copy(const MDBX_cursor *csrc, MDBX_cursor *cdst) { - cASSERT(csrc, csrc->mc_txn->mt_txnid >= - csrc->mc_txn->mt_env->me_lck->mti_oldest_reader.weak); - cdst->mc_dbi = csrc->mc_dbi; - cdst->mc_next = NULL; - cdst->mc_backup = NULL; - cdst->mc_xcursor = NULL; - cdst->mc_txn = csrc->mc_txn; - cdst->mc_db = csrc->mc_db; - cdst->mc_dbx = csrc->mc_dbx; - cdst->mc_dbi_state = csrc->mc_dbi_state; - cursor_restore(csrc, cdst); -} - -/* Rebalance the tree after a delete operation. 
- * [in] mc Cursor pointing to the page where rebalancing should begin. - * Returns 0 on success, non-zero on failure. */ -static int rebalance(MDBX_cursor *mc) { - cASSERT(mc, cursor_is_tracked(mc)); - cASSERT(mc, mc->mc_snum > 0); - cASSERT(mc, mc->mc_snum < mc->mc_db->md_depth || - IS_LEAF(mc->mc_pg[mc->mc_db->md_depth - 1])); - const int pagetype = PAGETYPE_WHOLE(mc->mc_pg[mc->mc_top]); - - STATIC_ASSERT(P_BRANCH == 1); - const size_t minkeys = (pagetype & P_BRANCH) + (size_t)1; +__cold int env_sync(MDBX_env *env, bool force, bool nonblock) { + if (unlikely(env->flags & MDBX_RDONLY)) + return MDBX_EACCESS; - /* Pages emptier than this are candidates for merging. */ - size_t room_threshold = likely(mc->mc_dbi != FREE_DBI) - ? mc->mc_txn->mt_env->me_merge_threshold - : mc->mc_txn->mt_env->me_merge_threshold_gc; + const bool txn0_owned = env_txn0_owned(env); + bool should_unlock = false; + int rc = MDBX_RESULT_TRUE /* means "nothing to sync" */; - const MDBX_page *const tp = mc->mc_pg[mc->mc_top]; - const size_t numkeys = page_numkeys(tp); - const size_t room = page_room(tp); - DEBUG("rebalancing %s page %" PRIaPGNO - " (has %zu keys, full %.1f%%, used %zu, room %zu bytes )", - (pagetype & P_LEAF) ? "leaf" : "branch", tp->mp_pgno, numkeys, - page_fill(mc->mc_txn->mt_env, tp), page_used(mc->mc_txn->mt_env, tp), - room); - cASSERT(mc, IS_MODIFIABLE(mc->mc_txn, tp)); +retry:; + unsigned flags = env->flags & ~(MDBX_NOMETASYNC | txn_shrink_allowed); + if (unlikely((flags & (ENV_FATAL_ERROR | ENV_ACTIVE)) != ENV_ACTIVE)) { + rc = (flags & ENV_FATAL_ERROR) ? 
MDBX_PANIC : MDBX_EPERM; + goto bailout; + } - if (unlikely(numkeys < minkeys)) { - DEBUG("page %" PRIaPGNO " must be merged due keys < %zu threshold", - tp->mp_pgno, minkeys); - } else if (unlikely(room > room_threshold)) { - DEBUG("page %" PRIaPGNO " should be merged due room %zu > %zu threshold", - tp->mp_pgno, room, room_threshold); - } else { - DEBUG("no need to rebalance page %" PRIaPGNO ", room %zu < %zu threshold", - tp->mp_pgno, room, room_threshold); - cASSERT(mc, mc->mc_db->md_entries > 0); - return MDBX_SUCCESS; + const troika_t troika = + (txn0_owned | should_unlock) ? env->basal_txn->tw.troika : meta_tap(env); + const meta_ptr_t head = meta_recent(env, &troika); + const uint64_t unsynced_pages = + atomic_load64(&env->lck->unsynced_pages, mo_Relaxed); + if (unsynced_pages == 0) { + const uint32_t synched_meta_txnid_u32 = + atomic_load32(&env->lck->meta_sync_txnid, mo_Relaxed); + if (synched_meta_txnid_u32 == (uint32_t)head.txnid && head.is_steady) + goto bailout; } - int rc; - if (mc->mc_snum < 2) { - MDBX_page *const mp = mc->mc_pg[0]; - const size_t nkeys = page_numkeys(mp); - cASSERT(mc, (mc->mc_db->md_entries == 0) == (nkeys == 0)); - if (IS_SUBP(mp)) { - DEBUG("%s", "Can't rebalance a subpage, ignoring"); - cASSERT(mc, pagetype & P_LEAF); - return MDBX_SUCCESS; - } - if (nkeys == 0) { - cASSERT(mc, IS_LEAF(mp)); - DEBUG("%s", "tree is completely empty"); - cASSERT(mc, (*mc->mc_dbi_state & DBI_DIRTY) != 0); - mc->mc_db->md_root = P_INVALID; - mc->mc_db->md_depth = 0; - cASSERT(mc, mc->mc_db->md_branch_pages == 0 && - mc->mc_db->md_overflow_pages == 0 && - mc->mc_db->md_leaf_pages == 1); - /* Adjust cursors pointing to mp */ - for (MDBX_cursor *m2 = mc->mc_txn->mt_cursors[mc->mc_dbi]; m2; - m2 = m2->mc_next) { - MDBX_cursor *m3 = - (mc->mc_flags & C_SUB) ? 
&m2->mc_xcursor->mx_cursor : m2; - if (m3 == mc || !(m3->mc_flags & C_INITIALIZED)) - continue; - if (m3->mc_pg[0] == mp) { - m3->mc_snum = 0; - m3->mc_top = 0; - m3->mc_flags &= ~C_INITIALIZED; - } - } - mc->mc_snum = 0; - mc->mc_top = 0; - mc->mc_flags &= ~C_INITIALIZED; - return page_retire(mc, mp); - } - if (IS_BRANCH(mp) && nkeys == 1) { - DEBUG("%s", "collapsing root page!"); - mc->mc_db->md_root = node_pgno(page_node(mp, 0)); - rc = page_get(mc, mc->mc_db->md_root, &mc->mc_pg[0], mp->mp_txnid); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; - mc->mc_db->md_depth--; - mc->mc_ki[0] = mc->mc_ki[1]; - for (int i = 1; i < mc->mc_db->md_depth; i++) { - mc->mc_pg[i] = mc->mc_pg[i + 1]; - mc->mc_ki[i] = mc->mc_ki[i + 1]; - } + if (should_unlock && (env->flags & MDBX_WRITEMAP) && + unlikely(head.ptr_c->geometry.first_unallocated > + bytes2pgno(env, env->dxb_mmap.current))) { - /* Adjust other cursors pointing to mp */ - for (MDBX_cursor *m2 = mc->mc_txn->mt_cursors[mc->mc_dbi]; m2; - m2 = m2->mc_next) { - MDBX_cursor *m3 = - (mc->mc_flags & C_SUB) ? 
&m2->mc_xcursor->mx_cursor : m2; - if (m3 == mc || !(m3->mc_flags & C_INITIALIZED)) - continue; - if (m3->mc_pg[0] == mp) { - for (int i = 0; i < mc->mc_db->md_depth; i++) { - m3->mc_pg[i] = m3->mc_pg[i + 1]; - m3->mc_ki[i] = m3->mc_ki[i + 1]; - } - m3->mc_snum--; - m3->mc_top--; - } - } - cASSERT(mc, IS_LEAF(mc->mc_pg[mc->mc_top]) || - PAGETYPE_WHOLE(mc->mc_pg[mc->mc_top]) == pagetype); - cASSERT(mc, mc->mc_snum < mc->mc_db->md_depth || - IS_LEAF(mc->mc_pg[mc->mc_db->md_depth - 1])); - return page_retire(mc, mp); + if (unlikely(env->stuck_meta >= 0) && + troika.recent != (uint8_t)env->stuck_meta) { + NOTICE("skip %s since wagering meta-page (%u) is mispatch the recent " + "meta-page (%u)", + "sync datafile", env->stuck_meta, troika.recent); + rc = MDBX_RESULT_TRUE; + } else { + rc = dxb_resize(env, head.ptr_c->geometry.first_unallocated, + head.ptr_c->geometry.now, head.ptr_c->geometry.upper, + implicit_grow); + if (unlikely(rc != MDBX_SUCCESS)) + goto bailout; } - DEBUG("root page %" PRIaPGNO " doesn't need rebalancing (flags 0x%x)", - mp->mp_pgno, mp->mp_flags); - return MDBX_SUCCESS; } - /* The parent (branch page) must have at least 2 pointers, - * otherwise the tree is invalid. */ - const size_t pre_top = mc->mc_top - 1; - cASSERT(mc, IS_BRANCH(mc->mc_pg[pre_top])); - cASSERT(mc, !IS_SUBP(mc->mc_pg[0])); - cASSERT(mc, page_numkeys(mc->mc_pg[pre_top]) > 1); + const size_t autosync_threshold = + atomic_load32(&env->lck->autosync_threshold, mo_Relaxed); + const uint64_t autosync_period = + atomic_load64(&env->lck->autosync_period, mo_Relaxed); + uint64_t eoos_timestamp; + if (force || (autosync_threshold && unsynced_pages >= autosync_threshold) || + (autosync_period && + (eoos_timestamp = + atomic_load64(&env->lck->eoos_timestamp, mo_Relaxed)) && + osal_monotime() - eoos_timestamp >= autosync_period)) + flags &= MDBX_WRITEMAP /* clear flags for full steady sync */; - /* Leaf page fill factor is below the threshold. 
- * Try to move keys from left or right neighbor, or - * merge with a neighbor page. */ + if (!txn0_owned) { + if (!should_unlock) { +#if MDBX_ENABLE_PGOP_STAT + unsigned wops = 0; +#endif /* MDBX_ENABLE_PGOP_STAT */ - /* Find neighbors. */ - MDBX_cursor mn; - cursor_copy(mc, &mn); + int err; + /* pre-sync to avoid latency for writer */ + if (unsynced_pages > /* FIXME: define threshold */ 42 && + (flags & MDBX_SAFE_NOSYNC) == 0) { + eASSERT(env, ((flags ^ env->flags) & MDBX_WRITEMAP) == 0); + if (flags & MDBX_WRITEMAP) { + /* Acquire guard to avoid collision with remap */ +#if defined(_WIN32) || defined(_WIN64) + imports.srwl_AcquireShared(&env->remap_guard); +#else + err = osal_fastmutex_acquire(&env->remap_guard); + if (unlikely(err != MDBX_SUCCESS)) + return err; +#endif + const size_t usedbytes = + pgno_align2os_bytes(env, head.ptr_c->geometry.first_unallocated); + err = osal_msync(&env->dxb_mmap, 0, usedbytes, MDBX_SYNC_DATA); +#if defined(_WIN32) || defined(_WIN64) + imports.srwl_ReleaseShared(&env->remap_guard); +#else + int unlock_err = osal_fastmutex_release(&env->remap_guard); + if (unlikely(unlock_err != MDBX_SUCCESS) && err == MDBX_SUCCESS) + err = unlock_err; +#endif + } else + err = osal_fsync(env->lazy_fd, MDBX_SYNC_DATA); - MDBX_page *left = nullptr, *right = nullptr; - if (mn.mc_ki[pre_top] > 0) { - rc = page_get( - &mn, node_pgno(page_node(mn.mc_pg[pre_top], mn.mc_ki[pre_top] - 1)), - &left, mc->mc_pg[mc->mc_top]->mp_txnid); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; - cASSERT(mc, PAGETYPE_WHOLE(left) == PAGETYPE_WHOLE(mc->mc_pg[mc->mc_top])); - } - if (mn.mc_ki[pre_top] + (size_t)1 < page_numkeys(mn.mc_pg[pre_top])) { - rc = page_get( - &mn, - node_pgno(page_node(mn.mc_pg[pre_top], mn.mc_ki[pre_top] + (size_t)1)), - &right, mc->mc_pg[mc->mc_top]->mp_txnid); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; - cASSERT(mc, PAGETYPE_WHOLE(right) == PAGETYPE_WHOLE(mc->mc_pg[mc->mc_top])); - } - cASSERT(mc, left || right); + if (unlikely(err != 
MDBX_SUCCESS)) + return err; - const size_t ki_top = mc->mc_ki[mc->mc_top]; - const size_t ki_pre_top = mn.mc_ki[pre_top]; - const size_t nkeys = page_numkeys(mn.mc_pg[mn.mc_top]); +#if MDBX_ENABLE_PGOP_STAT + wops = 1; +#endif /* MDBX_ENABLE_PGOP_STAT */ + /* pre-sync done */ + rc = MDBX_SUCCESS /* means "some data was synced" */; + } - const size_t left_room = left ? page_room(left) : 0; - const size_t right_room = right ? page_room(right) : 0; - const size_t left_nkeys = left ? page_numkeys(left) : 0; - const size_t right_nkeys = right ? page_numkeys(right) : 0; - bool involve = false; -retry: - cASSERT(mc, mc->mc_snum > 1); - if (left_room > room_threshold && left_room >= right_room && - (IS_MODIFIABLE(mc->mc_txn, left) || involve)) { - /* try merge with left */ - cASSERT(mc, left_nkeys >= minkeys); - mn.mc_pg[mn.mc_top] = left; - mn.mc_ki[mn.mc_top - 1] = (indx_t)(ki_pre_top - 1); - mn.mc_ki[mn.mc_top] = (indx_t)(left_nkeys - 1); - mc->mc_ki[mc->mc_top] = 0; - const size_t new_ki = ki_top + left_nkeys; - mn.mc_ki[mn.mc_top] += mc->mc_ki[mn.mc_top] + 1; - /* We want rebalance to find mn when doing fixups */ - WITH_CURSOR_TRACKING(mn, rc = page_merge(mc, &mn)); - if (likely(rc != MDBX_RESULT_TRUE)) { - cursor_restore(&mn, mc); - mc->mc_ki[mc->mc_top] = (indx_t)new_ki; - cASSERT(mc, rc || page_numkeys(mc->mc_pg[mc->mc_top]) >= minkeys); - return rc; - } - } - if (right_room > room_threshold && - (IS_MODIFIABLE(mc->mc_txn, right) || involve)) { - /* try merge with right */ - cASSERT(mc, right_nkeys >= minkeys); - mn.mc_pg[mn.mc_top] = right; - mn.mc_ki[mn.mc_top - 1] = (indx_t)(ki_pre_top + 1); - mn.mc_ki[mn.mc_top] = 0; - mc->mc_ki[mc->mc_top] = (indx_t)nkeys; - WITH_CURSOR_TRACKING(mn, rc = page_merge(&mn, mc)); - if (likely(rc != MDBX_RESULT_TRUE)) { - mc->mc_ki[mc->mc_top] = (indx_t)ki_top; - cASSERT(mc, rc || page_numkeys(mc->mc_pg[mc->mc_top]) >= minkeys); - return rc; + err = lck_txn_lock(env, nonblock); + if (unlikely(err != MDBX_SUCCESS)) + return err; + 
+ should_unlock = true; +#if MDBX_ENABLE_PGOP_STAT + env->lck->pgops.wops.weak += wops; +#endif /* MDBX_ENABLE_PGOP_STAT */ + env->basal_txn->tw.troika = meta_tap(env); + eASSERT(env, !env->txn && !env->basal_txn->nested); + goto retry; } + eASSERT(env, head.txnid == recent_committed_txnid(env)); + env->basal_txn->txnid = head.txnid; + txn_snapshot_oldest(env->basal_txn); + flags |= txn_shrink_allowed; } - if (left_nkeys > minkeys && - (right_nkeys <= left_nkeys || right_room >= left_room) && - (IS_MODIFIABLE(mc->mc_txn, left) || involve)) { - /* try move from left */ - mn.mc_pg[mn.mc_top] = left; - mn.mc_ki[mn.mc_top - 1] = (indx_t)(ki_pre_top - 1); - mn.mc_ki[mn.mc_top] = (indx_t)(left_nkeys - 1); - mc->mc_ki[mc->mc_top] = 0; - WITH_CURSOR_TRACKING(mn, rc = node_move(&mn, mc, true)); - if (likely(rc != MDBX_RESULT_TRUE)) { - mc->mc_ki[mc->mc_top] = (indx_t)(ki_top + 1); - cASSERT(mc, rc || page_numkeys(mc->mc_pg[mc->mc_top]) >= minkeys); - return rc; - } + eASSERT(env, txn0_owned || should_unlock); + eASSERT(env, !txn0_owned || (flags & txn_shrink_allowed) == 0); + + if (!head.is_steady && unlikely(env->stuck_meta >= 0) && + troika.recent != (uint8_t)env->stuck_meta) { + NOTICE("skip %s since wagering meta-page (%u) is mispatch the recent " + "meta-page (%u)", + "sync datafile", env->stuck_meta, troika.recent); + rc = MDBX_RESULT_TRUE; + goto bailout; } - if (right_nkeys > minkeys && (IS_MODIFIABLE(mc->mc_txn, right) || involve)) { - /* try move from right */ - mn.mc_pg[mn.mc_top] = right; - mn.mc_ki[mn.mc_top - 1] = (indx_t)(ki_pre_top + 1); - mn.mc_ki[mn.mc_top] = 0; - mc->mc_ki[mc->mc_top] = (indx_t)nkeys; - WITH_CURSOR_TRACKING(mn, rc = node_move(&mn, mc, false)); - if (likely(rc != MDBX_RESULT_TRUE)) { - mc->mc_ki[mc->mc_top] = (indx_t)ki_top; - cASSERT(mc, rc || page_numkeys(mc->mc_pg[mc->mc_top]) >= minkeys); - return rc; - } + if (!head.is_steady || ((flags & MDBX_SAFE_NOSYNC) == 0 && unsynced_pages)) { + DEBUG("meta-head %" PRIaPGNO ", %s, sync_pending 
%" PRIu64, + data_page(head.ptr_c)->pgno, durable_caption(head.ptr_c), + unsynced_pages); + meta_t meta = *head.ptr_c; + rc = dxb_sync_locked(env, flags, &meta, &env->basal_txn->tw.troika); + if (unlikely(rc != MDBX_SUCCESS)) + goto bailout; } - if (nkeys >= minkeys) { - mc->mc_ki[mc->mc_top] = (indx_t)ki_top; - if (AUDIT_ENABLED()) - return cursor_check_updating(mc); - return MDBX_SUCCESS; - } + /* LY: sync meta-pages if MDBX_NOMETASYNC enabled + * and someone was not synced above. */ + if (atomic_load32(&env->lck->meta_sync_txnid, mo_Relaxed) != + (uint32_t)head.txnid) + rc = meta_sync(env, head); - if (mc->mc_txn->mt_env->me_options.prefer_waf_insteadof_balance && - likely(room_threshold > 0)) { - room_threshold = 0; - goto retry; - } - if (likely(!involve) && - (likely(mc->mc_dbi != FREE_DBI) || mc->mc_txn->tw.loose_pages || - MDBX_PNL_GETSIZE(mc->mc_txn->tw.relist) || (mc->mc_flags & C_GCU) || - (mc->mc_txn->mt_flags & MDBX_TXN_DRAINED_GC) || room_threshold)) { - involve = true; - goto retry; - } - if (likely(room_threshold > 0)) { - room_threshold = 0; - goto retry; - } - ERROR("Unable to merge/rebalance %s page %" PRIaPGNO - " (has %zu keys, full %.1f%%, used %zu, room %zu bytes )", - (pagetype & P_LEAF) ? "leaf" : "branch", tp->mp_pgno, numkeys, - page_fill(mc->mc_txn->mt_env, tp), page_used(mc->mc_txn->mt_env, tp), - room); - return MDBX_PROBLEM; +bailout: + if (should_unlock) + lck_txn_unlock(env); + return rc; } -__cold static int page_check(const MDBX_cursor *const mc, - const MDBX_page *const mp) { - DKBUF; - int rc = MDBX_SUCCESS; - if (unlikely(mp->mp_pgno < MIN_PAGENO || mp->mp_pgno > MAX_PAGENO)) - rc = bad_page(mp, "invalid pgno (%u)\n", mp->mp_pgno); +__cold int env_open(MDBX_env *env, mdbx_mode_t mode) { + /* Использование O_DSYNC или FILE_FLAG_WRITE_THROUGH: + * + * 0) Если размер страниц БД меньше системной страницы ОЗУ, то ядру ОС + * придется чаще обновлять страницы в unified page cache. 
+ * + * Однако, O_DSYNC не предполагает отключение unified page cache, + * поэтому подобные затруднения будем считать проблемой ОС и/или + * ожидаемым пенальти из-за использования мелких страниц БД. + * + * 1) В режиме MDBX_SYNC_DURABLE - O_DSYNC для записи как данных, + * так и мета-страниц. Однако, на Linux отказ от O_DSYNC с последующим + * fdatasync() может быть выгоднее при использовании HDD, так как + * позволяет io-scheduler переупорядочить запись с учетом актуального + * расположения файла БД на носителе. + * + * 2) В режиме MDBX_NOMETASYNC - O_DSYNC можно использовать для данных, + * но в этом может не быть смысла, так как fdatasync() всё равно + * требуется для гарантии фиксации мета после предыдущей транзакции. + * + * В итоге на нормальных системах (не Windows) есть два варианта: + * - при возможности O_DIRECT и/или io_ring для данных, скорее всего, + * есть смысл вызвать fdatasync() перед записью данных, а затем + * использовать O_DSYNC; + * - не использовать O_DSYNC и вызывать fdatasync() после записи данных. + * + * На Windows же следует минимизировать использование FlushFileBuffers() + * из-за проблем с производительностью. Поэтому на Windows в режиме + * MDBX_NOMETASYNC: + * - мета обновляется через дескриптор без FILE_FLAG_WRITE_THROUGH; + * - перед началом записи данных вызывается FlushFileBuffers(), если + * meta_sync_txnid не совпадает с последней записанной мета; + * - данные записываются через дескриптор с FILE_FLAG_WRITE_THROUGH. + * + * 3) В режиме MDBX_SAFE_NOSYNC - O_DSYNC нет смысла использовать, пока не + * будет реализована возможность полностью асинхронной "догоняющей" + * записи в выделенном процессе-сервере с io-ring очередями внутри. + * + * ----- + * + * Использование O_DIRECT или FILE_FLAG_NO_BUFFERING: + * + * Назначение этих флагов в отключении файлового дескриптора от + * unified page cache, т.е. от отображенных в память данных в случае + * libmdbx. 
+ * + * Поэтому, использование direct i/o в libmdbx без MDBX_WRITEMAP лишено + * смысла и контр-продуктивно, ибо так мы провоцируем ядро ОС на + * не-когерентность отображения в память с содержимым файла на носителе, + * либо требуем дополнительных проверок и действий направленных на + * фактическое отключение O_DIRECT для отображенных в память данных. + * + * В режиме MDBX_WRITEMAP когерентность отображенных данных обеспечивается + * физически. Поэтому использование direct i/o может иметь смысл, если у + * ядра ОС есть какие-то проблемы с msync(), в том числе с + * производительностью: + * - использование io_ring или gather-write может быть дешевле, чем + * просмотр PTE ядром и запись измененных/грязных; + * - но проблема в том, что записываемые из user mode страницы либо не + * будут помечены чистыми (и соответственно будут записаны ядром + * еще раз), либо ядру необходимо искать и чистить PTE при получении + * запроса на запись. + * + * Поэтому O_DIRECT или FILE_FLAG_NO_BUFFERING используется: + * - только в режиме MDBX_SYNC_DURABLE с MDBX_WRITEMAP; + * - когда ps >= me_os_psize; + * - опция сборки MDBX_AVOID_MSYNC != 0, которая по-умолчанию включена + * только на Windows (см ниже). + * + * ----- + * + * Использование FILE_FLAG_OVERLAPPED на Windows: + * + * У Windows очень плохо с I/O (за исключением прямых постраничных + * scatter/gather, которые работают в обход проблемного unified page + * cache и поэтому почти бесполезны в libmdbx). + * + * При этом всё еще хуже при использовании FlushFileBuffers(), что также + * требуется после FlushViewOfFile() в режиме MDBX_WRITEMAP. Поэтому + * на Windows вместо FlushViewOfFile() и FlushFileBuffers() следует + * использовать запись через дескриптор с FILE_FLAG_WRITE_THROUGH. + * + * В свою очередь, запись с FILE_FLAG_WRITE_THROUGH дешевле/быстрее + * при использовании FILE_FLAG_OVERLAPPED. 
В результате, на Windows + * в durable-режимах запись данных всегда в overlapped-режиме, + * при этом для записи мета требуется отдельный не-overlapped дескриптор. + */ - MDBX_env *const env = mc->mc_txn->mt_env; - const ptrdiff_t offset = ptr_dist(mp, env->me_map); - unsigned flags_mask = P_ILL_BITS; - unsigned flags_expected = 0; - if (offset < 0 || - offset > (ptrdiff_t)(pgno2bytes(env, mc->mc_txn->mt_next_pgno) - - ((mp->mp_flags & P_SUBP) ? PAGEHDRSZ + 1 - : env->me_psize))) { - /* should be dirty page without MDBX_WRITEMAP, or a subpage of. */ - flags_mask -= P_SUBP; - if ((env->me_flags & MDBX_WRITEMAP) != 0 || - (!IS_SHADOWED(mc->mc_txn, mp) && !(mp->mp_flags & P_SUBP))) - rc = bad_page(mp, "invalid page-address %p, offset %zi\n", - __Wpedantic_format_voidptr(mp), offset); - } else if (offset & (env->me_psize - 1)) - flags_expected = P_SUBP; + env->pid = osal_getpid(); + int rc = osal_openfile((env->flags & MDBX_RDONLY) ? MDBX_OPEN_DXB_READ + : MDBX_OPEN_DXB_LAZY, + env, env->pathname.dxb, &env->lazy_fd, mode); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; - if (unlikely((mp->mp_flags & flags_mask) != flags_expected)) - rc = bad_page(mp, "unknown/extra page-flags (have 0x%x, expect 0x%x)\n", - mp->mp_flags & flags_mask, flags_expected); +#if MDBX_LOCKING == MDBX_LOCKING_SYSV + env->me_sysv_ipc.key = ftok(env->pathname.dxb, 42); + if (unlikely(env->me_sysv_ipc.key == -1)) + return errno; +#endif /* MDBX_LOCKING */ - cASSERT(mc, (mc->mc_checking & CC_LEAF2) == 0 || (mc->mc_flags & C_SUB) != 0); - const uint8_t type = PAGETYPE_WHOLE(mp); - switch (type) { - default: - return bad_page(mp, "invalid type (%u)\n", type); - case P_OVERFLOW: - if (unlikely(mc->mc_flags & C_SUB)) - rc = bad_page(mp, "unexpected %s-page for %s (db-flags 0x%x)\n", "large", - "nested dupsort tree", mc->mc_db->md_flags); - const pgno_t npages = mp->mp_pages; - if (unlikely(npages < 1 || npages >= MAX_PAGENO / 2)) - rc = bad_page(mp, "invalid n-pages (%u) for large-page\n", npages); - 
if (unlikely(mp->mp_pgno + npages > mc->mc_txn->mt_next_pgno)) - rc = bad_page( - mp, "end of large-page beyond (%u) allocated space (%u next-pgno)\n", - mp->mp_pgno + npages, mc->mc_txn->mt_next_pgno); - return rc; //-------------------------- end of large/overflow page handling - case P_LEAF | P_SUBP: - if (unlikely(mc->mc_db->md_depth != 1)) - rc = bad_page(mp, "unexpected %s-page for %s (db-flags 0x%x)\n", - "leaf-sub", "nested dupsort db", mc->mc_db->md_flags); - /* fall through */ - __fallthrough; - case P_LEAF: - if (unlikely((mc->mc_checking & CC_LEAF2) != 0)) - rc = bad_page( - mp, "unexpected leaf-page for dupfixed subtree (db-lags 0x%x)\n", - mc->mc_db->md_flags); - break; - case P_LEAF | P_LEAF2 | P_SUBP: - if (unlikely(mc->mc_db->md_depth != 1)) - rc = bad_page(mp, "unexpected %s-page for %s (db-flags 0x%x)\n", - "leaf2-sub", "nested dupsort db", mc->mc_db->md_flags); - /* fall through */ - __fallthrough; - case P_LEAF | P_LEAF2: - if (unlikely((mc->mc_checking & CC_LEAF2) == 0)) - rc = bad_page( - mp, - "unexpected leaf2-page for non-dupfixed (sub)tree (db-flags 0x%x)\n", - mc->mc_db->md_flags); - break; - case P_BRANCH: - break; - } + /* Set the position in files outside of the data to avoid corruption + * due to erroneous use of file descriptors in the application code. 
*/ + const uint64_t safe_parking_lot_offset = UINT64_C(0x7fffFFFF80000000); + osal_fseek(env->lazy_fd, safe_parking_lot_offset); - if (unlikely(mp->mp_upper < mp->mp_lower || (mp->mp_lower & 1) || - PAGEHDRSZ + mp->mp_upper > env->me_psize)) - rc = bad_page(mp, "invalid page lower(%u)/upper(%u) with limit %zu\n", - mp->mp_lower, mp->mp_upper, page_space(env)); + env->fd4meta = env->lazy_fd; +#if defined(_WIN32) || defined(_WIN64) + eASSERT(env, env->ioring.overlapped_fd == 0); + bool ior_direct = false; + if (!(env->flags & + (MDBX_RDONLY | MDBX_SAFE_NOSYNC | MDBX_NOMETASYNC | MDBX_EXCLUSIVE))) { + if (MDBX_AVOID_MSYNC && (env->flags & MDBX_WRITEMAP)) { + /* Запрошен режим MDBX_SYNC_DURABLE | MDBX_WRITEMAP при активной опции + * MDBX_AVOID_MSYNC. + * + * 1) В этой комбинации наиболее выгодно использовать WriteFileGather(), + * но для этого необходимо открыть файл с флагом FILE_FLAG_NO_BUFFERING и + * после обеспечивать выравнивание адресов и размера данных на границу + * системной страницы, что в свою очередь возможно если размер страницы БД + * не меньше размера системной страницы ОЗУ. Поэтому для открытия файла в + * нужном режиме требуется знать размер страницы БД. + * + * 2) Кроме этого, в Windows запись в заблокированный регион файла + * возможно только через тот-же дескриптор. Поэтому изначальный захват + * блокировок посредством lck_seize(), захват/освобождение блокировок + * во время пишущих транзакций и запись данных должны выполнятся через + * один дескриптор. + * + * Таким образом, требуется прочитать волатильный заголовок БД, чтобы + * узнать размер страницы, чтобы открыть дескриптор файла в режиме нужном + * для записи данных, чтобы использовать именно этот дескриптор для + * изначального захвата блокировок. 
*/ + meta_t header; + uint64_t dxb_filesize; + int err = dxb_read_header(env, &header, MDBX_SUCCESS, true); + if ((err == MDBX_SUCCESS && header.pagesize >= globals.sys_pagesize) || + (err == MDBX_ENODATA && mode && env->ps >= globals.sys_pagesize && + osal_filesize(env->lazy_fd, &dxb_filesize) == MDBX_SUCCESS && + dxb_filesize == 0)) + /* Может быть коллизия, если два процесса пытаются одновременно создать + * БД с разным размером страницы, который у одного меньше системной + * страницы, а у другого НЕ меньше. Эта допустимая, но очень странная + * ситуация. Поэтому считаем её ошибочной и не пытаемся разрешить. */ + ior_direct = true; + } - const char *const end_of_page = ptr_disp(mp, env->me_psize); - const size_t nkeys = page_numkeys(mp); - STATIC_ASSERT(P_BRANCH == 1); - if (unlikely(nkeys <= (uint8_t)(mp->mp_flags & P_BRANCH))) { - if ((!(mc->mc_flags & C_SUB) || mc->mc_db->md_entries) && - (!(mc->mc_checking & CC_UPDATING) || - !(IS_MODIFIABLE(mc->mc_txn, mp) || (mp->mp_flags & P_SUBP)))) - rc = - bad_page(mp, "%s-page nkeys (%zu) < %u\n", - IS_BRANCH(mp) ? "branch" : "leaf", nkeys, 1 + IS_BRANCH(mp)); + rc = osal_openfile(ior_direct ? MDBX_OPEN_DXB_OVERLAPPED_DIRECT + : MDBX_OPEN_DXB_OVERLAPPED, + env, env->pathname.dxb, &env->ioring.overlapped_fd, 0); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; + env->dxb_lock_event = CreateEventW(nullptr, true, false, nullptr); + if (unlikely(!env->dxb_lock_event)) + return (int)GetLastError(); + osal_fseek(env->ioring.overlapped_fd, safe_parking_lot_offset); + } +#else + if (mode == 0) { + /* pickup mode for lck-file */ + struct stat st; + if (unlikely(fstat(env->lazy_fd, &st))) + return errno; + mode = st.st_mode; } + mode = (/* inherit read permissions for group and others */ mode & + (S_IRGRP | S_IWGRP | S_IROTH | S_IWOTH)) | + /* always add read/write for owner */ S_IRUSR | S_IWUSR | + ((mode & S_IRGRP) ? /* +write if readable by group */ S_IWGRP : 0) | + ((mode & S_IROTH) ? 
/* +write if readable by others */ S_IWOTH : 0); +#endif /* !Windows */ + const int lck_rc = lck_setup(env, mode); + if (unlikely(MDBX_IS_ERROR(lck_rc))) + return lck_rc; + if (env->lck_mmap.fd != INVALID_HANDLE_VALUE) + osal_fseek(env->lck_mmap.fd, safe_parking_lot_offset); - const size_t ksize_max = keysize_max(env->me_psize, 0); - const size_t leaf2_ksize = mp->mp_leaf2_ksize; - if (IS_LEAF2(mp)) { - if (unlikely((mc->mc_flags & C_SUB) == 0 || - (mc->mc_db->md_flags & MDBX_DUPFIXED) == 0)) - rc = bad_page(mp, "unexpected leaf2-page (db-flags 0x%x)\n", - mc->mc_db->md_flags); - else if (unlikely(leaf2_ksize != mc->mc_db->md_xsize)) - rc = bad_page(mp, "invalid leaf2_ksize %zu\n", leaf2_ksize); - else if (unlikely(((leaf2_ksize & nkeys) ^ mp->mp_upper) & 1)) - rc = bad_page( - mp, "invalid page upper (%u) for nkeys %zu with leaf2-length %zu\n", - mp->mp_upper, nkeys, leaf2_ksize); - } else { - if (unlikely((mp->mp_upper & 1) || PAGEHDRSZ + mp->mp_upper + - nkeys * sizeof(MDBX_node) + - nkeys - 1 > - env->me_psize)) - rc = - bad_page(mp, "invalid page upper (%u) for nkeys %zu with limit %zu\n", - mp->mp_upper, nkeys, page_space(env)); + eASSERT(env, env->dsync_fd == INVALID_HANDLE_VALUE); + if (!(env->flags & (MDBX_RDONLY | MDBX_SAFE_NOSYNC | DEPRECATED_MAPASYNC +#if defined(_WIN32) || defined(_WIN64) + | MDBX_EXCLUSIVE +#endif /* !Windows */ + ))) { + rc = osal_openfile(MDBX_OPEN_DXB_DSYNC, env, env->pathname.dxb, + &env->dsync_fd, 0); + if (unlikely(MDBX_IS_ERROR(rc))) + return rc; + if (env->dsync_fd != INVALID_HANDLE_VALUE) { + if ((env->flags & MDBX_NOMETASYNC) == 0) + env->fd4meta = env->dsync_fd; + osal_fseek(env->dsync_fd, safe_parking_lot_offset); + } } - MDBX_val here, prev = {0, 0}; - for (size_t i = 0; i < nkeys; ++i) { - if (IS_LEAF2(mp)) { - const char *const key = page_leaf2key(mp, i, leaf2_ksize); - if (unlikely(end_of_page < key + leaf2_ksize)) { - rc = bad_page(mp, "leaf2-item beyond (%zu) page-end\n", - key + leaf2_ksize - end_of_page); - 
continue; - } + const MDBX_env_flags_t lazy_flags = + MDBX_SAFE_NOSYNC | MDBX_UTTERLY_NOSYNC | MDBX_NOMETASYNC; + const MDBX_env_flags_t mode_flags = lazy_flags | MDBX_LIFORECLAIM | + MDBX_NORDAHEAD | MDBX_RDONLY | + MDBX_WRITEMAP; - if (unlikely(leaf2_ksize != mc->mc_dbx->md_klen_min)) { - if (unlikely(leaf2_ksize < mc->mc_dbx->md_klen_min || - leaf2_ksize > mc->mc_dbx->md_klen_max)) - rc = bad_page( - mp, "leaf2-item size (%zu) <> min/max length (%zu/%zu)\n", - leaf2_ksize, mc->mc_dbx->md_klen_min, mc->mc_dbx->md_klen_max); - else - mc->mc_dbx->md_klen_min = mc->mc_dbx->md_klen_max = leaf2_ksize; - } - if ((mc->mc_checking & CC_SKIPORD) == 0) { - here.iov_base = (void *)key; - here.iov_len = leaf2_ksize; - if (prev.iov_base && unlikely(mc->mc_dbx->md_cmp(&prev, &here) >= 0)) - rc = bad_page(mp, "leaf2-item #%zu wrong order (%s >= %s)\n", i, - DKEY(&prev), DVAL(&here)); - prev = here; - } - } else { - const MDBX_node *const node = page_node(mp, i); - const char *const node_end = ptr_disp(node, NODESIZE); - if (unlikely(node_end > end_of_page)) { - rc = bad_page(mp, "node[%zu] (%zu) beyond page-end\n", i, - node_end - end_of_page); - continue; - } - const size_t ksize = node_ks(node); - if (unlikely(ksize > ksize_max)) - rc = bad_page(mp, "node[%zu] too long key (%zu)\n", i, ksize); - const char *const key = node_key(node); - if (unlikely(end_of_page < key + ksize)) { - rc = bad_page(mp, "node[%zu] key (%zu) beyond page-end\n", i, - key + ksize - end_of_page); - continue; - } - if ((IS_LEAF(mp) || i > 0)) { - if (unlikely(ksize < mc->mc_dbx->md_klen_min || - ksize > mc->mc_dbx->md_klen_max)) - rc = bad_page( - mp, "node[%zu] key size (%zu) <> min/max key-length (%zu/%zu)\n", - i, ksize, mc->mc_dbx->md_klen_min, mc->mc_dbx->md_klen_max); - if ((mc->mc_checking & CC_SKIPORD) == 0) { - here.iov_base = (void *)key; - here.iov_len = ksize; - if (prev.iov_base && unlikely(mc->mc_dbx->md_cmp(&prev, &here) >= 0)) - rc = bad_page(mp, "node[%zu] key wrong order (%s >= 
%s)\n", i, - DKEY(&prev), DVAL(&here)); - prev = here; - } - } - if (IS_BRANCH(mp)) { - if ((mc->mc_checking & CC_UPDATING) == 0 && i == 0 && - unlikely(ksize != 0)) - rc = bad_page(mp, "branch-node[%zu] wrong 0-node key-length (%zu)\n", - i, ksize); - const pgno_t ref = node_pgno(node); - if (unlikely(ref < MIN_PAGENO) || - (unlikely(ref >= mc->mc_txn->mt_next_pgno) && - (unlikely(ref >= mc->mc_txn->mt_geo.now) || - !(mc->mc_checking & CC_RETIRING)))) - rc = bad_page(mp, "branch-node[%zu] wrong pgno (%u)\n", i, ref); - if (unlikely(node_flags(node))) - rc = bad_page(mp, "branch-node[%zu] wrong flags (%u)\n", i, - node_flags(node)); - continue; + lck_t *const lck = env->lck_mmap.lck; + if (lck && lck_rc != MDBX_RESULT_TRUE && (env->flags & MDBX_RDONLY) == 0) { + MDBX_env_flags_t snap_flags; + while ((snap_flags = atomic_load32(&lck->envmode, mo_AcquireRelease)) == + MDBX_RDONLY) { + if (atomic_cas32(&lck->envmode, MDBX_RDONLY, + (snap_flags = (env->flags & mode_flags)))) { + /* The case: + * - let's assume that for some reason the DB file is smaller + * than it should be according to the geometry, + * but not smaller than the last page used; + * - the first process that opens the database (lck_rc == RESULT_TRUE) + * does this in readonly mode and therefore cannot bring + * the file size back to normal; + * - some next process (lck_rc != RESULT_TRUE) opens the DB in + * read-write mode and now is here. + * + * FIXME: Should we re-check and set the size of DB-file right here? */ + break; } + atomic_yield(); + } - switch (node_flags(node)) { - default: - rc = - bad_page(mp, "invalid node[%zu] flags (%u)\n", i, node_flags(node)); - break; - case F_BIGDATA /* data on large-page */: - case 0 /* usual */: - case F_SUBDATA /* sub-db */: - case F_SUBDATA | F_DUPDATA /* dupsorted sub-tree */: - case F_DUPDATA /* short sub-page */: - break; - } + if (env->flags & MDBX_ACCEDE) { + /* Pickup current mode-flags (MDBX_LIFORECLAIM, MDBX_NORDAHEAD, etc). 
*/ + const MDBX_env_flags_t diff = + (snap_flags ^ env->flags) & + ((snap_flags & lazy_flags) ? mode_flags + : mode_flags & ~MDBX_WRITEMAP); + env->flags ^= diff; + NOTICE("accede mode-flags: 0x%X, 0x%X -> 0x%X", diff, env->flags ^ diff, + env->flags); + } - const size_t dsize = node_ds(node); - const char *const data = node_data(node); - if (node_flags(node) & F_BIGDATA) { - if (unlikely(end_of_page < data + sizeof(pgno_t))) { - rc = bad_page( - mp, "node-%s(%zu of %zu, %zu bytes) beyond (%zu) page-end\n", - "bigdata-pgno", i, nkeys, dsize, data + dsize - end_of_page); - continue; - } - if (unlikely(dsize <= mc->mc_dbx->md_vlen_min || - dsize > mc->mc_dbx->md_vlen_max)) - rc = bad_page( - mp, - "big-node data size (%zu) <> min/max value-length (%zu/%zu)\n", - dsize, mc->mc_dbx->md_vlen_min, mc->mc_dbx->md_vlen_max); - if (unlikely(node_size_len(node_ks(node), dsize) <= - mc->mc_txn->mt_env->me_leaf_nodemax) && - mc->mc_dbi != FREE_DBI) - poor_page(mp, "too small data (%zu bytes) for bigdata-node", dsize); + /* Ранее упущенный не очевидный момент: При работе БД в режимах + * не-синхронной/отложенной фиксации на диске, все процессы-писатели должны + * иметь одинаковый режим MDBX_WRITEMAP. + * + * В противном случае, сброс на диск следует выполнять дважды: сначала + * msync(), затем fdatasync(). При этом msync() не обязан отрабатывать + * в процессах без MDBX_WRITEMAP, так как файл в память отображен только + * для чтения. Поэтому, в общем случае, различия по MDBX_WRITEMAP не + * позволяют выполнить фиксацию данных на диск, после их изменения в другом + * процессе. + * + * В режиме MDBX_UTTERLY_NOSYNC позволять совместную работу с MDBX_WRITEMAP + * также не следует, поскольку никакой процесс (в том числе последний) не + * может гарантированно сбросить данные на диск, а следовательно не должен + * помечать какую-либо транзакцию как steady. 
+ * + * В результате, требуется либо запретить совместную работу процессам с + * разным MDBX_WRITEMAP в режиме отложенной записи, либо отслеживать такое + * смешивание и блокировать steady-пометки - что контрпродуктивно. */ + const MDBX_env_flags_t rigorous_flags = + (snap_flags & lazy_flags) + ? MDBX_SAFE_NOSYNC | MDBX_UTTERLY_NOSYNC | MDBX_WRITEMAP + : MDBX_SAFE_NOSYNC | MDBX_UTTERLY_NOSYNC; + const MDBX_env_flags_t rigorous_diff = + (snap_flags ^ env->flags) & rigorous_flags; + if (rigorous_diff) { + ERROR("current mode/flags 0x%X incompatible with requested 0x%X, " + "rigorous diff 0x%X", + env->flags, snap_flags, rigorous_diff); + return MDBX_INCOMPATIBLE; + } + } - if ((mc->mc_checking & CC_RETIRING) == 0) { - const pgr_t lp = - page_get_large(mc, node_largedata_pgno(node), mp->mp_txnid); - if (unlikely(lp.err != MDBX_SUCCESS)) - return lp.err; - cASSERT(mc, PAGETYPE_WHOLE(lp.page) == P_OVERFLOW); - const unsigned npages = number_of_ovpages(env, dsize); - if (unlikely(lp.page->mp_pages != npages)) { - if (lp.page->mp_pages < npages) - rc = bad_page(lp.page, - "too less n-pages %u for bigdata-node (%zu bytes)", - lp.page->mp_pages, dsize); - else if (mc->mc_dbi != FREE_DBI) - poor_page(lp.page, - "extra n-pages %u for bigdata-node (%zu bytes)", - lp.page->mp_pages, dsize); - } - } - continue; - } + mincore_clean_cache(env); + const int dxb_rc = dxb_setup(env, lck_rc, mode); + if (MDBX_IS_ERROR(dxb_rc)) + return dxb_rc; - if (unlikely(end_of_page < data + dsize)) { - rc = bad_page(mp, - "node-%s(%zu of %zu, %zu bytes) beyond (%zu) page-end\n", - "data", i, nkeys, dsize, data + dsize - end_of_page); - continue; - } + rc = osal_check_fs_incore(env->lazy_fd); + env->incore = false; + if (rc == MDBX_RESULT_TRUE) { + env->incore = true; + NOTICE("%s", "in-core database"); + rc = MDBX_SUCCESS; + } else if (unlikely(rc != MDBX_SUCCESS)) { + ERROR("check_fs_incore(), err %d", rc); + return rc; + } - switch (node_flags(node)) { - default: - /* wrong, but already handled 
*/ - continue; - case 0 /* usual */: - if (unlikely(dsize < mc->mc_dbx->md_vlen_min || - dsize > mc->mc_dbx->md_vlen_max)) { - rc = bad_page( - mp, "node-data size (%zu) <> min/max value-length (%zu/%zu)\n", - dsize, mc->mc_dbx->md_vlen_min, mc->mc_dbx->md_vlen_max); - continue; - } - break; - case F_SUBDATA /* sub-db */: - if (unlikely(dsize != sizeof(MDBX_db))) { - rc = bad_page(mp, "invalid sub-db record size (%zu)\n", dsize); - continue; - } - break; - case F_SUBDATA | F_DUPDATA /* dupsorted sub-tree */: - if (unlikely(dsize != sizeof(MDBX_db))) { - rc = bad_page(mp, "invalid nested-db record size (%zu, expect %zu)\n", - dsize, sizeof(MDBX_db)); - continue; - } - break; - case F_DUPDATA /* short sub-page */: - if (unlikely(dsize <= PAGEHDRSZ)) { - rc = bad_page(mp, "invalid nested/sub-page record size (%zu)\n", - dsize); - continue; - } else { - const MDBX_page *const sp = (MDBX_page *)data; - switch (sp->mp_flags & - /* ignore legacy P_DIRTY flag */ ~P_LEGACY_DIRTY) { - case P_LEAF | P_SUBP: - case P_LEAF | P_LEAF2 | P_SUBP: - break; - default: - rc = bad_page(mp, "invalid nested/sub-page flags (0x%02x)\n", - sp->mp_flags); - continue; - } + if (unlikely(/* recovery mode */ env->stuck_meta >= 0) && + (lck_rc != /* exclusive */ MDBX_RESULT_TRUE || + (env->flags & MDBX_EXCLUSIVE) == 0)) { + ERROR("%s", "recovery requires exclusive mode"); + return MDBX_BUSY; + } - const char *const end_of_subpage = data + dsize; - const intptr_t nsubkeys = page_numkeys(sp); - if (unlikely(nsubkeys == 0) && !(mc->mc_checking & CC_UPDATING) && - mc->mc_db->md_entries) - rc = bad_page(mp, "no keys on a %s-page\n", - IS_LEAF2(sp) ? 
"leaf2-sub" : "leaf-sub"); + DEBUG("opened dbenv %p", (void *)env); + env->flags |= ENV_ACTIVE; + if (!lck || lck_rc == MDBX_RESULT_TRUE) { + env->lck->envmode.weak = env->flags & mode_flags; + env->lck->meta_sync_txnid.weak = (uint32_t)recent_committed_txnid(env); + env->lck->readers_check_timestamp.weak = osal_monotime(); + } + if (lck) { + if (lck_rc == MDBX_RESULT_TRUE) { + rc = lck_downgrade(env); + DEBUG("lck-downgrade-%s: rc %i", + (env->flags & MDBX_EXCLUSIVE) ? "partial" : "full", rc); + if (rc != MDBX_SUCCESS) + return rc; + } else { + rc = mvcc_cleanup_dead(env, false, nullptr); + if (MDBX_IS_ERROR(rc)) + return rc; + } + } - MDBX_val sub_here, sub_prev = {0, 0}; - for (int j = 0; j < nsubkeys; j++) { - if (IS_LEAF2(sp)) { - /* LEAF2 pages have no mp_ptrs[] or node headers */ - const size_t sub_ksize = sp->mp_leaf2_ksize; - const char *const sub_key = page_leaf2key(sp, j, sub_ksize); - if (unlikely(end_of_subpage < sub_key + sub_ksize)) { - rc = bad_page(mp, "nested-leaf2-key beyond (%zu) nested-page\n", - sub_key + sub_ksize - end_of_subpage); - continue; - } + rc = (env->flags & MDBX_RDONLY) + ? 
MDBX_SUCCESS + : osal_ioring_create(&env->ioring +#if defined(_WIN32) || defined(_WIN64) + , + ior_direct, env->ioring.overlapped_fd +#endif /* Windows */ + ); + return rc; +} - if (unlikely(sub_ksize != mc->mc_dbx->md_vlen_min)) { - if (unlikely(sub_ksize < mc->mc_dbx->md_vlen_min || - sub_ksize > mc->mc_dbx->md_vlen_max)) - rc = bad_page(mp, - "nested-leaf2-key size (%zu) <> min/max " - "value-length (%zu/%zu)\n", - sub_ksize, mc->mc_dbx->md_vlen_min, - mc->mc_dbx->md_vlen_max); - else - mc->mc_dbx->md_vlen_min = mc->mc_dbx->md_vlen_max = sub_ksize; - } - if ((mc->mc_checking & CC_SKIPORD) == 0) { - sub_here.iov_base = (void *)sub_key; - sub_here.iov_len = sub_ksize; - if (sub_prev.iov_base && - unlikely(mc->mc_dbx->md_dcmp(&sub_prev, &sub_here) >= 0)) - rc = bad_page(mp, - "nested-leaf2-key #%u wrong order (%s >= %s)\n", - j, DKEY(&sub_prev), DVAL(&sub_here)); - sub_prev = sub_here; - } - } else { - const MDBX_node *const sub_node = page_node(sp, j); - const char *const sub_node_end = ptr_disp(sub_node, NODESIZE); - if (unlikely(sub_node_end > end_of_subpage)) { - rc = bad_page(mp, "nested-node beyond (%zu) nested-page\n", - end_of_subpage - sub_node_end); - continue; - } - if (unlikely(node_flags(sub_node) != 0)) - rc = bad_page(mp, "nested-node invalid flags (%u)\n", - node_flags(sub_node)); +__cold int env_close(MDBX_env *env, bool resurrect_after_fork) { + const unsigned flags = env->flags; + env->flags &= ~ENV_INTERNAL_FLAGS; + if (flags & ENV_TXKEY) { + thread_key_delete(env->me_txkey); + env->me_txkey = 0; + } - const size_t sub_ksize = node_ks(sub_node); - const char *const sub_key = node_key(sub_node); - const size_t sub_dsize = node_ds(sub_node); - /* char *sub_data = node_data(sub_node); */ + if (env->lck) + munlock_all(env); - if (unlikely(sub_ksize < mc->mc_dbx->md_vlen_min || - sub_ksize > mc->mc_dbx->md_vlen_max)) - rc = bad_page(mp, - "nested-node-key size (%zu) <> min/max " - "value-length (%zu/%zu)\n", - sub_ksize, mc->mc_dbx->md_vlen_min, - 
mc->mc_dbx->md_vlen_max); - if ((mc->mc_checking & CC_SKIPORD) == 0) { - sub_here.iov_base = (void *)sub_key; - sub_here.iov_len = sub_ksize; - if (sub_prev.iov_base && - unlikely(mc->mc_dbx->md_dcmp(&sub_prev, &sub_here) >= 0)) - rc = bad_page(mp, - "nested-node-key #%u wrong order (%s >= %s)\n", - j, DKEY(&sub_prev), DVAL(&sub_here)); - sub_prev = sub_here; - } - if (unlikely(sub_dsize != 0)) - rc = bad_page(mp, "nested-node non-empty data size (%zu)\n", - sub_dsize); - if (unlikely(end_of_subpage < sub_key + sub_ksize)) - rc = bad_page(mp, "nested-node-key beyond (%zu) nested-page\n", - sub_key + sub_ksize - end_of_subpage); - } - } - } - break; - } - } + rthc_lock(); + int rc = rthc_remove(env); + rthc_unlock(); + +#if MDBX_ENABLE_DBI_LOCKFREE + for (defer_free_item_t *next, *ptr = env->defer_free; ptr; ptr = next) { + next = ptr->next; + osal_free(ptr); } - return rc; -} + env->defer_free = nullptr; +#endif /* MDBX_ENABLE_DBI_LOCKFREE */ -__cold static int cursor_check(const MDBX_cursor *mc) { - if (!mc->mc_txn->tw.dirtylist) { - cASSERT(mc, - (mc->mc_txn->mt_flags & MDBX_WRITEMAP) != 0 && !MDBX_AVOID_MSYNC); - } else { - cASSERT(mc, - (mc->mc_txn->mt_flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC); - cASSERT(mc, mc->mc_txn->tw.dirtyroom + mc->mc_txn->tw.dirtylist->length == - (mc->mc_txn->mt_parent - ? mc->mc_txn->mt_parent->tw.dirtyroom - : mc->mc_txn->mt_env->me_options.dp_limit)); - } - cASSERT(mc, mc->mc_top == mc->mc_snum - 1 || (mc->mc_checking & CC_UPDATING)); - if (unlikely(mc->mc_top != mc->mc_snum - 1) && - (mc->mc_checking & CC_UPDATING) == 0) - return MDBX_CURSOR_FULL; - cASSERT(mc, (mc->mc_checking & CC_UPDATING) - ? mc->mc_snum <= mc->mc_db->md_depth - : mc->mc_snum == mc->mc_db->md_depth); - if (unlikely((mc->mc_checking & CC_UPDATING) - ? 
mc->mc_snum > mc->mc_db->md_depth - : mc->mc_snum != mc->mc_db->md_depth)) - return MDBX_CURSOR_FULL; + if (!(env->flags & MDBX_RDONLY)) + osal_ioring_destroy(&env->ioring); - for (int n = 0; n < (int)mc->mc_snum; ++n) { - MDBX_page *mp = mc->mc_pg[n]; - const size_t nkeys = page_numkeys(mp); - const bool expect_branch = (n < mc->mc_db->md_depth - 1) ? true : false; - const bool expect_nested_leaf = - (n + 1 == mc->mc_db->md_depth - 1) ? true : false; - const bool branch = IS_BRANCH(mp) ? true : false; - cASSERT(mc, branch == expect_branch); - if (unlikely(branch != expect_branch)) - return MDBX_CURSOR_FULL; - if ((mc->mc_checking & CC_UPDATING) == 0) { - cASSERT(mc, nkeys > mc->mc_ki[n] || (!branch && nkeys == mc->mc_ki[n] && - (mc->mc_flags & C_EOF) != 0)); - if (unlikely(nkeys <= mc->mc_ki[n] && - !(!branch && nkeys == mc->mc_ki[n] && - (mc->mc_flags & C_EOF) != 0))) - return MDBX_CURSOR_FULL; - } else { - cASSERT(mc, nkeys + 1 >= mc->mc_ki[n]); - if (unlikely(nkeys + 1 < mc->mc_ki[n])) - return MDBX_CURSOR_FULL; - } + env->lck = nullptr; + if (env->lck_mmap.lck) + osal_munmap(&env->lck_mmap); - int err = page_check(mc, mp); - if (unlikely(err != MDBX_SUCCESS)) - return err; + if (env->dxb_mmap.base) { + osal_munmap(&env->dxb_mmap); +#ifdef ENABLE_MEMCHECK + VALGRIND_DISCARD(env->valgrind_handle); + env->valgrind_handle = -1; +#endif /* ENABLE_MEMCHECK */ + } - for (size_t i = 0; i < nkeys; ++i) { - if (branch) { - MDBX_node *node = page_node(mp, i); - cASSERT(mc, node_flags(node) == 0); - if (unlikely(node_flags(node) != 0)) - return MDBX_CURSOR_FULL; - pgno_t pgno = node_pgno(node); - MDBX_page *np; - err = page_get(mc, pgno, &np, mp->mp_txnid); - cASSERT(mc, err == MDBX_SUCCESS); - if (unlikely(err != MDBX_SUCCESS)) - return err; - const bool nested_leaf = IS_LEAF(np) ? 
true : false; - cASSERT(mc, nested_leaf == expect_nested_leaf); - if (unlikely(nested_leaf != expect_nested_leaf)) - return MDBX_CURSOR_FULL; - err = page_check(mc, np); - if (unlikely(err != MDBX_SUCCESS)) - return err; - } - } +#if defined(_WIN32) || defined(_WIN64) + eASSERT(env, !env->ioring.overlapped_fd || + env->ioring.overlapped_fd == INVALID_HANDLE_VALUE); + if (env->dxb_lock_event != INVALID_HANDLE_VALUE) { + CloseHandle(env->dxb_lock_event); + env->dxb_lock_event = INVALID_HANDLE_VALUE; } - return MDBX_SUCCESS; -} + eASSERT(env, !resurrect_after_fork); + if (env->pathname_char) { + osal_free(env->pathname_char); + env->pathname_char = nullptr; + } +#endif /* Windows */ -__cold static int cursor_check_updating(MDBX_cursor *mc) { - const uint8_t checking = mc->mc_checking; - mc->mc_checking |= CC_UPDATING; - const int rc = cursor_check(mc); - mc->mc_checking = checking; - return rc; -} + if (env->dsync_fd != INVALID_HANDLE_VALUE) { + (void)osal_closefile(env->dsync_fd); + env->dsync_fd = INVALID_HANDLE_VALUE; + } -int mdbx_del(MDBX_txn *txn, MDBX_dbi dbi, const MDBX_val *key, - const MDBX_val *data) { - int rc = check_txn_rw(txn, MDBX_TXN_BLOCKED); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; + if (env->lazy_fd != INVALID_HANDLE_VALUE) { + (void)osal_closefile(env->lazy_fd); + env->lazy_fd = INVALID_HANDLE_VALUE; + } - if (unlikely(!key)) - return MDBX_EINVAL; + if (env->lck_mmap.fd != INVALID_HANDLE_VALUE) { + (void)osal_closefile(env->lck_mmap.fd); + env->lck_mmap.fd = INVALID_HANDLE_VALUE; + } + + if (!resurrect_after_fork) { + if (env->kvs) { + for (size_t i = CORE_DBS; i < env->n_dbi; ++i) + if (env->kvs[i].name.iov_len) + osal_free(env->kvs[i].name.iov_base); + osal_free(env->kvs); + env->n_dbi = CORE_DBS; + env->kvs = nullptr; + } + if (env->page_auxbuf) { + osal_memalign_free(env->page_auxbuf); + env->page_auxbuf = nullptr; + } + if (env->dbi_seqs) { + osal_free(env->dbi_seqs); + env->dbi_seqs = nullptr; + } + if (env->dbs_flags) { + 
osal_free(env->dbs_flags); + env->dbs_flags = nullptr; + } + if (env->pathname.buffer) { + osal_free(env->pathname.buffer); + env->pathname.buffer = nullptr; + } + if (env->basal_txn) { + dpl_free(env->basal_txn); + txl_free(env->basal_txn->tw.gc.reclaimed); + pnl_free(env->basal_txn->tw.retired_pages); + pnl_free(env->basal_txn->tw.spilled.list); + pnl_free(env->basal_txn->tw.relist); + osal_free(env->basal_txn); + env->basal_txn = nullptr; + } + } + env->stuck_meta = -1; + return rc; +} +/// \copyright SPDX-License-Identifier: Apache-2.0 +/// \author Леонид Юрьев aka Leonid Yuriev \date 2015-2024 - if (unlikely(dbi <= FREE_DBI)) - return MDBX_BAD_DBI; - if (unlikely(txn->mt_flags & (MDBX_TXN_RDONLY | MDBX_TXN_BLOCKED))) - return (txn->mt_flags & MDBX_TXN_RDONLY) ? MDBX_EACCESS : MDBX_BAD_TXN; +#if MDBX_ENABLE_MINCORE +/*------------------------------------------------------------------------------ + * Проверка размещения/расположения отображенных страниц БД в ОЗУ (mem-in-core), + * с кешированием этой информации. 
*/ - return delete (txn, dbi, key, data, 0); +static inline bool bit_tas(uint64_t *field, char bit) { + const uint64_t m = UINT64_C(1) << bit; + const bool r = (*field & m) != 0; + *field |= m; + return r; } -static int delete(MDBX_txn *txn, MDBX_dbi dbi, const MDBX_val *key, - const MDBX_val *data, unsigned flags) { - MDBX_cursor_couple cx; - MDBX_cursor_op op; - MDBX_val rdata; - int rc; - DKBUF_DEBUG; +static bool mincore_fetch(MDBX_env *const env, const size_t unit_begin) { + lck_t *const lck = env->lck; + for (size_t i = 1; i < ARRAY_LENGTH(lck->mincore_cache.begin); ++i) { + const ptrdiff_t dist = unit_begin - lck->mincore_cache.begin[i]; + if (likely(dist >= 0 && dist < 64)) { + const pgno_t tmp_begin = lck->mincore_cache.begin[i]; + const uint64_t tmp_mask = lck->mincore_cache.mask[i]; + do { + lck->mincore_cache.begin[i] = lck->mincore_cache.begin[i - 1]; + lck->mincore_cache.mask[i] = lck->mincore_cache.mask[i - 1]; + } while (--i); + lck->mincore_cache.begin[0] = tmp_begin; + lck->mincore_cache.mask[0] = tmp_mask; + return bit_tas(lck->mincore_cache.mask, (char)dist); + } + } - DEBUG("====> delete db %u key [%s], data [%s]", dbi, DKEY_DEBUG(key), - DVAL_DEBUG(data)); + size_t pages = 64; + unsigned unit_log = globals.sys_pagesize_ln2; + unsigned shift = 0; + if (env->ps > globals.sys_pagesize) { + unit_log = env->ps2ln; + shift = env->ps2ln - globals.sys_pagesize_ln2; + pages <<= shift; + } - rc = cursor_init(&cx.outer, txn, dbi); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; + const size_t offset = unit_begin << unit_log; + size_t length = pages << globals.sys_pagesize_ln2; + if (offset + length > env->dxb_mmap.current) { + length = env->dxb_mmap.current - offset; + pages = length >> globals.sys_pagesize_ln2; + } - if (data) { - op = MDBX_GET_BOTH; - rdata = *data; - data = &rdata; - } else { - op = MDBX_SET; - flags |= MDBX_ALLDUPS; +#if MDBX_ENABLE_PGOP_STAT + env->lck->pgops.mincore.weak += 1; +#endif /* MDBX_ENABLE_PGOP_STAT */ + uint8_t *const 
vector = alloca(pages); + if (unlikely(mincore(ptr_disp(env->dxb_mmap.base, offset), length, + (void *)vector))) { + NOTICE("mincore(+%zu, %zu), err %d", offset, length, errno); + return false; } - rc = cursor_set(&cx.outer, (MDBX_val *)key, (MDBX_val *)data, op).err; - if (likely(rc == MDBX_SUCCESS)) { - /* let mdbx_page_split know about this cursor if needed: - * delete will trigger a rebalance; if it needs to move - * a node from one page to another, it will have to - * update the parent's separator key(s). If the new sepkey - * is larger than the current one, the parent page may - * run out of space, triggering a split. We need this - * cursor to be consistent until the end of the rebalance. */ - cx.outer.mc_next = txn->mt_cursors[dbi]; - txn->mt_cursors[dbi] = &cx.outer; - rc = cursor_del(&cx.outer, flags); - txn->mt_cursors[dbi] = cx.outer.mc_next; + + for (size_t i = 1; i < ARRAY_LENGTH(lck->mincore_cache.begin); ++i) { + lck->mincore_cache.begin[i] = lck->mincore_cache.begin[i - 1]; + lck->mincore_cache.mask[i] = lck->mincore_cache.mask[i - 1]; } - return rc; + lck->mincore_cache.begin[0] = unit_begin; + + uint64_t mask = 0; +#ifdef MINCORE_INCORE + STATIC_ASSERT(MINCORE_INCORE == 1); +#endif + for (size_t i = 0; i < pages; ++i) { + uint64_t bit = (vector[i] & 1) == 0; + bit <<= i >> shift; + mask |= bit; + } + + lck->mincore_cache.mask[0] = ~mask; + return bit_tas(lck->mincore_cache.mask, 0); } +#endif /* MDBX_ENABLE_MINCORE */ -/* Split a page and insert a new node. - * Set MDBX_TXN_ERROR on failure. - * [in,out] mc Cursor pointing to the page and desired insertion index. - * The cursor will be updated to point to the actual page and index where - * the node got inserted after the split. - * [in] newkey The key for the newly inserted node. - * [in] newdata The data for the newly inserted node. - * [in] newpgno The page number, if the new node is a branch node. - * [in] naf The NODE_ADD_FLAGS for the new node. - * Returns 0 on success, non-zero on failure. 
*/ -static int page_split(MDBX_cursor *mc, const MDBX_val *const newkey, - MDBX_val *const newdata, pgno_t newpgno, - const unsigned naf) { - unsigned flags; - int rc = MDBX_SUCCESS, foliage = 0; - size_t i, ptop; - MDBX_env *const env = mc->mc_txn->mt_env; - MDBX_val rkey, xdata; - MDBX_page *tmp_ki_copy = NULL; - DKBUF; +MDBX_MAYBE_UNUSED static inline bool mincore_probe(MDBX_env *const env, + const pgno_t pgno) { +#if MDBX_ENABLE_MINCORE + const size_t offset_aligned = + floor_powerof2(pgno2bytes(env, pgno), globals.sys_pagesize); + const unsigned unit_log2 = (env->ps2ln > globals.sys_pagesize_ln2) + ? env->ps2ln + : globals.sys_pagesize_ln2; + const size_t unit_begin = offset_aligned >> unit_log2; + eASSERT(env, (unit_begin << unit_log2) == offset_aligned); + const ptrdiff_t dist = unit_begin - env->lck->mincore_cache.begin[0]; + if (likely(dist >= 0 && dist < 64)) + return bit_tas(env->lck->mincore_cache.mask, (char)dist); + return mincore_fetch(env, unit_begin); +#else + (void)env; + (void)pgno; + return false; +#endif /* MDBX_ENABLE_MINCORE */ +} - MDBX_page *const mp = mc->mc_pg[mc->mc_top]; - cASSERT(mc, (mp->mp_flags & P_ILL_BITS) == 0); +/*----------------------------------------------------------------------------*/ - const size_t newindx = mc->mc_ki[mc->mc_top]; - size_t nkeys = page_numkeys(mp); - if (AUDIT_ENABLED()) { - rc = cursor_check_updating(mc); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; +MDBX_MAYBE_UNUSED __hot static pgno_t * +scan4seq_fallback(pgno_t *range, const size_t len, const size_t seq) { + assert(seq > 0 && len > seq); +#if MDBX_PNL_ASCENDING + assert(range[-1] == len); + const pgno_t *const detent = range + len - seq; + const ptrdiff_t offset = (ptrdiff_t)seq; + const pgno_t target = (pgno_t)offset; + if (likely(len > seq + 3)) { + do { + const pgno_t diff0 = range[offset + 0] - range[0]; + const pgno_t diff1 = range[offset + 1] - range[1]; + const pgno_t diff2 = range[offset + 2] - range[2]; + const pgno_t diff3 = 
range[offset + 3] - range[3]; + if (diff0 == target) + return range + 0; + if (diff1 == target) + return range + 1; + if (diff2 == target) + return range + 2; + if (diff3 == target) + return range + 3; + range += 4; + } while (range + 3 < detent); + if (range == detent) + return nullptr; } - STATIC_ASSERT(P_BRANCH == 1); - const size_t minkeys = (mp->mp_flags & P_BRANCH) + (size_t)1; - - DEBUG(">> splitting %s-page %" PRIaPGNO - " and adding %zu+%zu [%s] at %i, nkeys %zi", - IS_LEAF(mp) ? "leaf" : "branch", mp->mp_pgno, newkey->iov_len, - newdata ? newdata->iov_len : 0, DKEY_DEBUG(newkey), - mc->mc_ki[mc->mc_top], nkeys); - cASSERT(mc, nkeys + 1 >= minkeys * 2); + do + if (range[offset] - *range == target) + return range; + while (++range < detent); +#else + assert(range[-(ptrdiff_t)len] == len); + const pgno_t *const detent = range - len + seq; + const ptrdiff_t offset = -(ptrdiff_t)seq; + const pgno_t target = (pgno_t)offset; + if (likely(len > seq + 3)) { + do { + const pgno_t diff0 = range[-0] - range[offset - 0]; + const pgno_t diff1 = range[-1] - range[offset - 1]; + const pgno_t diff2 = range[-2] - range[offset - 2]; + const pgno_t diff3 = range[-3] - range[offset - 3]; + /* Смысл вычислений до ветвлений в том, чтобы позволить компилятору + * загружать и вычислять все значения параллельно. */ + if (diff0 == target) + return range - 0; + if (diff1 == target) + return range - 1; + if (diff2 == target) + return range - 2; + if (diff3 == target) + return range - 3; + range -= 4; + } while (range > detent + 3); + if (range == detent) + return nullptr; + } + do + if (*range - range[offset] == target) + return range; + while (--range > detent); +#endif /* pnl_t sort-order */ + return nullptr; +} - /* Create a new sibling page. 
*/ - pgr_t npr = page_new(mc, mp->mp_flags); - if (unlikely(npr.err != MDBX_SUCCESS)) - return npr.err; - MDBX_page *const sister = npr.page; - sister->mp_leaf2_ksize = mp->mp_leaf2_ksize; - DEBUG("new sibling: page %" PRIaPGNO, sister->mp_pgno); +MDBX_MAYBE_UNUSED static const pgno_t *scan4range_checker(const pnl_t pnl, + const size_t seq) { + size_t begin = MDBX_PNL_ASCENDING ? 1 : MDBX_PNL_GETSIZE(pnl); +#if MDBX_PNL_ASCENDING + while (seq <= MDBX_PNL_GETSIZE(pnl) - begin) { + if (pnl[begin + seq] - pnl[begin] == seq) + return pnl + begin; + ++begin; + } +#else + while (begin > seq) { + if (pnl[begin - seq] - pnl[begin] == seq) + return pnl + begin; + --begin; + } +#endif /* pnl_t sort-order */ + return nullptr; +} - /* Usually when splitting the root page, the cursor - * height is 1. But when called from update_key, - * the cursor height may be greater because it walks - * up the stack while finding the branch slot to update. */ - if (mc->mc_top < 1) { - npr = page_new(mc, P_BRANCH); - rc = npr.err; - if (unlikely(rc != MDBX_SUCCESS)) - goto done; - MDBX_page *const pp = npr.page; - /* shift current top to make room for new parent */ - cASSERT(mc, mc->mc_snum < 2 && mc->mc_db->md_depth > 0); -#if MDBX_DEBUG - memset(mc->mc_pg + 3, 0, sizeof(mc->mc_pg) - sizeof(mc->mc_pg[0]) * 3); - memset(mc->mc_ki + 3, -1, sizeof(mc->mc_ki) - sizeof(mc->mc_ki[0]) * 3); -#endif - mc->mc_pg[2] = mc->mc_pg[1]; - mc->mc_ki[2] = mc->mc_ki[1]; - mc->mc_pg[1] = mc->mc_pg[0]; - mc->mc_ki[1] = mc->mc_ki[0]; - mc->mc_pg[0] = pp; - mc->mc_ki[0] = 0; - mc->mc_db->md_root = pp->mp_pgno; - DEBUG("root split! new root = %" PRIaPGNO, pp->mp_pgno); - foliage = mc->mc_db->md_depth++; +#if defined(_MSC_VER) && !defined(__builtin_clz) && \ + !__has_builtin(__builtin_clz) +MDBX_MAYBE_UNUSED static __always_inline size_t __builtin_clz(uint32_t value) { + unsigned long index; + _BitScanReverse(&index, value); + return 31 - index; +} +#endif /* _MSC_VER */ - /* Add left (implicit) pointer. 
*/ - rc = node_add_branch(mc, 0, NULL, mp->mp_pgno); - if (unlikely(rc != MDBX_SUCCESS)) { - /* undo the pre-push */ - mc->mc_pg[0] = mc->mc_pg[1]; - mc->mc_ki[0] = mc->mc_ki[1]; - mc->mc_db->md_root = mp->mp_pgno; - mc->mc_db->md_depth--; - goto done; - } - mc->mc_snum++; - mc->mc_top++; - ptop = 0; - if (AUDIT_ENABLED()) { - rc = cursor_check_updating(mc); - if (unlikely(rc != MDBX_SUCCESS)) - goto done; - } - } else { - ptop = mc->mc_top - 1; - DEBUG("parent branch page is %" PRIaPGNO, mc->mc_pg[ptop]->mp_pgno); - } +#if defined(_MSC_VER) && !defined(__builtin_clzl) && \ + !__has_builtin(__builtin_clzl) +MDBX_MAYBE_UNUSED static __always_inline size_t __builtin_clzl(size_t value) { + unsigned long index; +#ifdef _WIN64 + assert(sizeof(value) == 8); + _BitScanReverse64(&index, value); + return 63 - index; +#else + assert(sizeof(value) == 4); + _BitScanReverse(&index, value); + return 31 - index; +#endif +} +#endif /* _MSC_VER */ - MDBX_cursor mn; - cursor_copy(mc, &mn); - mn.mc_pg[mn.mc_top] = sister; - mn.mc_ki[mn.mc_top] = 0; - mn.mc_ki[ptop] = mc->mc_ki[ptop] + 1; +#if !MDBX_PNL_ASCENDING - size_t split_indx = - (newindx < nkeys) - ? /* split at the middle */ (nkeys + 1) >> 1 - : /* split at the end (i.e. 
like append-mode ) */ nkeys - minkeys + 1; - eASSERT(env, split_indx >= minkeys && split_indx <= nkeys - minkeys + 1); +#if !defined(MDBX_ATTRIBUTE_TARGET) && \ + (__has_attribute(__target__) || __GNUC_PREREQ(5, 0)) +#define MDBX_ATTRIBUTE_TARGET(target) __attribute__((__target__(target))) +#endif /* MDBX_ATTRIBUTE_TARGET */ - cASSERT(mc, !IS_BRANCH(mp) || newindx > 0); - MDBX_val sepkey = {nullptr, 0}; - /* It is reasonable and possible to split the page at the begin */ - if (unlikely(newindx < minkeys)) { - split_indx = minkeys; - if (newindx == 0 && !(naf & MDBX_SPLIT_REPLACE)) { - split_indx = 0; - /* Checking for ability of splitting by the left-side insertion - * of a pure page with the new key */ - for (i = 0; i < mc->mc_top; ++i) - if (mc->mc_ki[i]) { - get_key(page_node(mc->mc_pg[i], mc->mc_ki[i]), &sepkey); - if (mc->mc_dbx->md_cmp(newkey, &sepkey) >= 0) - split_indx = minkeys; - break; - } - if (split_indx == 0) { - /* Save the current first key which was omitted on the parent branch - * page and should be updated if the new first entry will be added */ - if (IS_LEAF2(mp)) { - sepkey.iov_len = mp->mp_leaf2_ksize; - sepkey.iov_base = page_leaf2key(mp, 0, sepkey.iov_len); - } else - get_key(page_node(mp, 0), &sepkey); - cASSERT(mc, mc->mc_dbx->md_cmp(newkey, &sepkey) < 0); - /* Avoiding rare complex cases of nested split the parent page(s) */ - if (page_room(mc->mc_pg[ptop]) < branch_size(env, &sepkey)) - split_indx = minkeys; - } - if (foliage) { - TRACE("pure-left: foliage %u, top %i, ptop %zu, split_indx %zi, " - "minkeys %zi, sepkey %s, parent-room %zu, need4split %zu", - foliage, mc->mc_top, ptop, split_indx, minkeys, - DKEY_DEBUG(&sepkey), page_room(mc->mc_pg[ptop]), - branch_size(env, &sepkey)); - TRACE("pure-left: newkey %s, newdata %s, newindx %zu", - DKEY_DEBUG(newkey), DVAL_DEBUG(newdata), newindx); - } - } - } - - const bool pure_right = split_indx == nkeys; - const bool pure_left = split_indx == 0; - if (unlikely(pure_right)) { - /* newindx == 
split_indx == nkeys */ - TRACE("no-split, but add new pure page at the %s", "right/after"); - cASSERT(mc, newindx == nkeys && split_indx == nkeys && minkeys == 1); - sepkey = *newkey; - } else if (unlikely(pure_left)) { - /* newindx == split_indx == 0 */ - TRACE("pure-left: no-split, but add new pure page at the %s", - "left/before"); - cASSERT(mc, newindx == 0 && split_indx == 0 && minkeys == 1); - TRACE("pure-left: old-first-key is %s", DKEY_DEBUG(&sepkey)); - } else { - if (IS_LEAF2(sister)) { - /* Move half of the keys to the right sibling */ - const intptr_t distance = mc->mc_ki[mc->mc_top] - split_indx; - size_t ksize = mc->mc_db->md_xsize; - void *const split = page_leaf2key(mp, split_indx, ksize); - size_t rsize = (nkeys - split_indx) * ksize; - size_t lsize = (nkeys - split_indx) * sizeof(indx_t); - cASSERT(mc, mp->mp_lower >= lsize); - mp->mp_lower -= (indx_t)lsize; - cASSERT(mc, sister->mp_lower + lsize <= UINT16_MAX); - sister->mp_lower += (indx_t)lsize; - cASSERT(mc, mp->mp_upper + rsize - lsize <= UINT16_MAX); - mp->mp_upper += (indx_t)(rsize - lsize); - cASSERT(mc, sister->mp_upper >= rsize - lsize); - sister->mp_upper -= (indx_t)(rsize - lsize); - sepkey.iov_len = ksize; - sepkey.iov_base = (newindx != split_indx) ? 
split : newkey->iov_base; - if (distance < 0) { - cASSERT(mc, ksize >= sizeof(indx_t)); - void *const ins = page_leaf2key(mp, mc->mc_ki[mc->mc_top], ksize); - memcpy(sister->mp_ptrs, split, rsize); - sepkey.iov_base = sister->mp_ptrs; - memmove(ptr_disp(ins, ksize), ins, - (split_indx - mc->mc_ki[mc->mc_top]) * ksize); - memcpy(ins, newkey->iov_base, ksize); - cASSERT(mc, UINT16_MAX - mp->mp_lower >= (int)sizeof(indx_t)); - mp->mp_lower += sizeof(indx_t); - cASSERT(mc, mp->mp_upper >= ksize - sizeof(indx_t)); - mp->mp_upper -= (indx_t)(ksize - sizeof(indx_t)); - cASSERT(mc, (((ksize & page_numkeys(mp)) ^ mp->mp_upper) & 1) == 0); - } else { - memcpy(sister->mp_ptrs, split, distance * ksize); - void *const ins = page_leaf2key(sister, distance, ksize); - memcpy(ins, newkey->iov_base, ksize); - memcpy(ptr_disp(ins, ksize), ptr_disp(split, distance * ksize), - rsize - distance * ksize); - cASSERT(mc, UINT16_MAX - sister->mp_lower >= (int)sizeof(indx_t)); - sister->mp_lower += sizeof(indx_t); - cASSERT(mc, sister->mp_upper >= ksize - sizeof(indx_t)); - sister->mp_upper -= (indx_t)(ksize - sizeof(indx_t)); - cASSERT(mc, distance <= (int)UINT16_MAX); - mc->mc_ki[mc->mc_top] = (indx_t)distance; - cASSERT(mc, - (((ksize & page_numkeys(sister)) ^ sister->mp_upper) & 1) == 0); - } - - if (AUDIT_ENABLED()) { - rc = cursor_check_updating(mc); - if (unlikely(rc != MDBX_SUCCESS)) - goto done; - rc = cursor_check_updating(&mn); - if (unlikely(rc != MDBX_SUCCESS)) - goto done; - } - } else { - /* grab a page to hold a temporary copy */ - tmp_ki_copy = page_malloc(mc->mc_txn, 1); - if (unlikely(tmp_ki_copy == NULL)) { - rc = MDBX_ENOMEM; - goto done; - } - - const size_t max_space = page_space(env); - const size_t new_size = IS_LEAF(mp) ? 
leaf_size(env, newkey, newdata) - : branch_size(env, newkey); - - /* prepare to insert */ - for (i = 0; i < newindx; ++i) - tmp_ki_copy->mp_ptrs[i] = mp->mp_ptrs[i]; - tmp_ki_copy->mp_ptrs[i] = (indx_t)-1; - while (++i <= nkeys) - tmp_ki_copy->mp_ptrs[i] = mp->mp_ptrs[i - 1]; - tmp_ki_copy->mp_pgno = mp->mp_pgno; - tmp_ki_copy->mp_flags = mp->mp_flags; - tmp_ki_copy->mp_txnid = INVALID_TXNID; - tmp_ki_copy->mp_lower = 0; - tmp_ki_copy->mp_upper = (indx_t)max_space; - - /* Добавляемый узел может не поместиться в страницу-половину вместе - * с количественной половиной узлов из исходной страницы. В худшем случае, - * в страницу-половину с добавляемым узлом могут попасть самые больше узлы - * из исходной страницы, а другую половину только узлы с самыми короткими - * ключами и с пустыми данными. Поэтому, чтобы найти подходящую границу - * разреза требуется итерировать узлы и считая их объем. - * - * Однако, при простом количественном делении (без учета размера ключей - * и данных) на страницах-половинах будет примерно вдвое меньше узлов. - * Поэтому добавляемый узел точно поместится, если его размер не больше - * чем место "освобождающееся" от заголовков узлов, которые переедут - * в другую страницу-половину. Кроме этого, как минимум по одному байту - * будет в каждом ключе, в худшем случае кроме одного, который может быть - * нулевого размера. 
*/ +#ifndef MDBX_GCC_FASTMATH_i686_SIMD_WORKAROUND +/* Workaround for GCC's bug with `-m32 -march=i686 -Ofast` + * gcc/i686-buildroot-linux-gnu/12.2.0/include/xmmintrin.h:814:1: + * error: inlining failed in call to 'always_inline' '_mm_movemask_ps': + * target specific option mismatch */ +#if !defined(__FAST_MATH__) || !__FAST_MATH__ || !defined(__GNUC__) || \ + defined(__e2k__) || defined(__clang__) || defined(__amd64__) || \ + defined(__SSE2__) +#define MDBX_GCC_FASTMATH_i686_SIMD_WORKAROUND 0 +#else +#define MDBX_GCC_FASTMATH_i686_SIMD_WORKAROUND 1 +#endif +#endif /* MDBX_GCC_FASTMATH_i686_SIMD_WORKAROUND */ - if (newindx == split_indx && nkeys >= 5) { - STATIC_ASSERT(P_BRANCH == 1); - split_indx += mp->mp_flags & P_BRANCH; - } - eASSERT(env, split_indx >= minkeys && split_indx <= nkeys + 1 - minkeys); - const size_t dim_nodes = - (newindx >= split_indx) ? split_indx : nkeys - split_indx; - const size_t dim_used = (sizeof(indx_t) + NODESIZE + 1) * dim_nodes; - if (new_size >= dim_used) { - /* Search for best acceptable split point */ - i = (newindx < split_indx) ? 0 : nkeys; - intptr_t dir = (newindx < split_indx) ? 
1 : -1; - size_t before = 0, after = new_size + page_used(env, mp); - size_t best_split = split_indx; - size_t best_shift = INT_MAX; +#if defined(__SSE2__) && defined(__SSE__) +#define MDBX_ATTRIBUTE_TARGET_SSE2 /* nope */ +#elif (defined(_M_IX86_FP) && _M_IX86_FP >= 2) || defined(__amd64__) +#define __SSE2__ +#define MDBX_ATTRIBUTE_TARGET_SSE2 /* nope */ +#elif defined(MDBX_ATTRIBUTE_TARGET) && defined(__ia32__) && \ + !MDBX_GCC_FASTMATH_i686_SIMD_WORKAROUND +#define MDBX_ATTRIBUTE_TARGET_SSE2 MDBX_ATTRIBUTE_TARGET("sse,sse2") +#endif /* __SSE2__ */ - TRACE("seek separator from %zu, step %zi, default %zu, new-idx %zu, " - "new-size %zu", - i, dir, split_indx, newindx, new_size); - do { - cASSERT(mc, i <= nkeys); - size_t size = new_size; - if (i != newindx) { - MDBX_node *node = ptr_disp(mp, tmp_ki_copy->mp_ptrs[i] + PAGEHDRSZ); - size = NODESIZE + node_ks(node) + sizeof(indx_t); - if (IS_LEAF(mp)) - size += (node_flags(node) & F_BIGDATA) ? sizeof(pgno_t) - : node_ds(node); - size = EVEN(size); - } +#if defined(__AVX2__) +#define MDBX_ATTRIBUTE_TARGET_AVX2 /* nope */ +#elif defined(MDBX_ATTRIBUTE_TARGET) && defined(__ia32__) && \ + !MDBX_GCC_FASTMATH_i686_SIMD_WORKAROUND +#define MDBX_ATTRIBUTE_TARGET_AVX2 MDBX_ATTRIBUTE_TARGET("sse,sse2,avx,avx2") +#endif /* __AVX2__ */ - before += size; - after -= size; - TRACE("step %zu, size %zu, before %zu, after %zu, max %zu", i, size, - before, after, max_space); +#if defined(MDBX_ATTRIBUTE_TARGET_AVX2) +#if defined(__AVX512BW__) +#define MDBX_ATTRIBUTE_TARGET_AVX512BW /* nope */ +#elif defined(MDBX_ATTRIBUTE_TARGET) && defined(__ia32__) && \ + !MDBX_GCC_FASTMATH_i686_SIMD_WORKAROUND && \ + (__GNUC_PREREQ(6, 0) || __CLANG_PREREQ(5, 0)) +#define MDBX_ATTRIBUTE_TARGET_AVX512BW \ + MDBX_ATTRIBUTE_TARGET("sse,sse2,avx,avx2,avx512bw") +#endif /* __AVX512BW__ */ +#endif /* MDBX_ATTRIBUTE_TARGET_AVX2 for MDBX_ATTRIBUTE_TARGET_AVX512BW */ - if (before <= max_space && after <= max_space) { - const size_t split = i + (dir > 0); - if 
(split >= minkeys && split <= nkeys + 1 - minkeys) { - const size_t shift = branchless_abs(split_indx - split); - if (shift >= best_shift) - break; - best_shift = shift; - best_split = split; - if (!best_shift) - break; - } - } - i += dir; - } while (i < nkeys); +#ifdef MDBX_ATTRIBUTE_TARGET_SSE2 +MDBX_ATTRIBUTE_TARGET_SSE2 static __always_inline unsigned +diffcmp2mask_sse2(const pgno_t *const ptr, const ptrdiff_t offset, + const __m128i pattern) { + const __m128i f = _mm_loadu_si128((const __m128i *)ptr); + const __m128i l = _mm_loadu_si128((const __m128i *)(ptr + offset)); + const __m128i cmp = _mm_cmpeq_epi32(_mm_sub_epi32(f, l), pattern); + return _mm_movemask_ps(*(const __m128 *)&cmp); +} - split_indx = best_split; - TRACE("chosen %zu", split_indx); +MDBX_MAYBE_UNUSED __hot MDBX_ATTRIBUTE_TARGET_SSE2 static pgno_t * +scan4seq_sse2(pgno_t *range, const size_t len, const size_t seq) { + assert(seq > 0 && len > seq); +#if MDBX_PNL_ASCENDING +#error "FIXME: Not implemented" +#endif /* MDBX_PNL_ASCENDING */ + assert(range[-(ptrdiff_t)len] == len); + pgno_t *const detent = range - len + seq; + const ptrdiff_t offset = -(ptrdiff_t)seq; + const pgno_t target = (pgno_t)offset; + const __m128i pattern = _mm_set1_epi32(target); + uint8_t mask; + if (likely(len > seq + 3)) { + do { + mask = (uint8_t)diffcmp2mask_sse2(range - 3, offset, pattern); + if (mask) { +#if !defined(ENABLE_MEMCHECK) && !defined(__SANITIZE_ADDRESS__) + found: +#endif /* !ENABLE_MEMCHECK && !__SANITIZE_ADDRESS__ */ + return range + 28 - __builtin_clz(mask); } - eASSERT(env, split_indx >= minkeys && split_indx <= nkeys + 1 - minkeys); + range -= 4; + } while (range > detent + 3); + if (range == detent) + return nullptr; + } - sepkey = *newkey; - if (split_indx != newindx) { - MDBX_node *node = - ptr_disp(mp, tmp_ki_copy->mp_ptrs[split_indx] + PAGEHDRSZ); - sepkey.iov_len = node_ks(node); - sepkey.iov_base = node_key(node); - } - } + /* Далее происходит чтение от 4 до 12 лишних байт, которые могут быть 
не + * только за пределами региона выделенного под PNL, но и пересекать границу + * страницы памяти. Что может приводить как к ошибкам ASAN, так и к падению. + * Поэтому проверяем смещение на странице, а с ASAN всегда страхуемся. */ +#if !defined(ENABLE_MEMCHECK) && !defined(__SANITIZE_ADDRESS__) + const unsigned on_page_safe_mask = 0xff0 /* enough for '-15' bytes offset */; + if (likely(on_page_safe_mask & (uintptr_t)(range + offset)) && + !RUNNING_ON_VALGRIND) { + const unsigned extra = (unsigned)(detent + 4 - range); + assert(extra > 0 && extra < 4); + mask = 0xF << extra; + mask &= diffcmp2mask_sse2(range - 3, offset, pattern); + if (mask) + goto found; + return nullptr; } - DEBUG("separator is %zd [%s]", split_indx, DKEY_DEBUG(&sepkey)); +#endif /* !ENABLE_MEMCHECK && !__SANITIZE_ADDRESS__ */ + do + if (*range - range[offset] == target) + return range; + while (--range != detent); + return nullptr; +} +#endif /* MDBX_ATTRIBUTE_TARGET_SSE2 */ - bool did_split_parent = false; - /* Copy separator key to the parent. 
*/ - if (page_room(mn.mc_pg[ptop]) < branch_size(env, &sepkey)) { - TRACE("need split parent branch-page for key %s", DKEY_DEBUG(&sepkey)); - cASSERT(mc, page_numkeys(mn.mc_pg[ptop]) > 2); - cASSERT(mc, !pure_left); - const int snum = mc->mc_snum; - const int depth = mc->mc_db->md_depth; - mn.mc_snum--; - mn.mc_top--; - did_split_parent = true; - /* We want other splits to find mn when doing fixups */ - WITH_CURSOR_TRACKING( - mn, rc = page_split(&mn, &sepkey, NULL, sister->mp_pgno, 0)); - if (unlikely(rc != MDBX_SUCCESS)) - goto done; - cASSERT(mc, (int)mc->mc_snum - snum == mc->mc_db->md_depth - depth); - if (AUDIT_ENABLED()) { - rc = cursor_check_updating(mc); - if (unlikely(rc != MDBX_SUCCESS)) - goto done; - } +#ifdef MDBX_ATTRIBUTE_TARGET_AVX2 +MDBX_ATTRIBUTE_TARGET_AVX2 static __always_inline unsigned +diffcmp2mask_avx2(const pgno_t *const ptr, const ptrdiff_t offset, + const __m256i pattern) { + const __m256i f = _mm256_loadu_si256((const __m256i *)ptr); + const __m256i l = _mm256_loadu_si256((const __m256i *)(ptr + offset)); + const __m256i cmp = _mm256_cmpeq_epi32(_mm256_sub_epi32(f, l), pattern); + return _mm256_movemask_ps(*(const __m256 *)&cmp); +} - /* root split? */ - ptop += mc->mc_snum - (size_t)snum; +MDBX_ATTRIBUTE_TARGET_AVX2 static __always_inline unsigned +diffcmp2mask_sse2avx(const pgno_t *const ptr, const ptrdiff_t offset, + const __m128i pattern) { + const __m128i f = _mm_loadu_si128((const __m128i *)ptr); + const __m128i l = _mm_loadu_si128((const __m128i *)(ptr + offset)); + const __m128i cmp = _mm_cmpeq_epi32(_mm_sub_epi32(f, l), pattern); + return _mm_movemask_ps(*(const __m128 *)&cmp); +} - /* Right page might now have changed parent. - * Check if left page also changed parent. 
*/ - if (mn.mc_pg[ptop] != mc->mc_pg[ptop] && - mc->mc_ki[ptop] >= page_numkeys(mc->mc_pg[ptop])) { - for (i = 0; i < ptop; i++) { - mc->mc_pg[i] = mn.mc_pg[i]; - mc->mc_ki[i] = mn.mc_ki[i]; - } - mc->mc_pg[ptop] = mn.mc_pg[ptop]; - if (mn.mc_ki[ptop]) { - mc->mc_ki[ptop] = mn.mc_ki[ptop] - 1; - } else { - /* find right page's left sibling */ - mc->mc_ki[ptop] = mn.mc_ki[ptop]; - rc = cursor_sibling(mc, SIBLING_LEFT); - if (unlikely(rc != MDBX_SUCCESS)) { - if (rc == MDBX_NOTFOUND) /* improper mdbx_cursor_sibling() result */ { - ERROR("unexpected %i error going left sibling", rc); - rc = MDBX_PROBLEM; - } - goto done; - } +MDBX_MAYBE_UNUSED __hot MDBX_ATTRIBUTE_TARGET_AVX2 static pgno_t * +scan4seq_avx2(pgno_t *range, const size_t len, const size_t seq) { + assert(seq > 0 && len > seq); +#if MDBX_PNL_ASCENDING +#error "FIXME: Not implemented" +#endif /* MDBX_PNL_ASCENDING */ + assert(range[-(ptrdiff_t)len] == len); + pgno_t *const detent = range - len + seq; + const ptrdiff_t offset = -(ptrdiff_t)seq; + const pgno_t target = (pgno_t)offset; + const __m256i pattern = _mm256_set1_epi32(target); + uint8_t mask; + if (likely(len > seq + 7)) { + do { + mask = (uint8_t)diffcmp2mask_avx2(range - 7, offset, pattern); + if (mask) { +#if !defined(ENABLE_MEMCHECK) && !defined(__SANITIZE_ADDRESS__) + found: +#endif /* !ENABLE_MEMCHECK && !__SANITIZE_ADDRESS__ */ + return range + 24 - __builtin_clz(mask); } - } - } else if (unlikely(pure_left)) { - MDBX_page *ptop_page = mc->mc_pg[ptop]; - TRACE("pure-left: adding to parent page %u node[%u] left-leaf page #%u key " - "%s", - ptop_page->mp_pgno, mc->mc_ki[ptop], sister->mp_pgno, - DKEY(mc->mc_ki[ptop] ? newkey : NULL)); - assert(mc->mc_top == ptop + 1); - mc->mc_top = (uint8_t)ptop; - rc = node_add_branch(mc, mc->mc_ki[ptop], mc->mc_ki[ptop] ? 
newkey : NULL, - sister->mp_pgno); - cASSERT(mc, mp == mc->mc_pg[ptop + 1] && newindx == mc->mc_ki[ptop + 1] && - ptop == mc->mc_top); - - if (likely(rc == MDBX_SUCCESS) && mc->mc_ki[ptop] == 0) { - MDBX_node *node = page_node(mc->mc_pg[ptop], 1); - TRACE("pure-left: update prev-first key on parent to %s", DKEY(&sepkey)); - cASSERT(mc, node_ks(node) == 0 && node_pgno(node) == mp->mp_pgno); - cASSERT(mc, mc->mc_top == ptop && mc->mc_ki[ptop] == 0); - mc->mc_ki[ptop] = 1; - rc = update_key(mc, &sepkey); - cASSERT(mc, mc->mc_top == ptop && mc->mc_ki[ptop] == 1); - cASSERT(mc, mp == mc->mc_pg[ptop + 1] && newindx == mc->mc_ki[ptop + 1]); - mc->mc_ki[ptop] = 0; - } else { - TRACE("pure-left: no-need-update prev-first key on parent %s", - DKEY(&sepkey)); - } - - mc->mc_top++; - if (unlikely(rc != MDBX_SUCCESS)) - goto done; + range -= 8; + } while (range > detent + 7); + if (range == detent) + return nullptr; + } - MDBX_node *node = page_node(mc->mc_pg[ptop], mc->mc_ki[ptop] + (size_t)1); - cASSERT(mc, node_pgno(node) == mp->mp_pgno && mc->mc_pg[ptop] == ptop_page); - } else { - mn.mc_top--; - TRACE("add-to-parent the right-entry[%u] for new sibling-page", - mn.mc_ki[ptop]); - rc = node_add_branch(&mn, mn.mc_ki[ptop], &sepkey, sister->mp_pgno); - mn.mc_top++; - if (unlikely(rc != MDBX_SUCCESS)) - goto done; + /* Далее происходит чтение от 4 до 28 лишних байт, которые могут быть не + * только за пределами региона выделенного под PNL, но и пересекать границу + * страницы памяти. Что может приводить как к ошибкам ASAN, так и к падению. + * Поэтому проверяем смещение на странице, а с ASAN всегда страхуемся. 
*/ +#if !defined(ENABLE_MEMCHECK) && !defined(__SANITIZE_ADDRESS__) + const unsigned on_page_safe_mask = 0xfe0 /* enough for '-31' bytes offset */; + if (likely(on_page_safe_mask & (uintptr_t)(range + offset)) && + !RUNNING_ON_VALGRIND) { + const unsigned extra = (unsigned)(detent + 8 - range); + assert(extra > 0 && extra < 8); + mask = 0xFF << extra; + mask &= diffcmp2mask_avx2(range - 7, offset, pattern); + if (mask) + goto found; + return nullptr; + } +#endif /* !ENABLE_MEMCHECK && !__SANITIZE_ADDRESS__ */ + if (range - 3 > detent) { + mask = diffcmp2mask_sse2avx(range - 3, offset, *(const __m128i *)&pattern); + if (mask) + return range + 28 - __builtin_clz(mask); + range -= 4; + } + while (range > detent) { + if (*range - range[offset] == target) + return range; + --range; } + return nullptr; +} +#endif /* MDBX_ATTRIBUTE_TARGET_AVX2 */ - if (unlikely(pure_left | pure_right)) { - mc->mc_pg[mc->mc_top] = sister; - mc->mc_ki[mc->mc_top] = 0; - switch (PAGETYPE_WHOLE(sister)) { - case P_LEAF: { - cASSERT(mc, newpgno == 0 || newpgno == P_INVALID); - rc = node_add_leaf(mc, 0, newkey, newdata, naf); - } break; - case P_LEAF | P_LEAF2: { - cASSERT(mc, (naf & (F_BIGDATA | F_SUBDATA | F_DUPDATA)) == 0); - cASSERT(mc, newpgno == 0 || newpgno == P_INVALID); - rc = node_add_leaf2(mc, 0, newkey); - } break; - default: - rc = bad_page(sister, "wrong page-type %u\n", PAGETYPE_WHOLE(sister)); - } - if (unlikely(rc != MDBX_SUCCESS)) - goto done; +#ifdef MDBX_ATTRIBUTE_TARGET_AVX512BW +MDBX_ATTRIBUTE_TARGET_AVX512BW static __always_inline unsigned +diffcmp2mask_avx512bw(const pgno_t *const ptr, const ptrdiff_t offset, + const __m512i pattern) { + const __m512i f = _mm512_loadu_si512((const __m512i *)ptr); + const __m512i l = _mm512_loadu_si512((const __m512i *)(ptr + offset)); + return _mm512_cmpeq_epi32_mask(_mm512_sub_epi32(f, l), pattern); +} - if (pure_right) { - for (i = 0; i < mc->mc_top; i++) - mc->mc_ki[i] = mn.mc_ki[i]; - } else if (mc->mc_ki[mc->mc_top - 1] == 0) { - 
for (i = 2; i <= mc->mc_top; ++i) - if (mc->mc_ki[mc->mc_top - i]) { - get_key( - page_node(mc->mc_pg[mc->mc_top - i], mc->mc_ki[mc->mc_top - i]), - &sepkey); - if (mc->mc_dbx->md_cmp(newkey, &sepkey) < 0) { - mc->mc_top -= (uint8_t)i; - DEBUG("pure-left: update new-first on parent [%i] page %u key %s", - mc->mc_ki[mc->mc_top], mc->mc_pg[mc->mc_top]->mp_pgno, - DKEY(newkey)); - rc = update_key(mc, newkey); - mc->mc_top += (uint8_t)i; - if (unlikely(rc != MDBX_SUCCESS)) - goto done; - } - break; - } - } - } else if (tmp_ki_copy) { /* !IS_LEAF2(mp) */ - /* Move nodes */ - mc->mc_pg[mc->mc_top] = sister; - i = split_indx; - size_t n = 0; +MDBX_MAYBE_UNUSED __hot MDBX_ATTRIBUTE_TARGET_AVX512BW static pgno_t * +scan4seq_avx512bw(pgno_t *range, const size_t len, const size_t seq) { + assert(seq > 0 && len > seq); +#if MDBX_PNL_ASCENDING +#error "FIXME: Not implemented" +#endif /* MDBX_PNL_ASCENDING */ + assert(range[-(ptrdiff_t)len] == len); + pgno_t *const detent = range - len + seq; + const ptrdiff_t offset = -(ptrdiff_t)seq; + const pgno_t target = (pgno_t)offset; + const __m512i pattern = _mm512_set1_epi32(target); + unsigned mask; + if (likely(len > seq + 15)) { do { - TRACE("i %zu, nkeys %zu => n %zu, rp #%u", i, nkeys, n, sister->mp_pgno); - pgno_t pgno = 0; - MDBX_val *rdata = NULL; - if (i == newindx) { - rkey = *newkey; - if (IS_LEAF(mp)) - rdata = newdata; - else - pgno = newpgno; - flags = naf; - /* Update index for the new key. */ - mc->mc_ki[mc->mc_top] = (indx_t)n; - } else { - MDBX_node *node = ptr_disp(mp, tmp_ki_copy->mp_ptrs[i] + PAGEHDRSZ); - rkey.iov_base = node_key(node); - rkey.iov_len = node_ks(node); - if (IS_LEAF(mp)) { - xdata.iov_base = node_data(node); - xdata.iov_len = node_ds(node); - rdata = &xdata; - } else - pgno = node_pgno(node); - flags = node_flags(node); - } - - switch (PAGETYPE_WHOLE(sister)) { - case P_BRANCH: { - cASSERT(mc, 0 == (uint16_t)flags); - /* First branch index doesn't need key data. */ - rc = node_add_branch(mc, n, n ? 
&rkey : NULL, pgno); - } break; - case P_LEAF: { - cASSERT(mc, pgno == 0); - cASSERT(mc, rdata != NULL); - rc = node_add_leaf(mc, n, &rkey, rdata, flags); - } break; - /* case P_LEAF | P_LEAF2: { - cASSERT(mc, (nflags & (F_BIGDATA | F_SUBDATA | F_DUPDATA)) == 0); - cASSERT(mc, gno == 0); - rc = mdbx_node_add_leaf2(mc, n, &rkey); - } break; */ - default: - rc = bad_page(sister, "wrong page-type %u\n", PAGETYPE_WHOLE(sister)); - } - if (unlikely(rc != MDBX_SUCCESS)) - goto done; - - ++n; - if (++i > nkeys) { - i = 0; - n = 0; - mc->mc_pg[mc->mc_top] = tmp_ki_copy; - TRACE("switch to mp #%u", tmp_ki_copy->mp_pgno); - } - } while (i != split_indx); - - TRACE("i %zu, nkeys %zu, n %zu, pgno #%u", i, nkeys, n, - mc->mc_pg[mc->mc_top]->mp_pgno); - - nkeys = page_numkeys(tmp_ki_copy); - for (i = 0; i < nkeys; i++) - mp->mp_ptrs[i] = tmp_ki_copy->mp_ptrs[i]; - mp->mp_lower = tmp_ki_copy->mp_lower; - mp->mp_upper = tmp_ki_copy->mp_upper; - memcpy(page_node(mp, nkeys - 1), page_node(tmp_ki_copy, nkeys - 1), - env->me_psize - tmp_ki_copy->mp_upper - PAGEHDRSZ); - - /* reset back to original page */ - if (newindx < split_indx) { - mc->mc_pg[mc->mc_top] = mp; - } else { - mc->mc_pg[mc->mc_top] = sister; - mc->mc_ki[ptop]++; - /* Make sure mc_ki is still valid. */ - if (mn.mc_pg[ptop] != mc->mc_pg[ptop] && - mc->mc_ki[ptop] >= page_numkeys(mc->mc_pg[ptop])) { - for (i = 0; i <= ptop; i++) { - mc->mc_pg[i] = mn.mc_pg[i]; - mc->mc_ki[i] = mn.mc_ki[i]; - } - } - } - } else if (newindx >= split_indx) { - mc->mc_pg[mc->mc_top] = sister; - mc->mc_ki[ptop]++; - /* Make sure mc_ki is still valid. 
*/ - if (mn.mc_pg[ptop] != mc->mc_pg[ptop] && - mc->mc_ki[ptop] >= page_numkeys(mc->mc_pg[ptop])) { - for (i = 0; i <= ptop; i++) { - mc->mc_pg[i] = mn.mc_pg[i]; - mc->mc_ki[i] = mn.mc_ki[i]; + mask = diffcmp2mask_avx512bw(range - 15, offset, pattern); + if (mask) { +#if !defined(ENABLE_MEMCHECK) && !defined(__SANITIZE_ADDRESS__) + found: +#endif /* !ENABLE_MEMCHECK && !__SANITIZE_ADDRESS__ */ + return range + 16 - __builtin_clz(mask); } - } + range -= 16; + } while (range > detent + 15); + if (range == detent) + return nullptr; } - /* Adjust other cursors pointing to mp and/or to parent page */ - nkeys = page_numkeys(mp); - for (MDBX_cursor *m2 = mc->mc_txn->mt_cursors[mc->mc_dbi]; m2; - m2 = m2->mc_next) { - MDBX_cursor *m3 = (mc->mc_flags & C_SUB) ? &m2->mc_xcursor->mx_cursor : m2; - if (m3 == mc) - continue; - if (!(m2->mc_flags & m3->mc_flags & C_INITIALIZED)) - continue; - if (foliage) { - /* sub cursors may be on different DB */ - if (m3->mc_pg[0] != mp) - continue; - /* root split */ - for (int k = foliage; k >= 0; k--) { - m3->mc_ki[k + 1] = m3->mc_ki[k]; - m3->mc_pg[k + 1] = m3->mc_pg[k]; - } - m3->mc_ki[0] = m3->mc_ki[0] >= nkeys + pure_left; - m3->mc_pg[0] = mc->mc_pg[0]; - m3->mc_snum++; - m3->mc_top++; - } - - if (m3->mc_top >= mc->mc_top && m3->mc_pg[mc->mc_top] == mp && !pure_left) { - if (m3->mc_ki[mc->mc_top] >= newindx && !(naf & MDBX_SPLIT_REPLACE)) - m3->mc_ki[mc->mc_top]++; - if (m3->mc_ki[mc->mc_top] >= nkeys) { - m3->mc_pg[mc->mc_top] = sister; - cASSERT(mc, m3->mc_ki[mc->mc_top] >= nkeys); - m3->mc_ki[mc->mc_top] -= (indx_t)nkeys; - for (i = 0; i < mc->mc_top; i++) { - m3->mc_ki[i] = mn.mc_ki[i]; - m3->mc_pg[i] = mn.mc_pg[i]; - } - } - } else if (!did_split_parent && m3->mc_top >= ptop && - m3->mc_pg[ptop] == mc->mc_pg[ptop] && - m3->mc_ki[ptop] >= mc->mc_ki[ptop]) { - m3->mc_ki[ptop]++; /* also for the `pure-left` case */ - } - if (XCURSOR_INITED(m3) && IS_LEAF(mp)) - XCURSOR_REFRESH(m3, m3->mc_pg[mc->mc_top], m3->mc_ki[mc->mc_top]); + /* 
Далее происходит чтение от 4 до 60 лишних байт, которые могут быть не + * только за пределами региона выделенного под PNL, но и пересекать границу + * страницы памяти. Что может приводить как к ошибкам ASAN, так и к падению. + * Поэтому проверяем смещение на странице, а с ASAN всегда страхуемся. */ +#if !defined(ENABLE_MEMCHECK) && !defined(__SANITIZE_ADDRESS__) + const unsigned on_page_safe_mask = 0xfc0 /* enough for '-63' bytes offset */; + if (likely(on_page_safe_mask & (uintptr_t)(range + offset)) && + !RUNNING_ON_VALGRIND) { + const unsigned extra = (unsigned)(detent + 16 - range); + assert(extra > 0 && extra < 16); + mask = 0xFFFF << extra; + mask &= diffcmp2mask_avx512bw(range - 15, offset, pattern); + if (mask) + goto found; + return nullptr; } - TRACE("mp #%u left: %zd, sister #%u left: %zd", mp->mp_pgno, page_room(mp), - sister->mp_pgno, page_room(sister)); - -done: - if (tmp_ki_copy) - dpage_free(env, tmp_ki_copy, 1); - - if (unlikely(rc != MDBX_SUCCESS)) - mc->mc_txn->mt_flags |= MDBX_TXN_ERROR; - else { - if (AUDIT_ENABLED()) - rc = cursor_check_updating(mc); - if (unlikely(naf & MDBX_RESERVE)) { - MDBX_node *node = page_node(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); - if (!(node_flags(node) & F_BIGDATA)) - newdata->iov_base = node_data(node); - } -#if MDBX_ENABLE_PGOP_STAT - env->me_lck->mti_pgop_stat.split.weak += 1; -#endif /* MDBX_ENABLE_PGOP_STAT */ +#endif /* !ENABLE_MEMCHECK && !__SANITIZE_ADDRESS__ */ + if (range - 7 > detent) { + mask = diffcmp2mask_avx2(range - 7, offset, *(const __m256i *)&pattern); + if (mask) + return range + 24 - __builtin_clz(mask); + range -= 8; } - - DEBUG("<< mp #%u, rc %d", mp->mp_pgno, rc); - return rc; + if (range - 3 > detent) { + mask = diffcmp2mask_sse2avx(range - 3, offset, *(const __m128i *)&pattern); + if (mask) + return range + 28 - __builtin_clz(mask); + range -= 4; + } + while (range > detent) { + if (*range - range[offset] == target) + return range; + --range; + } + return nullptr; } +#endif /* 
MDBX_ATTRIBUTE_TARGET_AVX512BW */ -int mdbx_put(MDBX_txn *txn, MDBX_dbi dbi, const MDBX_val *key, MDBX_val *data, - MDBX_put_flags_t flags) { - int rc = check_txn_rw(txn, MDBX_TXN_BLOCKED); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; - - if (unlikely(!key || !data)) - return MDBX_EINVAL; - - if (unlikely(dbi <= FREE_DBI)) - return MDBX_BAD_DBI; - - if (unlikely(flags & ~(MDBX_NOOVERWRITE | MDBX_NODUPDATA | MDBX_ALLDUPS | - MDBX_ALLDUPS | MDBX_RESERVE | MDBX_APPEND | - MDBX_APPENDDUP | MDBX_CURRENT | MDBX_MULTIPLE))) - return MDBX_EINVAL; - - if (unlikely(txn->mt_flags & (MDBX_TXN_RDONLY | MDBX_TXN_BLOCKED))) - return (txn->mt_flags & MDBX_TXN_RDONLY) ? MDBX_EACCESS : MDBX_BAD_TXN; - - MDBX_cursor_couple cx; - rc = cursor_init(&cx.outer, txn, dbi); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; - cx.outer.mc_next = txn->mt_cursors[dbi]; - txn->mt_cursors[dbi] = &cx.outer; +#if (defined(__ARM_NEON) || defined(__ARM_NEON__)) && \ + (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) +static __always_inline size_t diffcmp2mask_neon(const pgno_t *const ptr, + const ptrdiff_t offset, + const uint32x4_t pattern) { + const uint32x4_t f = vld1q_u32(ptr); + const uint32x4_t l = vld1q_u32(ptr + offset); + const uint16x4_t cmp = vmovn_u32(vceqq_u32(vsubq_u32(f, l), pattern)); + if (sizeof(size_t) > 7) + return vget_lane_u64(vreinterpret_u64_u16(cmp), 0); + else + return vget_lane_u32(vreinterpret_u32_u8(vmovn_u16(vcombine_u16(cmp, cmp))), + 0); +} - /* LY: support for update (explicit overwrite) */ - if (flags & MDBX_CURRENT) { - rc = cursor_set(&cx.outer, (MDBX_val *)key, NULL, MDBX_SET).err; - if (likely(rc == MDBX_SUCCESS) && - (txn->mt_dbs[dbi].md_flags & MDBX_DUPSORT) && - (flags & MDBX_ALLDUPS) == 0) { - /* LY: allows update (explicit overwrite) only for unique keys */ - MDBX_node *node = page_node(cx.outer.mc_pg[cx.outer.mc_top], - cx.outer.mc_ki[cx.outer.mc_top]); - if (node_flags(node) & F_DUPDATA) { - tASSERT(txn, XCURSOR_INITED(&cx.outer) && - 
cx.outer.mc_xcursor->mx_db.md_entries > 1); - rc = MDBX_EMULTIVAL; - if ((flags & MDBX_NOOVERWRITE) == 0) { - flags -= MDBX_CURRENT; - rc = cursor_del(&cx.outer, MDBX_ALLDUPS); - } +__hot static pgno_t *scan4seq_neon(pgno_t *range, const size_t len, + const size_t seq) { + assert(seq > 0 && len > seq); +#if MDBX_PNL_ASCENDING +#error "FIXME: Not implemented" +#endif /* MDBX_PNL_ASCENDING */ + assert(range[-(ptrdiff_t)len] == len); + pgno_t *const detent = range - len + seq; + const ptrdiff_t offset = -(ptrdiff_t)seq; + const pgno_t target = (pgno_t)offset; + const uint32x4_t pattern = vmovq_n_u32(target); + size_t mask; + if (likely(len > seq + 3)) { + do { + mask = diffcmp2mask_neon(range - 3, offset, pattern); + if (mask) { +#if !defined(ENABLE_MEMCHECK) && !defined(__SANITIZE_ADDRESS__) + found: +#endif /* !ENABLE_MEMCHECK && !__SANITIZE_ADDRESS__ */ + return ptr_disp(range, -(__builtin_clzl(mask) >> sizeof(size_t) / 4)); } - } + range -= 4; + } while (range > detent + 3); + if (range == detent) + return nullptr; } - if (likely(rc == MDBX_SUCCESS)) - rc = cursor_put_checklen(&cx.outer, key, data, flags); - txn->mt_cursors[dbi] = cx.outer.mc_next; - - return rc; + /* Далее происходит чтение от 4 до 12 лишних байт, которые могут быть не + * только за пределами региона выделенного под PNL, но и пересекать границу + * страницы памяти. Что может приводить как к ошибкам ASAN, так и к падению. + * Поэтому проверяем смещение на странице, а с ASAN всегда страхуемся. 
*/ +#if !defined(ENABLE_MEMCHECK) && !defined(__SANITIZE_ADDRESS__) + const unsigned on_page_safe_mask = 0xff0 /* enough for '-15' bytes offset */; + if (likely(on_page_safe_mask & (uintptr_t)(range + offset)) && + !RUNNING_ON_VALGRIND) { + const unsigned extra = (unsigned)(detent + 4 - range); + assert(extra > 0 && extra < 4); + mask = (~(size_t)0) << (extra * sizeof(size_t) * 2); + mask &= diffcmp2mask_neon(range - 3, offset, pattern); + if (mask) + goto found; + return nullptr; + } +#endif /* !ENABLE_MEMCHECK && !__SANITIZE_ADDRESS__ */ + do + if (*range - range[offset] == target) + return range; + while (--range != detent); + return nullptr; } +#endif /* __ARM_NEON || __ARM_NEON__ */ -/**** COPYING *****************************************************************/ +#if defined(__AVX512BW__) && defined(MDBX_ATTRIBUTE_TARGET_AVX512BW) +#define scan4seq_default scan4seq_avx512bw +#define scan4seq_impl scan4seq_default +#elif defined(__AVX2__) && defined(MDBX_ATTRIBUTE_TARGET_AVX2) +#define scan4seq_default scan4seq_avx2 +#elif defined(__SSE2__) && defined(MDBX_ATTRIBUTE_TARGET_SSE2) +#define scan4seq_default scan4seq_sse2 +#elif (defined(__ARM_NEON) || defined(__ARM_NEON__)) && \ + (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) +#define scan4seq_default scan4seq_neon +/* Choosing of another variants should be added here. */ +#endif /* scan4seq_default */ -/* State needed for a double-buffering compacting copy. */ -typedef struct mdbx_compacting_ctx { - MDBX_env *mc_env; - MDBX_txn *mc_txn; - osal_condpair_t mc_condpair; - uint8_t *mc_wbuf[2]; - size_t mc_wlen[2]; - mdbx_filehandle_t mc_fd; - /* Error code. Never cleared if set. Both threads can set nonzero - * to fail the copy. Not mutex-protected, MDBX expects atomic int. */ - volatile int mc_error; - pgno_t mc_next_pgno; - volatile unsigned mc_head; - volatile unsigned mc_tail; -} mdbx_compacting_ctx; +#endif /* MDBX_PNL_ASCENDING */ -/* Dedicated writer thread for compacting copy. 
*/ -__cold static THREAD_RESULT THREAD_CALL compacting_write_thread(void *arg) { - mdbx_compacting_ctx *const ctx = arg; +#ifndef scan4seq_default +#define scan4seq_default scan4seq_fallback +#endif /* scan4seq_default */ -#if defined(EPIPE) && !(defined(_WIN32) || defined(_WIN64)) - sigset_t sigset; - sigemptyset(&sigset); - sigaddset(&sigset, SIGPIPE); - ctx->mc_error = pthread_sigmask(SIG_BLOCK, &sigset, NULL); -#endif /* EPIPE */ +#ifdef scan4seq_impl +/* The scan4seq_impl() is the best or no alternatives */ +#elif !MDBX_HAVE_BUILTIN_CPU_SUPPORTS +/* The scan4seq_default() will be used since no cpu-features detection support + * from compiler. Please don't ask to implement cpuid-based detection and don't + * make such PRs. */ +#define scan4seq_impl scan4seq_default +#else +/* Selecting the most appropriate implementation at runtime, + * depending on the available CPU features. */ +static pgno_t *scan4seq_resolver(pgno_t *range, const size_t len, + const size_t seq); +static pgno_t *(*scan4seq_impl)(pgno_t *range, const size_t len, + const size_t seq) = scan4seq_resolver; - osal_condpair_lock(&ctx->mc_condpair); - while (!ctx->mc_error) { - while (ctx->mc_tail == ctx->mc_head && !ctx->mc_error) { - int err = osal_condpair_wait(&ctx->mc_condpair, true); - if (err != MDBX_SUCCESS) { - ctx->mc_error = err; - goto bailout; - } - } - const unsigned toggle = ctx->mc_tail & 1; - size_t wsize = ctx->mc_wlen[toggle]; - if (wsize == 0) { - ctx->mc_tail += 1; - break /* EOF */; - } - ctx->mc_wlen[toggle] = 0; - uint8_t *ptr = ctx->mc_wbuf[toggle]; - if (!ctx->mc_error) { - int err = osal_write(ctx->mc_fd, ptr, wsize); - if (err != MDBX_SUCCESS) { -#if defined(EPIPE) && !(defined(_WIN32) || defined(_WIN64)) - if (err == EPIPE) { - /* Collect the pending SIGPIPE, - * otherwise at least OS X gives it to the process on thread-exit. 
*/ - int unused; - sigwait(&sigset, &unused); - } -#endif /* EPIPE */ - ctx->mc_error = err; - goto bailout; - } - } - ctx->mc_tail += 1; - osal_condpair_signal(&ctx->mc_condpair, false); - } -bailout: - osal_condpair_unlock(&ctx->mc_condpair); - return (THREAD_RESULT)0; +static pgno_t *scan4seq_resolver(pgno_t *range, const size_t len, + const size_t seq) { + pgno_t *(*choice)(pgno_t *range, const size_t len, const size_t seq) = + nullptr; +#if __has_builtin(__builtin_cpu_init) || defined(__BUILTIN_CPU_INIT__) || \ + __GNUC_PREREQ(4, 8) + __builtin_cpu_init(); +#endif /* __builtin_cpu_init() */ +#ifdef MDBX_ATTRIBUTE_TARGET_SSE2 + if (__builtin_cpu_supports("sse2")) + choice = scan4seq_sse2; +#endif /* MDBX_ATTRIBUTE_TARGET_SSE2 */ +#ifdef MDBX_ATTRIBUTE_TARGET_AVX2 + if (__builtin_cpu_supports("avx2")) + choice = scan4seq_avx2; +#endif /* MDBX_ATTRIBUTE_TARGET_AVX2 */ +#ifdef MDBX_ATTRIBUTE_TARGET_AVX512BW + if (__builtin_cpu_supports("avx512bw")) + choice = scan4seq_avx512bw; +#endif /* MDBX_ATTRIBUTE_TARGET_AVX512BW */ + /* Choosing of another variants should be added here. */ + scan4seq_impl = choice ? choice : scan4seq_default; + return scan4seq_impl(range, len, seq); } +#endif /* scan4seq_impl */ -/* Give buffer and/or MDBX_EOF to writer thread, await unused buffer. 
*/ -__cold static int compacting_toggle_write_buffers(mdbx_compacting_ctx *ctx) { - osal_condpair_lock(&ctx->mc_condpair); - eASSERT(ctx->mc_env, ctx->mc_head - ctx->mc_tail < 2 || ctx->mc_error); - ctx->mc_head += 1; - osal_condpair_signal(&ctx->mc_condpair, true); - while (!ctx->mc_error && - ctx->mc_head - ctx->mc_tail == 2 /* both buffers in use */) { - int err = osal_condpair_wait(&ctx->mc_condpair, false); - if (err != MDBX_SUCCESS) - ctx->mc_error = err; - } - osal_condpair_unlock(&ctx->mc_condpair); - return ctx->mc_error; -} +/*----------------------------------------------------------------------------*/ -__cold static int compacting_walk_sdb(mdbx_compacting_ctx *ctx, MDBX_db *sdb); +#define ALLOC_COALESCE 4 /* внутреннее состояние */ +#define ALLOC_SHOULD_SCAN 8 /* внутреннее состояние */ +#define ALLOC_LIFO 16 /* внутреннее состояние */ -static int compacting_put_bytes(mdbx_compacting_ctx *ctx, const void *src, - size_t bytes, pgno_t pgno, pgno_t npages) { - assert(pgno == 0 || bytes > PAGEHDRSZ); - while (bytes > 0) { - const size_t side = ctx->mc_head & 1; - const size_t left = MDBX_ENVCOPY_WRITEBUF - ctx->mc_wlen[side]; - if (left < (pgno ? PAGEHDRSZ : 1)) { - int err = compacting_toggle_write_buffers(ctx); - if (unlikely(err != MDBX_SUCCESS)) - return err; - continue; - } - const size_t chunk = (bytes < left) ? 
bytes : left; - void *const dst = ctx->mc_wbuf[side] + ctx->mc_wlen[side]; - if (src) { - memcpy(dst, src, chunk); - if (pgno) { - assert(chunk > PAGEHDRSZ); - MDBX_page *mp = dst; - mp->mp_pgno = pgno; - if (mp->mp_txnid == 0) - mp->mp_txnid = ctx->mc_txn->mt_txnid; - if (mp->mp_flags == P_OVERFLOW) { - assert(bytes <= pgno2bytes(ctx->mc_env, npages)); - mp->mp_pages = npages; - } - pgno = 0; - } - src = ptr_disp(src, chunk); - } else - memset(dst, 0, chunk); - bytes -= chunk; - ctx->mc_wlen[side] += chunk; - } - return MDBX_SUCCESS; -} +static inline bool is_gc_usable(MDBX_txn *txn, const MDBX_cursor *mc, + const uint8_t flags) { + /* If txn is updating the GC, then the retired-list cannot play catch-up with + * itself by growing while trying to save it. */ + if (mc->tree == &txn->dbs[FREE_DBI] && !(flags & ALLOC_RESERVE) && + !(mc->flags & z_gcu_preparation)) + return false; -static int compacting_put_page(mdbx_compacting_ctx *ctx, const MDBX_page *mp, - const size_t head_bytes, const size_t tail_bytes, - const pgno_t npages) { - if (tail_bytes) { - assert(head_bytes + tail_bytes <= ctx->mc_env->me_psize); - assert(npages == 1 && - (PAGETYPE_WHOLE(mp) == P_BRANCH || PAGETYPE_WHOLE(mp) == P_LEAF)); - } else { - assert(head_bytes <= pgno2bytes(ctx->mc_env, npages)); - assert((npages == 1 && PAGETYPE_WHOLE(mp) == (P_LEAF | P_LEAF2)) || - PAGETYPE_WHOLE(mp) == P_OVERFLOW); + /* avoid search inside empty tree and while tree is updating, + https://libmdbx.dqdkfa.ru/dead-github/issues/31 */ + if (unlikely(txn->dbs[FREE_DBI].items == 0)) { + txn->flags |= txn_gc_drained; + return false; } - const pgno_t pgno = ctx->mc_next_pgno; - ctx->mc_next_pgno += npages; - int err = compacting_put_bytes(ctx, mp, head_bytes, pgno, npages); - if (unlikely(err != MDBX_SUCCESS)) - return err; - err = compacting_put_bytes( - ctx, nullptr, pgno2bytes(ctx->mc_env, npages) - (head_bytes + tail_bytes), - 0, 0); - if (unlikely(err != MDBX_SUCCESS)) - return err; - return 
compacting_put_bytes( - ctx, ptr_disp(mp, ctx->mc_env->me_psize - tail_bytes), tail_bytes, 0, 0); + return true; } -__cold static int compacting_walk_tree(mdbx_compacting_ctx *ctx, - MDBX_cursor *mc, pgno_t *root, - txnid_t parent_txnid) { - mc->mc_snum = 1; - int rc = page_get(mc, *root, &mc->mc_pg[0], parent_txnid); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; - - rc = page_search_root(mc, nullptr, MDBX_PS_FIRST); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; - - /* Make cursor pages writable */ - void *const buf = osal_malloc(pgno2bytes(ctx->mc_env, mc->mc_snum)); - if (buf == NULL) - return MDBX_ENOMEM; +__hot static bool is_already_reclaimed(const MDBX_txn *txn, txnid_t id) { + const size_t len = MDBX_PNL_GETSIZE(txn->tw.gc.reclaimed); + for (size_t i = 1; i <= len; ++i) + if (txn->tw.gc.reclaimed[i] == id) + return true; + return false; +} - void *ptr = buf; - for (size_t i = 0; i < mc->mc_top; i++) { - page_copy(ptr, mc->mc_pg[i], ctx->mc_env->me_psize); - mc->mc_pg[i] = ptr; - ptr = ptr_disp(ptr, ctx->mc_env->me_psize); - } - /* This is writable space for a leaf page. Usually not needed. */ - MDBX_page *const leaf = ptr; - - while (mc->mc_snum > 0) { - MDBX_page *mp = mc->mc_pg[mc->mc_top]; - size_t n = page_numkeys(mp); - - if (IS_LEAF(mp)) { - if (!(mc->mc_flags & - C_SUB) /* may have nested F_SUBDATA or F_BIGDATA nodes */) { - for (size_t i = 0; i < n; i++) { - MDBX_node *node = page_node(mp, i); - if (node_flags(node) == F_BIGDATA) { - /* Need writable leaf */ - if (mp != leaf) { - mc->mc_pg[mc->mc_top] = leaf; - page_copy(leaf, mp, ctx->mc_env->me_psize); - mp = leaf; - node = page_node(mp, i); - } +__hot static pgno_t relist_get_single(MDBX_txn *txn) { + const size_t len = MDBX_PNL_GETSIZE(txn->tw.relist); + assert(len > 0); + pgno_t *target = MDBX_PNL_EDGE(txn->tw.relist); + const ptrdiff_t dir = MDBX_PNL_ASCENDING ? 
1 : -1; - const pgr_t lp = - page_get_large(mc, node_largedata_pgno(node), mp->mp_txnid); - if (unlikely((rc = lp.err) != MDBX_SUCCESS)) - goto done; - const size_t datasize = node_ds(node); - const pgno_t npages = number_of_ovpages(ctx->mc_env, datasize); - poke_pgno(node_data(node), ctx->mc_next_pgno); - rc = compacting_put_page(ctx, lp.page, PAGEHDRSZ + datasize, 0, - npages); - if (unlikely(rc != MDBX_SUCCESS)) - goto done; - } else if (node_flags(node) & F_SUBDATA) { - if (!MDBX_DISABLE_VALIDATION && - unlikely(node_ds(node) != sizeof(MDBX_db))) { - ERROR("%s/%d: %s %u", "MDBX_CORRUPTED", MDBX_CORRUPTED, - "invalid dupsort sub-tree node size", - (unsigned)node_ds(node)); - rc = MDBX_CORRUPTED; - goto done; - } + /* Есть ТРИ потенциально выигрышные, но противо-направленные тактики: + * + * 1. Стараться использовать страницы с наименьшими номерами. Так обмен с + * диском будет более кучным, а у страниц ближе к концу БД будет больше шансов + * попасть под авто-компактификацию. Частично эта тактика уже реализована, но + * для её эффективности требуется явно приоритезировать выделение страниц: + * - поддерживать для relist, для ближних и для дальних страниц; + * - использовать страницы из дальнего списка, если первый пуст, + * а второй слишком большой, либо при пустой GC. + * + * 2. Стараться выделять страницы последовательно. Так записываемые на диск + * регионы будут линейными, что принципиально ускоряет запись на HDD. + * Одновременно, в среднем это не повлияет на чтение, точнее говоря, если + * порядок чтения не совпадает с порядком изменения (иначе говоря, если + * чтение не коррклирует с обновлениями и/или вставками) то не повлияет, иначе + * может ускорить. Однако, последовательности в среднем достаточно редки. + * Поэтому для эффективности требуется аккумулировать и поддерживать в ОЗУ + * огромные списки страниц, а затем сохранять их обратно в БД. Текущий формат + * БД (без битовых карт) для этого крайне не удачен. 
Поэтому эта тактика не + * имеет шансов быть успешной без смены формата БД (Mithril). + * + * 3. Стараться экономить последовательности страниц. Это позволяет избегать + * лишнего чтения/поиска в GC при более-менее постоянном размещении и/или + * обновлении данных требующих более одной страницы. Проблема в том, что без + * информации от приложения библиотека не может знать насколько + * востребованными будут последовательности в ближайшей перспективе, а + * экономия последовательностей "на всякий случай" не только затратна + * сама-по-себе, но и работает во вред. + * + * Поэтому: + * - в TODO добавляется разделение relist на «ближние» и «дальние» страницы, + * с последующей реализацией первой тактики; + * - преимущественное использование последовательностей отправляется + * в MithrilDB как составляющая "HDD frendly" feature; + * - реализованная в 3757eb72f7c6b46862f8f17881ac88e8cecc1979 экономия + * последовательностей отключается через MDBX_ENABLE_SAVING_SEQUENCES=0. + * + * В качестве альтернативы для безусловной «экономии» последовательностей, + * в следующих версиях libmdbx, вероятно, будет предложено + * API для взаимодействия с GC: + * - получение размера GC, включая гистограммы размеров последовательностей + * и близости к концу БД; + * - включение формирования "линейного запаса" для последующего использования + * в рамках текущей транзакции; + * - намеренная загрузка GC в память для коагуляции и "выпрямления"; + * - намеренное копирование данных из страниц в конце БД для последующего + * из освобождения, т.е. контролируемая компактификация по запросу. 
*/ - /* Need writable leaf */ - if (mp != leaf) { - mc->mc_pg[mc->mc_top] = leaf; - page_copy(leaf, mp, ctx->mc_env->me_psize); - mp = leaf; - node = page_node(mp, i); - } +#ifndef MDBX_ENABLE_SAVING_SEQUENCES +#define MDBX_ENABLE_SAVING_SEQUENCES 0 +#endif + if (MDBX_ENABLE_SAVING_SEQUENCES && unlikely(target[dir] == *target + 1) && + len > 2) { + /* Пытаемся пропускать последовательности при наличии одиночных элементов. + * TODO: необходимо кэшировать пропускаемые последовательности + * чтобы не сканировать список сначала при каждом выделении. */ + pgno_t *scan = target + dir + dir; + size_t left = len; + do { + if (likely(scan[-dir] != *scan - 1 && *scan + 1 != scan[dir])) { +#if MDBX_PNL_ASCENDING + target = scan; + break; +#else + /* вырезаем элемент с перемещением хвоста */ + const pgno_t pgno = *scan; + MDBX_PNL_SETSIZE(txn->tw.relist, len - 1); + while (++scan <= target) + scan[-1] = *scan; + return pgno; +#endif + } + scan += dir; + } while (--left > 2); + } - MDBX_db *nested = nullptr; - if (node_flags(node) & F_DUPDATA) { - rc = cursor_xinit1(mc, node, mp); - if (likely(rc == MDBX_SUCCESS)) { - nested = &mc->mc_xcursor->mx_db; - rc = compacting_walk_tree(ctx, &mc->mc_xcursor->mx_cursor, - &nested->md_root, mp->mp_txnid); - } - } else { - cASSERT(mc, (mc->mc_flags & C_SUB) == 0 && mc->mc_xcursor == 0); - MDBX_cursor_couple *couple = - container_of(mc, MDBX_cursor_couple, outer); - cASSERT(mc, - couple->inner.mx_cursor.mc_signature == ~MDBX_MC_LIVE && - !couple->inner.mx_cursor.mc_flags && - !couple->inner.mx_cursor.mc_db && - !couple->inner.mx_cursor.mc_dbx); - nested = &couple->inner.mx_db; - memcpy(nested, node_data(node), sizeof(MDBX_db)); - rc = compacting_walk_sdb(ctx, nested); + const pgno_t pgno = *target; +#if MDBX_PNL_ASCENDING + /* вырезаем элемент с перемещением хвоста */ + MDBX_PNL_SETSIZE(txn->tw.relist, len - 1); + for (const pgno_t *const end = txn->tw.relist + len - 1; target <= end; + ++target) + *target = target[1]; +#else + /* 
перемещать хвост не нужно, просто усекам список */ + MDBX_PNL_SETSIZE(txn->tw.relist, len - 1); +#endif + return pgno; +} + +__hot static pgno_t relist_get_sequence(MDBX_txn *txn, const size_t num, + uint8_t flags) { + const size_t len = MDBX_PNL_GETSIZE(txn->tw.relist); + pgno_t *edge = MDBX_PNL_EDGE(txn->tw.relist); + assert(len >= num && num > 1); + const size_t seq = num - 1; +#if !MDBX_PNL_ASCENDING + if (edge[-(ptrdiff_t)seq] - *edge == seq) { + if (unlikely(flags & ALLOC_RESERVE)) + return P_INVALID; + assert(edge == scan4range_checker(txn->tw.relist, seq)); + /* перемещать хвост не нужно, просто усекам список */ + MDBX_PNL_SETSIZE(txn->tw.relist, len - num); + return *edge; + } +#endif + pgno_t *target = scan4seq_impl(edge, len, seq); + assert(target == scan4range_checker(txn->tw.relist, seq)); + if (target) { + if (unlikely(flags & ALLOC_RESERVE)) + return P_INVALID; + const pgno_t pgno = *target; + /* вырезаем найденную последовательность с перемещением хвоста */ + MDBX_PNL_SETSIZE(txn->tw.relist, len - num); +#if MDBX_PNL_ASCENDING + for (const pgno_t *const end = txn->tw.relist + len - num; target <= end; + ++target) + *target = target[num]; +#else + for (const pgno_t *const end = txn->tw.relist + len; ++target <= end;) + target[-(ptrdiff_t)num] = *target; +#endif + return pgno; + } + return 0; +} + +static inline pgr_t page_alloc_finalize(MDBX_env *const env, + MDBX_txn *const txn, + const MDBX_cursor *const mc, + const pgno_t pgno, const size_t num) { +#if MDBX_ENABLE_PROFGC + size_t majflt_before; + const uint64_t cputime_before = osal_cputime(&majflt_before); + gc_prof_stat_t *const prof = (mc->mc_dbi == FREE_DBI) + ? 
&env->lck->pgops.gc_prof.self + : &env->lck->pgops.gc_prof.work; +#else + (void)mc; +#endif /* MDBX_ENABLE_PROFGC */ + ENSURE(env, pgno >= NUM_METAS); + + pgr_t ret; + bool need_clean = (env->flags & MDBX_PAGEPERTURB) != 0; + if (env->flags & MDBX_WRITEMAP) { + ret.page = pgno2page(env, pgno); + MDBX_ASAN_UNPOISON_MEMORY_REGION(ret.page, pgno2bytes(env, num)); + VALGRIND_MAKE_MEM_UNDEFINED(ret.page, pgno2bytes(env, num)); + + /* Содержимое выделенной страницы не нужно, но если страница отсутствует + * в ОЗУ (что весьма вероятно), то любое обращение к ней приведет + * к page-fault: + * - прерыванию по отсутствию страницы; + * - переключение контекста в режим ядра с засыпанием процесса; + * - чтение страницы с диска; + * - обновление PTE и пробуждением процесса; + * - переключение контекста по доступности ЦПУ. + * + * Пытаемся минимизировать накладные расходы записывая страницу, что при + * наличии unified page cache приведет к появлению страницы в ОЗУ без чтения + * с диска. При этом запись на диск должна быть отложена адекватным ядром, + * так как страница отображена в память в режиме чтения-записи и следом в + * неё пишет ЦПУ. */ + + /* В случае если страница в памяти процесса, то излишняя запись может быть + * достаточно дорогой. Кроме системного вызова и копирования данных, в особо + * одаренных ОС при этом могут включаться файловая система, выделяться + * временная страница, пополняться очереди асинхронного выполнения, + * обновляться PTE с последующей генерацией page-fault и чтением данных из + * грязной I/O очереди. Из-за этого штраф за лишнюю запись может быть + * сравним с избегаемым ненужным чтением. */ + if (env->prefault_write_activated) { + void *const pattern = + ptr_disp(env->page_auxbuf, need_clean ? 
env->ps : env->ps * 2); + size_t file_offset = pgno2bytes(env, pgno); + if (likely(num == 1)) { + if (!mincore_probe(env, pgno)) { + osal_pwrite(env->lazy_fd, pattern, env->ps, file_offset); +#if MDBX_ENABLE_PGOP_STAT + env->lck->pgops.prefault.weak += 1; +#endif /* MDBX_ENABLE_PGOP_STAT */ + need_clean = false; + } + } else { + struct iovec iov[MDBX_AUXILARY_IOV_MAX]; + size_t n = 0, cleared = 0; + for (size_t i = 0; i < num; ++i) { + if (!mincore_probe(env, pgno + (pgno_t)i)) { + ++cleared; + iov[n].iov_len = env->ps; + iov[n].iov_base = pattern; + if (unlikely(++n == MDBX_AUXILARY_IOV_MAX)) { + osal_pwritev(env->lazy_fd, iov, MDBX_AUXILARY_IOV_MAX, + file_offset); +#if MDBX_ENABLE_PGOP_STAT + env->lck->pgops.prefault.weak += 1; +#endif /* MDBX_ENABLE_PGOP_STAT */ + file_offset += pgno2bytes(env, MDBX_AUXILARY_IOV_MAX); + n = 0; } - if (unlikely(rc != MDBX_SUCCESS)) - goto done; - memcpy(node_data(node), nested, sizeof(MDBX_db)); } } - } - } else { - mc->mc_ki[mc->mc_top]++; - if (mc->mc_ki[mc->mc_top] < n) { - while (1) { - const MDBX_node *node = page_node(mp, mc->mc_ki[mc->mc_top]); - rc = page_get(mc, node_pgno(node), &mp, mp->mp_txnid); - if (unlikely(rc != MDBX_SUCCESS)) - goto done; - mc->mc_top++; - mc->mc_snum++; - mc->mc_ki[mc->mc_top] = 0; - if (!IS_BRANCH(mp)) { - mc->mc_pg[mc->mc_top] = mp; - break; - } - /* Whenever we advance to a sibling branch page, - * we must proceed all the way down to its first leaf. 
*/ - page_copy(mc->mc_pg[mc->mc_top], mp, ctx->mc_env->me_psize); + if (likely(n > 0)) { + osal_pwritev(env->lazy_fd, iov, n, file_offset); +#if MDBX_ENABLE_PGOP_STAT + env->lck->pgops.prefault.weak += 1; +#endif /* MDBX_ENABLE_PGOP_STAT */ } - continue; + if (cleared == num) + need_clean = false; } } - - const pgno_t pgno = ctx->mc_next_pgno; - if (likely(!IS_LEAF2(mp))) { - rc = compacting_put_page( - ctx, mp, PAGEHDRSZ + mp->mp_lower, - ctx->mc_env->me_psize - (PAGEHDRSZ + mp->mp_upper), 1); - } else { - rc = compacting_put_page( - ctx, mp, PAGEHDRSZ + page_numkeys(mp) * mp->mp_leaf2_ksize, 0, 1); - } - if (unlikely(rc != MDBX_SUCCESS)) - goto done; - - if (mc->mc_top) { - /* Update parent if there is one */ - node_set_pgno( - page_node(mc->mc_pg[mc->mc_top - 1], mc->mc_ki[mc->mc_top - 1]), - pgno); - cursor_pop(mc); - } else { - /* Otherwise we're done */ - *root = pgno; - break; + } else { + ret.page = page_shadow_alloc(txn, num); + if (unlikely(!ret.page)) { + ret.err = MDBX_ENOMEM; + goto bailout; } } -done: - osal_free(buf); - return rc; -} -__cold static int compacting_walk_sdb(mdbx_compacting_ctx *ctx, MDBX_db *sdb) { - if (unlikely(sdb->md_root == P_INVALID)) - return MDBX_SUCCESS; /* empty db */ + if (unlikely(need_clean)) + memset(ret.page, -1, pgno2bytes(env, num)); - MDBX_cursor_couple couple; - memset(&couple, 0, sizeof(couple)); - couple.inner.mx_cursor.mc_signature = ~MDBX_MC_LIVE; - MDBX_dbx dbx = {.md_klen_min = INT_MAX}; - uint8_t dbi_state = DBI_LINDO | DBI_VALID; - int rc = couple_init(&couple, ~0u, ctx->mc_txn, sdb, &dbx, &dbi_state); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; + VALGRIND_MAKE_MEM_UNDEFINED(ret.page, pgno2bytes(env, num)); + ret.page->pgno = pgno; + ret.page->dupfix_ksize = 0; + ret.page->flags = 0; + if ((ASSERT_ENABLED() || AUDIT_ENABLED()) && num > 1) { + ret.page->pages = (pgno_t)num; + ret.page->flags = P_LARGE; + } - couple.outer.mc_checking |= CC_SKIPORD | CC_PAGECHECK; - couple.inner.mx_cursor.mc_checking |= 
CC_SKIPORD | CC_PAGECHECK; - if (!sdb->md_mod_txnid) - sdb->md_mod_txnid = ctx->mc_txn->mt_txnid; - return compacting_walk_tree(ctx, &couple.outer, &sdb->md_root, - sdb->md_mod_txnid); + ret.err = page_dirty(txn, ret.page, (pgno_t)num); +bailout: + tASSERT(txn, pnl_check_allocated(txn->tw.relist, txn->geo.first_unallocated - + MDBX_ENABLE_REFUND)); +#if MDBX_ENABLE_PROFGC + size_t majflt_after; + prof->xtime_cpu += osal_cputime(&majflt_after) - cputime_before; + prof->majflt += (uint32_t)(majflt_after - majflt_before); +#endif /* MDBX_ENABLE_PROFGC */ + return ret; } -__cold static void compacting_fixup_meta(MDBX_env *env, MDBX_meta *meta) { - eASSERT(env, meta->mm_dbs[FREE_DBI].md_mod_txnid || - meta->mm_dbs[FREE_DBI].md_root == P_INVALID); - eASSERT(env, meta->mm_dbs[MAIN_DBI].md_mod_txnid || - meta->mm_dbs[MAIN_DBI].md_root == P_INVALID); - - /* Calculate filesize taking in account shrink/growing thresholds */ - if (meta->mm_geo.next != meta->mm_geo.now) { - meta->mm_geo.now = meta->mm_geo.next; - const size_t aligner = pv2pages( - meta->mm_geo.grow_pv ? meta->mm_geo.grow_pv : meta->mm_geo.shrink_pv); - if (aligner) { - const pgno_t aligned = pgno_align2os_pgno( - env, meta->mm_geo.next + aligner - meta->mm_geo.next % aligner); - meta->mm_geo.now = aligned; - } - } +pgr_t gc_alloc_ex(const MDBX_cursor *const mc, const size_t num, + uint8_t flags) { + pgr_t ret; + MDBX_txn *const txn = mc->txn; + MDBX_env *const env = txn->env; +#if MDBX_ENABLE_PROFGC + gc_prof_stat_t *const prof = (mc->mc_dbi == FREE_DBI) + ? 
&env->lck->pgops.gc_prof.self + : &env->lck->pgops.gc_prof.work; + prof->spe_counter += 1; +#endif /* MDBX_ENABLE_PROFGC */ - if (meta->mm_geo.now < meta->mm_geo.lower) - meta->mm_geo.now = meta->mm_geo.lower; - if (meta->mm_geo.now > meta->mm_geo.upper) - meta->mm_geo.now = meta->mm_geo.upper; + eASSERT(env, num > 0 || (flags & ALLOC_RESERVE)); + eASSERT(env, pnl_check_allocated(txn->tw.relist, txn->geo.first_unallocated - + MDBX_ENABLE_REFUND)); - /* Update signature */ - assert(meta->mm_geo.now >= meta->mm_geo.next); - unaligned_poke_u64(4, meta->mm_sign, meta_sign(meta)); -} + size_t newnext; + const uint64_t monotime_begin = + (MDBX_ENABLE_PROFGC || (num > 1 && env->options.gc_time_limit)) + ? osal_monotime() + : 0; + struct monotime_cache now_cache; + now_cache.expire_countdown = + 1 /* старт с 1 позволяет избавиться как от лишних системных вызовов когда + лимит времени задан нулевой или уже исчерпан, так и от подсчета + времени при не-достижении rp_augment_limit */ + ; + now_cache.value = monotime_begin; + pgno_t pgno = 0; + if (num > 1) { +#if MDBX_ENABLE_PROFGC + prof->xpages += 1; +#endif /* MDBX_ENABLE_PROFGC */ + if (MDBX_PNL_GETSIZE(txn->tw.relist) >= num) { + eASSERT(env, + MDBX_PNL_LAST(txn->tw.relist) < txn->geo.first_unallocated && + MDBX_PNL_FIRST(txn->tw.relist) < txn->geo.first_unallocated); + pgno = relist_get_sequence(txn, num, flags); + if (likely(pgno)) + goto done; + } + } else { + eASSERT(env, num == 0 || MDBX_PNL_GETSIZE(txn->tw.relist) == 0); + eASSERT(env, !(flags & ALLOC_RESERVE) || num == 0); + } -/* Make resizable */ -__cold static void meta_make_sizeable(MDBX_meta *meta) { - meta->mm_geo.lower = MIN_PAGENO; - if (meta->mm_geo.grow_pv == 0) { - const pgno_t step = 1 + (meta->mm_geo.upper - meta->mm_geo.lower) / 42; - meta->mm_geo.grow_pv = pages2pv(step); + //--------------------------------------------------------------------------- + + if (unlikely(!is_gc_usable(txn, mc, flags))) { + eASSERT(env, (txn->flags & txn_gc_drained) || 
num > 1); + goto no_gc; } - if (meta->mm_geo.shrink_pv == 0) { - const pgno_t step = pv2pages(meta->mm_geo.grow_pv) << 1; - meta->mm_geo.shrink_pv = pages2pv(step); + + eASSERT(env, + (flags & (ALLOC_COALESCE | ALLOC_LIFO | ALLOC_SHOULD_SCAN)) == 0); + flags += (env->flags & MDBX_LIFORECLAIM) ? ALLOC_LIFO : 0; + + if (/* Не коагулируем записи при подготовке резерва для обновления GC. + * Иначе попытка увеличить резерв может приводить к необходимости ещё + * большего резерва из-за увеличения списка переработанных страниц. */ + (flags & ALLOC_RESERVE) == 0) { + if (txn->dbs[FREE_DBI].branch_pages && + MDBX_PNL_GETSIZE(txn->tw.relist) < env->maxgc_large1page / 2) + flags += ALLOC_COALESCE; } -} -/* Copy environment with compaction. */ -__cold static int env_compact(MDBX_env *env, MDBX_txn *read_txn, - mdbx_filehandle_t fd, uint8_t *buffer, - const bool dest_is_pipe, - const MDBX_copy_flags_t flags) { - const size_t meta_bytes = pgno2bytes(env, NUM_METAS); - uint8_t *const data_buffer = - buffer + ceil_powerof2(meta_bytes, env->me_os_psize); - MDBX_meta *const meta = init_metas(env, buffer); - meta_set_txnid(env, meta, read_txn->mt_txnid); + MDBX_cursor *const gc = ptr_disp(env->basal_txn, sizeof(MDBX_txn)); + eASSERT(env, mc != gc && gc->next == gc); + gc->txn = txn; + gc->dbi_state = txn->dbi_state; + gc->top_and_flags = z_fresh_mark; - if (flags & MDBX_CP_FORCE_DYNAMIC_SIZE) - meta_make_sizeable(meta); + env->prefault_write_activated = env->options.prefault_write; + if (env->prefault_write_activated) { + /* Проверка посредством minicore() существенно снижает затраты, но в + * простейших случаях (тривиальный бенчмарк) интегральная производительность + * становится вдвое меньше. А на платформах без mincore() и с проблемной + * подсистемой виртуальной памяти ситуация может быть многократно хуже. + * Поэтому избегаем затрат в ситуациях когда prefault-write скорее всего не + * нужна. 
*/ + const bool readahead_enabled = env->lck->readahead_anchor & 1; + const pgno_t readahead_edge = env->lck->readahead_anchor >> 1; + if (/* Не суетимся если GC почти пустая и БД маленькая */ + (txn->dbs[FREE_DBI].branch_pages == 0 && txn->geo.now < 1234) || + /* Не суетимся если страница в зоне включенного упреждающего чтения */ + (readahead_enabled && pgno + num < readahead_edge)) + env->prefault_write_activated = false; + } - /* copy canary sequences if present */ - if (read_txn->mt_canary.v) { - meta->mm_canary = read_txn->mt_canary; - meta->mm_canary.v = constmeta_txnid(meta); +retry_gc_refresh_oldest:; + txnid_t oldest = txn_snapshot_oldest(txn); +retry_gc_have_oldest: + if (unlikely(oldest >= txn->txnid)) { + ERROR("unexpected/invalid oldest-readed txnid %" PRIaTXN + " for current-txnid %" PRIaTXN, + oldest, txn->txnid); + ret.err = MDBX_PROBLEM; + goto fail; } + const txnid_t detent = oldest + 1; - if (read_txn->mt_dbs[MAIN_DBI].md_root == P_INVALID) { - /* When the DB is empty, handle it specially to - * fix any breakage like page leaks from ITS#8174. */ - meta->mm_dbs[MAIN_DBI].md_flags = read_txn->mt_dbs[MAIN_DBI].md_flags; - compacting_fixup_meta(env, meta); - if (dest_is_pipe) { - int rc = osal_write(fd, buffer, meta_bytes); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; - } - } else { - /* Count free pages + GC pages. 
*/ - MDBX_cursor_couple couple; - int rc = cursor_init(&couple.outer, read_txn, FREE_DBI); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; - pgno_t gc = read_txn->mt_dbs[FREE_DBI].md_branch_pages + - read_txn->mt_dbs[FREE_DBI].md_leaf_pages + - read_txn->mt_dbs[FREE_DBI].md_overflow_pages; - MDBX_val key, data; - while ((rc = cursor_get(&couple.outer, &key, &data, MDBX_NEXT)) == - MDBX_SUCCESS) { - const MDBX_PNL pnl = data.iov_base; - if (unlikely(data.iov_len % sizeof(pgno_t) || - data.iov_len < MDBX_PNL_SIZEOF(pnl))) { - ERROR("%s/%d: %s %zu", "MDBX_CORRUPTED", MDBX_CORRUPTED, - "invalid GC-record length", data.iov_len); - return MDBX_CORRUPTED; - } - if (unlikely(!pnl_check(pnl, read_txn->mt_next_pgno))) { - ERROR("%s/%d: %s", "MDBX_CORRUPTED", MDBX_CORRUPTED, - "invalid GC-record content"); - return MDBX_CORRUPTED; + txnid_t id = 0; + MDBX_cursor_op op = MDBX_FIRST; + if (flags & ALLOC_LIFO) { + if (!txn->tw.gc.reclaimed) { + txn->tw.gc.reclaimed = txl_alloc(); + if (unlikely(!txn->tw.gc.reclaimed)) { + ret.err = MDBX_ENOMEM; + goto fail; } - gc += MDBX_PNL_GETSIZE(pnl); } - if (unlikely(rc != MDBX_NOTFOUND)) - return rc; + /* Begin lookup backward from oldest reader */ + id = detent - 1; + op = MDBX_SET_RANGE; + } else if (txn->tw.gc.last_reclaimed) { + /* Continue lookup forward from last-reclaimed */ + id = txn->tw.gc.last_reclaimed + 1; + if (id >= detent) + goto depleted_gc; + op = MDBX_SET_RANGE; + } - /* Substract GC-pages from mt_next_pgno to find the new mt_next_pgno. 
*/ - meta->mm_geo.next = read_txn->mt_next_pgno - gc; - /* Set with current main DB */ - meta->mm_dbs[MAIN_DBI] = read_txn->mt_dbs[MAIN_DBI]; +next_gc:; + MDBX_val key; + key.iov_base = &id; + key.iov_len = sizeof(id); - mdbx_compacting_ctx ctx; - memset(&ctx, 0, sizeof(ctx)); - rc = osal_condpair_init(&ctx.mc_condpair); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; +#if MDBX_ENABLE_PROFGC + prof->rsteps += 1; +#endif /* MDBX_ENABLE_PROFGC */ - memset(data_buffer, 0, 2 * (size_t)MDBX_ENVCOPY_WRITEBUF); - ctx.mc_wbuf[0] = data_buffer; - ctx.mc_wbuf[1] = data_buffer + (size_t)MDBX_ENVCOPY_WRITEBUF; - ctx.mc_next_pgno = NUM_METAS; - ctx.mc_env = env; - ctx.mc_fd = fd; - ctx.mc_txn = read_txn; + /* Seek first/next GC record */ + ret.err = cursor_ops(gc, &key, nullptr, op); + if (unlikely(ret.err != MDBX_SUCCESS)) { + if (unlikely(ret.err != MDBX_NOTFOUND)) + goto fail; + if ((flags & ALLOC_LIFO) && op == MDBX_SET_RANGE) { + op = MDBX_PREV; + goto next_gc; + } + goto depleted_gc; + } + if (unlikely(key.iov_len != sizeof(txnid_t))) { + ERROR("%s/%d: %s", "MDBX_CORRUPTED", MDBX_CORRUPTED, + "invalid GC key-length"); + ret.err = MDBX_CORRUPTED; + goto fail; + } + id = unaligned_peek_u64(4, key.iov_base); + if (flags & ALLOC_LIFO) { + op = MDBX_PREV; + if (id >= detent || is_already_reclaimed(txn, id)) + goto next_gc; + } else { + op = MDBX_NEXT; + if (unlikely(id >= detent)) + goto depleted_gc; + } + txn->flags &= ~txn_gc_drained; - osal_thread_t thread; - int thread_err = osal_thread_create(&thread, compacting_write_thread, &ctx); - if (likely(thread_err == MDBX_SUCCESS)) { - if (dest_is_pipe) { - if (!meta->mm_dbs[MAIN_DBI].md_mod_txnid) - meta->mm_dbs[MAIN_DBI].md_mod_txnid = read_txn->mt_txnid; - compacting_fixup_meta(env, meta); - rc = osal_write(fd, buffer, meta_bytes); - } - if (likely(rc == MDBX_SUCCESS)) - rc = compacting_walk_sdb(&ctx, &meta->mm_dbs[MAIN_DBI]); - if (ctx.mc_wlen[ctx.mc_head & 1]) - /* toggle to flush non-empty buffers */ - 
compacting_toggle_write_buffers(&ctx); + /* Reading next GC record */ + MDBX_val data; + page_t *const mp = gc->pg[gc->top]; + if (unlikely((ret.err = node_read(gc, page_node(mp, gc->ki[gc->top]), &data, + mp)) != MDBX_SUCCESS)) + goto fail; - if (likely(rc == MDBX_SUCCESS) && - unlikely(meta->mm_geo.next != ctx.mc_next_pgno)) { - if (ctx.mc_next_pgno > meta->mm_geo.next) { - ERROR("the source DB %s: post-compactification used pages %" PRIaPGNO - " %c expected %" PRIaPGNO, - "has double-used pages or other corruption", ctx.mc_next_pgno, - '>', meta->mm_geo.next); - rc = MDBX_CORRUPTED; /* corrupted DB */ - } - if (ctx.mc_next_pgno < meta->mm_geo.next) { - WARNING( - "the source DB %s: post-compactification used pages %" PRIaPGNO - " %c expected %" PRIaPGNO, - "has page leak(s)", ctx.mc_next_pgno, '<', meta->mm_geo.next); - if (dest_is_pipe) - /* the root within already written meta-pages is wrong */ - rc = MDBX_CORRUPTED; + pgno_t *gc_pnl = (pgno_t *)data.iov_base; + if (unlikely(data.iov_len % sizeof(pgno_t) || + data.iov_len < MDBX_PNL_SIZEOF(gc_pnl) || + !pnl_check(gc_pnl, txn->geo.first_unallocated))) { + ERROR("%s/%d: %s", "MDBX_CORRUPTED", MDBX_CORRUPTED, + "invalid GC value-length"); + ret.err = MDBX_CORRUPTED; + goto fail; + } + + const size_t gc_len = MDBX_PNL_GETSIZE(gc_pnl); + TRACE("gc-read: id #%" PRIaTXN " len %zu, re-list will %zu ", id, gc_len, + gc_len + MDBX_PNL_GETSIZE(txn->tw.relist)); + + if (unlikely(gc_len + MDBX_PNL_GETSIZE(txn->tw.relist) >= + env->maxgc_large1page)) { + /* Don't try to coalesce too much. 
*/ + if (flags & ALLOC_SHOULD_SCAN) { + eASSERT(env, flags & ALLOC_COALESCE); + eASSERT(env, !(flags & ALLOC_RESERVE)); + eASSERT(env, num > 0); +#if MDBX_ENABLE_PROFGC + env->lck->pgops.gc_prof.coalescences += 1; +#endif /* MDBX_ENABLE_PROFGC */ + TRACE("clear %s %s", "ALLOC_COALESCE", "since got threshold"); + if (MDBX_PNL_GETSIZE(txn->tw.relist) >= num) { + eASSERT(env, + MDBX_PNL_LAST(txn->tw.relist) < txn->geo.first_unallocated && + MDBX_PNL_FIRST(txn->tw.relist) < + txn->geo.first_unallocated); + if (likely(num == 1)) { + pgno = relist_get_single(txn); + goto done; } - /* fixup meta */ - meta->mm_geo.next = ctx.mc_next_pgno; + pgno = relist_get_sequence(txn, num, flags); + if (likely(pgno)) + goto done; } - - /* toggle with empty buffers to exit thread's loop */ - eASSERT(env, (ctx.mc_wlen[ctx.mc_head & 1]) == 0); - compacting_toggle_write_buffers(&ctx); - thread_err = osal_thread_join(thread); - eASSERT(env, (ctx.mc_tail == ctx.mc_head && - ctx.mc_wlen[ctx.mc_head & 1] == 0) || - ctx.mc_error); - osal_condpair_destroy(&ctx.mc_condpair); + flags -= ALLOC_COALESCE | ALLOC_SHOULD_SCAN; + } + if (unlikely(/* list is too long already */ MDBX_PNL_GETSIZE( + txn->tw.relist) >= env->options.rp_augment_limit) && + ((/* not a slot-request from gc-update */ num && + /* have enough unallocated space */ txn->geo.upper >= + txn->geo.first_unallocated + num && + monotime_since_cached(monotime_begin, &now_cache) + + txn->tw.gc.time_acc >= + env->options.gc_time_limit) || + gc_len + MDBX_PNL_GETSIZE(txn->tw.relist) >= PAGELIST_LIMIT)) { + /* Stop reclaiming to avoid large/overflow the page list. This is a rare + * case while search for a continuously multi-page region in a + * large database, see https://libmdbx.dqdkfa.ru/dead-github/issues/123 */ + NOTICE("stop reclaiming %s: %zu (current) + %zu " + "(chunk) -> %zu, rp_augment_limit %u", + likely(gc_len + MDBX_PNL_GETSIZE(txn->tw.relist) < PAGELIST_LIMIT) + ? 
"since rp_augment_limit was reached" + : "to avoid PNL overflow", + MDBX_PNL_GETSIZE(txn->tw.relist), gc_len, + gc_len + MDBX_PNL_GETSIZE(txn->tw.relist), + env->options.rp_augment_limit); + goto depleted_gc; } - if (unlikely(thread_err != MDBX_SUCCESS)) - return thread_err; - if (unlikely(rc != MDBX_SUCCESS)) - return rc; - if (unlikely(ctx.mc_error != MDBX_SUCCESS)) - return ctx.mc_error; - if (!dest_is_pipe) - compacting_fixup_meta(env, meta); } - /* Extend file if required */ - if (meta->mm_geo.now != meta->mm_geo.next) { - const size_t whole_size = pgno2bytes(env, meta->mm_geo.now); - if (!dest_is_pipe) - return osal_ftruncate(fd, whole_size); - - const size_t used_size = pgno2bytes(env, meta->mm_geo.next); - memset(data_buffer, 0, (size_t)MDBX_ENVCOPY_WRITEBUF); - for (size_t offset = used_size; offset < whole_size;) { - const size_t chunk = ((size_t)MDBX_ENVCOPY_WRITEBUF < whole_size - offset) - ? (size_t)MDBX_ENVCOPY_WRITEBUF - : whole_size - offset; - int rc = osal_write(fd, data_buffer, chunk); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; - offset += chunk; - } + /* Remember ID of readed GC record */ + txn->tw.gc.last_reclaimed = id; + if (flags & ALLOC_LIFO) { + ret.err = txl_append(&txn->tw.gc.reclaimed, id); + if (unlikely(ret.err != MDBX_SUCCESS)) + goto fail; } - return MDBX_SUCCESS; -} -/* Copy environment as-is. 
*/ -__cold static int env_copy_asis(MDBX_env *env, MDBX_txn *read_txn, - mdbx_filehandle_t fd, uint8_t *buffer, - const bool dest_is_pipe, - const MDBX_copy_flags_t flags) { - /* We must start the actual read txn after blocking writers */ - int rc = txn_end(read_txn, TXN_END_RESET_TMP); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; + /* Append PNL from GC record to tw.relist */ + ret.err = pnl_need(&txn->tw.relist, gc_len); + if (unlikely(ret.err != MDBX_SUCCESS)) + goto fail; - /* Temporarily block writers until we snapshot the meta pages */ - rc = osal_txn_lock(env, false); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; + if (LOG_ENABLED(MDBX_LOG_EXTRA)) { + DEBUG_EXTRA("readed GC-pnl txn %" PRIaTXN " root %" PRIaPGNO + " len %zu, PNL", + id, txn->dbs[FREE_DBI].root, gc_len); + for (size_t i = gc_len; i; i--) + DEBUG_EXTRA_PRINT(" %" PRIaPGNO, gc_pnl[i]); + DEBUG_EXTRA_PRINT(", first_unallocated %u\n", txn->geo.first_unallocated); + } - rc = txn_renew(read_txn, MDBX_TXN_RDONLY); - if (unlikely(rc != MDBX_SUCCESS)) { - osal_txn_unlock(env); - return rc; + /* Merge in descending sorted order */ + pnl_merge(txn->tw.relist, gc_pnl); + flags |= ALLOC_SHOULD_SCAN; + if (AUDIT_ENABLED()) { + if (unlikely(!pnl_check(txn->tw.relist, txn->geo.first_unallocated))) { + ERROR("%s/%d: %s", "MDBX_CORRUPTED", MDBX_CORRUPTED, + "invalid txn retired-list"); + ret.err = MDBX_CORRUPTED; + goto fail; + } + } else { + eASSERT(env, + pnl_check_allocated(txn->tw.relist, txn->geo.first_unallocated)); } + eASSERT(env, dpl_check(txn)); - jitter4testing(false); - const size_t meta_bytes = pgno2bytes(env, NUM_METAS); - const meta_troika_t troika = meta_tap(env); - /* Make a snapshot of meta-pages, - * but writing ones after the data was flushed */ - memcpy(buffer, env->me_map, meta_bytes); - MDBX_meta *const headcopy = /* LY: get pointer to the snapshot copy */ - ptr_disp(buffer, ptr_dist(meta_recent(env, &troika).ptr_c, env->me_map)); - osal_txn_unlock(env); - - if (flags & 
MDBX_CP_FORCE_DYNAMIC_SIZE) - meta_make_sizeable(headcopy); - /* Update signature to steady */ - unaligned_poke_u64(4, headcopy->mm_sign, meta_sign(headcopy)); - - /* Copy the data */ - const size_t whole_size = pgno_align2os_bytes(env, read_txn->mt_end_pgno); - const size_t used_size = pgno2bytes(env, read_txn->mt_next_pgno); - jitter4testing(false); - - if (dest_is_pipe) - rc = osal_write(fd, buffer, meta_bytes); + eASSERT(env, MDBX_PNL_GETSIZE(txn->tw.relist) == 0 || + MDBX_PNL_MOST(txn->tw.relist) < txn->geo.first_unallocated); + if (MDBX_ENABLE_REFUND && MDBX_PNL_GETSIZE(txn->tw.relist) && + unlikely(MDBX_PNL_MOST(txn->tw.relist) == + txn->geo.first_unallocated - 1)) { + /* Refund suitable pages into "unallocated" space */ + txn_refund(txn); + } + eASSERT(env, pnl_check_allocated(txn->tw.relist, txn->geo.first_unallocated - + MDBX_ENABLE_REFUND)); - uint8_t *const data_buffer = - buffer + ceil_powerof2(meta_bytes, env->me_os_psize); -#if MDBX_USE_COPYFILERANGE - static bool copyfilerange_unavailable; - bool not_the_same_filesystem = false; - struct statfs statfs_info; - if (fstatfs(fd, &statfs_info) || - statfs_info.f_type == /* ECRYPTFS_SUPER_MAGIC */ 0xf15f) - /* avoid use copyfilerange_unavailable() to ecryptfs due bugs */ - not_the_same_filesystem = true; -#endif /* MDBX_USE_COPYFILERANGE */ - for (size_t offset = meta_bytes; rc == MDBX_SUCCESS && offset < used_size;) { -#if MDBX_USE_SENDFILE - static bool sendfile_unavailable; - if (dest_is_pipe && likely(!sendfile_unavailable)) { - off_t in_offset = offset; - const ssize_t written = - sendfile(fd, env->me_lazy_fd, &in_offset, used_size - offset); - if (likely(written > 0)) { - offset = in_offset; - continue; - } - rc = MDBX_ENODATA; - if (written == 0 || ignore_enosys(rc = errno) != MDBX_RESULT_TRUE) - break; - sendfile_unavailable = true; - } -#endif /* MDBX_USE_SENDFILE */ + /* Done for a kick-reclaim mode, actually no page needed */ + if (unlikely(num == 0)) { + eASSERT(env, ret.err == MDBX_SUCCESS); 
+ TRACE("%s: last id #%" PRIaTXN ", re-len %zu", "early-exit for slot", id, + MDBX_PNL_GETSIZE(txn->tw.relist)); + goto early_exit; + } -#if MDBX_USE_COPYFILERANGE - if (!dest_is_pipe && !not_the_same_filesystem && - likely(!copyfilerange_unavailable)) { - off_t in_offset = offset, out_offset = offset; - ssize_t bytes_copied = copy_file_range( - env->me_lazy_fd, &in_offset, fd, &out_offset, used_size - offset, 0); - if (likely(bytes_copied > 0)) { - offset = in_offset; - continue; - } - rc = MDBX_ENODATA; - if (bytes_copied == 0) - break; - rc = errno; - if (rc == EXDEV || rc == /* workaround for ecryptfs bug(s), - maybe useful for others FS */ - EINVAL) - not_the_same_filesystem = true; - else if (ignore_enosys(rc) == MDBX_RESULT_TRUE) - copyfilerange_unavailable = true; - else - break; - } -#endif /* MDBX_USE_COPYFILERANGE */ + /* TODO: delete reclaimed records */ - /* fallback to portable */ - const size_t chunk = ((size_t)MDBX_ENVCOPY_WRITEBUF < used_size - offset) - ? (size_t)MDBX_ENVCOPY_WRITEBUF - : used_size - offset; - /* copy to avoid EFAULT in case swapped-out */ - memcpy(data_buffer, ptr_disp(env->me_map, offset), chunk); - rc = osal_write(fd, data_buffer, chunk); - offset += chunk; + eASSERT(env, op == MDBX_PREV || op == MDBX_NEXT); + if (flags & ALLOC_COALESCE) { + TRACE("%s: last id #%" PRIaTXN ", re-len %zu", "coalesce-continue", id, + MDBX_PNL_GETSIZE(txn->tw.relist)); + goto next_gc; } - /* Extend file if required */ - if (likely(rc == MDBX_SUCCESS) && whole_size != used_size) { - if (!dest_is_pipe) - rc = osal_ftruncate(fd, whole_size); - else { - memset(data_buffer, 0, (size_t)MDBX_ENVCOPY_WRITEBUF); - for (size_t offset = used_size; - rc == MDBX_SUCCESS && offset < whole_size;) { - const size_t chunk = - ((size_t)MDBX_ENVCOPY_WRITEBUF < whole_size - offset) - ? 
(size_t)MDBX_ENVCOPY_WRITEBUF - : whole_size - offset; - rc = osal_write(fd, data_buffer, chunk); - offset += chunk; - } +scan: + eASSERT(env, flags & ALLOC_SHOULD_SCAN); + eASSERT(env, num > 0); + if (MDBX_PNL_GETSIZE(txn->tw.relist) >= num) { + eASSERT(env, + MDBX_PNL_LAST(txn->tw.relist) < txn->geo.first_unallocated && + MDBX_PNL_FIRST(txn->tw.relist) < txn->geo.first_unallocated); + if (likely(num == 1)) { + eASSERT(env, !(flags & ALLOC_RESERVE)); + pgno = relist_get_single(txn); + goto done; } + pgno = relist_get_sequence(txn, num, flags); + if (likely(pgno)) + goto done; + } + flags -= ALLOC_SHOULD_SCAN; + if (ret.err == MDBX_SUCCESS) { + TRACE("%s: last id #%" PRIaTXN ", re-len %zu", "continue-search", id, + MDBX_PNL_GETSIZE(txn->tw.relist)); + goto next_gc; } - return rc; -} - -__cold int mdbx_env_copy2fd(MDBX_env *env, mdbx_filehandle_t fd, - MDBX_copy_flags_t flags) { - int rc = check_env(env, true); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; +depleted_gc: + TRACE("%s: last id #%" PRIaTXN ", re-len %zu", "gc-depleted", id, + MDBX_PNL_GETSIZE(txn->tw.relist)); + ret.err = MDBX_NOTFOUND; + if (flags & ALLOC_SHOULD_SCAN) + goto scan; + txn->flags |= txn_gc_drained; - const int dest_is_pipe = osal_is_pipe(fd); - if (MDBX_IS_ERROR(dest_is_pipe)) - return dest_is_pipe; + //------------------------------------------------------------------------- - if (!dest_is_pipe) { - rc = osal_fseek(fd, 0); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; - } + /* There is no suitable pages in the GC and to be able to allocate + * we should CHOICE one of: + * - make a new steady checkpoint if reclaiming was stopped by + * the last steady-sync, or wipe it in the MDBX_UTTERLY_NOSYNC mode; + * - kick lagging reader(s) if reclaiming was stopped by ones of it. + * - extend the database file. */ - const size_t buffer_size = - pgno_align2os_bytes(env, NUM_METAS) + - ceil_powerof2(((flags & MDBX_CP_COMPACT) - ? 
2 * (size_t)MDBX_ENVCOPY_WRITEBUF - : (size_t)MDBX_ENVCOPY_WRITEBUF), - env->me_os_psize); + /* Will use new pages from the map if nothing is suitable in the GC. */ + newnext = txn->geo.first_unallocated + num; - uint8_t *buffer = NULL; - rc = osal_memalign_alloc(env->me_os_psize, buffer_size, (void **)&buffer); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; + /* Does reclaiming stopped at the last steady point? */ + const meta_ptr_t recent = meta_recent(env, &txn->tw.troika); + const meta_ptr_t prefer_steady = meta_prefer_steady(env, &txn->tw.troika); + if (recent.ptr_c != prefer_steady.ptr_c && prefer_steady.is_steady && + detent == prefer_steady.txnid + 1) { + DEBUG("gc-kick-steady: recent %" PRIaTXN "-%s, steady %" PRIaTXN + "-%s, detent %" PRIaTXN, + recent.txnid, durable_caption(recent.ptr_c), prefer_steady.txnid, + durable_caption(prefer_steady.ptr_c), detent); + const pgno_t autosync_threshold = + atomic_load32(&env->lck->autosync_threshold, mo_Relaxed); + const uint64_t autosync_period = + atomic_load64(&env->lck->autosync_period, mo_Relaxed); + uint64_t eoos_timestamp; + /* wipe the last steady-point if one of: + * - UTTERLY_NOSYNC mode AND auto-sync threshold is NOT specified + * - UTTERLY_NOSYNC mode AND free space at steady-point is exhausted + * otherwise, make a new steady-point if one of: + * - auto-sync threshold is specified and reached; + * - upper limit of database size is reached; + * - database is full (with the current file size) + * AND auto-sync threshold it NOT specified */ + if (F_ISSET(env->flags, MDBX_UTTERLY_NOSYNC) && + ((autosync_threshold | autosync_period) == 0 || + newnext >= prefer_steady.ptr_c->geometry.now)) { + /* wipe steady checkpoint in MDBX_UTTERLY_NOSYNC mode + * without any auto-sync threshold(s). 
*/ +#if MDBX_ENABLE_PROFGC + env->lck->pgops.gc_prof.wipes += 1; +#endif /* MDBX_ENABLE_PROFGC */ + ret.err = meta_wipe_steady(env, detent); + DEBUG("gc-wipe-steady, rc %d", ret.err); + if (unlikely(ret.err != MDBX_SUCCESS)) + goto fail; + eASSERT(env, prefer_steady.ptr_c != + meta_prefer_steady(env, &txn->tw.troika).ptr_c); + goto retry_gc_refresh_oldest; + } + if ((autosync_threshold && + atomic_load64(&env->lck->unsynced_pages, mo_Relaxed) >= + autosync_threshold) || + (autosync_period && + (eoos_timestamp = + atomic_load64(&env->lck->eoos_timestamp, mo_Relaxed)) && + osal_monotime() - eoos_timestamp >= autosync_period) || + newnext >= txn->geo.upper || + ((num == 0 || newnext >= txn->geo.end_pgno) && + (autosync_threshold | autosync_period) == 0)) { + /* make steady checkpoint. */ +#if MDBX_ENABLE_PROFGC + env->lck->pgops.gc_prof.flushes += 1; +#endif /* MDBX_ENABLE_PROFGC */ + meta_t meta = *recent.ptr_c; + ret.err = dxb_sync_locked(env, env->flags & MDBX_WRITEMAP, &meta, + &txn->tw.troika); + DEBUG("gc-make-steady, rc %d", ret.err); + eASSERT(env, ret.err != MDBX_RESULT_TRUE); + if (unlikely(ret.err != MDBX_SUCCESS)) + goto fail; + eASSERT(env, prefer_steady.ptr_c != + meta_prefer_steady(env, &txn->tw.troika).ptr_c); + goto retry_gc_refresh_oldest; + } + } - MDBX_txn *read_txn = NULL; - /* Do the lock/unlock of the reader mutex before starting the - * write txn. Otherwise other read txns could block writers. */ - rc = mdbx_txn_begin(env, NULL, MDBX_TXN_RDONLY, &read_txn); - if (unlikely(rc != MDBX_SUCCESS)) { - osal_memalign_free(buffer); - return rc; + if (unlikely(true == + atomic_load32(&env->lck->rdt_refresh_flag, mo_AcquireRelease))) { + oldest = txn_snapshot_oldest(txn); + if (oldest >= detent) + goto retry_gc_have_oldest; } - if (!dest_is_pipe) { - /* Firstly write a stub to meta-pages. - * Now we sure to incomplete copy will not be used. 
*/ - memset(buffer, -1, pgno2bytes(env, NUM_METAS)); - rc = osal_write(fd, buffer, pgno2bytes(env, NUM_METAS)); + /* Avoid kick lagging reader(s) if is enough unallocated space + * at the end of database file. */ + if (!(flags & ALLOC_RESERVE) && newnext <= txn->geo.end_pgno) { + eASSERT(env, pgno == 0); + goto done; } - if (likely(rc == MDBX_SUCCESS)) { - memset(buffer, 0, pgno2bytes(env, NUM_METAS)); - rc = ((flags & MDBX_CP_COMPACT) ? env_compact : env_copy_asis)( - env, read_txn, fd, buffer, dest_is_pipe, flags); + if (oldest < txn->txnid - xMDBX_TXNID_STEP) { + oldest = mvcc_kick_laggards(env, oldest); + if (oldest >= detent) + goto retry_gc_have_oldest; } - mdbx_txn_abort(read_txn); - if (!dest_is_pipe) { - if (likely(rc == MDBX_SUCCESS)) - rc = osal_fsync(fd, MDBX_SYNC_DATA | MDBX_SYNC_SIZE); + //--------------------------------------------------------------------------- - /* Write actual meta */ - if (likely(rc == MDBX_SUCCESS)) - rc = osal_pwrite(fd, buffer, pgno2bytes(env, NUM_METAS), 0); +no_gc: + eASSERT(env, pgno == 0); +#ifndef MDBX_ENABLE_BACKLOG_DEPLETED +#define MDBX_ENABLE_BACKLOG_DEPLETED 0 +#endif /* MDBX_ENABLE_BACKLOG_DEPLETED*/ + if (MDBX_ENABLE_BACKLOG_DEPLETED && + unlikely(!(txn->flags & txn_gc_drained))) { + ret.err = MDBX_BACKLOG_DEPLETED; + goto fail; + } + if (flags & ALLOC_RESERVE) { + ret.err = MDBX_NOTFOUND; + goto fail; + } - if (likely(rc == MDBX_SUCCESS)) - rc = osal_fsync(fd, MDBX_SYNC_DATA | MDBX_SYNC_IODQ); + /* Will use new pages from the map if nothing is suitable in the GC. 
*/ + newnext = txn->geo.first_unallocated + num; + if (newnext <= txn->geo.end_pgno) + goto done; + + if (newnext > txn->geo.upper || !txn->geo.grow_pv) { + NOTICE("gc-alloc: next %zu > upper %" PRIaPGNO, newnext, txn->geo.upper); + ret.err = MDBX_MAP_FULL; + goto fail; } - osal_memalign_free(buffer); - return rc; -} + eASSERT(env, newnext > txn->geo.end_pgno); + const size_t grow_step = pv2pages(txn->geo.grow_pv); + size_t aligned = pgno_align2os_pgno( + env, (pgno_t)(newnext + grow_step - newnext % grow_step)); -__cold int mdbx_env_copy(MDBX_env *env, const char *dest_path, - MDBX_copy_flags_t flags) { -#if defined(_WIN32) || defined(_WIN64) - wchar_t *dest_pathW = nullptr; - int rc = osal_mb2w(dest_path, &dest_pathW); - if (likely(rc == MDBX_SUCCESS)) { - rc = mdbx_env_copyW(env, dest_pathW, flags); - osal_free(dest_pathW); - } - return rc; -} - -__cold int mdbx_env_copyW(MDBX_env *env, const wchar_t *dest_path, - MDBX_copy_flags_t flags) { -#endif /* Windows */ - - int rc = check_env(env, true); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; - - if (unlikely(!dest_path)) - return MDBX_EINVAL; - - /* The destination path must exist, but the destination file must not. - * We don't want the OS to cache the writes, since the source data is - * already in the OS cache. 
*/ - mdbx_filehandle_t newfd; - rc = osal_openfile(MDBX_OPEN_COPY, env, dest_path, &newfd, -#if defined(_WIN32) || defined(_WIN64) - (mdbx_mode_t)-1 -#else - S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP -#endif - ); + if (aligned > txn->geo.upper) + aligned = txn->geo.upper; + eASSERT(env, aligned >= newnext); -#if defined(_WIN32) || defined(_WIN64) - /* no locking required since the file opened with ShareMode == 0 */ -#else - if (rc == MDBX_SUCCESS) { - MDBX_STRUCT_FLOCK lock_op; - memset(&lock_op, 0, sizeof(lock_op)); - lock_op.l_type = F_WRLCK; - lock_op.l_whence = SEEK_SET; - lock_op.l_start = 0; - lock_op.l_len = OFF_T_MAX; - if (MDBX_FCNTL(newfd, MDBX_F_SETLK, &lock_op) -#if (defined(__linux__) || defined(__gnu_linux__)) && defined(LOCK_EX) && \ - (!defined(__ANDROID_API__) || __ANDROID_API__ >= 24) - || flock(newfd, LOCK_EX | LOCK_NB) -#endif /* Linux */ - ) - rc = errno; + VERBOSE("try growth datafile to %zu pages (+%zu)", aligned, + aligned - txn->geo.end_pgno); + ret.err = dxb_resize(env, txn->geo.first_unallocated, (pgno_t)aligned, + txn->geo.upper, implicit_grow); + if (ret.err != MDBX_SUCCESS) { + ERROR("unable growth datafile to %zu pages (+%zu), errcode %d", aligned, + aligned - txn->geo.end_pgno, ret.err); + goto fail; } -#endif /* Windows / POSIX */ + env->txn->geo.end_pgno = (pgno_t)aligned; + eASSERT(env, pgno == 0); - if (rc == MDBX_SUCCESS) - rc = mdbx_env_copy2fd(env, newfd, flags); + //--------------------------------------------------------------------------- - if (newfd != INVALID_HANDLE_VALUE) { - int err = osal_closefile(newfd); - if (rc == MDBX_SUCCESS && err != rc) - rc = err; - if (rc != MDBX_SUCCESS) - (void)osal_removefile(dest_path); +done: + ret.err = MDBX_SUCCESS; + if (likely((flags & ALLOC_RESERVE) == 0)) { + if (pgno) { + eASSERT(env, + pgno + num <= txn->geo.first_unallocated && pgno >= NUM_METAS); + eASSERT(env, + pnl_check_allocated(txn->tw.relist, txn->geo.first_unallocated - + MDBX_ENABLE_REFUND)); + } else { + pgno = 
txn->geo.first_unallocated; + txn->geo.first_unallocated += (pgno_t)num; + eASSERT(env, txn->geo.first_unallocated <= txn->geo.end_pgno); + eASSERT(env, + pgno >= NUM_METAS && pgno + num <= txn->geo.first_unallocated); + } + + ret = page_alloc_finalize(env, txn, mc, pgno, num); + if (unlikely(ret.err != MDBX_SUCCESS)) { + fail: + eASSERT(env, ret.err != MDBX_SUCCESS); + eASSERT(env, + pnl_check_allocated(txn->tw.relist, txn->geo.first_unallocated - + MDBX_ENABLE_REFUND)); + int level; + const char *what; + if (flags & ALLOC_RESERVE) { + level = (flags & ALLOC_UNIMPORTANT) ? MDBX_LOG_DEBUG : MDBX_LOG_NOTICE; + what = num ? "reserve-pages" : "fetch-slot"; + } else { + txn->flags |= MDBX_TXN_ERROR; + level = MDBX_LOG_ERROR; + what = "pages"; + } + if (LOG_ENABLED(level)) + debug_log(level, __func__, __LINE__, + "unable alloc %zu %s, alloc-flags 0x%x, err %d, txn-flags " + "0x%x, re-list-len %zu, loose-count %zu, gc: height %u, " + "branch %zu, leaf %zu, large %zu, entries %zu\n", + num, what, flags, ret.err, txn->flags, + MDBX_PNL_GETSIZE(txn->tw.relist), txn->tw.loose_count, + txn->dbs[FREE_DBI].height, + (size_t)txn->dbs[FREE_DBI].branch_pages, + (size_t)txn->dbs[FREE_DBI].leaf_pages, + (size_t)txn->dbs[FREE_DBI].large_pages, + (size_t)txn->dbs[FREE_DBI].items); + ret.page = nullptr; + } + if (num > 1) + txn->tw.gc.time_acc += monotime_since_cached(monotime_begin, &now_cache); + } else { + early_exit: + DEBUG("return nullptr for %zu pages for ALLOC_%s, rc %d", num, + num ? 
"RESERVE" : "SLOT", ret.err); + ret.page = nullptr; } - return rc; +#if MDBX_ENABLE_PROFGC + prof->rtime_monotonic += osal_monotime() - monotime_begin; +#endif /* MDBX_ENABLE_PROFGC */ + return ret; } -/******************************************************************************/ - -__cold int mdbx_env_set_flags(MDBX_env *env, MDBX_env_flags_t flags, - bool onoff) { - int rc = check_env(env, false); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; - - if (unlikely(flags & - ((env->me_flags & MDBX_ENV_ACTIVE) ? ~ENV_CHANGEABLE_FLAGS - : ~ENV_USABLE_FLAGS))) - return MDBX_EPERM; +__hot pgr_t gc_alloc_single(const MDBX_cursor *const mc) { + MDBX_txn *const txn = mc->txn; + tASSERT(txn, mc->txn->flags & MDBX_TXN_DIRTY); + tASSERT(txn, + F_ISSET(*cursor_dbi_state(mc), DBI_LINDO | DBI_VALID | DBI_DIRTY)); - if (unlikely(env->me_flags & MDBX_RDONLY)) - return MDBX_EACCESS; + /* If there are any loose pages, just use them */ + while (likely(txn->tw.loose_pages)) { +#if MDBX_ENABLE_REFUND + if (unlikely(txn->tw.loose_refund_wl > txn->geo.first_unallocated)) { + txn_refund(txn); + if (!txn->tw.loose_pages) + break; + } +#endif /* MDBX_ENABLE_REFUND */ - const bool lock_needed = - (env->me_flags & MDBX_ENV_ACTIVE) && !env_txn0_owned(env); - bool should_unlock = false; - if (lock_needed) { - rc = osal_txn_lock(env, false); - if (unlikely(rc)) - return rc; - should_unlock = true; + page_t *lp = txn->tw.loose_pages; + MDBX_ASAN_UNPOISON_MEMORY_REGION(lp, txn->env->ps); + VALGRIND_MAKE_MEM_DEFINED(&page_next(lp), sizeof(page_t *)); + txn->tw.loose_pages = page_next(lp); + txn->tw.loose_count--; + DEBUG_EXTRA("db %d use loose page %" PRIaPGNO, cursor_dbi_dbg(mc), + lp->pgno); + tASSERT(txn, lp->pgno < txn->geo.first_unallocated); + tASSERT(txn, lp->pgno >= NUM_METAS); + VALGRIND_MAKE_MEM_UNDEFINED(page_data(lp), page_space(txn->env)); + lp->txnid = txn->front_txnid; + pgr_t ret = {lp, MDBX_SUCCESS}; + return ret; } - if (onoff) - env->me_flags = merge_sync_flags(env->me_flags, 
flags); - else - env->me_flags &= ~flags; + if (likely(MDBX_PNL_GETSIZE(txn->tw.relist) > 0)) + return page_alloc_finalize(txn->env, txn, mc, relist_get_single(txn), 1); - if (should_unlock) - osal_txn_unlock(env); - return MDBX_SUCCESS; + return gc_alloc_ex(mc, 1, ALLOC_DEFAULT); } +/// \copyright SPDX-License-Identifier: Apache-2.0 +/// \author Леонид Юрьев aka Leonid Yuriev \date 2015-2024 -__cold int mdbx_env_get_flags(const MDBX_env *env, unsigned *arg) { - int rc = check_env(env, false); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; - - if (unlikely(!arg)) - return MDBX_EINVAL; - *arg = env->me_flags & ENV_USABLE_FLAGS; - return MDBX_SUCCESS; +MDBX_NOTHROW_PURE_FUNCTION static bool is_lifo(const MDBX_txn *txn) { + return (txn->env->flags & MDBX_LIFORECLAIM) != 0; } -__cold int mdbx_env_set_userctx(MDBX_env *env, void *ctx) { - int rc = check_env(env, false); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; - - env->me_userctx = ctx; - return MDBX_SUCCESS; +MDBX_MAYBE_UNUSED static inline const char *dbg_prefix(const gcu_t *ctx) { + return is_lifo(ctx->cursor.txn) ? " lifo" : " fifo"; } -__cold void *mdbx_env_get_userctx(const MDBX_env *env) { - return env ? 
env->me_userctx : NULL; +static inline size_t backlog_size(MDBX_txn *txn) { + return MDBX_PNL_GETSIZE(txn->tw.relist) + txn->tw.loose_count; } -__cold int mdbx_env_set_assert(MDBX_env *env, MDBX_assert_func *func) { - int rc = check_env(env, false); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; - -#if MDBX_DEBUG - env->me_assert_func = func; - return MDBX_SUCCESS; +static int clean_stored_retired(MDBX_txn *txn, gcu_t *ctx) { + int err = MDBX_SUCCESS; + if (ctx->retired_stored) { + MDBX_cursor *const gc = ptr_disp(txn, sizeof(MDBX_txn)); + tASSERT(txn, txn == txn->env->basal_txn && gc->next == gc); + gc->txn = txn; + gc->dbi_state = txn->dbi_state; + gc->top_and_flags = z_fresh_mark; + gc->next = txn->cursors[FREE_DBI]; + txn->cursors[FREE_DBI] = gc; + do { + MDBX_val key, val; +#if MDBX_ENABLE_BIGFOOT + key.iov_base = &ctx->bigfoot; #else - (void)func; - return MDBX_ENOSYS; -#endif + key.iov_base = &txn->txnid; +#endif /* MDBX_ENABLE_BIGFOOT */ + key.iov_len = sizeof(txnid_t); + const csr_t csr = cursor_seek(gc, &key, &val, MDBX_SET); + if (csr.err == MDBX_SUCCESS && csr.exact) { + ctx->retired_stored = 0; + err = cursor_del(gc, 0); + TRACE("== clear-4linear, backlog %zu, err %d", backlog_size(txn), err); + } else + err = (csr.err == MDBX_NOTFOUND) ? 
MDBX_SUCCESS : csr.err; + } +#if MDBX_ENABLE_BIGFOOT + while (!err && --ctx->bigfoot >= txn->txnid); +#else + while (0); +#endif /* MDBX_ENABLE_BIGFOOT */ + txn->cursors[FREE_DBI] = gc->next; + gc->next = gc; + } + return err; } -#if defined(_WIN32) || defined(_WIN64) -__cold int mdbx_env_get_pathW(const MDBX_env *env, const wchar_t **arg) { - int rc = check_env(env, true); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; +static int touch_gc(gcu_t *ctx) { + tASSERT(ctx->cursor.txn, is_pointed(&ctx->cursor) || + ctx->cursor.txn->dbs[FREE_DBI].leaf_pages == 0); + MDBX_val key, val; + key.iov_base = val.iov_base = nullptr; + key.iov_len = sizeof(txnid_t); + val.iov_len = MDBX_PNL_SIZEOF(ctx->cursor.txn->tw.retired_pages); + ctx->cursor.flags |= z_gcu_preparation; + int err = cursor_touch(&ctx->cursor, &key, &val); + ctx->cursor.flags -= z_gcu_preparation; + return err; +} - if (unlikely(!arg)) - return MDBX_EINVAL; +/* Prepare a backlog of pages to modify GC itself, while reclaiming is + * prohibited. It should be enough to prevent search in gc_alloc_ex() + * during a deleting, when GC tree is unbalanced. 
*/ +static int prepare_backlog(MDBX_txn *txn, gcu_t *ctx) { + const size_t for_cow = txn->dbs[FREE_DBI].height; + const size_t for_rebalance = + for_cow + 1 + + (txn->dbs[FREE_DBI].height + 1ul >= txn->dbs[FREE_DBI].branch_pages); + size_t for_split = ctx->retired_stored == 0; + tASSERT(txn, is_pointed(&ctx->cursor) || txn->dbs[FREE_DBI].leaf_pages == 0); - *arg = env->me_pathname.specified; - return MDBX_SUCCESS; -} -#endif /* Windows */ + const intptr_t retired_left = + MDBX_PNL_SIZEOF(txn->tw.retired_pages) - ctx->retired_stored; + size_t for_relist = 0; + if (MDBX_ENABLE_BIGFOOT && retired_left > 0) { + for_relist = (retired_left + txn->env->maxgc_large1page - 1) / + txn->env->maxgc_large1page; + const size_t per_branch_page = txn->env->maxgc_per_branch; + for (size_t entries = for_relist; entries > 1; for_split += entries) + entries = (entries + per_branch_page - 1) / per_branch_page; + } else if (!MDBX_ENABLE_BIGFOOT && retired_left != 0) { + for_relist = + largechunk_npages(txn->env, MDBX_PNL_SIZEOF(txn->tw.retired_pages)); + } -__cold int mdbx_env_get_path(const MDBX_env *env, const char **arg) { - int rc = check_env(env, true); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; + const size_t for_tree_before_touch = for_cow + for_rebalance + for_split; + const size_t for_tree_after_touch = for_rebalance + for_split; + const size_t for_all_before_touch = for_relist + for_tree_before_touch; + const size_t for_all_after_touch = for_relist + for_tree_after_touch; - if (unlikely(!arg)) - return MDBX_EINVAL; + if (likely(for_relist < 2 && backlog_size(txn) > for_all_before_touch) && + (ctx->cursor.top < 0 || + is_modifable(txn, ctx->cursor.pg[ctx->cursor.top]))) + return MDBX_SUCCESS; -#if defined(_WIN32) || defined(_WIN64) - if (!env->me_pathname_char) { - *arg = nullptr; - DWORD flags = /* WC_ERR_INVALID_CHARS */ 0x80; - size_t mb_len = - WideCharToMultiByte(CP_THREAD_ACP, flags, env->me_pathname.specified, - -1, nullptr, 0, nullptr, nullptr); - rc = mb_len ? 
MDBX_SUCCESS : (int)GetLastError(); - if (rc == ERROR_INVALID_FLAGS) { - mb_len = WideCharToMultiByte(CP_THREAD_ACP, flags = 0, - env->me_pathname.specified, -1, nullptr, 0, - nullptr, nullptr); - rc = mb_len ? MDBX_SUCCESS : (int)GetLastError(); - } - if (unlikely(rc != MDBX_SUCCESS)) - return rc; + TRACE(">> retired-stored %zu, left %zi, backlog %zu, need %zu (4list %zu, " + "4split %zu, " + "4cow %zu, 4tree %zu)", + ctx->retired_stored, retired_left, backlog_size(txn), + for_all_before_touch, for_relist, for_split, for_cow, + for_tree_before_touch); - char *const mb_pathname = osal_malloc(mb_len); - if (!mb_pathname) - return MDBX_ENOMEM; - if (mb_len != (size_t)WideCharToMultiByte( - CP_THREAD_ACP, flags, env->me_pathname.specified, -1, - mb_pathname, (int)mb_len, nullptr, nullptr)) { - rc = (int)GetLastError(); - osal_free(mb_pathname); - return rc; + int err = touch_gc(ctx); + TRACE("== after-touch, backlog %zu, err %d", backlog_size(txn), err); + + if (!MDBX_ENABLE_BIGFOOT && unlikely(for_relist > 1) && + MDBX_PNL_GETSIZE(txn->tw.retired_pages) != ctx->retired_stored && + err == MDBX_SUCCESS) { + if (unlikely(ctx->retired_stored)) { + err = clean_stored_retired(txn, ctx); + if (unlikely(err != MDBX_SUCCESS)) + return err; + if (!ctx->retired_stored) + return /* restart by tail-recursion */ prepare_backlog(txn, ctx); } - if (env->me_pathname_char || - InterlockedCompareExchangePointer( - (PVOID volatile *)&env->me_pathname_char, mb_pathname, nullptr)) - osal_free(mb_pathname); + err = gc_alloc_ex(&ctx->cursor, for_relist, ALLOC_RESERVE).err; + TRACE("== after-4linear, backlog %zu, err %d", backlog_size(txn), err); + cASSERT(&ctx->cursor, + backlog_size(txn) >= for_relist || err != MDBX_SUCCESS); } - *arg = env->me_pathname_char; -#else - *arg = env->me_pathname.specified; -#endif /* Windows */ - return MDBX_SUCCESS; -} - -__cold int mdbx_env_get_fd(const MDBX_env *env, mdbx_filehandle_t *arg) { - int rc = check_env(env, true); - if (unlikely(rc != 
MDBX_SUCCESS)) - return rc; - if (unlikely(!arg)) - return MDBX_EINVAL; + while (backlog_size(txn) < for_all_after_touch && err == MDBX_SUCCESS) + err = gc_alloc_ex(&ctx->cursor, 0, ALLOC_RESERVE | ALLOC_UNIMPORTANT).err; - *arg = env->me_lazy_fd; - return MDBX_SUCCESS; + TRACE("<< backlog %zu, err %d, gc: height %u, branch %zu, leaf %zu, large " + "%zu, entries %zu", + backlog_size(txn), err, txn->dbs[FREE_DBI].height, + (size_t)txn->dbs[FREE_DBI].branch_pages, + (size_t)txn->dbs[FREE_DBI].leaf_pages, + (size_t)txn->dbs[FREE_DBI].large_pages, + (size_t)txn->dbs[FREE_DBI].items); + tASSERT(txn, err != MDBX_NOTFOUND || (txn->flags & txn_gc_drained) != 0); + return (err != MDBX_NOTFOUND) ? err : MDBX_SUCCESS; } -static void stat_get(const MDBX_db *db, MDBX_stat *st, size_t bytes) { - st->ms_depth = db->md_depth; - st->ms_branch_pages = db->md_branch_pages; - st->ms_leaf_pages = db->md_leaf_pages; - st->ms_overflow_pages = db->md_overflow_pages; - st->ms_entries = db->md_entries; - if (likely(bytes >= - offsetof(MDBX_stat, ms_mod_txnid) + sizeof(st->ms_mod_txnid))) - st->ms_mod_txnid = db->md_mod_txnid; -} +static inline void zeroize_reserved(const MDBX_env *env, MDBX_val pnl) { +#if MDBX_DEBUG && (defined(ENABLE_MEMCHECK) || defined(__SANITIZE_ADDRESS__)) + /* Для предотвращения предупреждения Valgrind из mdbx_dump_val() + * вызванное через макрос DVAL_DEBUG() на выходе + * из cursor_seek(MDBX_SET_KEY), которая вызывается ниже внутри gc_update() в + * цикле очистки и цикле заполнения зарезервированных элементов. 
*/ + memset(pnl.iov_base, 0xBB, pnl.iov_len); +#endif /* MDBX_DEBUG && (ENABLE_MEMCHECK || __SANITIZE_ADDRESS__) */ -static void stat_add(const MDBX_db *db, MDBX_stat *const st, - const size_t bytes) { - st->ms_depth += db->md_depth; - st->ms_branch_pages += db->md_branch_pages; - st->ms_leaf_pages += db->md_leaf_pages; - st->ms_overflow_pages += db->md_overflow_pages; - st->ms_entries += db->md_entries; - if (likely(bytes >= - offsetof(MDBX_stat, ms_mod_txnid) + sizeof(st->ms_mod_txnid))) - st->ms_mod_txnid = (st->ms_mod_txnid > db->md_mod_txnid) ? st->ms_mod_txnid - : db->md_mod_txnid; + /* PNL is initially empty, zero out at least the length */ + memset(pnl.iov_base, 0, sizeof(pgno_t)); + if ((env->flags & (MDBX_WRITEMAP | MDBX_NOMEMINIT)) == 0) + /* zero out to avoid leaking values from uninitialized malloc'ed memory + * to the file in non-writemap mode if length of the saving page-list + * was changed during space reservation. */ + memset(pnl.iov_base, 0, pnl.iov_len); } -__cold static int stat_acc(const MDBX_txn *txn, MDBX_stat *st, size_t bytes) { - int err = check_txn(txn, MDBX_TXN_BLOCKED); - if (unlikely(err != MDBX_SUCCESS)) - return err; - - MDBX_cursor_couple cx; - err = cursor_init(&cx.outer, (MDBX_txn *)txn, MAIN_DBI); - if (unlikely(err != MDBX_SUCCESS)) - return err; +static int gcu_loose(MDBX_txn *txn, gcu_t *ctx) { + tASSERT(txn, txn->tw.loose_count > 0); + /* Return loose page numbers to tw.relist, + * though usually none are left at this point. + * The pages themselves remain in dirtylist. 
*/ + if (unlikely(!txn->tw.gc.reclaimed && txn->tw.gc.last_reclaimed < 1)) { + TRACE("%s: try allocate gc-slot for %zu loose-pages", dbg_prefix(ctx), + txn->tw.loose_count); + int err = gc_alloc_ex(&ctx->cursor, 0, ALLOC_RESERVE).err; + if (err == MDBX_SUCCESS) { + TRACE("%s: retry since gc-slot for %zu loose-pages available", + dbg_prefix(ctx), txn->tw.loose_count); + return MDBX_SUCCESS; + } - const MDBX_env *const env = txn->mt_env; - st->ms_psize = env->me_psize; - TXN_FOREACH_DBI_FROM( - txn, dbi, - /* assuming GC is internal and not subject for accounting */ MAIN_DBI) { - if ((txn->mt_dbi_state[dbi] & (DBI_VALID | DBI_STALE)) == DBI_VALID) - stat_add(txn->mt_dbs + dbi, st, bytes); + /* Put loose page numbers in tw.retired_pages, + * since unable to return ones to tw.relist. */ + err = pnl_need(&txn->tw.retired_pages, txn->tw.loose_count); + if (unlikely(err != MDBX_SUCCESS)) + return err; + for (page_t *lp = txn->tw.loose_pages; lp; lp = page_next(lp)) { + pnl_append_prereserved(txn->tw.retired_pages, lp->pgno); + MDBX_ASAN_UNPOISON_MEMORY_REGION(&page_next(lp), sizeof(page_t *)); + VALGRIND_MAKE_MEM_DEFINED(&page_next(lp), sizeof(page_t *)); + } + TRACE("%s: append %zu loose-pages to retired-pages", dbg_prefix(ctx), + txn->tw.loose_count); + } else { + /* Room for loose pages + temp PNL with same */ + int err = pnl_need(&txn->tw.relist, 2 * txn->tw.loose_count + 2); + if (unlikely(err != MDBX_SUCCESS)) + return err; + pnl_t loose = txn->tw.relist + MDBX_PNL_ALLOCLEN(txn->tw.relist) - + txn->tw.loose_count - 1; + size_t count = 0; + for (page_t *lp = txn->tw.loose_pages; lp; lp = page_next(lp)) { + tASSERT(txn, lp->flags == P_LOOSE); + loose[++count] = lp->pgno; + MDBX_ASAN_UNPOISON_MEMORY_REGION(&page_next(lp), sizeof(page_t *)); + VALGRIND_MAKE_MEM_DEFINED(&page_next(lp), sizeof(page_t *)); + } + tASSERT(txn, count == txn->tw.loose_count); + MDBX_PNL_SETSIZE(loose, count); + pnl_sort(loose, txn->geo.first_unallocated); + pnl_merge(txn->tw.relist, loose); + 
TRACE("%s: append %zu loose-pages to reclaimed-pages", dbg_prefix(ctx), + txn->tw.loose_count); + } + + /* filter-out list of dirty-pages from loose-pages */ + dpl_t *const dl = txn->tw.dirtylist; + if (dl) { + tASSERT(txn, (txn->flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC); + tASSERT(txn, dl->sorted <= dl->length); + size_t w = 0, sorted_out = 0; + for (size_t r = w; ++r <= dl->length;) { + page_t *dp = dl->items[r].ptr; + tASSERT(txn, dp->flags == P_LOOSE || is_modifable(txn, dp)); + tASSERT(txn, dpl_endpgno(dl, r) <= txn->geo.first_unallocated); + if ((dp->flags & P_LOOSE) == 0) { + if (++w != r) + dl->items[w] = dl->items[r]; + } else { + tASSERT(txn, dp->flags == P_LOOSE); + sorted_out += dl->sorted >= r; + if (!MDBX_AVOID_MSYNC || !(txn->flags & MDBX_WRITEMAP)) + page_shadow_release(txn->env, dp, 1); + } + } + TRACE("%s: filtered-out loose-pages from %zu -> %zu dirty-pages", + dbg_prefix(ctx), dl->length, w); + tASSERT(txn, txn->tw.loose_count == dl->length - w); + dl->sorted -= sorted_out; + tASSERT(txn, dl->sorted <= w); + dpl_setlen(dl, w); + dl->pages_including_loose -= txn->tw.loose_count; + txn->tw.dirtyroom += txn->tw.loose_count; + tASSERT(txn, txn->tw.dirtyroom + txn->tw.dirtylist->length == + (txn->parent ? 
txn->parent->tw.dirtyroom + : txn->env->options.dp_limit)); + } else { + tASSERT(txn, (txn->flags & MDBX_WRITEMAP) != 0 && !MDBX_AVOID_MSYNC); } + txn->tw.loose_pages = nullptr; + txn->tw.loose_count = 0; +#if MDBX_ENABLE_REFUND + txn->tw.loose_refund_wl = 0; +#endif /* MDBX_ENABLE_REFUND */ + return MDBX_SUCCESS; +} - if (!(txn->mt_dbs[MAIN_DBI].md_flags & MDBX_DUPSORT) && - txn->mt_dbs[MAIN_DBI].md_entries /* TODO: use `md_subs` field */) { +static int gcu_retired(MDBX_txn *txn, gcu_t *ctx) { + int err; + if (unlikely(!ctx->retired_stored)) { + /* Make sure last page of GC is touched and on retired-list */ + err = outer_last(&ctx->cursor, nullptr, nullptr); + if (likely(err == MDBX_SUCCESS)) + err = touch_gc(ctx); + if (unlikely(err != MDBX_SUCCESS) && err != MDBX_NOTFOUND) + return err; + } - /* scan and account not opened named subDBs */ - err = page_search(&cx.outer, NULL, MDBX_PS_FIRST); - while (err == MDBX_SUCCESS) { - const MDBX_page *mp = cx.outer.mc_pg[cx.outer.mc_top]; - for (size_t i = 0; i < page_numkeys(mp); i++) { - const MDBX_node *node = page_node(mp, i); - if (node_flags(node) != F_SUBDATA) - continue; - if (unlikely(node_ds(node) != sizeof(MDBX_db))) { - ERROR("%s/%d: %s %zu", "MDBX_CORRUPTED", MDBX_CORRUPTED, - "invalid subDb node size", node_ds(node)); - return MDBX_CORRUPTED; - } + MDBX_val key, data; +#if MDBX_ENABLE_BIGFOOT + size_t retired_pages_before; + do { + if (ctx->bigfoot > txn->txnid) { + err = clean_stored_retired(txn, ctx); + if (unlikely(err != MDBX_SUCCESS)) + return err; + tASSERT(txn, ctx->bigfoot <= txn->txnid); + } - /* skip opened and already accounted */ - const MDBX_val name = {node_key(node), node_ks(node)}; - TXN_FOREACH_DBI_USER(txn, dbi) { - if ((txn->mt_dbi_state[dbi] & (DBI_VALID | DBI_STALE)) == DBI_VALID && - env->me_dbxs[MAIN_DBI].md_cmp(&name, - &env->me_dbxs[dbi].md_name) == 0) { - node = NULL; - break; - } - } + retired_pages_before = MDBX_PNL_GETSIZE(txn->tw.retired_pages); + err = prepare_backlog(txn, ctx); 
+ if (unlikely(err != MDBX_SUCCESS)) + return err; + if (retired_pages_before != MDBX_PNL_GETSIZE(txn->tw.retired_pages)) { + TRACE("%s: retired-list changed (%zu -> %zu), retry", dbg_prefix(ctx), + retired_pages_before, MDBX_PNL_GETSIZE(txn->tw.retired_pages)); + break; + } - if (node) { - MDBX_db db; - memcpy(&db, node_data(node), sizeof(db)); - stat_add(&db, st, bytes); + pnl_sort(txn->tw.retired_pages, txn->geo.first_unallocated); + ctx->retired_stored = 0; + ctx->bigfoot = txn->txnid; + do { + if (ctx->retired_stored) { + err = prepare_backlog(txn, ctx); + if (unlikely(err != MDBX_SUCCESS)) + return err; + if (ctx->retired_stored >= MDBX_PNL_GETSIZE(txn->tw.retired_pages)) { + TRACE("%s: retired-list changed (%zu -> %zu), retry", dbg_prefix(ctx), + retired_pages_before, MDBX_PNL_GETSIZE(txn->tw.retired_pages)); + break; } } - err = cursor_sibling(&cx.outer, SIBLING_RIGHT); - } - if (unlikely(err != MDBX_NOTFOUND)) + key.iov_len = sizeof(txnid_t); + key.iov_base = &ctx->bigfoot; + const size_t left = + MDBX_PNL_GETSIZE(txn->tw.retired_pages) - ctx->retired_stored; + const size_t chunk = + (left > txn->env->maxgc_large1page && ctx->bigfoot < MAX_TXNID) + ? txn->env->maxgc_large1page + : left; + data.iov_len = (chunk + 1) * sizeof(pgno_t); + err = cursor_put(&ctx->cursor, &key, &data, MDBX_RESERVE); + if (unlikely(err != MDBX_SUCCESS)) + return err; + +#if MDBX_DEBUG && (defined(ENABLE_MEMCHECK) || defined(__SANITIZE_ADDRESS__)) + /* Для предотвращения предупреждения Valgrind из mdbx_dump_val() + * вызванное через макрос DVAL_DEBUG() на выходе + * из cursor_seek(MDBX_SET_KEY), которая вызывается как выше в цикле + * очистки, так и ниже в цикле заполнения зарезервированных элементов. + */ + memset(data.iov_base, 0xBB, data.iov_len); +#endif /* MDBX_DEBUG && (ENABLE_MEMCHECK || __SANITIZE_ADDRESS__) */ + + if (retired_pages_before == MDBX_PNL_GETSIZE(txn->tw.retired_pages)) { + const size_t at = (is_lifo(txn) == MDBX_PNL_ASCENDING) + ? 
left - chunk + : ctx->retired_stored; + pgno_t *const begin = txn->tw.retired_pages + at; + /* MDBX_PNL_ASCENDING == false && LIFO == false: + * - the larger pgno is at the beginning of retired list + * and should be placed with the larger txnid. + * MDBX_PNL_ASCENDING == true && LIFO == true: + * - the larger pgno is at the ending of retired list + * and should be placed with the smaller txnid. */ + const pgno_t save = *begin; + *begin = (pgno_t)chunk; + memcpy(data.iov_base, begin, data.iov_len); + *begin = save; + TRACE("%s: put-retired/bigfoot @ %" PRIaTXN + " (slice #%u) #%zu [%zu..%zu] of %zu", + dbg_prefix(ctx), ctx->bigfoot, + (unsigned)(ctx->bigfoot - txn->txnid), chunk, at, at + chunk, + retired_pages_before); + } + ctx->retired_stored += chunk; + } while (ctx->retired_stored < MDBX_PNL_GETSIZE(txn->tw.retired_pages) && + (++ctx->bigfoot, true)); + } while (retired_pages_before != MDBX_PNL_GETSIZE(txn->tw.retired_pages)); +#else + /* Write to last page of GC */ + key.iov_len = sizeof(txnid_t); + key.iov_base = &txn->txnid; + do { + prepare_backlog(txn, ctx); + data.iov_len = MDBX_PNL_SIZEOF(txn->tw.retired_pages); + err = cursor_put(&ctx->cursor, &key, &data, MDBX_RESERVE); + if (unlikely(err != MDBX_SUCCESS)) return err; - } - return MDBX_SUCCESS; -} +#if MDBX_DEBUG && (defined(ENABLE_MEMCHECK) || defined(__SANITIZE_ADDRESS__)) + /* Для предотвращения предупреждения Valgrind из mdbx_dump_val() + * вызванное через макрос DVAL_DEBUG() на выходе + * из cursor_seek(MDBX_SET_KEY), которая вызывается как выше в цикле + * очистки, так и ниже в цикле заполнения зарезервированных элементов. 
*/ + memset(data.iov_base, 0xBB, data.iov_len); +#endif /* MDBX_DEBUG && (ENABLE_MEMCHECK || __SANITIZE_ADDRESS__) */ -__cold int mdbx_env_stat_ex(const MDBX_env *env, const MDBX_txn *txn, - MDBX_stat *dest, size_t bytes) { - if (unlikely(!dest)) - return MDBX_EINVAL; - const size_t size_before_modtxnid = offsetof(MDBX_stat, ms_mod_txnid); - if (unlikely(bytes != sizeof(MDBX_stat)) && bytes != size_before_modtxnid) - return MDBX_EINVAL; + /* Retry if tw.retired_pages[] grew during the Put() */ + } while (data.iov_len < MDBX_PNL_SIZEOF(txn->tw.retired_pages)); - if (likely(txn)) { - if (env && unlikely(txn->mt_env != env)) - return MDBX_EINVAL; - return stat_acc(txn, dest, bytes); - } + ctx->retired_stored = MDBX_PNL_GETSIZE(txn->tw.retired_pages); + pnl_sort(txn->tw.retired_pages, txn->geo.first_unallocated); + tASSERT(txn, data.iov_len == MDBX_PNL_SIZEOF(txn->tw.retired_pages)); + memcpy(data.iov_base, txn->tw.retired_pages, data.iov_len); - int err = check_env(env, true); - if (unlikely(err != MDBX_SUCCESS)) - return err; + TRACE("%s: put-retired #%zu @ %" PRIaTXN, dbg_prefix(ctx), + ctx->retired_stored, txn->txnid); +#endif /* MDBX_ENABLE_BIGFOOT */ + if (LOG_ENABLED(MDBX_LOG_EXTRA)) { + size_t i = ctx->retired_stored; + DEBUG_EXTRA("txn %" PRIaTXN " root %" PRIaPGNO " num %zu, retired-PNL", + txn->txnid, txn->dbs[FREE_DBI].root, i); + for (; i; i--) + DEBUG_EXTRA_PRINT(" %" PRIaPGNO, txn->tw.retired_pages[i]); + DEBUG_EXTRA_PRINT("%s\n", "."); + } + return MDBX_SUCCESS; +} - if (env->me_txn && env_txn0_owned(env)) - /* inside write-txn */ - return stat_acc(env->me_txn, dest, bytes); +typedef struct gcu_rid_result { + int err; + txnid_t rid; +} rid_t; + +static rid_t get_rid_for_reclaimed(MDBX_txn *txn, gcu_t *ctx, + const size_t left) { + rid_t r; + if (is_lifo(txn)) { + if (txn->tw.gc.reclaimed == nullptr) { + txn->tw.gc.reclaimed = txl_alloc(); + if (unlikely(!txn->tw.gc.reclaimed)) { + r.err = MDBX_ENOMEM; + goto return_error; + } + } + if 
(MDBX_PNL_GETSIZE(txn->tw.gc.reclaimed) < txl_max && + left > (MDBX_PNL_GETSIZE(txn->tw.gc.reclaimed) - ctx->reused_slot) * + txn->env->maxgc_large1page && + !ctx->dense) { + /* Hужен свободный для для сохранения списка страниц. */ + bool need_cleanup = false; + txnid_t snap_oldest = 0; + retry_rid: + do { + r.err = gc_alloc_ex(&ctx->cursor, 0, ALLOC_RESERVE).err; + snap_oldest = txn->env->lck->cached_oldest.weak; + if (likely(r.err == MDBX_SUCCESS)) { + TRACE("%s: took @%" PRIaTXN " from GC", dbg_prefix(ctx), + MDBX_PNL_LAST(txn->tw.gc.reclaimed)); + need_cleanup = true; + } + } while (r.err == MDBX_SUCCESS && + MDBX_PNL_GETSIZE(txn->tw.gc.reclaimed) < txl_max && + left > + (MDBX_PNL_GETSIZE(txn->tw.gc.reclaimed) - ctx->reused_slot) * + txn->env->maxgc_large1page); + + if (likely(r.err == MDBX_SUCCESS)) { + TRACE("%s: got enough from GC.", dbg_prefix(ctx)); + goto return_continue; + } else if (unlikely(r.err != MDBX_NOTFOUND)) + /* LY: some troubles... */ + goto return_error; + + if (MDBX_PNL_GETSIZE(txn->tw.gc.reclaimed)) { + if (need_cleanup) { + txl_sort(txn->tw.gc.reclaimed); + ctx->cleaned_slot = 0; + } + ctx->rid = MDBX_PNL_LAST(txn->tw.gc.reclaimed); + } else { + tASSERT(txn, txn->tw.gc.last_reclaimed == 0); + if (unlikely(txn_snapshot_oldest(txn) != snap_oldest)) + /* should retry gc_alloc_ex() + * if the oldest reader changes since the last attempt */ + goto retry_rid; + /* no reclaimable GC entries, + * therefore no entries with ID < mdbx_find_oldest(txn) */ + txn->tw.gc.last_reclaimed = ctx->rid = snap_oldest; + TRACE("%s: none recycled yet, set rid to @%" PRIaTXN, dbg_prefix(ctx), + ctx->rid); + } + + /* В GC нет годных к переработке записей, + * будем использовать свободные id в обратном порядке. 
*/ + while (MDBX_PNL_GETSIZE(txn->tw.gc.reclaimed) < txl_max && + left > + (MDBX_PNL_GETSIZE(txn->tw.gc.reclaimed) - ctx->reused_slot) * + txn->env->maxgc_large1page) { + if (unlikely(ctx->rid <= MIN_TXNID)) { + if (unlikely(MDBX_PNL_GETSIZE(txn->tw.gc.reclaimed) <= + ctx->reused_slot)) { + NOTICE("** restart: reserve depleted (reused_gc_slot %zu >= " + "gc.reclaimed %zu)", + ctx->reused_slot, MDBX_PNL_GETSIZE(txn->tw.gc.reclaimed)); + goto return_restart; + } + break; + } - MDBX_txn *tmp_txn; - err = mdbx_txn_begin((MDBX_env *)env, NULL, MDBX_TXN_RDONLY, &tmp_txn); - if (unlikely(err != MDBX_SUCCESS)) - return err; + tASSERT(txn, ctx->rid >= MIN_TXNID && ctx->rid <= MAX_TXNID); + ctx->rid -= 1; + MDBX_val key = {&ctx->rid, sizeof(ctx->rid)}, data; + r.err = cursor_seek(&ctx->cursor, &key, &data, MDBX_SET_KEY).err; + if (unlikely(r.err == MDBX_SUCCESS)) { + DEBUG("%s: GC's id %" PRIaTXN " is present, going to first", + dbg_prefix(ctx), ctx->rid); + r.err = outer_first(&ctx->cursor, &key, nullptr); + if (unlikely(r.err != MDBX_SUCCESS || + key.iov_len != sizeof(txnid_t))) { + ERROR("%s/%d: %s %u", "MDBX_CORRUPTED", MDBX_CORRUPTED, + "invalid GC-key size", (unsigned)key.iov_len); + r.err = MDBX_CORRUPTED; + goto return_error; + } + const txnid_t gc_first = unaligned_peek_u64(4, key.iov_base); + if (unlikely(gc_first <= MIN_TXNID)) { + DEBUG("%s: no free GC's id(s) less than %" PRIaTXN + " (going dense-mode)", + dbg_prefix(ctx), ctx->rid); + ctx->dense = true; + goto return_restart; + } + ctx->rid = gc_first - 1; + } - const int rc = stat_acc(tmp_txn, dest, bytes); - err = mdbx_txn_abort(tmp_txn); - if (unlikely(err != MDBX_SUCCESS)) - return err; - return rc; -} + tASSERT(txn, !ctx->dense); + r.err = txl_append(&txn->tw.gc.reclaimed, ctx->rid); + if (unlikely(r.err != MDBX_SUCCESS)) + goto return_error; -__cold int mdbx_dbi_dupsort_depthmask(const MDBX_txn *txn, MDBX_dbi dbi, - uint32_t *mask) { - int rc = check_txn(txn, MDBX_TXN_BLOCKED); - if (unlikely(rc != 
MDBX_SUCCESS)) - return rc; + if (ctx->reused_slot) + /* rare case, but it is better to clear and re-create GC entries + * with less fragmentation. */ + need_cleanup = true; + else + ctx->cleaned_slot += + 1 /* mark cleanup is not needed for added slot. */; - if (unlikely(!mask)) - return MDBX_EINVAL; + TRACE("%s: append @%" PRIaTXN + " to lifo-reclaimed, cleaned-gc-slot = %zu", + dbg_prefix(ctx), ctx->rid, ctx->cleaned_slot); + } - MDBX_cursor_couple cx; - rc = cursor_init(&cx.outer, txn, dbi); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; - if ((cx.outer.mc_db->md_flags & MDBX_DUPSORT) == 0) - return MDBX_RESULT_TRUE; + if (need_cleanup) { + if (ctx->cleaned_slot) { + TRACE("%s: restart to clear and re-create GC entries", + dbg_prefix(ctx)); + goto return_restart; + } + goto return_continue; + } + } - MDBX_val key, data; - rc = cursor_first(&cx.outer, &key, &data); - *mask = 0; - while (rc == MDBX_SUCCESS) { - const MDBX_node *node = page_node(cx.outer.mc_pg[cx.outer.mc_top], - cx.outer.mc_ki[cx.outer.mc_top]); - const MDBX_db *db = node_data(node); - const unsigned flags = node_flags(node); - switch (flags) { - case F_BIGDATA: - case 0: - /* single-value entry, deep = 0 */ - *mask |= 1 << 0; - break; - case F_DUPDATA: - /* single sub-page, deep = 1 */ - *mask |= 1 << 1; - break; - case F_DUPDATA | F_SUBDATA: - /* sub-tree */ - *mask |= 1 << UNALIGNED_PEEK_16(db, MDBX_db, md_depth); - break; - default: - ERROR("%s/%d: %s %u", "MDBX_CORRUPTED", MDBX_CORRUPTED, - "invalid node-size", flags); - return MDBX_CORRUPTED; + const size_t i = MDBX_PNL_GETSIZE(txn->tw.gc.reclaimed) - ctx->reused_slot; + tASSERT(txn, i > 0 && i <= MDBX_PNL_GETSIZE(txn->tw.gc.reclaimed)); + r.rid = txn->tw.gc.reclaimed[i]; + TRACE("%s: take @%" PRIaTXN " from lifo-reclaimed[%zu]", dbg_prefix(ctx), + r.rid, i); + } else { + tASSERT(txn, txn->tw.gc.reclaimed == nullptr); + if (unlikely(ctx->rid == 0)) { + ctx->rid = txn_snapshot_oldest(txn); + MDBX_val key; + r.err = 
outer_first(&ctx->cursor, &key, nullptr); + if (likely(r.err == MDBX_SUCCESS)) { + if (unlikely(key.iov_len != sizeof(txnid_t))) { + ERROR("%s/%d: %s %u", "MDBX_CORRUPTED", MDBX_CORRUPTED, + "invalid GC-key size", (unsigned)key.iov_len); + r.err = MDBX_CORRUPTED; + goto return_error; + } + const txnid_t gc_first = unaligned_peek_u64(4, key.iov_base); + if (ctx->rid >= gc_first) + ctx->rid = gc_first - 1; + if (unlikely(ctx->rid == 0)) { + ERROR("%s", "** no GC tail-space to store (going dense-mode)"); + ctx->dense = true; + goto return_restart; + } + } else if (r.err != MDBX_NOTFOUND) { + r.rid = 0; + return r; + } + txn->tw.gc.last_reclaimed = ctx->rid; + ctx->cleaned_id = ctx->rid + 1; } - rc = cursor_next(&cx.outer, &key, &data, MDBX_NEXT_NODUP); + r.rid = ctx->rid--; + TRACE("%s: take @%" PRIaTXN " from GC", dbg_prefix(ctx), r.rid); } + ++ctx->reused_slot; + r.err = MDBX_SUCCESS; + return r; - return (rc == MDBX_NOTFOUND) ? MDBX_SUCCESS : rc; +return_continue: + r.err = MDBX_SUCCESS; + r.rid = 0; + return r; + +return_restart: + r.err = MDBX_RESULT_TRUE; + r.rid = 0; + return r; + +return_error: + tASSERT(txn, r.err != MDBX_SUCCESS); + r.rid = 0; + return r; } -__cold static int env_info_snap(const MDBX_env *env, const MDBX_txn *txn, - MDBX_envinfo *out, const size_t bytes, - meta_troika_t *const troika) { - const size_t size_before_bootid = offsetof(MDBX_envinfo, mi_bootid); - const size_t size_before_pgop_stat = offsetof(MDBX_envinfo, mi_pgop_stat); - if (unlikely(env->me_flags & MDBX_FATAL_ERROR)) - return MDBX_PANIC; +/* Cleanups reclaimed GC (aka freeDB) records, saves the retired-list (aka + * freelist) of current transaction to GC, puts back into GC leftover of the + * reclaimed pages with chunking. This recursive changes the reclaimed-list, + * loose-list and retired-list. Keep trying until it stabilizes. 
+ * + * NOTE: This code is a consequence of many iterations of adding crutches (aka + * "checks and balances") to partially bypass the fundamental design problems + * inherited from LMDB. So do not try to understand it completely in order to + * avoid your madness. */ +int gc_update(MDBX_txn *txn, gcu_t *ctx) { + TRACE("\n>>> @%" PRIaTXN, txn->txnid); + MDBX_env *const env = txn->env; + ctx->cursor.next = txn->cursors[FREE_DBI]; + txn->cursors[FREE_DBI] = &ctx->cursor; + int rc; - /* is the environment open? - * (https://libmdbx.dqdkfa.ru/dead-github/issues/171) */ - if (unlikely(!env->me_map)) { - /* environment not yet opened */ -#if 1 - /* default behavior: returns the available info but zeroed the rest */ - memset(out, 0, bytes); - out->mi_geo.lower = env->me_dbgeo.lower; - out->mi_geo.upper = env->me_dbgeo.upper; - out->mi_geo.shrink = env->me_dbgeo.shrink; - out->mi_geo.grow = env->me_dbgeo.grow; - out->mi_geo.current = env->me_dbgeo.now; - out->mi_maxreaders = env->me_maxreaders; - out->mi_dxb_pagesize = env->me_psize; - out->mi_sys_pagesize = env->me_os_psize; - if (likely(bytes > size_before_bootid)) { - out->mi_bootid.current.x = bootid.x; - out->mi_bootid.current.y = bootid.y; - } - return MDBX_SUCCESS; -#else - /* some users may prefer this behavior: return appropriate error */ - return MDBX_EPERM; -#endif + // tASSERT(txn, MDBX_PNL_GETSIZE(txn->tw.retired_pages) || + // ctx->cleaned_slot < + // (txn->tw.gc.reclaimed ? + // MDBX_PNL_GETSIZE(txn->tw.gc.reclaimed) : 0) + // || ctx->cleaned_id < txn->tw.gc.last_reclaimed); + + /* txn->tw.relist[] can grow and shrink during this call. + * txn->tw.gc.last_reclaimed and txn->tw.retired_pages[] can only grow. + * But page numbers cannot disappear from txn->tw.retired_pages[]. 
*/ +#if MDBX_ENABLE_GC_EXPERIMENTAL +retry_clean_adj: + ctx->reserve_adj = 0; +#endif /* MDBX_ENABLE_GC_EXPERIMENTAL */ +retry: + ctx->loop += ctx->prev_first_unallocated == txn->geo.first_unallocated; + TRACE(">> restart, loop %u", ctx->loop); + + tASSERT(txn, pnl_check_allocated(txn->tw.relist, txn->geo.first_unallocated - + MDBX_ENABLE_REFUND)); + tASSERT(txn, dpl_check(txn)); + if (unlikely(/* paranoia */ ctx->loop > ((MDBX_DEBUG > 0) ? 12 : 42))) { + ERROR("too more loops %u, bailout", ctx->loop); + rc = MDBX_PROBLEM; + goto bailout; } - *troika = (txn && !(txn->mt_flags & MDBX_TXN_RDONLY)) ? txn->tw.troika - : meta_tap(env); - const meta_ptr_t head = meta_recent(env, troika); - const MDBX_meta *const meta0 = METAPAGE(env, 0); - const MDBX_meta *const meta1 = METAPAGE(env, 1); - const MDBX_meta *const meta2 = METAPAGE(env, 2); - out->mi_recent_txnid = head.txnid; - out->mi_meta_txnid[0] = troika->txnid[0]; - out->mi_meta_sign[0] = unaligned_peek_u64(4, meta0->mm_sign); - out->mi_meta_txnid[1] = troika->txnid[1]; - out->mi_meta_sign[1] = unaligned_peek_u64(4, meta1->mm_sign); - out->mi_meta_txnid[2] = troika->txnid[2]; - out->mi_meta_sign[2] = unaligned_peek_u64(4, meta2->mm_sign); - if (likely(bytes > size_before_bootid)) { - memcpy(&out->mi_bootid.meta[0], &meta0->mm_bootid, 16); - memcpy(&out->mi_bootid.meta[1], &meta1->mm_bootid, 16); - memcpy(&out->mi_bootid.meta[2], &meta2->mm_bootid, 16); + if (unlikely(ctx->dense || + ctx->prev_first_unallocated > txn->geo.first_unallocated)) { + rc = clean_stored_retired(txn, ctx); + if (unlikely(rc != MDBX_SUCCESS)) + goto bailout; } - const volatile MDBX_meta *txn_meta = head.ptr_v; - out->mi_last_pgno = txn_meta->mm_geo.next - 1; - out->mi_geo.current = pgno2bytes(env, txn_meta->mm_geo.now); - if (txn) { - out->mi_last_pgno = txn->mt_next_pgno - 1; - out->mi_geo.current = pgno2bytes(env, txn->mt_end_pgno); + ctx->prev_first_unallocated = txn->geo.first_unallocated; + rc = MDBX_SUCCESS; + ctx->reserved = 0; + 
ctx->cleaned_slot = 0; + ctx->reused_slot = 0; + ctx->amount = ctx->fill_idx = ~0u; + ctx->cleaned_id = 0; + ctx->rid = txn->tw.gc.last_reclaimed; + while (true) { + /* Come back here after each Put() in case retired-list changed */ + TRACE("%s", " >> continue"); - const txnid_t wanna_meta_txnid = (txn->mt_flags & MDBX_TXN_RDONLY) - ? txn->mt_txnid - : txn->mt_txnid - xMDBX_TXNID_STEP; - txn_meta = (out->mi_meta_txnid[0] == wanna_meta_txnid) ? meta0 : txn_meta; - txn_meta = (out->mi_meta_txnid[1] == wanna_meta_txnid) ? meta1 : txn_meta; - txn_meta = (out->mi_meta_txnid[2] == wanna_meta_txnid) ? meta2 : txn_meta; - } - out->mi_geo.lower = pgno2bytes(env, txn_meta->mm_geo.lower); - out->mi_geo.upper = pgno2bytes(env, txn_meta->mm_geo.upper); - out->mi_geo.shrink = pgno2bytes(env, pv2pages(txn_meta->mm_geo.shrink_pv)); - out->mi_geo.grow = pgno2bytes(env, pv2pages(txn_meta->mm_geo.grow_pv)); - out->mi_mapsize = env->me_dxb_mmap.limit; + tASSERT(txn, + pnl_check_allocated(txn->tw.relist, txn->geo.first_unallocated - + MDBX_ENABLE_REFUND)); + MDBX_val key, data; + if (is_lifo(txn)) { + if (ctx->cleaned_slot < + (txn->tw.gc.reclaimed ? MDBX_PNL_GETSIZE(txn->tw.gc.reclaimed) : 0)) { + ctx->reserved = 0; + ctx->cleaned_slot = 0; + ctx->reused_slot = 0; + ctx->fill_idx = ~0u; + /* LY: cleanup reclaimed records. 
*/ + do { + ctx->cleaned_id = txn->tw.gc.reclaimed[++ctx->cleaned_slot]; + tASSERT(txn, ctx->cleaned_slot > 0 && + ctx->cleaned_id <= env->lck->cached_oldest.weak); + key.iov_base = &ctx->cleaned_id; + key.iov_len = sizeof(ctx->cleaned_id); + rc = cursor_seek(&ctx->cursor, &key, nullptr, MDBX_SET).err; + if (rc == MDBX_NOTFOUND) + continue; + if (unlikely(rc != MDBX_SUCCESS)) + goto bailout; + rc = prepare_backlog(txn, ctx); + if (unlikely(rc != MDBX_SUCCESS)) + goto bailout; + tASSERT(txn, ctx->cleaned_id <= env->lck->cached_oldest.weak); + TRACE("%s: cleanup-reclaimed-id [%zu]%" PRIaTXN, dbg_prefix(ctx), + ctx->cleaned_slot, ctx->cleaned_id); + tASSERT(txn, *txn->cursors == &ctx->cursor); + rc = cursor_del(&ctx->cursor, 0); + if (unlikely(rc != MDBX_SUCCESS)) + goto bailout; + } while (ctx->cleaned_slot < MDBX_PNL_GETSIZE(txn->tw.gc.reclaimed)); + txl_sort(txn->tw.gc.reclaimed); + } + } else { + /* Удаляем оставшиеся вынутые из GC записи. */ + while (txn->tw.gc.last_reclaimed && + ctx->cleaned_id <= txn->tw.gc.last_reclaimed) { + rc = outer_first(&ctx->cursor, &key, nullptr); + if (rc == MDBX_NOTFOUND) + break; + if (unlikely(rc != MDBX_SUCCESS)) + goto bailout; + if (!MDBX_DISABLE_VALIDATION && + unlikely(key.iov_len != sizeof(txnid_t))) { + ERROR("%s/%d: %s %u", "MDBX_CORRUPTED", MDBX_CORRUPTED, + "invalid GC-key size", (unsigned)key.iov_len); + rc = MDBX_CORRUPTED; + goto bailout; + } + if (ctx->rid != ctx->cleaned_id) { + ctx->rid = ctx->cleaned_id; + ctx->reserved = 0; + ctx->reused_slot = 0; + } + ctx->cleaned_id = unaligned_peek_u64(4, key.iov_base); + if (ctx->cleaned_id > txn->tw.gc.last_reclaimed) + break; + rc = prepare_backlog(txn, ctx); + if (unlikely(rc != MDBX_SUCCESS)) + goto bailout; + tASSERT(txn, ctx->cleaned_id <= txn->tw.gc.last_reclaimed); + tASSERT(txn, ctx->cleaned_id <= env->lck->cached_oldest.weak); + TRACE("%s: cleanup-reclaimed-id %" PRIaTXN, dbg_prefix(ctx), + ctx->cleaned_id); + tASSERT(txn, *txn->cursors == &ctx->cursor); + rc = 
cursor_del(&ctx->cursor, 0); + if (unlikely(rc != MDBX_SUCCESS)) + goto bailout; + } + } - const MDBX_lockinfo *const lck = env->me_lck; - out->mi_maxreaders = env->me_maxreaders; - out->mi_numreaders = env->me_lck_mmap.lck - ? atomic_load32(&lck->mti_numreaders, mo_Relaxed) - : INT32_MAX; - out->mi_dxb_pagesize = env->me_psize; - out->mi_sys_pagesize = env->me_os_psize; + tASSERT(txn, + pnl_check_allocated(txn->tw.relist, txn->geo.first_unallocated - + MDBX_ENABLE_REFUND)); + tASSERT(txn, dpl_check(txn)); + if (AUDIT_ENABLED()) { + rc = audit_ex(txn, ctx->retired_stored, false); + if (unlikely(rc != MDBX_SUCCESS)) + goto bailout; + } - if (likely(bytes > size_before_bootid)) { - const uint64_t unsynced_pages = - atomic_load64(&lck->mti_unsynced_pages, mo_Relaxed) + - ((uint32_t)out->mi_recent_txnid != - atomic_load32(&lck->mti_meta_sync_txnid, mo_Relaxed)); - out->mi_unsync_volume = pgno2bytes(env, (size_t)unsynced_pages); - const uint64_t monotime_now = osal_monotime(); - uint64_t ts = atomic_load64(&lck->mti_eoos_timestamp, mo_Relaxed); - out->mi_since_sync_seconds16dot16 = - ts ? osal_monotime_to_16dot16_noUnderflow(monotime_now - ts) : 0; - ts = atomic_load64(&lck->mti_reader_check_timestamp, mo_Relaxed); - out->mi_since_reader_check_seconds16dot16 = - ts ? osal_monotime_to_16dot16_noUnderflow(monotime_now - ts) : 0; - out->mi_autosync_threshold = pgno2bytes( - env, atomic_load32(&lck->mti_autosync_threshold, mo_Relaxed)); - out->mi_autosync_period_seconds16dot16 = - osal_monotime_to_16dot16_noUnderflow( - atomic_load64(&lck->mti_autosync_period, mo_Relaxed)); - out->mi_bootid.current.x = bootid.x; - out->mi_bootid.current.y = bootid.y; - out->mi_mode = env->me_lck_mmap.lck ? 
lck->mti_envmode.weak : env->me_flags; - } + /* return suitable into unallocated space */ + if (txn_refund(txn)) { + tASSERT(txn, + pnl_check_allocated(txn->tw.relist, txn->geo.first_unallocated - + MDBX_ENABLE_REFUND)); + if (AUDIT_ENABLED()) { + rc = audit_ex(txn, ctx->retired_stored, false); + if (unlikely(rc != MDBX_SUCCESS)) + goto bailout; + } + } - if (likely(bytes > size_before_pgop_stat)) { -#if MDBX_ENABLE_PGOP_STAT - out->mi_pgop_stat.newly = - atomic_load64(&lck->mti_pgop_stat.newly, mo_Relaxed); - out->mi_pgop_stat.cow = atomic_load64(&lck->mti_pgop_stat.cow, mo_Relaxed); - out->mi_pgop_stat.clone = - atomic_load64(&lck->mti_pgop_stat.clone, mo_Relaxed); - out->mi_pgop_stat.split = - atomic_load64(&lck->mti_pgop_stat.split, mo_Relaxed); - out->mi_pgop_stat.merge = - atomic_load64(&lck->mti_pgop_stat.merge, mo_Relaxed); - out->mi_pgop_stat.spill = - atomic_load64(&lck->mti_pgop_stat.spill, mo_Relaxed); - out->mi_pgop_stat.unspill = - atomic_load64(&lck->mti_pgop_stat.unspill, mo_Relaxed); - out->mi_pgop_stat.wops = - atomic_load64(&lck->mti_pgop_stat.wops, mo_Relaxed); - out->mi_pgop_stat.prefault = - atomic_load64(&lck->mti_pgop_stat.prefault, mo_Relaxed); - out->mi_pgop_stat.mincore = - atomic_load64(&lck->mti_pgop_stat.mincore, mo_Relaxed); - out->mi_pgop_stat.msync = - atomic_load64(&lck->mti_pgop_stat.msync, mo_Relaxed); - out->mi_pgop_stat.fsync = - atomic_load64(&lck->mti_pgop_stat.fsync, mo_Relaxed); -#else - memset(&out->mi_pgop_stat, 0, sizeof(out->mi_pgop_stat)); -#endif /* MDBX_ENABLE_PGOP_STAT*/ - } + if (txn->tw.loose_pages) { + /* put loose pages into the reclaimed- or retired-list */ + rc = gcu_loose(txn, ctx); + if (unlikely(rc != MDBX_SUCCESS)) + goto bailout; + if (unlikely(txn->tw.loose_pages)) + continue; + } - txnid_t overall_latter_reader_txnid = out->mi_recent_txnid; - txnid_t self_latter_reader_txnid = overall_latter_reader_txnid; - if (env->me_lck_mmap.lck) { - for (size_t i = 0; i < out->mi_numreaders; ++i) { - const uint32_t 
pid = - atomic_load32(&lck->mti_readers[i].mr_pid, mo_AcquireRelease); - if (pid) { - const txnid_t txnid = safe64_read(&lck->mti_readers[i].mr_txnid); - if (overall_latter_reader_txnid > txnid) - overall_latter_reader_txnid = txnid; - if (pid == env->me_pid && self_latter_reader_txnid > txnid) - self_latter_reader_txnid = txnid; - } + if (unlikely(ctx->reserved > MDBX_PNL_GETSIZE(txn->tw.relist)) && + (ctx->loop < 5 || ctx->reserved - MDBX_PNL_GETSIZE(txn->tw.relist) > + env->maxgc_large1page / 2)) { + TRACE("%s: reclaimed-list changed %zu -> %zu, retry", dbg_prefix(ctx), + ctx->amount, MDBX_PNL_GETSIZE(txn->tw.relist)); +#if MDBX_ENABLE_GC_EXPERIMENTAL + ctx->reserve_adj += ctx->reserved - MDBX_PNL_GETSIZE(txn->tw.relist); +#endif /* MDBX_ENABLE_GC_EXPERIMENTAL */ + goto retry; } - } - out->mi_self_latter_reader_txnid = self_latter_reader_txnid; - out->mi_latter_reader_txnid = overall_latter_reader_txnid; + ctx->amount = MDBX_PNL_GETSIZE(txn->tw.relist); - osal_compiler_barrier(); - return MDBX_SUCCESS; -} + if (ctx->retired_stored < MDBX_PNL_GETSIZE(txn->tw.retired_pages)) { + /* store retired-list into GC */ + rc = gcu_retired(txn, ctx); + if (unlikely(rc != MDBX_SUCCESS)) + goto bailout; + continue; + } -__cold int env_info(const MDBX_env *env, const MDBX_txn *txn, MDBX_envinfo *out, - size_t bytes, meta_troika_t *troika) { - MDBX_envinfo snap; - int rc = env_info_snap(env, txn, &snap, sizeof(snap), troika); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; + tASSERT(txn, + pnl_check_allocated(txn->tw.relist, txn->geo.first_unallocated - + MDBX_ENABLE_REFUND)); + tASSERT(txn, txn->tw.loose_count == 0); - eASSERT(env, sizeof(snap) >= bytes); - while (1) { - rc = env_info_snap(env, txn, out, bytes, troika); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; - snap.mi_since_sync_seconds16dot16 = out->mi_since_sync_seconds16dot16; - snap.mi_since_reader_check_seconds16dot16 = - out->mi_since_reader_check_seconds16dot16; - if (likely(memcmp(&snap, out, bytes) == 0)) - 
return MDBX_SUCCESS; - memcpy(&snap, out, bytes); - } -} + TRACE("%s", " >> reserving"); + if (AUDIT_ENABLED()) { + rc = audit_ex(txn, ctx->retired_stored, false); + if (unlikely(rc != MDBX_SUCCESS)) + goto bailout; + } +#if MDBX_ENABLE_GC_EXPERIMENTAL + const size_t left = ctx->amount - ctx->reserved - ctx->reserve_adj; + TRACE("%s: amount %zu, reserved %zd, reserve_adj %zu, left %zd, " + "lifo-reclaimed-slots %zu, " + "reused-gc-slots %zu", + dbg_prefix(ctx), ctx->amount, ctx->reserved, ctx->reserve_adj, left, + txn->tw.gc.reclaimed ? MDBX_PNL_GETSIZE(txn->tw.gc.reclaimed) : 0, + ctx->reused_slot); +#else + const size_t left = ctx->amount - ctx->reserved; + TRACE("%s: amount %zu, reserved %zd, left %zd, " + "lifo-reclaimed-slots %zu, " + "reused-gc-slots %zu", + dbg_prefix(ctx), ctx->amount, ctx->reserved, left, + txn->tw.gc.reclaimed ? MDBX_PNL_GETSIZE(txn->tw.gc.reclaimed) : 0, + ctx->reused_slot); +#endif /* MDBX_ENABLE_GC_EXPERIMENTAL */ + if (0 >= (intptr_t)left) + break; -__cold int mdbx_preopen_snapinfo(const char *pathname, MDBX_envinfo *out, - size_t bytes) { -#if defined(_WIN32) || defined(_WIN64) - wchar_t *pathnameW = nullptr; - int rc = osal_mb2w(pathname, &pathnameW); - if (likely(rc == MDBX_SUCCESS)) { - rc = mdbx_preopen_snapinfoW(pathnameW, out, bytes); - osal_free(pathnameW); - } - return rc; -} + const rid_t rid_result = get_rid_for_reclaimed(txn, ctx, left); + if (unlikely(!rid_result.rid)) { + rc = rid_result.err; + if (likely(rc == MDBX_SUCCESS)) + continue; + if (likely(rc == MDBX_RESULT_TRUE)) + goto retry; + goto bailout; + } + tASSERT(txn, rid_result.err == MDBX_SUCCESS); + const txnid_t reservation_gc_id = rid_result.rid; -__cold int mdbx_preopen_snapinfoW(const wchar_t *pathname, MDBX_envinfo *out, - size_t bytes) { -#endif /* Windows */ - if (unlikely(!out)) - return MDBX_EINVAL; + size_t chunk = left; + if (unlikely(left > env->maxgc_large1page)) { + const size_t avail_gc_slots = + txn->tw.gc.reclaimed + ? 
MDBX_PNL_GETSIZE(txn->tw.gc.reclaimed) - ctx->reused_slot + 1 + : (ctx->rid < INT16_MAX) ? (size_t)ctx->rid + : INT16_MAX; + if (likely(avail_gc_slots > 1)) { +#if MDBX_ENABLE_BIGFOOT + chunk = env->maxgc_large1page; + if (avail_gc_slots < INT16_MAX && + unlikely(left > env->maxgc_large1page * avail_gc_slots)) + /* TODO: Можно смотреть последовательности какой длины есть в relist + * и пробовать нарезать куски соответствующего размера. + * Смысл в том, чтобы не дробить последовательности страниц, + * а использовать целиком. */ + chunk = env->maxgc_large1page + + left / (env->maxgc_large1page * avail_gc_slots) * + env->maxgc_large1page; +#else + if (chunk < env->maxgc_large1page * 2) + chunk /= 2; + else { + const size_t prefer_max_scatter = 257; + const size_t threshold = + env->maxgc_large1page * ((avail_gc_slots < prefer_max_scatter) + ? avail_gc_slots + : prefer_max_scatter); + if (left < threshold) + chunk = env->maxgc_large1page; + else { + const size_t tail = left - threshold + env->maxgc_large1page + 1; + size_t span = 1; + size_t avail = ((pgno2bytes(env, span) - PAGEHDRSZ) / + sizeof(pgno_t)) /* - 1 + span */; + if (tail > avail) { + for (size_t i = ctx->amount - span; i > 0; --i) { + if (MDBX_PNL_ASCENDING ? (txn->tw.relist[i] + span) + : (txn->tw.relist[i] - span) == + txn->tw.relist[i + span]) { + span += 1; + avail = + ((pgno2bytes(env, span) - PAGEHDRSZ) / sizeof(pgno_t)) - + 1 + span; + if (avail >= tail) + break; + } + } + } - const size_t size_before_bootid = offsetof(MDBX_envinfo, mi_bootid); - const size_t size_before_pgop_stat = offsetof(MDBX_envinfo, mi_pgop_stat); - if (unlikely(bytes != sizeof(MDBX_envinfo)) && bytes != size_before_bootid && - bytes != size_before_pgop_stat) - return MDBX_EINVAL; + chunk = (avail >= tail) ? tail - span + : (avail_gc_slots > 3 && + ctx->reused_slot < prefer_max_scatter - 3) + ? 
avail - span + : tail; + } + } +#endif /* MDBX_ENABLE_BIGFOOT */ + } + } + tASSERT(txn, chunk > 0); - memset(out, 0, bytes); - if (likely(bytes > size_before_bootid)) { - out->mi_bootid.current.x = bootid.x; - out->mi_bootid.current.y = bootid.y; - } + TRACE("%s: gc_rid %" PRIaTXN ", reused_gc_slot %zu, reservation-id " + "%" PRIaTXN, + dbg_prefix(ctx), ctx->rid, ctx->reused_slot, reservation_gc_id); - MDBX_env env; - memset(&env, 0, sizeof(env)); - env.me_pid = osal_getpid(); - const size_t os_psize = osal_syspagesize(); - if (unlikely(!is_powerof2(os_psize) || os_psize < MIN_PAGESIZE)) { - ERROR("unsuitable system pagesize %" PRIuPTR, os_psize); - return MDBX_INCOMPATIBLE; - } - out->mi_sys_pagesize = env.me_os_psize = (unsigned)os_psize; - env.me_flags = MDBX_RDONLY | MDBX_NORDAHEAD | MDBX_ACCEDE | MDBX_VALIDATION; - env.me_stuck_meta = -1; - env.me_lfd = INVALID_HANDLE_VALUE; - env.me_lazy_fd = INVALID_HANDLE_VALUE; - env.me_dsync_fd = INVALID_HANDLE_VALUE; - env.me_fd4meta = INVALID_HANDLE_VALUE; -#if defined(_WIN32) || defined(_WIN64) - env.me_data_lock_event = INVALID_HANDLE_VALUE; - env.me_overlapped_fd = INVALID_HANDLE_VALUE; -#endif /* Windows */ + TRACE("%s: chunk %zu, gc-per-ovpage %u", dbg_prefix(ctx), chunk, + env->maxgc_large1page); - int rc = env_handle_pathname(&env, pathname, 0); - if (unlikely(rc != MDBX_SUCCESS)) - goto bailout; - rc = osal_openfile(MDBX_OPEN_DXB_READ, &env, env.me_pathname.dxb, - &env.me_lazy_fd, 0); - if (unlikely(rc != MDBX_SUCCESS)) - goto bailout; + tASSERT(txn, reservation_gc_id <= env->lck->cached_oldest.weak); + if (unlikely(reservation_gc_id < MIN_TXNID || + reservation_gc_id > + atomic_load64(&env->lck->cached_oldest, mo_Relaxed))) { + ERROR("** internal error (reservation_gc_id %" PRIaTXN ")", + reservation_gc_id); + rc = MDBX_PROBLEM; + goto bailout; + } - MDBX_meta header; - rc = read_header(&env, &header, 0, 0); - if (unlikely(rc != MDBX_SUCCESS)) - goto bailout; + key.iov_len = sizeof(reservation_gc_id); + 
key.iov_base = (void *)&reservation_gc_id; + data.iov_len = (chunk + 1) * sizeof(pgno_t); + TRACE("%s: reserve %zu [%zu...%zu) @%" PRIaTXN, dbg_prefix(ctx), chunk, + ctx->reserved + 1, ctx->reserved + chunk + 1, reservation_gc_id); + prepare_backlog(txn, ctx); + rc = cursor_put(&ctx->cursor, &key, &data, MDBX_RESERVE | MDBX_NOOVERWRITE); + tASSERT(txn, + pnl_check_allocated(txn->tw.relist, txn->geo.first_unallocated - + MDBX_ENABLE_REFUND)); + if (unlikely(rc != MDBX_SUCCESS)) + goto bailout; - setup_pagesize(&env, header.mm_psize); - out->mi_dxb_pagesize = env.me_psize; - out->mi_geo.lower = pgno2bytes(&env, header.mm_geo.lower); - out->mi_geo.upper = pgno2bytes(&env, header.mm_geo.upper); - out->mi_geo.shrink = pgno2bytes(&env, pv2pages(header.mm_geo.shrink_pv)); - out->mi_geo.grow = pgno2bytes(&env, pv2pages(header.mm_geo.grow_pv)); - out->mi_geo.current = pgno2bytes(&env, header.mm_geo.now); - out->mi_last_pgno = header.mm_geo.next - 1; + zeroize_reserved(env, data); + ctx->reserved += chunk; + TRACE("%s: reserved %zu (+%zu), continue", dbg_prefix(ctx), ctx->reserved, + chunk); - const unsigned n = 0; - out->mi_recent_txnid = constmeta_txnid(&header); - out->mi_meta_sign[n] = unaligned_peek_u64(4, &header.mm_sign); - if (likely(bytes > size_before_bootid)) - memcpy(&out->mi_bootid.meta[n], &header.mm_bootid, 16); + continue; + } -bailout: - env_close(&env, false); - return rc; -} + tASSERT( + txn, + ctx->cleaned_slot == + (txn->tw.gc.reclaimed ? MDBX_PNL_GETSIZE(txn->tw.gc.reclaimed) : 0)); -__cold int mdbx_env_info_ex(const MDBX_env *env, const MDBX_txn *txn, - MDBX_envinfo *arg, size_t bytes) { - if (unlikely((env == NULL && txn == NULL) || arg == NULL)) - return MDBX_EINVAL; + TRACE("%s", " >> filling"); + /* Fill in the reserved records */ +#if MDBX_ENABLE_GC_EXPERIMENTAL + size_t excess_slots = 0; +#endif /* MDBX_ENABLE_GC_EXPERIMENTAL */ + ctx->fill_idx = + txn->tw.gc.reclaimed + ? 
MDBX_PNL_GETSIZE(txn->tw.gc.reclaimed) - ctx->reused_slot + : ctx->reused_slot; + rc = MDBX_SUCCESS; + tASSERT(txn, pnl_check_allocated(txn->tw.relist, txn->geo.first_unallocated - + MDBX_ENABLE_REFUND)); + tASSERT(txn, dpl_check(txn)); + if (ctx->amount) { + MDBX_val key, data; + key.iov_len = data.iov_len = 0; /* avoid MSVC warning */ + key.iov_base = data.iov_base = nullptr; - const size_t size_before_bootid = offsetof(MDBX_envinfo, mi_bootid); - const size_t size_before_pgop_stat = offsetof(MDBX_envinfo, mi_pgop_stat); - if (unlikely(bytes != sizeof(MDBX_envinfo)) && bytes != size_before_bootid && - bytes != size_before_pgop_stat) - return MDBX_EINVAL; + size_t left = ctx->amount, excess = 0; + if (txn->tw.gc.reclaimed == nullptr) { + tASSERT(txn, is_lifo(txn) == 0); + rc = outer_first(&ctx->cursor, &key, &data); + if (unlikely(rc != MDBX_SUCCESS)) + goto bailout; + } else { + tASSERT(txn, is_lifo(txn) != 0); + } - if (txn) { - int err = check_txn(txn, MDBX_TXN_BLOCKED - MDBX_TXN_ERROR); - if (unlikely(err != MDBX_SUCCESS)) - return err; - } - if (env) { - int err = check_env(env, false); - if (unlikely(err != MDBX_SUCCESS)) - return err; - if (txn && unlikely(txn->mt_env != env)) - return MDBX_EINVAL; - } else { - env = txn->mt_env; - } + while (true) { + txnid_t fill_gc_id; + TRACE("%s: left %zu of %zu", dbg_prefix(ctx), left, + MDBX_PNL_GETSIZE(txn->tw.relist)); + if (txn->tw.gc.reclaimed == nullptr) { + tASSERT(txn, is_lifo(txn) == 0); + fill_gc_id = unaligned_peek_u64(4, key.iov_base); + if (ctx->fill_idx == 0 || fill_gc_id > txn->tw.gc.last_reclaimed) { +#if MDBX_ENABLE_GC_EXPERIMENTAL + if (!left) + break; +#endif /* MDBX_ENABLE_GC_EXPERIMENTAL */ + NOTICE("** restart: reserve depleted (fill_idx %zu, fill_id %" PRIaTXN + " > last_reclaimed %" PRIaTXN ", left %zu", + ctx->fill_idx, fill_gc_id, txn->tw.gc.last_reclaimed, left); +#if MDBX_ENABLE_GC_EXPERIMENTAL + ctx->reserve_adj = + (ctx->reserve_adj > left) ? 
ctx->reserve_adj - left : 0; +#endif /* MDBX_ENABLE_GC_EXPERIMENTAL */ + goto retry; + } + ctx->fill_idx -= 1; + } else { + tASSERT(txn, is_lifo(txn) != 0); + if (ctx->fill_idx >= MDBX_PNL_GETSIZE(txn->tw.gc.reclaimed)) { +#if MDBX_ENABLE_GC_EXPERIMENTAL + if (!left) + break; +#endif /* MDBX_ENABLE_GC_EXPERIMENTAL */ + NOTICE("** restart: reserve depleted (fill_idx %zu >= " + "gc.reclaimed %zu, left %zu", + ctx->fill_idx, MDBX_PNL_GETSIZE(txn->tw.gc.reclaimed), left); +#if MDBX_ENABLE_GC_EXPERIMENTAL + ctx->reserve_adj = + (ctx->reserve_adj > left) ? ctx->reserve_adj - left : 0; +#endif /* MDBX_ENABLE_GC_EXPERIMENTAL */ + goto retry; + } + ctx->fill_idx += 1; + fill_gc_id = txn->tw.gc.reclaimed[ctx->fill_idx]; + TRACE("%s: seek-reservation @%" PRIaTXN " at gc.reclaimed[%zu]", + dbg_prefix(ctx), fill_gc_id, ctx->fill_idx); + key.iov_base = &fill_gc_id; + key.iov_len = sizeof(fill_gc_id); + rc = cursor_seek(&ctx->cursor, &key, &data, MDBX_SET_KEY).err; + if (unlikely(rc != MDBX_SUCCESS)) + goto bailout; + } + tASSERT(txn, + ctx->cleaned_slot == (txn->tw.gc.reclaimed + ? 
MDBX_PNL_GETSIZE(txn->tw.gc.reclaimed) + : 0)); + tASSERT(txn, + fill_gc_id > 0 && fill_gc_id <= env->lck->cached_oldest.weak); + key.iov_base = &fill_gc_id; + key.iov_len = sizeof(fill_gc_id); - meta_troika_t troika; - return env_info(env, txn, arg, bytes, &troika); -} + tASSERT(txn, data.iov_len >= sizeof(pgno_t) * 2); + size_t chunk = data.iov_len / sizeof(pgno_t) - 1; + if (unlikely(chunk > left)) { + const size_t delta = chunk - left; + excess += delta; + TRACE("%s: chunk %zu > left %zu, @%" PRIaTXN, dbg_prefix(ctx), chunk, + left, fill_gc_id); +#if MDBX_ENABLE_GC_EXPERIMENTAL + if (!left) { + excess_slots += 1; + goto next; + } +#endif /* MDBX_ENABLE_GC_EXPERIMENTAL */ + if ((ctx->loop < 5 && delta > (ctx->loop / 2)) || + delta > env->maxgc_large1page) + data.iov_len = (left + 1) * sizeof(pgno_t); + chunk = left; + } + rc = cursor_put(&ctx->cursor, &key, &data, MDBX_CURRENT | MDBX_RESERVE); + if (unlikely(rc != MDBX_SUCCESS)) + goto bailout; + zeroize_reserved(env, data); -static __inline MDBX_cmp_func *get_default_keycmp(MDBX_db_flags_t flags) { - return (flags & MDBX_REVERSEKEY) ? cmp_reverse - : (flags & MDBX_INTEGERKEY) ? cmp_int_align2 - : cmp_lexical; -} + if (unlikely(txn->tw.loose_count || + ctx->amount != MDBX_PNL_GETSIZE(txn->tw.relist))) { + NOTICE("** restart: reclaimed-list changed (%zu -> %zu, loose +%zu)", + ctx->amount, MDBX_PNL_GETSIZE(txn->tw.relist), + txn->tw.loose_count); +#if MDBX_ENABLE_GC_EXPERIMENTAL + if (ctx->loop < 5 || (ctx->loop > 10 && (ctx->loop & 1))) + goto retry_clean_adj; +#endif /* MDBX_ENABLE_GC_EXPERIMENTAL */ + goto retry; + } -static __inline MDBX_cmp_func *get_default_datacmp(MDBX_db_flags_t flags) { - return !(flags & MDBX_DUPSORT) - ? cmp_lenfast - : ((flags & MDBX_INTEGERDUP) - ? cmp_int_unaligned - : ((flags & MDBX_REVERSEDUP) ? cmp_reverse : cmp_lexical)); -} + if (unlikely(txn->tw.gc.reclaimed + ? 
ctx->cleaned_slot < + MDBX_PNL_GETSIZE(txn->tw.gc.reclaimed) + : ctx->cleaned_id < txn->tw.gc.last_reclaimed)) { + NOTICE("%s", "** restart: reclaimed-slots changed"); + goto retry; + } + if (unlikely(ctx->retired_stored != + MDBX_PNL_GETSIZE(txn->tw.retired_pages))) { + tASSERT(txn, + ctx->retired_stored < MDBX_PNL_GETSIZE(txn->tw.retired_pages)); + NOTICE("** restart: retired-list growth (%zu -> %zu)", + ctx->retired_stored, MDBX_PNL_GETSIZE(txn->tw.retired_pages)); + goto retry; + } -static int dbi_bind(MDBX_txn *txn, const size_t dbi, unsigned user_flags, - MDBX_cmp_func *keycmp, MDBX_cmp_func *datacmp) { - const MDBX_env *const env = txn->mt_env; - eASSERT(env, dbi < txn->mt_numdbs && dbi < env->me_numdbs); - eASSERT(env, dbi_state(txn, dbi) & DBI_LINDO); - eASSERT(env, env->me_db_flags[dbi] != DB_POISON); - if ((env->me_db_flags[dbi] & DB_VALID) == 0) { - eASSERT(env, !env->me_dbxs[dbi].md_cmp && !env->me_dbxs[dbi].md_dcmp && - !env->me_dbxs[dbi].md_name.iov_len && - !env->me_dbxs[dbi].md_name.iov_base && - !env->me_dbxs[dbi].md_klen_max && - !env->me_dbxs[dbi].md_klen_min && - !env->me_dbxs[dbi].md_vlen_max && - !env->me_dbxs[dbi].md_vlen_min); - } else { - eASSERT(env, !(txn->mt_dbi_state[dbi] & DBI_VALID) || - (txn->mt_dbs[dbi].md_flags | DB_VALID) == - env->me_db_flags[dbi]); - eASSERT(env, env->me_dbxs[dbi].md_name.iov_base || dbi < CORE_DBS); - } + pgno_t *dst = data.iov_base; + *dst++ = (pgno_t)chunk; + pgno_t *src = MDBX_PNL_BEGIN(txn->tw.relist) + left - chunk; + memcpy(dst, src, chunk * sizeof(pgno_t)); + pgno_t *from = src, *to = src + chunk; + TRACE("%s: fill %zu [ %zu:%" PRIaPGNO "...%zu:%" PRIaPGNO "] @%" PRIaTXN, + dbg_prefix(ctx), chunk, from - txn->tw.relist, from[0], + to - txn->tw.relist, to[-1], fill_gc_id); - /* Если dbi уже использовался, то корректными считаем четыре варианта: - * 1) user_flags равны MDBX_DB_ACCEDE - * = предполагаем что пользователь открывает существующую subDb, - * при этом код проверки не позволит установить другие 
компараторы. - * 2) user_flags нулевые, а оба компаратора пустые/нулевые или равны текущим - * = предполагаем что пользователь открывает существующую subDb - * старым способом с нулевыми с флагами по-умолчанию. - * 3) user_flags совпадают, а компараторы не заданы или те же - * = предполагаем что пользователь открывает subDb указывая все параметры; - * 4) user_flags отличаются, но subDb пустая и задан флаг MDBX_CREATE - * = предполагаем что пользователь пересоздает subDb; - */ - if ((user_flags & ~MDBX_CREATE) != - (unsigned)(env->me_db_flags[dbi] & DB_PERSISTENT_FLAGS)) { - /* flags are differs, check other conditions */ - if ((!user_flags && (!keycmp || keycmp == env->me_dbxs[dbi].md_cmp) && - (!datacmp || datacmp == env->me_dbxs[dbi].md_dcmp)) || - user_flags == MDBX_DB_ACCEDE) { - user_flags = env->me_db_flags[dbi] & DB_PERSISTENT_FLAGS; - } else if ((user_flags & MDBX_CREATE) == 0) - return /* FIXME: return extended info */ MDBX_INCOMPATIBLE; - else { - eASSERT(env, env->me_db_flags[dbi] & DB_VALID); - if (txn->mt_dbi_state[dbi] & DBI_STALE) { - int err = fetch_sdb(txn, dbi); - if (unlikely(err == MDBX_SUCCESS)) - return err; + left -= chunk; + if (AUDIT_ENABLED()) { + rc = audit_ex(txn, ctx->retired_stored + ctx->amount - left, true); + if (unlikely(rc != MDBX_SUCCESS)) + goto bailout; } - eASSERT(env, - (txn->mt_dbi_state[dbi] & (DBI_LINDO | DBI_VALID | DBI_STALE)) == - (DBI_LINDO | DBI_VALID)); - if (unlikely(txn->mt_dbs[dbi].md_leaf_pages)) - return /* FIXME: return extended info */ MDBX_INCOMPATIBLE; - /* Пересоздаём subDB если там пусто */ - if (unlikely(txn->mt_cursors[dbi])) - return MDBX_DANGLING_DBI; - env->me_db_flags[dbi] = DB_POISON; - atomic_store32(&env->me_dbi_seqs[dbi], dbi_seq_next(env, MAIN_DBI), - mo_AcquireRelease); +#if MDBX_ENABLE_GC_EXPERIMENTAL + next: +#else + if (left == 0) + break; +#endif /* MDBX_ENABLE_GC_EXPERIMENTAL */ - const uint32_t seq = dbi_seq_next(env, dbi); - const uint16_t db_flags = user_flags & DB_PERSISTENT_FLAGS; - 
eASSERT(env, txn->mt_dbs[dbi].md_depth == 0 && - txn->mt_dbs[dbi].md_entries == 0 && - txn->mt_dbs[dbi].md_root == P_INVALID); - env->me_dbxs[dbi].md_cmp = - keycmp ? keycmp : get_default_keycmp(user_flags); - env->me_dbxs[dbi].md_dcmp = - datacmp ? datacmp : get_default_datacmp(user_flags); - txn->mt_dbs[dbi].md_flags = db_flags; - txn->mt_dbs[dbi].md_xsize = 0; - if (unlikely(setup_sdb(&env->me_dbxs[dbi], &txn->mt_dbs[dbi], - env->me_psize))) { - txn->mt_dbi_state[dbi] = DBI_LINDO; - txn->mt_flags |= MDBX_TXN_ERROR; - return MDBX_PROBLEM; + if (txn->tw.gc.reclaimed == nullptr) { + tASSERT(txn, is_lifo(txn) == 0); + rc = outer_next(&ctx->cursor, &key, &data, MDBX_NEXT); + if (unlikely(rc != MDBX_SUCCESS)) { +#if MDBX_ENABLE_GC_EXPERIMENTAL + if (rc == MDBX_NOTFOUND && !left) { + rc = MDBX_SUCCESS; + break; + } +#endif /* MDBX_ENABLE_GC_EXPERIMENTAL */ + goto bailout; + } + } else { + tASSERT(txn, is_lifo(txn) != 0); } + } - env->me_db_flags[dbi] = db_flags | DB_VALID; - atomic_store32(&env->me_dbi_seqs[dbi], seq, mo_AcquireRelease); - txn->mt_dbi_seqs[dbi] = seq; - txn->mt_dbi_state[dbi] = DBI_LINDO | DBI_VALID | DBI_CREAT | DBI_DIRTY; - txn->mt_flags |= MDBX_TXN_DIRTY; + if (excess) { +#if MDBX_ENABLE_GC_EXPERIMENTAL + size_t n = excess, adj = excess; + while (n >= env->maxgc_large1page) + adj -= n /= env->maxgc_large1page; + ctx->reserve_adj += adj; + TRACE("%s: extra %zu reserved space, adj +%zu (%zu)", dbg_prefix(ctx), + excess, adj, ctx->reserve_adj); +#endif /* MDBX_ENABLE_GC_EXPERIMENTAL */ } } - if (!keycmp) - keycmp = (env->me_db_flags[dbi] & DB_VALID) - ? 
env->me_dbxs[dbi].md_cmp - : get_default_keycmp(user_flags); - if (env->me_dbxs[dbi].md_cmp != keycmp) { - if (env->me_db_flags[dbi] & DB_VALID) - return MDBX_EINVAL; - env->me_dbxs[dbi].md_cmp = keycmp; + tASSERT(txn, rc == MDBX_SUCCESS); + if (unlikely(txn->tw.loose_count != 0 || + ctx->amount != MDBX_PNL_GETSIZE(txn->tw.relist))) { + NOTICE("** restart: got %zu loose pages (reclaimed-list %zu -> %zu)", + txn->tw.loose_count, ctx->amount, MDBX_PNL_GETSIZE(txn->tw.relist)); + goto retry; } - if (!datacmp) - datacmp = (env->me_db_flags[dbi] & DB_VALID) - ? env->me_dbxs[dbi].md_dcmp - : get_default_datacmp(user_flags); - if (env->me_dbxs[dbi].md_dcmp != datacmp) { - if (env->me_db_flags[dbi] & DB_VALID) - return MDBX_EINVAL; - env->me_dbxs[dbi].md_dcmp = datacmp; +#if MDBX_ENABLE_GC_EXPERIMENTAL + if (unlikely(excess_slots)) { + const bool will_retry = ctx->loop < 5 || excess_slots > 1; + NOTICE("** %s: reserve excess (excess-slots %zu, filled-slot %zu, adj %zu, " + "loop %zu)", + will_retry ? "restart" : "ignore", excess_slots, ctx->fill_idx, + ctx->reserve_adj, ctx->loop); + if (will_retry) + goto retry; } +#else + if (unlikely(ctx->fill_idx != (txn->tw.gc.reclaimed + ? MDBX_PNL_GETSIZE(txn->tw.gc.reclaimed) + : 0))) { + const bool will_retry = ctx->loop < 9; + NOTICE("** %s: reserve excess (filled-idx %zu, loop %u)", + will_retry ? "restart" : "ignore", ctx->fill_idx, ctx->loop); + if (will_retry) + goto retry; + } +#endif /* MDBX_ENABLE_GC_EXPERIMENTAL */ - return MDBX_SUCCESS; -} - -static __inline size_t dbi_namelen(const MDBX_val name) { - return (name.iov_len > sizeof(struct mdbx_defer_free_item)) - ? 
name.iov_len - : sizeof(struct mdbx_defer_free_item); -} + tASSERT(txn, txn->tw.gc.reclaimed == nullptr || + ctx->cleaned_slot == MDBX_PNL_GETSIZE(txn->tw.gc.reclaimed)); -static int dbi_open_locked(MDBX_txn *txn, unsigned user_flags, MDBX_dbi *dbi, - MDBX_cmp_func *keycmp, MDBX_cmp_func *datacmp, - MDBX_val name) { - MDBX_env *const env = txn->mt_env; +bailout: + txn->cursors[FREE_DBI] = ctx->cursor.next; - /* Cannot mix named table(s) with DUPSORT flags */ - tASSERT(txn, - (txn->mt_dbi_state[MAIN_DBI] & (DBI_LINDO | DBI_VALID | DBI_STALE)) == - (DBI_LINDO | DBI_VALID)); - if (unlikely(txn->mt_dbs[MAIN_DBI].md_flags & MDBX_DUPSORT)) { - if (unlikely((user_flags & MDBX_CREATE) == 0)) - return MDBX_NOTFOUND; - if (unlikely(txn->mt_dbs[MAIN_DBI].md_leaf_pages)) - /* В MainDB есть записи, либо она уже использовалась. */ - return MDBX_INCOMPATIBLE; + MDBX_PNL_SETSIZE(txn->tw.relist, 0); +#if MDBX_ENABLE_PROFGC + env->lck->pgops.gc_prof.wloops += (uint32_t)ctx->loop; +#endif /* MDBX_ENABLE_PROFGC */ + TRACE("<<< %u loops, rc = %d", ctx->loop, rc); + return rc; +} +/// \copyright SPDX-License-Identifier: Apache-2.0 +/// \author Леонид Юрьев aka Leonid Yuriev \date 2015-2024 - /* Пересоздаём MainDB когда там пусто. 
*/ - tASSERT(txn, txn->mt_dbs[MAIN_DBI].md_depth == 0 && - txn->mt_dbs[MAIN_DBI].md_entries == 0 && - txn->mt_dbs[MAIN_DBI].md_root == P_INVALID); - if (unlikely(txn->mt_cursors[MAIN_DBI])) - return MDBX_DANGLING_DBI; - env->me_db_flags[MAIN_DBI] = DB_POISON; - atomic_store32(&env->me_dbi_seqs[MAIN_DBI], dbi_seq_next(env, MAIN_DBI), - mo_AcquireRelease); - const uint32_t seq = dbi_seq_next(env, MAIN_DBI); - const uint16_t main_flags = - txn->mt_dbs[MAIN_DBI].md_flags & (MDBX_REVERSEKEY | MDBX_INTEGERKEY); - env->me_dbxs[MAIN_DBI].md_cmp = get_default_keycmp(main_flags); - env->me_dbxs[MAIN_DBI].md_dcmp = get_default_datacmp(main_flags); - txn->mt_dbs[MAIN_DBI].md_flags = main_flags; - txn->mt_dbs[MAIN_DBI].md_xsize = 0; - if (unlikely(setup_sdb(&env->me_dbxs[MAIN_DBI], &txn->mt_dbs[MAIN_DBI], - env->me_psize) != MDBX_SUCCESS)) { - txn->mt_dbi_state[MAIN_DBI] = DBI_LINDO; - txn->mt_flags |= MDBX_TXN_ERROR; - env->me_flags |= MDBX_FATAL_ERROR; - return MDBX_FATAL_ERROR; - } - env->me_db_flags[MAIN_DBI] = main_flags | DB_VALID; - txn->mt_dbi_seqs[MAIN_DBI] = - atomic_store32(&env->me_dbi_seqs[MAIN_DBI], seq, mo_AcquireRelease); - txn->mt_dbi_state[MAIN_DBI] |= DBI_DIRTY; - txn->mt_flags |= MDBX_TXN_DIRTY; - } - - tASSERT(txn, env->me_dbxs[MAIN_DBI].md_cmp); +static void mdbx_init(void); +static void mdbx_fini(void); - /* Is the DB already open? */ - size_t slot = env->me_numdbs; - for (size_t scan = CORE_DBS; scan < env->me_numdbs; ++scan) { - if ((env->me_db_flags[scan] & DB_VALID) == 0) { - /* Remember this free slot */ - slot = (slot < scan) ? 
slot : scan; - continue; - } - if (!env->me_dbxs[MAIN_DBI].md_cmp(&name, &env->me_dbxs[scan].md_name)) { - slot = scan; - int err = dbi_check(txn, slot); - if (err == MDBX_BAD_DBI && - txn->mt_dbi_state[slot] == (DBI_OLDEN | DBI_LINDO)) { - /* хендл использовался, стал невалидным, - * но теперь явно пере-открывается в этой транзакци */ - eASSERT(env, !txn->mt_cursors[slot]); - txn->mt_dbi_state[slot] = DBI_LINDO; - err = dbi_check(txn, slot); - } - if (err == MDBX_SUCCESS) { - err = dbi_bind(txn, slot, user_flags, keycmp, datacmp); - if (likely(err == MDBX_SUCCESS)) { - goto done; - } - } - return err; - } - } +/*----------------------------------------------------------------------------*/ +/* mdbx constructor/destructor */ - /* Fail, if no free slot and max hit */ - if (unlikely(slot >= env->me_maxdbs)) - return MDBX_DBS_FULL; +#if defined(_WIN32) || defined(_WIN64) - if (env->me_numdbs == slot) - eASSERT(env, !env->me_db_flags[slot] && - !env->me_dbxs[slot].md_name.iov_len && - !env->me_dbxs[slot].md_name.iov_base); +#if MDBX_BUILD_SHARED_LIBRARY +#if MDBX_WITHOUT_MSVC_CRT && defined(NDEBUG) +/* DEBUG/CHECKED builds still require MSVC's CRT for runtime checks. + * + * Define dll's entry point only for Release build when NDEBUG is defined and + * MDBX_WITHOUT_MSVC_CRT=ON. 
if the entry point isn't defined then MSVC's will + * automatically use DllMainCRTStartup() from CRT library, which also + * automatically call DllMain() from our mdbx.dll */ +#pragma comment(linker, "/ENTRY:DllMain") +#endif /* MDBX_WITHOUT_MSVC_CRT */ - env->me_db_flags[slot] = DB_POISON; - atomic_store32(&env->me_dbi_seqs[slot], dbi_seq_next(env, slot), - mo_AcquireRelease); - memset(&env->me_dbxs[slot], 0, sizeof(env->me_dbxs[slot])); - if (env->me_numdbs == slot) - env->me_numdbs = (unsigned)slot + 1; - eASSERT(env, slot < env->me_numdbs); +BOOL APIENTRY DllMain(HANDLE module, DWORD reason, LPVOID reserved) +#else +#if !MDBX_MANUAL_MODULE_HANDLER +static +#endif /* !MDBX_MANUAL_MODULE_HANDLER */ + void NTAPI + mdbx_module_handler(PVOID module, DWORD reason, PVOID reserved) +#endif /* MDBX_BUILD_SHARED_LIBRARY */ +{ + (void)reserved; + switch (reason) { + case DLL_PROCESS_ATTACH: + windows_import(); + mdbx_init(); + break; + case DLL_PROCESS_DETACH: + mdbx_fini(); + break; - int err = dbi_check(txn, slot); - eASSERT(env, err == MDBX_BAD_DBI); - if (err != MDBX_BAD_DBI) - return MDBX_PROBLEM; - - /* Find the DB info */ - MDBX_val body; - MDBX_cursor_couple cx; - int rc = cursor_init(&cx.outer, txn, MAIN_DBI); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; - rc = cursor_set(&cx.outer, &name, &body, MDBX_SET).err; - if (unlikely(rc != MDBX_SUCCESS)) { - if (rc != MDBX_NOTFOUND || !(user_flags & MDBX_CREATE)) - return rc; - } else { - /* make sure this is actually a table */ - MDBX_node *node = page_node(cx.outer.mc_pg[cx.outer.mc_top], - cx.outer.mc_ki[cx.outer.mc_top]); - if (unlikely((node_flags(node) & (F_DUPDATA | F_SUBDATA)) != F_SUBDATA)) - return MDBX_INCOMPATIBLE; - if (!MDBX_DISABLE_VALIDATION && unlikely(body.iov_len != sizeof(MDBX_db))) { - ERROR("%s/%d: %s %zu", "MDBX_CORRUPTED", MDBX_CORRUPTED, - "invalid subDb node size", body.iov_len); - return MDBX_CORRUPTED; - } - memcpy(&txn->mt_dbs[slot], body.iov_base, sizeof(MDBX_db)); + case 
DLL_THREAD_ATTACH: + break; + case DLL_THREAD_DETACH: + rthc_thread_dtor(module); + break; } +#if MDBX_BUILD_SHARED_LIBRARY + return TRUE; +#endif +} - /* Done here so we cannot fail after creating a new DB */ - void *clone = nullptr; - if (name.iov_len) { - clone = osal_malloc(dbi_namelen(name)); - if (unlikely(!clone)) - return MDBX_ENOMEM; - name.iov_base = memcpy(clone, name.iov_base, name.iov_len); - } else - name.iov_base = ""; +#if !MDBX_BUILD_SHARED_LIBRARY && !MDBX_MANUAL_MODULE_HANDLER +#if defined(_MSC_VER) +# pragma const_seg(push) +# pragma data_seg(push) - uint8_t dbi_state = DBI_LINDO | DBI_VALID | DBI_FRESH; - if (unlikely(rc)) { - /* MDBX_NOTFOUND and MDBX_CREATE: Create new DB */ - tASSERT(txn, rc == MDBX_NOTFOUND); - body.iov_base = - memset(&txn->mt_dbs[slot], 0, body.iov_len = sizeof(MDBX_db)); - txn->mt_dbs[slot].md_root = P_INVALID; - txn->mt_dbs[slot].md_mod_txnid = txn->mt_txnid; - txn->mt_dbs[slot].md_flags = user_flags & DB_PERSISTENT_FLAGS; - WITH_CURSOR_TRACKING( - cx.outer, rc = cursor_put_checklen(&cx.outer, &name, &body, - F_SUBDATA | MDBX_NOOVERWRITE)); - if (unlikely(rc != MDBX_SUCCESS)) - goto bailout; +# ifndef _M_IX86 + /* kick a linker to create the TLS directory if not already done */ +# pragma comment(linker, "/INCLUDE:_tls_used") + /* Force some symbol references. */ +# pragma comment(linker, "/INCLUDE:mdbx_tls_anchor") + /* specific const-segment for WIN64 */ +# pragma const_seg(".CRT$XLB") + const +# else + /* kick a linker to create the TLS directory if not already done */ +# pragma comment(linker, "/INCLUDE:__tls_used") + /* Force some symbol references. 
*/ +# pragma comment(linker, "/INCLUDE:_mdbx_tls_anchor") + /* specific data-segment for WIN32 */ +# pragma data_seg(".CRT$XLB") +# endif - dbi_state |= DBI_DIRTY | DBI_CREAT; - txn->mt_flags |= MDBX_TXN_DIRTY; - tASSERT(txn, (txn->mt_dbi_state[MAIN_DBI] & DBI_DIRTY) != 0); - } + __declspec(allocate(".CRT$XLB")) PIMAGE_TLS_CALLBACK mdbx_tls_anchor = mdbx_module_handler; +# pragma data_seg(pop) +# pragma const_seg(pop) - /* Got info, register DBI in this txn */ - const uint32_t seq = dbi_seq_next(env, slot); - eASSERT(env, - env->me_db_flags[slot] == DB_POISON && !txn->mt_cursors[slot] && - (txn->mt_dbi_state[slot] & (DBI_LINDO | DBI_VALID)) == DBI_LINDO); - txn->mt_dbi_state[slot] = dbi_state; - memcpy(&txn->mt_dbs[slot], body.iov_base, sizeof(txn->mt_dbs[slot])); - env->me_db_flags[slot] = txn->mt_dbs[slot].md_flags; - rc = dbi_bind(txn, slot, user_flags, keycmp, datacmp); - if (unlikely(rc != MDBX_SUCCESS)) - goto bailout; +#elif defined(__GNUC__) +# ifndef _M_IX86 + const +# endif + PIMAGE_TLS_CALLBACK mdbx_tls_anchor __attribute__((__section__(".CRT$XLB"), used)) = mdbx_module_handler; +#else +# error FIXME +#endif +#endif /* !MDBX_BUILD_SHARED_LIBRARY && !MDBX_MANUAL_MODULE_HANDLER */ - env->me_dbxs[slot].md_name = name; - env->me_db_flags[slot] = txn->mt_dbs[slot].md_flags | DB_VALID; - txn->mt_dbi_seqs[slot] = - atomic_store32(&env->me_dbi_seqs[slot], seq, mo_AcquireRelease); +#else -done: - *dbi = (MDBX_dbi)slot; - tASSERT(txn, - slot < txn->mt_numdbs && (env->me_db_flags[slot] & DB_VALID) != 0); - eASSERT(env, dbi_check(txn, slot) == MDBX_SUCCESS); - return MDBX_SUCCESS; +#if defined(__linux__) || defined(__gnu_linux__) +#include -bailout: - eASSERT(env, !txn->mt_cursors[slot] && !env->me_dbxs[slot].md_name.iov_len && - !env->me_dbxs[slot].md_name.iov_base); - txn->mt_dbi_state[slot] &= DBI_LINDO | DBI_OLDEN; - env->me_db_flags[slot] = 0; - osal_free(clone); - if (slot + 1 == env->me_numdbs) - txn->mt_numdbs = env->me_numdbs = (unsigned)slot; - return rc; 
+MDBX_EXCLUDE_FOR_GPROF +__cold static uint8_t probe_for_WSL(const char *tag) { + const char *const WSL = strstr(tag, "WSL"); + if (WSL && WSL[3] >= '2' && WSL[3] <= '9') + return WSL[3] - '0'; + const char *const wsl = strstr(tag, "wsl"); + if (wsl && wsl[3] >= '2' && wsl[3] <= '9') + return wsl[3] - '0'; + if (WSL || wsl || strcasestr(tag, "Microsoft")) + /* Expecting no new kernel within WSL1, either it will explicitly + * marked by an appropriate WSL-version hint. */ + return (globals.linux_kernel_version < /* 4.19.x */ 0x04130000) ? 1 : 2; + return 0; } +#endif /* Linux */ -static int dbi_open(MDBX_txn *txn, const MDBX_val *const name, - unsigned user_flags, MDBX_dbi *dbi, MDBX_cmp_func *keycmp, - MDBX_cmp_func *datacmp) { - if (unlikely(!dbi)) - return MDBX_EINVAL; - *dbi = 0; - if (unlikely((user_flags & ~DB_USABLE_FLAGS) != 0)) - return MDBX_EINVAL; - - int rc = check_txn(txn, MDBX_TXN_BLOCKED); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; - - if ((user_flags & MDBX_CREATE) && unlikely(txn->mt_flags & MDBX_TXN_RDONLY)) - return MDBX_EACCESS; +#ifdef ENABLE_GPROF +extern void _mcleanup(void); +extern void monstartup(unsigned long, unsigned long); +extern void _init(void); +extern void _fini(void); +extern void __gmon_start__(void) __attribute__((__weak__)); +#endif /* ENABLE_GPROF */ - switch (user_flags & (MDBX_INTEGERDUP | MDBX_DUPFIXED | MDBX_DUPSORT | - MDBX_REVERSEDUP | MDBX_ACCEDE)) { - case MDBX_ACCEDE: - if ((user_flags & MDBX_CREATE) == 0) - break; - __fallthrough /* fall through */; - default: - return MDBX_EINVAL; +MDBX_EXCLUDE_FOR_GPROF +__cold static __attribute__((__constructor__)) void +mdbx_global_constructor(void) { +#ifdef ENABLE_GPROF + if (!&__gmon_start__) + monstartup((uintptr_t)&_init, (uintptr_t)&_fini); +#endif /* ENABLE_GPROF */ - case MDBX_DUPSORT: - case MDBX_DUPSORT | MDBX_REVERSEDUP: - case MDBX_DUPSORT | MDBX_DUPFIXED: - case MDBX_DUPSORT | MDBX_DUPFIXED | MDBX_REVERSEDUP: - case MDBX_DUPSORT | MDBX_DUPFIXED | 
MDBX_INTEGERDUP: - case MDBX_DUPSORT | MDBX_DUPFIXED | MDBX_INTEGERDUP | MDBX_REVERSEDUP: - case MDBX_DB_DEFAULTS: - break; +#if defined(__linux__) || defined(__gnu_linux__) + struct utsname buffer; + if (uname(&buffer) == 0) { + int i = 0; + char *p = buffer.release; + while (*p && i < 4) { + if (*p >= '0' && *p <= '9') { + long number = strtol(p, &p, 10); + if (number > 0) { + if (number > 255) + number = 255; + globals.linux_kernel_version += number << (24 - i * 8); + } + ++i; + } else { + ++p; + } + } + /* "Official" way of detecting WSL1 but not WSL2 + * https://github.com/Microsoft/WSL/issues/423#issuecomment-221627364 + * + * WARNING: False negative detection of WSL1 will result in DATA LOSS! + * So, the REQUIREMENTS for this code: + * 1. MUST detect WSL1 without false-negatives. + * 2. DESIRABLE detect WSL2 but without the risk of violating the first. */ + globals.running_on_WSL1 = probe_for_WSL(buffer.version) == 1 || + probe_for_WSL(buffer.sysname) == 1 || + probe_for_WSL(buffer.release) == 1; } - tASSERT(txn, db_check_flags((uint16_t)user_flags)); +#endif /* Linux */ - /* main table? */ - if (unlikely(name == MDBX_CHK_MAIN || name->iov_base == MDBX_CHK_MAIN)) { - rc = dbi_bind(txn, MAIN_DBI, user_flags, keycmp, datacmp); - if (likely(rc == MDBX_SUCCESS)) - *dbi = MAIN_DBI; - return rc; - } - if (unlikely(name == MDBX_CHK_GC || name->iov_base == MDBX_CHK_GC)) { - rc = dbi_bind(txn, FREE_DBI, user_flags, keycmp, datacmp); - if (likely(rc == MDBX_SUCCESS)) - *dbi = FREE_DBI; - return rc; - } - if (unlikely(name == MDBX_CHK_META || name->iov_base == MDBX_CHK_META)) - return MDBX_EINVAL; - if (unlikely(name->iov_len > - txn->mt_env->me_leaf_nodemax - NODESIZE - sizeof(MDBX_db))) - return MDBX_EINVAL; + mdbx_init(); +} -#if MDBX_ENABLE_DBI_LOCKFREE - /* Is the DB already open? 
*/ - const MDBX_env *const env = txn->mt_env; - size_t free_slot = env->me_numdbs; - for (size_t i = CORE_DBS; i < env->me_numdbs; ++i) { - retry: - if ((env->me_db_flags[i] & DB_VALID) == 0) { - free_slot = i; - continue; - } +MDBX_EXCLUDE_FOR_GPROF +__cold static __attribute__((__destructor__)) void +mdbx_global_destructor(void) { + mdbx_fini(); +#ifdef ENABLE_GPROF + if (!&__gmon_start__) + _mcleanup(); +#endif /* ENABLE_GPROF */ +} - const uint32_t snap_seq = - atomic_load32(&env->me_dbi_seqs[i], mo_AcquireRelease); - const uint16_t snap_flags = env->me_db_flags[i]; - const MDBX_val snap_name = env->me_dbxs[i].md_name; - if (user_flags != MDBX_ACCEDE && - (((user_flags ^ snap_flags) & DB_PERSISTENT_FLAGS) || - (keycmp && keycmp != env->me_dbxs[i].md_cmp) || - (datacmp && datacmp != env->me_dbxs[i].md_dcmp))) - continue; - const uint32_t main_seq = - atomic_load32(&env->me_dbi_seqs[MAIN_DBI], mo_AcquireRelease); - MDBX_cmp_func *const snap_cmp = env->me_dbxs[MAIN_DBI].md_cmp; - if (unlikely(!(snap_flags & DB_VALID) || !snap_name.iov_base || - !snap_name.iov_len || !snap_cmp)) - continue; +#endif /* ! 
Windows */ - const bool name_match = snap_cmp(&snap_name, name) == 0; - osal_flush_incoherent_cpu_writeback(); - if (unlikely(snap_seq != - atomic_load32(&env->me_dbi_seqs[i], mo_AcquireRelease) || - main_seq != atomic_load32(&env->me_dbi_seqs[MAIN_DBI], - mo_AcquireRelease) || - snap_flags != env->me_db_flags[i] || - snap_name.iov_base != env->me_dbxs[i].md_name.iov_base || - snap_name.iov_len != env->me_dbxs[i].md_name.iov_len)) - goto retry; - if (name_match) { - rc = dbi_check(txn, i); - if (rc == MDBX_BAD_DBI && - txn->mt_dbi_state[i] == (DBI_OLDEN | DBI_LINDO)) { - /* хендл использовался, стал невалидным, - * но теперь явно пере-открывается в этой транзакци */ - eASSERT(env, !txn->mt_cursors[i]); - txn->mt_dbi_state[i] = DBI_LINDO; - rc = dbi_check(txn, i); - } - if (likely(rc == MDBX_SUCCESS)) { - rc = dbi_bind(txn, i, user_flags, keycmp, datacmp); - if (likely(rc == MDBX_SUCCESS)) - *dbi = (MDBX_dbi)i; - } - return rc; - } - } +/******************************************************************************/ - /* Fail, if no free slot and max hit */ - if (unlikely(free_slot >= env->me_maxdbs)) - return MDBX_DBS_FULL; -#endif /* MDBX_ENABLE_DBI_LOCKFREE */ +struct libmdbx_globals globals; - rc = osal_fastmutex_acquire(&txn->mt_env->me_dbi_lock); - if (likely(rc == MDBX_SUCCESS)) { - rc = dbi_open_locked(txn, user_flags, dbi, keycmp, datacmp, *name); - ENSURE(txn->mt_env, - osal_fastmutex_release(&txn->mt_env->me_dbi_lock) == MDBX_SUCCESS); - } - return rc; +__cold static void mdbx_init(void) { + globals.runtime_flags = ((MDBX_DEBUG) > 0) * MDBX_DBG_ASSERT + + ((MDBX_DEBUG) > 1) * MDBX_DBG_AUDIT; + globals.loglevel = MDBX_LOG_FATAL; + ENSURE(nullptr, osal_fastmutex_init(&globals.debug_lock) == 0); + osal_ctor(); + assert(globals.sys_pagesize > 0 && + (globals.sys_pagesize & (globals.sys_pagesize - 1)) == 0); + rthc_ctor(); +#if MDBX_DEBUG + ENSURE(nullptr, troika_verify_fsm()); + ENSURE(nullptr, pv2pages_verify()); +#endif /* MDBX_DEBUG*/ } -static int 
dbi_open_cstr(MDBX_txn *txn, const char *name_cstr, - MDBX_db_flags_t flags, MDBX_dbi *dbi, - MDBX_cmp_func *keycmp, MDBX_cmp_func *datacmp) { - MDBX_val thunk, *name; - if (name_cstr == MDBX_CHK_MAIN || name_cstr == MDBX_CHK_GC || - name_cstr == MDBX_CHK_META) - name = (void *)name_cstr; - else { - thunk.iov_len = strlen(name_cstr); - thunk.iov_base = (void *)name_cstr; - name = &thunk; - } - return dbi_open(txn, name, flags, dbi, keycmp, datacmp); +MDBX_EXCLUDE_FOR_GPROF +__cold static void mdbx_fini(void) { + const uint32_t current_pid = osal_getpid(); + TRACE(">> pid %d", current_pid); + rthc_dtor(current_pid); + osal_dtor(); + TRACE("<< pid %d\n", current_pid); + ENSURE(nullptr, osal_fastmutex_destroy(&globals.debug_lock) == 0); } -int mdbx_dbi_open(MDBX_txn *txn, const char *name, MDBX_db_flags_t flags, - MDBX_dbi *dbi) { - return dbi_open_cstr(txn, name, flags, dbi, nullptr, nullptr); -} - -int mdbx_dbi_open2(MDBX_txn *txn, const MDBX_val *name, MDBX_db_flags_t flags, - MDBX_dbi *dbi) { - return dbi_open(txn, name, flags, dbi, nullptr, nullptr); -} - -int mdbx_dbi_open_ex(MDBX_txn *txn, const char *name, MDBX_db_flags_t flags, - MDBX_dbi *dbi, MDBX_cmp_func *keycmp, - MDBX_cmp_func *datacmp) { - return dbi_open_cstr(txn, name, flags, dbi, keycmp, datacmp); -} - -int mdbx_dbi_open_ex2(MDBX_txn *txn, const MDBX_val *name, - MDBX_db_flags_t flags, MDBX_dbi *dbi, - MDBX_cmp_func *keycmp, MDBX_cmp_func *datacmp) { - return dbi_open(txn, name, flags, dbi, keycmp, datacmp); -} - -__cold int mdbx_dbi_rename(MDBX_txn *txn, MDBX_dbi dbi, const char *name_cstr) { - MDBX_val thunk, *name; - if (name_cstr == MDBX_CHK_MAIN || name_cstr == MDBX_CHK_GC || - name_cstr == MDBX_CHK_META) - name = (void *)name_cstr; - else { - thunk.iov_len = strlen(name_cstr); - thunk.iov_base = (void *)name_cstr; - name = &thunk; - } - return mdbx_dbi_rename2(txn, dbi, name); -} - -struct dbi_rename_result { - struct mdbx_defer_free_item *defer; - int err; -}; - -__cold static struct 
dbi_rename_result -dbi_rename_locked(MDBX_txn *txn, MDBX_dbi dbi, MDBX_val new_name) { - struct dbi_rename_result pair; - pair.defer = nullptr; - pair.err = dbi_check(txn, dbi); - if (unlikely(pair.err != MDBX_SUCCESS)) - return pair; - - MDBX_env *const env = txn->mt_env; - MDBX_val old_name = env->me_dbxs[dbi].md_name; - if (env->me_dbxs[MAIN_DBI].md_cmp(&new_name, &old_name) == 0 && - MDBX_DEBUG == 0) - return pair; - - MDBX_cursor_couple cx; - pair.err = cursor_init(&cx.outer, txn, MAIN_DBI); - if (unlikely(pair.err != MDBX_SUCCESS)) - return pair; - pair.err = cursor_set(&cx.outer, &new_name, nullptr, MDBX_SET).err; - if (unlikely(pair.err != MDBX_NOTFOUND)) { - pair.err = (pair.err == MDBX_SUCCESS) ? MDBX_KEYEXIST : pair.err; - return pair; - } - - pair.defer = osal_malloc(dbi_namelen(new_name)); - if (unlikely(!pair.defer)) { - pair.err = MDBX_ENOMEM; - return pair; - } - new_name.iov_base = memcpy(pair.defer, new_name.iov_base, new_name.iov_len); - - cx.outer.mc_next = txn->mt_cursors[MAIN_DBI]; - txn->mt_cursors[MAIN_DBI] = &cx.outer; - - MDBX_val data = {&txn->mt_dbs[dbi], sizeof(MDBX_db)}; - pair.err = cursor_put_checklen(&cx.outer, &new_name, &data, - F_SUBDATA | MDBX_NOOVERWRITE); - if (likely(pair.err == MDBX_SUCCESS)) { - pair.err = cursor_set(&cx.outer, &old_name, nullptr, MDBX_SET).err; - if (likely(pair.err == MDBX_SUCCESS)) - pair.err = cursor_del(&cx.outer, F_SUBDATA); - if (likely(pair.err == MDBX_SUCCESS)) { - pair.defer = env->me_dbxs[dbi].md_name.iov_base; - env->me_dbxs[dbi].md_name = new_name; - } else - txn->mt_flags |= MDBX_TXN_ERROR; - } - - txn->mt_cursors[MAIN_DBI] = cx.outer.mc_next; - return pair; -} - -__cold int mdbx_dbi_rename2(MDBX_txn *txn, MDBX_dbi dbi, - const MDBX_val *new_name) { - int rc = check_txn_rw(txn, MDBX_TXN_BLOCKED); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; - - if (unlikely(new_name == MDBX_CHK_MAIN || - new_name->iov_base == MDBX_CHK_MAIN || new_name == MDBX_CHK_GC || - new_name->iov_base == MDBX_CHK_GC 
|| new_name == MDBX_CHK_META || - new_name->iov_base == MDBX_CHK_META)) - return MDBX_EINVAL; - - if (unlikely(dbi < CORE_DBS)) - return MDBX_EINVAL; - rc = dbi_check(txn, dbi); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; - - rc = osal_fastmutex_acquire(&txn->mt_env->me_dbi_lock); - if (likely(rc == MDBX_SUCCESS)) { - struct dbi_rename_result pair = dbi_rename_locked(txn, dbi, *new_name); - if (pair.defer) - pair.defer->next = nullptr; - env_defer_free_and_release(txn->mt_env, pair.defer); - rc = pair.err; - } - return rc; -} - -__cold int mdbx_dbi_stat(const MDBX_txn *txn, MDBX_dbi dbi, MDBX_stat *dest, - size_t bytes) { - int rc = check_txn(txn, MDBX_TXN_BLOCKED); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; - - if (unlikely(!dest)) - return MDBX_EINVAL; - - rc = dbi_check(txn, dbi); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; - - const size_t size_before_modtxnid = offsetof(MDBX_stat, ms_mod_txnid); - if (unlikely(bytes != sizeof(MDBX_stat)) && bytes != size_before_modtxnid) - return MDBX_EINVAL; - - if (unlikely(txn->mt_flags & MDBX_TXN_BLOCKED)) - return MDBX_BAD_TXN; - - if (unlikely(txn->mt_dbi_state[dbi] & DBI_STALE)) { - rc = fetch_sdb((MDBX_txn *)txn, dbi); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; - } - - dest->ms_psize = txn->mt_env->me_psize; - stat_get(&txn->mt_dbs[dbi], dest, bytes); - return MDBX_SUCCESS; -} +/******************************************************************************/ -static struct mdbx_defer_free_item *dbi_close_locked(MDBX_env *env, - MDBX_dbi dbi) { - eASSERT(env, dbi >= CORE_DBS); - if (unlikely(dbi >= env->me_numdbs)) - return nullptr; - const uint32_t seq = dbi_seq_next(env, dbi); - struct mdbx_defer_free_item *defer_item = env->me_dbxs[dbi].md_name.iov_base; - if (likely(defer_item)) { - env->me_db_flags[dbi] = 0; - env->me_dbxs[dbi].md_name.iov_len = 0; - env->me_dbxs[dbi].md_name.iov_base = nullptr; - atomic_store32(&env->me_dbi_seqs[dbi], seq, mo_AcquireRelease); - 
osal_flush_incoherent_cpu_writeback(); - defer_item->next = nullptr; +__dll_export +#ifdef __attribute_used__ + __attribute_used__ +#elif defined(__GNUC__) || __has_attribute(__used__) + __attribute__((__used__)) +#endif +#ifdef __attribute_externally_visible__ + __attribute_externally_visible__ +#elif (defined(__GNUC__) && !defined(__clang__)) || \ + __has_attribute(__externally_visible__) + __attribute__((__externally_visible__)) +#endif + const struct MDBX_build_info mdbx_build = { +#ifdef MDBX_BUILD_TIMESTAMP + MDBX_BUILD_TIMESTAMP +#else + "\"" __DATE__ " " __TIME__ "\"" +#endif /* MDBX_BUILD_TIMESTAMP */ - if (env->me_numdbs == dbi + 1) { - size_t i = env->me_numdbs; - do { - --i; - eASSERT(env, i >= CORE_DBS); - eASSERT(env, !env->me_db_flags[i] && !env->me_dbxs[i].md_name.iov_len && - !env->me_dbxs[i].md_name.iov_base); - } while (i > CORE_DBS && !env->me_dbxs[i - 1].md_name.iov_base); - env->me_numdbs = (unsigned)i; - } - } + , +#ifdef MDBX_BUILD_TARGET + MDBX_BUILD_TARGET +#else + #if defined(__ANDROID_API__) + "Android" MDBX_STRINGIFY(__ANDROID_API__) + #elif defined(__linux__) || defined(__gnu_linux__) + "Linux" + #elif defined(EMSCRIPTEN) || defined(__EMSCRIPTEN__) + "webassembly" + #elif defined(__CYGWIN__) + "CYGWIN" + #elif defined(_WIN64) || defined(_WIN32) || defined(__TOS_WIN__) \ + || defined(__WINDOWS__) + "Windows" + #elif defined(__APPLE__) + #if (defined(TARGET_OS_IPHONE) && TARGET_OS_IPHONE) \ + || (defined(TARGET_IPHONE_SIMULATOR) && TARGET_IPHONE_SIMULATOR) + "iOS" + #else + "MacOS" + #endif + #elif defined(__FreeBSD__) + "FreeBSD" + #elif defined(__DragonFly__) + "DragonFlyBSD" + #elif defined(__NetBSD__) + "NetBSD" + #elif defined(__OpenBSD__) + "OpenBSD" + #elif defined(__bsdi__) + "UnixBSDI" + #elif defined(__MACH__) + "MACH" + #elif (defined(_HPUX_SOURCE) || defined(__hpux) || defined(__HP_aCC)) + "HPUX" + #elif defined(_AIX) + "AIX" + #elif defined(__sun) && defined(__SVR4) + "Solaris" + #elif defined(__BSD__) || defined(BSD) + 
"UnixBSD" + #elif defined(__unix__) || defined(UNIX) || defined(__unix) \ + || defined(__UNIX) || defined(__UNIX__) + "UNIX" + #elif defined(_POSIX_VERSION) + "POSIX" MDBX_STRINGIFY(_POSIX_VERSION) + #else + "UnknownOS" + #endif /* Target OS */ - return defer_item; -} + "-" -int mdbx_dbi_close(MDBX_env *env, MDBX_dbi dbi) { - int rc = check_env(env, true); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; + #if defined(__amd64__) + "AMD64" + #elif defined(__ia32__) + "IA32" + #elif defined(__e2k__) || defined(__elbrus__) + "Elbrus" + #elif defined(__alpha__) || defined(__alpha) || defined(_M_ALPHA) + "Alpha" + #elif defined(__aarch64__) || defined(_M_ARM64) + "ARM64" + #elif defined(__arm__) || defined(__thumb__) || defined(__TARGET_ARCH_ARM) \ + || defined(__TARGET_ARCH_THUMB) || defined(_ARM) || defined(_M_ARM) \ + || defined(_M_ARMT) || defined(__arm) + "ARM" + #elif defined(__mips64) || defined(__mips64__) || (defined(__mips) && (__mips >= 64)) + "MIPS64" + #elif defined(__mips__) || defined(__mips) || defined(_R4000) || defined(__MIPS__) + "MIPS" + #elif defined(__hppa64__) || defined(__HPPA64__) || defined(__hppa64) + "PARISC64" + #elif defined(__hppa__) || defined(__HPPA__) || defined(__hppa) + "PARISC" + #elif defined(__ia64__) || defined(__ia64) || defined(_IA64) \ + || defined(__IA64__) || defined(_M_IA64) || defined(__itanium__) + "Itanium" + #elif defined(__powerpc64__) || defined(__ppc64__) || defined(__ppc64) \ + || defined(__powerpc64) || defined(_ARCH_PPC64) + "PowerPC64" + #elif defined(__powerpc__) || defined(__ppc__) || defined(__powerpc) \ + || defined(__ppc) || defined(_ARCH_PPC) || defined(__PPC__) || defined(__POWERPC__) + "PowerPC" + #elif defined(__sparc64__) || defined(__sparc64) + "SPARC64" + #elif defined(__sparc__) || defined(__sparc) + "SPARC" + #elif defined(__s390__) || defined(__s390) || defined(__zarch__) || defined(__zarch) + "S390" + #else + "UnknownARCH" + #endif +#endif /* MDBX_BUILD_TARGET */ - if (unlikely(dbi < CORE_DBS)) - 
return (dbi == MAIN_DBI) ? MDBX_SUCCESS : MDBX_BAD_DBI; - - if (unlikely(dbi >= env->me_maxdbs)) - return MDBX_BAD_DBI; - - if (unlikely(dbi < CORE_DBS || dbi >= env->me_maxdbs)) - return MDBX_BAD_DBI; +#ifdef MDBX_BUILD_TYPE +# if defined(_MSC_VER) +# pragma message("Configuration-depended MDBX_BUILD_TYPE: " MDBX_BUILD_TYPE) +# endif + "-" MDBX_BUILD_TYPE +#endif /* MDBX_BUILD_TYPE */ + , + "MDBX_DEBUG=" MDBX_STRINGIFY(MDBX_DEBUG) +#ifdef ENABLE_GPROF + " ENABLE_GPROF" +#endif /* ENABLE_GPROF */ + " MDBX_WORDBITS=" MDBX_STRINGIFY(MDBX_WORDBITS) + " BYTE_ORDER=" +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + "LITTLE_ENDIAN" +#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ + "BIG_ENDIAN" +#else + #error "FIXME: Unsupported byte order" +#endif /* __BYTE_ORDER__ */ + " MDBX_ENABLE_BIGFOOT=" MDBX_STRINGIFY(MDBX_ENABLE_BIGFOOT) + " MDBX_ENV_CHECKPID=" MDBX_ENV_CHECKPID_CONFIG + " MDBX_TXN_CHECKOWNER=" MDBX_TXN_CHECKOWNER_CONFIG + " MDBX_64BIT_ATOMIC=" MDBX_64BIT_ATOMIC_CONFIG + " MDBX_64BIT_CAS=" MDBX_64BIT_CAS_CONFIG + " MDBX_TRUST_RTC=" MDBX_TRUST_RTC_CONFIG + " MDBX_AVOID_MSYNC=" MDBX_STRINGIFY(MDBX_AVOID_MSYNC) + " MDBX_ENABLE_REFUND=" MDBX_STRINGIFY(MDBX_ENABLE_REFUND) + " MDBX_ENABLE_MADVISE=" MDBX_STRINGIFY(MDBX_ENABLE_MADVISE) + " MDBX_ENABLE_MINCORE=" MDBX_STRINGIFY(MDBX_ENABLE_MINCORE) + " MDBX_ENABLE_PGOP_STAT=" MDBX_STRINGIFY(MDBX_ENABLE_PGOP_STAT) + " MDBX_ENABLE_PROFGC=" MDBX_STRINGIFY(MDBX_ENABLE_PROFGC) +#if MDBX_DISABLE_VALIDATION + " MDBX_DISABLE_VALIDATION=YES" +#endif /* MDBX_DISABLE_VALIDATION */ +#ifdef __SANITIZE_ADDRESS__ + " SANITIZE_ADDRESS=YES" +#endif /* __SANITIZE_ADDRESS__ */ +#ifdef ENABLE_MEMCHECK + " ENABLE_MEMCHECK=YES" +#endif /* ENABLE_MEMCHECK */ +#if MDBX_FORCE_ASSERTIONS + " MDBX_FORCE_ASSERTIONS=YES" +#endif /* MDBX_FORCE_ASSERTIONS */ +#ifdef _GNU_SOURCE + " _GNU_SOURCE=YES" +#else + " _GNU_SOURCE=NO" +#endif /* _GNU_SOURCE */ +#ifdef __APPLE__ + " MDBX_OSX_SPEED_INSTEADOF_DURABILITY=" 
MDBX_STRINGIFY(MDBX_OSX_SPEED_INSTEADOF_DURABILITY) +#endif /* MacOS */ +#if defined(_WIN32) || defined(_WIN64) + " MDBX_WITHOUT_MSVC_CRT=" MDBX_STRINGIFY(MDBX_WITHOUT_MSVC_CRT) + " MDBX_BUILD_SHARED_LIBRARY=" MDBX_STRINGIFY(MDBX_BUILD_SHARED_LIBRARY) +#if !MDBX_BUILD_SHARED_LIBRARY + " MDBX_MANUAL_MODULE_HANDLER=" MDBX_STRINGIFY(MDBX_MANUAL_MODULE_HANDLER) +#endif + " WINVER=" MDBX_STRINGIFY(WINVER) +#else /* Windows */ + " MDBX_LOCKING=" MDBX_LOCKING_CONFIG + " MDBX_USE_OFDLOCKS=" MDBX_USE_OFDLOCKS_CONFIG +#endif /* !Windows */ + " MDBX_CACHELINE_SIZE=" MDBX_STRINGIFY(MDBX_CACHELINE_SIZE) + " MDBX_CPU_WRITEBACK_INCOHERENT=" MDBX_STRINGIFY(MDBX_CPU_WRITEBACK_INCOHERENT) + " MDBX_MMAP_INCOHERENT_CPU_CACHE=" MDBX_STRINGIFY(MDBX_MMAP_INCOHERENT_CPU_CACHE) + " MDBX_MMAP_INCOHERENT_FILE_WRITE=" MDBX_STRINGIFY(MDBX_MMAP_INCOHERENT_FILE_WRITE) + " MDBX_UNALIGNED_OK=" MDBX_STRINGIFY(MDBX_UNALIGNED_OK) + " MDBX_PNL_ASCENDING=" MDBX_STRINGIFY(MDBX_PNL_ASCENDING) + , +#ifdef MDBX_BUILD_COMPILER + MDBX_BUILD_COMPILER +#else + #ifdef __INTEL_COMPILER + "Intel C/C++ " MDBX_STRINGIFY(__INTEL_COMPILER) + #elif defined(__apple_build_version__) + "Apple clang " MDBX_STRINGIFY(__apple_build_version__) + #elif defined(__ibmxl__) + "IBM clang C " MDBX_STRINGIFY(__ibmxl_version__) "." MDBX_STRINGIFY(__ibmxl_release__) + "." MDBX_STRINGIFY(__ibmxl_modification__) "." MDBX_STRINGIFY(__ibmxl_ptf_fix_level__) + #elif defined(__clang__) + "clang " MDBX_STRINGIFY(__clang_version__) + #elif defined(__MINGW64__) + "MINGW-64 " MDBX_STRINGIFY(__MINGW64_MAJOR_VERSION) "." MDBX_STRINGIFY(__MINGW64_MINOR_VERSION) + #elif defined(__MINGW32__) + "MINGW-32 " MDBX_STRINGIFY(__MINGW32_MAJOR_VERSION) "." MDBX_STRINGIFY(__MINGW32_MINOR_VERSION) + #elif defined(__MINGW__) + "MINGW " MDBX_STRINGIFY(__MINGW_MAJOR_VERSION) "." 
MDBX_STRINGIFY(__MINGW_MINOR_VERSION) + #elif defined(__IBMC__) + "IBM C " MDBX_STRINGIFY(__IBMC__) + #elif defined(__GNUC__) + "GNU C/C++ " + #ifdef __VERSION__ + __VERSION__ + #else + MDBX_STRINGIFY(__GNUC__) "." MDBX_STRINGIFY(__GNUC_MINOR__) "." MDBX_STRINGIFY(__GNUC_PATCHLEVEL__) + #endif + #elif defined(_MSC_VER) + "MSVC " MDBX_STRINGIFY(_MSC_FULL_VER) "-" MDBX_STRINGIFY(_MSC_BUILD) + #else + "Unknown compiler" + #endif +#endif /* MDBX_BUILD_COMPILER */ + , +#ifdef MDBX_BUILD_FLAGS_CONFIG + MDBX_BUILD_FLAGS_CONFIG +#endif /* MDBX_BUILD_FLAGS_CONFIG */ +#ifdef MDBX_BUILD_FLAGS + MDBX_BUILD_FLAGS +#endif /* MDBX_BUILD_FLAGS */ +#if !(defined(MDBX_BUILD_FLAGS_CONFIG) || defined(MDBX_BUILD_FLAGS)) + "undefined (please use correct build script)" +#ifdef _MSC_VER +#pragma message("warning: Build flags undefined. Please use correct build script") +#else +#warning "Build flags undefined. Please use correct build script" +#endif // _MSC_VER +#endif +}; - rc = osal_fastmutex_acquire(&env->me_dbi_lock); - if (likely(rc == MDBX_SUCCESS)) - rc = env_defer_free_and_release(env, dbi_close_locked(env, dbi)); - return rc; +#ifdef __SANITIZE_ADDRESS__ +#if !defined(_MSC_VER) || __has_attribute(weak) +LIBMDBX_API __attribute__((__weak__)) +#endif +const char *__asan_default_options(void) { + return "symbolize=1:allow_addr2line=1:" +#if MDBX_DEBUG + "debug=1:" + "verbosity=2:" +#endif /* MDBX_DEBUG */ + "log_threads=1:" + "report_globals=1:" + "replace_str=1:replace_intrin=1:" + "malloc_context_size=9:" +#if !defined(__APPLE__) + "detect_leaks=1:" +#endif + "check_printf=1:" + "detect_deadlocks=1:" +#ifndef LTO_ENABLED + "check_initialization_order=1:" +#endif + "detect_stack_use_after_return=1:" + "intercept_tls_get_addr=1:" + "decorate_proc_maps=1:" + "abort_on_error=1"; } +#endif /* __SANITIZE_ADDRESS__ */ -int mdbx_dbi_flags_ex(const MDBX_txn *txn, MDBX_dbi dbi, unsigned *flags, - unsigned *state) { - int rc = check_txn(txn, MDBX_TXN_BLOCKED - MDBX_TXN_ERROR); - if 
(unlikely(rc != MDBX_SUCCESS)) - return rc; +/// \copyright SPDX-License-Identifier: Apache-2.0 +/// \author Леонид Юрьев aka Leonid Yuriev \date 2015-2024 - if (unlikely(!flags || !state)) - return MDBX_EINVAL; +#if !(defined(_WIN32) || defined(_WIN64)) +/*----------------------------------------------------------------------------* + * POSIX/non-Windows LCK-implementation */ - rc = dbi_check(txn, dbi); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; - *flags = txn->mt_dbs[dbi].md_flags & DB_PERSISTENT_FLAGS; - *state = - txn->mt_dbi_state[dbi] & (DBI_FRESH | DBI_CREAT | DBI_DIRTY | DBI_STALE); +#if MDBX_LOCKING == MDBX_LOCKING_SYSV +#include +#endif /* MDBX_LOCKING == MDBX_LOCKING_SYSV */ - return MDBX_SUCCESS; +/* Описание реализации блокировок для POSIX & Linux: + * + * lck-файл отображается в память, в нём организуется таблица читателей и + * размещаются совместно используемые posix-мьютексы (futex). Посредством + * этих мьютексов (см struct lck_t) реализуются: + * - Блокировка таблицы читателей для регистрации, + * т.е. функции lck_rdt_lock() и lck_rdt_unlock(). + * - Блокировка БД для пишущих транзакций, + * т.е. функции lck_txn_lock() и lck_txn_unlock(). + * + * Остальной функционал реализуется отдельно посредством файловых блокировок: + * - Первоначальный захват БД в режиме exclusive/shared и последующий перевод + * в операционный режим, функции lck_seize() и lck_downgrade(). + * - Проверка присутствие процессов-читателей, + * т.е. функции lck_rpid_set(), lck_rpid_clear() и lck_rpid_check(). + * + * Для блокировки файлов используется fcntl(F_SETLK), так как: + * - lockf() оперирует только эксклюзивной блокировкой и требует + * открытия файла в RW-режиме. + * - flock() не гарантирует атомарности при смене блокировок + * и оперирует только всем файлом целиком. + * - Для контроля процессов-читателей используются однобайтовые + * range-блокировки lck-файла посредством fcntl(F_SETLK). При этом + * в качестве позиции используется pid процесса-читателя. 
+ * - Для первоначального захвата и shared/exclusive выполняется блокировка + * основного файла БД и при успехе lck-файла. + * + * ---------------------------------------------------------------------------- + * УДЕРЖИВАЕМЫЕ БЛОКИРОВКИ В ЗАВИСИМОСТИ ОТ РЕЖИМА И СОСТОЯНИЯ + * + * Эксклюзивный режим без lck-файла: + * = заблокирован весь dxb-файл посредством F_RDLCK или F_WRLCK, + * в зависимости от MDBX_RDONLY. + * + * Не-операционный режим на время пере-инициализации и разрушении lck-файла: + * = F_WRLCK блокировка первого байта lck-файла, другие процессы ждут её + * снятия при получении F_RDLCK через F_SETLKW. + * - блокировки dxb-файла могут меняться до снятие эксклюзивной блокировки + * lck-файла: + * + для НЕ-эксклюзивного режима блокировка pid-байта в dxb-файле + * посредством F_RDLCK или F_WRLCK, в зависимости от MDBX_RDONLY. + * + для ЭКСКЛЮЗИВНОГО режима блокировка всего dxb-файла + * посредством F_RDLCK или F_WRLCK, в зависимости от MDBX_RDONLY. + * + * ОПЕРАЦИОННЫЙ режим с lck-файлом: + * = F_RDLCK блокировка первого байта lck-файла, другие процессы не могут + * получить F_WRLCK и таким образом видят что БД используется. + * + F_WRLCK блокировка pid-байта в clk-файле после первой транзакции чтения. + * + для НЕ-эксклюзивного режима блокировка pid-байта в dxb-файле + * посредством F_RDLCK или F_WRLCK, в зависимости от MDBX_RDONLY. + * + для ЭКСКЛЮЗИВНОГО режима блокировка pid-байта всего dxb-файла + * посредством F_RDLCK или F_WRLCK, в зависимости от MDBX_RDONLY. + */ + +#if MDBX_USE_OFDLOCKS +static int op_setlk, op_setlkw, op_getlk; +__cold static void choice_fcntl(void) { + assert(!op_setlk && !op_setlkw && !op_getlk); + if ((globals.runtime_flags & MDBX_DBG_LEGACY_MULTIOPEN) == 0 +#if defined(__linux__) || defined(__gnu_linux__) + && globals.linux_kernel_version > + 0x030f0000 /* OFD locks are available since 3.15, but engages here + only for 3.16 and later kernels (i.e. 
LTS) because + of reliability reasons */ +#endif /* linux */ + ) { + op_setlk = MDBX_F_OFD_SETLK; + op_setlkw = MDBX_F_OFD_SETLKW; + op_getlk = MDBX_F_OFD_GETLK; + return; + } + op_setlk = MDBX_F_SETLK; + op_setlkw = MDBX_F_SETLKW; + op_getlk = MDBX_F_GETLK; } +#else +#define op_setlk MDBX_F_SETLK +#define op_setlkw MDBX_F_SETLKW +#define op_getlk MDBX_F_GETLK +#endif /* MDBX_USE_OFDLOCKS */ -static int drop_tree(MDBX_cursor *mc, const bool may_have_subDBs) { - int rc = page_search(mc, NULL, MDBX_PS_FIRST); - if (likely(rc == MDBX_SUCCESS)) { - MDBX_txn *txn = mc->mc_txn; +static int lck_op(const mdbx_filehandle_t fd, int cmd, const int lck, + const off_t offset, off_t len) { + STATIC_ASSERT(sizeof(off_t) >= sizeof(void *) && + sizeof(off_t) >= sizeof(size_t)); +#ifdef __ANDROID_API__ + STATIC_ASSERT_MSG((sizeof(off_t) * 8 == MDBX_WORDBITS), + "The bitness of system `off_t` type is mismatch. Please " + "fix build and/or NDK configuration."); +#endif /* Android */ + assert(offset >= 0 && len > 0); + assert((uint64_t)offset < (uint64_t)INT64_MAX && + (uint64_t)len < (uint64_t)INT64_MAX && + (uint64_t)(offset + len) > (uint64_t)offset); - /* DUPSORT sub-DBs have no ovpages/DBs. Omit scanning leaves. - * This also avoids any P_LEAF2 pages, which have no nodes. - * Also if the DB doesn't have sub-DBs and has no large/overflow - * pages, omit scanning leaves. 
*/ - if (!(may_have_subDBs | mc->mc_db->md_overflow_pages)) - cursor_pop(mc); + assert((uint64_t)offset < (uint64_t)OFF_T_MAX && + (uint64_t)len <= (uint64_t)OFF_T_MAX && + (uint64_t)(offset + len) <= (uint64_t)OFF_T_MAX); - rc = pnl_need(&txn->tw.retired_pages, - (size_t)mc->mc_db->md_branch_pages + - (size_t)mc->mc_db->md_leaf_pages + - (size_t)mc->mc_db->md_overflow_pages); - if (unlikely(rc != MDBX_SUCCESS)) - goto bailout; + assert((uint64_t)((off_t)((uint64_t)offset + (uint64_t)len)) == + ((uint64_t)offset + (uint64_t)len)); - MDBX_cursor mx; - cursor_copy(mc, &mx); - while (mc->mc_snum > 0) { - MDBX_page *const mp = mc->mc_pg[mc->mc_top]; - const size_t nkeys = page_numkeys(mp); - if (IS_LEAF(mp)) { - cASSERT(mc, mc->mc_snum == mc->mc_db->md_depth); - for (size_t i = 0; i < nkeys; i++) { - MDBX_node *node = page_node(mp, i); - if (node_flags(node) & F_BIGDATA) { - rc = page_retire_ex(mc, node_largedata_pgno(node), nullptr, 0); - if (unlikely(rc != MDBX_SUCCESS)) - goto bailout; - if (!(may_have_subDBs | mc->mc_db->md_overflow_pages)) - goto pop; - } else if (node_flags(node) & F_SUBDATA) { - if (unlikely((node_flags(node) & F_DUPDATA) == 0)) { - rc = /* disallowing implicit subDB deletion */ MDBX_INCOMPATIBLE; - goto bailout; - } - rc = cursor_xinit1(mc, node, mp); - if (unlikely(rc != MDBX_SUCCESS)) - goto bailout; - rc = drop_tree(&mc->mc_xcursor->mx_cursor, false); - if (unlikely(rc != MDBX_SUCCESS)) - goto bailout; - } - } - } else { - cASSERT(mc, mc->mc_snum < mc->mc_db->md_depth); - mc->mc_checking |= CC_RETIRING; - const unsigned pagetype = (IS_FROZEN(txn, mp) ? P_FROZEN : 0) + - ((mc->mc_snum + 1 == mc->mc_db->md_depth) - ? 
(mc->mc_checking & (P_LEAF | P_LEAF2)) - : P_BRANCH); - for (size_t i = 0; i < nkeys; i++) { - MDBX_node *node = page_node(mp, i); - tASSERT(txn, (node_flags(node) & - (F_BIGDATA | F_SUBDATA | F_DUPDATA)) == 0); - const pgno_t pgno = node_pgno(node); - rc = page_retire_ex(mc, pgno, nullptr, pagetype); - if (unlikely(rc != MDBX_SUCCESS)) - goto bailout; - } - mc->mc_checking -= CC_RETIRING; - } - if (!mc->mc_top) - break; - cASSERT(mc, nkeys > 0); - mc->mc_ki[mc->mc_top] = (indx_t)nkeys; - rc = cursor_sibling(mc, SIBLING_RIGHT); - if (unlikely(rc != MDBX_SUCCESS)) { - if (unlikely(rc != MDBX_NOTFOUND)) - goto bailout; - /* no more siblings, go back to beginning - * of previous level. */ - pop: - cursor_pop(mc); - mc->mc_ki[0] = 0; - for (size_t i = 1; i < mc->mc_snum; i++) { - mc->mc_ki[i] = 0; - mc->mc_pg[i] = mx.mc_pg[i]; - } + jitter4testing(true); + for (;;) { + MDBX_STRUCT_FLOCK lock_op; + STATIC_ASSERT_MSG(sizeof(off_t) <= sizeof(lock_op.l_start) && + sizeof(off_t) <= sizeof(lock_op.l_len) && + OFF_T_MAX == (off_t)OFF_T_MAX, + "Support for large/64-bit-sized files is misconfigured " + "for the target system and/or toolchain. " + "Please fix it or at least disable it completely."); + memset(&lock_op, 0, sizeof(lock_op)); + lock_op.l_type = lck; + lock_op.l_whence = SEEK_SET; + lock_op.l_start = offset; + lock_op.l_len = len; + int rc = MDBX_FCNTL(fd, cmd, &lock_op); + jitter4testing(true); + if (rc != -1) { + if (cmd == op_getlk) { + /* Checks reader by pid. Returns: + * MDBX_RESULT_TRUE - if pid is live (reader holds a lock). + * MDBX_RESULT_FALSE - if pid is dead (a lock could be placed). */ + return (lock_op.l_type == F_UNLCK) ? 
MDBX_RESULT_FALSE + : MDBX_RESULT_TRUE; } + return MDBX_SUCCESS; + } + rc = errno; +#if MDBX_USE_OFDLOCKS + if (rc == EINVAL && (cmd == MDBX_F_OFD_SETLK || cmd == MDBX_F_OFD_SETLKW || + cmd == MDBX_F_OFD_GETLK)) { + /* fallback to non-OFD locks */ + if (cmd == MDBX_F_OFD_SETLK) + cmd = MDBX_F_SETLK; + else if (cmd == MDBX_F_OFD_SETLKW) + cmd = MDBX_F_SETLKW; + else + cmd = MDBX_F_GETLK; + op_setlk = MDBX_F_SETLK; + op_setlkw = MDBX_F_SETLKW; + op_getlk = MDBX_F_GETLK; + continue; + } +#endif /* MDBX_USE_OFDLOCKS */ + if (rc != EINTR || cmd == op_setlkw) { + assert(MDBX_IS_ERROR(rc)); + return rc; } - rc = page_retire(mc, mc->mc_pg[0]); - bailout: - if (unlikely(rc != MDBX_SUCCESS)) - txn->mt_flags |= MDBX_TXN_ERROR; - } else if (rc == MDBX_NOTFOUND) { - rc = MDBX_SUCCESS; } - mc->mc_flags &= ~C_INITIALIZED; - return rc; } -__cold int mdbx_drop(MDBX_txn *txn, MDBX_dbi dbi, bool del) { - int rc = check_txn_rw(txn, MDBX_TXN_BLOCKED); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; - - MDBX_cursor *mc; - rc = mdbx_cursor_open(txn, dbi, &mc); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; - - rc = drop_tree(mc, - dbi == MAIN_DBI || (mc->mc_db->md_flags & MDBX_DUPSORT) != 0); - /* Invalidate the dropped DB's cursors */ - for (MDBX_cursor *m2 = txn->mt_cursors[dbi]; m2; m2 = m2->mc_next) - m2->mc_flags &= ~(C_INITIALIZED | C_EOF); - if (unlikely(rc)) - goto bailout; +MDBX_INTERNAL int osal_lockfile(mdbx_filehandle_t fd, bool wait) { +#if MDBX_USE_OFDLOCKS + if (unlikely(op_setlk == 0)) + choice_fcntl(); +#endif /* MDBX_USE_OFDLOCKS */ + return lck_op(fd, wait ? 
op_setlkw : op_setlk, F_WRLCK, 0, OFF_T_MAX); +} - /* Can't delete the main DB */ - if (del && dbi >= CORE_DBS) { - rc = delete(txn, MAIN_DBI, &mc->mc_dbx->md_name, NULL, F_SUBDATA); - if (likely(rc == MDBX_SUCCESS)) { - tASSERT(txn, txn->mt_dbi_state[MAIN_DBI] & DBI_DIRTY); - tASSERT(txn, txn->mt_flags & MDBX_TXN_DIRTY); - txn->mt_dbi_state[dbi] = DBI_LINDO | DBI_OLDEN; - MDBX_env *const env = txn->mt_env; - rc = osal_fastmutex_acquire(&env->me_dbi_lock); - if (likely(rc == MDBX_SUCCESS)) { - rc = env_defer_free_and_release(env, dbi_close_locked(env, dbi)); - goto bailout; - } - } - txn->mt_flags |= MDBX_TXN_ERROR; - } else { - /* reset the DB record, mark it dirty */ - txn->mt_dbi_state[dbi] |= DBI_DIRTY; - txn->mt_dbs[dbi].md_depth = 0; - txn->mt_dbs[dbi].md_branch_pages = 0; - txn->mt_dbs[dbi].md_leaf_pages = 0; - txn->mt_dbs[dbi].md_overflow_pages = 0; - txn->mt_dbs[dbi].md_entries = 0; - txn->mt_dbs[dbi].md_root = P_INVALID; - txn->mt_dbs[dbi].md_seq = 0; - txn->mt_flags |= MDBX_TXN_DIRTY; - } +MDBX_INTERNAL int lck_rpid_set(MDBX_env *env) { + assert(env->lck_mmap.fd != INVALID_HANDLE_VALUE); + assert(env->pid > 0); + if (unlikely(osal_getpid() != env->pid)) + return MDBX_PANIC; + return lck_op(env->lck_mmap.fd, op_setlk, F_WRLCK, env->pid, 1); +} -bailout: - mdbx_cursor_close(mc); - return rc; +MDBX_INTERNAL int lck_rpid_clear(MDBX_env *env) { + assert(env->lck_mmap.fd != INVALID_HANDLE_VALUE); + assert(env->pid > 0); + return lck_op(env->lck_mmap.fd, op_setlk, F_UNLCK, env->pid, 1); } -__cold int mdbx_reader_list(const MDBX_env *env, MDBX_reader_list_func *func, - void *ctx) { - int rc = check_env(env, true); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; +MDBX_INTERNAL int lck_rpid_check(MDBX_env *env, uint32_t pid) { + assert(env->lck_mmap.fd != INVALID_HANDLE_VALUE); + assert(pid > 0); + return lck_op(env->lck_mmap.fd, op_getlk, F_WRLCK, pid, 1); +} - if (unlikely(!func)) - return MDBX_EINVAL; 
+/*---------------------------------------------------------------------------*/ - rc = MDBX_RESULT_TRUE; - int serial = 0; - MDBX_lockinfo *const lck = env->me_lck_mmap.lck; - if (likely(lck)) { - const size_t snap_nreaders = - atomic_load32(&lck->mti_numreaders, mo_AcquireRelease); - for (size_t i = 0; i < snap_nreaders; i++) { - const MDBX_reader *r = lck->mti_readers + i; - retry_reader:; - const uint32_t pid = atomic_load32(&r->mr_pid, mo_AcquireRelease); - if (!pid) - continue; - txnid_t txnid = safe64_read(&r->mr_txnid); - const uint64_t tid = atomic_load64(&r->mr_tid, mo_Relaxed); - const pgno_t pages_used = - atomic_load32(&r->mr_snapshot_pages_used, mo_Relaxed); - const uint64_t reader_pages_retired = - atomic_load64(&r->mr_snapshot_pages_retired, mo_Relaxed); - if (unlikely( - txnid != safe64_read(&r->mr_txnid) || - pid != atomic_load32(&r->mr_pid, mo_AcquireRelease) || - tid != atomic_load64(&r->mr_tid, mo_Relaxed) || - pages_used != - atomic_load32(&r->mr_snapshot_pages_used, mo_Relaxed) || - reader_pages_retired != - atomic_load64(&r->mr_snapshot_pages_retired, mo_Relaxed))) - goto retry_reader; +#if MDBX_LOCKING > MDBX_LOCKING_SYSV +MDBX_INTERNAL int lck_ipclock_stubinit(osal_ipclock_t *ipc) { +#if MDBX_LOCKING == MDBX_LOCKING_POSIX1988 + return sem_init(ipc, false, 1) ? errno : 0; +#elif MDBX_LOCKING == MDBX_LOCKING_POSIX2001 || \ + MDBX_LOCKING == MDBX_LOCKING_POSIX2008 + return pthread_mutex_init(ipc, nullptr); +#else +#error "FIXME" +#endif +} - eASSERT(env, txnid > 0); - if (txnid >= SAFE64_INVALID_THRESHOLD) - txnid = 0; +MDBX_INTERNAL int lck_ipclock_destroy(osal_ipclock_t *ipc) { +#if MDBX_LOCKING == MDBX_LOCKING_POSIX1988 + return sem_destroy(ipc) ? 
errno : 0; +#elif MDBX_LOCKING == MDBX_LOCKING_POSIX2001 || \ + MDBX_LOCKING == MDBX_LOCKING_POSIX2008 + return pthread_mutex_destroy(ipc); +#else +#error "FIXME" +#endif +} +#endif /* MDBX_LOCKING > MDBX_LOCKING_SYSV */ - size_t bytes_used = 0; - size_t bytes_retained = 0; - uint64_t lag = 0; - if (txnid) { - meta_troika_t troika = meta_tap(env); - retry_header:; - const meta_ptr_t head = meta_recent(env, &troika); - const uint64_t head_pages_retired = - unaligned_peek_u64_volatile(4, head.ptr_v->mm_pages_retired); - if (unlikely(meta_should_retry(env, &troika) || - head_pages_retired != - unaligned_peek_u64_volatile( - 4, head.ptr_v->mm_pages_retired))) - goto retry_header; +static int check_fstat(MDBX_env *env) { + struct stat st; - lag = (head.txnid - txnid) / xMDBX_TXNID_STEP; - bytes_used = pgno2bytes(env, pages_used); - bytes_retained = (head_pages_retired > reader_pages_retired) - ? pgno2bytes(env, (pgno_t)(head_pages_retired - - reader_pages_retired)) - : 0; - } - rc = func(ctx, ++serial, (unsigned)i, pid, (mdbx_tid_t)((intptr_t)tid), - txnid, lag, bytes_used, bytes_retained); - if (unlikely(rc != MDBX_SUCCESS)) - break; - } + int rc = MDBX_SUCCESS; + if (fstat(env->lazy_fd, &st)) { + rc = errno; + ERROR("fstat(%s), err %d", "DXB", rc); + return rc; } - return rc; -} + if (!S_ISREG(st.st_mode) || st.st_nlink < 1) { +#ifdef EBADFD + rc = EBADFD; +#else + rc = EPERM; +#endif + ERROR("%s %s, err %d", "DXB", + (st.st_nlink < 1) ? "file was removed" : "not a regular file", rc); + return rc; + } -/* Insert pid into list if not already present. - * return -1 if already present. 
*/ -__cold static bool pid_insert(uint32_t *ids, uint32_t pid) { - /* binary search of pid in list */ - size_t base = 0; - size_t cursor = 1; - int val = 0; - size_t n = ids[0]; + if (st.st_size < (off_t)(MDBX_MIN_PAGESIZE * NUM_METAS)) { + VERBOSE("dxb-file is too short (%u), exclusive-lock needed", + (unsigned)st.st_size); + rc = MDBX_RESULT_TRUE; + } - while (n > 0) { - size_t pivot = n >> 1; - cursor = base + pivot + 1; - val = pid - ids[cursor]; + //---------------------------------------------------------------------------- - if (val < 0) { - n = pivot; - } else if (val > 0) { - base = cursor; - n -= pivot + 1; - } else { - /* found, so it's a duplicate */ - return false; - } + if (fstat(env->lck_mmap.fd, &st)) { + rc = errno; + ERROR("fstat(%s), err %d", "LCK", rc); + return rc; } - if (val > 0) - ++cursor; + if (!S_ISREG(st.st_mode) || st.st_nlink < 1) { +#ifdef EBADFD + rc = EBADFD; +#else + rc = EPERM; +#endif + ERROR("%s %s, err %d", "LCK", + (st.st_nlink < 1) ? "file was removed" : "not a regular file", rc); + return rc; + } - ids[0]++; - for (n = ids[0]; n > cursor; n--) - ids[n] = ids[n - 1]; - ids[n] = pid; - return true; -} + /* Checking file size for detect the situation when we got the shared lock + * immediately after lck_destroy(). */ + if (st.st_size < (off_t)(sizeof(lck_t) + sizeof(reader_slot_t))) { + VERBOSE("lck-file is too short (%u), exclusive-lock needed", + (unsigned)st.st_size); + rc = MDBX_RESULT_TRUE; + } -__cold int mdbx_reader_check(MDBX_env *env, int *dead) { - if (dead) - *dead = 0; - return cleanup_dead_readers(env, false, dead); + return rc; } -/* Return: - * MDBX_RESULT_TRUE - done and mutex recovered - * MDBX_SUCCESS - done - * Otherwise errcode. 
*/ -__cold MDBX_INTERNAL_FUNC int cleanup_dead_readers(MDBX_env *env, - int rdt_locked, int *dead) { - int rc = check_env(env, true); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; +__cold MDBX_INTERNAL int lck_seize(MDBX_env *env) { + assert(env->lazy_fd != INVALID_HANDLE_VALUE); + if (unlikely(osal_getpid() != env->pid)) + return MDBX_PANIC; - eASSERT(env, rdt_locked >= 0); - MDBX_lockinfo *const lck = env->me_lck_mmap.lck; - if (unlikely(lck == NULL)) { - /* exclusive mode */ - if (dead) - *dead = 0; - return MDBX_SUCCESS; + int rc = MDBX_SUCCESS; +#if defined(__linux__) || defined(__gnu_linux__) + if (unlikely(globals.running_on_WSL1)) { + rc = ENOLCK /* No record locks available */; + ERROR("%s, err %u", + "WSL1 (Windows Subsystem for Linux) is mad and trouble-full, " + "injecting failure to avoid data loss", + rc); + return rc; } +#endif /* Linux */ - const size_t snap_nreaders = - atomic_load32(&lck->mti_numreaders, mo_AcquireRelease); - uint32_t pidsbuf_onstask[142]; - uint32_t *const pids = - (snap_nreaders < ARRAY_LENGTH(pidsbuf_onstask)) - ? pidsbuf_onstask - : osal_malloc((snap_nreaders + 1) * sizeof(uint32_t)); - if (unlikely(!pids)) - return MDBX_ENOMEM; - - pids[0] = 0; - int count = 0; - for (size_t i = 0; i < snap_nreaders; i++) { - const uint32_t pid = - atomic_load32(&lck->mti_readers[i].mr_pid, mo_AcquireRelease); - if (pid == 0) - continue /* skip empty */; - if (pid == env->me_pid) - continue /* skip self */; - if (!pid_insert(pids, pid)) - continue /* such pid already processed */; - - int err = osal_rpid_check(env, pid); - if (err == MDBX_RESULT_TRUE) - continue /* reader is live */; +#if MDBX_USE_OFDLOCKS + if (unlikely(op_setlk == 0)) + choice_fcntl(); +#endif /* MDBX_USE_OFDLOCKS */ - if (err != MDBX_SUCCESS) { - rc = err; - break /* osal_rpid_check() failed */; + if (env->lck_mmap.fd == INVALID_HANDLE_VALUE) { + /* LY: without-lck mode (e.g. 
exclusive or on read-only filesystem) */ + rc = lck_op(env->lazy_fd, op_setlk, + (env->flags & MDBX_RDONLY) ? F_RDLCK : F_WRLCK, 0, OFF_T_MAX); + if (rc != MDBX_SUCCESS) { + ERROR("%s, err %u", "without-lck", rc); + eASSERT(env, MDBX_IS_ERROR(rc)); + return rc; } + return MDBX_RESULT_TRUE /* Done: return with exclusive locking. */; + } +#if defined(_POSIX_PRIORITY_SCHEDULING) && _POSIX_PRIORITY_SCHEDULING > 0 + sched_yield(); +#endif - /* stale reader found */ - if (!rdt_locked) { - err = osal_rdt_lock(env); - if (MDBX_IS_ERROR(err)) { - rc = err; - break; - } +retry: + if (rc == MDBX_RESULT_TRUE) { + rc = lck_op(env->lck_mmap.fd, op_setlk, F_UNLCK, 0, 1); + if (rc != MDBX_SUCCESS) { + ERROR("%s, err %u", "unlock-before-retry", rc); + eASSERT(env, MDBX_IS_ERROR(rc)); + return rc; + } + } - rdt_locked = -1; - if (err == MDBX_RESULT_TRUE) { - /* mutex recovered, the mdbx_ipclock_failed() checked all readers */ - rc = MDBX_RESULT_TRUE; - break; - } + /* Firstly try to get exclusive locking. */ + rc = lck_op(env->lck_mmap.fd, op_setlk, F_WRLCK, 0, 1); + if (rc == MDBX_SUCCESS) { + rc = check_fstat(env); + if (MDBX_IS_ERROR(rc)) + return rc; - /* a other process may have clean and reused slot, recheck */ - if (lck->mti_readers[i].mr_pid.weak != pid) - continue; + continue_dxb_exclusive: + rc = lck_op(env->lazy_fd, op_setlk, + (env->flags & MDBX_RDONLY) ? F_RDLCK : F_WRLCK, 0, OFF_T_MAX); + if (rc == MDBX_SUCCESS) + return MDBX_RESULT_TRUE /* Done: return with exclusive locking. */; - err = osal_rpid_check(env, pid); - if (MDBX_IS_ERROR(err)) { - rc = err; - break; - } + int err = check_fstat(env); + if (MDBX_IS_ERROR(err)) + return err; - if (err != MDBX_SUCCESS) - continue /* the race with other process, slot reused */; + /* the cause may be a collision with POSIX's file-lock recovery. 
*/ + if (!(rc == EAGAIN || rc == EACCES || rc == EBUSY || rc == EWOULDBLOCK || + rc == EDEADLK)) { + ERROR("%s, err %u", "dxb-exclusive", rc); + eASSERT(env, MDBX_IS_ERROR(rc)); + return rc; } - /* clean it */ - for (size_t j = i; j < snap_nreaders; j++) { - if (lck->mti_readers[j].mr_pid.weak == pid) { - DEBUG("clear stale reader pid %" PRIuPTR " txn %" PRIaTXN, (size_t)pid, - lck->mti_readers[j].mr_txnid.weak); - atomic_store32(&lck->mti_readers[j].mr_pid, 0, mo_Relaxed); - atomic_store32(&lck->mti_readers_refresh_flag, true, mo_AcquireRelease); - count++; - } - } + /* Fallback to lck-shared */ + } else if (!(rc == EAGAIN || rc == EACCES || rc == EBUSY || + rc == EWOULDBLOCK || rc == EDEADLK)) { + ERROR("%s, err %u", "try-exclusive", rc); + eASSERT(env, MDBX_IS_ERROR(rc)); + return rc; } - if (likely(!MDBX_IS_ERROR(rc))) - atomic_store64(&lck->mti_reader_check_timestamp, osal_monotime(), - mo_Relaxed); - - if (rdt_locked < 0) - osal_rdt_unlock(env); - - if (pids != pidsbuf_onstask) - osal_free(pids); + /* Here could be one of two: + * - lck_destroy() from the another process was hold the lock + * during a destruction. + * - either lck_seize() from the another process was got the exclusive + * lock and doing initialization. + * For distinguish these cases will use size of the lck-file later. */ - if (dead) - *dead = count; - return rc; -} + /* Wait for lck-shared now. */ + /* Here may be await during transient processes, for instance until another + * competing process doesn't call lck_downgrade(). 
*/ + rc = lck_op(env->lck_mmap.fd, op_setlkw, F_RDLCK, 0, 1); + if (rc != MDBX_SUCCESS) { + ERROR("%s, err %u", "try-shared", rc); + eASSERT(env, MDBX_IS_ERROR(rc)); + return rc; + } -__cold static int setup_debug(MDBX_log_level_t level, MDBX_debug_flags_t flags, - union logger_union logger, char *buffer, - size_t buffer_size) { - ENSURE(nullptr, osal_fastmutex_acquire(&debug_lock) == 0); + rc = check_fstat(env); + if (rc == MDBX_RESULT_TRUE) + goto retry; + if (rc != MDBX_SUCCESS) { + ERROR("%s, err %u", "lck_fstat", rc); + return rc; + } - const int rc = mdbx_static.flags | (mdbx_static.loglevel << 16); - if (level != MDBX_LOG_DONTCHANGE) - mdbx_static.loglevel = (uint8_t)level; + /* got shared, retry exclusive */ + rc = lck_op(env->lck_mmap.fd, op_setlk, F_WRLCK, 0, 1); + if (rc == MDBX_SUCCESS) + goto continue_dxb_exclusive; - if (flags != MDBX_DBG_DONTCHANGE) { - flags &= -#if MDBX_DEBUG - MDBX_DBG_ASSERT | MDBX_DBG_AUDIT | MDBX_DBG_JITTER | -#endif - MDBX_DBG_DUMP | MDBX_DBG_LEGACY_MULTIOPEN | MDBX_DBG_LEGACY_OVERLAP | - MDBX_DBG_DONT_UPGRADE; - mdbx_static.flags = (uint8_t)flags; + if (!(rc == EAGAIN || rc == EACCES || rc == EBUSY || rc == EWOULDBLOCK || + rc == EDEADLK)) { + ERROR("%s, err %u", "try-exclusive", rc); + eASSERT(env, MDBX_IS_ERROR(rc)); + return rc; } - assert(MDBX_LOGGER_DONTCHANGE == ((MDBX_debug_func *)(intptr_t)-1)); - if (logger.ptr != (void *)((intptr_t)-1)) { - mdbx_static.logger.ptr = logger.ptr; - mdbx_static.logger_buffer = buffer; - mdbx_static.logger_buffer_size = buffer_size; + /* Lock against another process operating in without-lck or exclusive mode. */ + rc = lck_op(env->lazy_fd, op_setlk, + (env->flags & MDBX_RDONLY) ? F_RDLCK : F_WRLCK, env->pid, 1); + if (rc != MDBX_SUCCESS) { + ERROR("%s, err %u", "lock-against-without-lck", rc); + eASSERT(env, MDBX_IS_ERROR(rc)); + return rc; } - ENSURE(nullptr, osal_fastmutex_release(&debug_lock) == 0); - return rc; + /* Done: return with shared locking. 
*/ + return MDBX_RESULT_FALSE; } -__cold int mdbx_setup_debug_nofmt(MDBX_log_level_t level, - MDBX_debug_flags_t flags, - MDBX_debug_func_nofmt *logger, char *buffer, - size_t buffer_size) { - union logger_union thunk; - thunk.nofmt = - (logger && buffer && buffer_size) ? logger : MDBX_LOGGER_NOFMT_DONTCHANGE; - return setup_debug(level, flags, thunk, buffer, buffer_size); -} +MDBX_INTERNAL int lck_downgrade(MDBX_env *env) { + assert(env->lck_mmap.fd != INVALID_HANDLE_VALUE); + if (unlikely(osal_getpid() != env->pid)) + return MDBX_PANIC; -__cold int mdbx_setup_debug(MDBX_log_level_t level, MDBX_debug_flags_t flags, - MDBX_debug_func *logger) { - union logger_union thunk; - thunk.fmt = logger; - return setup_debug(level, flags, thunk, nullptr, 0); + int rc = MDBX_SUCCESS; + if ((env->flags & MDBX_EXCLUSIVE) == 0) { + rc = lck_op(env->lazy_fd, op_setlk, F_UNLCK, 0, env->pid); + if (rc == MDBX_SUCCESS) + rc = lck_op(env->lazy_fd, op_setlk, F_UNLCK, env->pid + 1, + OFF_T_MAX - env->pid - 1); + } + if (rc == MDBX_SUCCESS) + rc = lck_op(env->lck_mmap.fd, op_setlk, F_RDLCK, 0, 1); + if (unlikely(rc != 0)) { + ERROR("%s, err %u", "lck", rc); + assert(MDBX_IS_ERROR(rc)); + } + return rc; } -__cold static txnid_t kick_longlived_readers(MDBX_env *env, - const txnid_t laggard) { - DEBUG("DB size maxed out by reading #%" PRIaTXN, laggard); - osal_memory_fence(mo_AcquireRelease, false); - MDBX_hsr_func *const callback = env->me_hsr_callback; - txnid_t oldest = 0; - bool notify_eof_of_loop = false; - int retry = 0; - do { - const txnid_t steady = - env->me_txn->tw.troika.txnid[env->me_txn->tw.troika.prefer_steady]; - env->me_lck->mti_readers_refresh_flag.weak = /* force refresh */ true; - oldest = find_oldest_reader(env, steady); - eASSERT(env, oldest < env->me_txn0->mt_txnid); - eASSERT(env, oldest >= laggard); - eASSERT(env, oldest >= env->me_lck->mti_oldest_reader.weak); - - MDBX_lockinfo *const lck = env->me_lck_mmap.lck; - if (oldest == steady || oldest > laggard || /* 
without-LCK mode */ !lck) - break; +MDBX_INTERNAL int lck_upgrade(MDBX_env *env, bool dont_wait) { + assert(env->lck_mmap.fd != INVALID_HANDLE_VALUE); + if (unlikely(osal_getpid() != env->pid)) + return MDBX_PANIC; - if (MDBX_IS_ERROR(cleanup_dead_readers(env, false, NULL))) - break; + const int cmd = dont_wait ? op_setlk : op_setlkw; + int rc = lck_op(env->lck_mmap.fd, cmd, F_WRLCK, 0, 1); + if (rc == MDBX_SUCCESS && (env->flags & MDBX_EXCLUSIVE) == 0) { + rc = (env->pid > 1) ? lck_op(env->lazy_fd, cmd, F_WRLCK, 0, env->pid - 1) + : MDBX_SUCCESS; + if (rc == MDBX_SUCCESS) { + rc = lck_op(env->lazy_fd, cmd, F_WRLCK, env->pid + 1, + OFF_T_MAX - env->pid - 1); + if (rc != MDBX_SUCCESS && env->pid > 1 && + lck_op(env->lazy_fd, op_setlk, F_UNLCK, 0, env->pid - 1)) + rc = MDBX_PANIC; + } + if (rc != MDBX_SUCCESS && lck_op(env->lck_mmap.fd, op_setlk, F_RDLCK, 0, 1)) + rc = MDBX_PANIC; + } + if (unlikely(rc != 0)) { + ERROR("%s, err %u", "lck", rc); + assert(MDBX_IS_ERROR(rc)); + } + return rc; +} - if (!callback) - break; +__cold MDBX_INTERNAL int lck_destroy(MDBX_env *env, + MDBX_env *inprocess_neighbor, + const uint32_t current_pid) { + eASSERT(env, osal_getpid() == current_pid); + int rc = MDBX_SUCCESS; + struct stat lck_info; + lck_t *lck = env->lck; + if (lck && lck == env->lck_mmap.lck && !inprocess_neighbor && + /* try get exclusive access */ + lck_op(env->lck_mmap.fd, op_setlk, F_WRLCK, 0, OFF_T_MAX) == 0 && + /* if LCK was not removed */ + fstat(env->lck_mmap.fd, &lck_info) == 0 && lck_info.st_nlink > 0 && + lck_op(env->lazy_fd, op_setlk, + (env->flags & MDBX_RDONLY) ? 
F_RDLCK : F_WRLCK, 0, + OFF_T_MAX) == 0) { - MDBX_reader *stucked = nullptr; - uint64_t hold_retired = 0; - for (size_t i = 0; i < lck->mti_numreaders.weak; ++i) { - const uint64_t snap_retired = atomic_load64( - &lck->mti_readers[i].mr_snapshot_pages_retired, mo_Relaxed); - const txnid_t rtxn = safe64_read(&lck->mti_readers[i].mr_txnid); - if (rtxn == laggard && - atomic_load32(&lck->mti_readers[i].mr_pid, mo_AcquireRelease)) { - hold_retired = snap_retired; - stucked = &lck->mti_readers[i]; - } + VERBOSE("%p got exclusive, drown ipc-locks", (void *)env); + eASSERT(env, current_pid == env->pid); +#if MDBX_LOCKING == MDBX_LOCKING_SYSV + if (env->me_sysv_ipc.semid != -1) + rc = semctl(env->me_sysv_ipc.semid, 2, IPC_RMID) ? errno : 0; +#else + rc = lck_ipclock_destroy(&lck->rdt_lock); + if (rc == 0) + rc = lck_ipclock_destroy(&lck->wrt_lock); +#endif /* MDBX_LOCKING */ + + eASSERT(env, rc == 0); + if (rc == 0) { + const bool synced = lck->unsynced_pages.weak == 0; + osal_munmap(&env->lck_mmap); + if (synced && env->lck_mmap.fd != INVALID_HANDLE_VALUE) + rc = ftruncate(env->lck_mmap.fd, 0) ? errno : 0; } - if (!stucked) - break; + jitter4testing(false); + } - uint32_t pid = atomic_load32(&stucked->mr_pid, mo_AcquireRelease); - uint64_t tid = atomic_load64(&stucked->mr_tid, mo_AcquireRelease); - if (safe64_read(&stucked->mr_txnid) != laggard || !pid || - stucked->mr_snapshot_pages_retired.weak != hold_retired) - continue; + if (current_pid != env->pid) { + eASSERT(env, !inprocess_neighbor); + NOTICE("drown env %p after-fork pid %d -> %d", + __Wpedantic_format_voidptr(env), env->pid, current_pid); + inprocess_neighbor = nullptr; + } - const meta_ptr_t head = meta_recent(env, &env->me_txn->tw.troika); - const txnid_t gap = (head.txnid - laggard) / xMDBX_TXNID_STEP; - const uint64_t head_retired = - unaligned_peek_u64(4, head.ptr_c->mm_pages_retired); - const size_t space = - (head_retired > hold_retired) - ? 
pgno2bytes(env, (pgno_t)(head_retired - hold_retired)) - : 0; - int rc = - callback(env, env->me_txn, pid, (mdbx_tid_t)((intptr_t)tid), laggard, - (gap < UINT_MAX) ? (unsigned)gap : UINT_MAX, space, retry); - if (rc < 0) - /* hsr returned error and/or agree MDBX_MAP_FULL error */ - break; + /* 1) POSIX's fcntl() locks (i.e. when op_setlk == F_SETLK) should be restored + * after file was closed. + * + * 2) File locks would be released (by kernel) while the file-descriptors will + * be closed. But to avoid false-positive EACCESS and EDEADLK from the kernel, + * locks should be released here explicitly with properly order. */ - if (rc > 0) { - if (rc == 1) { - /* hsr reported transaction (will be) aborted asynchronous */ - safe64_reset_compare(&stucked->mr_txnid, laggard); - } else { - /* hsr reported reader process was killed and slot should be cleared */ - safe64_reset(&stucked->mr_txnid, true); - atomic_store64(&stucked->mr_tid, 0, mo_Relaxed); - atomic_store32(&stucked->mr_pid, 0, mo_AcquireRelease); - } - } else if (!notify_eof_of_loop) { -#if MDBX_ENABLE_PROFGC - env->me_lck->mti_pgop_stat.gc_prof.kicks += 1; -#endif /* MDBX_ENABLE_PROFGC */ - notify_eof_of_loop = true; + /* close dxb and restore lock */ + if (env->dsync_fd != INVALID_HANDLE_VALUE) { + if (unlikely(close(env->dsync_fd) != 0) && rc == MDBX_SUCCESS) + rc = errno; + env->dsync_fd = INVALID_HANDLE_VALUE; + } + if (env->lazy_fd != INVALID_HANDLE_VALUE) { + if (unlikely(close(env->lazy_fd) != 0) && rc == MDBX_SUCCESS) + rc = errno; + env->lazy_fd = INVALID_HANDLE_VALUE; + if (op_setlk == F_SETLK && inprocess_neighbor && rc == MDBX_SUCCESS) { + /* restore file-lock */ + rc = lck_op(inprocess_neighbor->lazy_fd, F_SETLKW, + (inprocess_neighbor->flags & MDBX_RDONLY) ? F_RDLCK : F_WRLCK, + (inprocess_neighbor->flags & MDBX_EXCLUSIVE) + ? 0 + : inprocess_neighbor->pid, + (inprocess_neighbor->flags & MDBX_EXCLUSIVE) ? 
OFF_T_MAX : 1); } + } - } while (++retry < INT_MAX); - - if (notify_eof_of_loop) { - /* notify end of hsr-loop */ - const txnid_t turn = oldest - laggard; - if (turn) - NOTICE("hsr-kick: done turn %" PRIaTXN " -> %" PRIaTXN " +%" PRIaTXN, - laggard, oldest, turn); - callback(env, env->me_txn, 0, 0, laggard, - (turn < UINT_MAX) ? (unsigned)turn : UINT_MAX, 0, -retry); + /* close clk and restore locks */ + if (env->lck_mmap.fd != INVALID_HANDLE_VALUE) { + if (unlikely(close(env->lck_mmap.fd) != 0) && rc == MDBX_SUCCESS) + rc = errno; + env->lck_mmap.fd = INVALID_HANDLE_VALUE; + if (op_setlk == F_SETLK && inprocess_neighbor && rc == MDBX_SUCCESS) { + /* restore file-locks */ + rc = lck_op(inprocess_neighbor->lck_mmap.fd, F_SETLKW, F_RDLCK, 0, 1); + if (rc == MDBX_SUCCESS && inprocess_neighbor->registered_reader_pid) + rc = lck_rpid_set(inprocess_neighbor); + } } - return oldest; + + if (inprocess_neighbor && rc != MDBX_SUCCESS) + inprocess_neighbor->flags |= ENV_FATAL_ERROR; + return rc; } -__cold int mdbx_env_set_hsr(MDBX_env *env, MDBX_hsr_func *hsr) { - int rc = check_env(env, false); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; +/*---------------------------------------------------------------------------*/ - env->me_hsr_callback = hsr; - return MDBX_SUCCESS; -} +__cold MDBX_INTERNAL int lck_init(MDBX_env *env, MDBX_env *inprocess_neighbor, + int global_uniqueness_flag) { +#if MDBX_LOCKING == MDBX_LOCKING_SYSV + int semid = -1; + /* don't initialize semaphores twice */ + (void)inprocess_neighbor; + if (global_uniqueness_flag == MDBX_RESULT_TRUE) { + struct stat st; + if (fstat(env->lazy_fd, &st)) + return errno; + sysv_retry_create: + semid = semget(env->me_sysv_ipc.key, 2, + IPC_CREAT | IPC_EXCL | + (st.st_mode & (S_IRWXU | S_IRWXG | S_IRWXO))); + if (unlikely(semid == -1)) { + int err = errno; + if (err != EEXIST) + return err; -__cold MDBX_hsr_func *mdbx_env_get_hsr(const MDBX_env *env) { - return likely(env && env->me_signature.weak == MDBX_ME_SIGNATURE) - 
? env->me_hsr_callback - : NULL; -} + /* remove and re-create semaphore set */ + semid = semget(env->me_sysv_ipc.key, 2, 0); + if (semid == -1) { + err = errno; + if (err != ENOENT) + return err; + goto sysv_retry_create; + } + if (semctl(semid, 2, IPC_RMID)) { + err = errno; + if (err != EIDRM) + return err; + } + goto sysv_retry_create; + } -#ifdef __SANITIZE_THREAD__ -/* LY: avoid tsan-trap by me_txn, mm_last_pg and mt_next_pgno */ -__attribute__((__no_sanitize_thread__, __noinline__)) -#endif -int mdbx_txn_straggler(const MDBX_txn *txn, int *percent) -{ - int rc = check_txn(txn, MDBX_TXN_BLOCKED); - if (unlikely(rc != MDBX_SUCCESS)) - return (rc > 0) ? -rc : rc; + unsigned short val_array[2] = {1, 1}; + if (semctl(semid, 2, SETALL, val_array)) + return errno; + } else { + semid = semget(env->me_sysv_ipc.key, 2, 0); + if (semid == -1) + return errno; - MDBX_env *env = txn->mt_env; - if (unlikely((txn->mt_flags & MDBX_TXN_RDONLY) == 0)) { - if (percent) - *percent = - (int)((txn->mt_next_pgno * UINT64_C(100) + txn->mt_end_pgno / 2) / - txn->mt_end_pgno); - return 0; + /* check read & write access */ + struct semid_ds data[2]; + if (semctl(semid, 2, IPC_STAT, data) || semctl(semid, 2, IPC_SET, data)) + return errno; } - txnid_t lag; - meta_troika_t troika = meta_tap(env); - do { - const meta_ptr_t head = meta_recent(env, &troika); - if (percent) { - const pgno_t maxpg = head.ptr_v->mm_geo.now; - *percent = - (int)((head.ptr_v->mm_geo.next * UINT64_C(100) + maxpg / 2) / maxpg); - } - lag = (head.txnid - txn->mt_txnid) / xMDBX_TXNID_STEP; - } while (unlikely(meta_should_retry(env, &troika))); + env->me_sysv_ipc.semid = semid; + return MDBX_SUCCESS; - return (lag > INT_MAX) ? 
INT_MAX : (int)lag; -} +#elif MDBX_LOCKING == MDBX_LOCKING_FUTEX + (void)inprocess_neighbor; + if (global_uniqueness_flag != MDBX_RESULT_TRUE) + return MDBX_SUCCESS; +#error "FIXME: Not implemented" +#elif MDBX_LOCKING == MDBX_LOCKING_POSIX1988 -typedef struct mdbx_walk_ctx { - void *mw_user; - MDBX_pgvisitor_func *mw_visitor; - MDBX_txn *mw_txn; - MDBX_cursor *mw_cursor; - bool mw_dont_check_keys_ordering; -} mdbx_walk_ctx_t; + /* don't initialize semaphores twice */ + (void)inprocess_neighbor; + if (global_uniqueness_flag == MDBX_RESULT_TRUE) { + if (sem_init(&env->lck_mmap.lck->rdt_lock, true, 1)) + return errno; + if (sem_init(&env->lck_mmap.lck->wrt_lock, true, 1)) + return errno; + } + return MDBX_SUCCESS; -__cold static int walk_sdb(mdbx_walk_ctx_t *ctx, MDBX_walk_sdb_t *sdb, - int deep); +#elif MDBX_LOCKING == MDBX_LOCKING_POSIX2001 || \ + MDBX_LOCKING == MDBX_LOCKING_POSIX2008 + if (inprocess_neighbor) + return MDBX_SUCCESS /* don't need any initialization for mutexes + if LCK already opened/used inside current process */ + ; -static MDBX_page_type_t walk_page_type(const MDBX_page *mp) { - if (mp) - switch (mp->mp_flags & ~P_SPILLED) { - case P_BRANCH: - return MDBX_page_branch; - case P_LEAF: - return MDBX_page_leaf; - case P_LEAF | P_LEAF2: - return MDBX_page_dupfixed_leaf; - case P_OVERFLOW: - return MDBX_page_large; - } - return MDBX_page_broken; -} + /* FIXME: Unfortunately, there is no other reliable way but to long testing + * on each platform. On the other hand, behavior like FreeBSD is incorrect + * and we can expect it to be rare. Moreover, even on FreeBSD without + * additional in-process initialization, the probability of an problem + * occurring is vanishingly small, and the symptom is a return of EINVAL + * while locking a mutex. In other words, in the worst case, the problem + * results in an EINVAL error at the start of the transaction, but NOT data + * loss, nor database corruption, nor other fatal troubles. 
Thus, the code + * below I am inclined to think the workaround for erroneous platforms (like + * FreeBSD), rather than a defect of libmdbx. */ +#if defined(__FreeBSD__) + /* seems that shared mutexes on FreeBSD required in-process initialization */ + (void)global_uniqueness_flag; +#else + /* shared mutexes on many other platforms (including Darwin and Linux's + * futexes) doesn't need any addition in-process initialization */ + if (global_uniqueness_flag != MDBX_RESULT_TRUE) + return MDBX_SUCCESS; +#endif -/* Depth-first tree traversal. */ -__cold static int walk_tree(mdbx_walk_ctx_t *ctx, const pgno_t pgno, - MDBX_walk_sdb_t *sdb, int deep, - txnid_t parent_txnid) { - assert(pgno != P_INVALID); - MDBX_page *mp = nullptr; - int err = page_get(ctx->mw_cursor, pgno, &mp, parent_txnid); + pthread_mutexattr_t ma; + int rc = pthread_mutexattr_init(&ma); + if (rc) + return rc; - MDBX_page_type_t type = walk_page_type(mp); - const size_t nentries = mp ? page_numkeys(mp) : 0; - unsigned npages = 1; - size_t pagesize = pgno2bytes(ctx->mw_txn->mt_env, npages); - size_t header_size = - (mp && !IS_LEAF2(mp)) ? PAGEHDRSZ + mp->mp_lower : PAGEHDRSZ; - size_t payload_size = 0; - size_t unused_size = - (mp ? 
page_room(mp) : pagesize - header_size) - payload_size; - size_t align_bytes = 0; + rc = pthread_mutexattr_setpshared(&ma, PTHREAD_PROCESS_SHARED); + if (rc) + goto bailout; - for (size_t i = 0; err == MDBX_SUCCESS && i < nentries; ++i) { - if (type == MDBX_page_dupfixed_leaf) { - /* LEAF2 pages have no mp_ptrs[] or node headers */ - payload_size += mp->mp_leaf2_ksize; - continue; - } +#if MDBX_LOCKING == MDBX_LOCKING_POSIX2008 +#if defined(PTHREAD_MUTEX_ROBUST) || defined(pthread_mutexattr_setrobust) + rc = pthread_mutexattr_setrobust(&ma, PTHREAD_MUTEX_ROBUST); +#elif defined(PTHREAD_MUTEX_ROBUST_NP) || \ + defined(pthread_mutexattr_setrobust_np) + rc = pthread_mutexattr_setrobust_np(&ma, PTHREAD_MUTEX_ROBUST_NP); +#elif _POSIX_THREAD_PROCESS_SHARED < 200809L + rc = pthread_mutexattr_setrobust_np(&ma, PTHREAD_MUTEX_ROBUST_NP); +#else + rc = pthread_mutexattr_setrobust(&ma, PTHREAD_MUTEX_ROBUST); +#endif + if (rc) + goto bailout; +#endif /* MDBX_LOCKING == MDBX_LOCKING_POSIX2008 */ - const MDBX_node *node = page_node(mp, i); - header_size += NODESIZE; - const size_t node_key_size = node_ks(node); - payload_size += node_key_size; +#if defined(_POSIX_THREAD_PRIO_INHERIT) && _POSIX_THREAD_PRIO_INHERIT >= 0 && \ + !defined(MDBX_SAFE4QEMU) + rc = pthread_mutexattr_setprotocol(&ma, PTHREAD_PRIO_INHERIT); + if (rc == ENOTSUP) + rc = pthread_mutexattr_setprotocol(&ma, PTHREAD_PRIO_NONE); + if (rc && rc != ENOTSUP) + goto bailout; +#endif /* PTHREAD_PRIO_INHERIT */ - if (type == MDBX_page_branch) { - assert(i > 0 || node_ks(node) == 0); - align_bytes += node_key_size & 1; - continue; - } + rc = pthread_mutexattr_settype(&ma, PTHREAD_MUTEX_ERRORCHECK); + if (rc && rc != ENOTSUP) + goto bailout; - const size_t node_data_size = node_ds(node); - assert(type == MDBX_page_leaf); - switch (node_flags(node)) { - case 0 /* usual node */: - payload_size += node_data_size; - align_bytes += (node_key_size + node_data_size) & 1; - break; + rc = 
pthread_mutex_init(&env->lck_mmap.lck->rdt_lock, &ma); + if (rc) + goto bailout; + rc = pthread_mutex_init(&env->lck_mmap.lck->wrt_lock, &ma); - case F_BIGDATA /* long data on the large/overflow page */: { - const pgno_t large_pgno = node_largedata_pgno(node); - const size_t over_payload = node_data_size; - const size_t over_header = PAGEHDRSZ; - npages = 1; +bailout: + pthread_mutexattr_destroy(&ma); + return rc; +#else +#error "FIXME" +#endif /* MDBX_LOCKING > 0 */ +} - assert(err == MDBX_SUCCESS); - pgr_t lp = page_get_large(ctx->mw_cursor, large_pgno, mp->mp_txnid); - err = lp.err; - if (err == MDBX_SUCCESS) { - cASSERT(ctx->mw_cursor, PAGETYPE_WHOLE(lp.page) == P_OVERFLOW); - npages = lp.page->mp_pages; - } +__cold static int osal_ipclock_failed(MDBX_env *env, osal_ipclock_t *ipc, + const int err) { + int rc = err; +#if MDBX_LOCKING == MDBX_LOCKING_POSIX2008 || MDBX_LOCKING == MDBX_LOCKING_SYSV - pagesize = pgno2bytes(ctx->mw_txn->mt_env, npages); - const size_t over_unused = pagesize - over_payload - over_header; - const int rc = ctx->mw_visitor(large_pgno, npages, ctx->mw_user, deep, - sdb, pagesize, MDBX_page_large, err, 1, - over_payload, over_header, over_unused); - if (unlikely(rc != MDBX_SUCCESS)) - return (rc == MDBX_RESULT_TRUE) ? MDBX_SUCCESS : rc; - payload_size += sizeof(pgno_t); - align_bytes += node_key_size & 1; - } break; +#ifndef EOWNERDEAD +#define EOWNERDEAD MDBX_RESULT_TRUE +#endif /* EOWNERDEAD */ - case F_SUBDATA /* sub-db */: { - if (unlikely(node_data_size != sizeof(MDBX_db))) { - ERROR("%s/%d: %s %u", "MDBX_CORRUPTED", MDBX_CORRUPTED, - "invalid subDb node size", (unsigned)node_data_size); - assert(err == MDBX_CORRUPTED); - err = MDBX_CORRUPTED; + if (err == EOWNERDEAD) { + /* We own the mutex. Clean up after dead previous owner. 
*/ + const bool rlocked = ipc == &env->lck->rdt_lock; + rc = MDBX_SUCCESS; + if (!rlocked) { + if (unlikely(env->txn)) { + /* env is hosed if the dead thread was ours */ + env->flags |= ENV_FATAL_ERROR; + env->txn = nullptr; + rc = MDBX_PANIC; } - header_size += node_data_size; - align_bytes += (node_key_size + node_data_size) & 1; - } break; + } + WARNING("%clock owner died, %s", (rlocked ? 'r' : 'w'), + (rc ? "this process' env is hosed" : "recovering")); - case F_SUBDATA | F_DUPDATA /* dupsorted sub-tree */: - if (unlikely(node_data_size != sizeof(MDBX_db))) { - ERROR("%s/%d: %s %u", "MDBX_CORRUPTED", MDBX_CORRUPTED, - "invalid sub-tree node size", (unsigned)node_data_size); - assert(err == MDBX_CORRUPTED); - err = MDBX_CORRUPTED; - } - header_size += node_data_size; - align_bytes += (node_key_size + node_data_size) & 1; - break; + int check_rc = mvcc_cleanup_dead(env, rlocked, nullptr); + check_rc = (check_rc == MDBX_SUCCESS) ? MDBX_RESULT_TRUE : check_rc; - case F_DUPDATA /* short sub-page */: { - if (unlikely(node_data_size <= PAGEHDRSZ || (node_data_size & 1))) { - ERROR("%s/%d: %s %u", "MDBX_CORRUPTED", MDBX_CORRUPTED, - "invalid sub-page node size", (unsigned)node_data_size); - assert(err == MDBX_CORRUPTED); - err = MDBX_CORRUPTED; - break; - } - - MDBX_page *sp = node_data(node); - const size_t nsubkeys = page_numkeys(sp); - size_t subheader_size = - IS_LEAF2(sp) ? 
PAGEHDRSZ : PAGEHDRSZ + sp->mp_lower; - size_t subunused_size = page_room(sp); - size_t subpayload_size = 0; - size_t subalign_bytes = 0; - MDBX_page_type_t subtype; - - switch (sp->mp_flags & /* ignore legacy P_DIRTY flag */ ~P_LEGACY_DIRTY) { - case P_LEAF | P_SUBP: - subtype = MDBX_subpage_leaf; - break; - case P_LEAF | P_LEAF2 | P_SUBP: - subtype = MDBX_subpage_dupfixed_leaf; - break; - default: - ERROR("%s/%d: %s 0x%x", "MDBX_CORRUPTED", MDBX_CORRUPTED, - "invalid sub-page flags", sp->mp_flags); - assert(err == MDBX_CORRUPTED); - subtype = MDBX_subpage_broken; - err = MDBX_CORRUPTED; - } - - for (size_t j = 0; err == MDBX_SUCCESS && j < nsubkeys; ++j) { - if (subtype == MDBX_subpage_dupfixed_leaf) { - /* LEAF2 pages have no mp_ptrs[] or node headers */ - subpayload_size += sp->mp_leaf2_ksize; - } else { - assert(subtype == MDBX_subpage_leaf); - const MDBX_node *subnode = page_node(sp, j); - const size_t subnode_size = node_ks(subnode) + node_ds(subnode); - subheader_size += NODESIZE; - subpayload_size += subnode_size; - subalign_bytes += subnode_size & 1; - if (unlikely(node_flags(subnode) != 0)) { - ERROR("%s/%d: %s 0x%x", "MDBX_CORRUPTED", MDBX_CORRUPTED, - "unexpected sub-node flags", node_flags(subnode)); - assert(err == MDBX_CORRUPTED); - err = MDBX_CORRUPTED; - } - } - } +#if MDBX_LOCKING == MDBX_LOCKING_SYSV + rc = (rc == MDBX_SUCCESS) ? check_rc : rc; +#else +#if defined(PTHREAD_MUTEX_ROBUST) || defined(pthread_mutex_consistent) + int mreco_rc = pthread_mutex_consistent(ipc); +#elif defined(PTHREAD_MUTEX_ROBUST_NP) || defined(pthread_mutex_consistent_np) + int mreco_rc = pthread_mutex_consistent_np(ipc); +#elif _POSIX_THREAD_PROCESS_SHARED < 200809L + int mreco_rc = pthread_mutex_consistent_np(ipc); +#else + int mreco_rc = pthread_mutex_consistent(ipc); +#endif + check_rc = (mreco_rc == 0) ? 
check_rc : mreco_rc; - const int rc = - ctx->mw_visitor(pgno, 0, ctx->mw_user, deep + 1, sdb, node_data_size, - subtype, err, nsubkeys, subpayload_size, - subheader_size, subunused_size + subalign_bytes); - if (unlikely(rc != MDBX_SUCCESS)) - return (rc == MDBX_RESULT_TRUE) ? MDBX_SUCCESS : rc; - header_size += subheader_size; - unused_size += subunused_size; - payload_size += subpayload_size; - align_bytes += subalign_bytes + (node_key_size & 1); - } break; + if (unlikely(mreco_rc)) + ERROR("lock recovery failed, %s", mdbx_strerror(mreco_rc)); - default: - ERROR("%s/%d: %s 0x%x", "MDBX_CORRUPTED", MDBX_CORRUPTED, - "invalid node flags", node_flags(node)); - assert(err == MDBX_CORRUPTED); - err = MDBX_CORRUPTED; - } + rc = (rc == MDBX_SUCCESS) ? check_rc : rc; + if (MDBX_IS_ERROR(rc)) + pthread_mutex_unlock(ipc); +#endif /* MDBX_LOCKING == MDBX_LOCKING_POSIX2008 */ + return rc; } +#elif MDBX_LOCKING == MDBX_LOCKING_POSIX2001 + (void)ipc; +#elif MDBX_LOCKING == MDBX_LOCKING_POSIX1988 + (void)ipc; +#elif MDBX_LOCKING == MDBX_LOCKING_FUTEX +#ifdef _MSC_VER +#pragma message("warning: TODO") +#else +#warning "TODO" +#endif + (void)ipc; +#else +#error "FIXME" +#endif /* MDBX_LOCKING */ - const int rc = ctx->mw_visitor( - pgno, 1, ctx->mw_user, deep, sdb, ctx->mw_txn->mt_env->me_psize, type, - err, nentries, payload_size, header_size, unused_size + align_bytes); - if (unlikely(rc != MDBX_SUCCESS)) - return (rc == MDBX_RESULT_TRUE) ? 
MDBX_SUCCESS : rc; - - for (size_t i = 0; err == MDBX_SUCCESS && i < nentries; ++i) { - if (type == MDBX_page_dupfixed_leaf) - continue; + ERROR("mutex (un)lock failed, %s", mdbx_strerror(err)); + if (rc != EDEADLK) + env->flags |= ENV_FATAL_ERROR; + return rc; +} - MDBX_node *node = page_node(mp, i); - if (type == MDBX_page_branch) { - assert(err == MDBX_SUCCESS); - err = walk_tree(ctx, node_pgno(node), sdb, deep + 1, mp->mp_txnid); - if (unlikely(err != MDBX_SUCCESS)) { - if (err == MDBX_RESULT_TRUE) - break; - return err; - } - continue; +#if defined(__ANDROID_API__) || defined(ANDROID) || defined(BIONIC) +MDBX_INTERNAL int osal_check_tid4bionic(void) { + /* avoid 32-bit Bionic bug/hang with 32-pit TID */ + if (sizeof(pthread_mutex_t) < sizeof(pid_t) + sizeof(unsigned)) { + pid_t tid = gettid(); + if (unlikely(tid > 0xffff)) { + FATAL("Raise the ENOSYS(%d) error to avoid hang due " + "the 32-bit Bionic/Android bug with tid/thread_id 0x%08x(%i) " + "that don’t fit in 16 bits, see " + "https://android.googlesource.com/platform/bionic/+/master/" + "docs/32-bit-abi.md#is-too-small-for-large-pids", + ENOSYS, tid, tid); + return ENOSYS; } + } + return 0; +} +#endif /* __ANDROID_API__ || ANDROID) || BIONIC */ - assert(type == MDBX_page_leaf); - switch (node_flags(node)) { - default: - continue; - - case F_SUBDATA /* sub-db */: - if (unlikely(node_ds(node) != sizeof(MDBX_db))) { - ERROR("%s/%d: %s %u", "MDBX_CORRUPTED", MDBX_CORRUPTED, - "invalid sub-tree node size", (unsigned)node_ds(node)); - assert(err == MDBX_CORRUPTED); - err = MDBX_CORRUPTED; - } else { - MDBX_db aligned_db; - memcpy(&aligned_db, node_data(node), sizeof(aligned_db)); - MDBX_walk_sdb_t sdb_info = { - {node_key(node), node_ks(node)}, nullptr, nullptr}; - sdb_info.internal = &aligned_db; - assert(err == MDBX_SUCCESS); - err = walk_sdb(ctx, &sdb_info, deep + 1); - } - break; - - case F_SUBDATA | F_DUPDATA /* dupsorted sub-tree */: - if (unlikely(node_ds(node) != sizeof(MDBX_db))) { - ERROR("%s/%d: %s 
%u", "MDBX_CORRUPTED", MDBX_CORRUPTED, - "invalid dupsort sub-tree node size", (unsigned)node_ds(node)); - assert(err == MDBX_CORRUPTED); - err = MDBX_CORRUPTED; - } else if (unlikely(!ctx->mw_cursor->mc_xcursor)) { - ERROR("%s/%d: %s", "MDBX_CORRUPTED", MDBX_CORRUPTED, - "unexpected dupsort sub-tree node for non-dupsort subDB"); - assert(err == MDBX_CORRUPTED); - err = MDBX_CORRUPTED; - } else { - MDBX_db aligned_db; - memcpy(&aligned_db, node_data(node), sizeof(aligned_db)); - assert(ctx->mw_cursor->mc_xcursor == - &container_of(ctx->mw_cursor, MDBX_cursor_couple, outer)->inner); - assert(err == MDBX_SUCCESS); - err = cursor_xinit1(ctx->mw_cursor, node, mp); - if (likely(err == MDBX_SUCCESS)) { - ctx->mw_cursor = &ctx->mw_cursor->mc_xcursor->mx_cursor; - sdb->nested = &aligned_db; - err = walk_tree(ctx, aligned_db.md_root, sdb, deep + 1, mp->mp_txnid); - sdb->nested = nullptr; - MDBX_xcursor *inner_xcursor = - container_of(ctx->mw_cursor, MDBX_xcursor, mx_cursor); - MDBX_cursor_couple *couple = - container_of(inner_xcursor, MDBX_cursor_couple, inner); - ctx->mw_cursor = &couple->outer; - } - } - break; +static int osal_ipclock_lock(MDBX_env *env, osal_ipclock_t *ipc, + const bool dont_wait) { +#if MDBX_LOCKING == MDBX_LOCKING_POSIX2001 || \ + MDBX_LOCKING == MDBX_LOCKING_POSIX2008 + int rc = osal_check_tid4bionic(); + if (likely(rc == 0)) + rc = dont_wait ? pthread_mutex_trylock(ipc) : pthread_mutex_lock(ipc); + rc = (rc == EBUSY && dont_wait) ? MDBX_BUSY : rc; +#elif MDBX_LOCKING == MDBX_LOCKING_POSIX1988 + int rc = MDBX_SUCCESS; + if (dont_wait) { + if (sem_trywait(ipc)) { + rc = errno; + if (rc == EAGAIN) + rc = MDBX_BUSY; } + } else if (sem_wait(ipc)) + rc = errno; +#elif MDBX_LOCKING == MDBX_LOCKING_SYSV + struct sembuf op = {.sem_num = (ipc != &env->lck->wrt_lock), + .sem_op = -1, + .sem_flg = dont_wait ? 
IPC_NOWAIT | SEM_UNDO : SEM_UNDO}; + int rc; + if (semop(env->me_sysv_ipc.semid, &op, 1)) { + rc = errno; + if (dont_wait && rc == EAGAIN) + rc = MDBX_BUSY; + } else { + rc = *ipc ? EOWNERDEAD : MDBX_SUCCESS; + *ipc = env->pid; } +#else +#error "FIXME" +#endif /* MDBX_LOCKING */ - return MDBX_SUCCESS; + if (unlikely(rc != MDBX_SUCCESS && rc != MDBX_BUSY)) + rc = osal_ipclock_failed(env, ipc, rc); + return rc; } -__cold static int walk_sdb(mdbx_walk_ctx_t *ctx, MDBX_walk_sdb_t *sdb, - int deep) { - struct MDBX_db *const db = sdb->internal; - if (unlikely(db->md_root == P_INVALID)) - return MDBX_SUCCESS; /* empty db */ - - MDBX_cursor_couple couple; - MDBX_dbx dbx = {.md_klen_min = INT_MAX}; - uint8_t dbi_state = DBI_LINDO | DBI_VALID; - int rc = couple_init(&couple, ~0u, ctx->mw_txn, db, &dbx, &dbi_state); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; - - couple.outer.mc_checking |= ctx->mw_dont_check_keys_ordering - ? CC_SKIPORD | CC_PAGECHECK - : CC_PAGECHECK; - couple.inner.mx_cursor.mc_checking |= ctx->mw_dont_check_keys_ordering - ? CC_SKIPORD | CC_PAGECHECK - : CC_PAGECHECK; - couple.outer.mc_next = ctx->mw_cursor; - ctx->mw_cursor = &couple.outer; - rc = walk_tree(ctx, db->md_root, sdb, deep, - db->md_mod_txnid ? db->md_mod_txnid : ctx->mw_txn->mt_txnid); - ctx->mw_cursor = couple.outer.mc_next; +int osal_ipclock_unlock(MDBX_env *env, osal_ipclock_t *ipc) { + int err = MDBX_ENOSYS; +#if MDBX_LOCKING == MDBX_LOCKING_POSIX2001 || \ + MDBX_LOCKING == MDBX_LOCKING_POSIX2008 + err = pthread_mutex_unlock(ipc); +#elif MDBX_LOCKING == MDBX_LOCKING_POSIX1988 + err = sem_post(ipc) ? errno : MDBX_SUCCESS; +#elif MDBX_LOCKING == MDBX_LOCKING_SYSV + if (unlikely(*ipc != (pid_t)env->pid)) + err = EPERM; + else { + *ipc = 0; + struct sembuf op = {.sem_num = (ipc != &env->lck->wrt_lock), + .sem_op = 1, + .sem_flg = SEM_UNDO}; + err = semop(env->me_sysv_ipc.semid, &op, 1) ? 
errno : MDBX_SUCCESS; + } +#else +#error "FIXME" +#endif /* MDBX_LOCKING */ + int rc = err; + if (unlikely(rc != MDBX_SUCCESS)) { + const uint32_t current_pid = osal_getpid(); + if (current_pid == env->pid || LOG_ENABLED(MDBX_LOG_NOTICE)) + debug_log((current_pid == env->pid) + ? MDBX_LOG_FATAL + : (rc = MDBX_SUCCESS, MDBX_LOG_NOTICE), + "ipc-unlock()", __LINE__, "failed: env %p, lck-%s %p, err %d\n", + __Wpedantic_format_voidptr(env), + (env->lck == env->lck_mmap.lck) ? "mmap" : "stub", + __Wpedantic_format_voidptr(env->lck), err); + } return rc; } -__cold int mdbx_env_pgwalk(MDBX_txn *txn, MDBX_pgvisitor_func *visitor, - void *user, bool dont_check_keys_ordering) { - int rc = check_txn(txn, MDBX_TXN_BLOCKED); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; - - mdbx_walk_ctx_t ctx; - memset(&ctx, 0, sizeof(ctx)); - ctx.mw_txn = txn; - ctx.mw_user = user; - ctx.mw_visitor = visitor; - ctx.mw_dont_check_keys_ordering = dont_check_keys_ordering; - - MDBX_walk_sdb_t sdb = {{MDBX_CHK_GC, 0}, &txn->mt_dbs[FREE_DBI], nullptr}; - rc = walk_sdb(&ctx, &sdb, 0); - if (!MDBX_IS_ERROR(rc)) { - sdb.name.iov_base = MDBX_CHK_MAIN; - sdb.internal = &txn->mt_dbs[MAIN_DBI]; - rc = walk_sdb(&ctx, &sdb, 0); - } +MDBX_INTERNAL int lck_rdt_lock(MDBX_env *env) { + TRACE("%s", ">>"); + jitter4testing(true); + int rc = osal_ipclock_lock(env, &env->lck->rdt_lock, false); + TRACE("<< rc %d", rc); return rc; } -int mdbx_canary_put(MDBX_txn *txn, const MDBX_canary *canary) { - int rc = check_txn_rw(txn, MDBX_TXN_BLOCKED); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; +MDBX_INTERNAL void lck_rdt_unlock(MDBX_env *env) { + TRACE("%s", ">>"); + int err = osal_ipclock_unlock(env, &env->lck->rdt_lock); + TRACE("<< err %d", err); + if (unlikely(err != MDBX_SUCCESS)) + mdbx_panic("%s() failed: err %d\n", __func__, err); + jitter4testing(true); +} - if (likely(canary)) { - if (txn->mt_canary.x == canary->x && txn->mt_canary.y == canary->y && - txn->mt_canary.z == canary->z) - return MDBX_SUCCESS; - 
txn->mt_canary.x = canary->x; - txn->mt_canary.y = canary->y; - txn->mt_canary.z = canary->z; +int lck_txn_lock(MDBX_env *env, bool dont_wait) { + TRACE("%swait %s", dont_wait ? "dont-" : "", ">>"); + jitter4testing(true); + const int err = osal_ipclock_lock(env, &env->lck->wrt_lock, dont_wait); + int rc = err; + if (likely(!MDBX_IS_ERROR(err))) { + eASSERT(env, !env->basal_txn->owner || + err == /* если другой поток в этом-же процессе завершился + не освободив блокировку */ + MDBX_RESULT_TRUE); + env->basal_txn->owner = osal_thread_self(); + rc = MDBX_SUCCESS; } - txn->mt_canary.v = txn->mt_txnid; - txn->mt_flags |= MDBX_TXN_DIRTY; + TRACE("<< err %d, rc %d", err, rc); + return rc; +} - return MDBX_SUCCESS; +void lck_txn_unlock(MDBX_env *env) { + TRACE("%s", ">>"); + eASSERT(env, env->basal_txn->owner == osal_thread_self()); + env->basal_txn->owner = 0; + int err = osal_ipclock_unlock(env, &env->lck->wrt_lock); + TRACE("<< err %d", err); + if (unlikely(err != MDBX_SUCCESS)) + mdbx_panic("%s() failed: err %d\n", __func__, err); + jitter4testing(true); } -int mdbx_canary_get(const MDBX_txn *txn, MDBX_canary *canary) { - int rc = check_txn(txn, MDBX_TXN_BLOCKED); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; +#endif /* !Windows LCK-implementation */ +/// \copyright SPDX-License-Identifier: Apache-2.0 +/// \author Леонид Юрьев aka Leonid Yuriev \date 2015-2024 - if (unlikely(canary == NULL)) - return MDBX_EINVAL; +#if defined(_WIN32) || defined(_WIN64) - *canary = txn->mt_canary; - return MDBX_SUCCESS; -} +/* PREAMBLE FOR WINDOWS: + * + * We are not concerned for performance here. + * If you are running Windows a performance could NOT be the goal. + * Otherwise please use Linux. */ -int mdbx_cursor_on_first(const MDBX_cursor *mc) { - if (unlikely(mc == NULL)) - return MDBX_EINVAL; - if (unlikely(mc->mc_signature != MDBX_MC_LIVE)) - return (mc->mc_signature == MDBX_MC_READY4CLOSE) ? 
MDBX_EINVAL - : MDBX_EBADSIGN; +#define LCK_SHARED 0 +#define LCK_EXCLUSIVE LOCKFILE_EXCLUSIVE_LOCK +#define LCK_WAITFOR 0 +#define LCK_DONTWAIT LOCKFILE_FAIL_IMMEDIATELY - if (!(mc->mc_flags & C_INITIALIZED)) - return mc->mc_db->md_entries ? MDBX_RESULT_FALSE : MDBX_RESULT_TRUE; +static int flock_with_event(HANDLE fd, HANDLE event, unsigned flags, + size_t offset, size_t bytes) { + TRACE("lock>>: fd %p, event %p, flags 0x%x offset %zu, bytes %zu >>", fd, + event, flags, offset, bytes); + OVERLAPPED ov; + ov.Internal = 0; + ov.InternalHigh = 0; + ov.hEvent = event; + ov.Offset = (DWORD)offset; + ov.OffsetHigh = HIGH_DWORD(offset); + if (LockFileEx(fd, flags, 0, (DWORD)bytes, HIGH_DWORD(bytes), &ov)) { + TRACE("lock<<: fd %p, event %p, flags 0x%x offset %zu, bytes %zu << %s", fd, + event, flags, offset, bytes, "done"); + return MDBX_SUCCESS; + } - for (size_t i = 0; i < mc->mc_snum; ++i) { - if (mc->mc_ki[i]) - return MDBX_RESULT_FALSE; + DWORD rc = GetLastError(); + if (rc == ERROR_IO_PENDING) { + if (event) { + if (GetOverlappedResult(fd, &ov, &rc, true)) { + TRACE("lock<<: fd %p, event %p, flags 0x%x offset %zu, bytes %zu << %s", + fd, event, flags, offset, bytes, "overlapped-done"); + return MDBX_SUCCESS; + } + rc = GetLastError(); + } else + CancelIo(fd); } + TRACE("lock<<: fd %p, event %p, flags 0x%x offset %zu, bytes %zu << err %d", + fd, event, flags, offset, bytes, (int)rc); + return (int)rc; +} - return MDBX_RESULT_TRUE; +static inline int flock(HANDLE fd, unsigned flags, size_t offset, + size_t bytes) { + return flock_with_event(fd, 0, flags, offset, bytes); } -int mdbx_cursor_on_first_dup(const MDBX_cursor *mc) { - if (unlikely(mc == NULL)) - return MDBX_EINVAL; +static inline int flock_data(const MDBX_env *env, unsigned flags, size_t offset, + size_t bytes) { + const HANDLE fd4data = + env->ioring.overlapped_fd ? 
env->ioring.overlapped_fd : env->lazy_fd; + return flock_with_event(fd4data, env->dxb_lock_event, flags, offset, bytes); +} - if (unlikely(mc->mc_signature != MDBX_MC_LIVE)) - return (mc->mc_signature == MDBX_MC_READY4CLOSE) ? MDBX_EINVAL - : MDBX_EBADSIGN; +static int funlock(mdbx_filehandle_t fd, size_t offset, size_t bytes) { + TRACE("unlock: fd %p, offset %zu, bytes %zu", fd, offset, bytes); + return UnlockFile(fd, (DWORD)offset, HIGH_DWORD(offset), (DWORD)bytes, + HIGH_DWORD(bytes)) + ? MDBX_SUCCESS + : (int)GetLastError(); +} - if (!(mc->mc_flags & C_INITIALIZED)) - return MDBX_RESULT_TRUE; +/*----------------------------------------------------------------------------*/ +/* global `write` lock for write-txt processing, + * exclusive locking both meta-pages) */ - if (!mc->mc_xcursor) - return MDBX_RESULT_TRUE; +#ifdef _WIN64 +#define DXB_MAXLEN UINT64_C(0x7fffFFFFfff00000) +#else +#define DXB_MAXLEN UINT32_C(0x7ff00000) +#endif +#define DXB_BODY (env->ps * (size_t)NUM_METAS), DXB_MAXLEN +#define DXB_WHOLE 0, DXB_MAXLEN - mc = &mc->mc_xcursor->mx_cursor; - for (size_t i = 0; i < mc->mc_snum; ++i) { - if (mc->mc_ki[i]) - return MDBX_RESULT_FALSE; +int lck_txn_lock(MDBX_env *env, bool dontwait) { + if (dontwait) { + if (!TryEnterCriticalSection(&env->windowsbug_lock)) + return MDBX_BUSY; + } else { + __try { + EnterCriticalSection(&env->windowsbug_lock); + } + __except ((GetExceptionCode() == + 0xC0000194 /* STATUS_POSSIBLE_DEADLOCK / EXCEPTION_POSSIBLE_DEADLOCK */) + ? EXCEPTION_EXECUTE_HANDLER + : EXCEPTION_CONTINUE_SEARCH) { + return MDBX_EDEADLK; + } } - return MDBX_RESULT_TRUE; -} - -int mdbx_cursor_on_last(const MDBX_cursor *mc) { - if (unlikely(mc == NULL)) - return MDBX_EINVAL; + eASSERT(env, !env->basal_txn->owner); + if (env->flags & MDBX_EXCLUSIVE) + goto done; - if (unlikely(mc->mc_signature != MDBX_MC_LIVE)) - return (mc->mc_signature == MDBX_MC_READY4CLOSE) ? MDBX_EINVAL - : MDBX_EBADSIGN; + const HANDLE fd4data = + env->ioring.overlapped_fd ? 
env->ioring.overlapped_fd : env->lazy_fd; + int rc = flock_with_event(fd4data, env->dxb_lock_event, + dontwait ? (LCK_EXCLUSIVE | LCK_DONTWAIT) + : (LCK_EXCLUSIVE | LCK_WAITFOR), + DXB_BODY); + if (rc == ERROR_LOCK_VIOLATION && dontwait) { + SleepEx(0, true); + rc = flock_with_event(fd4data, env->dxb_lock_event, + LCK_EXCLUSIVE | LCK_DONTWAIT, DXB_BODY); + if (rc == ERROR_LOCK_VIOLATION) { + SleepEx(0, true); + rc = flock_with_event(fd4data, env->dxb_lock_event, + LCK_EXCLUSIVE | LCK_DONTWAIT, DXB_BODY); + } + } + if (rc == MDBX_SUCCESS) { + done: + /* Zap: Failing to release lock 'env->windowsbug_lock' + * in function 'mdbx_txn_lock' */ + MDBX_SUPPRESS_GOOFY_MSVC_ANALYZER(26115); + env->basal_txn->owner = osal_thread_self(); + return MDBX_SUCCESS; + } - if (!(mc->mc_flags & C_INITIALIZED)) - return mc->mc_db->md_entries ? MDBX_RESULT_FALSE : MDBX_RESULT_TRUE; + LeaveCriticalSection(&env->windowsbug_lock); + return (!dontwait || rc != ERROR_LOCK_VIOLATION) ? rc : MDBX_BUSY; +} - for (size_t i = 0; i < mc->mc_snum; ++i) { - size_t nkeys = page_numkeys(mc->mc_pg[i]); - if (mc->mc_ki[i] < nkeys - 1) - return MDBX_RESULT_FALSE; +void lck_txn_unlock(MDBX_env *env) { + eASSERT(env, env->basal_txn->owner == osal_thread_self()); + if ((env->flags & MDBX_EXCLUSIVE) == 0) { + const HANDLE fd4data = + env->ioring.overlapped_fd ? 
env->ioring.overlapped_fd : env->lazy_fd; + int err = funlock(fd4data, DXB_BODY); + if (err != MDBX_SUCCESS) + mdbx_panic("%s failed: err %u", __func__, err); } - - return MDBX_RESULT_TRUE; + env->basal_txn->owner = 0; + LeaveCriticalSection(&env->windowsbug_lock); } -int mdbx_cursor_on_last_dup(const MDBX_cursor *mc) { - if (unlikely(mc == NULL)) - return MDBX_EINVAL; +/*----------------------------------------------------------------------------*/ +/* global `read` lock for readers registration, + * exclusive locking `rdt_length` (second) cacheline */ - if (unlikely(mc->mc_signature != MDBX_MC_LIVE)) - return (mc->mc_signature == MDBX_MC_READY4CLOSE) ? MDBX_EINVAL - : MDBX_EBADSIGN; +#define LCK_LO_OFFSET 0 +#define LCK_LO_LEN offsetof(lck_t, rdt_length) +#define LCK_UP_OFFSET LCK_LO_LEN +#define LCK_UP_LEN (sizeof(lck_t) - LCK_UP_OFFSET) +#define LCK_LOWER LCK_LO_OFFSET, LCK_LO_LEN +#define LCK_UPPER LCK_UP_OFFSET, LCK_UP_LEN - if (!(mc->mc_flags & C_INITIALIZED)) - return MDBX_RESULT_FALSE; +MDBX_INTERNAL int lck_rdt_lock(MDBX_env *env) { + imports.srwl_AcquireShared(&env->remap_guard); + if (env->lck_mmap.fd == INVALID_HANDLE_VALUE) + return MDBX_SUCCESS; /* readonly database in readonly filesystem */ - if (!mc->mc_xcursor) - return MDBX_RESULT_TRUE; + /* transition from S-? (used) to S-E (locked), + * e.g. 
exclusive lock upper-part */ + if (env->flags & MDBX_EXCLUSIVE) + return MDBX_SUCCESS; - mc = &mc->mc_xcursor->mx_cursor; - for (size_t i = 0; i < mc->mc_snum; ++i) { - size_t nkeys = page_numkeys(mc->mc_pg[i]); - if (mc->mc_ki[i] < nkeys - 1) - return MDBX_RESULT_FALSE; - } + int rc = flock(env->lck_mmap.fd, LCK_EXCLUSIVE | LCK_WAITFOR, LCK_UPPER); + if (rc == MDBX_SUCCESS) + return MDBX_SUCCESS; - return MDBX_RESULT_TRUE; + imports.srwl_ReleaseShared(&env->remap_guard); + return rc; } -int mdbx_cursor_eof(const MDBX_cursor *mc) { - if (unlikely(mc == NULL)) - return MDBX_EINVAL; - - if (unlikely(mc->mc_signature != MDBX_MC_LIVE)) - return (mc->mc_signature == MDBX_MC_READY4CLOSE) ? MDBX_EINVAL - : MDBX_EBADSIGN; - - return ((mc->mc_flags & (C_INITIALIZED | C_EOF)) == C_INITIALIZED && - mc->mc_snum && - mc->mc_ki[mc->mc_top] < page_numkeys(mc->mc_pg[mc->mc_top])) - ? MDBX_RESULT_FALSE - : MDBX_RESULT_TRUE; +MDBX_INTERNAL void lck_rdt_unlock(MDBX_env *env) { + if (env->lck_mmap.fd != INVALID_HANDLE_VALUE && + (env->flags & MDBX_EXCLUSIVE) == 0) { + /* transition from S-E (locked) to S-? (used), e.g. unlock upper-part */ + int err = funlock(env->lck_mmap.fd, LCK_UPPER); + if (err != MDBX_SUCCESS) + mdbx_panic("%s failed: err %u", __func__, err); + } + imports.srwl_ReleaseShared(&env->remap_guard); } -//------------------------------------------------------------------------------ - -struct diff_result { - ptrdiff_t diff; - size_t level; - ptrdiff_t root_nkeys; -}; - -/* calculates: r = x - y */ -__hot static int cursor_diff(const MDBX_cursor *const __restrict x, - const MDBX_cursor *const __restrict y, - struct diff_result *const __restrict r) { - r->diff = 0; - r->level = 0; - r->root_nkeys = 0; - - if (unlikely(x->mc_signature != MDBX_MC_LIVE)) - return (x->mc_signature == MDBX_MC_READY4CLOSE) ? MDBX_EINVAL - : MDBX_EBADSIGN; +MDBX_INTERNAL int osal_lockfile(mdbx_filehandle_t fd, bool wait) { + return flock( + fd, wait ? 
LCK_EXCLUSIVE | LCK_WAITFOR : LCK_EXCLUSIVE | LCK_DONTWAIT, 0, + DXB_MAXLEN); +} - if (unlikely(y->mc_signature != MDBX_MC_LIVE)) - return (y->mc_signature == MDBX_MC_READY4CLOSE) ? MDBX_EINVAL - : MDBX_EBADSIGN; +static int suspend_and_append(mdbx_handle_array_t **array, + const DWORD ThreadId) { + const unsigned limit = (*array)->limit; + if ((*array)->count == limit) { + mdbx_handle_array_t *const ptr = + osal_realloc((limit > ARRAY_LENGTH((*array)->handles)) + ? *array + : /* don't free initial array on the stack */ nullptr, + sizeof(mdbx_handle_array_t) + + sizeof(HANDLE) * (limit * (size_t)2 - + ARRAY_LENGTH((*array)->handles))); + if (!ptr) + return MDBX_ENOMEM; + if (limit == ARRAY_LENGTH((*array)->handles)) + *ptr = **array; + *array = ptr; + (*array)->limit = limit * 2; + } - int rc = check_txn(x->mc_txn, MDBX_TXN_BLOCKED); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; + HANDLE hThread = OpenThread(THREAD_SUSPEND_RESUME | THREAD_QUERY_INFORMATION, + FALSE, ThreadId); + if (hThread == nullptr) + return (int)GetLastError(); - if (unlikely(x->mc_txn != y->mc_txn)) - return MDBX_BAD_TXN; + if (SuspendThread(hThread) == (DWORD)-1) { + int err = (int)GetLastError(); + DWORD ExitCode; + if (err == /* workaround for Win10 UCRT bug */ ERROR_ACCESS_DENIED || + !GetExitCodeThread(hThread, &ExitCode) || ExitCode != STILL_ACTIVE) + err = MDBX_SUCCESS; + CloseHandle(hThread); + return err; + } - if (unlikely(y->mc_dbi != x->mc_dbi)) - return MDBX_EINVAL; + (*array)->handles[(*array)->count++] = hThread; + return MDBX_SUCCESS; +} - if (unlikely(!(y->mc_flags & x->mc_flags & C_INITIALIZED))) - return MDBX_ENODATA; +MDBX_INTERNAL int +osal_suspend_threads_before_remap(MDBX_env *env, mdbx_handle_array_t **array) { + eASSERT(env, (env->flags & MDBX_NOSTICKYTHREADS) == 0); + const uintptr_t CurrentTid = GetCurrentThreadId(); + int rc; + if (env->lck_mmap.lck) { + /* Scan LCK for threads of the current process */ + const reader_slot_t *const begin = env->lck_mmap.lck->rdt; 
+ const reader_slot_t *const end = + begin + + atomic_load32(&env->lck_mmap.lck->rdt_length, mo_AcquireRelease); + const uintptr_t WriteTxnOwner = env->basal_txn ? env->basal_txn->owner : 0; + for (const reader_slot_t *reader = begin; reader < end; ++reader) { + if (reader->pid.weak != env->pid || !reader->tid.weak) { + skip_lck: + continue; + } + if (reader->tid.weak == CurrentTid || reader->tid.weak == WriteTxnOwner) + goto skip_lck; - while (likely(r->level < y->mc_snum && r->level < x->mc_snum)) { - if (unlikely(y->mc_pg[r->level] != x->mc_pg[r->level])) { - ERROR("Mismatch cursors's pages at %zu level", r->level); - return MDBX_PROBLEM; + rc = suspend_and_append(array, (mdbx_tid_t)reader->tid.weak); + if (rc != MDBX_SUCCESS) { + bailout_lck: + (void)osal_resume_threads_after_remap(*array); + return rc; + } + } + if (WriteTxnOwner && WriteTxnOwner != CurrentTid) { + rc = suspend_and_append(array, (mdbx_tid_t)WriteTxnOwner); + if (rc != MDBX_SUCCESS) + goto bailout_lck; } + } else { + /* Without LCK (i.e. read-only mode). + * Walk through a snapshot of all running threads */ + eASSERT(env, env->flags & (MDBX_EXCLUSIVE | MDBX_RDONLY)); + const HANDLE hSnapshot = CreateToolhelp32Snapshot(TH32CS_SNAPTHREAD, 0); + if (hSnapshot == INVALID_HANDLE_VALUE) + return (int)GetLastError(); - intptr_t nkeys = page_numkeys(y->mc_pg[r->level]); - assert(nkeys > 0); - if (r->level == 0) - r->root_nkeys = nkeys; + THREADENTRY32 entry; + entry.dwSize = sizeof(THREADENTRY32); - const intptr_t limit_ki = nkeys - 1; - const intptr_t x_ki = x->mc_ki[r->level]; - const intptr_t y_ki = y->mc_ki[r->level]; - r->diff = ((x_ki < limit_ki) ? x_ki : limit_ki) - - ((y_ki < limit_ki) ? 
y_ki : limit_ki); - if (r->diff == 0) { - r->level += 1; - continue; + if (!Thread32First(hSnapshot, &entry)) { + rc = (int)GetLastError(); + bailout_toolhelp: + CloseHandle(hSnapshot); + (void)osal_resume_threads_after_remap(*array); + return rc; } - while (unlikely(r->diff == 1) && - likely(r->level + 1 < y->mc_snum && r->level + 1 < x->mc_snum)) { - r->level += 1; - /* DB'PAGEs: 0------------------>MAX - * - * CURSORs: y < x - * STACK[i ]: | - * STACK[+1]: ...y++N|0++x... - */ - nkeys = page_numkeys(y->mc_pg[r->level]); - r->diff = (nkeys - y->mc_ki[r->level]) + x->mc_ki[r->level]; - assert(r->diff > 0); - } + do { + if (entry.th32OwnerProcessID != env->pid || + entry.th32ThreadID == CurrentTid) + continue; - while (unlikely(r->diff == -1) && - likely(r->level + 1 < y->mc_snum && r->level + 1 < x->mc_snum)) { - r->level += 1; - /* DB'PAGEs: 0------------------>MAX - * - * CURSORs: x < y - * STACK[i ]: | - * STACK[+1]: ...x--N|0--y... - */ - nkeys = page_numkeys(x->mc_pg[r->level]); - r->diff = -(nkeys - x->mc_ki[r->level]) - y->mc_ki[r->level]; - assert(r->diff < 0); - } + rc = suspend_and_append(array, entry.th32ThreadID); + if (rc != MDBX_SUCCESS) + goto bailout_toolhelp; - return MDBX_SUCCESS; + } while (Thread32Next(hSnapshot, &entry)); + + rc = (int)GetLastError(); + if (rc != ERROR_NO_MORE_FILES) + goto bailout_toolhelp; + CloseHandle(hSnapshot); } - r->diff = CMP2INT(x->mc_flags & C_EOF, y->mc_flags & C_EOF); return MDBX_SUCCESS; } -__hot static ptrdiff_t estimate(const MDBX_db *db, - struct diff_result *const __restrict dr) { - /* root: branch-page => scale = leaf-factor * branch-factor^(N-1) - * level-1: branch-page(s) => scale = leaf-factor * branch-factor^2 - * level-2: branch-page(s) => scale = leaf-factor * branch-factor - * level-N: branch-page(s) => scale = leaf-factor - * leaf-level: leaf-page(s) => scale = 1 - */ - ptrdiff_t btree_power = (ptrdiff_t)db->md_depth - 2 - (ptrdiff_t)dr->level; - if (btree_power < 0) - return dr->diff; - - ptrdiff_t 
estimated = - (ptrdiff_t)db->md_entries * dr->diff / (ptrdiff_t)db->md_leaf_pages; - if (btree_power == 0) - return estimated; - - if (db->md_depth < 4) { - assert(dr->level == 0 && btree_power == 1); - return (ptrdiff_t)db->md_entries * dr->diff / (ptrdiff_t)dr->root_nkeys; - } - - /* average_branchpage_fillfactor = total(branch_entries) / branch_pages - total(branch_entries) = leaf_pages + branch_pages - 1 (root page) */ - const size_t log2_fixedpoint = sizeof(size_t) - 1; - const size_t half = UINT64_C(1) << (log2_fixedpoint - 1); - const size_t factor = - ((db->md_leaf_pages + db->md_branch_pages - 1) << log2_fixedpoint) / - db->md_branch_pages; - while (1) { - switch ((size_t)btree_power) { - default: { - const size_t square = (factor * factor + half) >> log2_fixedpoint; - const size_t quad = (square * square + half) >> log2_fixedpoint; - do { - estimated = estimated * quad + half; - estimated >>= log2_fixedpoint; - btree_power -= 4; - } while (btree_power >= 4); - continue; - } - case 3: - estimated = estimated * factor + half; - estimated >>= log2_fixedpoint; - __fallthrough /* fall through */; - case 2: - estimated = estimated * factor + half; - estimated >>= log2_fixedpoint; - __fallthrough /* fall through */; - case 1: - estimated = estimated * factor + half; - estimated >>= log2_fixedpoint; - __fallthrough /* fall through */; - case 0: - if (unlikely(estimated > (ptrdiff_t)db->md_entries)) - return (ptrdiff_t)db->md_entries; - if (unlikely(estimated < -(ptrdiff_t)db->md_entries)) - return -(ptrdiff_t)db->md_entries; - return estimated; +MDBX_INTERNAL int osal_resume_threads_after_remap(mdbx_handle_array_t *array) { + int rc = MDBX_SUCCESS; + for (unsigned i = 0; i < array->count; ++i) { + const HANDLE hThread = array->handles[i]; + if (ResumeThread(hThread) == (DWORD)-1) { + const int err = (int)GetLastError(); + DWORD ExitCode; + if (err != /* workaround for Win10 UCRT bug */ ERROR_ACCESS_DENIED && + GetExitCodeThread(hThread, &ExitCode) && ExitCode == 
STILL_ACTIVE) + rc = err; } + CloseHandle(hThread); } + return rc; } -int mdbx_estimate_distance(const MDBX_cursor *first, const MDBX_cursor *last, - ptrdiff_t *distance_items) { - if (unlikely(first == NULL || last == NULL || distance_items == NULL)) - return MDBX_EINVAL; +/*----------------------------------------------------------------------------*/ +/* global `initial` lock for lockfile initialization, + * exclusive/shared locking first cacheline */ - *distance_items = 0; - struct diff_result dr; - int rc = cursor_diff(last, first, &dr); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; +/* Briefly description of locking schema/algorithm: + * - Windows does not support upgrading or downgrading for file locking. + * - Therefore upgrading/downgrading is emulated by shared and exclusive + * locking of upper and lower halves. + * - In other words, we have FSM with possible 9 states, + * i.e. free/shared/exclusive x free/shared/exclusive == 9. + * Only 6 states of FSM are used, which 2 of ones are transitive. + * + * States: + * LO HI + * ?-? = free, i.e. unlocked + * S-? = used, i.e. shared lock + * E-? = exclusive-read, i.e. operational exclusive + * ?-S + * ?-E = middle (transitive state) + * S-S + * S-E = locked (transitive state) + * E-S + * E-E = exclusive-write, i.e. exclusive due (re)initialization + * + * The lck_seize() moves the locking-FSM from the initial free/unlocked + * state to the "exclusive write" (and returns MDBX_RESULT_TRUE) if possible, + * or to the "used" (and returns MDBX_RESULT_FALSE). + * + * The lck_downgrade() moves the locking-FSM from "exclusive write" + * state to the "used" (i.e. shared) state. + * + * The lck_upgrade() moves the locking-FSM from "used" (i.e. shared) + * state to the "exclusive write" state. 
+ */ - if (unlikely(dr.diff == 0) && - F_ISSET(first->mc_db->md_flags & last->mc_db->md_flags, - MDBX_DUPSORT | C_INITIALIZED)) { - first = &first->mc_xcursor->mx_cursor; - last = &last->mc_xcursor->mx_cursor; - rc = cursor_diff(first, last, &dr); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; +static void lck_unlock(MDBX_env *env) { + int err; + + if (env->lck_mmap.fd != INVALID_HANDLE_VALUE) { + /* double `unlock` for robustly remove overlapped shared/exclusive locks */ + do + err = funlock(env->lck_mmap.fd, LCK_LOWER); + while (err == MDBX_SUCCESS); + assert(err == ERROR_NOT_LOCKED || + (globals.running_under_Wine && err == ERROR_LOCK_VIOLATION)); + SetLastError(ERROR_SUCCESS); + + do + err = funlock(env->lck_mmap.fd, LCK_UPPER); + while (err == MDBX_SUCCESS); + assert(err == ERROR_NOT_LOCKED || + (globals.running_under_Wine && err == ERROR_LOCK_VIOLATION)); + SetLastError(ERROR_SUCCESS); } - if (likely(dr.diff != 0)) - *distance_items = estimate(first->mc_db, &dr); + const HANDLE fd4data = + env->ioring.overlapped_fd ? 
env->ioring.overlapped_fd : env->lazy_fd; + if (fd4data != INVALID_HANDLE_VALUE) { + /* explicitly unlock to avoid latency for other processes (windows kernel + * releases such locks via deferred queues) */ + do + err = funlock(fd4data, DXB_BODY); + while (err == MDBX_SUCCESS); + assert(err == ERROR_NOT_LOCKED || + (globals.running_under_Wine && err == ERROR_LOCK_VIOLATION)); + SetLastError(ERROR_SUCCESS); - return MDBX_SUCCESS; + do + err = funlock(fd4data, DXB_WHOLE); + while (err == MDBX_SUCCESS); + assert(err == ERROR_NOT_LOCKED || + (globals.running_under_Wine && err == ERROR_LOCK_VIOLATION)); + SetLastError(ERROR_SUCCESS); + } } -int mdbx_estimate_move(const MDBX_cursor *cursor, MDBX_val *key, MDBX_val *data, - MDBX_cursor_op move_op, ptrdiff_t *distance_items) { - if (unlikely(cursor == NULL || distance_items == NULL || - move_op == MDBX_GET_CURRENT || move_op == MDBX_GET_MULTIPLE)) - return MDBX_EINVAL; - - if (unlikely(cursor->mc_signature != MDBX_MC_LIVE)) - return (cursor->mc_signature == MDBX_MC_READY4CLOSE) ? MDBX_EINVAL - : MDBX_EBADSIGN; +/* Seize state as 'exclusive-write' (E-E and returns MDBX_RESULT_TRUE) + * or as 'used' (S-? and returns MDBX_RESULT_FALSE). + * Otherwise returns an error. */ +static int internal_seize_lck(HANDLE lfd) { + assert(lfd != INVALID_HANDLE_VALUE); - int rc = check_txn(cursor->mc_txn, MDBX_TXN_BLOCKED); - if (unlikely(rc != MDBX_SUCCESS)) + /* 1) now on ?-? 
(free), get ?-E (middle) */ + jitter4testing(false); + int rc = flock(lfd, LCK_EXCLUSIVE | LCK_WAITFOR, LCK_UPPER); + if (rc != MDBX_SUCCESS) { + /* 2) something went wrong, give up */; + ERROR("%s, err %u", "?-?(free) >> ?-E(middle)", rc); return rc; + } - if (!(cursor->mc_flags & C_INITIALIZED)) - return MDBX_ENODATA; + /* 3) now on ?-E (middle), try E-E (exclusive-write) */ + jitter4testing(false); + rc = flock(lfd, LCK_EXCLUSIVE | LCK_DONTWAIT, LCK_LOWER); + if (rc == MDBX_SUCCESS) + return MDBX_RESULT_TRUE /* 4) got E-E (exclusive-write), done */; - MDBX_cursor_couple next; - cursor_copy(cursor, &next.outer); - if (cursor->mc_db->md_flags & MDBX_DUPSORT) { - next.outer.mc_xcursor = &next.inner; - rc = cursor_xinit0(&next.outer); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; - MDBX_xcursor *mx = &container_of(cursor, MDBX_cursor_couple, outer)->inner; - cursor_copy(&mx->mx_cursor, &next.inner.mx_cursor); + /* 5) still on ?-E (middle) */ + jitter4testing(false); + if (rc != ERROR_SHARING_VIOLATION && rc != ERROR_LOCK_VIOLATION) { + /* 6) something went wrong, give up */ + rc = funlock(lfd, LCK_UPPER); + if (rc != MDBX_SUCCESS) + mdbx_panic("%s(%s) failed: err %u", __func__, "?-E(middle) >> ?-?(free)", + rc); + return rc; } - MDBX_val stub = {0, 0}; - if (data == NULL) { - const unsigned mask = - 1 << MDBX_GET_BOTH | 1 << MDBX_GET_BOTH_RANGE | 1 << MDBX_SET_KEY; - if (unlikely(mask & (1 << move_op))) - return MDBX_EINVAL; - data = &stub; - } + /* 7) still on ?-E (middle), try S-E (locked) */ + jitter4testing(false); + rc = flock(lfd, LCK_SHARED | LCK_DONTWAIT, LCK_LOWER); - if (key == NULL) { - const unsigned mask = 1 << MDBX_GET_BOTH | 1 << MDBX_GET_BOTH_RANGE | - 1 << MDBX_SET_KEY | 1 << MDBX_SET | - 1 << MDBX_SET_RANGE; - if (unlikely(mask & (1 << move_op))) - return MDBX_EINVAL; - key = &stub; - } + jitter4testing(false); + if (rc != MDBX_SUCCESS) + ERROR("%s, err %u", "?-E(middle) >> S-E(locked)", rc); - next.outer.mc_signature = MDBX_MC_LIVE; - rc = 
cursor_get(&next.outer, key, data, move_op); - if (unlikely(rc != MDBX_SUCCESS && - (rc != MDBX_NOTFOUND || !(next.outer.mc_flags & C_INITIALIZED)))) - return rc; + /* 8) now on S-E (locked) or still on ?-E (middle), + * transition to S-? (used) or ?-? (free) */ + int err = funlock(lfd, LCK_UPPER); + if (err != MDBX_SUCCESS) + mdbx_panic("%s(%s) failed: err %u", __func__, + "X-E(locked/middle) >> X-?(used/free)", err); - return mdbx_estimate_distance(cursor, &next.outer, distance_items); + /* 9) now on S-? (used, DONE) or ?-? (free, FAILURE) */ + return rc; } -int mdbx_estimate_range(const MDBX_txn *txn, MDBX_dbi dbi, - const MDBX_val *begin_key, const MDBX_val *begin_data, - const MDBX_val *end_key, const MDBX_val *end_data, - ptrdiff_t *size_items) { - int rc = check_txn(txn, MDBX_TXN_BLOCKED); - if (unlikely(rc != MDBX_SUCCESS)) +MDBX_INTERNAL int lck_seize(MDBX_env *env) { + const HANDLE fd4data = + env->ioring.overlapped_fd ? env->ioring.overlapped_fd : env->lazy_fd; + assert(fd4data != INVALID_HANDLE_VALUE); + if (env->flags & MDBX_EXCLUSIVE) + return MDBX_RESULT_TRUE /* nope since files were must be opened + non-shareable */ + ; + + if (env->lck_mmap.fd == INVALID_HANDLE_VALUE) { + /* LY: without-lck mode (e.g. on read-only filesystem) */ + jitter4testing(false); + int rc = flock_data(env, LCK_SHARED | LCK_DONTWAIT, DXB_WHOLE); + if (rc != MDBX_SUCCESS) + ERROR("%s, err %u", "without-lck", rc); return rc; + } - if (unlikely(!size_items)) - return MDBX_EINVAL; + int rc = internal_seize_lck(env->lck_mmap.fd); + jitter4testing(false); + if (rc == MDBX_RESULT_TRUE && (env->flags & MDBX_RDONLY) == 0) { + /* Check that another process don't operates in without-lck mode. + * Doing such check by exclusive locking the body-part of db. Should be + * noted: + * - we need an exclusive lock for do so; + * - we can't lock meta-pages, otherwise other process could get an error + * while opening db in valid (non-conflict) mode. 
*/ + int err = flock_data(env, LCK_EXCLUSIVE | LCK_DONTWAIT, DXB_WHOLE); + if (err != MDBX_SUCCESS) { + ERROR("%s, err %u", "lock-against-without-lck", err); + jitter4testing(false); + lck_unlock(env); + return err; + } + jitter4testing(false); + err = funlock(fd4data, DXB_WHOLE); + if (err != MDBX_SUCCESS) + mdbx_panic("%s(%s) failed: err %u", __func__, + "unlock-against-without-lck", err); + } - if (unlikely(begin_data && (begin_key == NULL || begin_key == MDBX_EPSILON))) - return MDBX_EINVAL; + return rc; +} - if (unlikely(end_data && (end_key == NULL || end_key == MDBX_EPSILON))) - return MDBX_EINVAL; +MDBX_INTERNAL int lck_downgrade(MDBX_env *env) { + const HANDLE fd4data = + env->ioring.overlapped_fd ? env->ioring.overlapped_fd : env->lazy_fd; + /* Transite from exclusive-write state (E-E) to used (S-?) */ + assert(fd4data != INVALID_HANDLE_VALUE); + assert(env->lck_mmap.fd != INVALID_HANDLE_VALUE); - if (unlikely(begin_key == MDBX_EPSILON && end_key == MDBX_EPSILON)) - return MDBX_EINVAL; + if (env->flags & MDBX_EXCLUSIVE) + return MDBX_SUCCESS /* nope since files were must be opened non-shareable */ + ; + /* 1) now at E-E (exclusive-write), transition to ?_E (middle) */ + int rc = funlock(env->lck_mmap.fd, LCK_LOWER); + if (rc != MDBX_SUCCESS) + mdbx_panic("%s(%s) failed: err %u", __func__, + "E-E(exclusive-write) >> ?-E(middle)", rc); - MDBX_cursor_couple begin; - /* LY: first, initialize cursor to refresh a DB in case it have DB_STALE */ - rc = cursor_init(&begin.outer, txn, dbi); - if (unlikely(rc != MDBX_SUCCESS)) + /* 2) now at ?-E (middle), transition to S-E (locked) */ + rc = flock(env->lck_mmap.fd, LCK_SHARED | LCK_DONTWAIT, LCK_LOWER); + if (rc != MDBX_SUCCESS) { + /* 3) something went wrong, give up */; + ERROR("%s, err %u", "?-E(middle) >> S-E(locked)", rc); return rc; - - if (unlikely(begin.outer.mc_db->md_entries == 0)) { - *size_items = 0; - return MDBX_SUCCESS; } - MDBX_val stub; - if (!begin_key) { - if (unlikely(!end_key)) { - /* LY: 
FIRST..LAST case */ - *size_items = (ptrdiff_t)begin.outer.mc_db->md_entries; - return MDBX_SUCCESS; - } - rc = cursor_first(&begin.outer, &stub, &stub); - if (unlikely(end_key == MDBX_EPSILON)) { - /* LY: FIRST..+epsilon case */ - return (rc == MDBX_SUCCESS) - ? mdbx_cursor_count(&begin.outer, (size_t *)size_items) - : rc; - } - } else { - if (unlikely(begin_key == MDBX_EPSILON)) { - if (end_key == NULL) { - /* LY: -epsilon..LAST case */ - rc = cursor_last(&begin.outer, &stub, &stub); - return (rc == MDBX_SUCCESS) - ? mdbx_cursor_count(&begin.outer, (size_t *)size_items) - : rc; - } - /* LY: -epsilon..value case */ - assert(end_key != MDBX_EPSILON); - begin_key = end_key; - } else if (unlikely(end_key == MDBX_EPSILON)) { - /* LY: value..+epsilon case */ - assert(begin_key != MDBX_EPSILON); - end_key = begin_key; - } - if (end_key && !begin_data && !end_data && - (begin_key == end_key || - begin.outer.mc_dbx->md_cmp(begin_key, end_key) == 0)) { - /* LY: single key case */ - rc = cursor_set(&begin.outer, (MDBX_val *)begin_key, NULL, MDBX_SET).err; - if (unlikely(rc != MDBX_SUCCESS)) { - *size_items = 0; - return (rc == MDBX_NOTFOUND) ? MDBX_SUCCESS : rc; - } - *size_items = 1; - if (begin.outer.mc_xcursor != NULL) { - MDBX_node *node = page_node(begin.outer.mc_pg[begin.outer.mc_top], - begin.outer.mc_ki[begin.outer.mc_top]); - if (node_flags(node) & F_DUPDATA) { - /* LY: return the number of duplicates for given key */ - tASSERT(txn, begin.outer.mc_xcursor == &begin.inner && - (begin.inner.mx_cursor.mc_flags & C_INITIALIZED)); - *size_items = - (sizeof(*size_items) >= sizeof(begin.inner.mx_db.md_entries) || - begin.inner.mx_db.md_entries <= PTRDIFF_MAX) - ? 
(size_t)begin.inner.mx_db.md_entries - : PTRDIFF_MAX; - } - } - return MDBX_SUCCESS; - } else if (begin_data) { - stub = *begin_data; - rc = cursor_set(&begin.outer, (MDBX_val *)begin_key, &stub, - MDBX_GET_BOTH_RANGE) - .err; - } else { - stub = *begin_key; - rc = cursor_set(&begin.outer, &stub, nullptr, MDBX_SET_RANGE).err; - } - } + /* 4) got S-E (locked), continue transition to S-? (used) */ + rc = funlock(env->lck_mmap.fd, LCK_UPPER); + if (rc != MDBX_SUCCESS) + mdbx_panic("%s(%s) failed: err %u", __func__, "S-E(locked) >> S-?(used)", + rc); - if (unlikely(rc != MDBX_SUCCESS)) { - if (rc != MDBX_NOTFOUND || !(begin.outer.mc_flags & C_INITIALIZED)) - return rc; - } + return MDBX_SUCCESS /* 5) now at S-? (used), done */; +} - MDBX_cursor_couple end; - rc = cursor_init(&end.outer, txn, dbi); - if (unlikely(rc != MDBX_SUCCESS)) +MDBX_INTERNAL int lck_upgrade(MDBX_env *env, bool dont_wait) { + /* Transite from used state (S-?) to exclusive-write (E-E) */ + assert(env->lck_mmap.fd != INVALID_HANDLE_VALUE); + + if (env->flags & MDBX_EXCLUSIVE) + return MDBX_SUCCESS /* nope since files were must be opened non-shareable */ + ; + + /* 1) now on S-? (used), try S-E (locked) */ + jitter4testing(false); + int rc = flock(env->lck_mmap.fd, + dont_wait ? 
LCK_EXCLUSIVE | LCK_DONTWAIT : LCK_EXCLUSIVE, + LCK_UPPER); + if (rc != MDBX_SUCCESS) { + /* 2) something went wrong, give up */; + VERBOSE("%s, err %u", "S-?(used) >> S-E(locked)", rc); return rc; - if (!end_key) - rc = cursor_last(&end.outer, &stub, &stub); - else if (end_data) { - stub = *end_data; - rc = cursor_set(&end.outer, (MDBX_val *)end_key, &stub, MDBX_GET_BOTH_RANGE) - .err; - } else { - stub = *end_key; - rc = cursor_set(&end.outer, &stub, nullptr, MDBX_SET_RANGE).err; - } - if (unlikely(rc != MDBX_SUCCESS)) { - if (rc != MDBX_NOTFOUND || !(end.outer.mc_flags & C_INITIALIZED)) - return rc; } - rc = mdbx_estimate_distance(&begin.outer, &end.outer, size_items); - if (unlikely(rc != MDBX_SUCCESS)) + /* 3) now on S-E (locked), transition to ?-E (middle) */ + rc = funlock(env->lck_mmap.fd, LCK_LOWER); + if (rc != MDBX_SUCCESS) + mdbx_panic("%s(%s) failed: err %u", __func__, "S-E(locked) >> ?-E(middle)", + rc); + + /* 4) now on ?-E (middle), try E-E (exclusive-write) */ + jitter4testing(false); + rc = flock(env->lck_mmap.fd, + dont_wait ? LCK_EXCLUSIVE | LCK_DONTWAIT : LCK_EXCLUSIVE, + LCK_LOWER); + if (rc != MDBX_SUCCESS) { + /* 5) something went wrong, give up */; + VERBOSE("%s, err %u", "?-E(middle) >> E-E(exclusive-write)", rc); return rc; - assert(*size_items >= -(ptrdiff_t)begin.outer.mc_db->md_entries && - *size_items <= (ptrdiff_t)begin.outer.mc_db->md_entries); + } -#if 0 /* LY: Was decided to returns as-is (i.e. negative) the estimation \ - * results for an inverted ranges. 
*/ + return MDBX_SUCCESS /* 6) now at E-E (exclusive-write), done */; +} - /* Commit 8ddfd1f34ad7cf7a3c4aa75d2e248ca7e639ed63 - Change-Id: If59eccf7311123ab6384c4b93f9b1fed5a0a10d1 */ +MDBX_INTERNAL int lck_init(MDBX_env *env, MDBX_env *inprocess_neighbor, + int global_uniqueness_flag) { + (void)env; + (void)inprocess_neighbor; + (void)global_uniqueness_flag; + if (imports.SetFileIoOverlappedRange && !(env->flags & MDBX_RDONLY)) { + HANDLE token = INVALID_HANDLE_VALUE; + TOKEN_PRIVILEGES privileges; + privileges.PrivilegeCount = 1; + privileges.Privileges[0].Attributes = SE_PRIVILEGE_ENABLED; + if (!OpenProcessToken(GetCurrentProcess(), TOKEN_ADJUST_PRIVILEGES, + &token) || + !LookupPrivilegeValue(nullptr, SE_LOCK_MEMORY_NAME, + &privileges.Privileges[0].Luid) || + !AdjustTokenPrivileges(token, FALSE, &privileges, sizeof(privileges), + nullptr, nullptr) || + GetLastError() != ERROR_SUCCESS) + imports.SetFileIoOverlappedRange = nullptr; - if (*size_items < 0) { - /* LY: inverted range case */ - *size_items += (ptrdiff_t)begin.outer.mc_db->md_entries; - } else if (*size_items == 0 && begin_key && end_key) { - int cmp = begin.outer.mc_dbx->md_cmp(&origin_begin_key, &origin_end_key); - if (cmp == 0 && (begin.inner.mx_cursor.mc_flags & C_INITIALIZED) && - begin_data && end_data) - cmp = begin.outer.mc_dbx->md_dcmp(&origin_begin_data, &origin_end_data); - if (cmp > 0) { - /* LY: inverted range case with empty scope */ - *size_items = (ptrdiff_t)begin.outer.mc_db->md_entries; - } + if (token != INVALID_HANDLE_VALUE) + CloseHandle(token); } - assert(*size_items >= 0 && - *size_items <= (ptrdiff_t)begin.outer.mc_db->md_entries); -#endif + return MDBX_SUCCESS; +} +MDBX_INTERNAL int lck_destroy(MDBX_env *env, MDBX_env *inprocess_neighbor, + const uint32_t current_pid) { + (void)current_pid; + /* LY: should unmap before releasing the locks to avoid race condition and + * STATUS_USER_MAPPED_FILE/ERROR_USER_MAPPED_FILE */ + if (env->dxb_mmap.base) + osal_munmap(&env->dxb_mmap); 
+ if (env->lck_mmap.lck) { + const bool synced = env->lck_mmap.lck->unsynced_pages.weak == 0; + osal_munmap(&env->lck_mmap); + if (synced && !inprocess_neighbor && + env->lck_mmap.fd != INVALID_HANDLE_VALUE && + lck_upgrade(env, true) == MDBX_SUCCESS) + /* this will fail if LCK is used/mmapped by other process(es) */ + osal_ftruncate(env->lck_mmap.fd, 0); + } + lck_unlock(env); return MDBX_SUCCESS; } -//------------------------------------------------------------------------------ +/*----------------------------------------------------------------------------*/ +/* reader checking (by pid) */ -/* Позволяет обновить или удалить существующую запись с получением - * в old_data предыдущего значения данных. При этом если new_data равен - * нулю, то выполняется удаление, иначе обновление/вставка. - * - * Текущее значение может находиться в уже измененной (грязной) странице. - * В этом случае страница будет перезаписана при обновлении, а само старое - * значение утрачено. Поэтому исходно в old_data должен быть передан - * дополнительный буфер для копирования старого значения. - * Если переданный буфер слишком мал, то функция вернет -1, установив - * old_data->iov_len в соответствующее значение. - * - * Для не-уникальных ключей также возможен второй сценарий использования, - * когда посредством old_data из записей с одинаковым ключом для - * удаления/обновления выбирается конкретная. Для выбора этого сценария - * во flags следует одновременно указать MDBX_CURRENT и MDBX_NOOVERWRITE. - * Именно эта комбинация выбрана, так как она лишена смысла, и этим позволяет - * идентифицировать запрос такого сценария. +MDBX_INTERNAL int lck_rpid_set(MDBX_env *env) { + (void)env; + return MDBX_SUCCESS; +} + +MDBX_INTERNAL int lck_rpid_clear(MDBX_env *env) { + (void)env; + return MDBX_SUCCESS; +} + +/* Checks reader by pid. 
* - * Функция может быть замещена соответствующими операциями с курсорами - * после двух доработок (TODO): - * - внешняя аллокация курсоров, в том числе на стеке (без malloc). - * - получения dirty-статуса страницы по адресу (знать о MUTABLE/WRITEABLE). - */ + * Returns: + * MDBX_RESULT_TRUE, if pid is live (unable to acquire lock) + * MDBX_RESULT_FALSE, if pid is dead (lock acquired) + * or otherwise the errcode. */ +MDBX_INTERNAL int lck_rpid_check(MDBX_env *env, uint32_t pid) { + (void)env; + HANDLE hProcess = OpenProcess(SYNCHRONIZE, FALSE, pid); + int rc; + if (likely(hProcess)) { + rc = WaitForSingleObject(hProcess, 0); + if (unlikely(rc == (int)WAIT_FAILED)) + rc = (int)GetLastError(); + CloseHandle(hProcess); + } else { + rc = (int)GetLastError(); + } -int mdbx_replace_ex(MDBX_txn *txn, MDBX_dbi dbi, const MDBX_val *key, - MDBX_val *new_data, MDBX_val *old_data, - MDBX_put_flags_t flags, MDBX_preserve_func preserver, - void *preserver_context) { - int rc = check_txn_rw(txn, MDBX_TXN_BLOCKED); - if (unlikely(rc != MDBX_SUCCESS)) + switch (rc) { + case ERROR_INVALID_PARAMETER: + /* pid seems invalid */ + return MDBX_RESULT_FALSE; + case WAIT_OBJECT_0: + /* process just exited */ + return MDBX_RESULT_FALSE; + case ERROR_ACCESS_DENIED: + /* The ERROR_ACCESS_DENIED would be returned for CSRSS-processes, etc. 
+ * assume pid exists */ + return MDBX_RESULT_TRUE; + case WAIT_TIMEOUT: + /* pid running */ + return MDBX_RESULT_TRUE; + default: + /* failure */ return rc; + } +} - if (unlikely(!key || !old_data || old_data == new_data)) - return MDBX_EINVAL; +#endif /* Windows */ +/// \copyright SPDX-License-Identifier: Apache-2.0 +/// \author Леонид Юрьев aka Leonid Yuriev \date 2015-2024 - if (unlikely(old_data->iov_base == NULL && old_data->iov_len)) - return MDBX_EINVAL; - if (unlikely(new_data == NULL && - (flags & (MDBX_CURRENT | MDBX_RESERVE)) != MDBX_CURRENT)) - return MDBX_EINVAL; +__cold static int lck_setup_locked(MDBX_env *env) { + int err = rthc_register(env); + if (unlikely(err != MDBX_SUCCESS)) + return err; - if (unlikely(dbi <= FREE_DBI)) - return MDBX_BAD_DBI; + int lck_seize_rc = lck_seize(env); + if (unlikely(MDBX_IS_ERROR(lck_seize_rc))) + return lck_seize_rc; - if (unlikely(flags & - ~(MDBX_NOOVERWRITE | MDBX_NODUPDATA | MDBX_ALLDUPS | - MDBX_RESERVE | MDBX_APPEND | MDBX_APPENDDUP | MDBX_CURRENT))) - return MDBX_EINVAL; + if (env->lck_mmap.fd == INVALID_HANDLE_VALUE) { + env->lck = lckless_stub(env); + env->max_readers = UINT_MAX; + DEBUG("lck-setup:%s%s%s", " lck-less", + (env->flags & MDBX_RDONLY) ? " readonly" : "", + (lck_seize_rc == MDBX_RESULT_TRUE) ? " exclusive" : " cooperative"); + return lck_seize_rc; + } - MDBX_cursor_couple cx; - rc = cursor_init(&cx.outer, txn, dbi); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; - cx.outer.mc_next = txn->mt_cursors[dbi]; - txn->mt_cursors[dbi] = &cx.outer; + DEBUG("lck-setup:%s%s%s", " with-lck", + (env->flags & MDBX_RDONLY) ? " readonly" : "", + (lck_seize_rc == MDBX_RESULT_TRUE) ? 
" exclusive" : " cooperative"); - MDBX_val present_key = *key; - if (F_ISSET(flags, MDBX_CURRENT | MDBX_NOOVERWRITE)) { - /* в old_data значение для выбора конкретного дубликата */ - if (unlikely(!(txn->mt_dbs[dbi].md_flags & MDBX_DUPSORT))) { - rc = MDBX_EINVAL; - goto bailout; + MDBX_env *inprocess_neighbor = nullptr; + err = rthc_uniq_check(&env->lck_mmap, &inprocess_neighbor); + if (unlikely(MDBX_IS_ERROR(err))) + return err; + if (inprocess_neighbor) { + if ((globals.runtime_flags & MDBX_DBG_LEGACY_MULTIOPEN) == 0 || + (inprocess_neighbor->flags & MDBX_EXCLUSIVE) != 0) + return MDBX_BUSY; + if (lck_seize_rc == MDBX_RESULT_TRUE) { + err = lck_downgrade(env); + if (unlikely(err != MDBX_SUCCESS)) + return err; + lck_seize_rc = MDBX_RESULT_FALSE; } + } - /* убираем лишний бит, он был признаком запрошенного режима */ - flags -= MDBX_NOOVERWRITE; + uint64_t size = 0; + err = osal_filesize(env->lck_mmap.fd, &size); + if (unlikely(err != MDBX_SUCCESS)) + return err; - rc = cursor_set(&cx.outer, &present_key, old_data, MDBX_GET_BOTH).err; - if (rc != MDBX_SUCCESS) - goto bailout; + if (lck_seize_rc == MDBX_RESULT_TRUE) { + size = + ceil_powerof2(env->max_readers * sizeof(reader_slot_t) + sizeof(lck_t), + globals.sys_pagesize); + jitter4testing(false); } else { - /* в old_data буфер для сохранения предыдущего значения */ - if (unlikely(new_data && old_data->iov_base == new_data->iov_base)) - return MDBX_EINVAL; - MDBX_val present_data; - rc = cursor_set(&cx.outer, &present_key, &present_data, MDBX_SET_KEY).err; - if (unlikely(rc != MDBX_SUCCESS)) { - old_data->iov_base = NULL; - old_data->iov_len = 0; - if (rc != MDBX_NOTFOUND || (flags & MDBX_CURRENT)) - goto bailout; - } else if (flags & MDBX_NOOVERWRITE) { - rc = MDBX_KEYEXIST; - *old_data = present_data; - goto bailout; - } else { - MDBX_page *page = cx.outer.mc_pg[cx.outer.mc_top]; - if (txn->mt_dbs[dbi].md_flags & MDBX_DUPSORT) { - if (flags & MDBX_CURRENT) { - /* disallow update/delete for multi-values */ - 
MDBX_node *node = page_node(page, cx.outer.mc_ki[cx.outer.mc_top]); - if (node_flags(node) & F_DUPDATA) { - tASSERT(txn, XCURSOR_INITED(&cx.outer) && - cx.outer.mc_xcursor->mx_db.md_entries > 1); - if (cx.outer.mc_xcursor->mx_db.md_entries > 1) { - rc = MDBX_EMULTIVAL; - goto bailout; - } - } - /* В оригинальной LMDB флажок MDBX_CURRENT здесь приведет - * к замене данных без учета MDBX_DUPSORT сортировки, - * но здесь это в любом случае допустимо, так как мы - * проверили что для ключа есть только одно значение. */ - } - } - - if (IS_MODIFIABLE(txn, page)) { - if (new_data && cmp_lenfast(&present_data, new_data) == 0) { - /* если данные совпадают, то ничего делать не надо */ - *old_data = *new_data; - goto bailout; - } - rc = preserver ? preserver(preserver_context, old_data, - present_data.iov_base, present_data.iov_len) - : MDBX_SUCCESS; - if (unlikely(rc != MDBX_SUCCESS)) - goto bailout; - } else { - *old_data = present_data; - } - flags |= MDBX_CURRENT; + if (env->flags & MDBX_EXCLUSIVE) + return MDBX_BUSY; + if (size > INT_MAX || (size & (globals.sys_pagesize - 1)) != 0 || + size < globals.sys_pagesize) { + ERROR("lck-file has invalid size %" PRIu64 " bytes", size); + return MDBX_PROBLEM; } } - if (likely(new_data)) - rc = cursor_put_checklen(&cx.outer, key, new_data, flags); - else - rc = cursor_del(&cx.outer, flags & MDBX_ALLDUPS); - -bailout: - txn->mt_cursors[dbi] = cx.outer.mc_next; - return rc; -} - -static int default_value_preserver(void *context, MDBX_val *target, - const void *src, size_t bytes) { - (void)context; - if (unlikely(target->iov_len < bytes)) { - target->iov_base = nullptr; - target->iov_len = bytes; - return MDBX_RESULT_TRUE; + const size_t maxreaders = + ((size_t)size - sizeof(lck_t)) / sizeof(reader_slot_t); + if (maxreaders < 4) { + ERROR("lck-size too small (up to %" PRIuPTR " readers)", maxreaders); + return MDBX_PROBLEM; } - memcpy(target->iov_base, src, target->iov_len = bytes); - return MDBX_SUCCESS; -} + env->max_readers = 
(maxreaders <= MDBX_READERS_LIMIT) + ? (unsigned)maxreaders + : (unsigned)MDBX_READERS_LIMIT; -int mdbx_replace(MDBX_txn *txn, MDBX_dbi dbi, const MDBX_val *key, - MDBX_val *new_data, MDBX_val *old_data, - MDBX_put_flags_t flags) { - return mdbx_replace_ex(txn, dbi, key, new_data, old_data, flags, - default_value_preserver, nullptr); -} + err = osal_mmap((env->flags & MDBX_EXCLUSIVE) | MDBX_WRITEMAP, &env->lck_mmap, + (size_t)size, (size_t)size, + lck_seize_rc ? MMAP_OPTION_TRUNCATE | MMAP_OPTION_SEMAPHORE + : MMAP_OPTION_SEMAPHORE); + if (unlikely(err != MDBX_SUCCESS)) + return err; -/* Функция сообщает находится ли указанный адрес в "грязной" странице у - * заданной пишущей транзакции. В конечном счете это позволяет избавиться от - * лишнего копирования данных из НЕ-грязных страниц. - * - * "Грязные" страницы - это те, которые уже были изменены в ходе пишущей - * транзакции. Соответственно, какие-либо дальнейшие изменения могут привести - * к перезаписи таких страниц. Поэтому все функции, выполняющие изменения, в - * качестве аргументов НЕ должны получать указатели на данные в таких - * страницах. В свою очередь "НЕ грязные" страницы перед модификацией будут - * скопированы. - * - * Другими словами, данные из "грязных" страниц должны быть либо скопированы - * перед передачей в качестве аргументов для дальнейших модификаций, либо - * отвергнуты на стадии проверки корректности аргументов. - * - * Таким образом, функция позволяет как избавится от лишнего копирования, - * так и выполнить более полную проверку аргументов. - * - * ВАЖНО: Передаваемый указатель должен указывать на начало данных. Только - * так гарантируется что актуальный заголовок страницы будет физически - * расположен в той-же странице памяти, в том числе для многостраничных - * P_OVERFLOW страниц с длинными данными. 
*/ -int mdbx_is_dirty(const MDBX_txn *txn, const void *ptr) { - int rc = check_txn(txn, MDBX_TXN_BLOCKED); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; +#if MDBX_ENABLE_MADVISE +#ifdef MADV_DODUMP + err = madvise(env->lck_mmap.lck, size, MADV_DODUMP) ? ignore_enosys(errno) + : MDBX_SUCCESS; + if (unlikely(MDBX_IS_ERROR(err))) + return err; +#endif /* MADV_DODUMP */ - const MDBX_env *env = txn->mt_env; - const ptrdiff_t offset = ptr_dist(ptr, env->me_map); - if (offset >= 0) { - const pgno_t pgno = bytes2pgno(env, offset); - if (likely(pgno < txn->mt_next_pgno)) { - const MDBX_page *page = pgno2page(env, pgno); - if (unlikely(page->mp_pgno != pgno || - (page->mp_flags & P_ILL_BITS) != 0)) { - /* The ptr pointed into middle of a large page, - * not to the beginning of a data. */ - return MDBX_EINVAL; - } - return ((txn->mt_flags & MDBX_TXN_RDONLY) || !IS_MODIFIABLE(txn, page)) - ? MDBX_RESULT_FALSE - : MDBX_RESULT_TRUE; +#ifdef MADV_WILLNEED + err = madvise(env->lck_mmap.lck, size, MADV_WILLNEED) ? ignore_enosys(errno) + : MDBX_SUCCESS; + if (unlikely(MDBX_IS_ERROR(err))) + return err; +#elif defined(POSIX_MADV_WILLNEED) + err = ignore_enosys( + posix_madvise(env->lck_mmap.lck, size, POSIX_MADV_WILLNEED)); + if (unlikely(MDBX_IS_ERROR(err))) + return err; +#endif /* MADV_WILLNEED */ +#endif /* MDBX_ENABLE_MADVISE */ + + lck_t *lck = env->lck_mmap.lck; + if (lck_seize_rc == MDBX_RESULT_TRUE) { + /* If we succeed got exclusive lock, then nobody is using the lock region + * and we should initialize it. 
*/ + memset(lck, 0, (size_t)size); + jitter4testing(false); + lck->magic_and_version = MDBX_LOCK_MAGIC; + lck->os_and_format = MDBX_LOCK_FORMAT; +#if MDBX_ENABLE_PGOP_STAT + lck->pgops.wops.weak = 1; +#endif /* MDBX_ENABLE_PGOP_STAT */ + err = osal_msync(&env->lck_mmap, 0, (size_t)size, + MDBX_SYNC_DATA | MDBX_SYNC_SIZE); + if (unlikely(err != MDBX_SUCCESS)) { + ERROR("initial-%s for lck-file failed, err %d", "msync/fsync", err); + eASSERT(env, MDBX_IS_ERROR(err)); + return err; } - if ((size_t)offset < env->me_dxb_mmap.limit) { - /* Указатель адресует что-то в пределах mmap, но за границей - * распределенных страниц. Такое может случится если mdbx_is_dirty() - * вызывается после операции, в ходе которой грязная страница была - * возвращена в нераспределенное пространство. */ - return (txn->mt_flags & MDBX_TXN_RDONLY) ? MDBX_EINVAL : MDBX_RESULT_TRUE; + } else { + if (lck->magic_and_version != MDBX_LOCK_MAGIC) { + const bool invalid = (lck->magic_and_version >> 8) != MDBX_MAGIC; + ERROR("lock region has %s", + invalid + ? "invalid magic" + : "incompatible version (only applications with nearly or the " + "same versions of libmdbx can share the same database)"); + return invalid ? MDBX_INVALID : MDBX_VERSION_MISMATCH; + } + if (lck->os_and_format != MDBX_LOCK_FORMAT) { + ERROR("lock region has os/format signature 0x%" PRIx32 + ", expected 0x%" PRIx32, + lck->os_and_format, MDBX_LOCK_FORMAT); + return MDBX_VERSION_MISMATCH; } } - /* Страница вне используемого mmap-диапазона, т.е. либо в функцию был - * передан некорректный адрес, либо адрес в теневой странице, которая была - * выделена посредством malloc(). - * - * Для режима MDBX_WRITE_MAP режима страница однозначно "не грязная", - * а для режимов без MDBX_WRITE_MAP однозначно "не чистая". */ - return (txn->mt_flags & (MDBX_WRITEMAP | MDBX_TXN_RDONLY)) ? 
MDBX_EINVAL - : MDBX_RESULT_TRUE; -} - -int mdbx_dbi_sequence(MDBX_txn *txn, MDBX_dbi dbi, uint64_t *result, - uint64_t increment) { - int rc = check_txn(txn, MDBX_TXN_BLOCKED); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; - - rc = dbi_check(txn, dbi); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; - - if (unlikely(txn->mt_dbi_state[dbi] & DBI_STALE)) { - rc = fetch_sdb(txn, dbi); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; + err = lck_init(env, inprocess_neighbor, lck_seize_rc); + if (unlikely(err != MDBX_SUCCESS)) { + eASSERT(env, MDBX_IS_ERROR(err)); + return err; } - MDBX_db *dbs = &txn->mt_dbs[dbi]; - if (likely(result)) - *result = dbs->md_seq; + env->lck = lck; + eASSERT(env, !MDBX_IS_ERROR(lck_seize_rc)); + return lck_seize_rc; +} - if (likely(increment > 0)) { - if (unlikely(dbi == FREE_DBI || (txn->mt_flags & MDBX_TXN_RDONLY) != 0)) - return MDBX_EACCESS; +__cold int lck_setup(MDBX_env *env, mdbx_mode_t mode) { + eASSERT(env, env->lazy_fd != INVALID_HANDLE_VALUE); + eASSERT(env, env->lck_mmap.fd == INVALID_HANDLE_VALUE); - uint64_t new = dbs->md_seq + increment; - if (unlikely(new < increment)) - return MDBX_RESULT_TRUE; + int err = osal_openfile(MDBX_OPEN_LCK, env, env->pathname.lck, + &env->lck_mmap.fd, mode); + if (err != MDBX_SUCCESS) { + switch (err) { + default: + return err; + case MDBX_ENOFILE: + case MDBX_EACCESS: + case MDBX_EPERM: + if (!F_ISSET(env->flags, MDBX_RDONLY | MDBX_EXCLUSIVE)) + return err; + break; + case MDBX_EROFS: + if ((env->flags & MDBX_RDONLY) == 0) + return err; + break; + } + + if (err != MDBX_ENOFILE) { + /* ENSURE the file system is read-only */ + err = osal_check_fs_rdonly(env->lazy_fd, env->pathname.lck, err); + if (err != MDBX_SUCCESS && + /* ignore ERROR_NOT_SUPPORTED for exclusive mode */ + !(err == MDBX_ENOSYS && (env->flags & MDBX_EXCLUSIVE))) + return err; + } - tASSERT(txn, new > dbs->md_seq); - dbs->md_seq = new; - txn->mt_flags |= MDBX_TXN_DIRTY; - txn->mt_dbi_state[dbi] |= DBI_DIRTY; + /* LY: 
without-lck mode (e.g. exclusive or on read-only filesystem) */ + env->lck_mmap.fd = INVALID_HANDLE_VALUE; } - return MDBX_SUCCESS; + rthc_lock(); + err = lck_setup_locked(env); + rthc_unlock(); + return err; } -/*----------------------------------------------------------------------------*/ - -__cold intptr_t mdbx_limits_dbsize_min(intptr_t pagesize) { - if (pagesize < 1) - pagesize = (intptr_t)mdbx_default_pagesize(); - else if (unlikely(pagesize < (intptr_t)MIN_PAGESIZE || - pagesize > (intptr_t)MAX_PAGESIZE || - !is_powerof2((size_t)pagesize))) - return -1; - - return MIN_PAGENO * pagesize; +void mincore_clean_cache(const MDBX_env *const env) { + memset(env->lck->mincore_cache.begin, -1, + sizeof(env->lck->mincore_cache.begin)); } +/// \copyright SPDX-License-Identifier: Apache-2.0 +/// \author Леонид Юрьев aka Leonid Yuriev \date 2015-2024 -__cold intptr_t mdbx_limits_dbsize_max(intptr_t pagesize) { - if (pagesize < 1) - pagesize = (intptr_t)mdbx_default_pagesize(); - else if (unlikely(pagesize < (intptr_t)MIN_PAGESIZE || - pagesize > (intptr_t)MAX_PAGESIZE || - !is_powerof2((size_t)pagesize))) - return -1; - STATIC_ASSERT(MAX_MAPSIZE < INTPTR_MAX); - const uint64_t limit = (1 + (uint64_t)MAX_PAGENO) * pagesize; - return (limit < MAX_MAPSIZE) ? 
(intptr_t)limit : (intptr_t)MAX_MAPSIZE; +__cold void debug_log_va(int level, const char *function, int line, + const char *fmt, va_list args) { + ENSURE(nullptr, osal_fastmutex_acquire(&globals.debug_lock) == 0); + if (globals.logger.ptr) { + if (globals.logger_buffer == nullptr) + globals.logger.fmt(level, function, line, fmt, args); + else { + const int len = vsnprintf(globals.logger_buffer, + globals.logger_buffer_size, fmt, args); + if (len > 0) + globals.logger.nofmt(level, function, line, globals.logger_buffer, len); + } + } else { +#if defined(_WIN32) || defined(_WIN64) + if (IsDebuggerPresent()) { + int prefix_len = 0; + char *prefix = nullptr; + if (function && line > 0) + prefix_len = osal_asprintf(&prefix, "%s:%d ", function, line); + else if (function) + prefix_len = osal_asprintf(&prefix, "%s: ", function); + else if (line > 0) + prefix_len = osal_asprintf(&prefix, "%d: ", line); + if (prefix_len > 0 && prefix) { + OutputDebugStringA(prefix); + osal_free(prefix); + } + char *msg = nullptr; + int msg_len = osal_vasprintf(&msg, fmt, args); + if (msg_len > 0 && msg) { + OutputDebugStringA(msg); + osal_free(msg); + } + } +#else + if (function && line > 0) + fprintf(stderr, "%s:%d ", function, line); + else if (function) + fprintf(stderr, "%s: ", function); + else if (line > 0) + fprintf(stderr, "%d: ", line); + vfprintf(stderr, fmt, args); + fflush(stderr); +#endif + } + ENSURE(nullptr, osal_fastmutex_release(&globals.debug_lock) == 0); } -__cold intptr_t mdbx_limits_txnsize_max(intptr_t pagesize) { - if (pagesize < 1) - pagesize = (intptr_t)mdbx_default_pagesize(); - else if (unlikely(pagesize < (intptr_t)MIN_PAGESIZE || - pagesize > (intptr_t)MAX_PAGESIZE || - !is_powerof2((size_t)pagesize))) - return -1; - - STATIC_ASSERT(MAX_MAPSIZE < INTPTR_MAX); - const uint64_t pgl_limit = - pagesize * (uint64_t)(MDBX_PGL_LIMIT / MDBX_GOLD_RATIO_DBL); - const uint64_t map_limit = (uint64_t)(MAX_MAPSIZE / MDBX_GOLD_RATIO_DBL); - return (pgl_limit < map_limit) ? 
(intptr_t)pgl_limit : (intptr_t)map_limit; +__cold void debug_log(int level, const char *function, int line, + const char *fmt, ...) { + va_list args; + va_start(args, fmt); + debug_log_va(level, function, line, fmt, args); + va_end(args); } -/*** Key-making functions to avoid custom comparators *************************/ - -static __always_inline double key2double(const int64_t key) { - union { - uint64_t u; - double f; - } casting; +/* Dump a val in ascii or hexadecimal. */ +__cold const char *mdbx_dump_val(const MDBX_val *val, char *const buf, + const size_t bufsize) { + if (!val) + return ""; + if (!val->iov_len) + return ""; + if (!buf || bufsize < 4) + return nullptr; - casting.u = (key < 0) ? key + UINT64_C(0x8000000000000000) - : UINT64_C(0xffffFFFFffffFFFF) - key; - return casting.f; -} + if (!val->iov_base) { + int len = snprintf(buf, bufsize, "", val->iov_len); + assert(len > 0 && (size_t)len < bufsize); + (void)len; + return buf; + } -static __always_inline uint64_t double2key(const double *const ptr) { - STATIC_ASSERT(sizeof(double) == sizeof(int64_t)); - const int64_t i = *(const int64_t *)ptr; - const uint64_t u = (i < 0) ? UINT64_C(0xffffFFFFffffFFFF) - i - : i + UINT64_C(0x8000000000000000); - if (ASSERT_ENABLED()) { - const double f = key2double(u); - assert(memcmp(&f, ptr, 8) == 0); + bool is_ascii = true; + const uint8_t *const data = val->iov_base; + for (size_t i = 0; i < val->iov_len; i++) + if (data[i] < ' ' || data[i] > '~') { + is_ascii = false; + break; + } + + if (is_ascii) { + int len = + snprintf(buf, bufsize, "%.*s", + (val->iov_len > INT_MAX) ? 
INT_MAX : (int)val->iov_len, data); + assert(len > 0 && (size_t)len < bufsize); + (void)len; + } else { + char *const detent = buf + bufsize - 2; + char *ptr = buf; + *ptr++ = '<'; + for (size_t i = 0; i < val->iov_len && ptr < detent; i++) { + const char hex[16] = {'0', '1', '2', '3', '4', '5', '6', '7', + '8', '9', 'a', 'b', 'c', 'd', 'e', 'f'}; + *ptr++ = hex[data[i] >> 4]; + *ptr++ = hex[data[i] & 15]; + } + if (ptr < detent) + *ptr++ = '>'; + *ptr = '\0'; } - return u; + return buf; } -static __always_inline float key2float(const int32_t key) { - union { - uint32_t u; - float f; - } casting; +/*------------------------------------------------------------------------------ + LY: debug stuff */ - casting.u = - (key < 0) ? key + UINT32_C(0x80000000) : UINT32_C(0xffffFFFF) - key; - return casting.f; +__cold const char *pagetype_caption(const uint8_t type, char buf4unknown[16]) { + switch (type) { + case P_BRANCH: + return "branch"; + case P_LEAF: + return "leaf"; + case P_LEAF | P_SUBP: + return "subleaf"; + case P_LEAF | P_DUPFIX: + return "dupfix-leaf"; + case P_LEAF | P_DUPFIX | P_SUBP: + return "dupfix-subleaf"; + case P_LEAF | P_DUPFIX | P_SUBP | P_LEGACY_DIRTY: + return "dupfix-subleaf.legacy-dirty"; + case P_LARGE: + return "large"; + default: + snprintf(buf4unknown, 16, "unknown_0x%x", type); + return buf4unknown; + } } -static __always_inline uint32_t float2key(const float *const ptr) { - STATIC_ASSERT(sizeof(float) == sizeof(int32_t)); - const int32_t i = *(const int32_t *)ptr; - const uint32_t u = - (i < 0) ? UINT32_C(0xffffFFFF) - i : i + UINT32_C(0x80000000); - if (ASSERT_ENABLED()) { - const float f = key2float(u); - assert(memcmp(&f, ptr, 4) == 0); - } - return u; +__cold static const char *leafnode_type(node_t *n) { + static const char *const tp[2][2] = {{"", ": DB"}, + {": sub-page", ": sub-DB"}}; + return (node_flags(n) & N_BIGDATA) + ? 
": large page" + : tp[!!(node_flags(n) & N_DUPDATA)][!!(node_flags(n) & N_SUBDATA)]; } -uint64_t mdbx_key_from_double(const double ieee754_64bit) { - return double2key(&ieee754_64bit); +/* Display all the keys in the page. */ +__cold void page_list(page_t *mp) { + pgno_t pgno = mp->pgno; + const char *type; + node_t *node; + size_t i, nkeys, nsize, total = 0; + MDBX_val key; + DKBUF; + + switch (page_type(mp)) { + case P_BRANCH: + type = "Branch page"; + break; + case P_LEAF: + type = "Leaf page"; + break; + case P_LEAF | P_SUBP: + type = "Leaf sub-page"; + break; + case P_LEAF | P_DUPFIX: + type = "Leaf2 page"; + break; + case P_LEAF | P_DUPFIX | P_SUBP: + type = "Leaf2 sub-page"; + break; + case P_LARGE: + VERBOSE("Overflow page %" PRIaPGNO " pages %u\n", pgno, mp->pages); + return; + case P_META: + VERBOSE("Meta-page %" PRIaPGNO " txnid %" PRIu64 "\n", pgno, + unaligned_peek_u64(4, page_meta(mp)->txnid_a)); + return; + default: + VERBOSE("Bad page %" PRIaPGNO " flags 0x%X\n", pgno, mp->flags); + return; + } + + nkeys = page_numkeys(mp); + VERBOSE("%s %" PRIaPGNO " numkeys %zu\n", type, pgno, nkeys); + + for (i = 0; i < nkeys; i++) { + if (is_dupfix_leaf( + mp)) { /* DUPFIX pages have no entries[] or node headers */ + key = page_dupfix_key(mp, i, nsize = mp->dupfix_ksize); + total += nsize; + VERBOSE("key %zu: nsize %zu, %s\n", i, nsize, DKEY(&key)); + continue; + } + node = page_node(mp, i); + key.iov_len = node_ks(node); + key.iov_base = node->payload; + nsize = NODESIZE + key.iov_len; + if (is_branch(mp)) { + VERBOSE("key %zu: page %" PRIaPGNO ", %s\n", i, node_pgno(node), + DKEY(&key)); + total += nsize; + } else { + if (node_flags(node) & N_BIGDATA) + nsize += sizeof(pgno_t); + else + nsize += node_ds(node); + total += nsize; + nsize += sizeof(indx_t); + VERBOSE("key %zu: nsize %zu, %s%s\n", i, nsize, DKEY(&key), + leafnode_type(node)); + } + total = EVEN_CEIL(total); + } + VERBOSE("Total: header %u + contents %zu + unused %zu\n", + is_dupfix_leaf(mp) ? 
PAGEHDRSZ : PAGEHDRSZ + mp->lower, total, + page_room(mp)); } -uint64_t mdbx_key_from_ptrdouble(const double *const ieee754_64bit) { - return double2key(ieee754_64bit); +__cold static int setup_debug(MDBX_log_level_t level, MDBX_debug_flags_t flags, + union logger_union logger, char *buffer, + size_t buffer_size) { + ENSURE(nullptr, osal_fastmutex_acquire(&globals.debug_lock) == 0); + + const int rc = globals.runtime_flags | (globals.loglevel << 16); + if (level != MDBX_LOG_DONTCHANGE) + globals.loglevel = (uint8_t)level; + + if (flags != MDBX_DBG_DONTCHANGE) { + flags &= +#if MDBX_DEBUG + MDBX_DBG_ASSERT | MDBX_DBG_AUDIT | MDBX_DBG_JITTER | +#endif + MDBX_DBG_DUMP | MDBX_DBG_LEGACY_MULTIOPEN | MDBX_DBG_LEGACY_OVERLAP | + MDBX_DBG_DONT_UPGRADE; + globals.runtime_flags = (uint8_t)flags; + } + + assert(MDBX_LOGGER_DONTCHANGE == ((MDBX_debug_func *)(intptr_t)-1)); + if (logger.ptr != (void *)((intptr_t)-1)) { + globals.logger.ptr = logger.ptr; + globals.logger_buffer = buffer; + globals.logger_buffer_size = buffer_size; + } + + ENSURE(nullptr, osal_fastmutex_release(&globals.debug_lock) == 0); + return rc; } -uint32_t mdbx_key_from_float(const float ieee754_32bit) { - return float2key(&ieee754_32bit); +__cold int mdbx_setup_debug_nofmt(MDBX_log_level_t level, + MDBX_debug_flags_t flags, + MDBX_debug_func_nofmt *logger, char *buffer, + size_t buffer_size) { + union logger_union thunk; + thunk.nofmt = + (logger && buffer && buffer_size) ? 
logger : MDBX_LOGGER_NOFMT_DONTCHANGE; + return setup_debug(level, flags, thunk, buffer, buffer_size); } -uint32_t mdbx_key_from_ptrfloat(const float *const ieee754_32bit) { - return float2key(ieee754_32bit); +__cold int mdbx_setup_debug(MDBX_log_level_t level, MDBX_debug_flags_t flags, + MDBX_debug_func *logger) { + union logger_union thunk; + thunk.fmt = logger; + return setup_debug(level, flags, thunk, nullptr, 0); } +/// \copyright SPDX-License-Identifier: Apache-2.0 +/// \author Леонид Юрьев aka Leonid Yuriev \date 2015-2024 -#define IEEE754_DOUBLE_MANTISSA_SIZE 52 -#define IEEE754_DOUBLE_EXPONENTA_BIAS 0x3FF -#define IEEE754_DOUBLE_EXPONENTA_MAX 0x7FF -#define IEEE754_DOUBLE_IMPLICIT_LEAD UINT64_C(0x0010000000000000) -#define IEEE754_DOUBLE_MANTISSA_MASK UINT64_C(0x000FFFFFFFFFFFFF) -#define IEEE754_DOUBLE_MANTISSA_AMAX UINT64_C(0x001FFFFFFFFFFFFF) -static __inline int clz64(uint64_t value) { -#if __GNUC_PREREQ(4, 1) || __has_builtin(__builtin_clzl) - if (sizeof(value) == sizeof(int)) - return __builtin_clz(value); - if (sizeof(value) == sizeof(long)) - return __builtin_clzl(value); -#if (defined(__SIZEOF_LONG_LONG__) && __SIZEOF_LONG_LONG__ == 8) || \ - __has_builtin(__builtin_clzll) - return __builtin_clzll(value); -#endif /* have(long long) && long long == uint64_t */ -#endif /* GNU C */ +typedef struct meta_snap { + uint64_t txnid; + size_t is_steady; +} meta_snap_t; -#if defined(_MSC_VER) - unsigned long index; -#if defined(_M_AMD64) || defined(_M_ARM64) || defined(_M_X64) - _BitScanReverse64(&index, value); - return 63 - index; +static inline txnid_t fetch_txnid(const volatile mdbx_atomic_uint32_t *ptr) { +#if (defined(__amd64__) || defined(__e2k__)) && !defined(ENABLE_UBSAN) && \ + MDBX_UNALIGNED_OK >= 8 + return atomic_load64((const volatile mdbx_atomic_uint64_t *)ptr, + mo_AcquireRelease); #else - if (value > UINT32_MAX) { - _BitScanReverse(&index, (uint32_t)(value >> 32)); - return 31 - index; - } - _BitScanReverse(&index, (uint32_t)value); - return 
63 - index; + const uint32_t l = atomic_load32( + &ptr[__BYTE_ORDER__ != __ORDER_LITTLE_ENDIAN__], mo_AcquireRelease); + const uint32_t h = atomic_load32( + &ptr[__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__], mo_AcquireRelease); + return (uint64_t)h << 32 | l; #endif -#endif /* MSVC */ +} - value |= value >> 1; - value |= value >> 2; - value |= value >> 4; - value |= value >> 8; - value |= value >> 16; - value |= value >> 32; - static const uint8_t debruijn_clz64[64] = { - 63, 16, 62, 7, 15, 36, 61, 3, 6, 14, 22, 26, 35, 47, 60, 2, - 9, 5, 28, 11, 13, 21, 42, 19, 25, 31, 34, 40, 46, 52, 59, 1, - 17, 8, 37, 4, 23, 27, 48, 10, 29, 12, 43, 20, 32, 41, 53, 18, - 38, 24, 49, 30, 44, 33, 54, 39, 50, 45, 55, 51, 56, 57, 58, 0}; - return debruijn_clz64[value * UINT64_C(0x03F79D71B4CB0A89) >> 58]; +static inline meta_snap_t meta_snap(const volatile meta_t *meta) { + txnid_t txnid = fetch_txnid(meta->txnid_a); + jitter4testing(true); + size_t is_steady = meta_is_steady(meta) && txnid >= MIN_TXNID; + jitter4testing(true); + if (unlikely(txnid != fetch_txnid(meta->txnid_b))) + txnid = is_steady = 0; + meta_snap_t r = {txnid, is_steady}; + return r; } -static __inline uint64_t round_mantissa(const uint64_t u64, int shift) { - assert(shift < 0 && u64 > 0); - shift = -shift; - const unsigned half = 1 << (shift - 1); - const unsigned lsb = 1 & (unsigned)(u64 >> shift); - const unsigned tie2even = 1 ^ lsb; - return (u64 + half - tie2even) >> shift; +txnid_t meta_txnid(const volatile meta_t *meta) { + return meta_snap(meta).txnid; } -uint64_t mdbx_key_from_jsonInteger(const int64_t json_integer) { - const uint64_t bias = UINT64_C(0x8000000000000000); - if (json_integer > 0) { - const uint64_t u64 = json_integer; - int shift = clz64(u64) - (64 - IEEE754_DOUBLE_MANTISSA_SIZE - 1); - uint64_t mantissa = u64 << shift; - if (unlikely(shift < 0)) { - mantissa = round_mantissa(u64, shift); - if (mantissa > IEEE754_DOUBLE_MANTISSA_AMAX) - mantissa = round_mantissa(u64, --shift); - } 
+meta_ptr_t meta_ptr(const MDBX_env *env, unsigned n) { + eASSERT(env, n < NUM_METAS); + meta_ptr_t r; + meta_snap_t snap = meta_snap(r.ptr_v = METAPAGE(env, n)); + r.txnid = snap.txnid; + r.is_steady = snap.is_steady; + return r; +} - assert(mantissa >= IEEE754_DOUBLE_IMPLICIT_LEAD && - mantissa <= IEEE754_DOUBLE_MANTISSA_AMAX); - const uint64_t exponent = (uint64_t)IEEE754_DOUBLE_EXPONENTA_BIAS + - IEEE754_DOUBLE_MANTISSA_SIZE - shift; - assert(exponent > 0 && exponent <= IEEE754_DOUBLE_EXPONENTA_MAX); - const uint64_t key = bias + (exponent << IEEE754_DOUBLE_MANTISSA_SIZE) + - (mantissa - IEEE754_DOUBLE_IMPLICIT_LEAD); -#if !defined(_MSC_VER) || \ - defined( \ - _DEBUG) /* Workaround for MSVC error LNK2019: unresolved external \ - symbol __except1 referenced in function __ftol3_except */ - assert(key == mdbx_key_from_double((double)json_integer)); -#endif /* Workaround for MSVC */ - return key; - } +static uint8_t meta_cmp2pack(uint8_t c01, uint8_t c02, uint8_t c12, bool s0, + bool s1, bool s2) { + assert(c01 < 3 && c02 < 3 && c12 < 3); + /* assert(s0 < 2 && s1 < 2 && s2 < 2); */ + const uint8_t recent = meta_cmp2recent(c01, s0, s1) + ? (meta_cmp2recent(c02, s0, s2) ? 0 : 2) + : (meta_cmp2recent(c12, s1, s2) ? 1 : 2); + const uint8_t prefer_steady = meta_cmp2steady(c01, s0, s1) + ? (meta_cmp2steady(c02, s0, s2) ? 0 : 2) + : (meta_cmp2steady(c12, s1, s2) ? 1 : 2); - if (json_integer < 0) { - const uint64_t u64 = -json_integer; - int shift = clz64(u64) - (64 - IEEE754_DOUBLE_MANTISSA_SIZE - 1); - uint64_t mantissa = u64 << shift; - if (unlikely(shift < 0)) { - mantissa = round_mantissa(u64, shift); - if (mantissa > IEEE754_DOUBLE_MANTISSA_AMAX) - mantissa = round_mantissa(u64, --shift); - } + uint8_t tail; + if (recent == 0) + tail = meta_cmp2steady(c12, s1, s2) ? 2 : 1; + else if (recent == 1) + tail = meta_cmp2steady(c02, s0, s2) ? 2 : 0; + else + tail = meta_cmp2steady(c01, s0, s1) ? 
1 : 0; - assert(mantissa >= IEEE754_DOUBLE_IMPLICIT_LEAD && - mantissa <= IEEE754_DOUBLE_MANTISSA_AMAX); - const uint64_t exponent = (uint64_t)IEEE754_DOUBLE_EXPONENTA_BIAS + - IEEE754_DOUBLE_MANTISSA_SIZE - shift; - assert(exponent > 0 && exponent <= IEEE754_DOUBLE_EXPONENTA_MAX); - const uint64_t key = bias - 1 - (exponent << IEEE754_DOUBLE_MANTISSA_SIZE) - - (mantissa - IEEE754_DOUBLE_IMPLICIT_LEAD); -#if !defined(_MSC_VER) || \ - defined( \ - _DEBUG) /* Workaround for MSVC error LNK2019: unresolved external \ - symbol __except1 referenced in function __ftol3_except */ - assert(key == mdbx_key_from_double((double)json_integer)); -#endif /* Workaround for MSVC */ - return key; - } + const bool valid = + c01 != 1 || s0 != s1 || c02 != 1 || s0 != s2 || c12 != 1 || s1 != s2; + const bool strict = (c01 != 1 || s0 != s1) && (c02 != 1 || s0 != s2) && + (c12 != 1 || s1 != s2); + return tail | recent << 2 | prefer_steady << 4 | strict << 6 | valid << 7; +} - return bias; +static inline void meta_troika_unpack(troika_t *troika, const uint8_t packed) { + troika->recent = (packed >> 2) & 3; + troika->prefer_steady = (packed >> 4) & 3; + troika->tail_and_flags = packed & 0xC3; +#if MDBX_WORDBITS > 32 /* Workaround for false-positives from Valgrind */ + troika->unused_pad = 0; +#endif } -int64_t mdbx_jsonInteger_from_key(const MDBX_val v) { - assert(v.iov_len == 8); - const uint64_t key = unaligned_peek_u64(2, v.iov_base); - const uint64_t bias = UINT64_C(0x8000000000000000); - const uint64_t covalent = (key > bias) ? key - bias : bias - key - 1; - const int shift = IEEE754_DOUBLE_EXPONENTA_BIAS + 63 - - (IEEE754_DOUBLE_EXPONENTA_MAX & - (int)(covalent >> IEEE754_DOUBLE_MANTISSA_SIZE)); - if (unlikely(shift < 1)) - return (key < bias) ? 
INT64_MIN : INT64_MAX; - if (unlikely(shift > 63)) - return 0; +static const uint8_t troika_fsm_map[2 * 2 * 2 * 3 * 3 * 3] = { + 232, 201, 216, 216, 232, 233, 232, 232, 168, 201, 216, 152, 168, 233, 232, + 168, 233, 201, 216, 201, 233, 233, 232, 233, 168, 201, 152, 216, 232, 169, + 232, 168, 168, 193, 152, 152, 168, 169, 232, 168, 169, 193, 152, 194, 233, + 169, 232, 169, 232, 201, 216, 216, 232, 201, 232, 232, 168, 193, 216, 152, + 168, 193, 232, 168, 193, 193, 210, 194, 225, 193, 225, 193, 168, 137, 212, + 214, 232, 233, 168, 168, 168, 137, 212, 150, 168, 233, 168, 168, 169, 137, + 216, 201, 233, 233, 168, 169, 168, 137, 148, 214, 232, 169, 168, 168, 40, + 129, 148, 150, 168, 169, 168, 40, 169, 129, 152, 194, 233, 169, 168, 169, + 168, 137, 214, 214, 232, 201, 168, 168, 168, 129, 214, 150, 168, 193, 168, + 168, 129, 129, 210, 194, 225, 193, 161, 129, 212, 198, 212, 214, 228, 228, + 212, 212, 148, 201, 212, 150, 164, 233, 212, 148, 233, 201, 216, 201, 233, + 233, 216, 233, 148, 198, 148, 214, 228, 164, 212, 148, 148, 194, 148, 150, + 164, 169, 212, 148, 169, 194, 152, 194, 233, 169, 216, 169, 214, 198, 214, + 214, 228, 198, 212, 214, 150, 194, 214, 150, 164, 193, 212, 150, 194, 194, + 210, 194, 225, 193, 210, 194}; - const uint64_t unscaled = ((covalent & IEEE754_DOUBLE_MANTISSA_MASK) - << (63 - IEEE754_DOUBLE_MANTISSA_SIZE)) + - bias; - const int64_t absolute = unscaled >> shift; - const int64_t value = (key < bias) ? 
-absolute : absolute; - assert(key == mdbx_key_from_jsonInteger(value) || - (mdbx_key_from_jsonInteger(value - 1) < key && - key < mdbx_key_from_jsonInteger(value + 1))); - return value; +__cold bool troika_verify_fsm(void) { + bool ok = true; + for (size_t i = 0; i < 2 * 2 * 2 * 3 * 3 * 3; ++i) { + const bool s0 = (i >> 0) & 1; + const bool s1 = (i >> 1) & 1; + const bool s2 = (i >> 2) & 1; + const uint8_t c01 = (i / (8 * 1)) % 3; + const uint8_t c02 = (i / (8 * 3)) % 3; + const uint8_t c12 = (i / (8 * 9)) % 3; + + const uint8_t packed = meta_cmp2pack(c01, c02, c12, s0, s1, s2); + troika_t troika; + troika.fsm = (uint8_t)i; + meta_troika_unpack(&troika, packed); + + const uint8_t tail = TROIKA_TAIL(&troika); + const bool strict = TROIKA_STRICT_VALID(&troika); + const bool valid = TROIKA_VALID(&troika); + + const uint8_t recent_chk = meta_cmp2recent(c01, s0, s1) + ? (meta_cmp2recent(c02, s0, s2) ? 0 : 2) + : (meta_cmp2recent(c12, s1, s2) ? 1 : 2); + const uint8_t prefer_steady_chk = + meta_cmp2steady(c01, s0, s1) ? (meta_cmp2steady(c02, s0, s2) ? 0 : 2) + : (meta_cmp2steady(c12, s1, s2) ? 1 : 2); + + uint8_t tail_chk; + if (recent_chk == 0) + tail_chk = meta_cmp2steady(c12, s1, s2) ? 2 : 1; + else if (recent_chk == 1) + tail_chk = meta_cmp2steady(c02, s0, s2) ? 2 : 0; + else + tail_chk = meta_cmp2steady(c01, s0, s1) ? 
1 : 0; + + const bool valid_chk = + c01 != 1 || s0 != s1 || c02 != 1 || s0 != s2 || c12 != 1 || s1 != s2; + const bool strict_chk = (c01 != 1 || s0 != s1) && (c02 != 1 || s0 != s2) && + (c12 != 1 || s1 != s2); + assert(troika.recent == recent_chk); + assert(troika.prefer_steady == prefer_steady_chk); + assert(tail == tail_chk); + assert(valid == valid_chk); + assert(strict == strict_chk); + assert(troika_fsm_map[troika.fsm] == packed); + if (troika.recent != recent_chk || + troika.prefer_steady != prefer_steady_chk || tail != tail_chk || + valid != valid_chk || strict != strict_chk || + troika_fsm_map[troika.fsm] != packed) { + ok = false; + } + } + return ok; } -double mdbx_double_from_key(const MDBX_val v) { - assert(v.iov_len == 8); - return key2double(unaligned_peek_u64(2, v.iov_base)); +__hot troika_t meta_tap(const MDBX_env *env) { + meta_snap_t snap; + troika_t troika; + snap = meta_snap(METAPAGE(env, 0)); + troika.txnid[0] = snap.txnid; + troika.fsm = (uint8_t)snap.is_steady << 0; + snap = meta_snap(METAPAGE(env, 1)); + troika.txnid[1] = snap.txnid; + troika.fsm += (uint8_t)snap.is_steady << 1; + troika.fsm += meta_cmp2int(troika.txnid[0], troika.txnid[1], 8); + snap = meta_snap(METAPAGE(env, 2)); + troika.txnid[2] = snap.txnid; + troika.fsm += (uint8_t)snap.is_steady << 2; + troika.fsm += meta_cmp2int(troika.txnid[0], troika.txnid[2], 8 * 3); + troika.fsm += meta_cmp2int(troika.txnid[1], troika.txnid[2], 8 * 3 * 3); + + meta_troika_unpack(&troika, troika_fsm_map[troika.fsm]); + return troika; } -float mdbx_float_from_key(const MDBX_val v) { - assert(v.iov_len == 4); - return key2float(unaligned_peek_u32(2, v.iov_base)); +txnid_t recent_committed_txnid(const MDBX_env *env) { + const txnid_t m0 = meta_txnid(METAPAGE(env, 0)); + const txnid_t m1 = meta_txnid(METAPAGE(env, 1)); + const txnid_t m2 = meta_txnid(METAPAGE(env, 2)); + return (m0 > m1) ? ((m0 > m2) ? m0 : m2) : ((m1 > m2) ? 
m1 : m2); } -int32_t mdbx_int32_from_key(const MDBX_val v) { - assert(v.iov_len == 4); - return (int32_t)(unaligned_peek_u32(2, v.iov_base) - UINT32_C(0x80000000)); +static inline bool meta_eq(const troika_t *troika, size_t a, size_t b) { + assert(a < NUM_METAS && b < NUM_METAS); + return troika->txnid[a] == troika->txnid[b] && + (((troika->fsm >> a) ^ (troika->fsm >> b)) & 1) == 0 && + troika->txnid[a]; } -int64_t mdbx_int64_from_key(const MDBX_val v) { - assert(v.iov_len == 8); - return (int64_t)(unaligned_peek_u64(2, v.iov_base) - - UINT64_C(0x8000000000000000)); +unsigned meta_eq_mask(const troika_t *troika) { + return meta_eq(troika, 0, 1) | meta_eq(troika, 1, 2) << 1 | + meta_eq(troika, 2, 0) << 2; } -__cold MDBX_cmp_func *mdbx_get_keycmp(MDBX_db_flags_t flags) { - return get_default_keycmp(flags); +__hot bool meta_should_retry(const MDBX_env *env, troika_t *troika) { + const troika_t prev = *troika; + *troika = meta_tap(env); + return prev.fsm != troika->fsm || prev.txnid[0] != troika->txnid[0] || + prev.txnid[1] != troika->txnid[1] || prev.txnid[2] != troika->txnid[2]; } -__cold MDBX_cmp_func *mdbx_get_datacmp(MDBX_db_flags_t flags) { - return get_default_datacmp(flags); +const char *durable_caption(const meta_t *const meta) { + if (meta_is_steady(meta)) + return (meta_sign_get(meta) == meta_sign_calculate(meta)) ? 
"Steady" + : "Tainted"; + return "Weak"; } -__cold int mdbx_env_set_option(MDBX_env *env, const MDBX_option_t option, - uint64_t value) { - int err = check_env(env, false); - if (unlikely(err != MDBX_SUCCESS)) - return err; +__cold void meta_troika_dump(const MDBX_env *env, const troika_t *troika) { + const meta_ptr_t recent = meta_recent(env, troika); + const meta_ptr_t prefer_steady = meta_prefer_steady(env, troika); + const meta_ptr_t tail = meta_tail(env, troika); + NOTICE("troika: %" PRIaTXN ".%c:%" PRIaTXN ".%c:%" PRIaTXN ".%c, fsm=0x%02x, " + "head=%d-%" PRIaTXN ".%c, " + "base=%d-%" PRIaTXN ".%c, " + "tail=%d-%" PRIaTXN ".%c, " + "valid %c, strict %c", + troika->txnid[0], (troika->fsm & 1) ? 's' : 'w', troika->txnid[1], + (troika->fsm & 2) ? 's' : 'w', troika->txnid[2], + (troika->fsm & 4) ? 's' : 'w', troika->fsm, troika->recent, + recent.txnid, recent.is_steady ? 's' : 'w', troika->prefer_steady, + prefer_steady.txnid, prefer_steady.is_steady ? 's' : 'w', + troika->tail_and_flags % NUM_METAS, tail.txnid, + tail.is_steady ? 's' : 'w', TROIKA_VALID(troika) ? 'Y' : 'N', + TROIKA_STRICT_VALID(troika) ? 
'Y' : 'N'); +} - const bool lock_needed = ((env->me_flags & MDBX_ENV_ACTIVE) && env->me_txn0 && - !env_txn0_owned(env)); - bool should_unlock = false; - switch (option) { - case MDBX_opt_sync_bytes: - if (value == /* default */ UINT64_MAX) - value = MAX_WRITE; - if (unlikely(env->me_flags & MDBX_RDONLY)) - return MDBX_EACCESS; - if (unlikely(!(env->me_flags & MDBX_ENV_ACTIVE))) - return MDBX_EPERM; - if (unlikely(value > SIZE_MAX - 65536)) - return MDBX_EINVAL; - value = bytes2pgno(env, (size_t)value + env->me_psize - 1); - if ((uint32_t)value != atomic_load32(&env->me_lck->mti_autosync_threshold, - mo_AcquireRelease) && - atomic_store32(&env->me_lck->mti_autosync_threshold, (uint32_t)value, - mo_Relaxed) - /* Дергаем sync(force=off) только если задано новое не-нулевое значение - * и мы вне транзакции */ - && lock_needed) { - err = env_sync(env, false, false); - if (err == /* нечего сбрасывать на диск */ MDBX_RESULT_TRUE) - err = MDBX_SUCCESS; - } - break; +/*----------------------------------------------------------------------------*/ - case MDBX_opt_sync_period: - if (value == /* default */ UINT64_MAX) - value = 2780315 /* 42.42424 секунды */; - if (unlikely(env->me_flags & MDBX_RDONLY)) - return MDBX_EACCESS; - if (unlikely(!(env->me_flags & MDBX_ENV_ACTIVE))) - return MDBX_EPERM; - if (unlikely(value > UINT32_MAX)) - return MDBX_EINVAL; - value = osal_16dot16_to_monotime((uint32_t)value); - if (value != atomic_load64(&env->me_lck->mti_autosync_period, - mo_AcquireRelease) && - atomic_store64(&env->me_lck->mti_autosync_period, value, mo_Relaxed) - /* Дергаем sync(force=off) только если задано новое не-нулевое значение - * и мы вне транзакции */ - && lock_needed) { - err = env_sync(env, false, false); - if (err == /* нечего сбрасывать на диск */ MDBX_RESULT_TRUE) - err = MDBX_SUCCESS; - } - break; +static int meta_unsteady(MDBX_env *env, const txnid_t inclusive_upto, + const pgno_t pgno) { + meta_t *const meta = METAPAGE(env, pgno); + const txnid_t txnid = 
constmeta_txnid(meta); + if (!meta_is_steady(meta) || txnid > inclusive_upto) + return MDBX_RESULT_FALSE; - case MDBX_opt_max_db: - if (value == /* default */ UINT64_MAX) - value = 42; - if (unlikely(value > MDBX_MAX_DBI)) - return MDBX_EINVAL; - if (unlikely(env->me_map)) - return MDBX_EPERM; - env->me_maxdbs = (unsigned)value + CORE_DBS; - break; + WARNING("wipe txn #%" PRIaTXN ", meta %" PRIaPGNO, txnid, pgno); + const uint64_t wipe = DATASIGN_NONE; + const void *ptr = &wipe; + size_t bytes = sizeof(meta->sign), + offset = ptr_dist(&meta->sign, env->dxb_mmap.base); + if (env->flags & MDBX_WRITEMAP) { + unaligned_poke_u64(4, meta->sign, wipe); + osal_flush_incoherent_cpu_writeback(); + if (!MDBX_AVOID_MSYNC) + return MDBX_RESULT_TRUE; + ptr = data_page(meta); + offset = ptr_dist(ptr, env->dxb_mmap.base); + bytes = env->ps; + } - case MDBX_opt_max_readers: - if (value == /* default */ UINT64_MAX) - value = MDBX_READERS_LIMIT; - if (unlikely(value < 1 || value > MDBX_READERS_LIMIT)) - return MDBX_EINVAL; - if (unlikely(env->me_map)) - return MDBX_EPERM; - env->me_maxreaders = (unsigned)value; - break; +#if MDBX_ENABLE_PGOP_STAT + env->lck->pgops.wops.weak += 1; +#endif /* MDBX_ENABLE_PGOP_STAT */ + int err = osal_pwrite(env->fd4meta, ptr, bytes, offset); + return likely(err == MDBX_SUCCESS) ? 
MDBX_RESULT_TRUE : err; +} - case MDBX_opt_dp_reserve_limit: - if (value == /* default */ UINT64_MAX) - value = INT_MAX; - if (unlikely(value > INT_MAX)) - return MDBX_EINVAL; - if (env->me_options.dp_reserve_limit != (unsigned)value) { - if (lock_needed) { - err = osal_txn_lock(env, false); - if (unlikely(err != MDBX_SUCCESS)) - return err; - should_unlock = true; - } - env->me_options.dp_reserve_limit = (unsigned)value; - while (env->me_dp_reserve_len > env->me_options.dp_reserve_limit) { - eASSERT(env, env->me_dp_reserve != NULL); - MDBX_page *dp = env->me_dp_reserve; - MDBX_ASAN_UNPOISON_MEMORY_REGION(dp, env->me_psize); - VALGRIND_MAKE_MEM_DEFINED(&mp_next(dp), sizeof(MDBX_page *)); - env->me_dp_reserve = mp_next(dp); - void *const ptr = ptr_disp(dp, -(ptrdiff_t)sizeof(size_t)); - osal_free(ptr); - env->me_dp_reserve_len -= 1; - } - } - break; +__cold int meta_wipe_steady(MDBX_env *env, txnid_t inclusive_upto) { + int err = meta_unsteady(env, inclusive_upto, 0); + if (likely(!MDBX_IS_ERROR(err))) + err = meta_unsteady(env, inclusive_upto, 1); + if (likely(!MDBX_IS_ERROR(err))) + err = meta_unsteady(env, inclusive_upto, 2); - case MDBX_opt_rp_augment_limit: - if (value == /* default */ UINT64_MAX) { - env->me_options.flags.non_auto.rp_augment_limit = 0; - env->me_options.rp_augment_limit = default_rp_augment_limit(env); - } else if (unlikely(value > MDBX_PGL_LIMIT)) - return MDBX_EINVAL; - else { - env->me_options.flags.non_auto.rp_augment_limit = 1; - env->me_options.rp_augment_limit = (unsigned)value; + if (err == MDBX_RESULT_TRUE) { + err = MDBX_SUCCESS; + if (!MDBX_AVOID_MSYNC && (env->flags & MDBX_WRITEMAP)) { + err = osal_msync(&env->dxb_mmap, 0, pgno_align2os_bytes(env, NUM_METAS), + MDBX_SYNC_DATA | MDBX_SYNC_IODQ); +#if MDBX_ENABLE_PGOP_STAT + env->lck->pgops.msync.weak += 1; +#endif /* MDBX_ENABLE_PGOP_STAT */ + } else if (env->fd4meta == env->lazy_fd) { + err = osal_fsync(env->lazy_fd, MDBX_SYNC_DATA | MDBX_SYNC_IODQ); +#if MDBX_ENABLE_PGOP_STAT + 
env->lck->pgops.fsync.weak += 1; +#endif /* MDBX_ENABLE_PGOP_STAT */ } - break; + } - case MDBX_opt_gc_time_limit: - if (value == /* default */ UINT64_MAX) - value = 0; - if (unlikely(value > UINT32_MAX)) - return MDBX_EINVAL; - if (unlikely(env->me_flags & MDBX_RDONLY)) - return MDBX_EACCESS; - value = osal_16dot16_to_monotime((uint32_t)value); - if (value != env->me_options.gc_time_limit) { - if (env->me_txn && lock_needed) - return MDBX_EPERM; - env->me_options.gc_time_limit = value; - if (!env->me_options.flags.non_auto.rp_augment_limit) - env->me_options.rp_augment_limit = default_rp_augment_limit(env); - } - break; + osal_flush_incoherent_mmap(env->dxb_mmap.base, pgno2bytes(env, NUM_METAS), + globals.sys_pagesize); - case MDBX_opt_txn_dp_limit: - case MDBX_opt_txn_dp_initial: - if (value == /* default */ UINT64_MAX) - value = MDBX_PGL_LIMIT; - if (unlikely(value > MDBX_PGL_LIMIT || value < CURSOR_STACK * 4)) - return MDBX_EINVAL; - if (unlikely(env->me_flags & MDBX_RDONLY)) - return MDBX_EACCESS; - if (lock_needed) { - err = osal_txn_lock(env, false); - if (unlikely(err != MDBX_SUCCESS)) - return err; - should_unlock = true; - } - if (env->me_txn) - err = MDBX_EPERM /* unable change during transaction */; - else { - const pgno_t value32 = (pgno_t)value; - if (option == MDBX_opt_txn_dp_initial && - env->me_options.dp_initial != value32) { - env->me_options.dp_initial = value32; - if (env->me_options.dp_limit < value32) { - env->me_options.dp_limit = value32; - env->me_options.flags.non_auto.dp_limit = 1; - } - } - if (option == MDBX_opt_txn_dp_limit && - env->me_options.dp_limit != value32) { - env->me_options.dp_limit = value32; - env->me_options.flags.non_auto.dp_limit = 1; - if (env->me_options.dp_initial > value32) - env->me_options.dp_initial = value32; + /* force oldest refresh */ + atomic_store32(&env->lck->rdt_refresh_flag, true, mo_Relaxed); + + env->basal_txn->tw.troika = meta_tap(env); + for (MDBX_txn *scan = env->basal_txn->nested; scan; scan = 
scan->nested) + scan->tw.troika = env->basal_txn->tw.troika; + return err; +} + +int meta_sync(const MDBX_env *env, const meta_ptr_t head) { + eASSERT(env, atomic_load32(&env->lck->meta_sync_txnid, mo_Relaxed) != + (uint32_t)head.txnid); + /* Функция может вызываться (в том числе) при (env->flags & + * MDBX_NOMETASYNC) == 0 и env->fd4meta == env->dsync_fd, например если + * предыдущая транзакция была выполненна с флагом MDBX_NOMETASYNC. */ + + int rc = MDBX_RESULT_TRUE; + if (env->flags & MDBX_WRITEMAP) { + if (!MDBX_AVOID_MSYNC) { + rc = osal_msync(&env->dxb_mmap, 0, pgno_align2os_bytes(env, NUM_METAS), + MDBX_SYNC_DATA | MDBX_SYNC_IODQ); +#if MDBX_ENABLE_PGOP_STAT + env->lck->pgops.msync.weak += 1; +#endif /* MDBX_ENABLE_PGOP_STAT */ + } else { +#if MDBX_ENABLE_PGOP_ST + env->lck->pgops.wops.weak += 1; +#endif /* MDBX_ENABLE_PGOP_STAT */ + const page_t *page = data_page(head.ptr_c); + rc = osal_pwrite(env->fd4meta, page, env->ps, + ptr_dist(page, env->dxb_mmap.base)); + + if (likely(rc == MDBX_SUCCESS) && env->fd4meta == env->lazy_fd) { + rc = osal_fsync(env->lazy_fd, MDBX_SYNC_DATA | MDBX_SYNC_IODQ); +#if MDBX_ENABLE_PGOP_STAT + env->lck->pgops.fsync.weak += 1; +#endif /* MDBX_ENABLE_PGOP_STAT */ } } - break; - - case MDBX_opt_spill_max_denominator: - if (value == /* default */ UINT64_MAX) - value = 8; - if (unlikely(value > 255)) - return MDBX_EINVAL; - env->me_options.spill_max_denominator = (uint8_t)value; - break; - case MDBX_opt_spill_min_denominator: - if (value == /* default */ UINT64_MAX) - value = 8; - if (unlikely(value > 255)) - return MDBX_EINVAL; - env->me_options.spill_min_denominator = (uint8_t)value; - break; - case MDBX_opt_spill_parent4child_denominator: - if (value == /* default */ UINT64_MAX) - value = 0; - if (unlikely(value > 255)) - return MDBX_EINVAL; - env->me_options.spill_parent4child_denominator = (uint8_t)value; - break; + } else { + rc = osal_fsync(env->lazy_fd, MDBX_SYNC_DATA | MDBX_SYNC_IODQ); +#if MDBX_ENABLE_PGOP_STAT + 
env->lck->pgops.fsync.weak += 1; +#endif /* MDBX_ENABLE_PGOP_STAT */ + } - case MDBX_opt_loose_limit: - if (value == /* default */ UINT64_MAX) - value = 64; - if (unlikely(value > 255)) - return MDBX_EINVAL; - env->me_options.dp_loose_limit = (uint8_t)value; - break; + if (likely(rc == MDBX_SUCCESS)) + env->lck->meta_sync_txnid.weak = (uint32_t)head.txnid; + return rc; +} - case MDBX_opt_merge_threshold_16dot16_percent: - if (value == /* default */ UINT64_MAX) - value = 65536 / 4 /* 25% */; - if (unlikely(value < 8192 || value > 32768)) - return MDBX_EINVAL; - env->me_options.merge_threshold_16dot16_percent = (unsigned)value; - recalculate_merge_threshold(env); - break; +__cold static page_t *meta_model(const MDBX_env *env, page_t *model, + size_t num) { + ENSURE(env, is_powerof2(env->ps)); + ENSURE(env, env->ps >= MDBX_MIN_PAGESIZE); + ENSURE(env, env->ps <= MDBX_MAX_PAGESIZE); + ENSURE(env, env->geo_in_bytes.lower >= MIN_MAPSIZE); + ENSURE(env, env->geo_in_bytes.upper <= MAX_MAPSIZE); + ENSURE(env, env->geo_in_bytes.now >= env->geo_in_bytes.lower); + ENSURE(env, env->geo_in_bytes.now <= env->geo_in_bytes.upper); + + memset(model, 0, env->ps); + model->pgno = (pgno_t)num; + model->flags = P_META; + meta_t *const model_meta = page_meta(model); + unaligned_poke_u64(4, model_meta->magic_and_version, MDBX_DATA_MAGIC); + + model_meta->geometry.lower = bytes2pgno(env, env->geo_in_bytes.lower); + model_meta->geometry.upper = bytes2pgno(env, env->geo_in_bytes.upper); + model_meta->geometry.grow_pv = + pages2pv(bytes2pgno(env, env->geo_in_bytes.grow)); + model_meta->geometry.shrink_pv = + pages2pv(bytes2pgno(env, env->geo_in_bytes.shrink)); + model_meta->geometry.now = bytes2pgno(env, env->geo_in_bytes.now); + model_meta->geometry.first_unallocated = NUM_METAS; + + ENSURE(env, model_meta->geometry.lower >= MIN_PAGENO); + ENSURE(env, model_meta->geometry.upper <= MAX_PAGENO + 1); + ENSURE(env, model_meta->geometry.now >= model_meta->geometry.lower); + ENSURE(env, 
model_meta->geometry.now <= model_meta->geometry.upper); + ENSURE(env, model_meta->geometry.first_unallocated >= MIN_PAGENO); + ENSURE(env, + model_meta->geometry.first_unallocated <= model_meta->geometry.now); + ENSURE(env, model_meta->geometry.grow_pv == + pages2pv(pv2pages(model_meta->geometry.grow_pv))); + ENSURE(env, model_meta->geometry.shrink_pv == + pages2pv(pv2pages(model_meta->geometry.shrink_pv))); + + model_meta->pagesize = env->ps; + model_meta->trees.gc.flags = MDBX_INTEGERKEY; + model_meta->trees.gc.root = P_INVALID; + model_meta->trees.main.root = P_INVALID; + meta_set_txnid(env, model_meta, MIN_TXNID + num); + unaligned_poke_u64(4, model_meta->sign, meta_sign_calculate(model_meta)); + eASSERT(env, coherency_check_meta(env, model_meta, true)); + return ptr_disp(model, env->ps); +} - case MDBX_opt_writethrough_threshold: -#if defined(_WIN32) || defined(_WIN64) - /* позволяем "установить" значение по-умолчанию и совпадающее - * с поведением соответствующим текущей установке MDBX_NOMETASYNC */ - if (value == /* default */ UINT64_MAX && - value != ((env->me_flags & MDBX_NOMETASYNC) ? 
0 : UINT_MAX)) - err = MDBX_EINVAL; -#else - if (value == /* default */ UINT64_MAX) - value = MDBX_WRITETHROUGH_THRESHOLD_DEFAULT; - if (value != (unsigned)value) - err = MDBX_EINVAL; - else - env->me_options.writethrough_threshold = (unsigned)value; -#endif - break; +__cold meta_t *meta_init_triplet(const MDBX_env *env, void *buffer) { + page_t *page0 = (page_t *)buffer; + page_t *page1 = meta_model(env, page0, 0); + page_t *page2 = meta_model(env, page1, 1); + meta_model(env, page2, 2); + return page_meta(page2); +} - case MDBX_opt_prefault_write_enable: - if (value == /* default */ UINT64_MAX) { - env->me_options.prefault_write = default_prefault_write(env); - env->me_options.flags.non_auto.prefault_write = false; - } else if (value > 1) - err = MDBX_EINVAL; - else { - env->me_options.prefault_write = value != 0; - env->me_options.flags.non_auto.prefault_write = true; +__cold int __must_check_result meta_override(MDBX_env *env, size_t target, + txnid_t txnid, + const meta_t *shape) { + int rc = env_page_auxbuffer(env); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; + page_t *const page = env->page_auxbuf; + meta_model(env, page, target); + meta_t *const model = page_meta(page); + meta_set_txnid(env, model, txnid); + if (txnid) + eASSERT(env, coherency_check_meta(env, model, true)); + if (shape) { + if (txnid && unlikely(!coherency_check_meta(env, shape, false))) { + ERROR("bailout overriding meta-%zu since model failed " + "FreeDB/MainDB %s-check for txnid #%" PRIaTXN, + target, "pre", constmeta_txnid(shape)); + return MDBX_PROBLEM; } - break; + if (globals.runtime_flags & MDBX_DBG_DONT_UPGRADE) + memcpy(&model->magic_and_version, &shape->magic_and_version, + sizeof(model->magic_and_version)); + model->reserve16 = shape->reserve16; + model->validator_id = shape->validator_id; + model->extra_pagehdr = shape->extra_pagehdr; + memcpy(&model->geometry, &shape->geometry, sizeof(model->geometry)); + memcpy(&model->trees, &shape->trees, sizeof(model->trees)); + 
memcpy(&model->canary, &shape->canary, sizeof(model->canary)); + memcpy(&model->pages_retired, &shape->pages_retired, + sizeof(model->pages_retired)); + if (txnid) { + if ((!model->trees.gc.mod_txnid && model->trees.gc.root != P_INVALID) || + (!model->trees.main.mod_txnid && model->trees.main.root != P_INVALID)) + memcpy(&model->magic_and_version, &shape->magic_and_version, + sizeof(model->magic_and_version)); + if (unlikely(!coherency_check_meta(env, model, false))) { + ERROR("bailout overriding meta-%zu since model failed " + "FreeDB/MainDB %s-check for txnid #%" PRIaTXN, + target, "post", txnid); + return MDBX_PROBLEM; + } + } + } + meta_sign_as_steady(model); + rc = meta_validate(env, model, page, (pgno_t)target, nullptr); + if (unlikely(MDBX_IS_ERROR(rc))) + return MDBX_PROBLEM; - default: - return MDBX_EINVAL; + if (shape && memcmp(model, shape, sizeof(meta_t)) == 0) { + NOTICE("skip overriding meta-%zu since no changes " + "for txnid #%" PRIaTXN, + target, txnid); + return MDBX_SUCCESS; } - if (should_unlock) - osal_txn_unlock(env); - return err; + if (env->flags & MDBX_WRITEMAP) { +#if MDBX_ENABLE_PGOP_STAT + env->lck->pgops.msync.weak += 1; +#endif /* MDBX_ENABLE_PGOP_STAT */ + rc = osal_msync(&env->dxb_mmap, 0, + pgno_align2os_bytes(env, model->geometry.first_unallocated), + MDBX_SYNC_DATA | MDBX_SYNC_IODQ); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; + /* meta_override() called only while current process have exclusive + * lock of a DB file. 
So meta-page could be updated directly without + * clearing consistency flag by mdbx_meta_update_begin() */ + memcpy(pgno2page(env, target), page, env->ps); + osal_flush_incoherent_cpu_writeback(); +#if MDBX_ENABLE_PGOP_STAT + env->lck->pgops.msync.weak += 1; +#endif /* MDBX_ENABLE_PGOP_STAT */ + rc = osal_msync(&env->dxb_mmap, 0, pgno_align2os_bytes(env, target + 1), + MDBX_SYNC_DATA | MDBX_SYNC_IODQ); + } else { +#if MDBX_ENABLE_PGOP_STAT + env->lck->pgops.wops.weak += 1; +#endif /* MDBX_ENABLE_PGOP_STAT */ + rc = osal_pwrite(env->fd4meta, page, env->ps, pgno2bytes(env, target)); + if (rc == MDBX_SUCCESS && env->fd4meta == env->lazy_fd) { +#if MDBX_ENABLE_PGOP_STAT + env->lck->pgops.fsync.weak += 1; +#endif /* MDBX_ENABLE_PGOP_STAT */ + rc = osal_fsync(env->lazy_fd, MDBX_SYNC_DATA | MDBX_SYNC_IODQ); + } + osal_flush_incoherent_mmap(env->dxb_mmap.base, pgno2bytes(env, NUM_METAS), + globals.sys_pagesize); + } + eASSERT(env, (!env->txn && !env->basal_txn) || + (env->stuck_meta == (int)target && + (env->flags & (MDBX_EXCLUSIVE | MDBX_RDONLY)) == + MDBX_EXCLUSIVE)); + return rc; } -__cold int mdbx_env_get_option(const MDBX_env *env, const MDBX_option_t option, - uint64_t *pvalue) { - int err = check_env(env, false); - if (unlikely(err != MDBX_SUCCESS)) - return err; - if (unlikely(!pvalue)) - return MDBX_EINVAL; - - switch (option) { - case MDBX_opt_sync_bytes: - if (unlikely(!(env->me_flags & MDBX_ENV_ACTIVE))) - return MDBX_EPERM; - *pvalue = pgno2bytes( - env, atomic_load32(&env->me_lck->mti_autosync_threshold, mo_Relaxed)); - break; +__cold int meta_validate(MDBX_env *env, meta_t *const meta, + const page_t *const page, const unsigned meta_number, + unsigned *guess_pagesize) { + const uint64_t magic_and_version = + unaligned_peek_u64(4, &meta->magic_and_version); + if (unlikely(magic_and_version != MDBX_DATA_MAGIC && + magic_and_version != MDBX_DATA_MAGIC_LEGACY_COMPAT && + magic_and_version != MDBX_DATA_MAGIC_LEGACY_DEVEL)) { + ERROR("meta[%u] has invalid 
magic/version %" PRIx64, meta_number, + magic_and_version); + return ((magic_and_version >> 8) != MDBX_MAGIC) ? MDBX_INVALID + : MDBX_VERSION_MISMATCH; + } - case MDBX_opt_sync_period: - if (unlikely(!(env->me_flags & MDBX_ENV_ACTIVE))) - return MDBX_EPERM; - *pvalue = osal_monotime_to_16dot16( - atomic_load64(&env->me_lck->mti_autosync_period, mo_Relaxed)); - break; + if (unlikely(page->pgno != meta_number)) { + ERROR("meta[%u] has invalid pageno %" PRIaPGNO, meta_number, page->pgno); + return MDBX_INVALID; + } - case MDBX_opt_max_db: - *pvalue = env->me_maxdbs - CORE_DBS; - break; + if (unlikely(page->flags != P_META)) { + ERROR("page #%u not a meta-page", meta_number); + return MDBX_INVALID; + } - case MDBX_opt_max_readers: - *pvalue = env->me_maxreaders; - break; + if (unlikely(!is_powerof2(meta->pagesize) || + meta->pagesize < MDBX_MIN_PAGESIZE || + meta->pagesize > MDBX_MAX_PAGESIZE)) { + WARNING("meta[%u] has invalid pagesize (%u), skip it", meta_number, + meta->pagesize); + return is_powerof2(meta->pagesize) ? 
MDBX_VERSION_MISMATCH : MDBX_INVALID; + } - case MDBX_opt_dp_reserve_limit: - *pvalue = env->me_options.dp_reserve_limit; - break; + if (guess_pagesize && *guess_pagesize != meta->pagesize) { + *guess_pagesize = meta->pagesize; + VERBOSE("meta[%u] took pagesize %u", meta_number, meta->pagesize); + } - case MDBX_opt_rp_augment_limit: - *pvalue = env->me_options.rp_augment_limit; - break; + const txnid_t txnid = unaligned_peek_u64(4, &meta->txnid_a); + if (unlikely(txnid != unaligned_peek_u64(4, &meta->txnid_b))) { + WARNING("meta[%u] not completely updated, skip it", meta_number); + return MDBX_RESULT_TRUE; + } - case MDBX_opt_gc_time_limit: - *pvalue = osal_monotime_to_16dot16(env->me_options.gc_time_limit); - break; + /* LY: check signature as a checksum */ + const uint64_t sign = meta_sign_get(meta); + const uint64_t sign_stready = meta_sign_calculate(meta); + if (SIGN_IS_STEADY(sign) && unlikely(sign != sign_stready)) { + WARNING("meta[%u] has invalid steady-checksum (0x%" PRIx64 " != 0x%" PRIx64 + "), skip it", + meta_number, sign, sign_stready); + return MDBX_RESULT_TRUE; + } - case MDBX_opt_txn_dp_limit: - *pvalue = env->me_options.dp_limit; - break; - case MDBX_opt_txn_dp_initial: - *pvalue = env->me_options.dp_initial; - break; - - case MDBX_opt_spill_max_denominator: - *pvalue = env->me_options.spill_max_denominator; - break; - case MDBX_opt_spill_min_denominator: - *pvalue = env->me_options.spill_min_denominator; - break; - case MDBX_opt_spill_parent4child_denominator: - *pvalue = env->me_options.spill_parent4child_denominator; - break; - - case MDBX_opt_loose_limit: - *pvalue = env->me_options.dp_loose_limit; - break; - - case MDBX_opt_merge_threshold_16dot16_percent: - *pvalue = env->me_options.merge_threshold_16dot16_percent; - break; + if (unlikely(meta->trees.gc.flags != MDBX_INTEGERKEY)) { + WARNING("meta[%u] has invalid %s flags 0x%u, skip it", meta_number, + "GC/FreeDB", meta->trees.gc.flags); + return MDBX_INCOMPATIBLE; + } - case 
MDBX_opt_writethrough_threshold: -#if defined(_WIN32) || defined(_WIN64) - *pvalue = (env->me_flags & MDBX_NOMETASYNC) ? 0 : INT_MAX; -#else - *pvalue = env->me_options.writethrough_threshold; -#endif - break; + if (unlikely(!check_sdb_flags(meta->trees.main.flags))) { + WARNING("meta[%u] has invalid %s flags 0x%u, skip it", meta_number, + "MainDB", meta->trees.main.flags); + return MDBX_INCOMPATIBLE; + } - case MDBX_opt_prefault_write_enable: - *pvalue = env->me_options.prefault_write; - break; + DEBUG("checking meta%" PRIaPGNO " = root %" PRIaPGNO "/%" PRIaPGNO + ", geo %" PRIaPGNO "/%" PRIaPGNO "-%" PRIaPGNO "/%" PRIaPGNO + " +%u -%u, txn_id %" PRIaTXN ", %s", + page->pgno, meta->trees.main.root, meta->trees.gc.root, + meta->geometry.lower, meta->geometry.first_unallocated, + meta->geometry.now, meta->geometry.upper, + pv2pages(meta->geometry.grow_pv), pv2pages(meta->geometry.shrink_pv), + txnid, durable_caption(meta)); - default: - return MDBX_EINVAL; + if (unlikely(txnid < MIN_TXNID || txnid > MAX_TXNID)) { + WARNING("meta[%u] has invalid txnid %" PRIaTXN ", skip it", meta_number, + txnid); + return MDBX_RESULT_TRUE; } - return MDBX_SUCCESS; -} - -static size_t estimate_rss(size_t database_bytes) { - return database_bytes + database_bytes / 64 + - (512 + MDBX_WORDBITS * 16) * MEGABYTE; -} + if (unlikely(meta->geometry.lower < MIN_PAGENO || + meta->geometry.lower > MAX_PAGENO + 1)) { + WARNING("meta[%u] has invalid min-pages (%" PRIaPGNO "), skip it", + meta_number, meta->geometry.lower); + return MDBX_INVALID; + } -__cold int mdbx_env_warmup(const MDBX_env *env, const MDBX_txn *txn, - MDBX_warmup_flags_t flags, - unsigned timeout_seconds_16dot16) { - if (unlikely(env == NULL && txn == NULL)) - return MDBX_EINVAL; - if (unlikely(flags > - (MDBX_warmup_force | MDBX_warmup_oomsafe | MDBX_warmup_lock | - MDBX_warmup_touchlimit | MDBX_warmup_release))) - return MDBX_EINVAL; + if (unlikely(meta->geometry.upper < MIN_PAGENO || + meta->geometry.upper > MAX_PAGENO + 1 
|| + meta->geometry.upper < meta->geometry.lower)) { + WARNING("meta[%u] has invalid max-pages (%" PRIaPGNO "), skip it", + meta_number, meta->geometry.upper); + return MDBX_INVALID; + } - if (txn) { - int err = check_txn(txn, MDBX_TXN_BLOCKED - MDBX_TXN_ERROR); - if (unlikely(err != MDBX_SUCCESS)) - return err; + if (unlikely(meta->geometry.first_unallocated < MIN_PAGENO || + meta->geometry.first_unallocated - 1 > MAX_PAGENO)) { + WARNING("meta[%u] has invalid next-pageno (%" PRIaPGNO "), skip it", + meta_number, meta->geometry.first_unallocated); + return MDBX_CORRUPTED; } - if (env) { - int err = check_env(env, false); + + const uint64_t used_bytes = + meta->geometry.first_unallocated * (uint64_t)meta->pagesize; + if (unlikely(used_bytes > env->dxb_mmap.filesize)) { + /* Here could be a race with DB-shrinking performed by other process */ + int err = osal_filesize(env->lazy_fd, &env->dxb_mmap.filesize); if (unlikely(err != MDBX_SUCCESS)) return err; - if (txn && unlikely(txn->mt_env != env)) - return MDBX_EINVAL; - } else { - env = txn->mt_env; + if (unlikely(used_bytes > env->dxb_mmap.filesize)) { + WARNING("meta[%u] used-bytes (%" PRIu64 ") beyond filesize (%" PRIu64 + "), skip it", + meta_number, used_bytes, env->dxb_mmap.filesize); + return MDBX_CORRUPTED; + } } - - const uint64_t timeout_monotime = - (timeout_seconds_16dot16 && (flags & MDBX_warmup_force)) - ? 
osal_monotime() + osal_16dot16_to_monotime(timeout_seconds_16dot16) - : 0; - - if (flags & MDBX_warmup_release) - munlock_all(env); - - pgno_t used_pgno; - if (txn) { - used_pgno = txn->mt_geo.next; - } else { - const meta_troika_t troika = meta_tap(env); - used_pgno = meta_recent(env, &troika).ptr_v->mm_geo.next; + if (unlikely(meta->geometry.first_unallocated - 1 > MAX_PAGENO || + used_bytes > MAX_MAPSIZE)) { + WARNING("meta[%u] has too large used-space (%" PRIu64 "), skip it", + meta_number, used_bytes); + return MDBX_TOO_LARGE; } - const size_t used_range = pgno_align2os_bytes(env, used_pgno); - const pgno_t mlock_pgno = bytes2pgno(env, used_range); - int rc = MDBX_SUCCESS; - if (flags & MDBX_warmup_touchlimit) { - const size_t estimated_rss = estimate_rss(used_range); -#if defined(_WIN32) || defined(_WIN64) - SIZE_T current_ws_lower, current_ws_upper; - if (GetProcessWorkingSetSize(GetCurrentProcess(), ¤t_ws_lower, - ¤t_ws_upper) && - current_ws_lower < estimated_rss) { - const SIZE_T ws_lower = estimated_rss; - const SIZE_T ws_upper = - (MDBX_WORDBITS == 32 && ws_lower > MEGABYTE * 2048) - ? 
ws_lower - : ws_lower + MDBX_WORDBITS * MEGABYTE * 32; - if (!SetProcessWorkingSetSize(GetCurrentProcess(), ws_lower, ws_upper)) { - rc = (int)GetLastError(); - WARNING("SetProcessWorkingSetSize(%zu, %zu) error %d", ws_lower, - ws_upper, rc); + pgno_t geo_lower = meta->geometry.lower; + uint64_t mapsize_min = geo_lower * (uint64_t)meta->pagesize; + STATIC_ASSERT(MAX_MAPSIZE < PTRDIFF_MAX - MDBX_MAX_PAGESIZE); + STATIC_ASSERT(MIN_MAPSIZE < MAX_MAPSIZE); + STATIC_ASSERT((uint64_t)(MAX_PAGENO + 1) * MDBX_MIN_PAGESIZE % (4ul << 20) == + 0); + if (unlikely(mapsize_min < MIN_MAPSIZE || mapsize_min > MAX_MAPSIZE)) { + if (MAX_MAPSIZE != MAX_MAPSIZE64 && mapsize_min > MAX_MAPSIZE && + mapsize_min <= MAX_MAPSIZE64) { + eASSERT(env, meta->geometry.first_unallocated - 1 <= MAX_PAGENO && + used_bytes <= MAX_MAPSIZE); + WARNING("meta[%u] has too large min-mapsize (%" PRIu64 "), " + "but size of used space still acceptable (%" PRIu64 ")", + meta_number, mapsize_min, used_bytes); + geo_lower = (pgno_t)((mapsize_min = MAX_MAPSIZE) / meta->pagesize); + if (geo_lower > MAX_PAGENO + 1) { + geo_lower = MAX_PAGENO + 1; + mapsize_min = geo_lower * (uint64_t)meta->pagesize; } + WARNING("meta[%u] consider get-%s pageno is %" PRIaPGNO + " instead of wrong %" PRIaPGNO + ", will be corrected on next commit(s)", + meta_number, "lower", geo_lower, meta->geometry.lower); + meta->geometry.lower = geo_lower; + } else { + WARNING("meta[%u] has invalid min-mapsize (%" PRIu64 "), skip it", + meta_number, mapsize_min); + return MDBX_VERSION_MISMATCH; } -#endif /* Windows */ -#ifdef RLIMIT_RSS - struct rlimit rss; - if (getrlimit(RLIMIT_RSS, &rss) == 0 && rss.rlim_cur < estimated_rss) { - rss.rlim_cur = estimated_rss; - if (rss.rlim_max < estimated_rss) - rss.rlim_max = estimated_rss; - if (setrlimit(RLIMIT_RSS, &rss)) { - rc = errno; - WARNING("setrlimit(%s, {%zu, %zu}) error %d", "RLIMIT_RSS", - (size_t)rss.rlim_cur, (size_t)rss.rlim_max, rc); - } + } + + pgno_t geo_upper = meta->geometry.upper; + 
uint64_t mapsize_max = geo_upper * (uint64_t)meta->pagesize; + STATIC_ASSERT(MIN_MAPSIZE < MAX_MAPSIZE); + if (unlikely(mapsize_max > MAX_MAPSIZE || + (MAX_PAGENO + 1) < + ceil_powerof2((size_t)mapsize_max, globals.sys_pagesize) / + (size_t)meta->pagesize)) { + if (mapsize_max > MAX_MAPSIZE64) { + WARNING("meta[%u] has invalid max-mapsize (%" PRIu64 "), skip it", + meta_number, mapsize_max); + return MDBX_VERSION_MISMATCH; } -#endif /* RLIMIT_RSS */ -#ifdef RLIMIT_MEMLOCK - if (flags & MDBX_warmup_lock) { - struct rlimit memlock; - if (getrlimit(RLIMIT_MEMLOCK, &memlock) == 0 && - memlock.rlim_cur < estimated_rss) { - memlock.rlim_cur = estimated_rss; - if (memlock.rlim_max < estimated_rss) - memlock.rlim_max = estimated_rss; - if (setrlimit(RLIMIT_MEMLOCK, &memlock)) { - rc = errno; - WARNING("setrlimit(%s, {%zu, %zu}) error %d", "RLIMIT_MEMLOCK", - (size_t)memlock.rlim_cur, (size_t)memlock.rlim_max, rc); - } - } + /* allow to open large DB from a 32-bit environment */ + eASSERT(env, meta->geometry.first_unallocated - 1 <= MAX_PAGENO && + used_bytes <= MAX_MAPSIZE); + WARNING("meta[%u] has too large max-mapsize (%" PRIu64 "), " + "but size of used space still acceptable (%" PRIu64 ")", + meta_number, mapsize_max, used_bytes); + geo_upper = (pgno_t)((mapsize_max = MAX_MAPSIZE) / meta->pagesize); + if (geo_upper > MAX_PAGENO + 1) { + geo_upper = MAX_PAGENO + 1; + mapsize_max = geo_upper * (uint64_t)meta->pagesize; } -#endif /* RLIMIT_MEMLOCK */ - (void)estimated_rss; + WARNING("meta[%u] consider get-%s pageno is %" PRIaPGNO + " instead of wrong %" PRIaPGNO + ", will be corrected on next commit(s)", + meta_number, "upper", geo_upper, meta->geometry.upper); + meta->geometry.upper = geo_upper; } -#if defined(MLOCK_ONFAULT) && \ - ((defined(_GNU_SOURCE) && __GLIBC_PREREQ(2, 27)) || \ - (defined(__ANDROID_API__) && __ANDROID_API__ >= 30)) && \ - (defined(__linux__) || defined(__gnu_linux__)) - if ((flags & MDBX_warmup_lock) != 0 && linux_kernel_version >= 0x04040000 && - 
atomic_load32(&env->me_mlocked_pgno, mo_AcquireRelease) < mlock_pgno) { - if (mlock2(env->me_map, used_range, MLOCK_ONFAULT)) { - rc = errno; - WARNING("mlock2(%zu, %s) error %d", used_range, "MLOCK_ONFAULT", rc); - } else { - update_mlcnt(env, mlock_pgno, true); - rc = MDBX_SUCCESS; + /* LY: check and silently put geometry.now into [geo.lower...geo.upper]. + * + * Copy-with-compaction by old version of libmdbx could produce DB-file + * less than meta.geo.lower bound, in case actual filling is low or no data + * at all. This is not a problem as there is no damage or loss of data. + * Therefore it is better not to consider such situation as an error, but + * silently correct it. */ + pgno_t geo_now = meta->geometry.now; + if (geo_now < geo_lower) + geo_now = geo_lower; + if (geo_now > geo_upper && meta->geometry.first_unallocated <= geo_upper) + geo_now = geo_upper; + + if (unlikely(meta->geometry.first_unallocated > geo_now)) { + WARNING("meta[%u] next-pageno (%" PRIaPGNO + ") is beyond end-pgno (%" PRIaPGNO "), skip it", + meta_number, meta->geometry.first_unallocated, geo_now); + return MDBX_CORRUPTED; + } + if (meta->geometry.now != geo_now) { + WARNING("meta[%u] consider geo-%s pageno is %" PRIaPGNO + " instead of wrong %" PRIaPGNO + ", will be corrected on next commit(s)", + meta_number, "now", geo_now, meta->geometry.now); + meta->geometry.now = geo_now; + } + + /* GC */ + if (meta->trees.gc.root == P_INVALID) { + if (unlikely(meta->trees.gc.branch_pages || meta->trees.gc.height || + meta->trees.gc.items || meta->trees.gc.leaf_pages || + meta->trees.gc.large_pages)) { + WARNING("meta[%u] has false-empty %s, skip it", meta_number, "GC"); + return MDBX_CORRUPTED; } - if (rc != EINVAL) - flags -= MDBX_warmup_lock; + } else if (unlikely(meta->trees.gc.root >= + meta->geometry.first_unallocated)) { + WARNING("meta[%u] has invalid %s-root %" PRIaPGNO ", skip it", meta_number, + "GC", meta->trees.gc.root); + return MDBX_CORRUPTED; } -#endif /* MLOCK_ONFAULT */ - int 
err = MDBX_ENOSYS; -#if MDBX_ENABLE_MADVISE - err = set_readahead(env, used_pgno, true, true); -#else -#if defined(_WIN32) || defined(_WIN64) - if (mdbx_PrefetchVirtualMemory) { - WIN32_MEMORY_RANGE_ENTRY hint; - hint.VirtualAddress = env->me_map; - hint.NumberOfBytes = used_range; - if (mdbx_PrefetchVirtualMemory(GetCurrentProcess(), 1, &hint, 0)) - err = MDBX_SUCCESS; - else { - err = (int)GetLastError(); - ERROR("%s(%zu) error %d", "PrefetchVirtualMemory", used_range, err); + /* MainDB */ + if (meta->trees.main.root == P_INVALID) { + if (unlikely(meta->trees.main.branch_pages || meta->trees.main.height || + meta->trees.main.items || meta->trees.main.leaf_pages || + meta->trees.main.large_pages)) { + WARNING("meta[%u] has false-empty %s", meta_number, "MainDB"); + return MDBX_CORRUPTED; } + } else if (unlikely(meta->trees.main.root >= + meta->geometry.first_unallocated)) { + WARNING("meta[%u] has invalid %s-root %" PRIaPGNO ", skip it", meta_number, + "MainDB", meta->trees.main.root); + return MDBX_CORRUPTED; } -#endif /* Windows */ -#if defined(POSIX_MADV_WILLNEED) - err = posix_madvise(env->me_map, used_range, POSIX_MADV_WILLNEED) - ? ignore_enosys(errno) - : MDBX_SUCCESS; -#elif defined(MADV_WILLNEED) - err = madvise(env->me_map, used_range, MADV_WILLNEED) ? ignore_enosys(errno) - : MDBX_SUCCESS; -#endif + if (unlikely(meta->trees.gc.mod_txnid > txnid)) { + WARNING("meta[%u] has wrong mod_txnid %" PRIaTXN " for %s, skip it", + meta_number, meta->trees.gc.mod_txnid, "GC"); + return MDBX_CORRUPTED; + } -#if defined(F_RDADVISE) - if (err) { - fcntl(env->me_lazy_fd, F_RDAHEAD, true); - struct radvisory hint; - hint.ra_offset = 0; - hint.ra_count = unlikely(used_range > INT_MAX && - sizeof(used_range) > sizeof(hint.ra_count)) - ? INT_MAX - : (int)used_range; - err = fcntl(env->me_lazy_fd, F_RDADVISE, &hint) ? 
ignore_enosys(errno) - : MDBX_SUCCESS; - if (err == ENOTTY) - err = MDBX_SUCCESS /* Ignore ENOTTY for DB on the ram-disk */; + if (unlikely(meta->trees.main.mod_txnid > txnid)) { + WARNING("meta[%u] has wrong mod_txnid %" PRIaTXN " for %s, skip it", + meta_number, meta->trees.main.mod_txnid, "MainDB"); + return MDBX_CORRUPTED; } -#endif /* F_RDADVISE */ -#endif /* MDBX_ENABLE_MADVISE */ - if (err != MDBX_SUCCESS && rc == MDBX_SUCCESS) - rc = err; - if ((flags & MDBX_warmup_force) != 0 && - (rc == MDBX_SUCCESS || rc == MDBX_ENOSYS)) { - const volatile uint8_t *ptr = env->me_map; - size_t offset = 0, unused = 42; -#if !(defined(_WIN32) || defined(_WIN64)) - if (flags & MDBX_warmup_oomsafe) { - const int null_fd = open("/dev/null", O_WRONLY); - if (unlikely(null_fd < 0)) - rc = errno; - else { - struct iovec iov[MDBX_AUXILARY_IOV_MAX]; - for (;;) { - unsigned i; - for (i = 0; i < MDBX_AUXILARY_IOV_MAX && offset < used_range; ++i) { - iov[i].iov_base = (void *)(ptr + offset); - iov[i].iov_len = 1; - offset += env->me_os_psize; - } - if (unlikely(writev(null_fd, iov, i) < 0)) { - rc = errno; - if (rc == EFAULT) - rc = ENOMEM; - break; - } - if (offset >= used_range) { - rc = MDBX_SUCCESS; - break; - } - if (timeout_seconds_16dot16 && osal_monotime() > timeout_monotime) { - rc = MDBX_RESULT_TRUE; - break; - } - } - close(null_fd); - } - } else -#endif /* Windows */ - for (;;) { - unused += ptr[offset]; - offset += env->me_os_psize; - if (offset >= used_range) { - rc = MDBX_SUCCESS; - break; - } - if (timeout_seconds_16dot16 && osal_monotime() > timeout_monotime) { - rc = MDBX_RESULT_TRUE; - break; - } - } - (void)unused; - } - - if ((flags & MDBX_warmup_lock) != 0 && - (rc == MDBX_SUCCESS || rc == MDBX_ENOSYS) && - atomic_load32(&env->me_mlocked_pgno, mo_AcquireRelease) < mlock_pgno) { -#if defined(_WIN32) || defined(_WIN64) - if (VirtualLock(env->me_map, used_range)) { - update_mlcnt(env, mlock_pgno, true); - rc = MDBX_SUCCESS; - } else { - rc = (int)GetLastError(); - 
WARNING("%s(%zu) error %d", "VirtualLock", used_range, rc); - } -#elif defined(_POSIX_MEMLOCK_RANGE) - if (mlock(env->me_map, used_range) == 0) { - update_mlcnt(env, mlock_pgno, true); - rc = MDBX_SUCCESS; - } else { - rc = errno; - WARNING("%s(%zu) error %d", "mlock", used_range, rc); - } -#else - rc = MDBX_ENOSYS; -#endif - } - - return rc; + return MDBX_SUCCESS; } -#if !defined(_WIN32) && !defined(_WIN64) -__cold static void rthc_afterfork(void) { - NOTICE("drown %d rthc entries", rthc_count); - for (size_t i = 0; i < rthc_count; ++i) { - MDBX_env *const env = rthc_table[i].env; - NOTICE("drown env %p", __Wpedantic_format_voidptr(env)); - if (env->me_lck_mmap.lck) - osal_munmap(&env->me_lck_mmap); - if (env->me_map) { - osal_munmap(&env->me_dxb_mmap); -#ifdef ENABLE_MEMCHECK - VALGRIND_DISCARD(env->me_valgrind_handle); - env->me_valgrind_handle = -1; -#endif /* ENABLE_MEMCHECK */ - } - env->me_lck = lckless_stub(env); - rthc_drown(env); - } - if (rthc_table != rthc_table_static) - osal_free(rthc_table); - rthc_count = 0; - rthc_table = rthc_table_static; - rthc_limit = RTHC_INITIAL_LIMIT; - rthc_pending.weak = 0; +__cold int meta_validate_copy(MDBX_env *env, const meta_t *meta, meta_t *dest) { + *dest = *meta; + return meta_validate(env, dest, data_page(meta), + bytes2pgno(env, ptr_dist(meta, env->dxb_mmap.base)), + nullptr); } -#endif /* ! 
Windows */ - -__cold void global_ctor(void) { - ENSURE(nullptr, osal_fastmutex_init(&debug_lock) == 0); - osal_ctor(); - rthc_limit = RTHC_INITIAL_LIMIT; - rthc_table = rthc_table_static; -#if defined(_WIN32) || defined(_WIN64) - InitializeCriticalSection(&rthc_critical_section); -#else - ENSURE(nullptr, pthread_atfork(nullptr, nullptr, rthc_afterfork) == 0); - ENSURE(nullptr, pthread_key_create(&rthc_key, thread_dtor) == 0); - TRACE("pid %d, &mdbx_rthc_key = %p, value 0x%x", osal_getpid(), - __Wpedantic_format_voidptr(&rthc_key), (unsigned)rthc_key); -#endif - /* checking time conversion, this also avoids racing on 32-bit architectures - * during storing calculated 64-bit ratio(s) into memory. */ - uint32_t proba = UINT32_MAX; - while (true) { - unsigned time_conversion_checkup = - osal_monotime_to_16dot16(osal_16dot16_to_monotime(proba)); - unsigned one_more = (proba < UINT32_MAX) ? proba + 1 : proba; - unsigned one_less = (proba > 0) ? proba - 1 : proba; - ENSURE(nullptr, time_conversion_checkup >= one_less && - time_conversion_checkup <= one_more); - if (proba == 0) - break; - proba >>= 1; - } - - bootid = osal_bootid(); - -#if MDBX_DEBUG - for (size_t i = 0; i < 2 * 2 * 2 * 3 * 3 * 3; ++i) { - const bool s0 = (i >> 0) & 1; - const bool s1 = (i >> 1) & 1; - const bool s2 = (i >> 2) & 1; - const uint8_t c01 = (i / (8 * 1)) % 3; - const uint8_t c02 = (i / (8 * 3)) % 3; - const uint8_t c12 = (i / (8 * 9)) % 3; - - const uint8_t packed = meta_cmp2pack(c01, c02, c12, s0, s1, s2); - meta_troika_t troika; - troika.fsm = (uint8_t)i; - meta_troika_unpack(&troika, packed); - - const uint8_t tail = TROIKA_TAIL(&troika); - const bool strict = TROIKA_STRICT_VALID(&troika); - const bool valid = TROIKA_VALID(&troika); - - const uint8_t recent_chk = meta_cmp2recent(c01, s0, s1) - ? (meta_cmp2recent(c02, s0, s2) ? 0 : 2) - : (meta_cmp2recent(c12, s1, s2) ? 1 : 2); - const uint8_t prefer_steady_chk = - meta_cmp2steady(c01, s0, s1) ? (meta_cmp2steady(c02, s0, s2) ? 
0 : 2) - : (meta_cmp2steady(c12, s1, s2) ? 1 : 2); - - uint8_t tail_chk; - if (recent_chk == 0) - tail_chk = meta_cmp2steady(c12, s1, s2) ? 2 : 1; - else if (recent_chk == 1) - tail_chk = meta_cmp2steady(c02, s0, s2) ? 2 : 0; - else - tail_chk = meta_cmp2steady(c01, s0, s1) ? 1 : 0; +/// \copyright SPDX-License-Identifier: Apache-2.0 +/// \author Леонид Юрьев aka Leonid Yuriev \date 2015-2024 - const bool valid_chk = - c01 != 1 || s0 != s1 || c02 != 1 || s0 != s2 || c12 != 1 || s1 != s2; - const bool strict_chk = (c01 != 1 || s0 != s1) && (c02 != 1 || s0 != s2) && - (c12 != 1 || s1 != s2); - assert(troika.recent == recent_chk); - assert(troika.prefer_steady == prefer_steady_chk); - assert(tail == tail_chk); - assert(valid == valid_chk); - assert(strict == strict_chk); - // printf(" %d, ", packed); - assert(troika_fsm_map[troika.fsm] == packed); - } -#endif /* MDBX_DEBUG*/ -#if 0 /* debug */ - for (size_t i = 0; i < 65536; ++i) { - size_t pages = pv2pages(i); - size_t x = pages2pv(pages); - size_t xp = pv2pages(x); - if (!(x == i || (x % 2 == 0 && x < 65536)) || pages != xp) - printf("%u => %zu => %u => %zu\n", i, pages, x, xp); - assert(pages == xp); - } - fflush(stdout); -#endif /* #if 0 */ -} +__cold int mdbx_is_readahead_reasonable(size_t volume, intptr_t redundancy) { + if (volume <= 1024 * 1024 * 4ul) + return MDBX_RESULT_TRUE; -/*------------------------------------------------------------------------------ - * Legacy API */ + intptr_t pagesize, total_ram_pages; + int err = mdbx_get_sysraminfo(&pagesize, &total_ram_pages, nullptr); + if (unlikely(err != MDBX_SUCCESS)) + return err; -#ifndef LIBMDBX_NO_EXPORTS_LEGACY_API + const int log2page = log2n_powerof2(pagesize); + const intptr_t volume_pages = (volume + pagesize - 1) >> log2page; + const intptr_t redundancy_pages = + (redundancy < 0) ? 
-(intptr_t)((-redundancy + pagesize - 1) >> log2page) + : (intptr_t)(redundancy + pagesize - 1) >> log2page; + if (volume_pages >= total_ram_pages || + volume_pages + redundancy_pages >= total_ram_pages) + return MDBX_RESULT_FALSE; -LIBMDBX_API int mdbx_txn_begin(MDBX_env *env, MDBX_txn *parent, - MDBX_txn_flags_t flags, MDBX_txn **ret) { - return __inline_mdbx_txn_begin(env, parent, flags, ret); -} + intptr_t avail_ram_pages; + err = mdbx_get_sysraminfo(nullptr, nullptr, &avail_ram_pages); + if (unlikely(err != MDBX_SUCCESS)) + return err; -LIBMDBX_API int mdbx_txn_commit(MDBX_txn *txn) { - return __inline_mdbx_txn_commit(txn); + return (volume_pages + redundancy_pages >= avail_ram_pages) + ? MDBX_RESULT_FALSE + : MDBX_RESULT_TRUE; } -LIBMDBX_API __cold int mdbx_env_stat(const MDBX_env *env, MDBX_stat *stat, - size_t bytes) { - return __inline_mdbx_env_stat(env, stat, bytes); -} +int mdbx_dbi_sequence(MDBX_txn *txn, MDBX_dbi dbi, uint64_t *result, + uint64_t increment) { + int rc = check_txn(txn, MDBX_TXN_BLOCKED); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; -LIBMDBX_API __cold int mdbx_env_info(const MDBX_env *env, MDBX_envinfo *info, - size_t bytes) { - return __inline_mdbx_env_info(env, info, bytes); -} + rc = dbi_check(txn, dbi); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; -LIBMDBX_API int mdbx_dbi_flags(const MDBX_txn *txn, MDBX_dbi dbi, - unsigned *flags) { - return __inline_mdbx_dbi_flags(txn, dbi, flags); -} + if (unlikely(txn->dbi_state[dbi] & DBI_STALE)) { + rc = sdb_fetch(txn, dbi); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; + } -LIBMDBX_API __cold int mdbx_env_sync(MDBX_env *env) { - return __inline_mdbx_env_sync(env); -} + tree_t *dbs = &txn->dbs[dbi]; + if (likely(result)) + *result = dbs->sequence; -LIBMDBX_API __cold int mdbx_env_sync_poll(MDBX_env *env) { - return __inline_mdbx_env_sync_poll(env); -} + if (likely(increment > 0)) { + if (unlikely(dbi == FREE_DBI || (txn->flags & MDBX_TXN_RDONLY) != 0)) + return MDBX_EACCESS; 
-LIBMDBX_API __cold int mdbx_env_close(MDBX_env *env) { - return __inline_mdbx_env_close(env); -} + uint64_t new = dbs->sequence + increment; + if (unlikely(new < increment)) + return MDBX_RESULT_TRUE; -LIBMDBX_API __cold int mdbx_env_set_mapsize(MDBX_env *env, size_t size) { - return __inline_mdbx_env_set_mapsize(env, size); -} + tASSERT(txn, new > dbs->sequence); + dbs->sequence = new; + txn->flags |= MDBX_TXN_DIRTY; + txn->dbi_state[dbi] |= DBI_DIRTY; + } -LIBMDBX_API __cold int mdbx_env_set_maxdbs(MDBX_env *env, MDBX_dbi dbs) { - return __inline_mdbx_env_set_maxdbs(env, dbs); + return MDBX_SUCCESS; } -LIBMDBX_API __cold int mdbx_env_get_maxdbs(const MDBX_env *env, MDBX_dbi *dbs) { - return __inline_mdbx_env_get_maxdbs(env, dbs); +int mdbx_cmp(const MDBX_txn *txn, MDBX_dbi dbi, const MDBX_val *a, + const MDBX_val *b) { + eASSERT(nullptr, txn->signature == txn_signature); + tASSERT(txn, (dbi_state(txn, dbi) & DBI_VALID) && !dbi_changed(txn, dbi)); + tASSERT(txn, + dbi < txn->env->n_dbi && (txn->env->dbs_flags[dbi] & DB_VALID) != 0); + return txn->env->kvs[dbi].clc.k.cmp(a, b); } -LIBMDBX_API __cold int mdbx_env_set_maxreaders(MDBX_env *env, - unsigned readers) { - return __inline_mdbx_env_set_maxreaders(env, readers); +int mdbx_dcmp(const MDBX_txn *txn, MDBX_dbi dbi, const MDBX_val *a, + const MDBX_val *b) { + eASSERT(nullptr, txn->signature == txn_signature); + tASSERT(txn, (dbi_state(txn, dbi) & DBI_VALID) && !dbi_changed(txn, dbi)); + tASSERT(txn, dbi < txn->env->n_dbi && (txn->env->dbs_flags[dbi] & DB_VALID)); + return txn->env->kvs[dbi].clc.v.cmp(a, b); } -LIBMDBX_API __cold int mdbx_env_get_maxreaders(const MDBX_env *env, - unsigned *readers) { - return __inline_mdbx_env_get_maxreaders(env, readers); +__cold MDBX_cmp_func *mdbx_get_keycmp(MDBX_db_flags_t flags) { + return builtin_keycmp(flags); } -LIBMDBX_API __cold int mdbx_env_set_syncbytes(MDBX_env *env, size_t threshold) { - return __inline_mdbx_env_set_syncbytes(env, threshold); +__cold MDBX_cmp_func 
*mdbx_get_datacmp(MDBX_db_flags_t flags) { + return builtin_datacmp(flags); } -LIBMDBX_API __cold int mdbx_env_get_syncbytes(const MDBX_env *env, - size_t *threshold) { - return __inline_mdbx_env_get_syncbytes(env, threshold); -} +/*----------------------------------------------------------------------------*/ -LIBMDBX_API __cold int mdbx_env_set_syncperiod(MDBX_env *env, - unsigned seconds_16dot16) { - return __inline_mdbx_env_set_syncperiod(env, seconds_16dot16); -} +__cold const char *mdbx_liberr2str(int errnum) { + /* Table of descriptions for MDBX errors */ + static const char *const tbl[] = { + "MDBX_KEYEXIST: Key/data pair already exists", + "MDBX_NOTFOUND: No matching key/data pair found", + "MDBX_PAGE_NOTFOUND: Requested page not found", + "MDBX_CORRUPTED: Database is corrupted", + "MDBX_PANIC: Environment had fatal error", + "MDBX_VERSION_MISMATCH: DB version mismatch libmdbx", + "MDBX_INVALID: File is not an MDBX file", + "MDBX_MAP_FULL: Environment mapsize limit reached", + "MDBX_DBS_FULL: Too many DBI-handles (maxdbs reached)", + "MDBX_READERS_FULL: Too many readers (maxreaders reached)", + nullptr /* MDBX_TLS_FULL (-30789): unused in MDBX */, + "MDBX_TXN_FULL: Transaction has too many dirty pages," + " i.e transaction is too big", + "MDBX_CURSOR_FULL: Cursor stack limit reachedn - this usually indicates" + " corruption, i.e branch-pages loop", + "MDBX_PAGE_FULL: Internal error - Page has no more space", + "MDBX_UNABLE_EXTEND_MAPSIZE: Database engine was unable to extend" + " mapping, e.g. since address space is unavailable or busy," + " or Operation system not supported such operations", + "MDBX_INCOMPATIBLE: Environment or database is not compatible" + " with the requested operation or the specified flags", + "MDBX_BAD_RSLOT: Invalid reuse of reader locktable slot," + " e.g. read-transaction already run for current thread", + "MDBX_BAD_TXN: Transaction is not valid for requested operation," + " e.g. 
had errored and be must aborted, has a child, or is invalid", + "MDBX_BAD_VALSIZE: Invalid size or alignment of key or data" + " for target database, either invalid subDB name", + "MDBX_BAD_DBI: The specified DBI-handle is invalid" + " or changed by another thread/transaction", + "MDBX_PROBLEM: Unexpected internal error, transaction should be aborted", + "MDBX_BUSY: Another write transaction is running," + " or environment is already used while opening with MDBX_EXCLUSIVE flag", + }; -LIBMDBX_API __cold int mdbx_env_get_syncperiod(const MDBX_env *env, - unsigned *seconds_16dot16) { - return __inline_mdbx_env_get_syncperiod(env, seconds_16dot16); + if (errnum >= MDBX_KEYEXIST && errnum <= MDBX_BUSY) { + int i = errnum - MDBX_KEYEXIST; + return tbl[i]; + } + + switch (errnum) { + case MDBX_SUCCESS: + return "MDBX_SUCCESS: Successful"; + case MDBX_EMULTIVAL: + return "MDBX_EMULTIVAL: The specified key has" + " more than one associated value"; + case MDBX_EBADSIGN: + return "MDBX_EBADSIGN: Wrong signature of a runtime object(s)," + " e.g. memory corruption or double-free"; + case MDBX_WANNA_RECOVERY: + return "MDBX_WANNA_RECOVERY: Database should be recovered," + " but this could NOT be done automatically for now" + " since it opened in read-only mode"; + case MDBX_EKEYMISMATCH: + return "MDBX_EKEYMISMATCH: The given key value is mismatched to the" + " current cursor position"; + case MDBX_TOO_LARGE: + return "MDBX_TOO_LARGE: Database is too large for current system," + " e.g. could NOT be mapped into RAM"; + case MDBX_THREAD_MISMATCH: + return "MDBX_THREAD_MISMATCH: A thread has attempted to use a not" + " owned object, e.g. 
a transaction that started by another thread"; + case MDBX_TXN_OVERLAPPING: + return "MDBX_TXN_OVERLAPPING: Overlapping read and write transactions for" + " the current thread"; + case MDBX_DUPLICATED_CLK: + return "MDBX_DUPLICATED_CLK: Alternative/Duplicate LCK-file is exists," + " please keep one and remove unused other"; + case MDBX_DANGLING_DBI: + return "MDBX_DANGLING_DBI: Some cursors and/or other resources should be" + " closed before subDb or corresponding DBI-handle could be (re)used"; + default: + return nullptr; + } } -LIBMDBX_API __cold MDBX_NOTHROW_CONST_FUNCTION intptr_t -mdbx_limits_pgsize_min(void) { - return __inline_mdbx_limits_pgsize_min(); +__cold const char *mdbx_strerror_r(int errnum, char *buf, size_t buflen) { + const char *msg = mdbx_liberr2str(errnum); + if (!msg && buflen > 0 && buflen < INT_MAX) { +#if defined(_WIN32) || defined(_WIN64) + DWORD size = FormatMessageA( + FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS, nullptr, + errnum, MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), buf, (DWORD)buflen, + nullptr); + while (size && buf[size - 1] <= ' ') + --size; + buf[size] = 0; + return size ? 
buf : "FormatMessageA(FORMAT_MESSAGE_FROM_SYSTEM) failed"; +#elif defined(_GNU_SOURCE) && defined(__GLIBC__) + /* GNU-specific */ + if (errnum > 0) + msg = strerror_r(errnum, buf, buflen); +#elif (_POSIX_C_SOURCE >= 200112L || _XOPEN_SOURCE >= 600) + /* XSI-compliant */ + if (errnum > 0 && strerror_r(errnum, buf, buflen) == 0) + msg = buf; +#else + if (errnum > 0) { + msg = strerror(errnum); + if (msg) { + strncpy(buf, msg, buflen); + msg = buf; + } + } +#endif + if (!msg) { + (void)snprintf(buf, buflen, "error %d", errnum); + msg = buf; + } + buf[buflen - 1] = '\0'; + } + return msg; } -LIBMDBX_API __cold MDBX_NOTHROW_CONST_FUNCTION intptr_t -mdbx_limits_pgsize_max(void) { - return __inline_mdbx_limits_pgsize_max(); +__cold const char *mdbx_strerror(int errnum) { +#if defined(_WIN32) || defined(_WIN64) + static char buf[1024]; + return mdbx_strerror_r(errnum, buf, sizeof(buf)); +#else + const char *msg = mdbx_liberr2str(errnum); + if (!msg) { + if (errnum > 0) + msg = strerror(errnum); + if (!msg) { + static char buf[32]; + (void)snprintf(buf, sizeof(buf) - 1, "error %d", errnum); + msg = buf; + } + } + return msg; +#endif } -LIBMDBX_API MDBX_NOTHROW_CONST_FUNCTION uint64_t -mdbx_key_from_int64(const int64_t i64) { - return __inline_mdbx_key_from_int64(i64); +#if defined(_WIN32) || defined(_WIN64) /* Bit of madness for Windows */ +const char *mdbx_strerror_r_ANSI2OEM(int errnum, char *buf, size_t buflen) { + const char *msg = mdbx_liberr2str(errnum); + if (!msg && buflen > 0 && buflen < INT_MAX) { + DWORD size = FormatMessageA( + FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS, nullptr, + errnum, MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), buf, (DWORD)buflen, + nullptr); + while (size && buf[size - 1] <= ' ') + --size; + buf[size] = 0; + if (!size) + msg = "FormatMessageA(FORMAT_MESSAGE_FROM_SYSTEM) failed"; + else if (!CharToOemBuffA(buf, buf, size)) + msg = "CharToOemBuffA() failed"; + else + msg = buf; + } + return msg; } -LIBMDBX_API 
MDBX_NOTHROW_CONST_FUNCTION uint32_t -mdbx_key_from_int32(const int32_t i32) { - return __inline_mdbx_key_from_int32(i32); +const char *mdbx_strerror_ANSI2OEM(int errnum) { + static char buf[1024]; + return mdbx_strerror_r_ANSI2OEM(errnum, buf, sizeof(buf)); } +#endif /* Bit of madness for Windows */ +/// \copyright SPDX-License-Identifier: Apache-2.0 +/// \author Леонид Юрьев aka Leonid Yuriev \date 2015-2024 -#endif /* LIBMDBX_NO_EXPORTS_LEGACY_API */ -/*------------------------------------------------------------------------------ - * Locking API */ +bsr_t mvcc_bind_slot(MDBX_env *env, const uintptr_t tid) { + eASSERT(env, env->lck_mmap.lck); + eASSERT(env, env->lck->magic_and_version == MDBX_LOCK_MAGIC); + eASSERT(env, env->lck->os_and_format == MDBX_LOCK_FORMAT); -int mdbx_txn_lock(MDBX_env *env, bool dont_wait) { - int rc = check_env(env, true); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; + bsr_t result = {lck_rdt_lock(env), nullptr}; + if (unlikely(MDBX_IS_ERROR(result.err))) + return result; + if (unlikely(env->flags & ENV_FATAL_ERROR)) { + lck_rdt_unlock(env); + result.err = MDBX_PANIC; + return result; + } + if (unlikely(!env->dxb_mmap.base)) { + lck_rdt_unlock(env); + result.err = MDBX_EPERM; + return result; + } - if (unlikely(env->me_flags & MDBX_RDONLY)) - return MDBX_EACCESS; - if (unlikely(env->me_txn0->mt_owner || - (env->me_txn0->mt_flags & MDBX_TXN_FINISHED) == 0)) - return MDBX_BUSY; + if (unlikely(env->registered_reader_pid != env->pid)) { + result.err = lck_rpid_set(env); + if (unlikely(result.err != MDBX_SUCCESS)) { + lck_rdt_unlock(env); + return result; + } + env->registered_reader_pid = env->pid; + } - return osal_txn_lock(env, dont_wait); -} + result.err = MDBX_SUCCESS; + size_t slot, nreaders; + while (1) { + nreaders = env->lck->rdt_length.weak; + for (slot = 0; slot < nreaders; slot++) + if (!atomic_load32(&env->lck->rdt[slot].pid, mo_AcquireRelease)) + break; -int mdbx_txn_unlock(MDBX_env *env) { - int rc = check_env(env, true); 
- if (unlikely(rc != MDBX_SUCCESS)) - return rc; + if (likely(slot < env->max_readers)) + break; - if (unlikely(env->me_flags & MDBX_RDONLY)) - return MDBX_EACCESS; - if (unlikely(env->me_txn0->mt_owner != osal_thread_self())) - return MDBX_THREAD_MISMATCH; - if (unlikely((env->me_txn0->mt_flags & MDBX_TXN_FINISHED) == 0)) - return MDBX_BUSY; + result.err = mvcc_cleanup_dead(env, true, nullptr); + if (result.err != MDBX_RESULT_TRUE) { + lck_rdt_unlock(env); + result.err = + (result.err == MDBX_SUCCESS) ? MDBX_READERS_FULL : result.err; + return result; + } + } - osal_txn_unlock(env); - return MDBX_SUCCESS; -} + result.rslot = &env->lck->rdt[slot]; + /* Claim the reader slot, carefully since other code + * uses the reader table un-mutexed: First reset the + * slot, next publish it in lck->rdt_length. After + * that, it is safe for mdbx_env_close() to touch it. + * When it will be closed, we can finally claim it. */ + atomic_store32(&result.rslot->pid, 0, mo_AcquireRelease); + safe64_reset(&result.rslot->txnid, true); + if (slot == nreaders) + env->lck->rdt_length.weak = (uint32_t)++nreaders; + result.rslot->tid.weak = (env->flags & MDBX_NOSTICKYTHREADS) ? 
0 : tid; + atomic_store32(&result.rslot->pid, env->pid, mo_AcquireRelease); + lck_rdt_unlock(env); -/******************************************************************************* - * Checking API */ + if (likely(env->flags & ENV_TXKEY)) { + eASSERT(env, env->registered_reader_pid == env->pid); + thread_rthc_set(env->me_txkey, result.rslot); + } + return result; +} -typedef struct MDBX_chk_internal { - MDBX_chk_context_t *usr; - const struct MDBX_chk_callbacks *cb; - uint64_t monotime_timeout; +__hot txnid_t mvcc_shapshot_oldest(MDBX_env *const env, const txnid_t steady) { + const uint32_t nothing_changed = MDBX_STRING_TETRAD("None"); + eASSERT(env, steady <= env->basal_txn->txnid); - size_t *problem_counter; - uint8_t flags; - bool got_break; - bool write_locked; - uint8_t scope_depth; + lck_t *const lck = env->lck_mmap.lck; + if (unlikely(lck == nullptr /* exclusive without-lck mode */)) { + eASSERT(env, env->lck == lckless_stub(env)); + env->lck->rdt_refresh_flag.weak = nothing_changed; + return env->lck->cached_oldest.weak = steady; + } - MDBX_chk_subdb_t subdb_gc, subdb_main; - int16_t *pagemap; - MDBX_chk_subdb_t *last_lookup; - const void *last_nested; - MDBX_chk_scope_t scope_stack[12]; - MDBX_chk_subdb_t *subdb[MDBX_MAX_DBI + CORE_DBS]; + const txnid_t prev_oldest = + atomic_load64(&lck->cached_oldest, mo_AcquireRelease); + eASSERT(env, steady >= prev_oldest); - MDBX_envinfo envinfo; - meta_troika_t troika; - MDBX_val v2a_buf; -} MDBX_chk_internal_t; + txnid_t new_oldest = prev_oldest; + while (nothing_changed != + atomic_load32(&lck->rdt_refresh_flag, mo_AcquireRelease)) { + lck->rdt_refresh_flag.weak = nothing_changed; + jitter4testing(false); + const size_t snap_nreaders = + atomic_load32(&lck->rdt_length, mo_AcquireRelease); + new_oldest = steady; -__cold static int chk_check_break(MDBX_chk_scope_t *const scope) { - MDBX_chk_internal_t *const chk = scope->internal; - return (chk->got_break || (chk->cb->check_break && - (chk->got_break = 
chk->cb->check_break(chk->usr)))) - ? MDBX_RESULT_TRUE - : MDBX_RESULT_FALSE; -} + for (size_t i = 0; i < snap_nreaders; ++i) { + const uint32_t pid = atomic_load32(&lck->rdt[i].pid, mo_AcquireRelease); + if (!pid) + continue; + jitter4testing(true); -__cold static void chk_line_end(MDBX_chk_line_t *line) { - if (likely(line)) { - MDBX_chk_internal_t *chk = line->ctx->internal; - assert(line->begin <= line->end && line->begin <= line->out && - line->out <= line->end); - if (likely(chk->cb->print_done)) - chk->cb->print_done(line); - } -} + const txnid_t rtxn = safe64_read(&lck->rdt[i].txnid); + if (unlikely(rtxn < prev_oldest)) { + if (unlikely(nothing_changed == atomic_load32(&lck->rdt_refresh_flag, + mo_AcquireRelease)) && + safe64_reset_compare(&lck->rdt[i].txnid, rtxn)) { + NOTICE("kick stuck reader[%zu of %zu].pid_%u %" PRIaTXN + " < prev-oldest %" PRIaTXN ", steady-txn %" PRIaTXN, + i, snap_nreaders, pid, rtxn, prev_oldest, steady); + } + continue; + } -__cold __must_check_result static MDBX_chk_line_t * -chk_line_begin(MDBX_chk_scope_t *const scope, enum MDBX_chk_severity severity) { - MDBX_chk_internal_t *const chk = scope->internal; - if (severity < MDBX_chk_warning) - mdbx_env_chk_encount_problem(chk->usr); - MDBX_chk_line_t *line = nullptr; - if (likely(chk->cb->print_begin)) { - line = chk->cb->print_begin(chk->usr, severity); - if (likely(line)) { - assert(line->ctx == nullptr || (line->ctx == chk->usr && line->empty)); - assert(line->begin <= line->end && line->begin <= line->out && - line->out <= line->end); - line->ctx = chk->usr; + if (rtxn < new_oldest) { + new_oldest = rtxn; + if (!MDBX_DEBUG && !MDBX_FORCE_ASSERTIONS && new_oldest == prev_oldest) + break; + } } } - return line; -} -__cold static MDBX_chk_line_t *chk_line_feed(MDBX_chk_line_t *line) { - if (likely(line)) { - MDBX_chk_internal_t *chk = line->ctx->internal; - enum MDBX_chk_severity severity = line->severity; - chk_line_end(line); - line = chk_line_begin(chk->usr->scope, severity); 
+ if (new_oldest != prev_oldest) { + VERBOSE("update oldest %" PRIaTXN " -> %" PRIaTXN, prev_oldest, new_oldest); + eASSERT(env, new_oldest >= lck->cached_oldest.weak); + atomic_store64(&lck->cached_oldest, new_oldest, mo_Relaxed); } - return line; + return new_oldest; } -__cold static MDBX_chk_line_t *chk_flush(MDBX_chk_line_t *line) { - if (likely(line)) { - MDBX_chk_internal_t *chk = line->ctx->internal; - assert(line->begin <= line->end && line->begin <= line->out && - line->out <= line->end); - if (likely(chk->cb->print_flush)) { - chk->cb->print_flush(line); - assert(line->begin <= line->end && line->begin <= line->out && - line->out <= line->end); - line->out = line->begin; +pgno_t mvcc_snapshot_largest(const MDBX_env *env, pgno_t last_used_page) { + lck_t *const lck = env->lck_mmap.lck; + if (likely(lck != nullptr /* check for exclusive without-lck mode */)) { + retry:; + const size_t snap_nreaders = + atomic_load32(&lck->rdt_length, mo_AcquireRelease); + for (size_t i = 0; i < snap_nreaders; ++i) { + if (atomic_load32(&lck->rdt[i].pid, mo_AcquireRelease)) { + /* jitter4testing(true); */ + const pgno_t snap_pages = + atomic_load32(&lck->rdt[i].snapshot_pages_used, mo_Relaxed); + const txnid_t snap_txnid = safe64_read(&lck->rdt[i].txnid); + if (unlikely(snap_pages != + atomic_load32(&lck->rdt[i].snapshot_pages_used, + mo_AcquireRelease) || + snap_txnid != safe64_read(&lck->rdt[i].txnid))) + goto retry; + if (last_used_page < snap_pages && snap_txnid <= env->basal_txn->txnid) + last_used_page = snap_pages; + } } } - return line; + + return last_used_page; } -__cold static size_t chk_print_wanna(MDBX_chk_line_t *line, size_t need) { - if (likely(line && need)) { - size_t have = line->end - line->out; - assert(line->begin <= line->end && line->begin <= line->out && - line->out <= line->end); - if (need > have) { - line = chk_flush(line); - have = line->end - line->out; +/* Find largest mvcc-snapshot still referenced by this process. 
*/ +pgno_t mvcc_largest_this(MDBX_env *env, pgno_t largest) { + lck_t *const lck = env->lck_mmap.lck; + if (likely(lck != nullptr /* exclusive mode */)) { + const size_t snap_nreaders = + atomic_load32(&lck->rdt_length, mo_AcquireRelease); + for (size_t i = 0; i < snap_nreaders; ++i) { + retry: + if (atomic_load32(&lck->rdt[i].pid, mo_AcquireRelease) == env->pid) { + /* jitter4testing(true); */ + const pgno_t snap_pages = + atomic_load32(&lck->rdt[i].snapshot_pages_used, mo_Relaxed); + const txnid_t snap_txnid = safe64_read(&lck->rdt[i].txnid); + if (unlikely(snap_pages != + atomic_load32(&lck->rdt[i].snapshot_pages_used, + mo_AcquireRelease) || + snap_txnid != safe64_read(&lck->rdt[i].txnid))) + goto retry; + if (largest < snap_pages && + atomic_load64(&lck->cached_oldest, mo_AcquireRelease) <= + /* ignore pending updates */ snap_txnid && + snap_txnid <= MAX_TXNID) + largest = snap_pages; + } } - return (need < have) ? need : have; } - return 0; + return largest; } -__cold static MDBX_chk_line_t *chk_puts(MDBX_chk_line_t *line, - const char *str) { - if (likely(line && str && *str)) { - MDBX_chk_internal_t *chk = line->ctx->internal; - size_t left = strlen(str); - assert(line->begin <= line->end && line->begin <= line->out && - line->out <= line->end); - if (chk->cb->print_chars) { - chk->cb->print_chars(line, str, left); - assert(line->begin <= line->end && line->begin <= line->out && - line->out <= line->end); - } else - do { - size_t chunk = chk_print_wanna(line, left); - assert(chunk <= left); - if (unlikely(!chunk)) - break; - memcpy(line->out, str, chunk); - line->out += chunk; - assert(line->begin <= line->end && line->begin <= line->out && - line->out <= line->end); - str += chunk; - left -= chunk; - } while (left); - line->empty = false; - } - return line; -} +static bool pid_insert(uint32_t *list, uint32_t pid) { + /* binary search of pid in list */ + size_t base = 0; + size_t cursor = 1; + int32_t val = 0; + size_t n = /* length */ list[0]; -__cold 
static MDBX_chk_line_t *chk_print_va(MDBX_chk_line_t *line, - const char *fmt, va_list args) { - if (likely(line)) { - MDBX_chk_internal_t *chk = line->ctx->internal; - assert(line->begin <= line->end && line->begin <= line->out && - line->out <= line->end); - if (chk->cb->print_format) { - chk->cb->print_format(line, fmt, args); - assert(line->begin <= line->end && line->begin <= line->out && - line->out <= line->end); + while (n > 0) { + size_t pivot = n >> 1; + cursor = base + pivot + 1; + val = pid - list[cursor]; + + if (val < 0) { + n = pivot; + } else if (val > 0) { + base = cursor; + n -= pivot + 1; } else { - va_list ones; - va_copy(ones, args); - const int needed = vsnprintf(nullptr, 0, fmt, ones); - va_end(ones); - if (likely(needed > 0)) { - const size_t have = chk_print_wanna(line, needed); - if (likely(have > 0)) { - int written = vsnprintf(line->out, have, fmt, args); - if (likely(written > 0)) - line->out += written; - assert(line->begin <= line->end && line->begin <= line->out && - line->out <= line->end); - } - } + /* found, so it's a duplicate */ + return false; } - line->empty = false; } - return line; -} -__cold static MDBX_chk_line_t *MDBX_PRINTF_ARGS(2, 3) - chk_print(MDBX_chk_line_t *line, const char *fmt, ...) { - if (likely(line)) { - // MDBX_chk_internal_t *chk = line->ctx->internal; - va_list args; - va_start(args, fmt); - line = chk_print_va(line, fmt, args); - va_end(args); - line->empty = false; - } - return line; -} + if (val > 0) + ++cursor; -__cold static MDBX_chk_line_t *chk_print_size(MDBX_chk_line_t *line, - const char *prefix, - const uint64_t value, - const char *suffix) { - static const char sf[] = - "KMGTPEZY"; /* LY: Kilo, Mega, Giga, Tera, Peta, Exa, Zetta, Yotta! */ - if (likely(line)) { - MDBX_chk_internal_t *chk = line->ctx->internal; - prefix = prefix ? prefix : ""; - suffix = suffix ? 
suffix : ""; - if (chk->cb->print_size) - chk->cb->print_size(line, prefix, value, suffix); - else - for (unsigned i = 0;; ++i) { - const unsigned scale = 10 + i * 10; - const uint64_t rounded = value + (UINT64_C(5) << (scale - 10)); - const uint64_t integer = rounded >> scale; - const uint64_t fractional = - (rounded - (integer << scale)) * 100u >> scale; - if ((rounded >> scale) <= 1000) - return chk_print(line, "%s%" PRIu64 " (%u.%02u %ciB)%s", prefix, - value, (unsigned)integer, (unsigned)fractional, - sf[i], suffix); - } - line->empty = false; - } - return line; + list[0]++; + for (n = list[0]; n > cursor; n--) + list[n] = list[n - 1]; + list[n] = pid; + return true; } -__cold static int chk_error_rc(MDBX_chk_scope_t *const scope, int err, - const char *subj) { - MDBX_chk_line_t *line = chk_line_begin(scope, MDBX_chk_error); - if (line) - chk_line_end(chk_flush(chk_print(line, "%s() failed, error %s (%d)", subj, - mdbx_strerror(err), err))); - else - debug_log(MDBX_LOG_ERROR, "mdbx_env_chk", 0, "%s() failed, error %s (%d)", - subj, mdbx_strerror(err), err); - return err; -} +__cold MDBX_INTERNAL int mvcc_cleanup_dead(MDBX_env *env, int rdt_locked, + int *dead) { + int rc = check_env(env, true); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; -__cold static void MDBX_PRINTF_ARGS(5, 6) - chk_object_issue(MDBX_chk_scope_t *const scope, const char *object, - uint64_t entry_number, const char *caption, - const char *extra_fmt, ...) 
{ - MDBX_chk_internal_t *const chk = scope->internal; - MDBX_chk_issue_t *issue = chk->usr->scope->issues; - while (issue) { - if (issue->caption == caption) { - issue->count += 1; - break; - } else - issue = issue->next; - } - const bool fresh = issue == nullptr; - if (fresh) { - issue = osal_malloc(sizeof(*issue)); - if (likely(issue)) { - issue->caption = caption; - issue->count = 1; - issue->next = chk->usr->scope->issues; - chk->usr->scope->issues = issue; - } else - chk_error_rc(scope, ENOMEM, "adding issue"); + eASSERT(env, rdt_locked >= 0); + lck_t *const lck = env->lck_mmap.lck; + if (unlikely(lck == nullptr)) { + /* exclusive mode */ + if (dead) + *dead = 0; + return MDBX_SUCCESS; } - va_list args; - va_start(args, extra_fmt); - if (chk->cb->issue) { - mdbx_env_chk_encount_problem(chk->usr); - chk->cb->issue(chk->usr, object, entry_number, caption, extra_fmt, args); - } else { - MDBX_chk_line_t *line = chk_line_begin(scope, MDBX_chk_error); - if (entry_number != UINT64_MAX) - chk_print(line, "%s #%" PRIu64 ": %s", object, entry_number, caption); - else - chk_print(line, "%s: %s", object, caption); - if (extra_fmt) - chk_puts(chk_print_va(chk_puts(line, " ("), extra_fmt, args), ")"); - chk_line_end(fresh ? chk_flush(line) : line); - } - va_end(args); -} + const size_t snap_nreaders = + atomic_load32(&lck->rdt_length, mo_AcquireRelease); + uint32_t pidsbuf_onstask[142]; + uint32_t *const pids = + (snap_nreaders < ARRAY_LENGTH(pidsbuf_onstask)) + ? pidsbuf_onstask + : osal_malloc((snap_nreaders + 1) * sizeof(uint32_t)); + if (unlikely(!pids)) + return MDBX_ENOMEM; -__cold static void MDBX_PRINTF_ARGS(2, 3) - chk_scope_issue(MDBX_chk_scope_t *const scope, const char *fmt, ...) 
{ - MDBX_chk_internal_t *const chk = scope->internal; - va_list args; - va_start(args, fmt); - if (likely(chk->cb->issue)) { - mdbx_env_chk_encount_problem(chk->usr); - chk->cb->issue(chk->usr, nullptr, 0, nullptr, fmt, args); - } else - chk_line_end( - chk_print_va(chk_line_begin(scope, MDBX_chk_error), fmt, args)); - va_end(args); -} + pids[0] = 0; + int count = 0; + for (size_t i = 0; i < snap_nreaders; i++) { + const uint32_t pid = atomic_load32(&lck->rdt[i].pid, mo_AcquireRelease); + if (pid == 0) + continue /* skip empty */; + if (pid == env->pid) + continue /* skip self */; + if (!pid_insert(pids, pid)) + continue /* such pid already processed */; -__cold static int chk_scope_end(MDBX_chk_internal_t *chk, int err) { - assert(chk->scope_depth > 0); - MDBX_chk_scope_t *const inner = chk->scope_stack + chk->scope_depth; - MDBX_chk_scope_t *const outer = chk->scope_depth ? inner - 1 : nullptr; - if (!outer || outer->stage != inner->stage) { - if (err == MDBX_SUCCESS && *chk->problem_counter) - err = MDBX_PROBLEM; - else if (*chk->problem_counter == 0 && MDBX_IS_ERROR(err)) - *chk->problem_counter = 1; - if (chk->problem_counter != &chk->usr->result.total_problems) { - chk->usr->result.total_problems += *chk->problem_counter; - chk->problem_counter = &chk->usr->result.total_problems; + int err = lck_rpid_check(env, pid); + if (err == MDBX_RESULT_TRUE) + continue /* reader is live */; + + if (err != MDBX_SUCCESS) { + rc = err; + break /* lck_rpid_check() failed */; } - if (chk->cb->stage_end) - err = chk->cb->stage_end(chk->usr, inner->stage, err); - } - if (chk->cb->scope_conclude) - err = chk->cb->scope_conclude(chk->usr, outer, inner, err); - chk->usr->scope = outer; - chk->usr->scope_nesting = chk->scope_depth -= 1; - if (outer) - outer->subtotal_issues += inner->subtotal_issues; - if (chk->cb->scope_pop) - chk->cb->scope_pop(chk->usr, outer, inner); - while (inner->issues) { - MDBX_chk_issue_t *next = inner->issues->next; - osal_free(inner->issues); - 
inner->issues = next; - } - memset(inner, -1, sizeof(*inner)); - return err; -} + /* stale reader found */ + if (!rdt_locked) { + err = lck_rdt_lock(env); + if (MDBX_IS_ERROR(err)) { + rc = err; + break; + } -__cold static int chk_scope_begin_args(MDBX_chk_internal_t *chk, - int verbosity_adjustment, - enum MDBX_chk_stage stage, - const void *object, size_t *problems, - const char *fmt, va_list args) { - if (unlikely(chk->scope_depth + 1u >= ARRAY_LENGTH(chk->scope_stack))) - return MDBX_BACKLOG_DEPLETED; + rdt_locked = -1; + if (err == MDBX_RESULT_TRUE) { + /* mutex recovered, the mdbx_ipclock_failed() checked all readers */ + rc = MDBX_RESULT_TRUE; + break; + } - MDBX_chk_scope_t *const outer = chk->scope_stack + chk->scope_depth; - const int verbosity = - outer->verbosity + - (verbosity_adjustment - 1) * (1 << MDBX_chk_severity_prio_shift); - MDBX_chk_scope_t *const inner = outer + 1; - memset(inner, 0, sizeof(*inner)); - inner->internal = outer->internal; - inner->stage = stage ? stage : (stage = outer->stage); - inner->object = object; - inner->verbosity = (verbosity < MDBX_chk_warning) - ? MDBX_chk_warning - : (enum MDBX_chk_severity)verbosity; - if (problems) - chk->problem_counter = problems; - else if (!chk->problem_counter || outer->stage != stage) - chk->problem_counter = &chk->usr->result.total_problems; + /* a other process may have clean and reused slot, recheck */ + if (lck->rdt[i].pid.weak != pid) + continue; - if (chk->cb->scope_push) { - const int err = chk->cb->scope_push(chk->usr, outer, inner, fmt, args); - if (unlikely(err != MDBX_SUCCESS)) - return err; - } - chk->usr->scope = inner; - chk->usr->scope_nesting = chk->scope_depth += 1; + err = lck_rpid_check(env, pid); + if (MDBX_IS_ERROR(err)) { + rc = err; + break; + } - if (stage != outer->stage && chk->cb->stage_begin) { - int err = chk->cb->stage_begin(chk->usr, stage); - if (unlikely(err != MDBX_SUCCESS)) { - err = chk_scope_end(chk, err); - assert(err != MDBX_SUCCESS); - return err ? 
err : MDBX_RESULT_TRUE; + if (err != MDBX_SUCCESS) + continue /* the race with other process, slot reused */; + } + + /* clean it */ + for (size_t ii = i; ii < snap_nreaders; ii++) { + if (lck->rdt[ii].pid.weak == pid) { + DEBUG("clear stale reader pid %" PRIuPTR " txn %" PRIaTXN, (size_t)pid, + lck->rdt[ii].txnid.weak); + atomic_store32(&lck->rdt[ii].pid, 0, mo_Relaxed); + atomic_store32(&lck->rdt_refresh_flag, true, mo_AcquireRelease); + count++; + } } } - return MDBX_SUCCESS; -} -__cold static int MDBX_PRINTF_ARGS(6, 7) - chk_scope_begin(MDBX_chk_internal_t *chk, int verbosity_adjustment, - enum MDBX_chk_stage stage, const void *object, - size_t *problems, const char *fmt, ...) { - va_list args; - va_start(args, fmt); - int rc = chk_scope_begin_args(chk, verbosity_adjustment, stage, object, - problems, fmt, args); - va_end(args); + if (likely(!MDBX_IS_ERROR(rc))) + atomic_store64(&lck->readers_check_timestamp, osal_monotime(), mo_Relaxed); + + if (rdt_locked < 0) + lck_rdt_unlock(env); + + if (pids != pidsbuf_onstask) + osal_free(pids); + + if (dead) + *dead = count; return rc; } -__cold static int chk_scope_restore(MDBX_chk_scope_t *const target, int err) { - MDBX_chk_internal_t *const chk = target->internal; - assert(target <= chk->usr->scope); - while (chk->usr->scope > target) - err = chk_scope_end(chk, err); - return err; -} +__cold txnid_t mvcc_kick_laggards(MDBX_env *env, const txnid_t straggler) { + DEBUG("DB size maxed out by reading #%" PRIaTXN, straggler); + osal_memory_fence(mo_AcquireRelease, false); + MDBX_hsr_func *const callback = env->hsr_callback; + txnid_t oldest = 0; + bool notify_eof_of_loop = false; + int retry = 0; + do { + const txnid_t steady = + env->txn->tw.troika.txnid[env->txn->tw.troika.prefer_steady]; + env->lck->rdt_refresh_flag.weak = /* force refresh */ true; + oldest = mvcc_shapshot_oldest(env, steady); + eASSERT(env, oldest < env->basal_txn->txnid); + eASSERT(env, oldest >= straggler); + eASSERT(env, oldest >= 
env->lck->cached_oldest.weak); + + lck_t *const lck = env->lck_mmap.lck; + if (oldest == steady || oldest > straggler || /* without-LCK mode */ !lck) + break; -__cold void chk_scope_pop(MDBX_chk_scope_t *const inner) { - if (inner && inner > inner->internal->scope_stack) - chk_scope_restore(inner - 1, MDBX_SUCCESS); -} + if (MDBX_IS_ERROR(mvcc_cleanup_dead(env, false, nullptr))) + break; -__cold static MDBX_chk_scope_t *MDBX_PRINTF_ARGS(3, 4) - chk_scope_push(MDBX_chk_scope_t *const scope, int verbosity_adjustment, - const char *fmt, ...) { - chk_scope_restore(scope, MDBX_SUCCESS); - va_list args; - va_start(args, fmt); - int err = chk_scope_begin_args(scope->internal, verbosity_adjustment, - scope->stage, nullptr, nullptr, fmt, args); - va_end(args); - return err ? nullptr : scope + 1; -} + if (!callback) + break; -__cold static const char *chk_v2a(MDBX_chk_internal_t *chk, - const MDBX_val *val) { - if (val == MDBX_CHK_MAIN) - return "@MAIN"; - if (val == MDBX_CHK_GC) - return "@GC"; - if (val == MDBX_CHK_META) - return "@META"; + reader_slot_t *stucked = nullptr; + uint64_t hold_retired = 0; + for (size_t i = 0; i < lck->rdt_length.weak; ++i) { + const uint64_t snap_retired = + atomic_load64(&lck->rdt[i].snapshot_pages_retired, mo_Relaxed); + const txnid_t rtxn = safe64_read(&lck->rdt[i].txnid); + if (rtxn == straggler && + atomic_load32(&lck->rdt[i].pid, mo_AcquireRelease)) { + hold_retired = snap_retired; + stucked = &lck->rdt[i]; + } + } - const unsigned char *const data = val->iov_base; - const size_t len = val->iov_len; - if (data == MDBX_CHK_MAIN) - return "@MAIN"; - if (data == MDBX_CHK_GC) - return "@GC"; - if (data == MDBX_CHK_META) - return "@META"; + if (!stucked) + break; - if (!len) - return ""; - if (!data) - return ""; - if (len > 65536) { - const size_t enough = 42; - if (chk->v2a_buf.iov_len < enough) { - void *ptr = osal_realloc(chk->v2a_buf.iov_base, enough); - if (unlikely(!ptr)) - return ""; - chk->v2a_buf.iov_base = ptr; - 
chk->v2a_buf.iov_len = enough; + uint32_t pid = atomic_load32(&stucked->pid, mo_AcquireRelease); + uint64_t tid = atomic_load64(&stucked->tid, mo_AcquireRelease); + if (safe64_read(&stucked->txnid) != straggler || !pid || + stucked->snapshot_pages_retired.weak != hold_retired) + continue; + + const meta_ptr_t head = meta_recent(env, &env->txn->tw.troika); + const txnid_t gap = (head.txnid - straggler) / xMDBX_TXNID_STEP; + const uint64_t head_retired = + unaligned_peek_u64(4, head.ptr_c->pages_retired); + const size_t space = + (head_retired > hold_retired) + ? pgno2bytes(env, (pgno_t)(head_retired - hold_retired)) + : 0; + int rc = + callback(env, env->txn, pid, (mdbx_tid_t)((intptr_t)tid), straggler, + (gap < UINT_MAX) ? (unsigned)gap : UINT_MAX, space, retry); + if (rc < 0) + /* hsr returned error and/or agree MDBX_MAP_FULL error */ + break; + + if (rc > 0) { + if (rc == 1) { + /* hsr reported transaction (will be) aborted asynchronous */ + safe64_reset_compare(&stucked->txnid, straggler); + } else { + /* hsr reported reader process was killed and slot should be cleared */ + safe64_reset(&stucked->txnid, true); + atomic_store64(&stucked->tid, 0, mo_Relaxed); + atomic_store32(&stucked->pid, 0, mo_AcquireRelease); + } + } else if (!notify_eof_of_loop) { +#if MDBX_ENABLE_PROFGC + env->lck->pgops.gc_prof.kicks += 1; +#endif /* MDBX_ENABLE_PROFGC */ + notify_eof_of_loop = true; } - snprintf(chk->v2a_buf.iov_base, chk->v2a_buf.iov_len, - "", len); - return chk->v2a_buf.iov_base; - } - bool printable = true; - bool quoting = false; - size_t xchars = 0; - for (size_t i = 0; i < len && printable; ++i) { - quoting = quoting || !(data[i] == '_' || isalnum(data[i])); - printable = - isprint(data[i]) || (data[i] < ' ' && ++xchars < 4 && len > xchars * 4); - } + } while (++retry < INT_MAX); - size_t need = len + 1; - if (quoting || !printable) - need += len + /* quotes */ 2 + 2 * /* max xchars */ 4; - if (need > chk->v2a_buf.iov_len) { - void *ptr = 
osal_realloc(chk->v2a_buf.iov_base, need); - if (unlikely(!ptr)) - return ""; - chk->v2a_buf.iov_base = ptr; - chk->v2a_buf.iov_len = need; + if (notify_eof_of_loop) { + /* notify end of hsr-loop */ + const txnid_t turn = oldest - straggler; + if (turn) + NOTICE("hsr-kick: done turn %" PRIaTXN " -> %" PRIaTXN " +%" PRIaTXN, + straggler, oldest, turn); + callback(env, env->txn, 0, 0, straggler, + (turn < UINT_MAX) ? (unsigned)turn : UINT_MAX, 0, -retry); } + return oldest; +} - static const char hex[] = "0123456789abcdef"; - char *w = chk->v2a_buf.iov_base; - if (!quoting) { - memcpy(w, data, len); - w += len; - } else if (printable) { - *w++ = '\''; - for (size_t i = 0; i < len; ++i) { - if (data[i] < ' ') { - assert((char *)chk->v2a_buf.iov_base + chk->v2a_buf.iov_len > w + 4); - w[0] = '\\'; - w[1] = 'x'; - w[2] = hex[data[i] >> 4]; - w[3] = hex[data[i] & 15]; - w += 4; - } else if (strchr("\"'`\\", data[i])) { - assert((char *)chk->v2a_buf.iov_base + chk->v2a_buf.iov_len > w + 2); - w[0] = '\\'; - w[1] = data[i]; - w += 2; - } else { - assert((char *)chk->v2a_buf.iov_base + chk->v2a_buf.iov_len > w + 1); - *w++ = data[i]; - } - } - *w++ = '\''; - } else { - *w++ = '\\'; - *w++ = 'x'; - for (size_t i = 0; i < len; ++i) { - assert((char *)chk->v2a_buf.iov_base + chk->v2a_buf.iov_len > w + 2); - w[0] = hex[data[i] >> 4]; - w[1] = hex[data[i] & 15]; - w += 2; - } +/*----------------------------------------------------------------------------*/ + +__cold int mdbx_thread_register(const MDBX_env *env) { + int rc = check_env(env, true); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; + + if (unlikely(!env->lck_mmap.lck)) + return (env->flags & MDBX_EXCLUSIVE) ? 
MDBX_EINVAL : MDBX_EPERM; + + if (unlikely((env->flags & ENV_TXKEY) == 0)) { + eASSERT(env, env->flags & MDBX_NOSTICKYTHREADS); + return MDBX_EINVAL /* MDBX_NOSTICKYTHREADS mode */; } - assert((char *)chk->v2a_buf.iov_base + chk->v2a_buf.iov_len > w); - *w = 0; - return chk->v2a_buf.iov_base; -} -__cold static void chk_dispose(MDBX_chk_internal_t *chk) { - assert(chk->subdb[FREE_DBI] == &chk->subdb_gc); - assert(chk->subdb[MAIN_DBI] == &chk->subdb_main); - for (size_t i = 0; i < ARRAY_LENGTH(chk->subdb); ++i) { - MDBX_chk_subdb_t *const sdb = chk->subdb[i]; - if (sdb) { - chk->subdb[i] = nullptr; - if (chk->cb->subdb_dispose && sdb->cookie) { - chk->cb->subdb_dispose(chk->usr, sdb); - sdb->cookie = nullptr; - } - if (sdb != &chk->subdb_gc && sdb != &chk->subdb_main) { - osal_free(sdb); - } - } + eASSERT(env, (env->flags & (MDBX_NOSTICKYTHREADS | ENV_TXKEY)) == ENV_TXKEY); + reader_slot_t *r = thread_rthc_get(env->me_txkey); + if (unlikely(r != nullptr)) { + eASSERT(env, r->pid.weak == env->pid); + eASSERT(env, r->tid.weak == osal_thread_self()); + if (unlikely(r->pid.weak != env->pid)) + return MDBX_BAD_RSLOT; + return MDBX_RESULT_TRUE /* already registered */; } - osal_free(chk->v2a_buf.iov_base); - osal_free(chk->pagemap); - chk->usr->internal = nullptr; - chk->usr->scope = nullptr; - chk->pagemap = nullptr; - memset(chk, 0xDD, sizeof(*chk)); - osal_free(chk); -} -static size_t div_8s(size_t numerator, size_t divider) { - assert(numerator <= (SIZE_MAX >> 8)); - return (numerator << 8) / divider; + const uintptr_t tid = osal_thread_self(); + if (env->txn && unlikely(env->basal_txn->owner == tid)) + return MDBX_TXN_OVERLAPPING; + return mvcc_bind_slot((MDBX_env *)env, tid).err; } -static size_t mul_8s(size_t quotient, size_t multiplier) { - size_t hi = multiplier * (quotient >> 8); - size_t lo = multiplier * (quotient & 255) + 128; - return hi + (lo >> 8); -} +__cold int mdbx_thread_unregister(const MDBX_env *env) { + int rc = check_env(env, true); + if 
(unlikely(rc != MDBX_SUCCESS)) + return rc; -static void histogram_reduce(struct MDBX_chk_histogram *p) { - const size_t size = ARRAY_LENGTH(p->ranges), last = size - 1; - // ищем пару для слияния с минимальной ошибкой - size_t min_err = SIZE_MAX, min_i = last - 1; - for (size_t i = 0; i < last; ++i) { - const size_t b1 = p->ranges[i].begin, e1 = p->ranges[i].end, - s1 = p->ranges[i].amount; - const size_t b2 = p->ranges[i + 1].begin, e2 = p->ranges[i + 1].end, - s2 = p->ranges[i + 1].amount; - const size_t l1 = e1 - b1, l2 = e2 - b2, lx = e2 - b1, sx = s1 + s2; - assert(s1 > 0 && b1 > 0 && b1 < e1); - assert(s2 > 0 && b2 > 0 && b2 < e2); - assert(e1 <= b2); - // за ошибку принимаем площадь изменений на гистограмме при слиянии - const size_t h1 = div_8s(s1, l1), h2 = div_8s(s2, l2), hx = div_8s(sx, lx); - const size_t d1 = mul_8s((h1 > hx) ? h1 - hx : hx - h1, l1); - const size_t d2 = mul_8s((h2 > hx) ? h2 - hx : hx - h2, l2); - const size_t dx = mul_8s(hx, b2 - e1); - const size_t err = d1 + d2 + dx; - if (min_err >= err) { - min_i = i; - min_err = err; - } + if (unlikely(!env->lck_mmap.lck)) + return MDBX_RESULT_TRUE; + + if (unlikely((env->flags & ENV_TXKEY) == 0)) { + eASSERT(env, env->flags & MDBX_NOSTICKYTHREADS); + return MDBX_RESULT_TRUE /* MDBX_NOSTICKYTHREADS mode */; } - // объединяем - p->ranges[min_i].end = p->ranges[min_i + 1].end; - p->ranges[min_i].amount += p->ranges[min_i + 1].amount; - p->ranges[min_i].count += p->ranges[min_i + 1].count; - if (min_i < last) - // перемещаем хвост - memmove(p->ranges + min_i, p->ranges + min_i + 1, - (last - min_i) * sizeof(p->ranges[0])); - // обнуляем последний элемент и продолжаем - p->ranges[last].count = 0; -} -static void histogram_acc(const size_t n, struct MDBX_chk_histogram *p) { - STATIC_ASSERT(ARRAY_LENGTH(p->ranges) > 2); - p->amount += n; - p->count += 1; - if (likely(n < 2)) { - p->ones += n; - p->pad += 1; - } else - for (;;) { - const size_t size = ARRAY_LENGTH(p->ranges), last = size - 1; - size_t 
i = 0; - while (i < size && p->ranges[i].count && n >= p->ranges[i].begin) { - if (n < p->ranges[i].end) { - // значение попадает в существующий интервал - p->ranges[i].amount += n; - p->ranges[i].count += 1; - return; - } - ++i; - } - if (p->ranges[last].count == 0) { - // использованы еще не все слоты, добавляем интервал - assert(i < size); - if (p->ranges[i].count) { - assert(i < last); - // раздвигаем -#ifdef __COVERITY__ - if (i < last) /* avoid Coverity false-positive issue */ -#endif /* __COVERITY__ */ - memmove(p->ranges + i + 1, p->ranges + i, - (last - i) * sizeof(p->ranges[0])); - } - p->ranges[i].begin = n; - p->ranges[i].end = n + 1; - p->ranges[i].amount = n; - p->ranges[i].count = 1; - return; - } - histogram_reduce(p); - } -} + eASSERT(env, (env->flags & (MDBX_NOSTICKYTHREADS | ENV_TXKEY)) == ENV_TXKEY); + reader_slot_t *r = thread_rthc_get(env->me_txkey); + if (unlikely(r == nullptr)) + return MDBX_RESULT_TRUE /* not registered */; -__cold static MDBX_chk_line_t * -histogram_dist(MDBX_chk_line_t *line, - const struct MDBX_chk_histogram *histogram, const char *prefix, - const char *first, bool amount) { - line = chk_print(line, "%s:", prefix); - const char *comma = ""; - const size_t first_val = amount ? histogram->ones : histogram->pad; - if (first_val) { - chk_print(line, " %s=%" PRIuSIZE, first, first_val); - comma = ","; - } - for (size_t n = 0; n < ARRAY_LENGTH(histogram->ranges); ++n) - if (histogram->ranges[n].count) { - chk_print(line, "%s %" PRIuSIZE, comma, histogram->ranges[n].begin); - if (histogram->ranges[n].begin != histogram->ranges[n].end - 1) - chk_print(line, "-%" PRIuSIZE, histogram->ranges[n].end - 1); - line = chk_print(line, "=%" PRIuSIZE, - amount ? 
histogram->ranges[n].amount - : histogram->ranges[n].count); - comma = ","; - } - return line; + eASSERT(env, r->pid.weak == env->pid); + eASSERT(env, r->tid.weak == osal_thread_self()); + if (unlikely(r->pid.weak != env->pid || r->tid.weak != osal_thread_self())) + return MDBX_BAD_RSLOT; + + eASSERT(env, r->txnid.weak >= SAFE64_INVALID_THRESHOLD); + if (unlikely(r->txnid.weak < SAFE64_INVALID_THRESHOLD)) + return MDBX_BUSY /* transaction is still active */; + + atomic_store32(&r->pid, 0, mo_Relaxed); + atomic_store32(&env->lck->rdt_refresh_flag, true, mo_AcquireRelease); + thread_rthc_set(env->me_txkey, nullptr); + return MDBX_SUCCESS; } +/// \copyright SPDX-License-Identifier: Apache-2.0 +/// \note Please refer to the COPYRIGHT file for explanations license change, +/// credits and acknowledgments. +/// \author Леонид Юрьев aka Leonid Yuriev \date 2015-2024 -__cold static MDBX_chk_line_t * -histogram_print(MDBX_chk_scope_t *scope, MDBX_chk_line_t *line, - const struct MDBX_chk_histogram *histogram, const char *prefix, - const char *first, bool amount) { - if (histogram->count) { - line = chk_print(line, "%s %" PRIuSIZE, prefix, - amount ? histogram->amount : histogram->count); - if (scope->verbosity > MDBX_chk_info) - line = chk_puts( - histogram_dist(line, histogram, " (distribution", first, amount), - ")"); + +__hot int __must_check_result node_add_dupfix(MDBX_cursor *mc, size_t indx, + const MDBX_val *key) { + page_t *mp = mc->pg[mc->top]; + MDBX_ANALYSIS_ASSUME(key != nullptr); + DKBUF_DEBUG; + DEBUG("add to leaf2-%spage %" PRIaPGNO " index %zi, " + " key size %" PRIuPTR " [%s]", + is_subpage(mp) ? "sub-" : "", mp->pgno, indx, key ? 
key->iov_len : 0, + DKEY_DEBUG(key)); + + cASSERT(mc, key); + cASSERT(mc, page_type_compat(mp) == (P_LEAF | P_DUPFIX)); + const size_t ksize = mc->tree->dupfix_size; + cASSERT(mc, ksize == key->iov_len); + const size_t nkeys = page_numkeys(mp); + cASSERT(mc, (((ksize & page_numkeys(mp)) ^ mp->upper) & 1) == 0); + + /* Just using these for counting */ + const intptr_t lower = mp->lower + sizeof(indx_t); + const intptr_t upper = mp->upper - (ksize - sizeof(indx_t)); + if (unlikely(lower > upper)) { + mc->txn->flags |= MDBX_TXN_ERROR; + return MDBX_PAGE_FULL; } - return line; + mp->lower = (indx_t)lower; + mp->upper = (indx_t)upper; + + void *const ptr = page_dupfix_ptr(mp, indx, ksize); + cASSERT(mc, nkeys >= indx); + const size_t diff = nkeys - indx; + if (likely(diff > 0)) + /* Move higher keys up one slot. */ + memmove(ptr_disp(ptr, ksize), ptr, diff * ksize); + /* insert new key */ + memcpy(ptr, key->iov_base, ksize); + + cASSERT(mc, (((ksize & page_numkeys(mp)) ^ mp->upper) & 1) == 0); + return MDBX_SUCCESS; } -//----------------------------------------------------------------------------- +int __must_check_result node_add_branch(MDBX_cursor *mc, size_t indx, + const MDBX_val *key, pgno_t pgno) { + page_t *mp = mc->pg[mc->top]; + DKBUF_DEBUG; + DEBUG("add to branch-%spage %" PRIaPGNO " index %zi, node-pgno %" PRIaPGNO + " key size %" PRIuPTR " [%s]", + is_subpage(mp) ? "sub-" : "", mp->pgno, indx, pgno, + key ? key->iov_len : 0, DKEY_DEBUG(key)); -__cold static int chk_get_sdb(MDBX_chk_scope_t *const scope, - const MDBX_walk_sdb_t *in, - MDBX_chk_subdb_t **out) { - MDBX_chk_internal_t *const chk = scope->internal; - if (chk->last_lookup && - chk->last_lookup->name.iov_base == in->name.iov_base) { - *out = chk->last_lookup; - return MDBX_SUCCESS; + cASSERT(mc, page_type(mp) == P_BRANCH); + STATIC_ASSERT(NODESIZE % 2 == 0); + + /* Move higher pointers up one slot. 
*/ + const size_t nkeys = page_numkeys(mp); + cASSERT(mc, nkeys >= indx); + for (size_t i = nkeys; i > indx; --i) + mp->entries[i] = mp->entries[i - 1]; + + /* Adjust free space offsets. */ + const size_t branch_bytes = branch_size(mc->txn->env, key); + const intptr_t lower = mp->lower + sizeof(indx_t); + const intptr_t upper = mp->upper - (branch_bytes - sizeof(indx_t)); + if (unlikely(lower > upper)) { + mc->txn->flags |= MDBX_TXN_ERROR; + return MDBX_PAGE_FULL; } + mp->lower = (indx_t)lower; + mp->entries[indx] = mp->upper = (indx_t)upper; - for (size_t i = 0; i < ARRAY_LENGTH(chk->subdb); ++i) { - MDBX_chk_subdb_t *sdb = chk->subdb[i]; - if (!sdb) { - sdb = osal_calloc(1, sizeof(MDBX_chk_subdb_t)); - if (unlikely(!sdb)) { - *out = nullptr; - return chk_error_rc(scope, MDBX_ENOMEM, "alloc_subDB"); - } - chk->subdb[i] = sdb; - sdb->flags = in->internal->md_flags; - sdb->id = -1; - sdb->name = in->name; + /* Write the node data. */ + node_t *node = page_node(mp, indx); + node_set_pgno(node, pgno); + node_set_flags(node, 0); + UNALIGNED_POKE_8(node, node_t, extra, 0); + node_set_ks(node, 0); + if (likely(key != nullptr)) { + node_set_ks(node, key->iov_len); + memcpy(node_key(node), key->iov_base, key->iov_len); + } + return MDBX_SUCCESS; +} + +__hot int __must_check_result node_add_leaf(MDBX_cursor *mc, size_t indx, + const MDBX_val *key, MDBX_val *data, + unsigned flags) { + MDBX_ANALYSIS_ASSUME(key != nullptr); + MDBX_ANALYSIS_ASSUME(data != nullptr); + page_t *mp = mc->pg[mc->top]; + DKBUF_DEBUG; + DEBUG("add to leaf-%spage %" PRIaPGNO " index %zi, data size %" PRIuPTR + " key size %" PRIuPTR " [%s]", + is_subpage(mp) ? "sub-" : "", mp->pgno, indx, data ? data->iov_len : 0, + key ? key->iov_len : 0, DKEY_DEBUG(key)); + cASSERT(mc, key != nullptr && data != nullptr); + cASSERT(mc, page_type_compat(mp) == P_LEAF); + page_t *largepage = nullptr; + + size_t node_bytes; + if (unlikely(flags & N_BIGDATA)) { + /* Data already on large/overflow page. 
*/ + STATIC_ASSERT(sizeof(pgno_t) % 2 == 0); + node_bytes = + node_size_len(key->iov_len, 0) + sizeof(pgno_t) + sizeof(indx_t); + cASSERT(mc, page_room(mp) >= node_bytes); + } else if (unlikely(node_size(key, data) > mc->txn->env->leaf_nodemax)) { + /* Put data on large/overflow page. */ + if (unlikely(mc->tree->flags & MDBX_DUPSORT)) { + ERROR("Unexpected target %s flags 0x%x for large data-item", "dupsort-db", + mc->tree->flags); + return MDBX_PROBLEM; } - if (sdb->name.iov_base == in->name.iov_base) { - if (sdb->id < 0) { - sdb->id = (int)i; - sdb->cookie = - chk->cb->subdb_filter - ? chk->cb->subdb_filter(chk->usr, &sdb->name, sdb->flags) - : (void *)(intptr_t)-1; - } - *out = (chk->last_lookup = sdb); + if (unlikely(flags & (N_DUPDATA | N_SUBDATA))) { + ERROR("Unexpected target %s flags 0x%x for large data-item", "node", + flags); + return MDBX_PROBLEM; + } + cASSERT(mc, page_room(mp) >= leaf_size(mc->txn->env, key, data)); + const pgno_t ovpages = largechunk_npages(mc->txn->env, data->iov_len); + const pgr_t npr = page_new_large(mc, ovpages); + if (unlikely(npr.err != MDBX_SUCCESS)) + return npr.err; + largepage = npr.page; + DEBUG("allocated %u large/overflow page(s) %" PRIaPGNO "for %" PRIuPTR + " data bytes", + largepage->pages, largepage->pgno, data->iov_len); + flags |= N_BIGDATA; + node_bytes = + node_size_len(key->iov_len, 0) + sizeof(pgno_t) + sizeof(indx_t); + cASSERT(mc, node_bytes == leaf_size(mc->txn->env, key, data)); + } else { + cASSERT(mc, page_room(mp) >= leaf_size(mc->txn->env, key, data)); + node_bytes = node_size(key, data) + sizeof(indx_t); + cASSERT(mc, node_bytes == leaf_size(mc->txn->env, key, data)); + } + + /* Move higher pointers up one slot. */ + const size_t nkeys = page_numkeys(mp); + cASSERT(mc, nkeys >= indx); + for (size_t i = nkeys; i > indx; --i) + mp->entries[i] = mp->entries[i - 1]; + + /* Adjust free space offsets. 
*/ + const intptr_t lower = mp->lower + sizeof(indx_t); + const intptr_t upper = mp->upper - (node_bytes - sizeof(indx_t)); + if (unlikely(lower > upper)) { + mc->txn->flags |= MDBX_TXN_ERROR; + return MDBX_PAGE_FULL; + } + mp->lower = (indx_t)lower; + mp->entries[indx] = mp->upper = (indx_t)upper; + + /* Write the node data. */ + node_t *node = page_node(mp, indx); + node_set_ks(node, key->iov_len); + node_set_flags(node, (uint8_t)flags); + UNALIGNED_POKE_8(node, node_t, extra, 0); + node_set_ds(node, data->iov_len); + memcpy(node_key(node), key->iov_base, key->iov_len); + + void *nodedata = node_data(node); + if (likely(largepage == nullptr)) { + if (unlikely(flags & N_BIGDATA)) { + memcpy(nodedata, data->iov_base, sizeof(pgno_t)); return MDBX_SUCCESS; } + } else { + poke_pgno(nodedata, largepage->pgno); + nodedata = page_data(largepage); } - chk_scope_issue(scope, "too many subDBs > %u", - (unsigned)ARRAY_LENGTH(chk->subdb) - CORE_DBS - /* meta */ 1); - *out = nullptr; - return MDBX_PROBLEM; + if (unlikely(flags & MDBX_RESERVE)) + data->iov_base = nodedata; + else if (likely(data->iov_len /* to avoid UBSAN traps */)) + memcpy(nodedata, data->iov_base, data->iov_len); + return MDBX_SUCCESS; } -//------------------------------------------------------------------------------ +__hot void node_del(MDBX_cursor *mc, size_t ksize) { + page_t *mp = mc->pg[mc->top]; + const size_t hole = mc->ki[mc->top]; + const size_t nkeys = page_numkeys(mp); -__cold static void chk_verbose_meta(MDBX_chk_scope_t *const scope, - const unsigned num) { - MDBX_chk_line_t *line = chk_line_begin(scope, MDBX_chk_verbose); - MDBX_chk_internal_t *const chk = scope->internal; - if (line) { - MDBX_env *const env = chk->usr->env; - const bool have_bootid = (chk->envinfo.mi_bootid.current.x | - chk->envinfo.mi_bootid.current.y) != 0; - const bool bootid_match = - have_bootid && memcmp(&chk->envinfo.mi_bootid.meta[num], - &chk->envinfo.mi_bootid.current, - sizeof(chk->envinfo.mi_bootid.current)) == 
0; + DEBUG("delete node %zu on %s page %" PRIaPGNO, hole, + is_leaf(mp) ? "leaf" : "branch", mp->pgno); + cASSERT(mc, hole < nkeys); - const char *status = "stay"; - if (num == chk->troika.recent) - status = "head"; - else if (num == TROIKA_TAIL(&chk->troika)) - status = "tail"; - line = chk_print(line, "meta-%u: %s, ", num, status); + if (is_dupfix_leaf(mp)) { + cASSERT(mc, ksize >= sizeof(indx_t)); + size_t diff = nkeys - 1 - hole; + void *const base = page_dupfix_ptr(mp, hole, ksize); + if (diff) + memmove(base, ptr_disp(base, ksize), diff * ksize); + cASSERT(mc, mp->lower >= sizeof(indx_t)); + mp->lower -= sizeof(indx_t); + cASSERT(mc, (size_t)UINT16_MAX - mp->upper >= ksize - sizeof(indx_t)); + mp->upper += (indx_t)(ksize - sizeof(indx_t)); + cASSERT(mc, (((ksize & page_numkeys(mp)) ^ mp->upper) & 1) == 0); + return; + } - switch (chk->envinfo.mi_meta_sign[num]) { - case MDBX_DATASIGN_NONE: - line = chk_puts(line, "no-sync/legacy"); - break; - case MDBX_DATASIGN_WEAK: - line = chk_print(line, "weak-%s", - have_bootid - ? (bootid_match ? "intact (same boot-id)" : "dead") - : "unknown (no boot-id)"); - break; - default: - line = chk_puts(line, "steady"); - break; - } - const txnid_t meta_txnid = chk->envinfo.mi_meta_txnid[num]; - line = chk_print(line, " txn#%" PRIaTXN ", ", meta_txnid); - if (chk->envinfo.mi_bootid.meta[num].x | chk->envinfo.mi_bootid.meta[num].y) - line = chk_print(line, "boot-id %" PRIx64 "-%" PRIx64 " (%s)", - chk->envinfo.mi_bootid.meta[num].x, - chk->envinfo.mi_bootid.meta[num].y, - bootid_match ? "live" : "not match"); - else - line = chk_puts(line, "no boot-id"); + node_t *node = page_node(mp, hole); + cASSERT(mc, !is_branch(mp) || hole || node_ks(node) == 0); + size_t hole_size = NODESIZE + node_ks(node); + if (is_leaf(mp)) + hole_size += + (node_flags(node) & N_BIGDATA) ? 
sizeof(pgno_t) : node_ds(node); + hole_size = EVEN_CEIL(hole_size); - if (env->me_stuck_meta >= 0) { - if (num == (unsigned)env->me_stuck_meta) - line = chk_print(line, ", %s", "forced for checking"); - } else if (meta_txnid > chk->envinfo.mi_recent_txnid && - (env->me_flags & (MDBX_EXCLUSIVE | MDBX_RDONLY)) == - MDBX_EXCLUSIVE) - line = chk_print(line, - ", rolled-back %" PRIu64 " commit(s) (%" PRIu64 - " >>> %" PRIu64 ")", - meta_txnid - chk->envinfo.mi_recent_txnid, meta_txnid, - chk->envinfo.mi_recent_txnid); - chk_line_end(line); + const indx_t hole_offset = mp->entries[hole]; + size_t r, w; + for (r = w = 0; r < nkeys; r++) + if (r != hole) + mp->entries[w++] = (mp->entries[r] < hole_offset) + ? mp->entries[r] + (indx_t)hole_size + : mp->entries[r]; + + void *const base = ptr_disp(mp, mp->upper + PAGEHDRSZ); + memmove(ptr_disp(base, hole_size), base, hole_offset - mp->upper); + + cASSERT(mc, mp->lower >= sizeof(indx_t)); + mp->lower -= sizeof(indx_t); + cASSERT(mc, (size_t)UINT16_MAX - mp->upper >= hole_size); + mp->upper += (indx_t)hole_size; + + if (AUDIT_ENABLED()) { + const uint8_t checking = mc->checking; + mc->checking |= z_updating; + const int page_check_err = page_check(mc, mp); + mc->checking = checking; + cASSERT(mc, page_check_err == MDBX_SUCCESS); } } -__cold static int -chk_pgvisitor(const size_t pgno, const unsigned npages, void *const ctx, - const int deep, const MDBX_walk_sdb_t *sdb_info, - const size_t page_size, const MDBX_page_type_t pagetype, - const MDBX_error_t page_err, const size_t nentries, - const size_t payload_bytes, const size_t header_bytes, - const size_t unused_bytes) { - MDBX_chk_scope_t *const scope = ctx; - MDBX_chk_internal_t *const chk = scope->internal; - MDBX_chk_context_t *const usr = chk->usr; - MDBX_env *const env = usr->env; +__noinline int node_read_bigdata(MDBX_cursor *mc, const node_t *node, + MDBX_val *data, const page_t *mp) { + cASSERT(mc, node_flags(node) == N_BIGDATA && data->iov_len == node_ds(node)); - 
MDBX_chk_subdb_t *sdb; - int err = chk_get_sdb(scope, sdb_info, &sdb); - if (unlikely(err)) - return err; + pgr_t lp = page_get_large(mc, node_largedata_pgno(node), mp->txnid); + if (unlikely((lp.err != MDBX_SUCCESS))) { + DEBUG("read large/overflow page %" PRIaPGNO " failed", + node_largedata_pgno(node)); + return lp.err; + } - if (deep > 42) { - chk_scope_issue(scope, "too deeply %u", deep); - return MDBX_CORRUPTED /* avoid infinite loop/recursion */; + cASSERT(mc, page_type(lp.page) == P_LARGE); + data->iov_base = page_data(lp.page); + if (!MDBX_DISABLE_VALIDATION) { + const MDBX_env *env = mc->txn->env; + const size_t dsize = data->iov_len; + const unsigned npages = largechunk_npages(env, dsize); + if (unlikely(lp.page->pages < npages)) + return bad_page(lp.page, + "too less n-pages %u for bigdata-node (%zu bytes)", + lp.page->pages, dsize); } - histogram_acc(deep, &sdb->histogram.deep); - usr->result.processed_pages += npages; - const size_t page_bytes = payload_bytes + header_bytes + unused_bytes; + return MDBX_SUCCESS; +} - int height = deep + 1; - if (sdb->id >= CORE_DBS) - height -= usr->txn->mt_dbs[MAIN_DBI].md_depth; - const struct MDBX_db *nested = sdb_info->nested; - if (nested) { - if (sdb->flags & MDBX_DUPSORT) - height -= sdb_info->internal->md_depth; - else { - chk_object_issue(scope, "nested tree", pgno, "unexpected", - "subDb %s flags 0x%x, deep %i", chk_v2a(chk, &sdb->name), - sdb->flags, deep); - nested = nullptr; +node_t *node_shrink(page_t *mp, size_t indx, node_t *node) { + assert(node == page_node(mp, indx)); + page_t *sp = (page_t *)node_data(node); + assert(is_subpage(sp) && page_numkeys(sp) > 0); + const size_t delta = + EVEN_FLOOR(page_room(sp) /* avoid the node uneven-sized */); + if (unlikely(delta) == 0) + return node; + + /* Prepare to shift upward, set len = length(subpage part to shift) */ + size_t nsize = node_ds(node) - delta, len = nsize; + assert(nsize % 1 == 0); + if (!is_dupfix_leaf(sp)) { + len = PAGEHDRSZ; + page_t *xp = 
ptr_disp(sp, delta); /* destination subpage */ + for (intptr_t i = page_numkeys(sp); --i >= 0;) { + assert(sp->entries[i] >= delta); + xp->entries[i] = (indx_t)(sp->entries[i] - delta); } - } else - chk->last_nested = nullptr; + } + assert(sp->upper >= sp->lower + delta); + sp->upper -= (indx_t)delta; + sp->pgno = mp->pgno; + node_set_ds(node, nsize); - const char *pagetype_caption; - bool branch = false; - switch (pagetype) { - default: - chk_object_issue(scope, "page", pgno, "unknown page-type", - "type %u, deep %i", (unsigned)pagetype, deep); - pagetype_caption = "unknown"; - sdb->pages.other += npages; - break; - case MDBX_page_broken: - assert(page_err != MDBX_SUCCESS); - pagetype_caption = "broken"; - sdb->pages.other += npages; - break; - case MDBX_subpage_broken: - assert(page_err != MDBX_SUCCESS); - pagetype_caption = "broken-subpage"; - sdb->pages.other += npages; - break; - case MDBX_page_large: - pagetype_caption = "large"; - histogram_acc(npages, &sdb->histogram.large_pages); - if (sdb->flags & MDBX_DUPSORT) - chk_object_issue(scope, "page", pgno, "unexpected", - "type %u, subDb %s flags 0x%x, deep %i", - (unsigned)pagetype, chk_v2a(chk, &sdb->name), sdb->flags, - deep); - break; - case MDBX_page_branch: - branch = true; - if (!nested) { - pagetype_caption = "branch"; - sdb->pages.branch += 1; - } else { - pagetype_caption = "nested-branch"; - sdb->pages.nested_branch += 1; + /* Shift upward */ + void *const base = ptr_disp(mp, mp->upper + PAGEHDRSZ); + memmove(ptr_disp(base, delta), base, ptr_dist(sp, base) + len); + + const size_t pivot = mp->entries[indx]; + for (intptr_t i = page_numkeys(mp); --i >= 0;) { + if (mp->entries[i] <= pivot) { + assert((size_t)UINT16_MAX - mp->entries[i] >= delta); + mp->entries[i] += (indx_t)delta; } - break; - case MDBX_page_dupfixed_leaf: - if (!nested) - chk_object_issue(scope, "page", pgno, "unexpected", - "type %u, subDb %s flags 0x%x, deep %i", - (unsigned)pagetype, chk_v2a(chk, &sdb->name), sdb->flags, - deep); - 
/* fall through */ - __fallthrough; - case MDBX_page_leaf: - if (!nested) { - pagetype_caption = "leaf"; - sdb->pages.leaf += 1; - if (height != sdb_info->internal->md_depth) - chk_object_issue(scope, "page", pgno, "wrong tree height", - "actual %i != %i subDb %s", height, - sdb_info->internal->md_depth, - chk_v2a(chk, &sdb->name)); - } else { - pagetype_caption = - (pagetype == MDBX_page_leaf) ? "nested-leaf" : "nested-leaf-dupfixed"; - sdb->pages.nested_leaf += 1; - if (chk->last_nested != nested) { - histogram_acc(height, &sdb->histogram.nested_tree); - chk->last_nested = nested; - } - if (height != nested->md_depth) - chk_object_issue(scope, "page", pgno, "wrong nested-tree height", - "actual %i != %i dupsort-node %s", height, - nested->md_depth, chk_v2a(chk, &sdb->name)); - } - break; - case MDBX_subpage_dupfixed_leaf: - case MDBX_subpage_leaf: - pagetype_caption = (pagetype == MDBX_subpage_leaf) ? "subleaf-dupsort" - : "subleaf-dupfixed"; - sdb->pages.nested_subleaf += 1; - if ((sdb->flags & MDBX_DUPSORT) == 0 || nested) - chk_object_issue(scope, "page", pgno, "unexpected", - "type %u, subDb %s flags 0x%x, deep %i", - (unsigned)pagetype, chk_v2a(chk, &sdb->name), sdb->flags, - deep); - break; } + assert((size_t)UINT16_MAX - mp->upper >= delta); + mp->upper += (indx_t)delta; - if (npages) { - if (sdb->cookie) { - MDBX_chk_line_t *line = chk_line_begin(scope, MDBX_chk_extra); - if (npages == 1) - chk_print(line, "%s-page %" PRIuSIZE, pagetype_caption, pgno); - else - chk_print(line, "%s-span %" PRIuSIZE "[%u]", pagetype_caption, pgno, - npages); - chk_line_end( - chk_print(line, - " of %s: header %" PRIiPTR ", %s %" PRIiPTR - ", payload %" PRIiPTR ", unused %" PRIiPTR ", deep %i", - chk_v2a(chk, &sdb->name), header_bytes, - (pagetype == MDBX_page_branch) ? 
"keys" : "entries", - nentries, payload_bytes, unused_bytes, deep)); - } + return ptr_disp(node, delta); +} - bool already_used = false; - for (unsigned n = 0; n < npages; ++n) { - const size_t spanpgno = pgno + n; - if (spanpgno >= usr->result.alloc_pages) { - chk_object_issue(scope, "page", spanpgno, "wrong page-no", - "%s-page: %" PRIuSIZE " > %" PRIuSIZE ", deep %i", - pagetype_caption, spanpgno, usr->result.alloc_pages, - deep); - sdb->pages.all += 1; - } else if (chk->pagemap[spanpgno]) { - const MDBX_chk_subdb_t *const rival = - chk->subdb[chk->pagemap[spanpgno] - 1]; - chk_object_issue(scope, "page", spanpgno, - (branch && rival == sdb) ? "loop" : "already used", - "%s-page: by %s, deep %i", pagetype_caption, - chk_v2a(chk, &rival->name), deep); - already_used = true; - } else { - chk->pagemap[spanpgno] = (int16_t)sdb->id + 1; - sdb->pages.all += 1; +__hot struct node_search_result node_search(MDBX_cursor *mc, + const MDBX_val *key) { + page_t *mp = mc->pg[mc->top]; + const intptr_t nkeys = page_numkeys(mp); + DKBUF_DEBUG; + + DEBUG("searching %zu keys in %s %spage %" PRIaPGNO, nkeys, + is_leaf(mp) ? "leaf" : "branch", is_subpage(mp) ? 
"sub-" : "", + mp->pgno); + + struct node_search_result ret; + ret.exact = false; + STATIC_ASSERT(P_BRANCH == 1); + intptr_t low = mp->flags & P_BRANCH; + intptr_t high = nkeys - 1; + if (unlikely(high < low)) { + mc->ki[mc->top] = 0; + ret.node = nullptr; + return ret; + } + + intptr_t i; + MDBX_cmp_func *cmp = mc->clc->k.cmp; + MDBX_val nodekey; + if (unlikely(is_dupfix_leaf(mp))) { + cASSERT(mc, mp->dupfix_ksize == mc->tree->dupfix_size); + nodekey.iov_len = mp->dupfix_ksize; + do { + i = (low + high) >> 1; + nodekey.iov_base = page_dupfix_ptr(mp, i, nodekey.iov_len); + cASSERT(mc, ptr_disp(mp, mc->txn->env->ps) >= + ptr_disp(nodekey.iov_base, nodekey.iov_len)); + int cr = cmp(key, &nodekey); + DEBUG("found leaf index %zu [%s], rc = %i", i, DKEY_DEBUG(&nodekey), cr); + if (cr > 0) + low = ++i; + else if (cr < 0) + high = i - 1; + else { + ret.exact = true; + break; } - } + } while (likely(low <= high)); - if (already_used) - return branch ? MDBX_RESULT_TRUE /* avoid infinite loop/recursion */ - : MDBX_SUCCESS; + /* store the key index */ + mc->ki[mc->top] = (indx_t)i; + ret.node = + (i < nkeys) + ? /* fake for DUPFIX */ (node_t *)(intptr_t)-1 + : /* There is no entry larger or equal to the key. */ nullptr; + return ret; } - if (MDBX_IS_ERROR(page_err)) { - chk_object_issue(scope, "page", pgno, "invalid/corrupted", "%s-page", - pagetype_caption); - } else { - if (unused_bytes > page_size) - chk_object_issue(scope, "page", pgno, "illegal unused-bytes", - "%s-page: %u < %" PRIuSIZE " < %u", pagetype_caption, 0, - unused_bytes, env->me_psize); + if (MDBX_UNALIGNED_OK < 4 && is_branch(mp) && cmp == cmp_int_align2) + /* Branch pages have no data, so if using integer keys, + * alignment is guaranteed. Use faster cmp_int_align4(). 
*/ + cmp = cmp_int_align4; - if (header_bytes < (int)sizeof(long) || - (size_t)header_bytes >= env->me_psize - sizeof(long)) { - chk_object_issue(scope, "page", pgno, "illegal header-length", - "%s-page: %" PRIuSIZE " < %" PRIuSIZE " < %" PRIuSIZE, - pagetype_caption, sizeof(long), header_bytes, - env->me_psize - sizeof(long)); - } - if (nentries < 1 || (pagetype == MDBX_page_branch && nentries < 2)) { - chk_object_issue(scope, "page", pgno, nentries ? "half-empty" : "empty", - "%s-page: payload %" PRIuSIZE " bytes, %" PRIuSIZE - " entries, deep %i", - pagetype_caption, payload_bytes, nentries, deep); - sdb->pages.empty += 1; + node_t *node; + do { + i = (low + high) >> 1; + node = page_node(mp, i); + nodekey.iov_len = node_ks(node); + nodekey.iov_base = node_key(node); + cASSERT(mc, ptr_disp(mp, mc->txn->env->ps) >= + ptr_disp(nodekey.iov_base, nodekey.iov_len)); + int cr = cmp(key, &nodekey); + if (is_leaf(mp)) + DEBUG("found leaf index %zu [%s], rc = %i", i, DKEY_DEBUG(&nodekey), cr); + else + DEBUG("found branch index %zu [%s -> %" PRIaPGNO "], rc = %i", i, + DKEY_DEBUG(&nodekey), node_pgno(node), cr); + if (cr > 0) + low = ++i; + else if (cr < 0) + high = i - 1; + else { + ret.exact = true; + break; } + } while (likely(low <= high)); - if (npages) { - if (page_bytes != page_size) { - chk_object_issue(scope, "page", pgno, "misused", - "%s-page: %" PRIuPTR " != %" PRIuPTR " (%" PRIuPTR - "h + %" PRIuPTR "p + %" PRIuPTR "u), deep %i", - pagetype_caption, page_size, page_bytes, header_bytes, - payload_bytes, unused_bytes, deep); - if (page_size > page_bytes) - sdb->lost_bytes += page_size - page_bytes; - } else { - sdb->payload_bytes += payload_bytes + header_bytes; - usr->result.total_payload_bytes += payload_bytes + header_bytes; - } - } - } - return chk_check_break(scope); + /* store the key index */ + mc->ki[mc->top] = (indx_t)i; + ret.node = (i < nkeys) + ? page_node(mp, i) + : /* There is no entry larger or equal to the key. 
*/ nullptr; + return ret; } +/// \copyright SPDX-License-Identifier: Apache-2.0 +/// \author Леонид Юрьев aka Leonid Yuriev \date 2015-2024 +/// +/// https://en.wikipedia.org/wiki/Operating_system_abstraction_layer -__cold static int chk_tree(MDBX_chk_scope_t *const scope) { - MDBX_chk_internal_t *const chk = scope->internal; - MDBX_chk_context_t *const usr = chk->usr; - MDBX_env *const env = usr->env; - MDBX_txn *const txn = usr->txn; #if defined(_WIN32) || defined(_WIN64) - SetLastError(ERROR_SUCCESS); -#else - errno = 0; -#endif /* Windows */ - chk->pagemap = osal_calloc(usr->result.alloc_pages, sizeof(*chk->pagemap)); - if (!chk->pagemap) { - int err = osal_get_errno(); - return chk_error_rc(scope, err ? err : MDBX_ENOMEM, "calloc"); - } - if (scope->verbosity > MDBX_chk_info) - chk_scope_push(scope, 0, "Walking pages..."); - /* always skip key ordering checking - * to avoid MDBX_CORRUPTED in case custom comparators were used */ - usr->result.processed_pages = NUM_METAS; - int err = mdbx_env_pgwalk(txn, chk_pgvisitor, scope, true); - if (MDBX_IS_ERROR(err) && err != MDBX_EINTR) - chk_error_rc(scope, err, "mdbx_env_pgwalk"); +#include +#include - for (size_t n = NUM_METAS; n < usr->result.alloc_pages; ++n) - if (!chk->pagemap[n]) - usr->result.unused_pages += 1; +#if !MDBX_WITHOUT_MSVC_CRT && defined(_DEBUG) +#include +#endif - MDBX_chk_subdb_t total; - memset(&total, 0, sizeof(total)); - total.pages.all = NUM_METAS; - for (size_t i = 0; i < ARRAY_LENGTH(chk->subdb) && chk->subdb[i]; ++i) { - MDBX_chk_subdb_t *const sdb = chk->subdb[i]; - total.payload_bytes += sdb->payload_bytes; - total.lost_bytes += sdb->lost_bytes; - total.pages.all += sdb->pages.all; - total.pages.empty += sdb->pages.empty; - total.pages.other += sdb->pages.other; - total.pages.branch += sdb->pages.branch; - total.pages.leaf += sdb->pages.leaf; - total.pages.nested_branch += sdb->pages.nested_branch; - total.pages.nested_leaf += sdb->pages.nested_leaf; - total.pages.nested_subleaf += 
sdb->pages.nested_subleaf; +static int waitstatus2errcode(DWORD result) { + switch (result) { + case WAIT_OBJECT_0: + return MDBX_SUCCESS; + case WAIT_FAILED: + return (int)GetLastError(); + case WAIT_ABANDONED: + return ERROR_ABANDONED_WAIT_0; + case WAIT_IO_COMPLETION: + return ERROR_USER_APC; + case WAIT_TIMEOUT: + return ERROR_TIMEOUT; + default: + return ERROR_UNHANDLED_ERROR; } - assert(total.pages.all == usr->result.processed_pages); +} - const size_t total_page_bytes = pgno2bytes(env, total.pages.all); - if (usr->scope->subtotal_issues || usr->scope->verbosity >= MDBX_chk_verbose) - chk_line_end(chk_print(chk_line_begin(usr->scope, MDBX_chk_resolution), - "walked %zu pages, left/unused %zu" - ", %" PRIuSIZE " problem(s)", - usr->result.processed_pages, - usr->result.unused_pages, - usr->scope->subtotal_issues)); +/* Map a result from an NTAPI call to WIN32 error code. */ +static int ntstatus2errcode(NTSTATUS status) { + DWORD dummy; + OVERLAPPED ov; + memset(&ov, 0, sizeof(ov)); + ov.Internal = status; + /* Zap: '_Param_(1)' could be '0' */ + MDBX_SUPPRESS_GOOFY_MSVC_ANALYZER(6387); + return GetOverlappedResult(nullptr, &ov, &dummy, FALSE) ? 
MDBX_SUCCESS + : (int)GetLastError(); +} - err = chk_scope_restore(scope, err); - if (scope->verbosity > MDBX_chk_info) { - for (size_t i = 0; i < ARRAY_LENGTH(chk->subdb) && chk->subdb[i]; ++i) { - MDBX_chk_subdb_t *const sdb = chk->subdb[i]; - MDBX_chk_scope_t *inner = - chk_scope_push(scope, 0, "tree %s:", chk_v2a(chk, &sdb->name)); - if (sdb->pages.all == 0) - chk_line_end( - chk_print(chk_line_begin(inner, MDBX_chk_resolution), "empty")); - else { - MDBX_chk_line_t *line = chk_line_begin(inner, MDBX_chk_info); - if (line) { - line = chk_print(line, "page usage: subtotal %" PRIuSIZE, - sdb->pages.all); - const size_t branch_pages = - sdb->pages.branch + sdb->pages.nested_branch; - const size_t leaf_pages = sdb->pages.leaf + sdb->pages.nested_leaf + - sdb->pages.nested_subleaf; - if (sdb->pages.other) - line = chk_print(line, ", other %" PRIuSIZE, sdb->pages.other); - if (sdb->pages.other == 0 || - (branch_pages | leaf_pages | sdb->histogram.large_pages.count) != - 0) { - line = chk_print(line, ", branch %" PRIuSIZE ", leaf %" PRIuSIZE, - branch_pages, leaf_pages); - if (sdb->histogram.large_pages.count || - (sdb->flags & MDBX_DUPSORT) == 0) { - line = chk_print(line, ", large %" PRIuSIZE, - sdb->histogram.large_pages.count); - if (sdb->histogram.large_pages.amount | - sdb->histogram.large_pages.count) - line = histogram_print(inner, line, &sdb->histogram.large_pages, - " amount", "single", true); - } - } - line = histogram_dist(chk_line_feed(line), &sdb->histogram.deep, - "tree deep density", "1", false); - if (sdb != &chk->subdb_gc && sdb->histogram.nested_tree.count) { - line = chk_print(chk_line_feed(line), "nested tree(s) %" PRIuSIZE, - sdb->histogram.nested_tree.count); - line = histogram_dist(line, &sdb->histogram.nested_tree, " density", - "1", false); - line = chk_print(chk_line_feed(line), - "nested tree(s) pages %" PRIuSIZE - ": branch %" PRIuSIZE ", leaf %" PRIuSIZE - ", subleaf %" PRIuSIZE, - sdb->pages.nested_branch + sdb->pages.nested_leaf, - 
sdb->pages.nested_branch, sdb->pages.nested_leaf, - sdb->pages.nested_subleaf); - } +/* We use native NT APIs to setup the memory map, so that we can + * let the DB file grow incrementally instead of always preallocating + * the full size. These APIs are defined in and + * but those headers are meant for driver-level development and + * conflict with the regular user-level headers, so we explicitly + * declare them here. Using these APIs also means we must link to + * ntdll.dll, which is not linked by default in user code. */ - const size_t bytes = pgno2bytes(env, sdb->pages.all); - line = chk_print( - chk_line_feed(line), - "page filling: subtotal %" PRIuSIZE - " bytes (%.1f%%), payload %" PRIuSIZE - " (%.1f%%), unused %" PRIuSIZE " (%.1f%%)", - bytes, bytes * 100.0 / total_page_bytes, sdb->payload_bytes, - sdb->payload_bytes * 100.0 / bytes, bytes - sdb->payload_bytes, - (bytes - sdb->payload_bytes) * 100.0 / bytes); - if (sdb->pages.empty) - line = chk_print(line, ", %" PRIuSIZE " empty pages", - sdb->pages.empty); - if (sdb->lost_bytes) - line = - chk_print(line, ", %" PRIuSIZE " bytes lost", sdb->lost_bytes); - chk_line_end(line); - } - } - chk_scope_restore(scope, 0); - } - } +extern NTSTATUS NTAPI NtCreateSection( + OUT PHANDLE SectionHandle, IN ACCESS_MASK DesiredAccess, + IN OPTIONAL POBJECT_ATTRIBUTES ObjectAttributes, + IN OPTIONAL PLARGE_INTEGER MaximumSize, IN ULONG SectionPageProtection, + IN ULONG AllocationAttributes, IN OPTIONAL HANDLE FileHandle); - MDBX_chk_line_t *line = chk_line_begin(scope, MDBX_chk_resolution); - line = chk_print(line, - "summary: total %" PRIuSIZE " bytes, payload %" PRIuSIZE - " (%.1f%%), unused %" PRIuSIZE " (%.1f%%)," - " average fill %.1f%%", - total_page_bytes, usr->result.total_payload_bytes, - usr->result.total_payload_bytes * 100.0 / total_page_bytes, - total_page_bytes - usr->result.total_payload_bytes, - (total_page_bytes - usr->result.total_payload_bytes) * - 100.0 / total_page_bytes, - 
usr->result.total_payload_bytes * 100.0 / total_page_bytes); - if (total.pages.empty) - line = chk_print(line, ", %" PRIuSIZE " empty pages", total.pages.empty); - if (total.lost_bytes) - line = chk_print(line, ", %" PRIuSIZE " bytes lost", total.lost_bytes); - chk_line_end(line); - return err; -} +typedef struct _SECTION_BASIC_INFORMATION { + ULONG Unknown; + ULONG SectionAttributes; + LARGE_INTEGER SectionSize; +} SECTION_BASIC_INFORMATION, *PSECTION_BASIC_INFORMATION; -typedef int(chk_kv_visitor)(MDBX_chk_scope_t *const scope, - MDBX_chk_subdb_t *sdb, const size_t record_number, - const MDBX_val *key, const MDBX_val *data); +extern NTSTATUS NTAPI NtMapViewOfSection( + IN HANDLE SectionHandle, IN HANDLE ProcessHandle, IN OUT PVOID *BaseAddress, + IN ULONG_PTR ZeroBits, IN SIZE_T CommitSize, + IN OUT OPTIONAL PLARGE_INTEGER SectionOffset, IN OUT PSIZE_T ViewSize, + IN SECTION_INHERIT InheritDisposition, IN ULONG AllocationType, + IN ULONG Win32Protect); -__cold static int chk_handle_kv(MDBX_chk_scope_t *const scope, - MDBX_chk_subdb_t *sdb, - const size_t record_number, const MDBX_val *key, - const MDBX_val *data) { - MDBX_chk_internal_t *const chk = scope->internal; - int err = MDBX_SUCCESS; - assert(sdb->cookie); - if (chk->cb->subdb_handle_kv) - err = chk->cb->subdb_handle_kv(chk->usr, sdb, record_number, key, data); - return err ? err : chk_check_break(scope); -} +extern NTSTATUS NTAPI NtUnmapViewOfSection(IN HANDLE ProcessHandle, + IN OPTIONAL PVOID BaseAddress); -__cold static int chk_db(MDBX_chk_scope_t *const scope, MDBX_dbi dbi, - MDBX_chk_subdb_t *sdb, chk_kv_visitor *handler) { - MDBX_chk_internal_t *const chk = scope->internal; - MDBX_chk_context_t *const usr = chk->usr; - MDBX_env *const env = usr->env; - MDBX_txn *const txn = usr->txn; - MDBX_cursor *cursor = nullptr; - size_t record_count = 0, dups = 0, sub_databases = 0; - int err; +/* Zap: Inconsistent annotation for 'NtClose'... 
*/ +MDBX_SUPPRESS_GOOFY_MSVC_ANALYZER(28251) +extern NTSTATUS NTAPI NtClose(HANDLE Handle); - if ((MDBX_TXN_FINISHED | MDBX_TXN_ERROR) & txn->mt_flags) { - chk_line_end( - chk_flush(chk_print(chk_line_begin(scope, MDBX_chk_error), - "abort processing %s due to a previous error", - chk_v2a(chk, &sdb->name)))); - err = MDBX_BAD_TXN; - goto bailout; - } +extern NTSTATUS NTAPI NtAllocateVirtualMemory( + IN HANDLE ProcessHandle, IN OUT PVOID *BaseAddress, IN ULONG_PTR ZeroBits, + IN OUT PSIZE_T RegionSize, IN ULONG AllocationType, IN ULONG Protect); - if (0 > (int)dbi) { - err = dbi_open( - txn, &sdb->name, MDBX_DB_ACCEDE, &dbi, - (chk->flags & MDBX_CHK_IGNORE_ORDER) ? cmp_equal_or_greater : nullptr, - (chk->flags & MDBX_CHK_IGNORE_ORDER) ? cmp_equal_or_greater : nullptr); - if (unlikely(err)) { - tASSERT(txn, dbi >= txn->mt_env->me_numdbs || - (txn->mt_env->me_db_flags[dbi] & DB_VALID) == 0); - chk_error_rc(scope, err, "mdbx_dbi_open"); - goto bailout; - } - tASSERT(txn, dbi < txn->mt_env->me_numdbs && - (txn->mt_env->me_db_flags[dbi] & DB_VALID) != 0); - } +extern NTSTATUS NTAPI NtFreeVirtualMemory(IN HANDLE ProcessHandle, + IN PVOID *BaseAddress, + IN OUT PSIZE_T RegionSize, + IN ULONG FreeType); - const MDBX_db *const db = txn->mt_dbs + dbi; - if (handler) { - const char *key_mode = nullptr; - switch (sdb->flags & (MDBX_REVERSEKEY | MDBX_INTEGERKEY)) { - case 0: - key_mode = "usual"; - break; - case MDBX_REVERSEKEY: - key_mode = "reserve"; - break; - case MDBX_INTEGERKEY: - key_mode = "ordinal"; - break; - case MDBX_REVERSEKEY | MDBX_INTEGERKEY: - key_mode = "msgpack"; - break; - default: - key_mode = "inconsistent"; - chk_scope_issue(scope, "wrong key-mode (0x%x)", - sdb->flags & (MDBX_REVERSEKEY | MDBX_INTEGERKEY)); - } +#ifndef WOF_CURRENT_VERSION +typedef struct _WOF_EXTERNAL_INFO { + DWORD Version; + DWORD Provider; +} WOF_EXTERNAL_INFO, *PWOF_EXTERNAL_INFO; +#endif /* WOF_CURRENT_VERSION */ - const char *value_mode = nullptr; - switch (sdb->flags & 
(MDBX_DUPSORT | MDBX_REVERSEDUP | MDBX_DUPFIXED | - MDBX_INTEGERDUP)) { - case 0: - value_mode = "single"; - break; - case MDBX_DUPSORT: - value_mode = "multi"; - break; - case MDBX_DUPSORT | MDBX_REVERSEDUP: - value_mode = "multi-reverse"; - break; - case MDBX_DUPSORT | MDBX_DUPFIXED: - value_mode = "multi-samelength"; - break; - case MDBX_DUPSORT | MDBX_DUPFIXED | MDBX_REVERSEDUP: - value_mode = "multi-reverse-samelength"; - break; - case MDBX_DUPSORT | MDBX_DUPFIXED | MDBX_INTEGERDUP: - value_mode = "multi-ordinal"; - break; - case MDBX_DUPSORT | MDBX_INTEGERDUP | MDBX_REVERSEDUP: - value_mode = "multi-msgpack"; - break; - case MDBX_DUPSORT | MDBX_DUPFIXED | MDBX_INTEGERDUP | MDBX_REVERSEDUP: - value_mode = "reserved"; - break; - default: - value_mode = "inconsistent"; - chk_scope_issue(scope, "wrong value-mode (0x%x)", - sdb->flags & (MDBX_DUPSORT | MDBX_REVERSEDUP | - MDBX_DUPFIXED | MDBX_INTEGERDUP)); - } +#ifndef WIM_PROVIDER_CURRENT_VERSION +#define WIM_PROVIDER_HASH_SIZE 20 - MDBX_chk_line_t *line = chk_line_begin(scope, MDBX_chk_info); - line = chk_print(line, "key-value kind: %s-key => %s-value", key_mode, - value_mode); - line = chk_print(line, ", flags:"); - if (!sdb->flags) - line = chk_print(line, " none"); - else { - const uint8_t f[] = {MDBX_DUPSORT, - MDBX_INTEGERKEY, - MDBX_REVERSEKEY, - MDBX_DUPFIXED, - MDBX_REVERSEDUP, - MDBX_INTEGERDUP, - 0}; - const char *const t[] = {"dupsort", "integerkey", "reversekey", - "dupfixed", "reversedup", "integerdup"}; - for (size_t i = 0; f[i]; i++) - if (sdb->flags & f[i]) - line = chk_print(line, " %s", t[i]); - } - chk_line_end(chk_print(line, " (0x%02X)", sdb->flags)); +typedef struct _WIM_PROVIDER_EXTERNAL_INFO { + DWORD Version; + DWORD Flags; + LARGE_INTEGER DataSourceId; + BYTE ResourceHash[WIM_PROVIDER_HASH_SIZE]; +} WIM_PROVIDER_EXTERNAL_INFO, *PWIM_PROVIDER_EXTERNAL_INFO; +#endif /* WIM_PROVIDER_CURRENT_VERSION */ - line = chk_print(chk_line_begin(scope, MDBX_chk_verbose), - "entries %" PRIu64 ", 
sequence %" PRIu64, db->md_entries, - db->md_seq); - if (db->md_mod_txnid) - line = chk_print(line, ", last modification txn#%" PRIaTXN, - db->md_mod_txnid); - if (db->md_root != P_INVALID) - line = chk_print(line, ", root #%" PRIaPGNO, db->md_root); - chk_line_end(line); - chk_line_end(chk_print(chk_line_begin(scope, MDBX_chk_verbose), - "b-tree depth %u, pages: branch %" PRIaPGNO - ", leaf %" PRIaPGNO ", large %" PRIaPGNO, - db->md_depth, db->md_branch_pages, db->md_leaf_pages, - db->md_overflow_pages)); +#ifndef FILE_PROVIDER_CURRENT_VERSION +typedef struct _FILE_PROVIDER_EXTERNAL_INFO_V1 { + ULONG Version; + ULONG Algorithm; + ULONG Flags; +} FILE_PROVIDER_EXTERNAL_INFO_V1, *PFILE_PROVIDER_EXTERNAL_INFO_V1; +#endif /* FILE_PROVIDER_CURRENT_VERSION */ - if ((chk->flags & MDBX_CHK_SKIP_BTREE_TRAVERSAL) == 0) { - const size_t branch_pages = sdb->pages.branch + sdb->pages.nested_branch; - const size_t leaf_pages = sdb->pages.leaf + sdb->pages.nested_leaf; - const size_t subtotal_pages = - db->md_branch_pages + db->md_leaf_pages + db->md_overflow_pages; - if (subtotal_pages != sdb->pages.all) - chk_scope_issue( - scope, "%s pages mismatch (%" PRIuSIZE " != walked %" PRIuSIZE ")", - "subtotal", subtotal_pages, sdb->pages.all); - if (db->md_branch_pages != branch_pages) - chk_scope_issue( - scope, "%s pages mismatch (%" PRIaPGNO " != walked %" PRIuSIZE ")", - "branch", db->md_branch_pages, branch_pages); - if (db->md_leaf_pages != leaf_pages) - chk_scope_issue( - scope, "%s pages mismatch (%" PRIaPGNO " != walked %" PRIuSIZE ")", - "all-leaf", db->md_leaf_pages, leaf_pages); - if (db->md_overflow_pages != sdb->histogram.large_pages.amount) - chk_scope_issue( - scope, "%s pages mismatch (%" PRIaPGNO " != walked %" PRIuSIZE ")", - "large/overlow", db->md_overflow_pages, - sdb->histogram.large_pages.amount); - } - } +#ifndef STATUS_OBJECT_NOT_EXTERNALLY_BACKED +#define STATUS_OBJECT_NOT_EXTERNALLY_BACKED ((NTSTATUS)0xC000046DL) +#endif +#ifndef 
STATUS_INVALID_DEVICE_REQUEST +#define STATUS_INVALID_DEVICE_REQUEST ((NTSTATUS)0xC0000010L) +#endif +#ifndef STATUS_NOT_SUPPORTED +#define STATUS_NOT_SUPPORTED ((NTSTATUS)0xC00000BBL) +#endif - err = mdbx_cursor_open(txn, dbi, &cursor); - if (unlikely(err)) { - chk_error_rc(scope, err, "mdbx_cursor_open"); - goto bailout; - } - if (chk->flags & MDBX_CHK_IGNORE_ORDER) { - cursor->mc_checking |= CC_SKIPORD | CC_PAGECHECK; - if (cursor->mc_xcursor) - cursor->mc_xcursor->mx_cursor.mc_checking |= CC_SKIPORD | CC_PAGECHECK; - } +#ifndef FILE_DEVICE_FILE_SYSTEM +#define FILE_DEVICE_FILE_SYSTEM 0x00000009 +#endif - const size_t maxkeysize = mdbx_env_get_maxkeysize_ex(env, sdb->flags); - MDBX_val prev_key = {nullptr, 0}, prev_data = {nullptr, 0}; - MDBX_val key, data; - err = mdbx_cursor_get(cursor, &key, &data, MDBX_FIRST); - while (err == MDBX_SUCCESS) { - err = chk_check_break(scope); - if (unlikely(err)) - goto bailout; +#ifndef FSCTL_GET_EXTERNAL_BACKING +#define FSCTL_GET_EXTERNAL_BACKING \ + CTL_CODE(FILE_DEVICE_FILE_SYSTEM, 196, METHOD_BUFFERED, FILE_ANY_ACCESS) +#endif - bool bad_key = false; - if (key.iov_len > maxkeysize) { - chk_object_issue(scope, "entry", record_count, - "key length exceeds max-key-size", - "%" PRIuPTR " > %" PRIuPTR, key.iov_len, maxkeysize); - bad_key = true; - } else if ((sdb->flags & MDBX_INTEGERKEY) && key.iov_len != 8 && - key.iov_len != 4) { - chk_object_issue(scope, "entry", record_count, "wrong key length", - "%" PRIuPTR " != 4or8", key.iov_len); - bad_key = true; - } +#ifndef ERROR_NOT_CAPABLE +#define ERROR_NOT_CAPABLE 775L +#endif - bool bad_data = false; - if ((sdb->flags & MDBX_INTEGERDUP) && data.iov_len != 8 && - data.iov_len != 4) { - chk_object_issue(scope, "entry", record_count, "wrong data length", - "%" PRIuPTR " != 4or8", data.iov_len); - bad_data = true; - } +#endif /* _WIN32 || _WIN64 */ - if (prev_key.iov_base) { - if (prev_data.iov_base && !bad_data && (sdb->flags & MDBX_DUPFIXED) && - prev_data.iov_len != 
data.iov_len) { - chk_object_issue(scope, "entry", record_count, "different data length", - "%" PRIuPTR " != %" PRIuPTR, prev_data.iov_len, - data.iov_len); - bad_data = true; - } +/*----------------------------------------------------------------------------*/ - if (!bad_key) { - int cmp = mdbx_cmp(txn, dbi, &key, &prev_key); - if (cmp == 0) { - ++dups; - if ((sdb->flags & MDBX_DUPSORT) == 0) { - chk_object_issue(scope, "entry", record_count, "duplicated entries", - nullptr); - if (prev_data.iov_base && data.iov_len == prev_data.iov_len && - memcmp(data.iov_base, prev_data.iov_base, data.iov_len) == 0) - chk_object_issue(scope, "entry", record_count, - "complete duplicate", nullptr); - } else if (!bad_data && prev_data.iov_base) { - cmp = mdbx_dcmp(txn, dbi, &data, &prev_data); - if (cmp == 0) - chk_object_issue(scope, "entry", record_count, - "complete duplicate", nullptr); - else if (cmp < 0 && !(chk->flags & MDBX_CHK_IGNORE_ORDER)) - chk_object_issue(scope, "entry", record_count, - "wrong order of multi-values", nullptr); - } - } else if (cmp < 0 && !(chk->flags & MDBX_CHK_IGNORE_ORDER)) - chk_object_issue(scope, "entry", record_count, - "wrong order of entries", nullptr); - } - } +#if defined(__ANDROID_API__) +__extern_C void __assert2(const char *file, int line, const char *function, + const char *msg) __noreturn; +#define __assert_fail(assertion, file, line, function) \ + __assert2(file, line, function, assertion) - if (!bad_key) { - if (!prev_key.iov_base && (sdb->flags & MDBX_INTEGERKEY)) - chk_line_end(chk_print(chk_line_begin(scope, MDBX_chk_info), - "fixed key-size %" PRIuSIZE, key.iov_len)); - prev_key = key; - } - if (!bad_data) { - if (!prev_data.iov_base && - (sdb->flags & (MDBX_INTEGERDUP | MDBX_DUPFIXED))) - chk_line_end(chk_print(chk_line_begin(scope, MDBX_chk_info), - "fixed data-size %" PRIuSIZE, data.iov_len)); - prev_data = data; - } +#elif defined(__UCLIBC__) +__extern_C void __assert(const char *, const char *, unsigned, const char *) 
+#ifdef __THROW + __THROW +#else + __nothrow +#endif /* __THROW */ + MDBX_NORETURN; +#define __assert_fail(assertion, file, line, function) \ + __assert(assertion, file, line, function) - record_count++; - histogram_acc(key.iov_len, &sdb->histogram.key_len); - histogram_acc(data.iov_len, &sdb->histogram.val_len); +#elif _POSIX_C_SOURCE > 200212 && \ + /* workaround for avoid musl libc wrong prototype */ ( \ + defined(__GLIBC__) || defined(__GNU_LIBRARY__)) +/* Prototype should match libc runtime. ISO POSIX (2003) & LSB 1.x-3.x */ +__extern_C void __assert_fail(const char *assertion, const char *file, + unsigned line, const char *function) +#ifdef __THROW + __THROW +#else + __nothrow +#endif /* __THROW */ + MDBX_NORETURN; - const MDBX_node *const node = - page_node(cursor->mc_pg[cursor->mc_top], cursor->mc_ki[cursor->mc_top]); - if (node_flags(node) == F_SUBDATA) { - if (dbi != MAIN_DBI || (sdb->flags & (MDBX_DUPSORT | MDBX_DUPFIXED | - MDBX_REVERSEDUP | MDBX_INTEGERDUP))) - chk_object_issue(scope, "entry", record_count, - "unexpected sub-database", "node-flags 0x%x", - node_flags(node)); - else if (data.iov_len != sizeof(MDBX_db)) - chk_object_issue(scope, "entry", record_count, - "wrong sub-database node size", - "node-size %" PRIuSIZE " != %" PRIuSIZE, data.iov_len, - sizeof(MDBX_db)); - else if (scope->stage == MDBX_chk_traversal_maindb) - /* подсчитываем subDB при первом проходе */ - sub_databases += 1; - else { - /* обработка subDB при втором проходе */ - MDBX_db aligned_db; - memcpy(&aligned_db, data.iov_base, sizeof(aligned_db)); - MDBX_walk_sdb_t sdb_info = {key, nullptr, nullptr}; - sdb_info.internal = &aligned_db; - MDBX_chk_subdb_t *subdb; - err = chk_get_sdb(scope, &sdb_info, &subdb); - if (unlikely(err)) - goto bailout; - if (subdb->cookie) { - err = chk_scope_begin(chk, 0, MDBX_chk_traversal_subdbs, subdb, - &usr->result.problems_kv, - "Processing subDB %s...", - chk_v2a(chk, &subdb->name)); - if (likely(!err)) { - err = chk_db(usr->scope, 
(MDBX_dbi)-1, subdb, chk_handle_kv); - if (err != MDBX_EINTR && err != MDBX_RESULT_TRUE) - usr->result.subdb_processed += 1; - } - err = chk_scope_restore(scope, err); - if (unlikely(err)) - goto bailout; - } else - chk_line_end(chk_flush( - chk_print(chk_line_begin(scope, MDBX_chk_processing), - "Skip processing %s...", chk_v2a(chk, &subdb->name)))); - } - } else if (handler) { - err = handler(scope, sdb, record_count, &key, &data); - if (unlikely(err)) - goto bailout; - } +#elif defined(__APPLE__) || defined(__MACH__) +__extern_C void __assert_rtn(const char *function, const char *file, int line, + const char *assertion) /* __nothrow */ +#ifdef __dead2 + __dead2 +#else + MDBX_NORETURN +#endif /* __dead2 */ +#ifdef __disable_tail_calls + __disable_tail_calls +#endif /* __disable_tail_calls */ + ; - err = mdbx_cursor_get(cursor, &key, &data, MDBX_NEXT); - } +#define __assert_fail(assertion, file, line, function) \ + __assert_rtn(function, file, line, assertion) +#elif defined(__sun) || defined(__SVR4) || defined(__svr4__) +__extern_C void __assert_c99(const char *assection, const char *file, int line, + const char *function) MDBX_NORETURN; +#define __assert_fail(assertion, file, line, function) \ + __assert_c99(assertion, file, line, function) +#elif defined(__OpenBSD__) +__extern_C __dead void __assert2(const char *file, int line, + const char *function, + const char *assertion) /* __nothrow */; +#define __assert_fail(assertion, file, line, function) \ + __assert2(file, line, function, assertion) +#elif defined(__NetBSD__) +__extern_C __dead void __assert13(const char *file, int line, + const char *function, + const char *assertion) /* __nothrow */; +#define __assert_fail(assertion, file, line, function) \ + __assert13(file, line, function, assertion) +#elif defined(__FreeBSD__) || defined(__BSD__) || defined(__bsdi__) || \ + defined(__DragonFly__) +__extern_C void __assert(const char *function, const char *file, int line, + const char *assertion) /* __nothrow */ 
+#ifdef __dead2 + __dead2 +#else + MDBX_NORETURN +#endif /* __dead2 */ +#ifdef __disable_tail_calls + __disable_tail_calls +#endif /* __disable_tail_calls */ + ; +#define __assert_fail(assertion, file, line, function) \ + __assert(function, file, line, assertion) - err = (err != MDBX_NOTFOUND) ? chk_error_rc(scope, err, "mdbx_cursor_get") - : MDBX_SUCCESS; - if (err == MDBX_SUCCESS && record_count != db->md_entries) - chk_scope_issue(scope, - "different number of entries %" PRIuSIZE " != %" PRIu64, - record_count, db->md_entries); -bailout: - if (cursor) { - if (handler) { - if (sdb->histogram.key_len.count) { - MDBX_chk_line_t *line = chk_line_begin(scope, MDBX_chk_info); - line = histogram_dist(line, &sdb->histogram.key_len, - "key length density", "0/1", false); - chk_line_feed(line); - line = histogram_dist(line, &sdb->histogram.val_len, - "value length density", "0/1", false); - chk_line_end(line); - } - if (scope->stage == MDBX_chk_traversal_maindb) - usr->result.subdb_total = sub_databases; - if (chk->cb->subdb_conclude) - err = chk->cb->subdb_conclude(usr, sdb, cursor, err); - MDBX_chk_line_t *line = chk_line_begin(scope, MDBX_chk_resolution); - line = chk_print(line, "summary: %" PRIuSIZE " records,", record_count); - if (dups || (sdb->flags & (MDBX_DUPSORT | MDBX_DUPFIXED | - MDBX_REVERSEDUP | MDBX_INTEGERDUP))) - line = chk_print(line, " %" PRIuSIZE " dups,", dups); - if (sub_databases || dbi == MAIN_DBI) - line = chk_print(line, " %" PRIuSIZE " sub-databases,", sub_databases); - line = chk_print(line, - " %" PRIuSIZE " key's bytes," - " %" PRIuSIZE " data's bytes," - " %" PRIuSIZE " problem(s)", - sdb->histogram.key_len.amount, - sdb->histogram.val_len.amount, scope->subtotal_issues); - chk_line_end(chk_flush(line)); - } +#endif /* __assert_fail */ - mdbx_cursor_close(cursor); - if (!txn->mt_cursors[dbi] && (txn->mt_dbi_state[dbi] & DBI_FRESH)) - mdbx_dbi_close(env, dbi); - } - return err; +__cold void mdbx_assert_fail(const MDBX_env *env, const char 
*msg, + const char *func, unsigned line) { +#if MDBX_DEBUG + if (env && env->assert_func) + env->assert_func(env, msg, func, line); +#else + (void)env; + assert_fail(msg, func, line); } -__cold static int chk_handle_gc(MDBX_chk_scope_t *const scope, - MDBX_chk_subdb_t *sdb, - const size_t record_number, const MDBX_val *key, - const MDBX_val *data) { - MDBX_chk_internal_t *const chk = scope->internal; - MDBX_chk_context_t *const usr = chk->usr; - assert(sdb == &chk->subdb_gc); - (void)sdb; - const char *bad = ""; - pgno_t *iptr = data->iov_base; +MDBX_NORETURN __cold void assert_fail(const char *msg, const char *func, + unsigned line) { +#endif /* MDBX_DEBUG */ - if (key->iov_len != sizeof(txnid_t)) - chk_object_issue(scope, "entry", record_number, "wrong txn-id size", - "key-size %" PRIuSIZE, key->iov_len); + if (globals.logger.ptr) + debug_log(MDBX_LOG_FATAL, func, line, "assert: %s\n", msg); else { - txnid_t txnid; - memcpy(&txnid, key->iov_base, sizeof(txnid)); - if (txnid < 1 || txnid > usr->txn->mt_txnid) - chk_object_issue(scope, "entry", record_number, "wrong txn-id", - "%" PRIaTXN, txnid); - else { - if (data->iov_len < sizeof(pgno_t) || data->iov_len % sizeof(pgno_t)) - chk_object_issue(scope, "entry", txnid, "wrong idl size", "%" PRIuPTR, - data->iov_len); - size_t number = (data->iov_len >= sizeof(pgno_t)) ? *iptr++ : 0; - if (number > MDBX_PGL_LIMIT) - chk_object_issue(scope, "entry", txnid, "wrong idl length", "%" PRIuPTR, - number); - else if ((number + 1) * sizeof(pgno_t) > data->iov_len) { - chk_object_issue(scope, "entry", txnid, "trimmed idl", - "%" PRIuSIZE " > %" PRIuSIZE " (corruption)", - (number + 1) * sizeof(pgno_t), data->iov_len); - number = data->iov_len / sizeof(pgno_t) - 1; - } else if (data->iov_len - (number + 1) * sizeof(pgno_t) >= - /* LY: allow gap up to one page. 
it is ok - * and better than shink-and-retry inside update_gc() */ - usr->env->me_psize) - chk_object_issue(scope, "entry", txnid, "extra idl space", - "%" PRIuSIZE " < %" PRIuSIZE " (minor, not a trouble)", - (number + 1) * sizeof(pgno_t), data->iov_len); - - usr->result.gc_pages += number; - if (chk->envinfo.mi_latter_reader_txnid > txnid) - usr->result.reclaimable_pages += number; +#if defined(_WIN32) || defined(_WIN64) + char *message = nullptr; + const int num = osal_asprintf(&message, "\r\nMDBX-ASSERTION: %s, %s:%u", + msg, func ? func : "unknown", line); + if (num < 1 || !message) + message = ""; + OutputDebugStringA(message); +#else + __assert_fail(msg, "mdbx", line, func); +#endif + } - size_t prev = MDBX_PNL_ASCENDING ? NUM_METAS - 1 : usr->txn->mt_next_pgno; - size_t span = 1; - for (size_t i = 0; i < number; ++i) { - const size_t pgno = iptr[i]; - if (pgno < NUM_METAS) - chk_object_issue(scope, "entry", txnid, "wrong idl entry", - "pgno %" PRIuSIZE " < meta-pages %u", pgno, - NUM_METAS); - else if (pgno >= usr->result.backed_pages) - chk_object_issue(scope, "entry", txnid, "wrong idl entry", - "pgno %" PRIuSIZE " > backed-pages %" PRIuSIZE, pgno, - usr->result.backed_pages); - else if (pgno >= usr->result.alloc_pages) - chk_object_issue(scope, "entry", txnid, "wrong idl entry", - "pgno %" PRIuSIZE " > alloc-pages %" PRIuSIZE, pgno, - usr->result.alloc_pages - 1); - else { - if (MDBX_PNL_DISORDERED(prev, pgno)) { - bad = " [bad sequence]"; - chk_object_issue( - scope, "entry", txnid, "bad sequence", - "%" PRIuSIZE " %c [%" PRIuSIZE "].%" PRIuSIZE, prev, - (prev == pgno) ? '=' : (MDBX_PNL_ASCENDING ? 
'>' : '<'), i, - pgno); - } - if (chk->pagemap) { - const intptr_t id = chk->pagemap[pgno]; - if (id == 0) - chk->pagemap[pgno] = -1 /* mark the pgno listed in GC */; - else if (id > 0) { - assert(id - 1 <= (intptr_t)ARRAY_LENGTH(chk->subdb)); - chk_object_issue(scope, "page", pgno, "already used", "by %s", - chk_v2a(chk, &chk->subdb[id - 1]->name)); - } else - chk_object_issue(scope, "page", pgno, "already listed in GC", - nullptr); - } - } - prev = pgno; - while (i + span < number && - iptr[i + span] == (MDBX_PNL_ASCENDING ? pgno_add(pgno, span) - : pgno_sub(pgno, span))) - ++span; - } - if (sdb->cookie) { - chk_line_end(chk_print(chk_line_begin(scope, MDBX_chk_details), - "transaction %" PRIaTXN ", %" PRIuSIZE - " pages, maxspan %" PRIuSIZE "%s", - txnid, number, span, bad)); - for (size_t i = 0; i < number; i += span) { - const size_t pgno = iptr[i]; - for (span = 1; - i + span < number && - iptr[i + span] == (MDBX_PNL_ASCENDING ? pgno_add(pgno, span) - : pgno_sub(pgno, span)); - ++span) - ; - histogram_acc(span, &sdb->histogram.nested_tree); - MDBX_chk_line_t *line = chk_line_begin(scope, MDBX_chk_extra); - if (line) { - if (span > 1) - line = - chk_print(line, "%9" PRIuSIZE "[%" PRIuSIZE "]", pgno, span); - else - line = chk_print(line, "%9" PRIuSIZE, pgno); - chk_line_end(line); - int err = chk_check_break(scope); - if (err) - return err; - } - } - } - } + while (1) { +#if defined(_WIN32) || defined(_WIN64) +#if !MDBX_WITHOUT_MSVC_CRT && defined(_DEBUG) + _CrtDbgReport(_CRT_ASSERT, func ? 
func : "unknown", line, "libmdbx", + "assertion failed: %s", msg); +#else + if (IsDebuggerPresent()) + DebugBreak(); +#endif + FatalExit(STATUS_ASSERTION_FAILURE); +#else + abort(); +#endif } - return chk_check_break(scope); } -__cold static int env_chk(MDBX_chk_scope_t *const scope) { - MDBX_chk_internal_t *const chk = scope->internal; - MDBX_chk_context_t *const usr = chk->usr; - MDBX_env *const env = usr->env; - MDBX_txn *const txn = usr->txn; - int err = - env_info(env, txn, &chk->envinfo, sizeof(chk->envinfo), &chk->troika); - if (unlikely(err)) - return chk_error_rc(scope, err, "env_info"); +__cold void mdbx_panic(const char *fmt, ...) { + va_list ap; + va_start(ap, fmt); - MDBX_chk_line_t *line = - chk_puts(chk_line_begin(scope, MDBX_chk_info), "current boot-id "); - if (chk->envinfo.mi_bootid.current.x | chk->envinfo.mi_bootid.current.y) - line = chk_print(line, "%016" PRIx64 "-%016" PRIx64, - chk->envinfo.mi_bootid.current.x, - chk->envinfo.mi_bootid.current.y); - else - line = chk_puts(line, "unavailable"); - chk_line_end(line); + char *message = nullptr; + const int num = osal_vasprintf(&message, fmt, ap); + va_end(ap); + const char *const const_message = + unlikely(num < 1 || !message) + ? 
"" + : message; - err = osal_filesize(env->me_lazy_fd, &env->me_dxb_mmap.filesize); - if (unlikely(err)) - return chk_error_rc(scope, err, "osal_filesize"); + if (globals.logger.ptr) + debug_log(MDBX_LOG_FATAL, "panic", 0, "%s", const_message); - //-------------------------------------------------------------------------- + while (1) { +#if defined(_WIN32) || defined(_WIN64) +#if !MDBX_WITHOUT_MSVC_CRT && defined(_DEBUG) + _CrtDbgReport(_CRT_ASSERT, "mdbx.c", 0, "libmdbx", "panic: %s", + const_message); +#else + OutputDebugStringA("\r\nMDBX-PANIC: "); + OutputDebugStringA(const_message); + if (IsDebuggerPresent()) + DebugBreak(); +#endif + FatalExit(ERROR_UNHANDLED_ERROR); +#else + __assert_fail(const_message, "mdbx", 0, "panic"); + abort(); +#endif + } +} - err = chk_scope_begin(chk, 1, MDBX_chk_meta, nullptr, - &usr->result.problems_meta, "Peek the meta-pages..."); - if (likely(!err)) { - MDBX_chk_scope_t *const inner = usr->scope; - const uint64_t dxbfile_pages = - env->me_dxb_mmap.filesize >> env->me_psize2log; - usr->result.alloc_pages = txn->mt_next_pgno; - usr->result.backed_pages = bytes2pgno(env, env->me_dxb_mmap.current); - if (unlikely(usr->result.backed_pages > dxbfile_pages)) - chk_scope_issue(inner, "backed-pages %zu > file-pages %" PRIu64, - usr->result.backed_pages, dxbfile_pages); - if (unlikely(dxbfile_pages < NUM_METAS)) - chk_scope_issue(inner, "file-pages %" PRIu64 " < %u", dxbfile_pages, - NUM_METAS); - if (unlikely(usr->result.backed_pages < NUM_METAS)) - chk_scope_issue(inner, "backed-pages %zu < %u", usr->result.backed_pages, - NUM_METAS); - if (unlikely(usr->result.backed_pages < NUM_METAS)) { - chk_scope_issue(inner, "backed-pages %zu < num-metas %u", - usr->result.backed_pages, NUM_METAS); - return MDBX_CORRUPTED; - } - if (unlikely(dxbfile_pages < NUM_METAS)) { - chk_scope_issue(inner, "backed-pages %zu < num-metas %u", - usr->result.backed_pages, NUM_METAS); - return MDBX_CORRUPTED; - } - if (unlikely(usr->result.backed_pages > 
(size_t)MAX_PAGENO + 1)) { - chk_scope_issue(inner, "backed-pages %zu > max-pages %zu", - usr->result.backed_pages, (size_t)MAX_PAGENO + 1); - usr->result.backed_pages = MAX_PAGENO + 1; - } +/*----------------------------------------------------------------------------*/ - if ((env->me_flags & (MDBX_EXCLUSIVE | MDBX_RDONLY)) != MDBX_RDONLY) { - if (unlikely(usr->result.backed_pages > dxbfile_pages)) { - chk_scope_issue(inner, "backed-pages %zu > file-pages %" PRIu64, - usr->result.backed_pages, dxbfile_pages); - usr->result.backed_pages = (size_t)dxbfile_pages; - } - if (unlikely(usr->result.alloc_pages > usr->result.backed_pages)) { - chk_scope_issue(scope, "alloc-pages %zu > backed-pages %zu", - usr->result.alloc_pages, usr->result.backed_pages); - usr->result.alloc_pages = usr->result.backed_pages; - } - } else { - /* DB may be shrunk by writer down to the allocated (but unused) pages. */ - if (unlikely(usr->result.alloc_pages > usr->result.backed_pages)) { - chk_scope_issue(inner, "alloc-pages %zu > backed-pages %zu", - usr->result.alloc_pages, usr->result.backed_pages); - usr->result.alloc_pages = usr->result.backed_pages; - } - if (unlikely(usr->result.alloc_pages > dxbfile_pages)) { - chk_scope_issue(inner, "alloc-pages %zu > file-pages %" PRIu64, - usr->result.alloc_pages, dxbfile_pages); - usr->result.alloc_pages = (size_t)dxbfile_pages; - } - if (unlikely(usr->result.backed_pages > dxbfile_pages)) - usr->result.backed_pages = (size_t)dxbfile_pages; - } +#ifndef osal_vasprintf +MDBX_INTERNAL int osal_vasprintf(char **strp, const char *fmt, va_list ap) { + va_list ones; + va_copy(ones, ap); + const int needed = vsnprintf(nullptr, 0, fmt, ones); + va_end(ones); - line = chk_line_feed(chk_print( - chk_line_begin(inner, MDBX_chk_info), - "pagesize %u (%u system), max keysize %u..%u" - ", max readers %u", - env->me_psize, env->me_os_psize, - mdbx_env_get_maxkeysize_ex(env, MDBX_DUPSORT), - mdbx_env_get_maxkeysize_ex(env, MDBX_DB_DEFAULTS), env->me_maxreaders)); 
- line = chk_line_feed( - chk_print_size(line, "mapsize ", env->me_dxb_mmap.current, nullptr)); - if (txn->mt_geo.lower == txn->mt_geo.upper) - line = chk_print_size( - line, "fixed datafile: ", chk->envinfo.mi_geo.current, nullptr); - else { - line = chk_print_size( - line, "dynamic datafile: ", chk->envinfo.mi_geo.lower, nullptr); - line = chk_print_size(line, " .. ", chk->envinfo.mi_geo.upper, ", "); - line = chk_print_size(line, "+", chk->envinfo.mi_geo.grow, ", "); + if (unlikely(needed < 0 || needed >= INT_MAX)) { + *strp = nullptr; + return needed; + } - line = chk_line_feed( - chk_print_size(line, "-", chk->envinfo.mi_geo.shrink, nullptr)); - line = chk_print_size( - line, "current datafile: ", chk->envinfo.mi_geo.current, nullptr); - } - tASSERT(txn, txn->mt_geo.now == chk->envinfo.mi_geo.current / - chk->envinfo.mi_dxb_pagesize); - chk_line_end(chk_print(line, ", %u pages", txn->mt_geo.now)); -#if defined(_WIN32) || defined(_WIN64) || MDBX_DEBUG - if (txn->mt_geo.shrink_pv && txn->mt_geo.now != txn->mt_geo.upper && - scope->verbosity >= MDBX_chk_verbose) { - line = chk_line_begin(inner, MDBX_chk_notice); - chk_line_feed(chk_print( - line, " > WARNING: Due Windows system limitations a file couldn't")); - chk_line_feed(chk_print( - line, " > be truncated while the database is opened. 
So, the size")); - chk_line_feed(chk_print( - line, " > database file of may by large than the database itself,")); - chk_line_end(chk_print( - line, " > until it will be closed or reopened in read-write mode.")); - } -#endif /* Windows || Debug */ - chk_verbose_meta(inner, 0); - chk_verbose_meta(inner, 1); - chk_verbose_meta(inner, 2); - - if (env->me_stuck_meta >= 0) { - chk_line_end(chk_print(chk_line_begin(inner, MDBX_chk_processing), - "skip checking meta-pages since the %u" - " is selected for verification", - env->me_stuck_meta)); - line = chk_line_feed( - chk_print(chk_line_begin(inner, MDBX_chk_resolution), - "transactions: recent %" PRIu64 ", " - "selected for verification %" PRIu64 ", lag %" PRIi64, - chk->envinfo.mi_recent_txnid, - chk->envinfo.mi_meta_txnid[env->me_stuck_meta], - chk->envinfo.mi_recent_txnid - - chk->envinfo.mi_meta_txnid[env->me_stuck_meta])); - chk_line_end(line); - } else { - chk_line_end(chk_puts(chk_line_begin(inner, MDBX_chk_verbose), - "performs check for meta-pages clashes")); - const unsigned meta_clash_mask = meta_eq_mask(&chk->troika); - if (meta_clash_mask & 1) - chk_scope_issue(inner, "meta-%d and meta-%d are clashed", 0, 1); - if (meta_clash_mask & 2) - chk_scope_issue(inner, "meta-%d and meta-%d are clashed", 1, 2); - if (meta_clash_mask & 4) - chk_scope_issue(inner, "meta-%d and meta-%d are clashed", 2, 0); - - const unsigned prefer_steady_metanum = chk->troika.prefer_steady; - const uint64_t prefer_steady_txnid = - chk->troika.txnid[prefer_steady_metanum]; - const unsigned recent_metanum = chk->troika.recent; - const uint64_t recent_txnid = chk->troika.txnid[recent_metanum]; - if (env->me_flags & MDBX_EXCLUSIVE) { - chk_line_end( - chk_puts(chk_line_begin(inner, MDBX_chk_verbose), - "performs full check recent-txn-id with meta-pages")); - eASSERT(env, recent_txnid == chk->envinfo.mi_recent_txnid); - if (prefer_steady_txnid != recent_txnid) { - if ((chk->flags & MDBX_CHK_READWRITE) != 0 && - (env->me_flags & 
MDBX_RDONLY) == 0 && - recent_txnid > prefer_steady_txnid && - (chk->envinfo.mi_bootid.current.x | - chk->envinfo.mi_bootid.current.y) != 0 && - chk->envinfo.mi_bootid.current.x == - chk->envinfo.mi_bootid.meta[recent_metanum].x && - chk->envinfo.mi_bootid.current.y == - chk->envinfo.mi_bootid.meta[recent_metanum].y) { - chk_line_end( - chk_print(chk_line_begin(inner, MDBX_chk_verbose), - "recent meta-%u is weak, but boot-id match current" - " (will synced upon successful check)", - recent_metanum)); - } else - chk_scope_issue( - inner, - "steady meta-%d txn-id mismatch recent-txn-id (%" PRIi64 - " != %" PRIi64 ")", - prefer_steady_metanum, prefer_steady_txnid, recent_txnid); - } - } else if (chk->write_locked) { - chk_line_end( - chk_puts(chk_line_begin(inner, MDBX_chk_verbose), - "performs lite check recent-txn-id with meta-pages (not a " - "monopolistic mode)")); - if (recent_txnid != chk->envinfo.mi_recent_txnid) { - chk_scope_issue(inner, - "weak meta-%d txn-id mismatch recent-txn-id (%" PRIi64 - " != %" PRIi64 ")", - recent_metanum, recent_txnid, - chk->envinfo.mi_recent_txnid); - } - } else { - chk_line_end(chk_puts( - chk_line_begin(inner, MDBX_chk_verbose), - "skip check recent-txn-id with meta-pages (monopolistic or " - "read-write mode only)")); - } - - chk_line_end(chk_print( - chk_line_begin(inner, MDBX_chk_resolution), - "transactions: recent %" PRIu64 ", latter reader %" PRIu64 - ", lag %" PRIi64, - chk->envinfo.mi_recent_txnid, chk->envinfo.mi_latter_reader_txnid, - chk->envinfo.mi_recent_txnid - chk->envinfo.mi_latter_reader_txnid)); - } + *strp = osal_malloc(needed + (size_t)1); + if (unlikely(*strp == nullptr)) { +#if defined(_WIN32) || defined(_WIN64) + SetLastError(MDBX_ENOMEM); +#else + errno = MDBX_ENOMEM; +#endif + return -1; } - err = chk_scope_restore(scope, err); - - //-------------------------------------------------------------------------- - if (chk->flags & MDBX_CHK_SKIP_BTREE_TRAVERSAL) - chk_line_end(chk_print(chk_line_begin(scope, 
MDBX_chk_processing), - "Skipping %s traversal...", "b-tree")); - else { - err = chk_scope_begin( - chk, -1, MDBX_chk_traversal_tree, nullptr, &usr->result.tree_problems, - "Traversal %s by txn#%" PRIaTXN "...", "b-tree", txn->mt_txnid); - if (likely(!err)) - err = chk_tree(usr->scope); - if (usr->result.tree_problems && usr->result.gc_tree_problems == 0) - usr->result.gc_tree_problems = usr->result.tree_problems; - if (usr->result.tree_problems && usr->result.kv_tree_problems == 0) - usr->result.kv_tree_problems = usr->result.tree_problems; - chk_scope_restore(scope, err); + const int actual = vsnprintf(*strp, needed + (size_t)1, fmt, ap); + assert(actual == needed); + if (unlikely(actual < 0)) { + osal_free(*strp); + *strp = nullptr; } + return actual; +} +#endif /* osal_vasprintf */ - if (usr->result.gc_tree_problems > 0) - chk_line_end(chk_print( - chk_line_begin(scope, MDBX_chk_processing), - "Skip processing %s since %s is corrupted (%" PRIuSIZE " problem(s))", - chk_v2a(chk, MDBX_CHK_GC), "b-tree", - usr->result.problems_gc = usr->result.gc_tree_problems)); - else { - err = chk_scope_begin(chk, -1, MDBX_chk_traversal_freedb, &chk->subdb_gc, - &usr->result.problems_gc, - "Traversal %s by txn#%" PRIaTXN "...", "GC/freeDB", - txn->mt_txnid); - if (likely(!err)) - err = chk_db(usr->scope, FREE_DBI, &chk->subdb_gc, chk_handle_gc); - line = chk_line_begin(scope, MDBX_chk_info); - if (line) { - histogram_print(scope, line, &chk->subdb_gc.histogram.nested_tree, - "span(s)", "single", false); - chk_line_end(line); - } - if (usr->result.problems_gc == 0 && - (chk->flags & MDBX_CHK_SKIP_BTREE_TRAVERSAL) == 0) { - const size_t used_pages = usr->result.alloc_pages - usr->result.gc_pages; - if (usr->result.processed_pages != used_pages) - chk_scope_issue(usr->scope, - "used pages mismatch (%" PRIuSIZE - "(walked) != %" PRIuSIZE "(allocated - GC))", - usr->result.processed_pages, used_pages); - if (usr->result.unused_pages != usr->result.gc_pages) - 
chk_scope_issue(usr->scope, - "GC pages mismatch (%" PRIuSIZE - "(expected) != %" PRIuSIZE "(GC))", - usr->result.unused_pages, usr->result.gc_pages); - } - } - chk_scope_restore(scope, err); +#ifndef osal_asprintf +MDBX_INTERNAL int osal_asprintf(char **strp, const char *fmt, ...) { + va_list ap; + va_start(ap, fmt); + const int rc = osal_vasprintf(strp, fmt, ap); + va_end(ap); + return rc; +} +#endif /* osal_asprintf */ - //-------------------------------------------------------------------------- +#ifndef osal_memalign_alloc +MDBX_INTERNAL int osal_memalign_alloc(size_t alignment, size_t bytes, + void **result) { + assert(is_powerof2(alignment) && alignment >= sizeof(void *)); +#if defined(_WIN32) || defined(_WIN64) + (void)alignment; + *result = + VirtualAlloc(nullptr, bytes, MEM_COMMIT | MEM_RESERVE, PAGE_READWRITE); + return *result ? MDBX_SUCCESS : MDBX_ENOMEM /* ERROR_OUTOFMEMORY */; +#elif defined(_ISOC11_SOURCE) + *result = aligned_alloc(alignment, ceil_powerof2(bytes, alignment)); + return *result ? MDBX_SUCCESS : errno; +#elif _POSIX_VERSION >= 200112L && \ + (!defined(__ANDROID_API__) || __ANDROID_API__ >= 17) + *result = nullptr; + return posix_memalign(result, alignment, bytes); +#elif __GLIBC_PREREQ(2, 16) || __STDC_VERSION__ >= 201112L + *result = memalign(alignment, bytes); + return *result ? 
MDBX_SUCCESS : errno; +#else +#error FIXME +#endif +} +#endif /* osal_memalign_alloc */ - err = chk_scope_begin(chk, 1, MDBX_chk_space, nullptr, nullptr, - "Page allocation:"); - const double percent_boundary_reciprocal = 100.0 / txn->mt_geo.upper; - const double percent_backed_reciprocal = 100.0 / usr->result.backed_pages; - const size_t detained = usr->result.gc_pages - usr->result.reclaimable_pages; - const size_t available2boundary = txn->mt_geo.upper - - usr->result.alloc_pages + - usr->result.reclaimable_pages; - const size_t available2backed = usr->result.backed_pages - - usr->result.alloc_pages + - usr->result.reclaimable_pages; - const size_t remained2boundary = txn->mt_geo.upper - usr->result.alloc_pages; - const size_t remained2backed = - usr->result.backed_pages - usr->result.alloc_pages; +#ifndef osal_memalign_free +MDBX_INTERNAL void osal_memalign_free(void *ptr) { +#if defined(_WIN32) || defined(_WIN64) + VirtualFree(ptr, 0, MEM_RELEASE); +#else + osal_free(ptr); +#endif +} +#endif /* osal_memalign_free */ - const size_t used = (chk->flags & MDBX_CHK_SKIP_BTREE_TRAVERSAL) - ? 
usr->result.alloc_pages - usr->result.gc_pages - : usr->result.processed_pages; +#ifndef osal_strdup +char *osal_strdup(const char *str) { + if (!str) + return nullptr; + size_t bytes = strlen(str) + 1; + char *dup = osal_malloc(bytes); + if (dup) + memcpy(dup, str, bytes); + return dup; +} +#endif /* osal_strdup */ - line = chk_line_begin(usr->scope, MDBX_chk_info); - line = chk_print(line, - "backed by file: %" PRIuSIZE " pages (%.1f%%)" - ", %" PRIuSIZE " left to boundary (%.1f%%)", - usr->result.backed_pages, - usr->result.backed_pages * percent_boundary_reciprocal, - txn->mt_geo.upper - usr->result.backed_pages, - (txn->mt_geo.upper - usr->result.backed_pages) * - percent_boundary_reciprocal); - line = chk_line_feed(line); +/*----------------------------------------------------------------------------*/ - line = chk_print( - line, "%s: %" PRIuSIZE " page(s), %.1f%% of backed, %.1f%% of boundary", - "used", used, used * percent_backed_reciprocal, - used * percent_boundary_reciprocal); - line = chk_line_feed(line); +MDBX_INTERNAL int osal_condpair_init(osal_condpair_t *condpair) { + int rc; + memset(condpair, 0, sizeof(osal_condpair_t)); +#if defined(_WIN32) || defined(_WIN64) + if (!(condpair->mutex = CreateMutexW(nullptr, FALSE, nullptr))) { + rc = (int)GetLastError(); + goto bailout_mutex; + } + if (!(condpair->event[0] = CreateEventW(nullptr, FALSE, FALSE, nullptr))) { + rc = (int)GetLastError(); + goto bailout_event; + } + if ((condpair->event[1] = CreateEventW(nullptr, FALSE, FALSE, nullptr))) + return MDBX_SUCCESS; - line = chk_print( - line, - "%s: %" PRIuSIZE " page(s) (%.1f%%) of backed, %" PRIuSIZE - " to boundary (%.1f%% of boundary)", - "remained", remained2backed, remained2backed * percent_backed_reciprocal, - remained2boundary, remained2boundary * percent_boundary_reciprocal); - line = chk_line_feed(line); + rc = (int)GetLastError(); + (void)CloseHandle(condpair->event[0]); +bailout_event: + (void)CloseHandle(condpair->mutex); +#else + rc = 
pthread_mutex_init(&condpair->mutex, nullptr); + if (unlikely(rc != 0)) + goto bailout_mutex; + rc = pthread_cond_init(&condpair->cond[0], nullptr); + if (unlikely(rc != 0)) + goto bailout_cond; + rc = pthread_cond_init(&condpair->cond[1], nullptr); + if (likely(rc == 0)) + return MDBX_SUCCESS; - line = chk_print( - line, - "reclaimable: %" PRIuSIZE " (%.1f%% of backed, %.1f%% of boundary)" - ", GC %" PRIuSIZE " (%.1f%% of backed, %.1f%% of boundary)", - usr->result.reclaimable_pages, - usr->result.reclaimable_pages * percent_backed_reciprocal, - usr->result.reclaimable_pages * percent_boundary_reciprocal, - usr->result.gc_pages, usr->result.gc_pages * percent_backed_reciprocal, - usr->result.gc_pages * percent_boundary_reciprocal); - line = chk_line_feed(line); + (void)pthread_cond_destroy(&condpair->cond[0]); +bailout_cond: + (void)pthread_mutex_destroy(&condpair->mutex); +#endif +bailout_mutex: + memset(condpair, 0, sizeof(osal_condpair_t)); + return rc; +} - line = chk_print( - line, - "detained by reader(s): %" PRIuSIZE - " (%.1f%% of backed, %.1f%% of boundary)" - ", %u reader(s), lag %" PRIi64, - detained, detained * percent_backed_reciprocal, - detained * percent_boundary_reciprocal, chk->envinfo.mi_numreaders, - chk->envinfo.mi_recent_txnid - chk->envinfo.mi_latter_reader_txnid); - line = chk_line_feed(line); +MDBX_INTERNAL int osal_condpair_destroy(osal_condpair_t *condpair) { +#if defined(_WIN32) || defined(_WIN64) + int rc = CloseHandle(condpair->mutex) ? MDBX_SUCCESS : (int)GetLastError(); + rc = CloseHandle(condpair->event[0]) ? rc : (int)GetLastError(); + rc = CloseHandle(condpair->event[1]) ? rc : (int)GetLastError(); +#else + int err, rc = pthread_mutex_destroy(&condpair->mutex); + rc = (err = pthread_cond_destroy(&condpair->cond[0])) ? err : rc; + rc = (err = pthread_cond_destroy(&condpair->cond[1])) ? 
err : rc; +#endif + memset(condpair, 0, sizeof(osal_condpair_t)); + return rc; +} - line = chk_print( - line, "%s: %" PRIuSIZE " page(s), %.1f%% of backed, %.1f%% of boundary", - "allocated", usr->result.alloc_pages, - usr->result.alloc_pages * percent_backed_reciprocal, - usr->result.alloc_pages * percent_boundary_reciprocal); - line = chk_line_feed(line); +MDBX_INTERNAL int osal_condpair_lock(osal_condpair_t *condpair) { +#if defined(_WIN32) || defined(_WIN64) + DWORD code = WaitForSingleObject(condpair->mutex, INFINITE); + return waitstatus2errcode(code); +#else + return osal_pthread_mutex_lock(&condpair->mutex); +#endif +} - line = chk_print(line, - "%s: %" PRIuSIZE " page(s) (%.1f%%) of backed, %" PRIuSIZE - " to boundary (%.1f%% of boundary)", - "available", available2backed, - available2backed * percent_backed_reciprocal, - available2boundary, - available2boundary * percent_boundary_reciprocal); - chk_line_end(line); +MDBX_INTERNAL int osal_condpair_unlock(osal_condpair_t *condpair) { +#if defined(_WIN32) || defined(_WIN64) + return ReleaseMutex(condpair->mutex) ? MDBX_SUCCESS : (int)GetLastError(); +#else + return pthread_mutex_unlock(&condpair->mutex); +#endif +} - line = chk_line_begin(usr->scope, MDBX_chk_resolution); - line = chk_print(line, "%s %" PRIaPGNO " pages", - (txn->mt_geo.upper == txn->mt_geo.now) ? 
"total" : "upto", - txn->mt_geo.upper); - line = chk_print(line, ", backed %" PRIuSIZE " (%.1f%%)", - usr->result.backed_pages, - usr->result.backed_pages * percent_boundary_reciprocal); - line = chk_print(line, ", allocated %" PRIuSIZE " (%.1f%%)", - usr->result.alloc_pages, - usr->result.alloc_pages * percent_boundary_reciprocal); - line = - chk_print(line, ", available %" PRIuSIZE " (%.1f%%)", available2boundary, - available2boundary * percent_boundary_reciprocal); - chk_line_end(line); - chk_scope_restore(scope, err); +MDBX_INTERNAL int osal_condpair_signal(osal_condpair_t *condpair, bool part) { +#if defined(_WIN32) || defined(_WIN64) + return SetEvent(condpair->event[part]) ? MDBX_SUCCESS : (int)GetLastError(); +#else + return pthread_cond_signal(&condpair->cond[part]); +#endif +} - //-------------------------------------------------------------------------- +MDBX_INTERNAL int osal_condpair_wait(osal_condpair_t *condpair, bool part) { +#if defined(_WIN32) || defined(_WIN64) + DWORD code = SignalObjectAndWait(condpair->mutex, condpair->event[part], + INFINITE, FALSE); + if (code == WAIT_OBJECT_0) { + code = WaitForSingleObject(condpair->mutex, INFINITE); + if (code == WAIT_OBJECT_0) + return MDBX_SUCCESS; + } + return waitstatus2errcode(code); +#else + return pthread_cond_wait(&condpair->cond[part], &condpair->mutex); +#endif +} - if (chk->flags & MDBX_CHK_SKIP_KV_TRAVERSAL) - chk_line_end(chk_print(chk_line_begin(scope, MDBX_chk_processing), - "Skipping %s traversal...", "key-value")); - else if ((usr->result.problems_kv = usr->result.kv_tree_problems) > 0) - chk_line_end(chk_print( - chk_line_begin(scope, MDBX_chk_processing), - "Skip processing %s since %s is corrupted (%" PRIuSIZE " problem(s))", - chk_v2a(chk, MDBX_CHK_MAIN), "key-value", - usr->result.problems_kv = usr->result.kv_tree_problems)); - else { - err = - chk_scope_begin(chk, 0, MDBX_chk_traversal_maindb, &chk->subdb_main, - &usr->result.problems_kv, "Processing %s...", "MainDB"); - if 
(likely(!err)) - err = chk_db(usr->scope, MAIN_DBI, &chk->subdb_main, chk_handle_kv); - chk_scope_restore(scope, err); +/*----------------------------------------------------------------------------*/ - if (usr->result.problems_kv && usr->result.subdb_total) - chk_line_end(chk_print(chk_line_begin(scope, MDBX_chk_processing), - "Skip processing %s", "sub-database(s)")); - else if (usr->result.problems_kv == 0 && usr->result.subdb_total == 0) - chk_line_end(chk_print(chk_line_begin(scope, MDBX_chk_info), "No %s", - "sub-database(s)")); - else if (usr->result.problems_kv == 0 && usr->result.subdb_total) { - err = chk_scope_begin(chk, 1, MDBX_chk_traversal_subdbs, nullptr, - &usr->result.problems_kv, - "Traversal %s by txn#%" PRIaTXN "...", - "sub-database(s)", txn->mt_txnid); - if (!err) - err = chk_db(usr->scope, MAIN_DBI, &chk->subdb_main, nullptr); - if (usr->scope->subtotal_issues) - chk_line_end( - chk_print(chk_line_begin(usr->scope, MDBX_chk_resolution), - "processed %" PRIuSIZE " of %" PRIuSIZE " subDb(s)" - ", %" PRIuSIZE " problems(s)", - usr->result.subdb_processed, usr->result.subdb_total, - usr->scope->subtotal_issues)); - } - chk_scope_restore(scope, err); +MDBX_INTERNAL int osal_fastmutex_init(osal_fastmutex_t *fastmutex) { +#if defined(_WIN32) || defined(_WIN64) + InitializeCriticalSection(fastmutex); + return MDBX_SUCCESS; +#elif MDBX_DEBUG + pthread_mutexattr_t ma; + int rc = pthread_mutexattr_init(&ma); + if (likely(!rc)) { + rc = pthread_mutexattr_settype(&ma, PTHREAD_MUTEX_ERRORCHECK); + if (likely(!rc) || rc == ENOTSUP) + rc = pthread_mutex_init(fastmutex, &ma); + pthread_mutexattr_destroy(&ma); } + return rc; +#else + return pthread_mutex_init(fastmutex, nullptr); +#endif +} - return chk_scope_end(chk, chk_scope_begin(chk, 0, MDBX_chk_conclude, nullptr, - nullptr, nullptr)); +MDBX_INTERNAL int osal_fastmutex_destroy(osal_fastmutex_t *fastmutex) { +#if defined(_WIN32) || defined(_WIN64) + DeleteCriticalSection(fastmutex); + return MDBX_SUCCESS; 
+#else + return pthread_mutex_destroy(fastmutex); +#endif } -__cold int mdbx_env_chk_encount_problem(MDBX_chk_context_t *ctx) { - if (likely(ctx && ctx->internal && ctx->internal->usr == ctx && - ctx->internal->problem_counter && ctx->scope)) { - *ctx->internal->problem_counter += 1; - ctx->scope->subtotal_issues += 1; - return MDBX_SUCCESS; +MDBX_INTERNAL int osal_fastmutex_acquire(osal_fastmutex_t *fastmutex) { +#if defined(_WIN32) || defined(_WIN64) + __try { + EnterCriticalSection(fastmutex); + } __except ( + (GetExceptionCode() == + 0xC0000194 /* STATUS_POSSIBLE_DEADLOCK / EXCEPTION_POSSIBLE_DEADLOCK */) + ? EXCEPTION_EXECUTE_HANDLER + : EXCEPTION_CONTINUE_SEARCH) { + return MDBX_EDEADLK; } - return MDBX_EINVAL; + return MDBX_SUCCESS; +#else + return osal_pthread_mutex_lock(fastmutex); +#endif } -__cold int mdbx_env_chk(MDBX_env *env, const struct MDBX_chk_callbacks *cb, - MDBX_chk_context_t *ctx, - const enum MDBX_chk_flags_t flags, - enum MDBX_chk_severity verbosity, - unsigned timeout_seconds_16dot16) { - int err, rc = check_env(env, false); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; - if (unlikely(!cb || !ctx || ctx->internal)) - return MDBX_EINVAL; - - MDBX_chk_internal_t *const chk = osal_calloc(1, sizeof(MDBX_chk_internal_t)); - if (unlikely(!chk)) - return MDBX_ENOMEM; - - chk->cb = cb; - chk->usr = ctx; - chk->usr->internal = chk; - chk->usr->env = env; - chk->flags = flags; +MDBX_INTERNAL int osal_fastmutex_release(osal_fastmutex_t *fastmutex) { +#if defined(_WIN32) || defined(_WIN64) + LeaveCriticalSection(fastmutex); + return MDBX_SUCCESS; +#else + return pthread_mutex_unlock(fastmutex); +#endif +} - chk->subdb_gc.id = -1; - chk->subdb_gc.name.iov_base = MDBX_CHK_GC; - chk->subdb[FREE_DBI] = &chk->subdb_gc; +/*----------------------------------------------------------------------------*/ - chk->subdb_main.id = -1; - chk->subdb_main.name.iov_base = MDBX_CHK_MAIN; - chk->subdb[MAIN_DBI] = &chk->subdb_main; +#if defined(_WIN32) || 
defined(_WIN64) - chk->monotime_timeout = - timeout_seconds_16dot16 - ? osal_16dot16_to_monotime(timeout_seconds_16dot16) + osal_monotime() - : 0; - chk->usr->scope_nesting = 0; - chk->usr->result.subdbs = (const void *)&chk->subdb; +MDBX_INTERNAL int osal_mb2w(const char *const src, wchar_t **const pdst) { + const size_t dst_wlen = MultiByteToWideChar( + CP_THREAD_ACP, MB_ERR_INVALID_CHARS, src, -1, nullptr, 0); + wchar_t *dst = *pdst; + int rc = ERROR_INVALID_NAME; + if (unlikely(dst_wlen < 2 || dst_wlen > /* MAX_PATH */ INT16_MAX)) + goto bailout; - MDBX_chk_scope_t *const top = chk->scope_stack; - top->verbosity = verbosity; - top->internal = chk; + dst = osal_realloc(dst, dst_wlen * sizeof(wchar_t)); + rc = MDBX_ENOMEM; + if (unlikely(!dst)) + goto bailout; - // init - rc = chk_scope_end( - chk, chk_scope_begin(chk, 0, MDBX_chk_init, nullptr, nullptr, nullptr)); + *pdst = dst; + if (likely(dst_wlen == (size_t)MultiByteToWideChar(CP_THREAD_ACP, + MB_ERR_INVALID_CHARS, src, + -1, dst, (int)dst_wlen))) + return MDBX_SUCCESS; - // lock - if (likely(!rc)) - rc = chk_scope_begin( - chk, 0, MDBX_chk_lock, nullptr, nullptr, "Taking %slock...", - (env->me_flags & (MDBX_RDONLY | MDBX_EXCLUSIVE)) ? 
"" : "read "); - if (likely(!rc) && (env->me_flags & (MDBX_RDONLY | MDBX_EXCLUSIVE)) == 0 && - (flags & MDBX_CHK_READWRITE)) { - rc = mdbx_txn_lock(env, false); - if (unlikely(rc)) - chk_error_rc(ctx->scope, rc, "mdbx_txn_lock"); - else - chk->write_locked = true; - } - if (likely(!rc)) { - rc = mdbx_txn_begin(env, nullptr, MDBX_TXN_RDONLY, &ctx->txn); - if (unlikely(rc)) - chk_error_rc(ctx->scope, rc, "mdbx_txn_begin"); + rc = ERROR_INVALID_NAME; +bailout: + if (*pdst) { + osal_free(*pdst); + *pdst = nullptr; } - chk_scope_end(chk, rc); + return rc; +} - // doit - if (likely(!rc)) { - chk->subdb_gc.flags = ctx->txn->mt_dbs[FREE_DBI].md_flags; - chk->subdb_main.flags = ctx->txn->mt_dbs[MAIN_DBI].md_flags; - rc = env_chk(top); - } +#endif /* Windows */ - // unlock - if (ctx->txn || chk->write_locked) { - chk_scope_begin(chk, 0, MDBX_chk_unlock, nullptr, nullptr, nullptr); - if (ctx->txn) { - err = mdbx_txn_abort(ctx->txn); - if (err && !rc) - rc = err; - ctx->txn = nullptr; - } - if (chk->write_locked) - mdbx_txn_unlock(env); - rc = chk_scope_end(chk, rc); - } +/*----------------------------------------------------------------------------*/ - // finalize - err = chk_scope_begin(chk, 0, MDBX_chk_finalize, nullptr, nullptr, nullptr); - rc = chk_scope_end(chk, err ? 
err : rc); - chk_dispose(chk); - return rc; +#if defined(_WIN32) || defined(_WIN64) +#define ior_alignment_mask (ior->pagesize - 1) +#define ior_WriteFile_flag 1 +#define OSAL_IOV_MAX (4096 / sizeof(ior_sgv_element)) + +static void ior_put_event(osal_ioring_t *ior, HANDLE event) { + assert(event && event != INVALID_HANDLE_VALUE && event != ior); + assert(ior->event_stack < ior->allocated); + ior->event_pool[ior->event_stack] = event; + ior->event_stack += 1; } -/******************************************************************************/ +static HANDLE ior_get_event(osal_ioring_t *ior) { + assert(ior->event_stack <= ior->allocated); + if (ior->event_stack > 0) { + ior->event_stack -= 1; + assert(ior->event_pool[ior->event_stack] != 0); + return ior->event_pool[ior->event_stack]; + } + return CreateEventW(nullptr, true, false, nullptr); +} -__dll_export -#ifdef __attribute_used__ - __attribute_used__ -#elif defined(__GNUC__) || __has_attribute(__used__) - __attribute__((__used__)) -#endif -#ifdef __attribute_externally_visible__ - __attribute_externally_visible__ -#elif (defined(__GNUC__) && !defined(__clang__)) || \ - __has_attribute(__externally_visible__) - __attribute__((__externally_visible__)) +static void WINAPI ior_wocr(DWORD err, DWORD bytes, OVERLAPPED *ov) { + osal_ioring_t *ior = ov->hEvent; + ov->Internal = err; + ov->InternalHigh = bytes; + if (++ior->async_completed >= ior->async_waiting) + SetEvent(ior->async_done); +} + +#elif MDBX_HAVE_PWRITEV +#if defined(_SC_IOV_MAX) +static size_t osal_iov_max; +#define OSAL_IOV_MAX osal_iov_max +#else +#define OSAL_IOV_MAX IOV_MAX #endif - const struct MDBX_build_info mdbx_build = { -#ifdef MDBX_BUILD_TIMESTAMP - MDBX_BUILD_TIMESTAMP #else - "\"" __DATE__ " " __TIME__ "\"" -#endif /* MDBX_BUILD_TIMESTAMP */ +#undef OSAL_IOV_MAX +#endif /* OSAL_IOV_MAX */ - , -#ifdef MDBX_BUILD_TARGET - MDBX_BUILD_TARGET -#else - #if defined(__ANDROID_API__) - "Android" MDBX_STRINGIFY(__ANDROID_API__) - #elif 
defined(__linux__) || defined(__gnu_linux__) - "Linux" - #elif defined(EMSCRIPTEN) || defined(__EMSCRIPTEN__) - "webassembly" - #elif defined(__CYGWIN__) - "CYGWIN" - #elif defined(_WIN64) || defined(_WIN32) || defined(__TOS_WIN__) \ - || defined(__WINDOWS__) - "Windows" - #elif defined(__APPLE__) - #if (defined(TARGET_OS_IPHONE) && TARGET_OS_IPHONE) \ - || (defined(TARGET_IPHONE_SIMULATOR) && TARGET_IPHONE_SIMULATOR) - "iOS" - #else - "MacOS" - #endif - #elif defined(__FreeBSD__) - "FreeBSD" - #elif defined(__DragonFly__) - "DragonFlyBSD" - #elif defined(__NetBSD__) - "NetBSD" - #elif defined(__OpenBSD__) - "OpenBSD" - #elif defined(__bsdi__) - "UnixBSDI" - #elif defined(__MACH__) - "MACH" - #elif (defined(_HPUX_SOURCE) || defined(__hpux) || defined(__HP_aCC)) - "HPUX" - #elif defined(_AIX) - "AIX" - #elif defined(__sun) && defined(__SVR4) - "Solaris" - #elif defined(__BSD__) || defined(BSD) - "UnixBSD" - #elif defined(__unix__) || defined(UNIX) || defined(__unix) \ - || defined(__UNIX) || defined(__UNIX__) - "UNIX" - #elif defined(_POSIX_VERSION) - "POSIX" MDBX_STRINGIFY(_POSIX_VERSION) - #else - "UnknownOS" - #endif /* Target OS */ +MDBX_INTERNAL int osal_ioring_create(osal_ioring_t *ior +#if defined(_WIN32) || defined(_WIN64) + , + bool enable_direct, + mdbx_filehandle_t overlapped_fd +#endif /* Windows */ +) { + memset(ior, 0, sizeof(osal_ioring_t)); - "-" +#if defined(_WIN32) || defined(_WIN64) + ior->overlapped_fd = overlapped_fd; + ior->direct = enable_direct && overlapped_fd; + ior->pagesize = globals.sys_pagesize; + ior->pagesize_ln2 = globals.sys_pagesize_ln2; + ior->async_done = ior_get_event(ior); + if (!ior->async_done) + return GetLastError(); +#endif /* !Windows */ - #if defined(__amd64__) - "AMD64" - #elif defined(__ia32__) - "IA32" - #elif defined(__e2k__) || defined(__elbrus__) - "Elbrus" - #elif defined(__alpha__) || defined(__alpha) || defined(_M_ALPHA) - "Alpha" - #elif defined(__aarch64__) || defined(_M_ARM64) - "ARM64" - #elif 
defined(__arm__) || defined(__thumb__) || defined(__TARGET_ARCH_ARM) \ - || defined(__TARGET_ARCH_THUMB) || defined(_ARM) || defined(_M_ARM) \ - || defined(_M_ARMT) || defined(__arm) - "ARM" - #elif defined(__mips64) || defined(__mips64__) || (defined(__mips) && (__mips >= 64)) - "MIPS64" - #elif defined(__mips__) || defined(__mips) || defined(_R4000) || defined(__MIPS__) - "MIPS" - #elif defined(__hppa64__) || defined(__HPPA64__) || defined(__hppa64) - "PARISC64" - #elif defined(__hppa__) || defined(__HPPA__) || defined(__hppa) - "PARISC" - #elif defined(__ia64__) || defined(__ia64) || defined(_IA64) \ - || defined(__IA64__) || defined(_M_IA64) || defined(__itanium__) - "Itanium" - #elif defined(__powerpc64__) || defined(__ppc64__) || defined(__ppc64) \ - || defined(__powerpc64) || defined(_ARCH_PPC64) - "PowerPC64" - #elif defined(__powerpc__) || defined(__ppc__) || defined(__powerpc) \ - || defined(__ppc) || defined(_ARCH_PPC) || defined(__PPC__) || defined(__POWERPC__) - "PowerPC" - #elif defined(__sparc64__) || defined(__sparc64) - "SPARC64" - #elif defined(__sparc__) || defined(__sparc) - "SPARC" - #elif defined(__s390__) || defined(__s390) || defined(__zarch__) || defined(__zarch) - "S390" - #else - "UnknownARCH" - #endif -#endif /* MDBX_BUILD_TARGET */ +#if MDBX_HAVE_PWRITEV && defined(_SC_IOV_MAX) + assert(osal_iov_max > 0); +#endif /* MDBX_HAVE_PWRITEV && _SC_IOV_MAX */ -#ifdef MDBX_BUILD_TYPE -# if defined(_MSC_VER) -# pragma message("Configuration-depended MDBX_BUILD_TYPE: " MDBX_BUILD_TYPE) -# endif - "-" MDBX_BUILD_TYPE -#endif /* MDBX_BUILD_TYPE */ - , - "MDBX_DEBUG=" MDBX_STRINGIFY(MDBX_DEBUG) -#ifdef ENABLE_GPROF - " ENABLE_GPROF" -#endif /* ENABLE_GPROF */ - " MDBX_WORDBITS=" MDBX_STRINGIFY(MDBX_WORDBITS) - " BYTE_ORDER=" -#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ - "LITTLE_ENDIAN" -#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ - "BIG_ENDIAN" + ior->boundary = ptr_disp(ior->pool, ior->allocated); + return MDBX_SUCCESS; +} + +static inline 
size_t ior_offset(const ior_item_t *item) { +#if defined(_WIN32) || defined(_WIN64) + return item->ov.Offset | (size_t)((sizeof(size_t) > sizeof(item->ov.Offset)) + ? (uint64_t)item->ov.OffsetHigh << 32 + : 0); #else - #error "FIXME: Unsupported byte order" -#endif /* __BYTE_ORDER__ */ - " MDBX_ENABLE_BIGFOOT=" MDBX_STRINGIFY(MDBX_ENABLE_BIGFOOT) - " MDBX_ENV_CHECKPID=" MDBX_ENV_CHECKPID_CONFIG - " MDBX_TXN_CHECKOWNER=" MDBX_TXN_CHECKOWNER_CONFIG - " MDBX_64BIT_ATOMIC=" MDBX_64BIT_ATOMIC_CONFIG - " MDBX_64BIT_CAS=" MDBX_64BIT_CAS_CONFIG - " MDBX_TRUST_RTC=" MDBX_TRUST_RTC_CONFIG - " MDBX_AVOID_MSYNC=" MDBX_STRINGIFY(MDBX_AVOID_MSYNC) - " MDBX_ENABLE_REFUND=" MDBX_STRINGIFY(MDBX_ENABLE_REFUND) - " MDBX_ENABLE_MADVISE=" MDBX_STRINGIFY(MDBX_ENABLE_MADVISE) - " MDBX_ENABLE_MINCORE=" MDBX_STRINGIFY(MDBX_ENABLE_MINCORE) - " MDBX_ENABLE_PGOP_STAT=" MDBX_STRINGIFY(MDBX_ENABLE_PGOP_STAT) - " MDBX_ENABLE_PROFGC=" MDBX_STRINGIFY(MDBX_ENABLE_PROFGC) -#if MDBX_DISABLE_VALIDATION - " MDBX_DISABLE_VALIDATION=YES" -#endif /* MDBX_DISABLE_VALIDATION */ -#ifdef __SANITIZE_ADDRESS__ - " SANITIZE_ADDRESS=YES" -#endif /* __SANITIZE_ADDRESS__ */ -#ifdef ENABLE_MEMCHECK - " ENABLE_MEMCHECK=YES" -#endif /* ENABLE_MEMCHECK */ -#if MDBX_FORCE_ASSERTIONS - " MDBX_FORCE_ASSERTIONS=YES" -#endif /* MDBX_FORCE_ASSERTIONS */ -#ifdef _GNU_SOURCE - " _GNU_SOURCE=YES" + return item->offset; +#endif /* !Windows */ +} + +static inline ior_item_t *ior_next(ior_item_t *item, size_t sgvcnt) { +#if defined(ior_sgv_element) + assert(sgvcnt > 0); + return (ior_item_t *)ptr_disp(item, sizeof(ior_item_t) - + sizeof(ior_sgv_element) + + sizeof(ior_sgv_element) * sgvcnt); #else - " _GNU_SOURCE=NO" -#endif /* _GNU_SOURCE */ -#ifdef __APPLE__ - " MDBX_OSX_SPEED_INSTEADOF_DURABILITY=" MDBX_STRINGIFY(MDBX_OSX_SPEED_INSTEADOF_DURABILITY) -#endif /* MacOS */ -#if defined(_WIN32) || defined(_WIN64) - " MDBX_WITHOUT_MSVC_CRT=" MDBX_STRINGIFY(MDBX_WITHOUT_MSVC_CRT) - " MDBX_BUILD_SHARED_LIBRARY=" 
MDBX_STRINGIFY(MDBX_BUILD_SHARED_LIBRARY) -#if !MDBX_BUILD_SHARED_LIBRARY - " MDBX_MANUAL_MODULE_HANDLER=" MDBX_STRINGIFY(MDBX_MANUAL_MODULE_HANDLER) + assert(sgvcnt == 1); + (void)sgvcnt; + return item + 1; #endif - " WINVER=" MDBX_STRINGIFY(WINVER) -#else /* Windows */ - " MDBX_LOCKING=" MDBX_LOCKING_CONFIG - " MDBX_USE_OFDLOCKS=" MDBX_USE_OFDLOCKS_CONFIG -#endif /* !Windows */ - " MDBX_CACHELINE_SIZE=" MDBX_STRINGIFY(MDBX_CACHELINE_SIZE) - " MDBX_CPU_WRITEBACK_INCOHERENT=" MDBX_STRINGIFY(MDBX_CPU_WRITEBACK_INCOHERENT) - " MDBX_MMAP_INCOHERENT_CPU_CACHE=" MDBX_STRINGIFY(MDBX_MMAP_INCOHERENT_CPU_CACHE) - " MDBX_MMAP_INCOHERENT_FILE_WRITE=" MDBX_STRINGIFY(MDBX_MMAP_INCOHERENT_FILE_WRITE) - " MDBX_UNALIGNED_OK=" MDBX_STRINGIFY(MDBX_UNALIGNED_OK) - " MDBX_PNL_ASCENDING=" MDBX_STRINGIFY(MDBX_PNL_ASCENDING) - , -#ifdef MDBX_BUILD_COMPILER - MDBX_BUILD_COMPILER +} + +MDBX_INTERNAL int osal_ioring_add(osal_ioring_t *ior, const size_t offset, + void *data, const size_t bytes) { + assert(bytes && data); + assert(bytes % MDBX_MIN_PAGESIZE == 0 && bytes <= MAX_WRITE); + assert(offset % MDBX_MIN_PAGESIZE == 0 && + offset + (uint64_t)bytes <= MAX_MAPSIZE); + +#if defined(_WIN32) || defined(_WIN64) + const unsigned segments = (unsigned)(bytes >> ior->pagesize_ln2); + const bool use_gather = + ior->direct && ior->overlapped_fd && ior->slots_left >= segments; +#endif /* Windows */ + + ior_item_t *item = ior->pool; + if (likely(ior->last)) { + item = ior->last; + if (unlikely(ior_offset(item) + ior_last_bytes(ior, item) == offset) && + likely(ior_last_bytes(ior, item) + bytes <= MAX_WRITE)) { +#if defined(_WIN32) || defined(_WIN64) + if (use_gather && + ((bytes | (uintptr_t)data | ior->last_bytes | + (uintptr_t)(uint64_t)item->sgv[0].Buffer) & + ior_alignment_mask) == 0 && + ior->last_sgvcnt + (size_t)segments < OSAL_IOV_MAX) { + assert(ior->overlapped_fd); + assert((item->single.iov_len & ior_WriteFile_flag) == 0); + assert(item->sgv[ior->last_sgvcnt].Buffer == 0); + 
ior->last_bytes += bytes; + size_t i = 0; + do { + item->sgv[ior->last_sgvcnt + i].Buffer = PtrToPtr64(data); + data = ptr_disp(data, ior->pagesize); + } while (++i < segments); + ior->slots_left -= segments; + item->sgv[ior->last_sgvcnt += segments].Buffer = 0; + assert((item->single.iov_len & ior_WriteFile_flag) == 0); + return MDBX_SUCCESS; + } + const void *end = ptr_disp(item->single.iov_base, + item->single.iov_len - ior_WriteFile_flag); + if (unlikely(end == data)) { + assert((item->single.iov_len & ior_WriteFile_flag) != 0); + item->single.iov_len += bytes; + return MDBX_SUCCESS; + } +#elif MDBX_HAVE_PWRITEV + assert((int)item->sgvcnt > 0); + const void *end = ptr_disp(item->sgv[item->sgvcnt - 1].iov_base, + item->sgv[item->sgvcnt - 1].iov_len); + if (unlikely(end == data)) { + item->sgv[item->sgvcnt - 1].iov_len += bytes; + ior->last_bytes += bytes; + return MDBX_SUCCESS; + } + if (likely(item->sgvcnt < OSAL_IOV_MAX)) { + if (unlikely(ior->slots_left < 1)) + return MDBX_RESULT_TRUE; + item->sgv[item->sgvcnt].iov_base = data; + item->sgv[item->sgvcnt].iov_len = bytes; + ior->last_bytes += bytes; + item->sgvcnt += 1; + ior->slots_left -= 1; + return MDBX_SUCCESS; + } #else - #ifdef __INTEL_COMPILER - "Intel C/C++ " MDBX_STRINGIFY(__INTEL_COMPILER) - #elif defined(__apple_build_version__) - "Apple clang " MDBX_STRINGIFY(__apple_build_version__) - #elif defined(__ibmxl__) - "IBM clang C " MDBX_STRINGIFY(__ibmxl_version__) "." MDBX_STRINGIFY(__ibmxl_release__) - "." MDBX_STRINGIFY(__ibmxl_modification__) "." MDBX_STRINGIFY(__ibmxl_ptf_fix_level__) - #elif defined(__clang__) - "clang " MDBX_STRINGIFY(__clang_version__) - #elif defined(__MINGW64__) - "MINGW-64 " MDBX_STRINGIFY(__MINGW64_MAJOR_VERSION) "." MDBX_STRINGIFY(__MINGW64_MINOR_VERSION) - #elif defined(__MINGW32__) - "MINGW-32 " MDBX_STRINGIFY(__MINGW32_MAJOR_VERSION) "." MDBX_STRINGIFY(__MINGW32_MINOR_VERSION) - #elif defined(__MINGW__) - "MINGW " MDBX_STRINGIFY(__MINGW_MAJOR_VERSION) "." 
MDBX_STRINGIFY(__MINGW_MINOR_VERSION) - #elif defined(__IBMC__) - "IBM C " MDBX_STRINGIFY(__IBMC__) - #elif defined(__GNUC__) - "GNU C/C++ " - #ifdef __VERSION__ - __VERSION__ - #else - MDBX_STRINGIFY(__GNUC__) "." MDBX_STRINGIFY(__GNUC_MINOR__) "." MDBX_STRINGIFY(__GNUC_PATCHLEVEL__) - #endif - #elif defined(_MSC_VER) - "MSVC " MDBX_STRINGIFY(_MSC_FULL_VER) "-" MDBX_STRINGIFY(_MSC_BUILD) - #else - "Unknown compiler" - #endif -#endif /* MDBX_BUILD_COMPILER */ - , -#ifdef MDBX_BUILD_FLAGS_CONFIG - MDBX_BUILD_FLAGS_CONFIG -#endif /* MDBX_BUILD_FLAGS_CONFIG */ -#ifdef MDBX_BUILD_FLAGS - MDBX_BUILD_FLAGS -#endif /* MDBX_BUILD_FLAGS */ -#if !(defined(MDBX_BUILD_FLAGS_CONFIG) || defined(MDBX_BUILD_FLAGS)) - "undefined (please use correct build script)" -#ifdef _MSC_VER -#pragma message("warning: Build flags undefined. Please use correct build script") + const void *end = ptr_disp(item->single.iov_base, item->single.iov_len); + if (unlikely(end == data)) { + item->single.iov_len += bytes; + return MDBX_SUCCESS; + } +#endif + } + item = ior_next(item, ior_last_sgvcnt(ior, item)); + } + + if (unlikely(ior->slots_left < 1)) + return MDBX_RESULT_TRUE; + + unsigned slots_used = 1; +#if defined(_WIN32) || defined(_WIN64) + item->ov.Internal = item->ov.InternalHigh = 0; + item->ov.Offset = (DWORD)offset; + item->ov.OffsetHigh = HIGH_DWORD(offset); + item->ov.hEvent = 0; + if (!use_gather || ((bytes | (uintptr_t)(data)) & ior_alignment_mask) != 0 || + segments > OSAL_IOV_MAX) { + /* WriteFile() */ + item->single.iov_base = data; + item->single.iov_len = bytes + ior_WriteFile_flag; + assert((item->single.iov_len & ior_WriteFile_flag) != 0); + } else { + /* WriteFileGather() */ + assert(ior->overlapped_fd); + item->sgv[0].Buffer = PtrToPtr64(data); + for (size_t i = 1; i < segments; ++i) { + data = ptr_disp(data, ior->pagesize); + item->sgv[slots_used].Buffer = PtrToPtr64(data); + } + item->sgv[slots_used].Buffer = 0; + assert((item->single.iov_len & ior_WriteFile_flag) == 0); + 
slots_used = segments; + } + ior->last_bytes = bytes; + ior_last_sgvcnt(ior, item) = slots_used; +#elif MDBX_HAVE_PWRITEV + item->offset = offset; + item->sgv[0].iov_base = data; + item->sgv[0].iov_len = bytes; + ior->last_bytes = bytes; + ior_last_sgvcnt(ior, item) = slots_used; #else -#warning "Build flags undefined. Please use correct build script" -#endif // _MSC_VER + item->offset = offset; + item->single.iov_base = data; + item->single.iov_len = bytes; +#endif /* !Windows */ + ior->slots_left -= slots_used; + ior->last = item; + return MDBX_SUCCESS; +} + +MDBX_INTERNAL void osal_ioring_walk(osal_ioring_t *ior, iov_ctx_t *ctx, + void (*callback)(iov_ctx_t *ctx, + size_t offset, void *data, + size_t bytes)) { + for (ior_item_t *item = ior->pool; item <= ior->last;) { +#if defined(_WIN32) || defined(_WIN64) + size_t offset = ior_offset(item); + char *data = item->single.iov_base; + size_t bytes = item->single.iov_len - ior_WriteFile_flag; + size_t i = 1; + if (bytes & ior_WriteFile_flag) { + data = Ptr64ToPtr(item->sgv[0].Buffer); + bytes = ior->pagesize; + /* Zap: Reading invalid data from 'item->sgv' */ + MDBX_SUPPRESS_GOOFY_MSVC_ANALYZER(6385); + while (item->sgv[i].Buffer) { + if (data + ior->pagesize != item->sgv[i].Buffer) { + callback(ctx, offset, data, bytes); + offset += bytes; + data = Ptr64ToPtr(item->sgv[i].Buffer); + bytes = 0; + } + bytes += ior->pagesize; + ++i; + } + } + assert(bytes < MAX_WRITE); + callback(ctx, offset, data, bytes); +#elif MDBX_HAVE_PWRITEV + assert(item->sgvcnt > 0); + size_t offset = item->offset; + size_t i = 0; + do { + callback(ctx, offset, item->sgv[i].iov_base, item->sgv[i].iov_len); + offset += item->sgv[i].iov_len; + } while (++i != item->sgvcnt); +#else + const size_t i = 1; + callback(ctx, item->offset, item->single.iov_base, item->single.iov_len); #endif -}; + item = ior_next(item, i); + } +} + +MDBX_INTERNAL osal_ioring_write_result_t +osal_ioring_write(osal_ioring_t *ior, mdbx_filehandle_t fd) { + 
osal_ioring_write_result_t r = {MDBX_SUCCESS, 0}; + +#if defined(_WIN32) || defined(_WIN64) + HANDLE *const end_wait_for = + ior->event_pool + ior->allocated + + /* был выделен один дополнительный элемент для async_done */ 1; + HANDLE *wait_for = end_wait_for; + LONG async_started = 0; + for (ior_item_t *item = ior->pool; item <= ior->last;) { + item->ov.Internal = STATUS_PENDING; + size_t i = 1, bytes = item->single.iov_len - ior_WriteFile_flag; + r.wops += 1; + if (bytes & ior_WriteFile_flag) { + assert(ior->overlapped_fd && fd == ior->overlapped_fd); + bytes = ior->pagesize; + /* Zap: Reading invalid data from 'item->sgv' */ + MDBX_SUPPRESS_GOOFY_MSVC_ANALYZER(6385); + while (item->sgv[i].Buffer) { + bytes += ior->pagesize; + ++i; + } + assert(bytes < MAX_WRITE); + item->ov.hEvent = ior_get_event(ior); + if (unlikely(!item->ov.hEvent)) { + bailout_geterr: + r.err = GetLastError(); + bailout_rc: + assert(r.err != MDBX_SUCCESS); + CancelIo(fd); + return r; + } + if (WriteFileGather(fd, item->sgv, (DWORD)bytes, nullptr, &item->ov)) { + assert(item->ov.Internal == 0 && + WaitForSingleObject(item->ov.hEvent, 0) == WAIT_OBJECT_0); + ior_put_event(ior, item->ov.hEvent); + item->ov.hEvent = 0; + } else { + r.err = (int)GetLastError(); + if (unlikely(r.err != ERROR_IO_PENDING)) { + ERROR("%s: fd %p, item %p (%zu), pgno %u, bytes %zu, offset %" PRId64 + ", err %d", + "WriteFileGather", fd, __Wpedantic_format_voidptr(item), + item - ior->pool, ((page_t *)item->single.iov_base)->pgno, + bytes, item->ov.Offset + ((uint64_t)item->ov.OffsetHigh << 32), + r.err); + goto bailout_rc; + } + assert(wait_for > ior->event_pool + ior->event_stack); + *--wait_for = item->ov.hEvent; + } + } else if (fd == ior->overlapped_fd) { + assert(bytes < MAX_WRITE); + retry: + item->ov.hEvent = ior; + if (WriteFileEx(fd, item->single.iov_base, (DWORD)bytes, &item->ov, + ior_wocr)) { + async_started += 1; + } else { + r.err = (int)GetLastError(); + switch (r.err) { + default: + ERROR("%s: fd %p, 
item %p (%zu), pgno %u, bytes %zu, offset %" PRId64 + ", err %d", + "WriteFileEx", fd, __Wpedantic_format_voidptr(item), + item - ior->pool, ((page_t *)item->single.iov_base)->pgno, + bytes, item->ov.Offset + ((uint64_t)item->ov.OffsetHigh << 32), + r.err); + goto bailout_rc; + case ERROR_NOT_FOUND: + case ERROR_USER_MAPPED_FILE: + case ERROR_LOCK_VIOLATION: + WARNING( + "%s: fd %p, item %p (%zu), pgno %u, bytes %zu, offset %" PRId64 + ", err %d", + "WriteFileEx", fd, __Wpedantic_format_voidptr(item), + item - ior->pool, ((page_t *)item->single.iov_base)->pgno, bytes, + item->ov.Offset + ((uint64_t)item->ov.OffsetHigh << 32), r.err); + SleepEx(0, true); + goto retry; + case ERROR_INVALID_USER_BUFFER: + case ERROR_NOT_ENOUGH_MEMORY: + if (SleepEx(0, true) == WAIT_IO_COMPLETION) + goto retry; + goto bailout_rc; + case ERROR_IO_PENDING: + async_started += 1; + } + } + } else { + assert(bytes < MAX_WRITE); + DWORD written = 0; + if (!WriteFile(fd, item->single.iov_base, (DWORD)bytes, &written, + &item->ov)) { + r.err = (int)GetLastError(); + ERROR("%s: fd %p, item %p (%zu), pgno %u, bytes %zu, offset %" PRId64 + ", err %d", + "WriteFile", fd, __Wpedantic_format_voidptr(item), + item - ior->pool, ((page_t *)item->single.iov_base)->pgno, bytes, + item->ov.Offset + ((uint64_t)item->ov.OffsetHigh << 32), r.err); + goto bailout_rc; + } else if (unlikely(written != bytes)) { + r.err = ERROR_WRITE_FAULT; + goto bailout_rc; + } + } + item = ior_next(item, i); + } + + assert(ior->async_waiting > ior->async_completed && + ior->async_waiting == INT_MAX); + ior->async_waiting = async_started; + if (async_started > ior->async_completed && end_wait_for == wait_for) { + assert(wait_for > ior->event_pool + ior->event_stack); + *--wait_for = ior->async_done; + } + + const size_t pending_count = end_wait_for - wait_for; + if (pending_count) { + /* Ждем до MAXIMUM_WAIT_OBJECTS (64) последних хендлов, а после избирательно + * ждем посредством GetOverlappedResult(), если какие-то более 
ранние + * элементы еще не завершены. В целом, так получается меньше системных + * вызовов, т.е. меньше накладных расходов. Однако, не факт что эта экономия + * не будет перекрыта неэффективностью реализации + * WaitForMultipleObjectsEx(), но тогда это проблемы на стороне M$. */ + DWORD madness; + do + madness = WaitForMultipleObjectsEx((pending_count < MAXIMUM_WAIT_OBJECTS) + ? (DWORD)pending_count + : MAXIMUM_WAIT_OBJECTS, + wait_for, true, + /* сутки */ 86400000ul, true); + while (madness == WAIT_IO_COMPLETION); + STATIC_ASSERT(WAIT_OBJECT_0 == 0); + if (/* madness >= WAIT_OBJECT_0 && */ + madness < WAIT_OBJECT_0 + MAXIMUM_WAIT_OBJECTS) + r.err = MDBX_SUCCESS; + else if (madness >= WAIT_ABANDONED_0 && + madness < WAIT_ABANDONED_0 + MAXIMUM_WAIT_OBJECTS) { + r.err = ERROR_ABANDONED_WAIT_0; + goto bailout_rc; + } else if (madness == WAIT_TIMEOUT) { + r.err = WAIT_TIMEOUT; + goto bailout_rc; + } else { + r.err = /* madness == WAIT_FAILED */ MDBX_PROBLEM; + goto bailout_rc; + } + + assert(ior->async_waiting == ior->async_completed); + for (ior_item_t *item = ior->pool; item <= ior->last;) { + size_t i = 1, bytes = item->single.iov_len - ior_WriteFile_flag; + if (bytes & ior_WriteFile_flag) { + bytes = ior->pagesize; + /* Zap: Reading invalid data from 'item->sgv' */ + MDBX_SUPPRESS_GOOFY_MSVC_ANALYZER(6385); + while (item->sgv[i].Buffer) { + bytes += ior->pagesize; + ++i; + } + if (!HasOverlappedIoCompleted(&item->ov)) { + DWORD written = 0; + if (unlikely(!GetOverlappedResult(fd, &item->ov, &written, true))) { + ERROR("%s: item %p (%zu), pgno %u, bytes %zu, offset %" PRId64 + ", err %d", + "GetOverlappedResult", __Wpedantic_format_voidptr(item), + item - ior->pool, ((page_t *)item->single.iov_base)->pgno, + bytes, + item->ov.Offset + ((uint64_t)item->ov.OffsetHigh << 32), + (int)GetLastError()); + goto bailout_geterr; + } + assert(MDBX_SUCCESS == item->ov.Internal); + assert(written == item->ov.InternalHigh); + } + } else { + 
assert(HasOverlappedIoCompleted(&item->ov)); + } + assert(item->ov.Internal != ERROR_IO_PENDING); + if (unlikely(item->ov.Internal != MDBX_SUCCESS)) { + DWORD written = 0; + r.err = (int)item->ov.Internal; + if ((r.err & 0x80000000) && + GetOverlappedResult(nullptr, &item->ov, &written, true)) + r.err = (int)GetLastError(); + ERROR("%s: item %p (%zu), pgno %u, bytes %zu, offset %" PRId64 + ", err %d", + "Result", __Wpedantic_format_voidptr(item), item - ior->pool, + ((page_t *)item->single.iov_base)->pgno, bytes, + item->ov.Offset + ((uint64_t)item->ov.OffsetHigh << 32), + (int)GetLastError()); + goto bailout_rc; + } + if (unlikely(item->ov.InternalHigh != bytes)) { + r.err = ERROR_WRITE_FAULT; + goto bailout_rc; + } + item = ior_next(item, i); + } + assert(ior->async_waiting == ior->async_completed); + } else { + assert(r.err == MDBX_SUCCESS); + } + assert(ior->async_waiting == ior->async_completed); + +#else + STATIC_ASSERT_MSG(sizeof(off_t) >= sizeof(size_t), + "libmdbx requires 64-bit file I/O on 64-bit systems"); + for (ior_item_t *item = ior->pool; item <= ior->last;) { +#if MDBX_HAVE_PWRITEV + assert(item->sgvcnt > 0); + if (item->sgvcnt == 1) + r.err = osal_pwrite(fd, item->sgv[0].iov_base, item->sgv[0].iov_len, + item->offset); + else + r.err = osal_pwritev(fd, item->sgv, item->sgvcnt, item->offset); + + // TODO: io_uring_prep_write(sqe, fd, ...); + + item = ior_next(item, item->sgvcnt); +#else + r.err = osal_pwrite(fd, item->single.iov_base, item->single.iov_len, + item->offset); + item = ior_next(item, 1); +#endif + r.wops += 1; + if (unlikely(r.err != MDBX_SUCCESS)) + break; + } + + // TODO: io_uring_submit(&ring) + // TODO: err = io_uring_wait_cqe(&ring, &cqe); + // TODO: io_uring_cqe_seen(&ring, cqe); + +#endif /* !Windows */ + return r; +} + +MDBX_INTERNAL void osal_ioring_reset(osal_ioring_t *ior) { +#if defined(_WIN32) || defined(_WIN64) + if (ior->last) { + for (ior_item_t *item = ior->pool; item <= ior->last;) { + if 
(!HasOverlappedIoCompleted(&item->ov)) { + assert(ior->overlapped_fd); + CancelIoEx(ior->overlapped_fd, &item->ov); + } + if (item->ov.hEvent && item->ov.hEvent != ior) + ior_put_event(ior, item->ov.hEvent); + size_t i = 1; + if ((item->single.iov_len & ior_WriteFile_flag) == 0) { + /* Zap: Reading invalid data from 'item->sgv' */ + MDBX_SUPPRESS_GOOFY_MSVC_ANALYZER(6385); + while (item->sgv[i].Buffer) + ++i; + } + item = ior_next(item, i); + } + } + ior->async_waiting = INT_MAX; + ior->async_completed = 0; + ResetEvent(ior->async_done); +#endif /* !Windows */ + ior->slots_left = ior->allocated; + ior->last = nullptr; +} + +static void ior_cleanup(osal_ioring_t *ior, const size_t since) { + osal_ioring_reset(ior); +#if defined(_WIN32) || defined(_WIN64) + for (size_t i = since; i < ior->event_stack; ++i) { + /* Zap: Using uninitialized memory '**ior.event_pool' */ + MDBX_SUPPRESS_GOOFY_MSVC_ANALYZER(6001); + CloseHandle(ior->event_pool[i]); + } + ior->event_stack = 0; +#else + (void)since; +#endif /* Windows */ +} + +MDBX_INTERNAL int osal_ioring_resize(osal_ioring_t *ior, size_t items) { + assert(items > 0 && items < INT_MAX / sizeof(ior_item_t)); +#if defined(_WIN32) || defined(_WIN64) + if (ior->state & IOR_STATE_LOCKED) + return MDBX_SUCCESS; + const bool useSetFileIoOverlappedRange = + ior->overlapped_fd && imports.SetFileIoOverlappedRange && items > 42; + const size_t ceiling = + useSetFileIoOverlappedRange + ? ((items < 65536 / 2 / sizeof(ior_item_t)) ? 
65536 : 65536 * 4) + : 1024; + const size_t bytes = ceil_powerof2(sizeof(ior_item_t) * items, ceiling); + items = bytes / sizeof(ior_item_t); +#endif /* Windows */ + + if (items != ior->allocated) { + assert(items >= osal_ioring_used(ior)); + if (items < ior->allocated) + ior_cleanup(ior, items); +#if defined(_WIN32) || defined(_WIN64) + void *ptr = osal_realloc( + ior->event_pool, + (items + /* extra for waiting the async_done */ 1) * sizeof(HANDLE)); + if (unlikely(!ptr)) + return MDBX_ENOMEM; + ior->event_pool = ptr; + + int err = osal_memalign_alloc(ceiling, bytes, &ptr); + if (unlikely(err != MDBX_SUCCESS)) + return err; + if (ior->pool) { + memcpy(ptr, ior->pool, ior->allocated * sizeof(ior_item_t)); + osal_memalign_free(ior->pool); + } +#else + void *ptr = osal_realloc(ior->pool, sizeof(ior_item_t) * items); + if (unlikely(!ptr)) + return MDBX_ENOMEM; +#endif + ior->pool = ptr; + + if (items > ior->allocated) + memset(ior->pool + ior->allocated, 0, + sizeof(ior_item_t) * (items - ior->allocated)); + ior->allocated = (unsigned)items; + ior->boundary = ptr_disp(ior->pool, ior->allocated); +#if defined(_WIN32) || defined(_WIN64) + if (useSetFileIoOverlappedRange) { + if (imports.SetFileIoOverlappedRange(ior->overlapped_fd, ptr, + (ULONG)bytes)) + ior->state += IOR_STATE_LOCKED; + else + return GetLastError(); + } +#endif /* Windows */ + } + return MDBX_SUCCESS; +} + +MDBX_INTERNAL void osal_ioring_destroy(osal_ioring_t *ior) { + if (ior->allocated) + ior_cleanup(ior, 0); +#if defined(_WIN32) || defined(_WIN64) + osal_memalign_free(ior->pool); + osal_free(ior->event_pool); + CloseHandle(ior->async_done); + if (ior->overlapped_fd) + CloseHandle(ior->overlapped_fd); +#else + osal_free(ior->pool); +#endif + memset(ior, 0, sizeof(osal_ioring_t)); +} + +/*----------------------------------------------------------------------------*/ + +MDBX_INTERNAL int osal_removefile(const pathchar_t *pathname) { +#if defined(_WIN32) || defined(_WIN64) + return 
DeleteFileW(pathname) ? MDBX_SUCCESS : (int)GetLastError(); +#else + return unlink(pathname) ? errno : MDBX_SUCCESS; +#endif +} + +#if !(defined(_WIN32) || defined(_WIN64)) +static bool is_valid_fd(int fd) { return !(isatty(fd) < 0 && errno == EBADF); } +#endif /*! Windows */ + +MDBX_INTERNAL int osal_removedirectory(const pathchar_t *pathname) { +#if defined(_WIN32) || defined(_WIN64) + return RemoveDirectoryW(pathname) ? MDBX_SUCCESS : (int)GetLastError(); +#else + return rmdir(pathname) ? errno : MDBX_SUCCESS; +#endif +} + +MDBX_INTERNAL int osal_fileexists(const pathchar_t *pathname) { +#if defined(_WIN32) || defined(_WIN64) + if (GetFileAttributesW(pathname) != INVALID_FILE_ATTRIBUTES) + return MDBX_RESULT_TRUE; + int err = GetLastError(); + return (err == ERROR_FILE_NOT_FOUND || err == ERROR_PATH_NOT_FOUND) + ? MDBX_RESULT_FALSE + : err; +#else + if (access(pathname, F_OK) == 0) + return MDBX_RESULT_TRUE; + int err = errno; + return (err == ENOENT || err == ENOTDIR) ? MDBX_RESULT_FALSE : err; +#endif +} + +MDBX_INTERNAL pathchar_t *osal_fileext(const pathchar_t *pathname, size_t len) { + const pathchar_t *ext = nullptr; + for (size_t i = 0; i < len && pathname[i]; i++) + if (pathname[i] == '.') + ext = pathname + i; + else if (osal_isdirsep(pathname[i])) + ext = nullptr; + return (pathchar_t *)ext; +} + +MDBX_INTERNAL bool osal_pathequal(const pathchar_t *l, const pathchar_t *r, + size_t len) { +#if defined(_WIN32) || defined(_WIN64) + for (size_t i = 0; i < len; ++i) { + pathchar_t a = l[i]; + pathchar_t b = r[i]; + a = (a == '\\') ? '/' : a; + b = (b == '\\') ? 
'/' : b; + if (a != b) + return false; + } + return true; +#else + return memcmp(l, r, len * sizeof(pathchar_t)) == 0; +#endif +} + +MDBX_INTERNAL int osal_openfile(const enum osal_openfile_purpose purpose, + const MDBX_env *env, const pathchar_t *pathname, + mdbx_filehandle_t *fd, + mdbx_mode_t unix_mode_bits) { + *fd = INVALID_HANDLE_VALUE; + +#if defined(_WIN32) || defined(_WIN64) + DWORD CreationDisposition = unix_mode_bits ? OPEN_ALWAYS : OPEN_EXISTING; + DWORD FlagsAndAttributes = + FILE_FLAG_POSIX_SEMANTICS | FILE_ATTRIBUTE_NOT_CONTENT_INDEXED; + DWORD DesiredAccess = FILE_READ_ATTRIBUTES; + DWORD ShareMode = + (env->flags & MDBX_EXCLUSIVE) ? 0 : (FILE_SHARE_READ | FILE_SHARE_WRITE); + + switch (purpose) { + default: + return ERROR_INVALID_PARAMETER; + case MDBX_OPEN_LCK: + CreationDisposition = OPEN_ALWAYS; + DesiredAccess |= GENERIC_READ | GENERIC_WRITE; + FlagsAndAttributes |= FILE_ATTRIBUTE_HIDDEN | FILE_ATTRIBUTE_TEMPORARY; + break; + case MDBX_OPEN_DXB_READ: + CreationDisposition = OPEN_EXISTING; + DesiredAccess |= GENERIC_READ; + ShareMode |= FILE_SHARE_READ; + break; + case MDBX_OPEN_DXB_LAZY: + DesiredAccess |= GENERIC_READ | GENERIC_WRITE; + break; + case MDBX_OPEN_DXB_OVERLAPPED_DIRECT: + FlagsAndAttributes |= FILE_FLAG_NO_BUFFERING; + /* fall through */ + __fallthrough; + case MDBX_OPEN_DXB_OVERLAPPED: + FlagsAndAttributes |= FILE_FLAG_OVERLAPPED; + /* fall through */ + __fallthrough; + case MDBX_OPEN_DXB_DSYNC: + CreationDisposition = OPEN_EXISTING; + DesiredAccess |= GENERIC_WRITE | GENERIC_READ; + FlagsAndAttributes |= FILE_FLAG_WRITE_THROUGH; + break; + case MDBX_OPEN_COPY: + CreationDisposition = CREATE_NEW; + ShareMode = 0; + DesiredAccess |= GENERIC_WRITE; + if (env->ps >= globals.sys_pagesize) + FlagsAndAttributes |= FILE_FLAG_NO_BUFFERING; + break; + case MDBX_OPEN_DELETE: + CreationDisposition = OPEN_EXISTING; + ShareMode |= FILE_SHARE_DELETE; + DesiredAccess = + FILE_READ_ATTRIBUTES | FILE_WRITE_ATTRIBUTES | DELETE | SYNCHRONIZE; + 
break; + } + + *fd = CreateFileW(pathname, DesiredAccess, ShareMode, nullptr, + CreationDisposition, FlagsAndAttributes, nullptr); + if (*fd == INVALID_HANDLE_VALUE) { + int err = (int)GetLastError(); + if (err == ERROR_ACCESS_DENIED && purpose == MDBX_OPEN_LCK) { + if (GetFileAttributesW(pathname) == INVALID_FILE_ATTRIBUTES && + GetLastError() == ERROR_FILE_NOT_FOUND) + err = ERROR_FILE_NOT_FOUND; + } + return err; + } + + BY_HANDLE_FILE_INFORMATION info; + if (!GetFileInformationByHandle(*fd, &info)) { + int err = (int)GetLastError(); + CloseHandle(*fd); + *fd = INVALID_HANDLE_VALUE; + return err; + } + const DWORD AttributesDiff = + (info.dwFileAttributes ^ FlagsAndAttributes) & + (FILE_ATTRIBUTE_HIDDEN | FILE_ATTRIBUTE_NOT_CONTENT_INDEXED | + FILE_ATTRIBUTE_TEMPORARY | FILE_ATTRIBUTE_COMPRESSED); + if (AttributesDiff) + (void)SetFileAttributesW(pathname, info.dwFileAttributes ^ AttributesDiff); + +#else + int flags = unix_mode_bits ? O_CREAT : 0; + switch (purpose) { + default: + return EINVAL; + case MDBX_OPEN_LCK: + flags |= O_RDWR; + break; + case MDBX_OPEN_DXB_READ: + flags = O_RDONLY; + break; + case MDBX_OPEN_DXB_LAZY: + flags |= O_RDWR; + break; + case MDBX_OPEN_COPY: + flags = O_CREAT | O_WRONLY | O_EXCL; + break; + case MDBX_OPEN_DXB_DSYNC: + flags |= O_WRONLY; +#if defined(O_DSYNC) + flags |= O_DSYNC; +#elif defined(O_SYNC) + flags |= O_SYNC; +#elif defined(O_FSYNC) + flags |= O_FSYNC; +#endif + break; + case MDBX_OPEN_DELETE: + flags = O_RDWR; + break; + } + + const bool direct_nocache_for_copy = + env->ps >= globals.sys_pagesize && purpose == MDBX_OPEN_COPY; + if (direct_nocache_for_copy) { +#if defined(O_DIRECT) + flags |= O_DIRECT; +#endif /* O_DIRECT */ +#if defined(O_NOCACHE) + flags |= O_NOCACHE; +#endif /* O_NOCACHE */ + } + +#ifdef O_CLOEXEC + flags |= O_CLOEXEC; +#endif /* O_CLOEXEC */ + + /* Safeguard for https://libmdbx.dqdkfa.ru/dead-github/issues/144 */ +#if STDIN_FILENO == 0 && STDOUT_FILENO == 1 && STDERR_FILENO == 2 + int stub_fd0 = 
-1, stub_fd1 = -1, stub_fd2 = -1; + static const char dev_null[] = "/dev/null"; + if (!is_valid_fd(STDIN_FILENO)) { + WARNING("STD%s_FILENO/%d is invalid, open %s for temporary stub", "IN", + STDIN_FILENO, dev_null); + stub_fd0 = open(dev_null, O_RDONLY | O_NOCTTY); + } + if (!is_valid_fd(STDOUT_FILENO)) { + WARNING("STD%s_FILENO/%d is invalid, open %s for temporary stub", "OUT", + STDOUT_FILENO, dev_null); + stub_fd1 = open(dev_null, O_WRONLY | O_NOCTTY); + } + if (!is_valid_fd(STDERR_FILENO)) { + WARNING("STD%s_FILENO/%d is invalid, open %s for temporary stub", "ERR", + STDERR_FILENO, dev_null); + stub_fd2 = open(dev_null, O_WRONLY | O_NOCTTY); + } +#else +#error "Unexpected or unsupported UNIX or POSIX system" +#endif /* STDIN_FILENO == 0 && STDERR_FILENO == 2 */ + + *fd = open(pathname, flags, unix_mode_bits); +#if defined(O_DIRECT) + if (*fd < 0 && (flags & O_DIRECT) && + (errno == EINVAL || errno == EAFNOSUPPORT)) { + flags &= ~(O_DIRECT | O_EXCL); + *fd = open(pathname, flags, unix_mode_bits); + } +#endif /* O_DIRECT */ + + if (*fd < 0 && errno == EACCES && purpose == MDBX_OPEN_LCK) { + struct stat unused; + if (stat(pathname, &unused) == 0 || errno != ENOENT) + errno = EACCES /* restore errno if file exists */; + } + + /* Safeguard for https://libmdbx.dqdkfa.ru/dead-github/issues/144 */ +#if STDIN_FILENO == 0 && STDOUT_FILENO == 1 && STDERR_FILENO == 2 + if (*fd == STDIN_FILENO) { + WARNING("Got STD%s_FILENO/%d, avoid using it by dup(fd)", "IN", + STDIN_FILENO); + assert(stub_fd0 == -1); + *fd = dup(stub_fd0 = *fd); + } + if (*fd == STDOUT_FILENO) { + WARNING("Got STD%s_FILENO/%d, avoid using it by dup(fd)", "OUT", + STDOUT_FILENO); + assert(stub_fd1 == -1); + *fd = dup(stub_fd1 = *fd); + } + if (*fd == STDERR_FILENO) { + WARNING("Got STD%s_FILENO/%d, avoid using it by dup(fd)", "ERR", + STDERR_FILENO); + assert(stub_fd2 == -1); + *fd = dup(stub_fd2 = *fd); + } + if (stub_fd0 != -1) + close(stub_fd0); + if (stub_fd1 != -1) + close(stub_fd1); + if (stub_fd2 
!= -1) + close(stub_fd2); + if (*fd >= STDIN_FILENO && *fd <= STDERR_FILENO) { + ERROR("Rejecting the use of a FD in the range " + "STDIN_FILENO/%d..STDERR_FILENO/%d to prevent database corruption", + STDIN_FILENO, STDERR_FILENO); + close(*fd); + return EBADF; + } +#else +#error "Unexpected or unsupported UNIX or POSIX system" +#endif /* STDIN_FILENO == 0 && STDERR_FILENO == 2 */ + + if (*fd < 0) + return errno; + +#if defined(FD_CLOEXEC) && !defined(O_CLOEXEC) + const int fd_flags = fcntl(*fd, F_GETFD); + if (fd_flags != -1) + (void)fcntl(*fd, F_SETFD, fd_flags | FD_CLOEXEC); +#endif /* FD_CLOEXEC && !O_CLOEXEC */ + + if (direct_nocache_for_copy) { +#if defined(F_NOCACHE) && !defined(O_NOCACHE) + (void)fcntl(*fd, F_NOCACHE, 1); +#endif /* F_NOCACHE */ + } + +#endif + return MDBX_SUCCESS; +} + +MDBX_INTERNAL int osal_closefile(mdbx_filehandle_t fd) { +#if defined(_WIN32) || defined(_WIN64) + return CloseHandle(fd) ? MDBX_SUCCESS : (int)GetLastError(); +#else + assert(fd > STDERR_FILENO); + return (close(fd) == 0) ? MDBX_SUCCESS : errno; +#endif +} + +MDBX_INTERNAL int osal_pread(mdbx_filehandle_t fd, void *buf, size_t bytes, + uint64_t offset) { + if (bytes > MAX_WRITE) + return MDBX_EINVAL; +#if defined(_WIN32) || defined(_WIN64) + OVERLAPPED ov; + ov.hEvent = 0; + ov.Offset = (DWORD)offset; + ov.OffsetHigh = HIGH_DWORD(offset); + + DWORD read = 0; + if (unlikely(!ReadFile(fd, buf, (DWORD)bytes, &read, &ov))) { + int rc = (int)GetLastError(); + return (rc == MDBX_SUCCESS) ? /* paranoia */ ERROR_READ_FAULT : rc; + } +#else + STATIC_ASSERT_MSG(sizeof(off_t) >= sizeof(size_t), + "libmdbx requires 64-bit file I/O on 64-bit systems"); + intptr_t read = pread(fd, buf, bytes, offset); + if (read < 0) { + int rc = errno; + return (rc == MDBX_SUCCESS) ? /* paranoia */ MDBX_EIO : rc; + } +#endif + return (bytes == (size_t)read) ? 
MDBX_SUCCESS : MDBX_ENODATA; +} + +MDBX_INTERNAL int osal_pwrite(mdbx_filehandle_t fd, const void *buf, + size_t bytes, uint64_t offset) { + while (true) { +#if defined(_WIN32) || defined(_WIN64) + OVERLAPPED ov; + ov.hEvent = 0; + ov.Offset = (DWORD)offset; + ov.OffsetHigh = HIGH_DWORD(offset); + + DWORD written; + if (unlikely(!WriteFile( + fd, buf, likely(bytes <= MAX_WRITE) ? (DWORD)bytes : MAX_WRITE, + &written, &ov))) + return (int)GetLastError(); + if (likely(bytes == written)) + return MDBX_SUCCESS; +#else + STATIC_ASSERT_MSG(sizeof(off_t) >= sizeof(size_t), + "libmdbx requires 64-bit file I/O on 64-bit systems"); + const intptr_t written = + pwrite(fd, buf, likely(bytes <= MAX_WRITE) ? bytes : MAX_WRITE, offset); + if (likely(bytes == (size_t)written)) + return MDBX_SUCCESS; + if (written < 0) { + const int rc = errno; + if (rc != EINTR) + return rc; + continue; + } +#endif + bytes -= written; + offset += written; + buf = ptr_disp(buf, written); + } +} + +MDBX_INTERNAL int osal_write(mdbx_filehandle_t fd, const void *buf, + size_t bytes) { + while (true) { +#if defined(_WIN32) || defined(_WIN64) + DWORD written; + if (unlikely(!WriteFile( + fd, buf, likely(bytes <= MAX_WRITE) ? (DWORD)bytes : MAX_WRITE, + &written, nullptr))) + return (int)GetLastError(); + if (likely(bytes == written)) + return MDBX_SUCCESS; +#else + STATIC_ASSERT_MSG(sizeof(off_t) >= sizeof(size_t), + "libmdbx requires 64-bit file I/O on 64-bit systems"); + const intptr_t written = + write(fd, buf, likely(bytes <= MAX_WRITE) ? 
bytes : MAX_WRITE); + if (likely(bytes == (size_t)written)) + return MDBX_SUCCESS; + if (written < 0) { + const int rc = errno; + if (rc != EINTR) + return rc; + continue; + } +#endif + bytes -= written; + buf = ptr_disp(buf, written); + } +} + +int osal_pwritev(mdbx_filehandle_t fd, struct iovec *iov, size_t sgvcnt, + uint64_t offset) { + size_t expected = 0; + for (size_t i = 0; i < sgvcnt; ++i) + expected += iov[i].iov_len; +#if !MDBX_HAVE_PWRITEV + size_t written = 0; + for (size_t i = 0; i < sgvcnt; ++i) { + int rc = osal_pwrite(fd, iov[i].iov_base, iov[i].iov_len, offset); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; + written += iov[i].iov_len; + offset += iov[i].iov_len; + } + return (expected == written) ? MDBX_SUCCESS + : MDBX_EIO /* ERROR_WRITE_FAULT */; +#else + int rc; + intptr_t written; + do { + STATIC_ASSERT_MSG(sizeof(off_t) >= sizeof(size_t), + "libmdbx requires 64-bit file I/O on 64-bit systems"); + written = pwritev(fd, iov, sgvcnt, offset); + if (likely(expected == (size_t)written)) + return MDBX_SUCCESS; + rc = errno; + } while (rc == EINTR); + return (written < 0) ? rc : MDBX_EIO /* Use which error code? */; +#endif +} + +MDBX_INTERNAL int osal_fsync(mdbx_filehandle_t fd, + enum osal_syncmode_bits mode_bits) { +#if defined(_WIN32) || defined(_WIN64) + if ((mode_bits & (MDBX_SYNC_DATA | MDBX_SYNC_IODQ)) && !FlushFileBuffers(fd)) + return (int)GetLastError(); + return MDBX_SUCCESS; +#else + +#if defined(__APPLE__) && \ + MDBX_OSX_SPEED_INSTEADOF_DURABILITY == MDBX_OSX_WANNA_DURABILITY + if (mode_bits & MDBX_SYNC_IODQ) + return likely(fcntl(fd, F_FULLFSYNC) != -1) ? MDBX_SUCCESS : errno; +#endif /* MacOS */ + + /* LY: This approach is always safe and without appreciable performance + * degradation, even on a kernel with fdatasync's bug. 
+ * + * For more info about of a corresponding fdatasync() bug + * see http://www.spinics.net/lists/linux-ext4/msg33714.html */ + while (1) { + switch (mode_bits & (MDBX_SYNC_DATA | MDBX_SYNC_SIZE)) { + case MDBX_SYNC_NONE: + case MDBX_SYNC_KICK: + return MDBX_SUCCESS /* nothing to do */; +#if defined(_POSIX_SYNCHRONIZED_IO) && _POSIX_SYNCHRONIZED_IO > 0 + case MDBX_SYNC_DATA: + if (likely(fdatasync(fd) == 0)) + return MDBX_SUCCESS; + break /* error */; +#if defined(__linux__) || defined(__gnu_linux__) + case MDBX_SYNC_SIZE: + assert(globals.linux_kernel_version >= 0x03060000); + return MDBX_SUCCESS; +#endif /* Linux */ +#endif /* _POSIX_SYNCHRONIZED_IO > 0 */ + default: + if (likely(fsync(fd) == 0)) + return MDBX_SUCCESS; + } + + int rc = errno; + if (rc != EINTR) + return rc; + } +#endif +} + +int osal_filesize(mdbx_filehandle_t fd, uint64_t *length) { +#if defined(_WIN32) || defined(_WIN64) + BY_HANDLE_FILE_INFORMATION info; + if (!GetFileInformationByHandle(fd, &info)) + return (int)GetLastError(); + *length = info.nFileSizeLow | (uint64_t)info.nFileSizeHigh << 32; +#else + struct stat st; + + STATIC_ASSERT_MSG(sizeof(off_t) <= sizeof(uint64_t), + "libmdbx requires 64-bit file I/O on 64-bit systems"); + if (fstat(fd, &st)) + return errno; + + *length = st.st_size; +#endif + return MDBX_SUCCESS; +} + +MDBX_INTERNAL int osal_is_pipe(mdbx_filehandle_t fd) { +#if defined(_WIN32) || defined(_WIN64) + switch (GetFileType(fd)) { + case FILE_TYPE_DISK: + return MDBX_RESULT_FALSE; + case FILE_TYPE_CHAR: + case FILE_TYPE_PIPE: + return MDBX_RESULT_TRUE; + default: + return (int)GetLastError(); + } +#else + struct stat info; + if (fstat(fd, &info)) + return errno; + switch (info.st_mode & S_IFMT) { + case S_IFBLK: + case S_IFREG: + return MDBX_RESULT_FALSE; + case S_IFCHR: + case S_IFIFO: + case S_IFSOCK: + return MDBX_RESULT_TRUE; + case S_IFDIR: + case S_IFLNK: + default: + return MDBX_INCOMPATIBLE; + } +#endif +} + +MDBX_INTERNAL int osal_ftruncate(mdbx_filehandle_t 
fd, uint64_t length) { +#if defined(_WIN32) || defined(_WIN64) + if (imports.SetFileInformationByHandle) { + FILE_END_OF_FILE_INFO EndOfFileInfo; + EndOfFileInfo.EndOfFile.QuadPart = length; + return imports.SetFileInformationByHandle(fd, FileEndOfFileInfo, + &EndOfFileInfo, + sizeof(FILE_END_OF_FILE_INFO)) + ? MDBX_SUCCESS + : (int)GetLastError(); + } else { + LARGE_INTEGER li; + li.QuadPart = length; + return (SetFilePointerEx(fd, li, nullptr, FILE_BEGIN) && SetEndOfFile(fd)) + ? MDBX_SUCCESS + : (int)GetLastError(); + } +#else + STATIC_ASSERT_MSG(sizeof(off_t) >= sizeof(size_t), + "libmdbx requires 64-bit file I/O on 64-bit systems"); + return ftruncate(fd, length) == 0 ? MDBX_SUCCESS : errno; +#endif +} + +MDBX_INTERNAL int osal_fseek(mdbx_filehandle_t fd, uint64_t pos) { +#if defined(_WIN32) || defined(_WIN64) + LARGE_INTEGER li; + li.QuadPart = pos; + return SetFilePointerEx(fd, li, nullptr, FILE_BEGIN) ? MDBX_SUCCESS + : (int)GetLastError(); +#else + STATIC_ASSERT_MSG(sizeof(off_t) >= sizeof(size_t), + "libmdbx requires 64-bit file I/O on 64-bit systems"); + return (lseek(fd, pos, SEEK_SET) < 0) ? errno : MDBX_SUCCESS; +#endif +} + +/*----------------------------------------------------------------------------*/ + +MDBX_INTERNAL int +osal_thread_create(osal_thread_t *thread, + THREAD_RESULT(THREAD_CALL *start_routine)(void *), + void *arg) { +#if defined(_WIN32) || defined(_WIN64) + *thread = CreateThread(nullptr, 0, start_routine, arg, 0, nullptr); + return *thread ? 
MDBX_SUCCESS : (int)GetLastError(); +#else + return pthread_create(thread, nullptr, start_routine, arg); +#endif +} + +MDBX_INTERNAL int osal_thread_join(osal_thread_t thread) { +#if defined(_WIN32) || defined(_WIN64) + DWORD code = WaitForSingleObject(thread, INFINITE); + return waitstatus2errcode(code); +#else + void *unused_retval = &unused_retval; + return pthread_join(thread, &unused_retval); +#endif +} + +/*----------------------------------------------------------------------------*/ + +MDBX_INTERNAL int osal_msync(const osal_mmap_t *map, size_t offset, + size_t length, enum osal_syncmode_bits mode_bits) { + if (!MDBX_MMAP_USE_MS_ASYNC && mode_bits == MDBX_SYNC_NONE) + return MDBX_SUCCESS; + + void *ptr = ptr_disp(map->base, offset); +#if defined(_WIN32) || defined(_WIN64) + if (!FlushViewOfFile(ptr, length)) + return (int)GetLastError(); + if ((mode_bits & (MDBX_SYNC_DATA | MDBX_SYNC_IODQ)) && + !FlushFileBuffers(map->fd)) + return (int)GetLastError(); +#else +#if defined(__linux__) || defined(__gnu_linux__) + /* Since Linux 2.6.19, MS_ASYNC is in fact a no-op. The kernel properly + * tracks dirty pages and flushes ones as necessary. */ + // + // However, this behavior may be changed in custom kernels, + // so just leave such optimization to the libc discretion. + // NOTE: The MDBX_MMAP_USE_MS_ASYNC must be defined to 1 for such cases. + // + // assert(mdbx.linux_kernel_version > 0x02061300); + // if (mode_bits <= MDBX_SYNC_KICK) + // return MDBX_SUCCESS; +#endif /* Linux */ + if (msync(ptr, length, (mode_bits & MDBX_SYNC_DATA) ? 
MS_SYNC : MS_ASYNC)) + return errno; + if ((mode_bits & MDBX_SYNC_SIZE) && fsync(map->fd)) + return errno; +#endif + return MDBX_SUCCESS; +} + +MDBX_INTERNAL int osal_check_fs_rdonly(mdbx_filehandle_t handle, + const pathchar_t *pathname, int err) { +#if defined(_WIN32) || defined(_WIN64) + (void)pathname; + (void)err; + if (!imports.GetVolumeInformationByHandleW) + return MDBX_ENOSYS; + DWORD unused, flags; + if (!imports.GetVolumeInformationByHandleW(handle, nullptr, 0, nullptr, + &unused, &flags, nullptr, 0)) + return (int)GetLastError(); + if ((flags & FILE_READ_ONLY_VOLUME) == 0) + return MDBX_EACCESS; +#else + struct statvfs info; + if (err != MDBX_ENOFILE) { + if (statvfs(pathname, &info) == 0) + return (info.f_flag & ST_RDONLY) ? MDBX_SUCCESS : err; + if (errno != MDBX_ENOFILE) + return errno; + } + if (fstatvfs(handle, &info)) + return errno; + if ((info.f_flag & ST_RDONLY) == 0) + return (err == MDBX_ENOFILE) ? MDBX_EACCESS : err; +#endif /* !Windows */ + return MDBX_SUCCESS; +} + +MDBX_INTERNAL int osal_check_fs_incore(mdbx_filehandle_t handle) { +#if defined(_WIN32) || defined(_WIN64) + (void)handle; +#else + struct statfs statfs_info; + if (fstatfs(handle, &statfs_info)) + return errno; + +#if defined(__OpenBSD__) + const unsigned type = 0; +#else + const unsigned type = statfs_info.f_type; +#endif + switch (type) { + case 0x28cd3d45 /* CRAMFS_MAGIC */: + case 0x858458f6 /* RAMFS_MAGIC */: + case 0x01021994 /* TMPFS_MAGIC */: + case 0x73717368 /* SQUASHFS_MAGIC */: + case 0x7275 /* ROMFS_MAGIC */: + return MDBX_RESULT_TRUE; + } + +#if defined(__FreeBSD__) || defined(__NetBSD__) || defined(__OpenBSD__) || \ + defined(__BSD__) || defined(__bsdi__) || defined(__DragonFly__) || \ + defined(__APPLE__) || defined(__MACH__) || defined(MFSNAMELEN) || \ + defined(MFSTYPENAMELEN) || defined(VFS_NAMELEN) + const char *const name = statfs_info.f_fstypename; + const size_t name_len = sizeof(statfs_info.f_fstypename); +#else + const char *const name = ""; + const 
size_t name_len = 0; +#endif + if (name_len) { + if (strncasecmp("tmpfs", name, 6) == 0 || + strncasecmp("mfs", name, 4) == 0 || + strncasecmp("ramfs", name, 6) == 0 || + strncasecmp("romfs", name, 6) == 0) + return MDBX_RESULT_TRUE; + } +#endif /* !Windows */ + + return MDBX_RESULT_FALSE; +} + +static int osal_check_fs_local(mdbx_filehandle_t handle, int flags) { +#if defined(_WIN32) || defined(_WIN64) + if (globals.running_under_Wine && !(flags & MDBX_EXCLUSIVE)) + return ERROR_NOT_CAPABLE /* workaround for Wine */; + + if (GetFileType(handle) != FILE_TYPE_DISK) + return ERROR_FILE_OFFLINE; + + if (imports.GetFileInformationByHandleEx) { + FILE_REMOTE_PROTOCOL_INFO RemoteProtocolInfo; + if (imports.GetFileInformationByHandleEx(handle, FileRemoteProtocolInfo, + &RemoteProtocolInfo, + sizeof(RemoteProtocolInfo))) { + if ((RemoteProtocolInfo.Flags & REMOTE_PROTOCOL_INFO_FLAG_OFFLINE) && + !(flags & MDBX_RDONLY)) + return ERROR_FILE_OFFLINE; + if (!(RemoteProtocolInfo.Flags & REMOTE_PROTOCOL_INFO_FLAG_LOOPBACK) && + !(flags & MDBX_EXCLUSIVE)) + return ERROR_REMOTE_STORAGE_MEDIA_ERROR; + } + } + + if (imports.NtFsControlFile) { + NTSTATUS rc; + struct { + WOF_EXTERNAL_INFO wof_info; + union { + WIM_PROVIDER_EXTERNAL_INFO wim_info; + FILE_PROVIDER_EXTERNAL_INFO_V1 file_info; + }; + size_t reserved_for_microsoft_madness[42]; + } GetExternalBacking_OutputBuffer; + IO_STATUS_BLOCK StatusBlock; + rc = imports.NtFsControlFile(handle, nullptr, nullptr, nullptr, + &StatusBlock, FSCTL_GET_EXTERNAL_BACKING, + nullptr, 0, &GetExternalBacking_OutputBuffer, + sizeof(GetExternalBacking_OutputBuffer)); + if (NT_SUCCESS(rc)) { + if (!(flags & MDBX_EXCLUSIVE)) + return ERROR_REMOTE_STORAGE_MEDIA_ERROR; + } else if (rc != STATUS_OBJECT_NOT_EXTERNALLY_BACKED && + rc != STATUS_INVALID_DEVICE_REQUEST && + rc != STATUS_NOT_SUPPORTED) + return ntstatus2errcode(rc); + } + + if (imports.GetVolumeInformationByHandleW && + imports.GetFinalPathNameByHandleW) { + WCHAR *PathBuffer = 
osal_malloc(sizeof(WCHAR) * INT16_MAX); + if (!PathBuffer) + return MDBX_ENOMEM; + + int rc = MDBX_SUCCESS; + DWORD VolumeSerialNumber, FileSystemFlags; + if (!imports.GetVolumeInformationByHandleW(handle, PathBuffer, INT16_MAX, + &VolumeSerialNumber, nullptr, + &FileSystemFlags, nullptr, 0)) { + rc = (int)GetLastError(); + goto bailout; + } + + if ((flags & MDBX_RDONLY) == 0) { + if (FileSystemFlags & + (FILE_SEQUENTIAL_WRITE_ONCE | FILE_READ_ONLY_VOLUME | + FILE_VOLUME_IS_COMPRESSED)) { + rc = ERROR_REMOTE_STORAGE_MEDIA_ERROR; + goto bailout; + } + } + + if (imports.GetFinalPathNameByHandleW(handle, PathBuffer, INT16_MAX, + FILE_NAME_NORMALIZED | + VOLUME_NAME_NT)) { + if (_wcsnicmp(PathBuffer, L"\\Device\\Mup\\", 12) == 0) { + if (!(flags & MDBX_EXCLUSIVE)) { + rc = ERROR_REMOTE_STORAGE_MEDIA_ERROR; + goto bailout; + } + } + } + + if (F_ISSET(flags, MDBX_RDONLY | MDBX_EXCLUSIVE) && + (FileSystemFlags & FILE_READ_ONLY_VOLUME)) { + /* without-LCK (exclusive readonly) mode for DB on a read-only volume */ + goto bailout; + } + + if (imports.GetFinalPathNameByHandleW(handle, PathBuffer, INT16_MAX, + FILE_NAME_NORMALIZED | + VOLUME_NAME_DOS)) { + UINT DriveType = GetDriveTypeW(PathBuffer); + if (DriveType == DRIVE_NO_ROOT_DIR && + _wcsnicmp(PathBuffer, L"\\\\?\\", 4) == 0 && + _wcsnicmp(PathBuffer + 5, L":\\", 2) == 0) { + PathBuffer[7] = 0; + DriveType = GetDriveTypeW(PathBuffer + 4); + } + switch (DriveType) { + case DRIVE_CDROM: + if (flags & MDBX_RDONLY) + break; + // fall through + case DRIVE_UNKNOWN: + case DRIVE_NO_ROOT_DIR: + case DRIVE_REMOTE: + default: + if (!(flags & MDBX_EXCLUSIVE)) + rc = ERROR_REMOTE_STORAGE_MEDIA_ERROR; + // fall through + case DRIVE_REMOVABLE: + case DRIVE_FIXED: + case DRIVE_RAMDISK: + break; + } + } + + bailout: + osal_free(PathBuffer); + return rc; + } + +#else + + struct statvfs statvfs_info; + if (fstatvfs(handle, &statvfs_info)) + return errno; +#if defined(ST_LOCAL) || defined(ST_EXPORTED) + const unsigned long st_flags = 
statvfs_info.f_flag; +#endif /* ST_LOCAL || ST_EXPORTED */ + +#if defined(__NetBSD__) + const unsigned type = 0; + const char *const name = statvfs_info.f_fstypename; + const size_t name_len = VFS_NAMELEN; +#elif defined(_AIX) || defined(__OS400__) + const char *const name = statvfs_info.f_basetype; + const size_t name_len = sizeof(statvfs_info.f_basetype); + struct stat st; + if (fstat(handle, &st)) + return errno; + const unsigned type = st.st_vfstype; + if ((st.st_flag & FS_REMOTE) != 0 && !(flags & MDBX_EXCLUSIVE)) + return MDBX_EREMOTE; +#elif defined(FSTYPSZ) || defined(_FSTYPSZ) + const unsigned type = 0; + const char *const name = statvfs_info.f_basetype; + const size_t name_len = sizeof(statvfs_info.f_basetype); +#elif defined(__sun) || defined(__SVR4) || defined(__svr4__) || \ + defined(ST_FSTYPSZ) || defined(_ST_FSTYPSZ) + const unsigned type = 0; + struct stat st; + if (fstat(handle, &st)) + return errno; + const char *const name = st.st_fstype; + const size_t name_len = strlen(name); +#else + struct statfs statfs_info; + if (fstatfs(handle, &statfs_info)) + return errno; +#if defined(__OpenBSD__) + const unsigned type = 0; +#else + const unsigned type = statfs_info.f_type; +#endif +#if defined(MNT_LOCAL) || defined(MNT_EXPORTED) + const unsigned long mnt_flags = statfs_info.f_flags; +#endif /* MNT_LOCAL || MNT_EXPORTED */ +#if defined(__FreeBSD__) || defined(__NetBSD__) || defined(__OpenBSD__) || \ + defined(__BSD__) || defined(__bsdi__) || defined(__DragonFly__) || \ + defined(__APPLE__) || defined(__MACH__) || defined(MFSNAMELEN) || \ + defined(MFSTYPENAMELEN) || defined(VFS_NAMELEN) + const char *const name = statfs_info.f_fstypename; + const size_t name_len = sizeof(statfs_info.f_fstypename); +#elif defined(__ANDROID_API__) && __ANDROID_API__ < 21 + const char *const name = ""; + const unsigned name_len = 0; +#else + + const char *name = ""; + unsigned name_len = 0; + + struct stat st; + if (fstat(handle, &st)) + return errno; + + char 
pathbuf[PATH_MAX]; + FILE *mounted = nullptr; +#if defined(__linux__) || defined(__gnu_linux__) + mounted = setmntent("/proc/mounts", "r"); +#endif /* Linux */ + if (!mounted) + mounted = setmntent("/etc/mtab", "r"); + if (mounted) { + const struct mntent *ent; +#if defined(_BSD_SOURCE) || defined(_SVID_SOURCE) || defined(__BIONIC__) || \ + (defined(_DEFAULT_SOURCE) && __GLIBC_PREREQ(2, 19)) + struct mntent entbuf; + const bool should_copy = false; + while (nullptr != + (ent = getmntent_r(mounted, &entbuf, pathbuf, sizeof(pathbuf)))) +#else + const bool should_copy = true; + while (nullptr != (ent = getmntent(mounted))) +#endif + { + struct stat mnt; + if (!stat(ent->mnt_dir, &mnt) && mnt.st_dev == st.st_dev) { + if (should_copy) { + name = + strncpy(pathbuf, ent->mnt_fsname, name_len = sizeof(pathbuf) - 1); + pathbuf[name_len] = 0; + } else { + name = ent->mnt_fsname; + name_len = strlen(name); + } + break; + } + } + endmntent(mounted); + } +#endif /* !xBSD && !Android/Bionic */ +#endif + + if (name_len) { + if (((name_len > 2 && strncasecmp("nfs", name, 3) == 0) || + strncasecmp("cifs", name, name_len) == 0 || + strncasecmp("ncpfs", name, name_len) == 0 || + strncasecmp("smbfs", name, name_len) == 0 || + strcasecmp("9P" /* WSL2 */, name) == 0 || + ((name_len > 3 && strncasecmp("fuse", name, 4) == 0) && + strncasecmp("fuseblk", name, name_len) != 0)) && + !(flags & MDBX_EXCLUSIVE)) + return MDBX_EREMOTE; + if (strcasecmp("ftp", name) == 0 || strcasecmp("http", name) == 0 || + strcasecmp("sshfs", name) == 0) + return MDBX_EREMOTE; + } + +#ifdef ST_LOCAL + if ((st_flags & ST_LOCAL) == 0 && !(flags & MDBX_EXCLUSIVE)) + return MDBX_EREMOTE; +#elif defined(MNT_LOCAL) + if ((mnt_flags & MNT_LOCAL) == 0 && !(flags & MDBX_EXCLUSIVE)) + return MDBX_EREMOTE; +#endif /* ST/MNT_LOCAL */ + +#ifdef ST_EXPORTED + if ((st_flags & ST_EXPORTED) != 0 && !(flags & MDBX_RDONLY)) + return MDBX_EREMOTE; +#elif defined(MNT_EXPORTED) + if ((mnt_flags & MNT_EXPORTED) != 0 && !(flags & 
MDBX_RDONLY)) + return MDBX_EREMOTE; +#endif /* ST/MNT_EXPORTED */ + + switch (type) { + case 0xFF534D42 /* CIFS_MAGIC_NUMBER */: + case 0x6969 /* NFS_SUPER_MAGIC */: + case 0x564c /* NCP_SUPER_MAGIC */: + case 0x517B /* SMB_SUPER_MAGIC */: +#if defined(__digital__) || defined(__osf__) || defined(__osf) + case 0x0E /* Tru64 NFS */: +#endif +#ifdef ST_FST_NFS + case ST_FST_NFS: +#endif + if ((flags & MDBX_EXCLUSIVE) == 0) + return MDBX_EREMOTE; + case 0: + default: + break; + } +#endif /* Unix */ + + return MDBX_SUCCESS; +} + +static int check_mmap_limit(const size_t limit) { + const bool should_check = +#if defined(__SANITIZE_ADDRESS__) + true; +#else + RUNNING_ON_VALGRIND; +#endif /* __SANITIZE_ADDRESS__ */ + + if (should_check) { + intptr_t pagesize, total_ram_pages, avail_ram_pages; + int err = + mdbx_get_sysraminfo(&pagesize, &total_ram_pages, &avail_ram_pages); + if (unlikely(err != MDBX_SUCCESS)) + return err; + + const int log2page = log2n_powerof2(pagesize); + if ((limit >> (log2page + 7)) > (size_t)total_ram_pages || + (limit >> (log2page + 6)) > (size_t)avail_ram_pages) { + ERROR("%s (%zu pages) is too large for available (%zu pages) or total " + "(%zu pages) system RAM", + "database upper size limit", limit >> log2page, avail_ram_pages, + total_ram_pages); + return MDBX_TOO_LARGE; + } + } + + return MDBX_SUCCESS; +} + +MDBX_INTERNAL int osal_mmap(const int flags, osal_mmap_t *map, size_t size, + const size_t limit, const unsigned options) { + assert(size <= limit); + map->limit = 0; + map->current = 0; + map->base = nullptr; + map->filesize = 0; +#if defined(_WIN32) || defined(_WIN64) + map->section = nullptr; +#endif /* Windows */ + + int err = osal_check_fs_local(map->fd, flags); + if (unlikely(err != MDBX_SUCCESS)) + return err; + + err = check_mmap_limit(limit); + if (unlikely(err != MDBX_SUCCESS)) + return err; + + if ((flags & MDBX_RDONLY) == 0 && (options & MMAP_OPTION_TRUNCATE) != 0) { + err = osal_ftruncate(map->fd, size); + VERBOSE("ftruncate 
%zu, err %d", size, err); + if (err != MDBX_SUCCESS) + return err; + map->filesize = size; +#if !(defined(_WIN32) || defined(_WIN64)) + map->current = size; +#endif /* !Windows */ + } else { + err = osal_filesize(map->fd, &map->filesize); + VERBOSE("filesize %" PRIu64 ", err %d", map->filesize, err); + if (err != MDBX_SUCCESS) + return err; +#if defined(_WIN32) || defined(_WIN64) + if (map->filesize < size) { + WARNING("file size (%zu) less than requested for mapping (%zu)", + (size_t)map->filesize, size); + size = (size_t)map->filesize; + } +#else + map->current = (map->filesize > limit) ? limit : (size_t)map->filesize; +#endif /* !Windows */ + } + +#if defined(_WIN32) || defined(_WIN64) + LARGE_INTEGER SectionSize; + SectionSize.QuadPart = size; + err = NtCreateSection(&map->section, + /* DesiredAccess */ + (flags & MDBX_WRITEMAP) + ? SECTION_QUERY | SECTION_MAP_READ | + SECTION_EXTEND_SIZE | SECTION_MAP_WRITE + : SECTION_QUERY | SECTION_MAP_READ | + SECTION_EXTEND_SIZE, + /* ObjectAttributes */ nullptr, + /* MaximumSize (InitialSize) */ &SectionSize, + /* SectionPageProtection */ + (flags & MDBX_RDONLY) ? PAGE_READONLY : PAGE_READWRITE, + /* AllocationAttributes */ SEC_RESERVE, map->fd); + if (!NT_SUCCESS(err)) + return ntstatus2errcode(err); + + SIZE_T ViewSize = (flags & MDBX_RDONLY) ? 0 + : globals.running_under_Wine ? size + : limit; + err = NtMapViewOfSection( + map->section, GetCurrentProcess(), &map->base, + /* ZeroBits */ 0, + /* CommitSize */ 0, + /* SectionOffset */ nullptr, &ViewSize, + /* InheritDisposition */ ViewUnmap, + /* AllocationType */ (flags & MDBX_RDONLY) ? 0 : MEM_RESERVE, + /* Win32Protect */ + (flags & MDBX_WRITEMAP) ? 
PAGE_READWRITE : PAGE_READONLY); + if (!NT_SUCCESS(err)) { + NtClose(map->section); + map->section = 0; + map->base = nullptr; + return ntstatus2errcode(err); + } + assert(map->base != MAP_FAILED); + + map->current = (size_t)SectionSize.QuadPart; + map->limit = ViewSize; + +#else /* Windows */ + +#ifndef MAP_TRYFIXED +#define MAP_TRYFIXED 0 +#endif + +#ifndef MAP_HASSEMAPHORE +#define MAP_HASSEMAPHORE 0 +#endif + +#ifndef MAP_CONCEAL +#define MAP_CONCEAL 0 +#endif + +#ifndef MAP_NOSYNC +#define MAP_NOSYNC 0 +#endif + +#ifndef MAP_FIXED_NOREPLACE +#define MAP_FIXED_NOREPLACE 0 +#endif + +#ifndef MAP_NORESERVE +#define MAP_NORESERVE 0 +#endif + + map->base = mmap(nullptr, limit, + (flags & MDBX_WRITEMAP) ? PROT_READ | PROT_WRITE : PROT_READ, + MAP_SHARED | MAP_FILE | MAP_NORESERVE | + (F_ISSET(flags, MDBX_UTTERLY_NOSYNC) ? MAP_NOSYNC : 0) | + ((options & MMAP_OPTION_SEMAPHORE) + ? MAP_HASSEMAPHORE | MAP_NOSYNC + : MAP_CONCEAL), + map->fd, 0); + + if (unlikely(map->base == MAP_FAILED)) { + map->limit = 0; + map->current = 0; + map->base = nullptr; + assert(errno != 0); + return errno; + } + map->limit = limit; + +#if MDBX_ENABLE_MADVISE +#ifdef MADV_DONTFORK + if (unlikely(madvise(map->base, map->limit, MADV_DONTFORK) != 0)) + return errno; +#endif /* MADV_DONTFORK */ +#ifdef MADV_NOHUGEPAGE + (void)madvise(map->base, map->limit, MADV_NOHUGEPAGE); +#endif /* MADV_NOHUGEPAGE */ +#endif /* MDBX_ENABLE_MADVISE */ + +#endif /* ! Windows */ + + VALGRIND_MAKE_MEM_DEFINED(map->base, map->current); + MDBX_ASAN_UNPOISON_MEMORY_REGION(map->base, map->current); + return MDBX_SUCCESS; +} + +MDBX_INTERNAL int osal_munmap(osal_mmap_t *map) { + VALGRIND_MAKE_MEM_NOACCESS(map->base, map->current); + /* Unpoisoning is required for ASAN to avoid false-positive diagnostic + * when this memory will re-used by malloc or another mmapping. 
+ * See https://libmdbx.dqdkfa.ru/dead-github/pull/93#issuecomment-613687203 */ + MDBX_ASAN_UNPOISON_MEMORY_REGION( + map->base, (map->filesize && map->filesize < map->limit) ? map->filesize + : map->limit); +#if defined(_WIN32) || defined(_WIN64) + if (map->section) + NtClose(map->section); + NTSTATUS rc = NtUnmapViewOfSection(GetCurrentProcess(), map->base); + if (!NT_SUCCESS(rc)) + ntstatus2errcode(rc); +#else + if (unlikely(munmap(map->base, map->limit))) { + assert(errno != 0); + return errno; + } +#endif /* ! Windows */ + + map->limit = 0; + map->current = 0; + map->base = nullptr; + return MDBX_SUCCESS; +} + +MDBX_INTERNAL int osal_mresize(const int flags, osal_mmap_t *map, size_t size, + size_t limit) { + int rc = osal_filesize(map->fd, &map->filesize); + VERBOSE("flags 0x%x, size %zu, limit %zu, filesize %" PRIu64, flags, size, + limit, map->filesize); + assert(size <= limit); + if (rc != MDBX_SUCCESS) { + map->filesize = 0; + return rc; + } + +#if defined(_WIN32) || defined(_WIN64) + assert(size != map->current || limit != map->limit || size < map->filesize); + + NTSTATUS status; + LARGE_INTEGER SectionSize; + int err; + + if (limit == map->limit && size > map->current) { + if ((flags & MDBX_RDONLY) && map->filesize >= size) { + map->current = size; + return MDBX_SUCCESS; + } else if (!(flags & MDBX_RDONLY) && + /* workaround for Wine */ imports.NtExtendSection) { + /* growth rw-section */ + SectionSize.QuadPart = size; + status = imports.NtExtendSection(map->section, &SectionSize); + if (!NT_SUCCESS(status)) + return ntstatus2errcode(status); + map->current = size; + if (map->filesize < size) + map->filesize = size; + return MDBX_SUCCESS; + } + } + + if (limit > map->limit) { + err = check_mmap_limit(limit); + if (unlikely(err != MDBX_SUCCESS)) + return err; + + /* check ability of address space for growth before unmap */ + PVOID BaseAddress = (PBYTE)map->base + map->limit; + SIZE_T RegionSize = limit - map->limit; + status = 
NtAllocateVirtualMemory(GetCurrentProcess(), &BaseAddress, 0, + &RegionSize, MEM_RESERVE, PAGE_NOACCESS); + if (status == (NTSTATUS) /* STATUS_CONFLICTING_ADDRESSES */ 0xC0000018) + return MDBX_UNABLE_EXTEND_MAPSIZE; + if (!NT_SUCCESS(status)) + return ntstatus2errcode(status); + + status = NtFreeVirtualMemory(GetCurrentProcess(), &BaseAddress, &RegionSize, + MEM_RELEASE); + if (!NT_SUCCESS(status)) + return ntstatus2errcode(status); + } + + /* Windows unable: + * - shrink a mapped file; + * - change size of mapped view; + * - extend read-only mapping; + * Therefore we should unmap/map entire section. */ + if ((flags & MDBX_MRESIZE_MAY_UNMAP) == 0) { + if (size <= map->current && limit == map->limit) + return MDBX_SUCCESS; + return MDBX_EPERM; + } + + /* Unpoisoning is required for ASAN to avoid false-positive diagnostic + * when this memory will re-used by malloc or another mmapping. + * See https://libmdbx.dqdkfa.ru/dead-github/pull/93#issuecomment-613687203 */ + MDBX_ASAN_UNPOISON_MEMORY_REGION(map->base, map->limit); + status = NtUnmapViewOfSection(GetCurrentProcess(), map->base); + if (!NT_SUCCESS(status)) + return ntstatus2errcode(status); + status = NtClose(map->section); + map->section = nullptr; + PVOID ReservedAddress = nullptr; + SIZE_T ReservedSize = limit; + + if (!NT_SUCCESS(status)) { + bailout_ntstatus: + err = ntstatus2errcode(status); + map->base = nullptr; + map->current = map->limit = 0; + if (ReservedAddress) { + ReservedSize = 0; + status = NtFreeVirtualMemory(GetCurrentProcess(), &ReservedAddress, + &ReservedSize, MEM_RELEASE); + assert(NT_SUCCESS(status)); + (void)status; + } + return err; + } + +retry_file_and_section: + /* resizing of the file may take a while, + * therefore we reserve address space to avoid occupy it by other threads */ + ReservedAddress = map->base; + status = NtAllocateVirtualMemory(GetCurrentProcess(), &ReservedAddress, 0, + &ReservedSize, MEM_RESERVE, PAGE_NOACCESS); + if (!NT_SUCCESS(status)) { + ReservedAddress = 
nullptr; + if (status != (NTSTATUS) /* STATUS_CONFLICTING_ADDRESSES */ 0xC0000018) + goto bailout_ntstatus /* no way to recovery */; + + if (flags & MDBX_MRESIZE_MAY_MOVE) + /* the base address could be changed */ + map->base = nullptr; + } + + if ((flags & MDBX_RDONLY) == 0 && map->filesize != size) { + err = osal_ftruncate(map->fd, size); + if (err == MDBX_SUCCESS) + map->filesize = size; + /* ignore error, because Windows unable shrink file + * that already mapped (by another process) */ + } + + SectionSize.QuadPart = size; + status = NtCreateSection( + &map->section, + /* DesiredAccess */ + (flags & MDBX_WRITEMAP) + ? SECTION_QUERY | SECTION_MAP_READ | SECTION_EXTEND_SIZE | + SECTION_MAP_WRITE + : SECTION_QUERY | SECTION_MAP_READ | SECTION_EXTEND_SIZE, + /* ObjectAttributes */ nullptr, + /* MaximumSize (InitialSize) */ &SectionSize, + /* SectionPageProtection */ + (flags & MDBX_RDONLY) ? PAGE_READONLY : PAGE_READWRITE, + /* AllocationAttributes */ SEC_RESERVE, map->fd); + + if (!NT_SUCCESS(status)) + goto bailout_ntstatus; + + if (ReservedAddress) { + /* release reserved address space */ + ReservedSize = 0; + status = NtFreeVirtualMemory(GetCurrentProcess(), &ReservedAddress, + &ReservedSize, MEM_RELEASE); + ReservedAddress = nullptr; + if (!NT_SUCCESS(status)) + goto bailout_ntstatus; + } + +retry_mapview:; + SIZE_T ViewSize = (flags & MDBX_RDONLY) ? size : limit; + status = NtMapViewOfSection( + map->section, GetCurrentProcess(), &map->base, + /* ZeroBits */ 0, + /* CommitSize */ 0, + /* SectionOffset */ nullptr, &ViewSize, + /* InheritDisposition */ ViewUnmap, + /* AllocationType */ (flags & MDBX_RDONLY) ? 0 : MEM_RESERVE, + /* Win32Protect */ + (flags & MDBX_WRITEMAP) ? 
PAGE_READWRITE : PAGE_READONLY); + + if (!NT_SUCCESS(status)) { + if (status == (NTSTATUS) /* STATUS_CONFLICTING_ADDRESSES */ 0xC0000018 && + map->base && (flags & MDBX_MRESIZE_MAY_MOVE) != 0) { + /* try remap at another base address */ + map->base = nullptr; + goto retry_mapview; + } + NtClose(map->section); + map->section = nullptr; + + if (map->base && (size != map->current || limit != map->limit)) { + /* try remap with previously size and limit, + * but will return MDBX_UNABLE_EXTEND_MAPSIZE on success */ + rc = (limit > map->limit) ? MDBX_UNABLE_EXTEND_MAPSIZE : MDBX_EPERM; + size = map->current; + ReservedSize = limit = map->limit; + goto retry_file_and_section; + } + + /* no way to recovery */ + goto bailout_ntstatus; + } + assert(map->base != MAP_FAILED); + + map->current = (size_t)SectionSize.QuadPart; + map->limit = ViewSize; + +#else /* Windows */ + + if (flags & MDBX_RDONLY) { + if (size > map->filesize) + rc = MDBX_UNABLE_EXTEND_MAPSIZE; + else if (size < map->filesize && map->filesize > limit) + rc = MDBX_EPERM; + map->current = (map->filesize > limit) ? limit : (size_t)map->filesize; + } else { + if (size > map->filesize || + (size < map->filesize && (flags & txn_shrink_allowed))) { + rc = osal_ftruncate(map->fd, size); + VERBOSE("ftruncate %zu, err %d", size, rc); + if (rc != MDBX_SUCCESS) + return rc; + map->filesize = size; + } + + if (map->current > size) { + /* Clearing asan's bitmask for the region which released in shrinking, + * since: + * - after the shrinking we will get an exception when accessing + * this region and (therefore) do not need the help of ASAN. + * - this allows us to clear the mask only within the file size + * when closing the mapping. */ + MDBX_ASAN_UNPOISON_MEMORY_REGION( + ptr_disp(map->base, size), + ((map->current < map->limit) ? map->current : map->limit) - size); + } + map->current = (size < map->limit) ? 
size : map->limit; + } + + if (limit == map->limit) + return rc; + + if (limit < map->limit) { + /* unmap an excess at end of mapping. */ + // coverity[offset_free : FALSE] + if (unlikely(munmap(ptr_disp(map->base, limit), map->limit - limit))) { + assert(errno != 0); + return errno; + } + map->limit = limit; + return rc; + } + + int err = check_mmap_limit(limit); + if (unlikely(err != MDBX_SUCCESS)) + return err; + + assert(limit > map->limit); + void *ptr = MAP_FAILED; + +#if (defined(__linux__) || defined(__gnu_linux__)) && defined(_GNU_SOURCE) + ptr = mremap(map->base, map->limit, limit, +#if defined(MREMAP_MAYMOVE) + (flags & MDBX_MRESIZE_MAY_MOVE) ? MREMAP_MAYMOVE : +#endif /* MREMAP_MAYMOVE */ + 0); + if (ptr == MAP_FAILED) { + err = errno; + assert(err != 0); + switch (err) { + default: + return err; + case 0 /* paranoia */: + case EAGAIN: + case ENOMEM: + return MDBX_UNABLE_EXTEND_MAPSIZE; + case EFAULT /* MADV_DODUMP / MADV_DONTDUMP are mixed for mmap-range */: + break; + } + } +#endif /* Linux & _GNU_SOURCE */ + + const unsigned mmap_flags = + MAP_CONCEAL | MAP_SHARED | MAP_FILE | MAP_NORESERVE | + (F_ISSET(flags, MDBX_UTTERLY_NOSYNC) ? MAP_NOSYNC : 0); + const unsigned mmap_prot = + (flags & MDBX_WRITEMAP) ? PROT_READ | PROT_WRITE : PROT_READ; + + if (ptr == MAP_FAILED) { + /* Try to mmap additional space beyond the end of mapping. 
*/ + ptr = mmap(ptr_disp(map->base, map->limit), limit - map->limit, mmap_prot, + mmap_flags | MAP_FIXED_NOREPLACE, map->fd, map->limit); + if (ptr == ptr_disp(map->base, map->limit)) + /* успешно прилепили отображение в конец */ + ptr = map->base; + else if (ptr != MAP_FAILED) { + /* the desired address is busy, unmap unsuitable one */ + if (unlikely(munmap(ptr, limit - map->limit))) { + assert(errno != 0); + return errno; + } + ptr = MAP_FAILED; + } else { + err = errno; + assert(err != 0); + switch (err) { + default: + return err; + case 0 /* paranoia */: + case EAGAIN: + case ENOMEM: + return MDBX_UNABLE_EXTEND_MAPSIZE; + case EEXIST: /* address busy */ + case EINVAL: /* kernel don't support MAP_FIXED_NOREPLACE */ + break; + } + } + } + + if (ptr == MAP_FAILED) { + /* unmap and map again whole region */ + if ((flags & MDBX_MRESIZE_MAY_UNMAP) == 0) { + /* TODO: Perhaps here it is worth to implement suspend/resume threads + * and perform unmap/map as like for Windows. */ + return MDBX_UNABLE_EXTEND_MAPSIZE; + } + + if (unlikely(munmap(map->base, map->limit))) { + assert(errno != 0); + return errno; + } + + // coverity[pass_freed_arg : FALSE] + ptr = mmap(map->base, limit, mmap_prot, + (flags & MDBX_MRESIZE_MAY_MOVE) + ? mmap_flags + : mmap_flags | (MAP_FIXED_NOREPLACE ? MAP_FIXED_NOREPLACE + : MAP_FIXED), + map->fd, 0); + if (MAP_FIXED_NOREPLACE != 0 && MAP_FIXED_NOREPLACE != MAP_FIXED && + unlikely(ptr == MAP_FAILED) && !(flags & MDBX_MRESIZE_MAY_MOVE) && + errno == /* kernel don't support MAP_FIXED_NOREPLACE */ EINVAL) + // coverity[pass_freed_arg : FALSE] + ptr = + mmap(map->base, limit, mmap_prot, mmap_flags | MAP_FIXED, map->fd, 0); + + if (unlikely(ptr == MAP_FAILED)) { + /* try to restore prev mapping */ + // coverity[pass_freed_arg : FALSE] + ptr = mmap(map->base, map->limit, mmap_prot, + (flags & MDBX_MRESIZE_MAY_MOVE) + ? mmap_flags + : mmap_flags | (MAP_FIXED_NOREPLACE ? 
MAP_FIXED_NOREPLACE + : MAP_FIXED), + map->fd, 0); + if (MAP_FIXED_NOREPLACE != 0 && MAP_FIXED_NOREPLACE != MAP_FIXED && + unlikely(ptr == MAP_FAILED) && !(flags & MDBX_MRESIZE_MAY_MOVE) && + errno == /* kernel don't support MAP_FIXED_NOREPLACE */ EINVAL) + // coverity[pass_freed_arg : FALSE] + ptr = mmap(map->base, map->limit, mmap_prot, mmap_flags | MAP_FIXED, + map->fd, 0); + if (unlikely(ptr == MAP_FAILED)) { + VALGRIND_MAKE_MEM_NOACCESS(map->base, map->current); + /* Unpoisoning is required for ASAN to avoid false-positive diagnostic + * when this memory will re-used by malloc or another mmapping. + * See + * https://libmdbx.dqdkfa.ru/dead-github/pull/93#issuecomment-613687203 + */ + MDBX_ASAN_UNPOISON_MEMORY_REGION( + map->base, (map->current < map->limit) ? map->current : map->limit); + map->limit = 0; + map->current = 0; + map->base = nullptr; + assert(errno != 0); + return errno; + } + rc = MDBX_UNABLE_EXTEND_MAPSIZE; + limit = map->limit; + } + } + + assert(ptr && ptr != MAP_FAILED); + if (map->base != ptr) { + VALGRIND_MAKE_MEM_NOACCESS(map->base, map->current); + /* Unpoisoning is required for ASAN to avoid false-positive diagnostic + * when this memory will re-used by malloc or another mmapping. + * See + * https://libmdbx.dqdkfa.ru/dead-github/pull/93#issuecomment-613687203 + */ + MDBX_ASAN_UNPOISON_MEMORY_REGION( + map->base, (map->current < map->limit) ? 
map->current : map->limit); + + VALGRIND_MAKE_MEM_DEFINED(ptr, map->current); + MDBX_ASAN_UNPOISON_MEMORY_REGION(ptr, map->current); + map->base = ptr; + } + map->limit = limit; + map->current = size; + +#if MDBX_ENABLE_MADVISE +#ifdef MADV_DONTFORK + if (unlikely(madvise(map->base, map->limit, MADV_DONTFORK) != 0)) { + assert(errno != 0); + return errno; + } +#endif /* MADV_DONTFORK */ +#ifdef MADV_NOHUGEPAGE + (void)madvise(map->base, map->limit, MADV_NOHUGEPAGE); +#endif /* MADV_NOHUGEPAGE */ +#endif /* MDBX_ENABLE_MADVISE */ + +#endif /* POSIX / Windows */ + + /* Zap: Redundant code */ + MDBX_SUPPRESS_GOOFY_MSVC_ANALYZER(6287); + assert(rc != MDBX_SUCCESS || + (map->base != nullptr && map->base != MAP_FAILED && + map->current == size && map->limit == limit && + map->filesize >= size)); + return rc; +} + +/*----------------------------------------------------------------------------*/ + +__cold MDBX_INTERNAL void osal_jitter(bool tiny) { + for (;;) { +#if defined(_M_IX86) || defined(_M_X64) || defined(__i386__) || \ + defined(__x86_64__) + const unsigned salt = 277u * (unsigned)__rdtsc(); +#elif (defined(_WIN32) || defined(_WIN64)) && MDBX_WITHOUT_MSVC_CRT + static ULONG state; + const unsigned salt = (unsigned)RtlRandomEx(&state); +#else + const unsigned salt = rand(); +#endif + + const unsigned coin = salt % (tiny ? 
29u : 43u); + if (coin < 43 / 3) + break; +#if defined(_WIN32) || defined(_WIN64) + SwitchToThread(); + if (coin > 43 * 2 / 3) + Sleep(1); +#else + sched_yield(); + if (coin > 43 * 2 / 3) + usleep(coin); +#endif + } +} + +/*----------------------------------------------------------------------------*/ + +#if defined(_WIN32) || defined(_WIN64) +static LARGE_INTEGER performance_frequency; +#elif defined(__APPLE__) || defined(__MACH__) +#include +static uint64_t ratio_16dot16_to_monotine; +#elif defined(__linux__) || defined(__gnu_linux__) +static clockid_t posix_clockid; +__cold static clockid_t choice_monoclock(void) { + struct timespec probe; +#if defined(CLOCK_BOOTTIME) + if (clock_gettime(CLOCK_BOOTTIME, &probe) == 0) + return CLOCK_BOOTTIME; +#elif defined(CLOCK_MONOTONIC_RAW) + if (clock_gettime(CLOCK_MONOTONIC_RAW, &probe) == 0) + return CLOCK_MONOTONIC_RAW; +#elif defined(CLOCK_MONOTONIC_COARSE) + if (clock_gettime(CLOCK_MONOTONIC_COARSE, &probe) == 0) + return CLOCK_MONOTONIC_COARSE; +#endif + return CLOCK_MONOTONIC; +} +#elif defined(CLOCK_MONOTONIC) +#define posix_clockid CLOCK_MONOTONIC +#else +#define posix_clockid CLOCK_REALTIME +#endif + +MDBX_INTERNAL uint64_t osal_16dot16_to_monotime(uint32_t seconds_16dot16) { +#if defined(_WIN32) || defined(_WIN64) + const uint64_t ratio = performance_frequency.QuadPart; +#elif defined(__APPLE__) || defined(__MACH__) + const uint64_t ratio = ratio_16dot16_to_monotine; +#else + const uint64_t ratio = UINT64_C(1000000000); +#endif + const uint64_t ret = (ratio * seconds_16dot16 + 32768) >> 16; + return likely(ret || seconds_16dot16 == 0) ? 
ret : /* fix underflow */ 1; +} + +static uint64_t monotime_limit; +MDBX_INTERNAL uint32_t osal_monotime_to_16dot16(uint64_t monotime) { + if (unlikely(monotime > monotime_limit)) + return UINT32_MAX; + + const uint32_t ret = +#if defined(_WIN32) || defined(_WIN64) + (uint32_t)((monotime << 16) / performance_frequency.QuadPart); +#elif defined(__APPLE__) || defined(__MACH__) + (uint32_t)((monotime << 16) / ratio_16dot16_to_monotine); +#else + (uint32_t)((monotime << 7) / 1953125); +#endif + return ret; +} + +MDBX_INTERNAL uint64_t osal_monotime(void) { +#if defined(_WIN32) || defined(_WIN64) + LARGE_INTEGER counter; + if (QueryPerformanceCounter(&counter)) + return counter.QuadPart; +#elif defined(__APPLE__) || defined(__MACH__) + return mach_absolute_time(); +#else + struct timespec ts; + if (likely(clock_gettime(posix_clockid, &ts) == 0)) + return ts.tv_sec * UINT64_C(1000000000) + ts.tv_nsec; +#endif + return 0; +} + +MDBX_INTERNAL uint64_t osal_cputime(size_t *optional_page_faults) { +#if defined(_WIN32) || defined(_WIN64) + if (optional_page_faults) { + PROCESS_MEMORY_COUNTERS pmc; + *optional_page_faults = + GetProcessMemoryInfo(GetCurrentProcess(), &pmc, sizeof(pmc)) + ? 
pmc.PageFaultCount + : 0; + } + FILETIME unused, usermode; + if (GetThreadTimes(GetCurrentThread(), + /* CreationTime */ &unused, + /* ExitTime */ &unused, + /* KernelTime */ &unused, + /* UserTime */ &usermode)) { + /* one second = 10_000_000 * 100ns = 78125 * (1 << 7) * 100ns; + * result = (h * f / 10_000_000) << 32) + l * f / 10_000_000 = + * = ((h * f) >> 7) / 78125) << 32) + ((l * f) >> 7) / 78125; + * 1) {h, l} *= f; + * 2) {h, l} >>= 7; + * 3) result = ((h / 78125) << 32) + l / 78125; */ + uint64_t l = usermode.dwLowDateTime * performance_frequency.QuadPart; + uint64_t h = usermode.dwHighDateTime * performance_frequency.QuadPart; + l = h << (64 - 7) | l >> 7; + h = h >> 7; + return ((h / 78125) << 32) + l / 78125; + } +#elif defined(RUSAGE_THREAD) || defined(RUSAGE_LWP) +#ifndef RUSAGE_THREAD +#define RUSAGE_THREAD RUSAGE_LWP /* Solaris */ +#endif + struct rusage usage; + if (getrusage(RUSAGE_THREAD, &usage) == 0) { + if (optional_page_faults) + *optional_page_faults = usage.ru_majflt; + return usage.ru_utime.tv_sec * UINT64_C(1000000000) + + usage.ru_utime.tv_usec * 1000u; + } + if (optional_page_faults) + *optional_page_faults = 0; +#elif defined(CLOCK_THREAD_CPUTIME_ID) + if (optional_page_faults) + *optional_page_faults = 0; + struct timespec ts; + if (likely(clock_gettime(CLOCK_THREAD_CPUTIME_ID, &ts) == 0)) + return ts.tv_sec * UINT64_C(1000000000) + ts.tv_nsec; +#else + /* FIXME */ + if (optional_page_faults) + *optional_page_faults = 0; +#endif + return 0; +} + +/*----------------------------------------------------------------------------*/ + +static void bootid_shake(bin128_t *p) { + /* Bob Jenkins's PRNG: https://burtleburtle.net/bob/rand/smallprng.html */ + const uint32_t e = p->a - (p->b << 23 | p->b >> 9); + p->a = p->b ^ (p->c << 16 | p->c >> 16); + p->b = p->c + (p->d << 11 | p->d >> 21); + p->c = p->d + e; + p->d = e + p->a; +} + +__cold static void bootid_collect(bin128_t *p, const void *s, size_t n) { + p->y += UINT64_C(64526882297375213); 
+ bootid_shake(p); + for (size_t i = 0; i < n; ++i) { + bootid_shake(p); + p->y ^= UINT64_C(48797879452804441) * ((const uint8_t *)s)[i]; + bootid_shake(p); + p->y += 14621231; + } + bootid_shake(p); + + /* minor non-linear tomfoolery */ + const unsigned z = p->x % 61; + p->y = p->y << z | p->y >> (64 - z); + bootid_shake(p); + bootid_shake(p); + const unsigned q = p->x % 59; + p->y = p->y << q | p->y >> (64 - q); + bootid_shake(p); + bootid_shake(p); + bootid_shake(p); +} + +#if defined(_WIN32) || defined(_WIN64) + +__cold static uint64_t windows_systemtime_ms() { + FILETIME ft; + GetSystemTimeAsFileTime(&ft); + return ((uint64_t)ft.dwHighDateTime << 32 | ft.dwLowDateTime) / 10000ul; +} + +__cold static uint64_t windows_bootime(void) { + unsigned confirmed = 0; + uint64_t boottime = 0; + uint64_t up0 = imports.GetTickCount64(); + uint64_t st0 = windows_systemtime_ms(); + for (uint64_t fuse = st0; up0 && st0 < fuse + 1000 * 1000u / 42;) { + YieldProcessor(); + const uint64_t up1 = imports.GetTickCount64(); + const uint64_t st1 = windows_systemtime_ms(); + if (st1 > fuse && st1 == st0 && up1 == up0) { + uint64_t diff = st1 - up1; + if (boottime == diff) { + if (++confirmed > 4) + return boottime; + } else { + confirmed = 0; + boottime = diff; + } + fuse = st1; + Sleep(1); + } + st0 = st1; + up0 = up1; + } + return 0; +} + +__cold static LSTATUS mdbx_RegGetValue(HKEY hKey, LPCSTR lpSubKey, + LPCSTR lpValue, PVOID pvData, + LPDWORD pcbData) { + LSTATUS rc; + if (!imports.RegGetValueA) { + /* an old Windows 2000/XP */ + HKEY hSubKey; + rc = RegOpenKeyA(hKey, lpSubKey, &hSubKey); + if (rc == ERROR_SUCCESS) { + rc = + RegQueryValueExA(hSubKey, lpValue, nullptr, nullptr, pvData, pcbData); + RegCloseKey(hSubKey); + } + return rc; + } + + rc = imports.RegGetValueA(hKey, lpSubKey, lpValue, RRF_RT_ANY, nullptr, + pvData, pcbData); + if (rc != ERROR_FILE_NOT_FOUND) + return rc; + + rc = imports.RegGetValueA(hKey, lpSubKey, lpValue, + RRF_RT_ANY | 0x00010000 /* 
RRF_SUBKEY_WOW6464KEY */, + nullptr, pvData, pcbData); + if (rc != ERROR_FILE_NOT_FOUND) + return rc; + return imports.RegGetValueA(hKey, lpSubKey, lpValue, + RRF_RT_ANY | + 0x00020000 /* RRF_SUBKEY_WOW6432KEY */, + nullptr, pvData, pcbData); +} +#endif + +__cold MDBX_MAYBE_UNUSED static bool +bootid_parse_uuid(bin128_t *s, const void *p, const size_t n) { + if (n > 31) { + unsigned bits = 0; + for (unsigned i = 0; i < n; ++i) /* try parse an UUID in text form */ { + uint8_t c = ((const uint8_t *)p)[i]; + if (c >= '0' && c <= '9') + c -= '0'; + else if (c >= 'a' && c <= 'f') + c -= 'a' - 10; + else if (c >= 'A' && c <= 'F') + c -= 'A' - 10; + else + continue; + assert(c <= 15); + c ^= s->y >> 60; + s->y = s->y << 4 | s->x >> 60; + s->x = s->x << 4 | c; + bits += 4; + } + if (bits > 42 * 3) + /* UUID parsed successfully */ + return true; + } + + if (n > 15) /* is enough handle it as a binary? */ { + if (n == sizeof(bin128_t)) { + bin128_t aligned; + memcpy(&aligned, p, sizeof(bin128_t)); + s->x += aligned.x; + s->y += aligned.y; + } else + bootid_collect(s, p, n); + return true; + } + + if (n) + bootid_collect(s, p, n); + return false; +} + +__cold static bin128_t osal_bootid(void) { + bin128_t bin = {{0, 0}}; + bool got_machineid = false, got_boottime = false, got_bootseq = false; + +#if defined(__linux__) || defined(__gnu_linux__) + { + const int fd = + open("/proc/sys/kernel/random/boot_id", O_RDONLY | O_NOFOLLOW); + if (fd != -1) { + struct statfs fs; + char buf[42]; + const ssize_t len = + (fstatfs(fd, &fs) == 0 && fs.f_type == /* procfs */ 0x9FA0) + ? 
read(fd, buf, sizeof(buf)) + : -1; + const int err = close(fd); + assert(err == 0); + (void)err; + if (len > 0 && bootid_parse_uuid(&bin, buf, len)) + return bin; + } + } +#endif /* Linux */ + +#if defined(__APPLE__) || defined(__MACH__) + { + char buf[42]; + size_t len = sizeof(buf); + if (!sysctlbyname("kern.bootsessionuuid", buf, &len, nullptr, 0) && + bootid_parse_uuid(&bin, buf, len)) + return bin; + +#if defined(__MAC_OS_X_VERSION_MIN_REQUIRED) && \ + __MAC_OS_X_VERSION_MIN_REQUIRED > 1050 + uuid_t uuid; + struct timespec wait = {0, 1000000000u / 42}; + if (!gethostuuid(uuid, &wait) && + bootid_parse_uuid(&bin, uuid, sizeof(uuid))) + got_machineid = true; +#endif /* > 10.5 */ + + struct timeval boottime; + len = sizeof(boottime); + if (!sysctlbyname("kern.boottime", &boottime, &len, nullptr, 0) && + len == sizeof(boottime) && boottime.tv_sec) + got_boottime = true; + } +#endif /* Apple/Darwin */ + +#if defined(_WIN32) || defined(_WIN64) + { + union buf { + DWORD BootId; + DWORD BaseTime; + SYSTEM_TIMEOFDAY_INFORMATION SysTimeOfDayInfo; + struct { + LARGE_INTEGER BootTime; + LARGE_INTEGER CurrentTime; + LARGE_INTEGER TimeZoneBias; + ULONG TimeZoneId; + ULONG Reserved; + ULONGLONG BootTimeBias; + ULONGLONG SleepTimeBias; + } SysTimeOfDayInfoHacked; + wchar_t MachineGuid[42]; + char DigitalProductId[248]; + } buf; + + static const char HKLM_MicrosoftCryptography[] = + "SOFTWARE\\Microsoft\\Cryptography"; + DWORD len = sizeof(buf); + /* Windows is madness and must die */ + if (mdbx_RegGetValue(HKEY_LOCAL_MACHINE, HKLM_MicrosoftCryptography, + "MachineGuid", &buf.MachineGuid, + &len) == ERROR_SUCCESS && + len < sizeof(buf)) + got_machineid = bootid_parse_uuid(&bin, &buf.MachineGuid, len); + + if (!got_machineid) { + /* again, Windows is madness */ + static const char HKLM_WindowsNT[] = + "SOFTWARE\\Microsoft\\Windows NT\\CurrentVersion"; + static const char HKLM_WindowsNT_DPK[] = + "SOFTWARE\\Microsoft\\Windows " + "NT\\CurrentVersion\\DefaultProductKey"; + static 
const char HKLM_WindowsNT_DPK2[] = + "SOFTWARE\\Microsoft\\Windows " + "NT\\CurrentVersion\\DefaultProductKey2"; + + len = sizeof(buf); + if (mdbx_RegGetValue(HKEY_LOCAL_MACHINE, HKLM_WindowsNT, + "DigitalProductId", &buf.DigitalProductId, + &len) == ERROR_SUCCESS && + len > 42 && len < sizeof(buf)) { + bootid_collect(&bin, &buf.DigitalProductId, len); + got_machineid = true; + } + len = sizeof(buf); + if (mdbx_RegGetValue(HKEY_LOCAL_MACHINE, HKLM_WindowsNT_DPK, + "DigitalProductId", &buf.DigitalProductId, + &len) == ERROR_SUCCESS && + len > 42 && len < sizeof(buf)) { + bootid_collect(&bin, &buf.DigitalProductId, len); + got_machineid = true; + } + len = sizeof(buf); + if (mdbx_RegGetValue(HKEY_LOCAL_MACHINE, HKLM_WindowsNT_DPK2, + "DigitalProductId", &buf.DigitalProductId, + &len) == ERROR_SUCCESS && + len > 42 && len < sizeof(buf)) { + bootid_collect(&bin, &buf.DigitalProductId, len); + got_machineid = true; + } + } + + static const char HKLM_PrefetcherParams[] = + "SYSTEM\\CurrentControlSet\\Control\\Session Manager\\Memory " + "Management\\PrefetchParameters"; + len = sizeof(buf); + if (mdbx_RegGetValue(HKEY_LOCAL_MACHINE, HKLM_PrefetcherParams, "BootId", + &buf.BootId, &len) == ERROR_SUCCESS && + len > 1 && len < sizeof(buf)) { + bootid_collect(&bin, &buf.BootId, len); + got_bootseq = true; + } + + len = sizeof(buf); + if (mdbx_RegGetValue(HKEY_LOCAL_MACHINE, HKLM_PrefetcherParams, "BaseTime", + &buf.BaseTime, &len) == ERROR_SUCCESS && + len >= sizeof(buf.BaseTime) && buf.BaseTime) { + bootid_collect(&bin, &buf.BaseTime, len); + got_boottime = true; + } + + /* BootTime from SYSTEM_TIMEOFDAY_INFORMATION */ + NTSTATUS status = NtQuerySystemInformation( + 0x03 /* SystemTmeOfDayInformation */, &buf.SysTimeOfDayInfo, + sizeof(buf.SysTimeOfDayInfo), &len); + if (NT_SUCCESS(status) && + len >= offsetof(union buf, SysTimeOfDayInfoHacked.BootTimeBias) + + sizeof(buf.SysTimeOfDayInfoHacked.BootTimeBias) && + buf.SysTimeOfDayInfoHacked.BootTime.QuadPart) { + const 
uint64_t UnbiasedBootTime = + buf.SysTimeOfDayInfoHacked.BootTime.QuadPart - + buf.SysTimeOfDayInfoHacked.BootTimeBias; + if (UnbiasedBootTime) { + bootid_collect(&bin, &UnbiasedBootTime, sizeof(UnbiasedBootTime)); + got_boottime = true; + } + } + + if (!got_boottime) { + uint64_t boottime = windows_bootime(); + if (boottime) { + bootid_collect(&bin, &boottime, sizeof(boottime)); + got_boottime = true; + } + } + } +#endif /* Windows */ + +#if defined(CTL_HW) && defined(HW_UUID) + if (!got_machineid) { + static const int mib[] = {CTL_HW, HW_UUID}; + char buf[42]; + size_t len = sizeof(buf); + if (sysctl( +#ifdef SYSCTL_LEGACY_NONCONST_MIB + (int *) +#endif + mib, + ARRAY_LENGTH(mib), &buf, &len, nullptr, 0) == 0) + got_machineid = bootid_parse_uuid(&bin, buf, len); + } +#endif /* CTL_HW && HW_UUID */ + +#if defined(CTL_KERN) && defined(KERN_HOSTUUID) + if (!got_machineid) { + static const int mib[] = {CTL_KERN, KERN_HOSTUUID}; + char buf[42]; + size_t len = sizeof(buf); + if (sysctl( +#ifdef SYSCTL_LEGACY_NONCONST_MIB + (int *) +#endif + mib, + ARRAY_LENGTH(mib), &buf, &len, nullptr, 0) == 0) + got_machineid = bootid_parse_uuid(&bin, buf, len); + } +#endif /* CTL_KERN && KERN_HOSTUUID */ + +#if defined(__NetBSD__) + if (!got_machineid) { + char buf[42]; + size_t len = sizeof(buf); + if (sysctlbyname("machdep.dmi.system-uuid", buf, &len, nullptr, 0) == 0) + got_machineid = bootid_parse_uuid(&bin, buf, len); + } +#endif /* __NetBSD__ */ + +#if _XOPEN_SOURCE_EXTENDED + if (!got_machineid) { + const int hostid = gethostid(); + if (hostid > 0) { + bootid_collect(&bin, &hostid, sizeof(hostid)); + got_machineid = true; + } + } +#endif /* _XOPEN_SOURCE_EXTENDED */ + + if (!got_machineid) { + lack: + bin.x = bin.y = 0; + return bin; + } + + /*--------------------------------------------------------------------------*/ + +#if defined(CTL_KERN) && defined(KERN_BOOTTIME) + if (!got_boottime) { + static const int mib[] = {CTL_KERN, KERN_BOOTTIME}; + struct timeval boottime; + 
size_t len = sizeof(boottime); + if (sysctl( +#ifdef SYSCTL_LEGACY_NONCONST_MIB + (int *) +#endif + mib, + ARRAY_LENGTH(mib), &boottime, &len, nullptr, 0) == 0 && + len == sizeof(boottime) && boottime.tv_sec) { + bootid_collect(&bin, &boottime, len); + got_boottime = true; + } + } +#endif /* CTL_KERN && KERN_BOOTTIME */ + +#if defined(__sun) || defined(__SVR4) || defined(__svr4__) + if (!got_boottime) { + kstat_ctl_t *kc = kstat_open(); + if (kc) { + kstat_t *kp = kstat_lookup(kc, "unix", 0, "system_misc"); + if (kp && kstat_read(kc, kp, 0) != -1) { + kstat_named_t *kn = (kstat_named_t *)kstat_data_lookup(kp, "boot_time"); + if (kn) { + switch (kn->data_type) { + case KSTAT_DATA_INT32: + case KSTAT_DATA_UINT32: + bootid_collect(&bin, &kn->value, sizeof(int32_t)); + got_boottime = true; + case KSTAT_DATA_INT64: + case KSTAT_DATA_UINT64: + bootid_collect(&bin, &kn->value, sizeof(int64_t)); + got_boottime = true; + } + } + } + kstat_close(kc); + } + } +#endif /* SunOS / Solaris */ + +#if _XOPEN_SOURCE_EXTENDED && defined(BOOT_TIME) + if (!got_boottime) { + setutxent(); + const struct utmpx id = {.ut_type = BOOT_TIME}; + const struct utmpx *entry = getutxid(&id); + if (entry) { + bootid_collect(&bin, entry, sizeof(*entry)); + got_boottime = true; + while (unlikely((entry = getutxid(&id)) != nullptr)) { + /* have multiple reboot records, assuming we can distinguish next + * bootsession even if RTC is wrong or absent */ + bootid_collect(&bin, entry, sizeof(*entry)); + got_bootseq = true; + } + } + endutxent(); + } +#endif /* _XOPEN_SOURCE_EXTENDED && BOOT_TIME */ + + if (!got_bootseq) { + if (!got_boottime || !MDBX_TRUST_RTC) + goto lack; + +#if defined(_WIN32) || defined(_WIN64) + FILETIME now; + GetSystemTimeAsFileTime(&now); + if (0x1CCCCCC > now.dwHighDateTime) +#else + struct timespec mono, real; + if (clock_gettime(CLOCK_MONOTONIC, &mono) || + clock_gettime(CLOCK_REALTIME, &real) || + /* wrong time, RTC is mad or absent */ + 1555555555l > real.tv_sec || + /* seems 
no adjustment by RTC/NTP, i.e. a fake time */ + real.tv_sec < mono.tv_sec || 1234567890l > real.tv_sec - mono.tv_sec || + (real.tv_sec - mono.tv_sec) % 900u == 0) +#endif + goto lack; + } + + return bin; +} + +__cold int mdbx_get_sysraminfo(intptr_t *page_size, intptr_t *total_pages, + intptr_t *avail_pages) { + if (!page_size && !total_pages && !avail_pages) + return MDBX_EINVAL; + if (total_pages) + *total_pages = -1; + if (avail_pages) + *avail_pages = -1; + + const intptr_t pagesize = globals.sys_pagesize; + if (page_size) + *page_size = pagesize; + if (unlikely(pagesize < MDBX_MIN_PAGESIZE || !is_powerof2(pagesize))) + return MDBX_INCOMPATIBLE; + + MDBX_MAYBE_UNUSED const int log2page = log2n_powerof2(pagesize); + assert(pagesize == (INT64_C(1) << log2page)); + (void)log2page; + +#if defined(_WIN32) || defined(_WIN64) + MEMORYSTATUSEX info; + memset(&info, 0, sizeof(info)); + info.dwLength = sizeof(info); + if (!GlobalMemoryStatusEx(&info)) + return (int)GetLastError(); +#endif + + if (total_pages) { +#if defined(_WIN32) || defined(_WIN64) + const intptr_t total_ram_pages = (intptr_t)(info.ullTotalPhys >> log2page); +#elif defined(_SC_PHYS_PAGES) + const intptr_t total_ram_pages = sysconf(_SC_PHYS_PAGES); + if (total_ram_pages == -1) + return errno; +#elif defined(_SC_AIX_REALMEM) + const intptr_t total_ram_Kb = sysconf(_SC_AIX_REALMEM); + if (total_ram_Kb == -1) + return errno; + const intptr_t total_ram_pages = (total_ram_Kb << 10) >> log2page; +#elif defined(HW_USERMEM) || defined(HW_PHYSMEM64) || defined(HW_MEMSIZE) || \ + defined(HW_PHYSMEM) + size_t ram, len = sizeof(ram); + static const int mib[] = {CTL_HW, +#if defined(HW_USERMEM) + HW_USERMEM +#elif defined(HW_PHYSMEM64) + HW_PHYSMEM64 +#elif defined(HW_MEMSIZE) + HW_MEMSIZE +#else + HW_PHYSMEM +#endif + }; + if (sysctl( +#ifdef SYSCTL_LEGACY_NONCONST_MIB + (int *) +#endif + mib, + ARRAY_LENGTH(mib), &ram, &len, nullptr, 0) != 0) + return errno; + if (len != sizeof(ram)) + return MDBX_ENOSYS; + const 
intptr_t total_ram_pages = (intptr_t)(ram >> log2page); +#else +#error "FIXME: Get User-accessible or physical RAM" +#endif + *total_pages = total_ram_pages; + if (total_ram_pages < 1) + return MDBX_ENOSYS; + } + + if (avail_pages) { +#if defined(_WIN32) || defined(_WIN64) + const intptr_t avail_ram_pages = (intptr_t)(info.ullAvailPhys >> log2page); +#elif defined(_SC_AVPHYS_PAGES) + const intptr_t avail_ram_pages = sysconf(_SC_AVPHYS_PAGES); + if (avail_ram_pages == -1) + return errno; +#elif defined(__MACH__) + mach_msg_type_number_t count = HOST_VM_INFO_COUNT; + vm_statistics_data_t vmstat; + mach_port_t mport = mach_host_self(); + kern_return_t kerr = host_statistics(mach_host_self(), HOST_VM_INFO, + (host_info_t)&vmstat, &count); + mach_port_deallocate(mach_task_self(), mport); + if (unlikely(kerr != KERN_SUCCESS)) + return MDBX_ENOSYS; + const intptr_t avail_ram_pages = vmstat.free_count; +#elif defined(VM_TOTAL) || defined(VM_METER) + struct vmtotal info; + size_t len = sizeof(info); + static const int mib[] = {CTL_VM, +#if defined(VM_TOTAL) + VM_TOTAL +#elif defined(VM_METER) + VM_METER +#endif + }; + if (sysctl( +#ifdef SYSCTL_LEGACY_NONCONST_MIB + (int *) +#endif + mib, + ARRAY_LENGTH(mib), &info, &len, nullptr, 0) != 0) + return errno; + if (len != sizeof(info)) + return MDBX_ENOSYS; + const intptr_t avail_ram_pages = info.t_free; +#else +#error "FIXME: Get Available RAM" +#endif + *avail_pages = avail_ram_pages; + if (avail_ram_pages < 1) + return MDBX_ENOSYS; + } + + return MDBX_SUCCESS; +} + +void osal_ctor(void) { +#if MDBX_HAVE_PWRITEV && defined(_SC_IOV_MAX) + osal_iov_max = sysconf(_SC_IOV_MAX); + if (RUNNING_ON_VALGRIND && osal_iov_max > 64) + /* чтобы не описывать все 1024 исключения в valgrind_suppress.txt */ + osal_iov_max = 64; +#endif /* MDBX_HAVE_PWRITEV && _SC_IOV_MAX */ + +#if defined(_WIN32) || defined(_WIN64) + SYSTEM_INFO si; + GetSystemInfo(&si); + globals.sys_pagesize = si.dwPageSize; + globals.sys_allocation_granularity = 
si.dwAllocationGranularity; +#else + globals.sys_pagesize = sysconf(_SC_PAGE_SIZE); + globals.sys_allocation_granularity = (MDBX_WORDBITS > 32) ? 65536 : 4096; + globals.sys_allocation_granularity = + (globals.sys_allocation_granularity > globals.sys_pagesize) + ? globals.sys_allocation_granularity + : globals.sys_pagesize; +#endif + assert(globals.sys_pagesize > 0 && + (globals.sys_pagesize & (globals.sys_pagesize - 1)) == 0); + assert(globals.sys_allocation_granularity >= globals.sys_pagesize && + globals.sys_allocation_granularity % globals.sys_pagesize == 0); + globals.sys_pagesize_ln2 = log2n_powerof2(globals.sys_pagesize); + +#if defined(__linux__) || defined(__gnu_linux__) + posix_clockid = choice_monoclock(); +#endif + +#if defined(_WIN32) || defined(_WIN64) + QueryPerformanceFrequency(&performance_frequency); +#elif defined(__APPLE__) || defined(__MACH__) + mach_timebase_info_data_t ti; + mach_timebase_info(&ti); + ratio_16dot16_to_monotine = UINT64_C(1000000000) * ti.denom / ti.numer; +#endif + monotime_limit = osal_16dot16_to_monotime(UINT32_MAX - 1); + + uint32_t proba = UINT32_MAX; + while (true) { + unsigned time_conversion_checkup = + osal_monotime_to_16dot16(osal_16dot16_to_monotime(proba)); + unsigned one_more = (proba < UINT32_MAX) ? proba + 1 : proba; + unsigned one_less = (proba > 0) ? proba - 1 : proba; + ENSURE(nullptr, time_conversion_checkup >= one_less && + time_conversion_checkup <= one_more); + if (proba == 0) + break; + proba >>= 1; + } + + globals.bootid = osal_bootid(); +} + +void osal_dtor(void) {} +/// \copyright SPDX-License-Identifier: Apache-2.0 +/// \author Леонид Юрьев aka Leonid Yuriev \date 2015-2024 + + +__cold int MDBX_PRINTF_ARGS(2, 3) + bad_page(const page_t *mp, const char *fmt, ...) 
{ + if (LOG_ENABLED(MDBX_LOG_ERROR)) { + static const page_t *prev; + if (prev != mp) { + char buf4unknown[16]; + prev = mp; + debug_log(MDBX_LOG_ERROR, "badpage", 0, + "corrupted %s-page #%u, mod-txnid %" PRIaTXN "\n", + pagetype_caption(page_type(mp), buf4unknown), mp->pgno, + mp->txnid); + } + + va_list args; + va_start(args, fmt); + debug_log_va(MDBX_LOG_ERROR, "badpage", 0, fmt, args); + va_end(args); + } + return MDBX_CORRUPTED; +} + +__cold void MDBX_PRINTF_ARGS(2, 3) + poor_page(const page_t *mp, const char *fmt, ...) { + if (LOG_ENABLED(MDBX_LOG_NOTICE)) { + static const page_t *prev; + if (prev != mp) { + char buf4unknown[16]; + prev = mp; + debug_log(MDBX_LOG_NOTICE, "poorpage", 0, + "suboptimal %s-page #%u, mod-txnid %" PRIaTXN "\n", + pagetype_caption(page_type(mp), buf4unknown), mp->pgno, + mp->txnid); + } + + va_list args; + va_start(args, fmt); + debug_log_va(MDBX_LOG_NOTICE, "poorpage", 0, fmt, args); + va_end(args); + } +} + +MDBX_CONST_FUNCTION static clc_t value_clc(const MDBX_cursor *mc) { + if (likely((mc->flags & z_inner) == 0)) + return mc->clc->v; + else { + clc_t stub = {.cmp = cmp_equal_or_wrong, .lmin = 0, .lmax = 0}; + return stub; + } +} + +__cold int page_check(const MDBX_cursor *const mc, const page_t *const mp) { + DKBUF; + int rc = MDBX_SUCCESS; + if (unlikely(mp->pgno < MIN_PAGENO || mp->pgno > MAX_PAGENO)) + rc = bad_page(mp, "invalid pgno (%u)\n", mp->pgno); + + MDBX_env *const env = mc->txn->env; + const ptrdiff_t offset = ptr_dist(mp, env->dxb_mmap.base); + unsigned flags_mask = P_ILL_BITS; + unsigned flags_expected = 0; + if (offset < 0 || + offset > (ptrdiff_t)(pgno2bytes(env, mc->txn->geo.first_unallocated) - + ((mp->flags & P_SUBP) ? PAGEHDRSZ + 1 : env->ps))) { + /* should be dirty page without MDBX_WRITEMAP, or a subpage of. 
*/ + flags_mask -= P_SUBP; + if ((env->flags & MDBX_WRITEMAP) != 0 || + (!is_shadowed(mc->txn, mp) && !(mp->flags & P_SUBP))) + rc = bad_page(mp, "invalid page-address %p, offset %zi\n", + __Wpedantic_format_voidptr(mp), offset); + } else if (offset & (env->ps - 1)) + flags_expected = P_SUBP; + + if (unlikely((mp->flags & flags_mask) != flags_expected)) + rc = bad_page(mp, "unknown/extra page-flags (have 0x%x, expect 0x%x)\n", + mp->flags & flags_mask, flags_expected); + + cASSERT(mc, (mc->checking & z_dupfix) == 0 || (mc->flags & z_inner) != 0); + const uint8_t type = page_type(mp); + switch (type) { + default: + return bad_page(mp, "invalid type (%u)\n", type); + case P_LARGE: + if (unlikely(mc->flags & z_inner)) + rc = bad_page(mp, "unexpected %s-page for %s (db-flags 0x%x)\n", "large", + "nested dupsort tree", mc->tree->flags); + const pgno_t npages = mp->pages; + if (unlikely(npages < 1 || npages >= MAX_PAGENO / 2)) + rc = bad_page(mp, "invalid n-pages (%u) for large-page\n", npages); + if (unlikely(mp->pgno + npages > mc->txn->geo.first_unallocated)) + rc = bad_page( + mp, "end of large-page beyond (%u) allocated space (%u next-pgno)\n", + mp->pgno + npages, mc->txn->geo.first_unallocated); + return rc; //-------------------------- end of large/overflow page handling + case P_LEAF | P_SUBP: + if (unlikely(mc->tree->height != 1)) + rc = bad_page(mp, "unexpected %s-page for %s (db-flags 0x%x)\n", + "leaf-sub", "nested dupsort db", mc->tree->flags); + /* fall through */ + __fallthrough; + case P_LEAF: + if (unlikely((mc->checking & z_dupfix) != 0)) + rc = bad_page(mp, + "unexpected leaf-page for dupfix subtree (db-lags 0x%x)\n", + mc->tree->flags); + break; + case P_LEAF | P_DUPFIX | P_SUBP: + if (unlikely(mc->tree->height != 1)) + rc = bad_page(mp, "unexpected %s-page for %s (db-flags 0x%x)\n", + "leaf2-sub", "nested dupsort db", mc->tree->flags); + /* fall through */ + __fallthrough; + case P_LEAF | P_DUPFIX: + if (unlikely((mc->checking & z_dupfix) == 0)) + 
rc = bad_page( + mp, + "unexpected leaf2-page for non-dupfix (sub)tree (db-flags 0x%x)\n", + mc->tree->flags); + break; + case P_BRANCH: + break; + } + + if (unlikely(mp->upper < mp->lower || (mp->lower & 1) || + PAGEHDRSZ + mp->upper > env->ps)) + rc = bad_page(mp, "invalid page lower(%u)/upper(%u) with limit %zu\n", + mp->lower, mp->upper, page_space(env)); + + const char *const end_of_page = ptr_disp(mp, env->ps); + const size_t nkeys = page_numkeys(mp); + STATIC_ASSERT(P_BRANCH == 1); + if (unlikely(nkeys <= (uint8_t)(mp->flags & P_BRANCH))) { + if ((!(mc->flags & z_inner) || mc->tree->items) && + (!(mc->checking & z_updating) || + !(is_modifable(mc->txn, mp) || (mp->flags & P_SUBP)))) + rc = + bad_page(mp, "%s-page nkeys (%zu) < %u\n", + is_branch(mp) ? "branch" : "leaf", nkeys, 1 + is_branch(mp)); + } + + const size_t ksize_max = keysize_max(env->ps, 0); + const size_t leaf2_ksize = mp->dupfix_ksize; + if (is_dupfix_leaf(mp)) { + if (unlikely((mc->flags & z_inner) == 0 || + (mc->tree->flags & MDBX_DUPFIXED) == 0)) + rc = bad_page(mp, "unexpected leaf2-page (db-flags 0x%x)\n", + mc->tree->flags); + else if (unlikely(leaf2_ksize != mc->tree->dupfix_size)) + rc = bad_page(mp, "invalid leaf2_ksize %zu\n", leaf2_ksize); + else if (unlikely(((leaf2_ksize & nkeys) ^ mp->upper) & 1)) + rc = bad_page( + mp, "invalid page upper (%u) for nkeys %zu with leaf2-length %zu\n", + mp->upper, nkeys, leaf2_ksize); + } else { + if (unlikely((mp->upper & 1) || + PAGEHDRSZ + mp->upper + nkeys * sizeof(node_t) + nkeys - 1 > + env->ps)) + rc = + bad_page(mp, "invalid page upper (%u) for nkeys %zu with limit %zu\n", + mp->upper, nkeys, page_space(env)); + } + + MDBX_val here, prev = {0, 0}; + clc_t v_clc = value_clc(mc); + for (size_t i = 0; i < nkeys; ++i) { + if (is_dupfix_leaf(mp)) { + const char *const key = page_dupfix_ptr(mp, i, mc->tree->dupfix_size); + if (unlikely(end_of_page < key + leaf2_ksize)) { + rc = bad_page(mp, "leaf2-item beyond (%zu) page-end\n", + key + 
leaf2_ksize - end_of_page); + continue; + } + + if (unlikely(leaf2_ksize != mc->clc->k.lmin)) { + if (unlikely(leaf2_ksize < mc->clc->k.lmin || + leaf2_ksize > mc->clc->k.lmax)) + rc = bad_page(mp, + "leaf2-item size (%zu) <> min/max length (%zu/%zu)\n", + leaf2_ksize, mc->clc->k.lmin, mc->clc->k.lmax); + else + mc->clc->k.lmin = mc->clc->k.lmax = leaf2_ksize; + } + if ((mc->checking & z_ignord) == 0) { + here.iov_base = (void *)key; + here.iov_len = leaf2_ksize; + if (prev.iov_base && unlikely(mc->clc->k.cmp(&prev, &here) >= 0)) + rc = bad_page(mp, "leaf2-item #%zu wrong order (%s >= %s)\n", i, + DKEY(&prev), DVAL(&here)); + prev = here; + } + } else { + const node_t *const node = page_node(mp, i); + const char *const node_end = ptr_disp(node, NODESIZE); + if (unlikely(node_end > end_of_page)) { + rc = bad_page(mp, "node[%zu] (%zu) beyond page-end\n", i, + node_end - end_of_page); + continue; + } + const size_t ksize = node_ks(node); + if (unlikely(ksize > ksize_max)) + rc = bad_page(mp, "node[%zu] too long key (%zu)\n", i, ksize); + const char *const key = node_key(node); + if (unlikely(end_of_page < key + ksize)) { + rc = bad_page(mp, "node[%zu] key (%zu) beyond page-end\n", i, + key + ksize - end_of_page); + continue; + } + if ((is_leaf(mp) || i > 0)) { + if (unlikely(ksize < mc->clc->k.lmin || ksize > mc->clc->k.lmax)) + rc = bad_page( + mp, "node[%zu] key size (%zu) <> min/max key-length (%zu/%zu)\n", + i, ksize, mc->clc->k.lmin, mc->clc->k.lmax); + if ((mc->checking & z_ignord) == 0) { + here.iov_base = (void *)key; + here.iov_len = ksize; + if (prev.iov_base && unlikely(mc->clc->k.cmp(&prev, &here) >= 0)) + rc = bad_page(mp, "node[%zu] key wrong order (%s >= %s)\n", i, + DKEY(&prev), DVAL(&here)); + prev = here; + } + } + if (is_branch(mp)) { + if ((mc->checking & z_updating) == 0 && i == 0 && unlikely(ksize != 0)) + rc = bad_page(mp, "branch-node[%zu] wrong 0-node key-length (%zu)\n", + i, ksize); + const pgno_t ref = node_pgno(node); + if (unlikely(ref < 
MIN_PAGENO) || + (unlikely(ref >= mc->txn->geo.first_unallocated) && + (unlikely(ref >= mc->txn->geo.now) || + !(mc->checking & z_retiring)))) + rc = bad_page(mp, "branch-node[%zu] wrong pgno (%u)\n", i, ref); + if (unlikely(node_flags(node))) + rc = bad_page(mp, "branch-node[%zu] wrong flags (%u)\n", i, + node_flags(node)); + continue; + } -#ifdef __SANITIZE_ADDRESS__ -#if !defined(_MSC_VER) || __has_attribute(weak) -LIBMDBX_API __attribute__((__weak__)) -#endif -const char *__asan_default_options(void) { - return "symbolize=1:allow_addr2line=1:" -#if MDBX_DEBUG - "debug=1:" - "verbosity=2:" -#endif /* MDBX_DEBUG */ - "log_threads=1:" - "report_globals=1:" - "replace_str=1:replace_intrin=1:" - "malloc_context_size=9:" -#if !defined(__APPLE__) - "detect_leaks=1:" -#endif - "check_printf=1:" - "detect_deadlocks=1:" -#ifndef LTO_ENABLED - "check_initialization_order=1:" -#endif - "detect_stack_use_after_return=1:" - "intercept_tls_get_addr=1:" - "decorate_proc_maps=1:" - "abort_on_error=1"; -} -#endif /* __SANITIZE_ADDRESS__ */ + switch (node_flags(node)) { + default: + rc = + bad_page(mp, "invalid node[%zu] flags (%u)\n", i, node_flags(node)); + break; + case N_BIGDATA /* data on large-page */: + case 0 /* usual */: + case N_SUBDATA /* sub-db */: + case N_SUBDATA | N_DUPDATA /* dupsorted sub-tree */: + case N_DUPDATA /* short sub-page */: + break; + } -/* https://en.wikipedia.org/wiki/Operating_system_abstraction_layer */ + const size_t dsize = node_ds(node); + const char *const data = node_data(node); + if (node_flags(node) & N_BIGDATA) { + if (unlikely(end_of_page < data + sizeof(pgno_t))) { + rc = bad_page( + mp, "node-%s(%zu of %zu, %zu bytes) beyond (%zu) page-end\n", + "bigdata-pgno", i, nkeys, dsize, data + dsize - end_of_page); + continue; + } + if (unlikely(dsize <= v_clc.lmin || dsize > v_clc.lmax)) + rc = bad_page( + mp, + "big-node data size (%zu) <> min/max value-length (%zu/%zu)\n", + dsize, v_clc.lmin, v_clc.lmax); + if 
(unlikely(node_size_len(node_ks(node), dsize) <= + mc->txn->env->leaf_nodemax) && + mc->tree != &mc->txn->dbs[FREE_DBI]) + poor_page(mp, "too small data (%zu bytes) for bigdata-node", dsize); -/* - * Copyright 2015-2024 Leonid Yuriev - * and other libmdbx authors: please see AUTHORS file. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted only as authorized by the OpenLDAP - * Public License. - * - * A copy of this license is available in the file LICENSE in the - * top-level directory of the distribution or, alternatively, at - * . - */ + if ((mc->checking & z_retiring) == 0) { + const pgr_t lp = + page_get_large(mc, node_largedata_pgno(node), mp->txnid); + if (unlikely(lp.err != MDBX_SUCCESS)) + return lp.err; + cASSERT(mc, page_type(lp.page) == P_LARGE); + const unsigned npages = largechunk_npages(env, dsize); + if (unlikely(lp.page->pages != npages)) { + if (lp.page->pages < npages) + rc = bad_page(lp.page, + "too less n-pages %u for bigdata-node (%zu bytes)", + lp.page->pages, dsize); + else if (mc->tree != &mc->txn->dbs[FREE_DBI]) + poor_page(lp.page, + "extra n-pages %u for bigdata-node (%zu bytes)", + lp.page->pages, dsize); + } + } + continue; + } + if (unlikely(end_of_page < data + dsize)) { + rc = bad_page(mp, + "node-%s(%zu of %zu, %zu bytes) beyond (%zu) page-end\n", + "data", i, nkeys, dsize, data + dsize - end_of_page); + continue; + } -#if defined(_WIN32) || defined(_WIN64) + switch (node_flags(node)) { + default: + /* wrong, but already handled */ + continue; + case 0 /* usual */: + if (unlikely(dsize < v_clc.lmin || dsize > v_clc.lmax)) { + rc = bad_page( + mp, "node-data size (%zu) <> min/max value-length (%zu/%zu)\n", + dsize, v_clc.lmin, v_clc.lmax); + continue; + } + break; + case N_SUBDATA /* sub-db */: + if (unlikely(dsize != sizeof(tree_t))) { + rc = bad_page(mp, "invalid sub-db record size (%zu)\n", dsize); + continue; + } + break; + case N_SUBDATA | N_DUPDATA 
/* dupsorted sub-tree */: + if (unlikely(dsize != sizeof(tree_t))) { + rc = bad_page(mp, "invalid nested-db record size (%zu, expect %zu)\n", + dsize, sizeof(tree_t)); + continue; + } + break; + case N_DUPDATA /* short sub-page */: + if (unlikely(dsize <= PAGEHDRSZ)) { + rc = bad_page(mp, "invalid nested/sub-page record size (%zu)\n", + dsize); + continue; + } else { + const page_t *const sp = (page_t *)data; + switch (sp->flags & + /* ignore legacy P_DIRTY flag */ ~P_LEGACY_DIRTY) { + case P_LEAF | P_SUBP: + case P_LEAF | P_DUPFIX | P_SUBP: + break; + default: + rc = bad_page(mp, "invalid nested/sub-page flags (0x%02x)\n", + sp->flags); + continue; + } -#include -#include + const char *const end_of_subpage = data + dsize; + const intptr_t nsubkeys = page_numkeys(sp); + if (unlikely(nsubkeys == 0) && !(mc->checking & z_updating) && + mc->tree->items) + rc = bad_page(mp, "no keys on a %s-page\n", + is_dupfix_leaf(sp) ? "leaf2-sub" : "leaf-sub"); -#if !MDBX_WITHOUT_MSVC_CRT && defined(_DEBUG) -#include -#endif + MDBX_val sub_here, sub_prev = {0, 0}; + for (int ii = 0; ii < nsubkeys; ii++) { + if (is_dupfix_leaf(sp)) { + /* DUPFIX pages have no entries[] or node headers */ + const size_t sub_ksize = sp->dupfix_ksize; + const char *const sub_key = + page_dupfix_ptr(sp, ii, mc->tree->dupfix_size); + if (unlikely(end_of_subpage < sub_key + sub_ksize)) { + rc = bad_page(mp, "nested-leaf2-key beyond (%zu) nested-page\n", + sub_key + sub_ksize - end_of_subpage); + continue; + } -static int waitstatus2errcode(DWORD result) { - switch (result) { - case WAIT_OBJECT_0: - return MDBX_SUCCESS; - case WAIT_FAILED: - return (int)GetLastError(); - case WAIT_ABANDONED: - return ERROR_ABANDONED_WAIT_0; - case WAIT_IO_COMPLETION: - return ERROR_USER_APC; - case WAIT_TIMEOUT: - return ERROR_TIMEOUT; - default: - return ERROR_UNHANDLED_ERROR; + if (unlikely(sub_ksize != v_clc.lmin)) { + if (unlikely(sub_ksize < v_clc.lmin || sub_ksize > v_clc.lmax)) + rc = bad_page(mp, + 
"nested-leaf2-key size (%zu) <> min/max " + "value-length (%zu/%zu)\n", + sub_ksize, v_clc.lmin, v_clc.lmax); + else + v_clc.lmin = v_clc.lmax = sub_ksize; + } + if ((mc->checking & z_ignord) == 0) { + sub_here.iov_base = (void *)sub_key; + sub_here.iov_len = sub_ksize; + if (sub_prev.iov_base && + unlikely(v_clc.cmp(&sub_prev, &sub_here) >= 0)) + rc = bad_page(mp, + "nested-leaf2-key #%u wrong order (%s >= %s)\n", + ii, DKEY(&sub_prev), DVAL(&sub_here)); + sub_prev = sub_here; + } + } else { + const node_t *const sub_node = page_node(sp, ii); + const char *const sub_node_end = ptr_disp(sub_node, NODESIZE); + if (unlikely(sub_node_end > end_of_subpage)) { + rc = bad_page(mp, "nested-node beyond (%zu) nested-page\n", + end_of_subpage - sub_node_end); + continue; + } + if (unlikely(node_flags(sub_node) != 0)) + rc = bad_page(mp, "nested-node invalid flags (%u)\n", + node_flags(sub_node)); + + const size_t sub_ksize = node_ks(sub_node); + const char *const sub_key = node_key(sub_node); + const size_t sub_dsize = node_ds(sub_node); + /* char *sub_data = node_data(sub_node); */ + + if (unlikely(sub_ksize < v_clc.lmin || sub_ksize > v_clc.lmax)) + rc = bad_page(mp, + "nested-node-key size (%zu) <> min/max " + "value-length (%zu/%zu)\n", + sub_ksize, v_clc.lmin, v_clc.lmax); + if ((mc->checking & z_ignord) == 0) { + sub_here.iov_base = (void *)sub_key; + sub_here.iov_len = sub_ksize; + if (sub_prev.iov_base && + unlikely(v_clc.cmp(&sub_prev, &sub_here) >= 0)) + rc = bad_page(mp, + "nested-node-key #%u wrong order (%s >= %s)\n", + ii, DKEY(&sub_prev), DVAL(&sub_here)); + sub_prev = sub_here; + } + if (unlikely(sub_dsize != 0)) + rc = bad_page(mp, "nested-node non-empty data size (%zu)\n", + sub_dsize); + if (unlikely(end_of_subpage < sub_key + sub_ksize)) + rc = bad_page(mp, "nested-node-key beyond (%zu) nested-page\n", + sub_key + sub_ksize - end_of_subpage); + } + } + } + break; + } + } } + return rc; } -/* Map a result from an NTAPI call to WIN32 error code. 
*/ -static int ntstatus2errcode(NTSTATUS status) { - DWORD dummy; - OVERLAPPED ov; - memset(&ov, 0, sizeof(ov)); - ov.Internal = status; - /* Zap: '_Param_(1)' could be '0' */ - MDBX_SUPPRESS_GOOFY_MSVC_ANALYZER(6387); - return GetOverlappedResult(NULL, &ov, &dummy, FALSE) ? MDBX_SUCCESS - : (int)GetLastError(); +static __always_inline int check_page_header(const uint16_t ILL, + const page_t *page, + MDBX_txn *const txn, + const txnid_t front) { + if (unlikely(page->flags & ILL)) { + if (ILL == P_ILL_BITS || (page->flags & P_ILL_BITS)) + return bad_page(page, "invalid page's flags (%u)\n", page->flags); + else if (ILL & P_LARGE) { + assert((ILL & (P_BRANCH | P_LEAF | P_DUPFIX)) == 0); + assert(page->flags & (P_BRANCH | P_LEAF | P_DUPFIX)); + return bad_page(page, "unexpected %s instead of %s (%u)\n", + "large/overflow", "branch/leaf/leaf2", page->flags); + } else if (ILL & (P_BRANCH | P_LEAF | P_DUPFIX)) { + assert((ILL & P_BRANCH) && (ILL & P_LEAF) && (ILL & P_DUPFIX)); + assert(page->flags & (P_BRANCH | P_LEAF | P_DUPFIX)); + return bad_page(page, "unexpected %s instead of %s (%u)\n", + "branch/leaf/leaf2", "large/overflow", page->flags); + } else { + assert(false); + } + } + + if (unlikely(page->txnid > front) && + unlikely(page->txnid > txn->front_txnid || front < txn->txnid)) + return bad_page( + page, + "invalid page' txnid (%" PRIaTXN ") for %s' txnid (%" PRIaTXN ")\n", + page->txnid, + (front == txn->front_txnid && front != txn->txnid) ? "front-txn" + : "parent-page", + front); + + if (((ILL & P_LARGE) || !is_largepage(page)) && + (ILL & (P_BRANCH | P_LEAF | P_DUPFIX)) == 0) { + /* Контроль четности page->upper тут либо приводит к ложным ошибкам, + * либо слишком дорог по количеству операций. Заковырка в том, что upper + * может быть нечетным на DUPFIX-страницах, при нечетном количестве + * элементов нечетной длины. Поэтому четность page->upper здесь не + * проверяется, но соответствующие полные проверки есть в page_check(). 
*/ + if (unlikely(page->upper < page->lower || (page->lower & 1) || + PAGEHDRSZ + page->upper > txn->env->ps)) + return bad_page(page, + "invalid page' lower(%u)/upper(%u) with limit %zu\n", + page->lower, page->upper, page_space(txn->env)); + + } else if ((ILL & P_LARGE) == 0) { + const pgno_t npages = page->pages; + if (unlikely(npages < 1) || unlikely(npages >= MAX_PAGENO / 2)) + return bad_page(page, "invalid n-pages (%u) for large-page\n", npages); + if (unlikely(page->pgno + npages > txn->geo.first_unallocated)) + return bad_page( + page, + "end of large-page beyond (%u) allocated space (%u next-pgno)\n", + page->pgno + npages, txn->geo.first_unallocated); + } else { + assert(false); + } + return MDBX_SUCCESS; } -/* We use native NT APIs to setup the memory map, so that we can - * let the DB file grow incrementally instead of always preallocating - * the full size. These APIs are defined in and - * but those headers are meant for driver-level development and - * conflict with the regular user-level headers, so we explicitly - * declare them here. Using these APIs also means we must link to - * ntdll.dll, which is not linked by default in user code. 
*/ +__cold static __noinline pgr_t check_page_complete(const uint16_t ILL, + page_t *page, + const MDBX_cursor *const mc, + const txnid_t front) { + pgr_t r = {page, check_page_header(ILL, page, mc->txn, front)}; + if (likely(r.err == MDBX_SUCCESS)) + r.err = page_check(mc, page); + if (unlikely(r.err != MDBX_SUCCESS)) + mc->txn->flags |= MDBX_TXN_ERROR; + return r; +} + +static __always_inline pgr_t page_get_inline(const uint16_t ILL, + const MDBX_cursor *const mc, + const pgno_t pgno, + const txnid_t front) { + MDBX_txn *const txn = mc->txn; + tASSERT(txn, front <= txn->front_txnid); + + pgr_t r; + if (unlikely(pgno >= txn->geo.first_unallocated)) { + ERROR("page #%" PRIaPGNO " beyond next-pgno", pgno); + r.page = nullptr; + r.err = MDBX_PAGE_NOTFOUND; + bailout: + txn->flags |= MDBX_TXN_ERROR; + return r; + } -extern NTSTATUS NTAPI NtCreateSection( - OUT PHANDLE SectionHandle, IN ACCESS_MASK DesiredAccess, - IN OPTIONAL POBJECT_ATTRIBUTES ObjectAttributes, - IN OPTIONAL PLARGE_INTEGER MaximumSize, IN ULONG SectionPageProtection, - IN ULONG AllocationAttributes, IN OPTIONAL HANDLE FileHandle); + eASSERT(txn->env, ((txn->flags ^ txn->env->flags) & MDBX_WRITEMAP) == 0); + r.page = pgno2page(txn->env, pgno); + if ((txn->flags & (MDBX_TXN_RDONLY | MDBX_WRITEMAP)) == 0) { + const MDBX_txn *spiller = txn; + do { + /* Spilled pages were dirtied in this txn and flushed + * because the dirty list got full. Bring this page + * back in from the map (but don't unspill it here, + * leave that unless page_touch happens again). 
*/ + if (unlikely(spiller->flags & MDBX_TXN_SPILLS) && + spill_search(spiller, pgno)) + break; -typedef struct _SECTION_BASIC_INFORMATION { - ULONG Unknown; - ULONG SectionAttributes; - LARGE_INTEGER SectionSize; -} SECTION_BASIC_INFORMATION, *PSECTION_BASIC_INFORMATION; + const size_t i = dpl_search(spiller, pgno); + tASSERT(txn, (intptr_t)i > 0); + if (spiller->tw.dirtylist->items[i].pgno == pgno) { + r.page = spiller->tw.dirtylist->items[i].ptr; + break; + } -extern NTSTATUS NTAPI NtMapViewOfSection( - IN HANDLE SectionHandle, IN HANDLE ProcessHandle, IN OUT PVOID *BaseAddress, - IN ULONG_PTR ZeroBits, IN SIZE_T CommitSize, - IN OUT OPTIONAL PLARGE_INTEGER SectionOffset, IN OUT PSIZE_T ViewSize, - IN SECTION_INHERIT InheritDisposition, IN ULONG AllocationType, - IN ULONG Win32Protect); + spiller = spiller->parent; + } while (unlikely(spiller)); + } -extern NTSTATUS NTAPI NtUnmapViewOfSection(IN HANDLE ProcessHandle, - IN OPTIONAL PVOID BaseAddress); + if (unlikely(r.page->pgno != pgno)) { + r.err = bad_page( + r.page, "pgno mismatch (%" PRIaPGNO ") != expected (%" PRIaPGNO ")\n", + r.page->pgno, pgno); + goto bailout; + } -/* Zap: Inconsistent annotation for 'NtClose'... 
*/ -MDBX_SUPPRESS_GOOFY_MSVC_ANALYZER(28251) -extern NTSTATUS NTAPI NtClose(HANDLE Handle); + if (unlikely(mc->checking & z_pagecheck)) + return check_page_complete(ILL, r.page, mc, front); -extern NTSTATUS NTAPI NtAllocateVirtualMemory( - IN HANDLE ProcessHandle, IN OUT PVOID *BaseAddress, IN ULONG_PTR ZeroBits, - IN OUT PSIZE_T RegionSize, IN ULONG AllocationType, IN ULONG Protect); +#if MDBX_DISABLE_VALIDATION + r.err = MDBX_SUCCESS; +#else + r.err = check_page_header(ILL, r.page, txn, front); + if (unlikely(r.err != MDBX_SUCCESS)) + goto bailout; +#endif /* MDBX_DISABLE_VALIDATION */ + return r; +} -extern NTSTATUS NTAPI NtFreeVirtualMemory(IN HANDLE ProcessHandle, - IN PVOID *BaseAddress, - IN OUT PSIZE_T RegionSize, - IN ULONG FreeType); +pgr_t page_get_any(const MDBX_cursor *const mc, const pgno_t pgno, + const txnid_t front) { + return page_get_inline(P_ILL_BITS, mc, pgno, front); +} -#ifndef WOF_CURRENT_VERSION -typedef struct _WOF_EXTERNAL_INFO { - DWORD Version; - DWORD Provider; -} WOF_EXTERNAL_INFO, *PWOF_EXTERNAL_INFO; -#endif /* WOF_CURRENT_VERSION */ +__hot pgr_t page_get_three(const MDBX_cursor *const mc, const pgno_t pgno, + const txnid_t front) { + return page_get_inline(P_ILL_BITS | P_LARGE, mc, pgno, front); +} -#ifndef WIM_PROVIDER_CURRENT_VERSION -#define WIM_PROVIDER_HASH_SIZE 20 +pgr_t page_get_large(const MDBX_cursor *const mc, const pgno_t pgno, + const txnid_t front) { + return page_get_inline(P_ILL_BITS | P_BRANCH | P_LEAF | P_DUPFIX, mc, pgno, + front); +} +/// \copyright SPDX-License-Identifier: Apache-2.0 +/// \author Леонид Юрьев aka Leonid Yuriev \date 2015-2024 -typedef struct _WIM_PROVIDER_EXTERNAL_INFO { - DWORD Version; - DWORD Flags; - LARGE_INTEGER DataSourceId; - BYTE ResourceHash[WIM_PROVIDER_HASH_SIZE]; -} WIM_PROVIDER_EXTERNAL_INFO, *PWIM_PROVIDER_EXTERNAL_INFO; -#endif /* WIM_PROVIDER_CURRENT_VERSION */ -#ifndef FILE_PROVIDER_CURRENT_VERSION -typedef struct _FILE_PROVIDER_EXTERNAL_INFO_V1 { - ULONG Version; - ULONG 
Algorithm; - ULONG Flags; -} FILE_PROVIDER_EXTERNAL_INFO_V1, *PFILE_PROVIDER_EXTERNAL_INFO_V1; -#endif /* FILE_PROVIDER_CURRENT_VERSION */ +int iov_init(MDBX_txn *const txn, iov_ctx_t *ctx, size_t items, size_t npages, + mdbx_filehandle_t fd, bool check_coherence) { + ctx->env = txn->env; + ctx->ior = &txn->env->ioring; + ctx->fd = fd; + ctx->coherency_timestamp = + (check_coherence || txn->env->lck->pgops.incoherence.weak) + ? 0 + : UINT64_MAX /* не выполнять сверку */; + ctx->err = osal_ioring_prepare(ctx->ior, items, + pgno_align2os_bytes(txn->env, npages)); + if (likely(ctx->err == MDBX_SUCCESS)) { +#if MDBX_NEED_WRITTEN_RANGE + ctx->flush_begin = MAX_PAGENO; + ctx->flush_end = MIN_PAGENO; +#endif /* MDBX_NEED_WRITTEN_RANGE */ + osal_ioring_reset(ctx->ior); + } + return ctx->err; +} -#ifndef STATUS_OBJECT_NOT_EXTERNALLY_BACKED -#define STATUS_OBJECT_NOT_EXTERNALLY_BACKED ((NTSTATUS)0xC000046DL) -#endif -#ifndef STATUS_INVALID_DEVICE_REQUEST -#define STATUS_INVALID_DEVICE_REQUEST ((NTSTATUS)0xC0000010L) -#endif -#ifndef STATUS_NOT_SUPPORTED -#define STATUS_NOT_SUPPORTED ((NTSTATUS)0xC00000BBL) -#endif +static void iov_callback4dirtypages(iov_ctx_t *ctx, size_t offset, void *data, + size_t bytes) { + MDBX_env *const env = ctx->env; + eASSERT(env, (env->flags & MDBX_WRITEMAP) == 0); -#ifndef FILE_DEVICE_FILE_SYSTEM -#define FILE_DEVICE_FILE_SYSTEM 0x00000009 -#endif + page_t *wp = (page_t *)data; + eASSERT(env, wp->pgno == bytes2pgno(env, offset)); + eASSERT(env, bytes2pgno(env, bytes) >= (is_largepage(wp) ? 
wp->pages : 1u)); + eASSERT(env, (wp->flags & P_ILL_BITS) == 0); -#ifndef FSCTL_GET_EXTERNAL_BACKING -#define FSCTL_GET_EXTERNAL_BACKING \ - CTL_CODE(FILE_DEVICE_FILE_SYSTEM, 196, METHOD_BUFFERED, FILE_ANY_ACCESS) -#endif + if (likely(ctx->err == MDBX_SUCCESS)) { + const page_t *const rp = ptr_disp(env->dxb_mmap.base, offset); + VALGRIND_MAKE_MEM_DEFINED(rp, bytes); + MDBX_ASAN_UNPOISON_MEMORY_REGION(rp, bytes); + osal_flush_incoherent_mmap(rp, bytes, globals.sys_pagesize); + /* check with timeout as the workaround + * for https://libmdbx.dqdkfa.ru/dead-github/issues/269 + * + * Проблема проявляется только при неупорядоченности: если записанная + * последней мета-страница "обгоняет" ранее записанные, т.е. когда + * записанное в файл позже становится видимым в отображении раньше, + * чем записанное ранее. + * + * Исходно здесь всегда выполнялась полная сверка. Это давало полную + * гарантию защиты от проявления проблемы, но порождало накладные расходы. + * В некоторых сценариях наблюдалось снижение производительности до 10-15%, + * а в синтетических тестах до 30%. Конечно никто не вникал в причины, + * а просто останавливался на мнении "libmdbx не быстрее LMDB", + * например: https://clck.ru/3386er + * + * Поэтому после серии экспериментов и тестов реализовано следующее: + * 0. Посредством опции сборки MDBX_FORCE_CHECK_MMAP_COHERENCY=1 + * можно включить полную сверку после записи. + * Остальные пункты являются взвешенным компромиссом между полной + * гарантией обнаружения проблемы и бесполезными затратами на системах + * без этого недостатка. + * 1. При старте транзакций проверяется соответствие выбранной мета-страницы + * корневым страницам b-tree проверяется. Эта проверка показала себя + * достаточной без сверки после записи. При обнаружении "некогерентности" + * эти случаи подсчитываются, а при их ненулевом счетчике выполняется + * полная сверка. 
Таким образом, произойдет переключение в режим полной + * сверки, если показавшая себя достаточной проверка заметит проявление + * проблемы хоты-бы раз. + * 2. Сверка не выполняется при фиксации транзакции, так как: + * - при наличии проблемы "не-когерентности" (при отложенном копировании + * или обновлении PTE, после возврата из write-syscall), проверка + * в этом процессе не гарантирует актуальность данных в другом + * процессе, который может запустить транзакцию сразу после коммита; + * - сверка только последнего блока позволяет почти восстановить + * производительность в больших транзакциях, но одновременно размывает + * уверенность в отсутствии сбоев, чем обесценивает всю затею; + * - после записи данных будет записана мета-страница, соответствие + * которой корневым страницам b-tree проверяется при старте + * транзакций, и только эта проверка показала себя достаточной; + * 3. При спиллинге производится полная сверка записанных страниц. Тут был + * соблазн сверять не полностью, а например начало и конец каждого блока. + * Но при спиллинге возможна ситуация повторного вытеснения страниц, в + * том числе large/overflow. При этом возникает риск прочитать в текущей + * транзакции старую версию страницы, до повторной записи. В этом случае + * могут возникать крайне редкие невоспроизводимые ошибки. С учетом того + * что спиллинг выполняет крайне редко, решено отказаться от экономии + * в пользу надежности. */ +#ifndef MDBX_FORCE_CHECK_MMAP_COHERENCY +#define MDBX_FORCE_CHECK_MMAP_COHERENCY 0 +#endif /* MDBX_FORCE_CHECK_MMAP_COHERENCY */ + if ((MDBX_FORCE_CHECK_MMAP_COHERENCY || + ctx->coherency_timestamp != UINT64_MAX) && + unlikely(memcmp(wp, rp, bytes))) { + ctx->coherency_timestamp = 0; + env->lck->pgops.incoherence.weak = + (env->lck->pgops.incoherence.weak >= INT32_MAX) + ? 
INT32_MAX + : env->lck->pgops.incoherence.weak + 1; + WARNING("catch delayed/non-arrived page %" PRIaPGNO " %s", wp->pgno, + "(workaround for incoherent flaw of unified page/buffer cache)"); + do + if (coherency_timeout(&ctx->coherency_timestamp, wp->pgno, env) != + MDBX_RESULT_TRUE) { + ctx->err = MDBX_PROBLEM; + break; + } + while (unlikely(memcmp(wp, rp, bytes))); + } + } -#ifndef ERROR_NOT_CAPABLE -#define ERROR_NOT_CAPABLE 775L -#endif + if (likely(bytes == env->ps)) + page_shadow_release(env, wp, 1); + else { + do { + eASSERT(env, wp->pgno == bytes2pgno(env, offset)); + eASSERT(env, (wp->flags & P_ILL_BITS) == 0); + size_t npages = is_largepage(wp) ? wp->pages : 1u; + size_t chunk = pgno2bytes(env, npages); + eASSERT(env, bytes >= chunk); + page_t *next = ptr_disp(wp, chunk); + page_shadow_release(env, wp, npages); + wp = next; + offset += chunk; + bytes -= chunk; + } while (bytes); + } +} -#endif /* _WIN32 || _WIN64 */ +static void iov_complete(iov_ctx_t *ctx) { + if ((ctx->env->flags & MDBX_WRITEMAP) == 0) + osal_ioring_walk(ctx->ior, ctx, iov_callback4dirtypages); + osal_ioring_reset(ctx->ior); +} -/*----------------------------------------------------------------------------*/ +int iov_write(iov_ctx_t *ctx) { + eASSERT(ctx->env, !iov_empty(ctx)); + osal_ioring_write_result_t r = osal_ioring_write(ctx->ior, ctx->fd); +#if MDBX_ENABLE_PGOP_STAT + ctx->env->lck->pgops.wops.weak += r.wops; +#endif /* MDBX_ENABLE_PGOP_STAT */ + ctx->err = r.err; + if (unlikely(ctx->err != MDBX_SUCCESS)) + ERROR("Write error: %s", mdbx_strerror(ctx->err)); + iov_complete(ctx); + return ctx->err; +} -#if defined(__ANDROID_API__) -__extern_C void __assert2(const char *file, int line, const char *function, - const char *msg) __noreturn; -#define __assert_fail(assertion, file, line, function) \ - __assert2(file, line, function, assertion) +int iov_page(MDBX_txn *txn, iov_ctx_t *ctx, page_t *dp, size_t npages) { + MDBX_env *const env = txn->env; + tASSERT(txn, ctx->err == 
MDBX_SUCCESS); + tASSERT(txn, dp->pgno >= MIN_PAGENO && dp->pgno < txn->geo.first_unallocated); + tASSERT(txn, is_modifable(txn, dp)); + tASSERT(txn, !(dp->flags & ~(P_BRANCH | P_LEAF | P_DUPFIX | P_LARGE))); + + if (is_shadowed(txn, dp)) { + tASSERT(txn, !(txn->flags & MDBX_WRITEMAP)); + dp->txnid = txn->txnid; + tASSERT(txn, is_spilled(txn, dp)); +#if MDBX_AVOID_MSYNC + doit:; +#endif /* MDBX_AVOID_MSYNC */ + int err = osal_ioring_add(ctx->ior, pgno2bytes(env, dp->pgno), dp, + pgno2bytes(env, npages)); + if (unlikely(err != MDBX_SUCCESS)) { + ctx->err = err; + if (unlikely(err != MDBX_RESULT_TRUE)) { + iov_complete(ctx); + return err; + } + err = iov_write(ctx); + tASSERT(txn, iov_empty(ctx)); + if (likely(err == MDBX_SUCCESS)) { + err = osal_ioring_add(ctx->ior, pgno2bytes(env, dp->pgno), dp, + pgno2bytes(env, npages)); + if (unlikely(err != MDBX_SUCCESS)) { + iov_complete(ctx); + return ctx->err = err; + } + } + tASSERT(txn, ctx->err == MDBX_SUCCESS); + } + } else { + tASSERT(txn, txn->flags & MDBX_WRITEMAP); +#if MDBX_AVOID_MSYNC + goto doit; +#endif /* MDBX_AVOID_MSYNC */ + } -#elif defined(__UCLIBC__) -__extern_C void __assert(const char *, const char *, unsigned int, const char *) -#ifdef __THROW - __THROW -#else - __nothrow -#endif /* __THROW */ - MDBX_NORETURN; -#define __assert_fail(assertion, file, line, function) \ - __assert(assertion, file, line, function) +#if MDBX_NEED_WRITTEN_RANGE + ctx->flush_begin = + (ctx->flush_begin < dp->pgno) ? ctx->flush_begin : dp->pgno; + ctx->flush_end = (ctx->flush_end > dp->pgno + (pgno_t)npages) + ? ctx->flush_end + : dp->pgno + (pgno_t)npages; +#endif /* MDBX_NEED_WRITTEN_RANGE */ + return MDBX_SUCCESS; +} +/// \copyright SPDX-License-Identifier: Apache-2.0 +/// \author Леонид Юрьев aka Leonid Yuriev \date 2015-2024 -#elif _POSIX_C_SOURCE > 200212 && \ - /* workaround for avoid musl libc wrong prototype */ ( \ - defined(__GLIBC__) || defined(__GNU_LIBRARY__)) -/* Prototype should match libc runtime. 
ISO POSIX (2003) & LSB 1.x-3.x */ -__extern_C void __assert_fail(const char *assertion, const char *file, - unsigned line, const char *function) -#ifdef __THROW - __THROW -#else - __nothrow -#endif /* __THROW */ - MDBX_NORETURN; -#elif defined(__APPLE__) || defined(__MACH__) -__extern_C void __assert_rtn(const char *function, const char *file, int line, - const char *assertion) /* __nothrow */ -#ifdef __dead2 - __dead2 -#else - MDBX_NORETURN -#endif /* __dead2 */ -#ifdef __disable_tail_calls - __disable_tail_calls -#endif /* __disable_tail_calls */ - ; +static inline tree_t *outer_tree(MDBX_cursor *mc) { + cASSERT(mc, (mc->flags & z_inner) != 0); + subcur_t *mx = container_of(mc->tree, subcur_t, nested_tree); + cursor_couple_t *couple = container_of(mx, cursor_couple_t, inner); + cASSERT(mc, mc->tree == &couple->outer.subcur->nested_tree); + cASSERT(mc, &mc->clc->k == &couple->outer.clc->v); + return couple->outer.tree; +} -#define __assert_fail(assertion, file, line, function) \ - __assert_rtn(function, file, line, assertion) -#elif defined(__sun) || defined(__SVR4) || defined(__svr4__) -__extern_C void __assert_c99(const char *assection, const char *file, int line, - const char *function) MDBX_NORETURN; -#define __assert_fail(assertion, file, line, function) \ - __assert_c99(assertion, file, line, function) -#elif defined(__OpenBSD__) -__extern_C __dead void __assert2(const char *file, int line, - const char *function, - const char *assertion) /* __nothrow */; -#define __assert_fail(assertion, file, line, function) \ - __assert2(file, line, function, assertion) -#elif defined(__NetBSD__) -__extern_C __dead void __assert13(const char *file, int line, - const char *function, - const char *assertion) /* __nothrow */; -#define __assert_fail(assertion, file, line, function) \ - __assert13(file, line, function, assertion) -#elif defined(__FreeBSD__) || defined(__BSD__) || defined(__bsdi__) || \ - defined(__DragonFly__) -__extern_C void __assert(const char *function, 
const char *file, int line, - const char *assertion) /* __nothrow */ -#ifdef __dead2 - __dead2 -#else - MDBX_NORETURN -#endif /* __dead2 */ -#ifdef __disable_tail_calls - __disable_tail_calls -#endif /* __disable_tail_calls */ - ; -#define __assert_fail(assertion, file, line, function) \ - __assert(function, file, line, assertion) +pgr_t page_new(MDBX_cursor *mc, const unsigned flags) { + cASSERT(mc, (flags & P_LARGE) == 0); + pgr_t ret = gc_alloc_single(mc); + if (unlikely(ret.err != MDBX_SUCCESS)) + return ret; -#endif /* __assert_fail */ + DEBUG("db %zu allocated new page %" PRIaPGNO, cursor_dbi(mc), ret.page->pgno); + ret.page->flags = (uint16_t)flags; + cASSERT(mc, *cursor_dbi_state(mc) & DBI_DIRTY); + cASSERT(mc, mc->txn->flags & MDBX_TXN_DIRTY); +#if MDBX_ENABLE_PGOP_STAT + mc->txn->env->lck->pgops.newly.weak += 1; +#endif /* MDBX_ENABLE_PGOP_STAT */ -__cold void mdbx_assert_fail(const MDBX_env *env, const char *msg, - const char *func, unsigned line) { -#if MDBX_DEBUG - if (env && env->me_assert_func) - env->me_assert_func(env, msg, func, line); -#else - (void)env; - assert_fail(msg, func, line); + STATIC_ASSERT(P_BRANCH == 1); + const unsigned is_branch = flags & P_BRANCH; + + ret.page->lower = 0; + ret.page->upper = (indx_t)(mc->txn->env->ps - PAGEHDRSZ); + mc->tree->branch_pages += is_branch; + mc->tree->leaf_pages += 1 - is_branch; + if (unlikely(mc->flags & z_inner)) { + tree_t *outer = outer_tree(mc); + outer->branch_pages += is_branch; + outer->leaf_pages += 1 - is_branch; + } + return ret; } -MDBX_NORETURN __cold void assert_fail(const char *msg, const char *func, - unsigned line) { -#endif /* MDBX_DEBUG */ +pgr_t page_new_large(MDBX_cursor *mc, const size_t npages) { + pgr_t ret = likely(npages == 1) ? 
gc_alloc_single(mc) + : gc_alloc_ex(mc, npages, ALLOC_DEFAULT); + if (unlikely(ret.err != MDBX_SUCCESS)) + return ret; - if (mdbx_static.logger.ptr) - debug_log(MDBX_LOG_FATAL, func, line, "assert: %s\n", msg); - else { -#if defined(_WIN32) || defined(_WIN64) - char *message = nullptr; - const int num = osal_asprintf(&message, "\r\nMDBX-ASSERTION: %s, %s:%u", - msg, func ? func : "unknown", line); - if (num < 1 || !message) - message = ""; - OutputDebugStringA(message); -#else - __assert_fail(msg, "mdbx", line, func); -#endif - } + DEBUG("dbi %zu allocated new large-page %" PRIaPGNO ", num %zu", + cursor_dbi(mc), ret.page->pgno, npages); + ret.page->flags = P_LARGE; + cASSERT(mc, *cursor_dbi_state(mc) & DBI_DIRTY); + cASSERT(mc, mc->txn->flags & MDBX_TXN_DIRTY); +#if MDBX_ENABLE_PGOP_STAT + mc->txn->env->lck->pgops.newly.weak += npages; +#endif /* MDBX_ENABLE_PGOP_STAT */ - while (1) { -#if defined(_WIN32) || defined(_WIN64) -#if !MDBX_WITHOUT_MSVC_CRT && defined(_DEBUG) - _CrtDbgReport(_CRT_ASSERT, func ? func : "unknown", line, "libmdbx", - "assertion failed: %s", msg); -#else - if (IsDebuggerPresent()) - DebugBreak(); -#endif - FatalExit(STATUS_ASSERTION_FAILURE); -#else - abort(); -#endif + mc->tree->large_pages += (pgno_t)npages; + ret.page->pages = (pgno_t)npages; + cASSERT(mc, !(mc->flags & z_inner)); + return ret; +} + +__hot void page_copy(page_t *const dst, const page_t *const src, + const size_t size) { + STATIC_ASSERT(UINT16_MAX > MDBX_MAX_PAGESIZE - PAGEHDRSZ); + STATIC_ASSERT(MDBX_MIN_PAGESIZE > PAGEHDRSZ + NODESIZE * 4); + void *copy_dst = dst; + const void *copy_src = src; + size_t copy_len = size; + if (src->flags & P_DUPFIX) { + copy_len = PAGEHDRSZ + src->dupfix_ksize * page_numkeys(src); + if (unlikely(copy_len > size)) + goto bailout; + } else if ((src->flags & P_LARGE) == 0) { + size_t upper = src->upper, lower = src->lower; + intptr_t unused = upper - lower; + /* If page isn't full, just copy the used portion. 
Adjust + * alignment so memcpy may copy words instead of bytes. */ + if (unused > MDBX_CACHELINE_SIZE * 3) { + lower = ceil_powerof2(lower + PAGEHDRSZ, sizeof(void *)); + upper = floor_powerof2(upper + PAGEHDRSZ, sizeof(void *)); + if (unlikely(upper > copy_len)) + goto bailout; + memcpy(copy_dst, copy_src, lower); + copy_dst = ptr_disp(copy_dst, upper); + copy_src = ptr_disp(copy_src, upper); + copy_len -= upper; + } } + memcpy(copy_dst, copy_src, copy_len); + return; + +bailout: + if (src->flags & P_DUPFIX) + bad_page(src, "%s addr %p, n-keys %zu, ksize %u", + "invalid/corrupted source page", __Wpedantic_format_voidptr(src), + page_numkeys(src), src->dupfix_ksize); + else + bad_page(src, "%s addr %p, upper %u", "invalid/corrupted source page", + __Wpedantic_format_voidptr(src), src->upper); + memset(dst, -1, size); } -__cold void mdbx_panic(const char *fmt, ...) { - va_list ap; - va_start(ap, fmt); +__cold pgr_t __must_check_result page_unspill(MDBX_txn *const txn, + const page_t *const mp) { + VERBOSE("unspill page %" PRIaPGNO, mp->pgno); + tASSERT(txn, (txn->flags & MDBX_WRITEMAP) == 0); + tASSERT(txn, is_spilled(txn, mp)); + const MDBX_txn *scan = txn; + pgr_t ret; + do { + tASSERT(txn, (scan->flags & MDBX_TXN_SPILLS) != 0); + const size_t si = spill_search(scan, mp->pgno); + if (!si) + continue; + const unsigned npages = is_largepage(mp) ? mp->pages : 1; + ret.page = page_shadow_alloc(txn, npages); + if (unlikely(!ret.page)) { + ret.err = MDBX_ENOMEM; + return ret; + } + page_copy(ret.page, mp, pgno2bytes(txn->env, npages)); + if (scan == txn) { + /* If in current txn, this page is no longer spilled. + * If it happens to be the last page, truncate the spill list. + * Otherwise mark it as deleted by setting the LSB. 
*/ + spill_remove(txn, si, npages); + } /* otherwise, if belonging to a parent txn, the + * page remains spilled until child commits */ - char *message = nullptr; - const int num = osal_vasprintf(&message, fmt, ap); - va_end(ap); - const char *const const_message = - unlikely(num < 1 || !message) - ? "" - : message; + ret.err = page_dirty(txn, ret.page, npages); + if (unlikely(ret.err != MDBX_SUCCESS)) + return ret; +#if MDBX_ENABLE_PGOP_STAT + txn->env->lck->pgops.unspill.weak += npages; +#endif /* MDBX_ENABLE_PGOP_STAT */ + ret.page->flags |= (scan == txn) ? 0 : P_SPILLED; + ret.err = MDBX_SUCCESS; + return ret; + } while (likely((scan = scan->parent) != nullptr && + (scan->flags & MDBX_TXN_SPILLS) != 0)); + ERROR("Page %" PRIaPGNO " mod-txnid %" PRIaTXN + " not found in the spill-list(s), current txn %" PRIaTXN + " front %" PRIaTXN ", root txn %" PRIaTXN " front %" PRIaTXN, + mp->pgno, mp->txnid, txn->txnid, txn->front_txnid, + txn->env->basal_txn->txnid, txn->env->basal_txn->front_txnid); + ret.err = MDBX_PROBLEM; + ret.page = nullptr; + return ret; +} - if (mdbx_static.logger.ptr) - debug_log(MDBX_LOG_FATAL, "panic", 0, "%s", const_message); +__hot int page_touch_modifable(MDBX_txn *txn, const page_t *const mp) { + tASSERT(txn, is_modifable(txn, mp) && txn->tw.dirtylist); + tASSERT(txn, !is_largepage(mp) && !is_subpage(mp)); + tASSERT(txn, (txn->flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC); - while (1) { -#if defined(_WIN32) || defined(_WIN64) -#if !MDBX_WITHOUT_MSVC_CRT && defined(_DEBUG) - _CrtDbgReport(_CRT_ASSERT, "mdbx.c", 0, "libmdbx", "panic: %s", - const_message); -#else - OutputDebugStringA("\r\nMDBX-PANIC: "); - OutputDebugStringA(const_message); - if (IsDebuggerPresent()) - DebugBreak(); -#endif - FatalExit(ERROR_UNHANDLED_ERROR); -#else - __assert_fail(const_message, "mdbx", 0, "panic"); - abort(); -#endif + const size_t n = dpl_search(txn, mp->pgno); + if (MDBX_AVOID_MSYNC && + unlikely(txn->tw.dirtylist->items[n].pgno != mp->pgno)) { + 
tASSERT(txn, (txn->flags & MDBX_WRITEMAP)); + tASSERT(txn, n > 0 && n <= txn->tw.dirtylist->length + 1); + VERBOSE("unspill page %" PRIaPGNO, mp->pgno); +#if MDBX_ENABLE_PGOP_STAT + txn->env->lck->pgops.unspill.weak += 1; +#endif /* MDBX_ENABLE_PGOP_STAT */ + return page_dirty(txn, (page_t *)mp, 1); + } + + tASSERT(txn, n > 0 && n <= txn->tw.dirtylist->length); + tASSERT(txn, txn->tw.dirtylist->items[n].pgno == mp->pgno && + txn->tw.dirtylist->items[n].ptr == mp); + if (!MDBX_AVOID_MSYNC || (txn->flags & MDBX_WRITEMAP) == 0) { + size_t *const ptr = + ptr_disp(txn->tw.dirtylist->items[n].ptr, -(ptrdiff_t)sizeof(size_t)); + *ptr = txn->tw.dirtylru; } + return MDBX_SUCCESS; } -/*----------------------------------------------------------------------------*/ +__hot int page_touch_unmodifable(MDBX_txn *txn, MDBX_cursor *mc, + const page_t *const mp) { + tASSERT(txn, !is_modifable(txn, mp) && !is_largepage(mp)); + if (is_subpage(mp)) { + ((page_t *)mp)->txnid = txn->front_txnid; + return MDBX_SUCCESS; + } -#ifndef osal_vasprintf -MDBX_INTERNAL_FUNC int osal_vasprintf(char **strp, const char *fmt, - va_list ap) { - va_list ones; - va_copy(ones, ap); - const int needed = vsnprintf(nullptr, 0, fmt, ones); - va_end(ones); + int rc; + page_t *np; + if (is_frozen(txn, mp)) { + /* CoW the page */ + rc = pnl_need(&txn->tw.retired_pages, 1); + if (unlikely(rc != MDBX_SUCCESS)) + goto fail; + const pgr_t par = gc_alloc_single(mc); + rc = par.err; + np = par.page; + if (unlikely(rc != MDBX_SUCCESS)) + goto fail; - if (unlikely(needed < 0 || needed >= INT_MAX)) { - *strp = nullptr; - return needed; - } + const pgno_t pgno = np->pgno; + DEBUG("touched db %d page %" PRIaPGNO " -> %" PRIaPGNO, cursor_dbi_dbg(mc), + mp->pgno, pgno); + tASSERT(txn, mp->pgno != pgno); + pnl_append_prereserved(txn->tw.retired_pages, mp->pgno); + /* Update the parent page, if any, to point to the new page */ + if (likely(mc->top)) { + page_t *parent = mc->pg[mc->top - 1]; + node_t *node = page_node(parent, 
mc->ki[mc->top - 1]); + node_set_pgno(node, pgno); + } else { + mc->tree->root = pgno; + } + +#if MDBX_ENABLE_PGOP_STAT + txn->env->lck->pgops.cow.weak += 1; +#endif /* MDBX_ENABLE_PGOP_STAT */ + page_copy(np, mp, txn->env->ps); + np->pgno = pgno; + np->txnid = txn->front_txnid; + } else if (is_spilled(txn, mp)) { + pgr_t pur = page_unspill(txn, mp); + np = pur.page; + rc = pur.err; + if (likely(rc == MDBX_SUCCESS)) { + tASSERT(txn, np != nullptr); + goto done; + } + goto fail; + } else { + if (unlikely(!txn->parent)) { + ERROR("Unexpected not frozen/modifiable/spilled but shadowed %s " + "page %" PRIaPGNO " mod-txnid %" PRIaTXN "," + " without parent transaction, current txn %" PRIaTXN + " front %" PRIaTXN, + is_branch(mp) ? "branch" : "leaf", mp->pgno, mp->txnid, + mc->txn->txnid, mc->txn->front_txnid); + rc = MDBX_PROBLEM; + goto fail; + } + + DEBUG("clone db %d page %" PRIaPGNO, cursor_dbi_dbg(mc), mp->pgno); + tASSERT(txn, + txn->tw.dirtylist->length <= PAGELIST_LIMIT + MDBX_PNL_GRANULATE); + /* No - copy it */ + np = page_shadow_alloc(txn, 1); + if (unlikely(!np)) { + rc = MDBX_ENOMEM; + goto fail; + } + page_copy(np, mp, txn->env->ps); + + /* insert a clone of parent's dirty page, so don't touch dirtyroom */ + rc = page_dirty(txn, np, 1); + if (unlikely(rc != MDBX_SUCCESS)) + goto fail; - *strp = osal_malloc(needed + (size_t)1); - if (unlikely(*strp == nullptr)) { -#if defined(_WIN32) || defined(_WIN64) - SetLastError(MDBX_ENOMEM); -#else - errno = MDBX_ENOMEM; -#endif - return -1; +#if MDBX_ENABLE_PGOP_STAT + txn->env->lck->pgops.clone.weak += 1; +#endif /* MDBX_ENABLE_PGOP_STAT */ } - const int actual = vsnprintf(*strp, needed + (size_t)1, fmt, ap); - assert(actual == needed); - if (unlikely(actual < 0)) { - osal_free(*strp); - *strp = nullptr; +done: + /* Adjust cursors pointing to mp */ + mc->pg[mc->top] = np; + MDBX_cursor *m2 = txn->cursors[cursor_dbi(mc)]; + if (mc->flags & z_inner) { + for (; m2; m2 = m2->next) { + MDBX_cursor *m3 = 
&m2->subcur->cursor; + if (m3->top < mc->top) + continue; + if (m3->pg[mc->top] == mp) + m3->pg[mc->top] = np; + } + } else { + for (; m2; m2 = m2->next) { + if (m2->top < mc->top) + continue; + if (m2->pg[mc->top] == mp) { + m2->pg[mc->top] = np; + if (is_leaf(np) && inner_pointed(m2)) + cursor_inner_refresh(m2, np, m2->ki[mc->top]); + } + } } - return actual; -} -#endif /* osal_vasprintf */ + return MDBX_SUCCESS; -#ifndef osal_asprintf -MDBX_INTERNAL_FUNC int osal_asprintf(char **strp, const char *fmt, ...) { - va_list ap; - va_start(ap, fmt); - const int rc = osal_vasprintf(strp, fmt, ap); - va_end(ap); +fail: + txn->flags |= MDBX_TXN_ERROR; return rc; } -#endif /* osal_asprintf */ -#ifndef osal_memalign_alloc -MDBX_INTERNAL_FUNC int osal_memalign_alloc(size_t alignment, size_t bytes, - void **result) { - assert(is_powerof2(alignment) && alignment >= sizeof(void *)); -#if defined(_WIN32) || defined(_WIN64) - (void)alignment; - *result = VirtualAlloc(NULL, bytes, MEM_COMMIT | MEM_RESERVE, PAGE_READWRITE); - return *result ? MDBX_SUCCESS : MDBX_ENOMEM /* ERROR_OUTOFMEMORY */; -#elif defined(_ISOC11_SOURCE) - *result = aligned_alloc(alignment, ceil_powerof2(bytes, alignment)); - return *result ? MDBX_SUCCESS : errno; -#elif _POSIX_VERSION >= 200112L && \ - (!defined(__ANDROID_API__) || __ANDROID_API__ >= 17) - *result = nullptr; - return posix_memalign(result, alignment, bytes); -#elif __GLIBC_PREREQ(2, 16) || __STDC_VERSION__ >= 201112L - *result = memalign(alignment, bytes); - return *result ? 
MDBX_SUCCESS : errno; -#else -#error FIXME -#endif -} -#endif /* osal_memalign_alloc */ +page_t *page_shadow_alloc(MDBX_txn *txn, size_t num) { + MDBX_env *env = txn->env; + page_t *np = env->shadow_reserve; + size_t size = env->ps; + if (likely(num == 1 && np)) { + eASSERT(env, env->shadow_reserve_len > 0); + MDBX_ASAN_UNPOISON_MEMORY_REGION(np, size); + VALGRIND_MEMPOOL_ALLOC(env, ptr_disp(np, -(ptrdiff_t)sizeof(size_t)), + size + sizeof(size_t)); + VALGRIND_MAKE_MEM_DEFINED(&page_next(np), sizeof(page_t *)); + env->shadow_reserve = page_next(np); + env->shadow_reserve_len -= 1; + } else { + size = pgno2bytes(env, num); + void *const ptr = osal_malloc(size + sizeof(size_t)); + if (unlikely(!ptr)) { + txn->flags |= MDBX_TXN_ERROR; + return nullptr; + } + VALGRIND_MEMPOOL_ALLOC(env, ptr, size + sizeof(size_t)); + np = ptr_disp(ptr, sizeof(size_t)); + } -#ifndef osal_memalign_free -MDBX_INTERNAL_FUNC void osal_memalign_free(void *ptr) { -#if defined(_WIN32) || defined(_WIN64) - VirtualFree(ptr, 0, MEM_RELEASE); -#else - osal_free(ptr); + if ((env->flags & MDBX_NOMEMINIT) == 0) { + /* For a single page alloc, we init everything after the page header. + * For multi-page, we init the final page; if the caller needed that + * many pages they will be filling in at least up to the last page. 
*/ + size_t skip = PAGEHDRSZ; + if (num > 1) + skip += pgno2bytes(env, num - 1); + memset(ptr_disp(np, skip), 0, size - skip); + } +#if MDBX_DEBUG + np->pgno = 0; #endif + VALGRIND_MAKE_MEM_UNDEFINED(np, size); + np->flags = 0; + np->pages = (pgno_t)num; + return np; } -#endif /* osal_memalign_free */ -#ifndef osal_strdup -char *osal_strdup(const char *str) { - if (!str) - return NULL; - size_t bytes = strlen(str) + 1; - char *dup = osal_malloc(bytes); - if (dup) - memcpy(dup, str, bytes); - return dup; +void page_shadow_release(MDBX_env *env, page_t *dp, size_t npages) { + VALGRIND_MAKE_MEM_UNDEFINED(dp, pgno2bytes(env, npages)); + MDBX_ASAN_UNPOISON_MEMORY_REGION(dp, pgno2bytes(env, npages)); + if (unlikely(env->flags & MDBX_PAGEPERTURB)) + memset(dp, -1, pgno2bytes(env, npages)); + if (likely(npages == 1 && + env->shadow_reserve_len < env->options.dp_reserve_limit)) { + MDBX_ASAN_POISON_MEMORY_REGION(dp, env->ps); + MDBX_ASAN_UNPOISON_MEMORY_REGION(&page_next(dp), sizeof(page_t *)); + page_next(dp) = env->shadow_reserve; + VALGRIND_MEMPOOL_FREE(env, ptr_disp(dp, -(ptrdiff_t)sizeof(size_t))); + env->shadow_reserve = dp; + env->shadow_reserve_len += 1; + } else { + /* large pages just get freed directly */ + void *const ptr = ptr_disp(dp, -(ptrdiff_t)sizeof(size_t)); + VALGRIND_MEMPOOL_FREE(env, ptr); + osal_free(ptr); + } } -#endif /* osal_strdup */ - -/*----------------------------------------------------------------------------*/ -MDBX_INTERNAL_FUNC int osal_condpair_init(osal_condpair_t *condpair) { - int rc; - memset(condpair, 0, sizeof(osal_condpair_t)); -#if defined(_WIN32) || defined(_WIN64) - if ((condpair->mutex = CreateMutexW(NULL, FALSE, NULL)) == NULL) { - rc = (int)GetLastError(); - goto bailout_mutex; - } - if ((condpair->event[0] = CreateEventW(NULL, FALSE, FALSE, NULL)) == NULL) { - rc = (int)GetLastError(); - goto bailout_event; +__cold static void page_kill(MDBX_txn *txn, page_t *mp, pgno_t pgno, + size_t npages) { + MDBX_env *const env = 
txn->env; + DEBUG("kill %zu page(s) %" PRIaPGNO, npages, pgno); + eASSERT(env, pgno >= NUM_METAS && npages); + if (!is_frozen(txn, mp)) { + const size_t bytes = pgno2bytes(env, npages); + memset(mp, -1, bytes); + mp->pgno = pgno; + if ((txn->flags & MDBX_WRITEMAP) == 0) + osal_pwrite(env->lazy_fd, mp, bytes, pgno2bytes(env, pgno)); + } else { + struct iovec iov[MDBX_AUXILARY_IOV_MAX]; + iov[0].iov_len = env->ps; + iov[0].iov_base = ptr_disp(env->page_auxbuf, env->ps); + size_t iov_off = pgno2bytes(env, pgno), n = 1; + while (--npages) { + iov[n] = iov[0]; + if (++n == MDBX_AUXILARY_IOV_MAX) { + osal_pwritev(env->lazy_fd, iov, MDBX_AUXILARY_IOV_MAX, iov_off); + iov_off += pgno2bytes(env, MDBX_AUXILARY_IOV_MAX); + n = 0; + } + } + osal_pwritev(env->lazy_fd, iov, n, iov_off); } - if ((condpair->event[1] = CreateEventW(NULL, FALSE, FALSE, NULL)) != NULL) - return MDBX_SUCCESS; - - rc = (int)GetLastError(); - (void)CloseHandle(condpair->event[0]); -bailout_event: - (void)CloseHandle(condpair->mutex); -#else - rc = pthread_mutex_init(&condpair->mutex, NULL); - if (unlikely(rc != 0)) - goto bailout_mutex; - rc = pthread_cond_init(&condpair->cond[0], NULL); - if (unlikely(rc != 0)) - goto bailout_cond; - rc = pthread_cond_init(&condpair->cond[1], NULL); - if (likely(rc == 0)) - return MDBX_SUCCESS; - - (void)pthread_cond_destroy(&condpair->cond[0]); -bailout_cond: - (void)pthread_mutex_destroy(&condpair->mutex); -#endif -bailout_mutex: - memset(condpair, 0, sizeof(osal_condpair_t)); - return rc; } -MDBX_INTERNAL_FUNC int osal_condpair_destroy(osal_condpair_t *condpair) { -#if defined(_WIN32) || defined(_WIN64) - int rc = CloseHandle(condpair->mutex) ? MDBX_SUCCESS : (int)GetLastError(); - rc = CloseHandle(condpair->event[0]) ? rc : (int)GetLastError(); - rc = CloseHandle(condpair->event[1]) ? rc : (int)GetLastError(); -#else - int err, rc = pthread_mutex_destroy(&condpair->mutex); - rc = (err = pthread_cond_destroy(&condpair->cond[0])) ? 
err : rc; - rc = (err = pthread_cond_destroy(&condpair->cond[1])) ? err : rc; -#endif - memset(condpair, 0, sizeof(osal_condpair_t)); - return rc; +static inline bool suitable4loose(const MDBX_txn *txn, pgno_t pgno) { + /* TODO: + * 1) при включенной "экономии последовательностей" проверить, что + * страница не примыкает к какой-либо из уже находящийся в reclaimed. + * 2) стоит подумать над тем, чтобы при большом loose-списке отбрасывать + половину в reclaimed. */ + return txn->tw.loose_count < txn->env->options.dp_loose_limit && + (!MDBX_ENABLE_REFUND || + /* skip pages near to the end in favor of compactification */ + txn->geo.first_unallocated > + pgno + txn->env->options.dp_loose_limit || + txn->geo.first_unallocated <= txn->env->options.dp_loose_limit); } -MDBX_INTERNAL_FUNC int osal_condpair_lock(osal_condpair_t *condpair) { -#if defined(_WIN32) || defined(_WIN64) - DWORD code = WaitForSingleObject(condpair->mutex, INFINITE); - return waitstatus2errcode(code); -#else - return osal_pthread_mutex_lock(&condpair->mutex); -#endif -} +/* Retire, loosen or free a single page. + * + * For dirty pages, saves single pages to a list for future reuse in this same + * txn. It has been pulled from the GC and already resides on the dirty list, + * but has been deleted. Use these pages first before pulling again from the GC. + * + * If the page wasn't dirtied in this txn, just add it + * to this txn's free list. */ +int page_retire_ex(MDBX_cursor *mc, const pgno_t pgno, + page_t *mp /* maybe null */, + unsigned pageflags /* maybe unknown/zero */) { + int rc; + MDBX_txn *const txn = mc->txn; + tASSERT(txn, !mp || (mp->pgno == pgno && mp->flags == pageflags)); -MDBX_INTERNAL_FUNC int osal_condpair_unlock(osal_condpair_t *condpair) { -#if defined(_WIN32) || defined(_WIN64) - return ReleaseMutex(condpair->mutex) ? 
MDBX_SUCCESS : (int)GetLastError(); -#else - return pthread_mutex_unlock(&condpair->mutex); -#endif -} + /* During deleting entire subtrees, it is reasonable and possible to avoid + * reading leaf pages, i.e. significantly reduce hard page-faults & IOPs: + * - mp is null, i.e. the page has not yet been read; + * - pagetype is known and the P_LEAF bit is set; + * - we can determine the page status via scanning the lists + * of dirty and spilled pages. + * + * On the other hand, this could be suboptimal for WRITEMAP mode, since + * requires support the list of dirty pages and avoid explicit spilling. + * So for flexibility and avoid extra internal dependencies we just + * fallback to reading if dirty list was not allocated yet. */ + size_t di = 0, si = 0, npages = 1; + enum page_status { + unknown, + frozen, + spilled, + shadowed, + modifable + } status = unknown; -MDBX_INTERNAL_FUNC int osal_condpair_signal(osal_condpair_t *condpair, - bool part) { -#if defined(_WIN32) || defined(_WIN64) - return SetEvent(condpair->event[part]) ? 
MDBX_SUCCESS : (int)GetLastError(); -#else - return pthread_cond_signal(&condpair->cond[part]); -#endif -} + if (unlikely(!mp)) { + if (ASSERT_ENABLED() && pageflags) { + pgr_t check; + check = page_get_any(mc, pgno, txn->front_txnid); + if (unlikely(check.err != MDBX_SUCCESS)) + return check.err; + tASSERT(txn, ((unsigned)check.page->flags & ~P_SPILLED) == + (pageflags & ~P_FROZEN)); + tASSERT(txn, !(pageflags & P_FROZEN) || is_frozen(txn, check.page)); + } + if (pageflags & P_FROZEN) { + status = frozen; + if (ASSERT_ENABLED()) { + for (MDBX_txn *scan = txn; scan; scan = scan->parent) { + tASSERT(txn, !txn->tw.spilled.list || !spill_search(scan, pgno)); + tASSERT(txn, !scan->tw.dirtylist || !debug_dpl_find(scan, pgno)); + } + } + goto status_done; + } else if (pageflags && txn->tw.dirtylist) { + if ((di = dpl_exist(txn, pgno)) != 0) { + mp = txn->tw.dirtylist->items[di].ptr; + tASSERT(txn, is_modifable(txn, mp)); + status = modifable; + goto status_done; + } + if ((si = spill_search(txn, pgno)) != 0) { + status = spilled; + goto status_done; + } + for (MDBX_txn *parent = txn->parent; parent; parent = parent->parent) { + if (dpl_exist(parent, pgno)) { + status = shadowed; + goto status_done; + } + if (spill_search(parent, pgno)) { + status = spilled; + goto status_done; + } + } + status = frozen; + goto status_done; + } -MDBX_INTERNAL_FUNC int osal_condpair_wait(osal_condpair_t *condpair, - bool part) { -#if defined(_WIN32) || defined(_WIN64) - DWORD code = SignalObjectAndWait(condpair->mutex, condpair->event[part], - INFINITE, FALSE); - if (code == WAIT_OBJECT_0) { - code = WaitForSingleObject(condpair->mutex, INFINITE); - if (code == WAIT_OBJECT_0) - return MDBX_SUCCESS; + pgr_t pg = page_get_any(mc, pgno, txn->front_txnid); + if (unlikely(pg.err != MDBX_SUCCESS)) + return pg.err; + mp = pg.page; + tASSERT(txn, !pageflags || mp->flags == pageflags); + pageflags = mp->flags; } - return waitstatus2errcode(code); -#else - return 
pthread_cond_wait(&condpair->cond[part], &condpair->mutex); -#endif -} -/*----------------------------------------------------------------------------*/ + if (is_frozen(txn, mp)) { + status = frozen; + tASSERT(txn, !is_modifable(txn, mp)); + tASSERT(txn, !is_spilled(txn, mp)); + tASSERT(txn, !is_shadowed(txn, mp)); + tASSERT(txn, !debug_dpl_find(txn, pgno)); + tASSERT(txn, !txn->tw.spilled.list || !spill_search(txn, pgno)); + } else if (is_modifable(txn, mp)) { + status = modifable; + if (txn->tw.dirtylist) + di = dpl_exist(txn, pgno); + tASSERT(txn, (txn->flags & MDBX_WRITEMAP) || !is_spilled(txn, mp)); + tASSERT(txn, !txn->tw.spilled.list || !spill_search(txn, pgno)); + } else if (is_shadowed(txn, mp)) { + status = shadowed; + tASSERT(txn, !txn->tw.spilled.list || !spill_search(txn, pgno)); + tASSERT(txn, !debug_dpl_find(txn, pgno)); + } else { + tASSERT(txn, is_spilled(txn, mp)); + status = spilled; + si = spill_search(txn, pgno); + tASSERT(txn, !debug_dpl_find(txn, pgno)); + } -MDBX_INTERNAL_FUNC int osal_fastmutex_init(osal_fastmutex_t *fastmutex) { -#if defined(_WIN32) || defined(_WIN64) - InitializeCriticalSection(fastmutex); - return MDBX_SUCCESS; -#elif MDBX_DEBUG - pthread_mutexattr_t ma; - int rc = pthread_mutexattr_init(&ma); - if (likely(!rc)) { - rc = pthread_mutexattr_settype(&ma, PTHREAD_MUTEX_ERRORCHECK); - if (likely(!rc) || rc == ENOTSUP) - rc = pthread_mutex_init(fastmutex, &ma); - pthread_mutexattr_destroy(&ma); +status_done: + if (likely((pageflags & P_LARGE) == 0)) { + STATIC_ASSERT(P_BRANCH == 1); + const bool is_branch = pageflags & P_BRANCH; + cASSERT(mc, ((pageflags & P_LEAF) == 0) == is_branch); + if (unlikely(mc->flags & z_inner)) { + tree_t *outer = outer_tree(mc); + cASSERT(mc, !is_branch || outer->branch_pages > 0); + outer->branch_pages -= is_branch; + cASSERT(mc, is_branch || outer->leaf_pages > 0); + outer->leaf_pages -= 1 - is_branch; + } + cASSERT(mc, !is_branch || mc->tree->branch_pages > 0); + mc->tree->branch_pages -= 
is_branch; + cASSERT(mc, is_branch || mc->tree->leaf_pages > 0); + mc->tree->leaf_pages -= 1 - is_branch; + } else { + npages = mp->pages; + cASSERT(mc, mc->tree->large_pages >= npages); + mc->tree->large_pages -= (pgno_t)npages; } - return rc; -#else - return pthread_mutex_init(fastmutex, nullptr); -#endif -} -MDBX_INTERNAL_FUNC int osal_fastmutex_destroy(osal_fastmutex_t *fastmutex) { -#if defined(_WIN32) || defined(_WIN64) - DeleteCriticalSection(fastmutex); - return MDBX_SUCCESS; -#else - return pthread_mutex_destroy(fastmutex); -#endif -} + if (status == frozen) { + retire: + DEBUG("retire %zu page %" PRIaPGNO, npages, pgno); + rc = pnl_append_span(&txn->tw.retired_pages, pgno, npages); + tASSERT(txn, dpl_check(txn)); + return rc; + } -MDBX_INTERNAL_FUNC int osal_fastmutex_acquire(osal_fastmutex_t *fastmutex) { -#if defined(_WIN32) || defined(_WIN64) - __try { - EnterCriticalSection(fastmutex); - } __except ( - (GetExceptionCode() == - 0xC0000194 /* STATUS_POSSIBLE_DEADLOCK / EXCEPTION_POSSIBLE_DEADLOCK */) - ? EXCEPTION_EXECUTE_HANDLER - : EXCEPTION_CONTINUE_SEARCH) { - return MDBX_EDEADLK; + /* Возврат страниц в нераспределенный "хвост" БД. + * Содержимое страниц не уничтожается, а для вложенных транзакций граница + * нераспределенного "хвоста" БД сдвигается только при их коммите. */ + if (MDBX_ENABLE_REFUND && + unlikely(pgno + npages == txn->geo.first_unallocated)) { + const char *kind = nullptr; + if (status == modifable) { + /* Страница испачкана в этой транзакции, но до этого могла быть + * аллоцирована, испачкана и пролита в одной из родительских транзакций. + * Её МОЖНО вытолкнуть в нераспределенный хвост. */ + kind = "dirty"; + /* Remove from dirty list */ + page_wash(txn, di, mp, npages); + } else if (si) { + /* Страница пролита в этой транзакции, т.е. она аллоцирована + * и запачкана в этой или одной из родительских транзакций. + * Её МОЖНО вытолкнуть в нераспределенный хвост. 
*/ + kind = "spilled"; + tASSERT(txn, status == spilled); + spill_remove(txn, si, npages); + } else { + /* Страница аллоцирована, запачкана и возможно пролита в одной + * из родительских транзакций. + * Её МОЖНО вытолкнуть в нераспределенный хвост. */ + kind = "parent's"; + if (ASSERT_ENABLED() && mp) { + kind = nullptr; + for (MDBX_txn *parent = txn->parent; parent; parent = parent->parent) { + if (spill_search(parent, pgno)) { + kind = "parent-spilled"; + tASSERT(txn, status == spilled); + break; + } + if (mp == debug_dpl_find(parent, pgno)) { + kind = "parent-dirty"; + tASSERT(txn, status == shadowed); + break; + } + } + tASSERT(txn, kind != nullptr); + } + tASSERT(txn, status == spilled || status == shadowed); + } + DEBUG("refunded %zu %s page %" PRIaPGNO, npages, kind, pgno); + txn->geo.first_unallocated = pgno; + txn_refund(txn); + return MDBX_SUCCESS; } - return MDBX_SUCCESS; -#else - return osal_pthread_mutex_lock(fastmutex); -#endif -} -MDBX_INTERNAL_FUNC int osal_fastmutex_release(osal_fastmutex_t *fastmutex) { -#if defined(_WIN32) || defined(_WIN64) - LeaveCriticalSection(fastmutex); - return MDBX_SUCCESS; -#else - return pthread_mutex_unlock(fastmutex); -#endif -} + if (status == modifable) { + /* Dirty page from this transaction */ + /* If suitable we can reuse it through loose list */ + if (likely(npages == 1 && suitable4loose(txn, pgno)) && + (di || !txn->tw.dirtylist)) { + DEBUG("loosen dirty page %" PRIaPGNO, pgno); + if (MDBX_DEBUG != 0 || unlikely(txn->env->flags & MDBX_PAGEPERTURB)) + memset(page_data(mp), -1, txn->env->ps - PAGEHDRSZ); + mp->txnid = INVALID_TXNID; + mp->flags = P_LOOSE; + page_next(mp) = txn->tw.loose_pages; + txn->tw.loose_pages = mp; + txn->tw.loose_count++; +#if MDBX_ENABLE_REFUND + txn->tw.loose_refund_wl = (pgno + 2 > txn->tw.loose_refund_wl) + ? 
pgno + 2 + : txn->tw.loose_refund_wl; +#endif /* MDBX_ENABLE_REFUND */ + VALGRIND_MAKE_MEM_NOACCESS(page_data(mp), txn->env->ps - PAGEHDRSZ); + MDBX_ASAN_POISON_MEMORY_REGION(page_data(mp), txn->env->ps - PAGEHDRSZ); + return MDBX_SUCCESS; + } -/*----------------------------------------------------------------------------*/ +#if !MDBX_DEBUG && !defined(ENABLE_MEMCHECK) && !defined(__SANITIZE_ADDRESS__) + if (unlikely(txn->env->flags & MDBX_PAGEPERTURB)) +#endif + { + /* Страница могла быть изменена в одной из родительских транзакций, + * в том числе, позже выгружена и затем снова загружена и изменена. + * В обоих случаях её нельзя затирать на диске и помечать недоступной + * в asan и/или valgrind */ + for (MDBX_txn *parent = txn->parent; + parent && (parent->flags & MDBX_TXN_SPILLS); + parent = parent->parent) { + if (spill_intersect(parent, pgno, npages)) + goto skip_invalidate; + if (dpl_intersect(parent, pgno, npages)) + goto skip_invalidate; + } -#if defined(_WIN32) || defined(_WIN64) +#if defined(ENABLE_MEMCHECK) || defined(__SANITIZE_ADDRESS__) + if (MDBX_DEBUG != 0 || unlikely(txn->env->flags & MDBX_PAGEPERTURB)) +#endif + page_kill(txn, mp, pgno, npages); + if ((txn->flags & MDBX_WRITEMAP) == 0) { + VALGRIND_MAKE_MEM_NOACCESS(page_data(pgno2page(txn->env, pgno)), + pgno2bytes(txn->env, npages) - PAGEHDRSZ); + MDBX_ASAN_POISON_MEMORY_REGION(page_data(pgno2page(txn->env, pgno)), + pgno2bytes(txn->env, npages) - + PAGEHDRSZ); + } + } + skip_invalidate: -MDBX_INTERNAL_FUNC int osal_mb2w(const char *const src, wchar_t **const pdst) { - const size_t dst_wlen = MultiByteToWideChar( - CP_THREAD_ACP, MB_ERR_INVALID_CHARS, src, -1, nullptr, 0); - wchar_t *dst = *pdst; - int rc = ERROR_INVALID_NAME; - if (unlikely(dst_wlen < 2 || dst_wlen > /* MAX_PATH */ INT16_MAX)) - goto bailout; + /* wash dirty page */ + page_wash(txn, di, mp, npages); - dst = osal_realloc(dst, dst_wlen * sizeof(wchar_t)); - rc = MDBX_ENOMEM; - if (unlikely(!dst)) - goto bailout; + reclaim: + 
DEBUG("reclaim %zu %s page %" PRIaPGNO, npages, "dirty", pgno); + rc = pnl_insert_span(&txn->tw.relist, pgno, npages); + tASSERT(txn, + pnl_check_allocated(txn->tw.relist, txn->geo.first_unallocated - + MDBX_ENABLE_REFUND)); + tASSERT(txn, dpl_check(txn)); + return rc; + } - *pdst = dst; - if (likely(dst_wlen == (size_t)MultiByteToWideChar(CP_THREAD_ACP, - MB_ERR_INVALID_CHARS, src, - -1, dst, (int)dst_wlen))) - return MDBX_SUCCESS; + if (si) { + /* Page ws spilled in this txn */ + spill_remove(txn, si, npages); + /* Страница могла быть выделена и затем пролита в этой транзакции, + * тогда её необходимо поместить в reclaimed-список. + * Либо она могла быть выделена в одной из родительских транзакций и затем + * пролита в этой транзакции, тогда её необходимо поместить в + * retired-список для последующей фильтрации при коммите. */ + for (MDBX_txn *parent = txn->parent; parent; parent = parent->parent) { + if (dpl_exist(parent, pgno)) + goto retire; + } + /* Страница точно была выделена в этой транзакции + * и теперь может быть использована повторно. */ + goto reclaim; + } - rc = ERROR_INVALID_NAME; -bailout: - if (*pdst) { - osal_free(*pdst); - *pdst = nullptr; + if (status == shadowed) { + /* Dirty page MUST BE a clone from (one of) parent transaction(s). */ + if (ASSERT_ENABLED()) { + const page_t *parent_dp = nullptr; + /* Check parent(s)'s dirty lists. */ + for (MDBX_txn *parent = txn->parent; parent && !parent_dp; + parent = parent->parent) { + tASSERT(txn, !spill_search(parent, pgno)); + parent_dp = debug_dpl_find(parent, pgno); + } + tASSERT(txn, parent_dp && (!mp || parent_dp == mp)); + } + /* Страница была выделена в родительской транзакции и теперь может быть + * использована повторно, но только внутри этой транзакции, либо дочерних. + */ + goto reclaim; } - return rc; + + /* Страница может входить в доступный читателям MVCC-снимок, либо же она + * могла быть выделена, а затем пролита в одной из родительских + * транзакций. 
Поэтому пока помещаем её в retired-список, который будет + * фильтроваться относительно dirty- и spilled-списков родительских + * транзакций при коммите дочерних транзакций, либо же будет записан + * в GC в неизменном виде. */ + goto retire; } -#endif /* Windows */ +__hot int __must_check_result page_dirty(MDBX_txn *txn, page_t *mp, + size_t npages) { + tASSERT(txn, (txn->flags & MDBX_TXN_RDONLY) == 0); + mp->txnid = txn->front_txnid; + if (!txn->tw.dirtylist) { + tASSERT(txn, (txn->flags & MDBX_WRITEMAP) != 0 && !MDBX_AVOID_MSYNC); + txn->tw.writemap_dirty_npages += npages; + tASSERT(txn, txn->tw.spilled.list == nullptr); + return MDBX_SUCCESS; + } + tASSERT(txn, (txn->flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC); -/*----------------------------------------------------------------------------*/ +#if xMDBX_DEBUG_SPILLING == 2 + txn->env->debug_dirtied_act += 1; + ENSURE(txn->env, txn->env->debug_dirtied_act < txn->env->debug_dirtied_est); + ENSURE(txn->env, txn->tw.dirtyroom + txn->tw.loose_count > 0); +#endif /* xMDBX_DEBUG_SPILLING == 2 */ -#if defined(_WIN32) || defined(_WIN64) -#define ior_alignment_mask (ior->pagesize - 1) -#define ior_WriteFile_flag 1 -#define OSAL_IOV_MAX (4096 / sizeof(ior_sgv_element)) + int rc; + if (unlikely(txn->tw.dirtyroom == 0)) { + if (txn->tw.loose_count) { + page_t *lp = txn->tw.loose_pages; + DEBUG("purge-and-reclaim loose page %" PRIaPGNO, lp->pgno); + rc = pnl_insert_span(&txn->tw.relist, lp->pgno, 1); + if (unlikely(rc != MDBX_SUCCESS)) + goto bailout; + size_t di = dpl_search(txn, lp->pgno); + tASSERT(txn, txn->tw.dirtylist->items[di].ptr == lp); + dpl_remove(txn, di); + MDBX_ASAN_UNPOISON_MEMORY_REGION(&page_next(lp), sizeof(page_t *)); + VALGRIND_MAKE_MEM_DEFINED(&page_next(lp), sizeof(page_t *)); + txn->tw.loose_pages = page_next(lp); + txn->tw.loose_count--; + txn->tw.dirtyroom++; + if (!MDBX_AVOID_MSYNC || !(txn->flags & MDBX_WRITEMAP)) + page_shadow_release(txn->env, lp, 1); + } else { + ERROR("Dirtyroom is 
depleted, DPL length %zu", txn->tw.dirtylist->length); + if (!MDBX_AVOID_MSYNC || !(txn->flags & MDBX_WRITEMAP)) + page_shadow_release(txn->env, mp, npages); + return MDBX_TXN_FULL; + } + } -static void ior_put_event(osal_ioring_t *ior, HANDLE event) { - assert(event && event != INVALID_HANDLE_VALUE && event != ior); - assert(ior->event_stack < ior->allocated); - ior->event_pool[ior->event_stack] = event; - ior->event_stack += 1; + rc = dpl_append(txn, mp->pgno, mp, npages); + if (unlikely(rc != MDBX_SUCCESS)) { + bailout: + txn->flags |= MDBX_TXN_ERROR; + return rc; + } + txn->tw.dirtyroom--; + tASSERT(txn, dpl_check(txn)); + return MDBX_SUCCESS; } -static HANDLE ior_get_event(osal_ioring_t *ior) { - assert(ior->event_stack <= ior->allocated); - if (ior->event_stack > 0) { - ior->event_stack -= 1; - assert(ior->event_pool[ior->event_stack] != 0); - return ior->event_pool[ior->event_stack]; +void recalculate_subpage_thresholds(MDBX_env *env) { + size_t whole = env->leaf_nodemax - NODESIZE; + env->subpage_limit = (whole * env->options.subpage.limit + 32767) >> 16; + whole = env->subpage_limit; + env->subpage_reserve_limit = + (whole * env->options.subpage.reserve_limit + 32767) >> 16; + eASSERT(env, env->leaf_nodemax >= env->subpage_limit + NODESIZE); + eASSERT(env, env->subpage_limit >= env->subpage_reserve_limit); + + whole = env->leaf_nodemax; + env->subpage_room_threshold = + (whole * env->options.subpage.room_threshold + 32767) >> 16; + env->subpage_reserve_prereq = + (whole * env->options.subpage.reserve_prereq + 32767) >> 16; + if (env->subpage_room_threshold + env->subpage_reserve_limit > + (intptr_t)page_space(env)) + env->subpage_reserve_prereq = page_space(env); + else if (env->subpage_reserve_prereq < + env->subpage_room_threshold + env->subpage_reserve_limit) + env->subpage_reserve_prereq = + env->subpage_room_threshold + env->subpage_reserve_limit; + eASSERT(env, env->subpage_reserve_prereq > + env->subpage_room_threshold + env->subpage_reserve_limit); 
+} + +size_t page_subleaf2_reserve(const MDBX_env *env, size_t host_page_room, + size_t subpage_len, size_t item_len) { + eASSERT(env, (subpage_len & 1) == 0); + eASSERT(env, env->leaf_nodemax >= env->subpage_limit + NODESIZE); + size_t reserve = 0; + for (size_t n = 0; + n < 5 && reserve + item_len <= env->subpage_reserve_limit && + EVEN_CEIL(subpage_len + item_len) <= env->subpage_limit && + host_page_room >= + env->subpage_reserve_prereq + EVEN_CEIL(subpage_len + item_len); + ++n) { + subpage_len += item_len; + reserve += item_len; } - return CreateEventW(nullptr, true, false, nullptr); + return reserve + (subpage_len & 1); } +/// \copyright SPDX-License-Identifier: Apache-2.0 +/// \note Please refer to the COPYRIGHT file for explanations license change, +/// credits and acknowledgments. +/// \author Леонид Юрьев aka Leonid Yuriev \date 2015-2024 -static void WINAPI ior_wocr(DWORD err, DWORD bytes, OVERLAPPED *ov) { - osal_ioring_t *ior = ov->hEvent; - ov->Internal = err; - ov->InternalHigh = bytes; - if (++ior->async_completed >= ior->async_waiting) - SetEvent(ior->async_done); + +/* Search for the lowest key under the current branch page. + * This just bypasses a numkeys check in the current page + * before calling tree_search_finalize(), because the callers + * are all in situations where the current page is known to + * be underfilled. 
*/ +__hot int tree_search_lowest(MDBX_cursor *mc) { + cASSERT(mc, mc->top >= 0); + page_t *mp = mc->pg[mc->top]; + cASSERT(mc, is_branch(mp)); + + node_t *node = page_node(mp, 0); + int err = page_get(mc, node_pgno(node), &mp, mp->txnid); + if (unlikely(err != MDBX_SUCCESS)) + return err; + + mc->ki[mc->top] = 0; + err = cursor_push(mc, mp, 0); + if (unlikely(err != MDBX_SUCCESS)) + return err; + return tree_search_finalize(mc, nullptr, Z_FIRST); } -#elif MDBX_HAVE_PWRITEV -#if defined(_SC_IOV_MAX) -static size_t osal_iov_max; -#define OSAL_IOV_MAX osal_iov_max -#else -#define OSAL_IOV_MAX IOV_MAX -#endif -#else -#undef OSAL_IOV_MAX -#endif /* OSAL_IOV_MAX */ +__hot int tree_search(MDBX_cursor *mc, const MDBX_val *key, int flags) { + int err; + if (unlikely(mc->txn->flags & MDBX_TXN_BLOCKED)) { + DEBUG("%s", "transaction has failed, must abort"); + err = MDBX_BAD_TXN; + bailout: + be_poor(mc); + return err; + } -MDBX_INTERNAL_FUNC int osal_ioring_create(osal_ioring_t *ior -#if defined(_WIN32) || defined(_WIN64) - , - bool enable_direct, - mdbx_filehandle_t overlapped_fd -#endif /* Windows */ -) { - memset(ior, 0, sizeof(osal_ioring_t)); + const size_t dbi = cursor_dbi(mc); + if (unlikely(*cursor_dbi_state(mc) & DBI_STALE)) { + err = sdb_fetch(mc->txn, dbi); + if (unlikely(err != MDBX_SUCCESS)) + goto bailout; + } -#if defined(_WIN32) || defined(_WIN64) - ior->overlapped_fd = overlapped_fd; - ior->direct = enable_direct && overlapped_fd; - const unsigned pagesize = (unsigned)osal_syspagesize(); - ior->pagesize = pagesize; - ior->pagesize_ln2 = (uint8_t)log2n_powerof2(pagesize); - ior->async_done = ior_get_event(ior); - if (!ior->async_done) - return GetLastError(); -#endif /* !Windows */ + const pgno_t root = mc->tree->root; + if (unlikely(root == P_INVALID)) { + DEBUG("%s", "tree is empty"); + cASSERT(mc, is_poor(mc)); + return MDBX_NOTFOUND; + } -#if MDBX_HAVE_PWRITEV && defined(_SC_IOV_MAX) - assert(osal_iov_max > 0); -#endif /* MDBX_HAVE_PWRITEV && _SC_IOV_MAX 
*/ + cASSERT(mc, root >= NUM_METAS && root < mc->txn->geo.first_unallocated); + if (mc->top < 0 || mc->pg[0]->pgno != root) { + txnid_t pp_txnid = mc->tree->mod_txnid; + pp_txnid = /* tree->mod_txnid maybe zero in a legacy DB */ pp_txnid + ? pp_txnid + : mc->txn->txnid; + if ((mc->txn->flags & MDBX_TXN_RDONLY) == 0) { + MDBX_txn *scan = mc->txn; + do + if ((scan->flags & MDBX_TXN_DIRTY) && + (dbi == MAIN_DBI || (scan->dbi_state[dbi] & DBI_DIRTY))) { + /* После коммита вложенных тразакций может быть mod_txnid > front */ + pp_txnid = scan->front_txnid; + break; + } + while (unlikely((scan = scan->parent) != nullptr)); + } + err = page_get(mc, root, &mc->pg[0], pp_txnid); + if (unlikely(err != MDBX_SUCCESS)) + goto bailout; + } - ior->boundary = ptr_disp(ior->pool, ior->allocated); - return MDBX_SUCCESS; -} + mc->top = 0; + mc->ki[0] = (flags & Z_LAST) ? page_numkeys(mc->pg[0]) - 1 : 0; + DEBUG("db %d root page %" PRIaPGNO " has flags 0x%X", cursor_dbi_dbg(mc), + root, mc->pg[0]->flags); -static __inline size_t ior_offset(const ior_item_t *item) { -#if defined(_WIN32) || defined(_WIN64) - return item->ov.Offset | (size_t)((sizeof(size_t) > sizeof(item->ov.Offset)) - ? 
(uint64_t)item->ov.OffsetHigh << 32 - : 0); -#else - return item->offset; -#endif /* !Windows */ -} + if (flags & Z_MODIFY) { + err = page_touch(mc); + if (unlikely(err != MDBX_SUCCESS)) + goto bailout; + } -static __inline ior_item_t *ior_next(ior_item_t *item, size_t sgvcnt) { -#if defined(ior_sgv_element) - assert(sgvcnt > 0); - return (ior_item_t *)ptr_disp(item, sizeof(ior_item_t) - - sizeof(ior_sgv_element) + - sizeof(ior_sgv_element) * sgvcnt); -#else - assert(sgvcnt == 1); - (void)sgvcnt; - return item + 1; -#endif -} + if (flags & Z_ROOTONLY) + return MDBX_SUCCESS; -MDBX_INTERNAL_FUNC int osal_ioring_add(osal_ioring_t *ior, const size_t offset, - void *data, const size_t bytes) { - assert(bytes && data); - assert(bytes % MIN_PAGESIZE == 0 && bytes <= MAX_WRITE); - assert(offset % MIN_PAGESIZE == 0 && offset + (uint64_t)bytes <= MAX_MAPSIZE); + return tree_search_finalize(mc, key, flags); +} -#if defined(_WIN32) || defined(_WIN64) - const unsigned segments = (unsigned)(bytes >> ior->pagesize_ln2); - const bool use_gather = - ior->direct && ior->overlapped_fd && ior->slots_left >= segments; -#endif /* Windows */ +__hot __noinline int tree_search_finalize(MDBX_cursor *mc, const MDBX_val *key, + int flags) { + cASSERT(mc, !is_poor(mc)); + DKBUF_DEBUG; + int err; + page_t *mp = mc->pg[mc->top]; + intptr_t ki = (flags & Z_FIRST) ? 
0 : page_numkeys(mp) - 1; + while (is_branch(mp)) { + DEBUG("branch page %" PRIaPGNO " has %zu keys", mp->pgno, page_numkeys(mp)); + cASSERT(mc, page_numkeys(mp) > 1); + DEBUG("found index 0 to page %" PRIaPGNO, node_pgno(page_node(mp, 0))); - ior_item_t *item = ior->pool; - if (likely(ior->last)) { - item = ior->last; - if (unlikely(ior_offset(item) + ior_last_bytes(ior, item) == offset) && - likely(ior_last_bytes(ior, item) + bytes <= MAX_WRITE)) { -#if defined(_WIN32) || defined(_WIN64) - if (use_gather && - ((bytes | (uintptr_t)data | ior->last_bytes | - (uintptr_t)(uint64_t)item->sgv[0].Buffer) & - ior_alignment_mask) == 0 && - ior->last_sgvcnt + (size_t)segments < OSAL_IOV_MAX) { - assert(ior->overlapped_fd); - assert((item->single.iov_len & ior_WriteFile_flag) == 0); - assert(item->sgv[ior->last_sgvcnt].Buffer == 0); - ior->last_bytes += bytes; - size_t i = 0; - do { - item->sgv[ior->last_sgvcnt + i].Buffer = PtrToPtr64(data); - data = ptr_disp(data, ior->pagesize); - } while (++i < segments); - ior->slots_left -= segments; - item->sgv[ior->last_sgvcnt += segments].Buffer = 0; - assert((item->single.iov_len & ior_WriteFile_flag) == 0); - return MDBX_SUCCESS; - } - const void *end = ptr_disp(item->single.iov_base, - item->single.iov_len - ior_WriteFile_flag); - if (unlikely(end == data)) { - assert((item->single.iov_len & ior_WriteFile_flag) != 0); - item->single.iov_len += bytes; - return MDBX_SUCCESS; - } -#elif MDBX_HAVE_PWRITEV - assert((int)item->sgvcnt > 0); - const void *end = ptr_disp(item->sgv[item->sgvcnt - 1].iov_base, - item->sgv[item->sgvcnt - 1].iov_len); - if (unlikely(end == data)) { - item->sgv[item->sgvcnt - 1].iov_len += bytes; - ior->last_bytes += bytes; - return MDBX_SUCCESS; - } - if (likely(item->sgvcnt < OSAL_IOV_MAX)) { - if (unlikely(ior->slots_left < 1)) - return MDBX_RESULT_TRUE; - item->sgv[item->sgvcnt].iov_base = data; - item->sgv[item->sgvcnt].iov_len = bytes; - ior->last_bytes += bytes; - item->sgvcnt += 1; - ior->slots_left 
-= 1; - return MDBX_SUCCESS; - } -#else - const void *end = ptr_disp(item->single.iov_base, item->single.iov_len); - if (unlikely(end == data)) { - item->single.iov_len += bytes; - return MDBX_SUCCESS; - } -#endif + if ((flags & (Z_FIRST | Z_LAST)) == 0) { + const struct node_search_result nsr = node_search(mc, key); + if (likely(nsr.node)) + ki = mc->ki[mc->top] + (intptr_t)nsr.exact - 1; + DEBUG("following index %zu for key [%s]", ki, DKEY_DEBUG(key)); } - item = ior_next(item, ior_last_sgvcnt(ior, item)); - } - if (unlikely(ior->slots_left < 1)) - return MDBX_RESULT_TRUE; + err = page_get(mc, node_pgno(page_node(mp, ki)), &mp, mp->txnid); + if (unlikely(err != MDBX_SUCCESS)) + goto bailout; - unsigned slots_used = 1; -#if defined(_WIN32) || defined(_WIN64) - item->ov.Internal = item->ov.InternalHigh = 0; - item->ov.Offset = (DWORD)offset; - item->ov.OffsetHigh = HIGH_DWORD(offset); - item->ov.hEvent = 0; - if (!use_gather || ((bytes | (uintptr_t)(data)) & ior_alignment_mask) != 0 || - segments > OSAL_IOV_MAX) { - /* WriteFile() */ - item->single.iov_base = data; - item->single.iov_len = bytes + ior_WriteFile_flag; - assert((item->single.iov_len & ior_WriteFile_flag) != 0); - } else { - /* WriteFileGather() */ - assert(ior->overlapped_fd); - item->sgv[0].Buffer = PtrToPtr64(data); - for (size_t i = 1; i < segments; ++i) { - data = ptr_disp(data, ior->pagesize); - item->sgv[slots_used].Buffer = PtrToPtr64(data); + mc->ki[mc->top] = (indx_t)ki; + ki = (flags & Z_FIRST) ? 
0 : page_numkeys(mp) - 1; + err = cursor_push(mc, mp, ki); + if (unlikely(err != MDBX_SUCCESS)) + goto bailout; + + if (flags & Z_MODIFY) { + err = page_touch(mc); + if (unlikely(err != MDBX_SUCCESS)) + goto bailout; + mp = mc->pg[mc->top]; } - item->sgv[slots_used].Buffer = 0; - assert((item->single.iov_len & ior_WriteFile_flag) == 0); - slots_used = segments; } - ior->last_bytes = bytes; - ior_last_sgvcnt(ior, item) = slots_used; -#elif MDBX_HAVE_PWRITEV - item->offset = offset; - item->sgv[0].iov_base = data; - item->sgv[0].iov_len = bytes; - ior->last_bytes = bytes; - ior_last_sgvcnt(ior, item) = slots_used; -#else - item->offset = offset; - item->single.iov_base = data; - item->single.iov_len = bytes; -#endif /* !Windows */ - ior->slots_left -= slots_used; - ior->last = item; - return MDBX_SUCCESS; -} -MDBX_INTERNAL_FUNC void osal_ioring_walk( - osal_ioring_t *ior, iov_ctx_t *ctx, - void (*callback)(iov_ctx_t *ctx, size_t offset, void *data, size_t bytes)) { - for (ior_item_t *item = ior->pool; item <= ior->last;) { -#if defined(_WIN32) || defined(_WIN64) - size_t offset = ior_offset(item); - char *data = item->single.iov_base; - size_t bytes = item->single.iov_len - ior_WriteFile_flag; - size_t i = 1; - if (bytes & ior_WriteFile_flag) { - data = Ptr64ToPtr(item->sgv[0].Buffer); - bytes = ior->pagesize; - /* Zap: Reading invalid data from 'item->sgv' */ - MDBX_SUPPRESS_GOOFY_MSVC_ANALYZER(6385); - while (item->sgv[i].Buffer) { - if (data + ior->pagesize != item->sgv[i].Buffer) { - callback(ctx, offset, data, bytes); - offset += bytes; - data = Ptr64ToPtr(item->sgv[i].Buffer); - bytes = 0; - } - bytes += ior->pagesize; - ++i; - } - } - assert(bytes < MAX_WRITE); - callback(ctx, offset, data, bytes); -#elif MDBX_HAVE_PWRITEV - assert(item->sgvcnt > 0); - size_t offset = item->offset; - size_t i = 0; - do { - callback(ctx, offset, item->sgv[i].iov_base, item->sgv[i].iov_len); - offset += item->sgv[i].iov_len; - } while (++i != item->sgvcnt); -#else - const size_t 
i = 1; - callback(ctx, item->offset, item->single.iov_base, item->single.iov_len); -#endif - item = ior_next(item, i); + if (!MDBX_DISABLE_VALIDATION && unlikely(!check_leaf_type(mc, mp))) { + ERROR("unexpected leaf-page #%" PRIaPGNO " type 0x%x seen by cursor", + mp->pgno, mp->flags); + err = MDBX_CORRUPTED; + bailout: + be_poor(mc); + return err; + } + + DEBUG("found leaf page %" PRIaPGNO " for key [%s]", mp->pgno, + DKEY_DEBUG(key)); + be_filled(mc); + return MDBX_SUCCESS; +} +/// \copyright SPDX-License-Identifier: Apache-2.0 +/// \author Леонид Юрьев aka Leonid Yuriev \date 2015-2024 + + +MDBX_INTERNAL pnl_t pnl_alloc(size_t size) { + size_t bytes = pnl_size2bytes(size); + pnl_t pnl = osal_malloc(bytes); + if (likely(pnl)) { +#if __GLIBC_PREREQ(2, 12) || defined(__FreeBSD__) || defined(malloc_usable_size) + bytes = malloc_usable_size(pnl); +#endif /* malloc_usable_size */ + pnl[0] = pnl_bytes2size(bytes); + assert(pnl[0] >= size); + pnl += 1; + *pnl = 0; } + return pnl; } -MDBX_INTERNAL_FUNC osal_ioring_write_result_t -osal_ioring_write(osal_ioring_t *ior, mdbx_filehandle_t fd) { - osal_ioring_write_result_t r = {MDBX_SUCCESS, 0}; +MDBX_INTERNAL void pnl_free(pnl_t pnl) { + if (likely(pnl)) + osal_free(pnl - 1); +} -#if defined(_WIN32) || defined(_WIN64) - HANDLE *const end_wait_for = - ior->event_pool + ior->allocated + - /* был выделен один дополнительный элемент для async_done */ 1; - HANDLE *wait_for = end_wait_for; - LONG async_started = 0; - for (ior_item_t *item = ior->pool; item <= ior->last;) { - item->ov.Internal = STATUS_PENDING; - size_t i = 1, bytes = item->single.iov_len - ior_WriteFile_flag; - r.wops += 1; - if (bytes & ior_WriteFile_flag) { - assert(ior->overlapped_fd && fd == ior->overlapped_fd); - bytes = ior->pagesize; - /* Zap: Reading invalid data from 'item->sgv' */ - MDBX_SUPPRESS_GOOFY_MSVC_ANALYZER(6385); - while (item->sgv[i].Buffer) { - bytes += ior->pagesize; - ++i; - } - assert(bytes < MAX_WRITE); - item->ov.hEvent = 
ior_get_event(ior); - if (unlikely(!item->ov.hEvent)) { - bailout_geterr: - r.err = GetLastError(); - bailout_rc: - assert(r.err != MDBX_SUCCESS); - CancelIo(fd); - return r; - } - if (WriteFileGather(fd, item->sgv, (DWORD)bytes, nullptr, &item->ov)) { - assert(item->ov.Internal == 0 && - WaitForSingleObject(item->ov.hEvent, 0) == WAIT_OBJECT_0); - ior_put_event(ior, item->ov.hEvent); - item->ov.hEvent = 0; - } else { - r.err = (int)GetLastError(); - if (unlikely(r.err != ERROR_IO_PENDING)) { - ERROR("%s: fd %p, item %p (%zu), pgno %u, bytes %zu, offset %" PRId64 - ", err %d", - "WriteFileGather", fd, __Wpedantic_format_voidptr(item), - item - ior->pool, ((MDBX_page *)item->single.iov_base)->mp_pgno, - bytes, item->ov.Offset + ((uint64_t)item->ov.OffsetHigh << 32), - r.err); - goto bailout_rc; - } - assert(wait_for > ior->event_pool + ior->event_stack); - *--wait_for = item->ov.hEvent; - } - } else if (fd == ior->overlapped_fd) { - assert(bytes < MAX_WRITE); - retry: - item->ov.hEvent = ior; - if (WriteFileEx(fd, item->single.iov_base, (DWORD)bytes, &item->ov, - ior_wocr)) { - async_started += 1; - } else { - r.err = (int)GetLastError(); - switch (r.err) { - default: - ERROR("%s: fd %p, item %p (%zu), pgno %u, bytes %zu, offset %" PRId64 - ", err %d", - "WriteFileEx", fd, __Wpedantic_format_voidptr(item), - item - ior->pool, ((MDBX_page *)item->single.iov_base)->mp_pgno, - bytes, item->ov.Offset + ((uint64_t)item->ov.OffsetHigh << 32), - r.err); - goto bailout_rc; - case ERROR_NOT_FOUND: - case ERROR_USER_MAPPED_FILE: - case ERROR_LOCK_VIOLATION: - WARNING( - "%s: fd %p, item %p (%zu), pgno %u, bytes %zu, offset %" PRId64 - ", err %d", - "WriteFileEx", fd, __Wpedantic_format_voidptr(item), - item - ior->pool, ((MDBX_page *)item->single.iov_base)->mp_pgno, - bytes, item->ov.Offset + ((uint64_t)item->ov.OffsetHigh << 32), - r.err); - SleepEx(0, true); - goto retry; - case ERROR_INVALID_USER_BUFFER: - case ERROR_NOT_ENOUGH_MEMORY: - if (SleepEx(0, true) == 
WAIT_IO_COMPLETION) - goto retry; - goto bailout_rc; - case ERROR_IO_PENDING: - async_started += 1; - } - } - } else { - assert(bytes < MAX_WRITE); - DWORD written = 0; - if (!WriteFile(fd, item->single.iov_base, (DWORD)bytes, &written, - &item->ov)) { - r.err = (int)GetLastError(); - ERROR("%s: fd %p, item %p (%zu), pgno %u, bytes %zu, offset %" PRId64 - ", err %d", - "WriteFile", fd, __Wpedantic_format_voidptr(item), - item - ior->pool, ((MDBX_page *)item->single.iov_base)->mp_pgno, - bytes, item->ov.Offset + ((uint64_t)item->ov.OffsetHigh << 32), - r.err); - goto bailout_rc; - } else if (unlikely(written != bytes)) { - r.err = ERROR_WRITE_FAULT; - goto bailout_rc; - } +MDBX_INTERNAL void pnl_shrink(pnl_t __restrict *__restrict ppnl) { + assert(pnl_bytes2size(pnl_size2bytes(MDBX_PNL_INITIAL)) >= MDBX_PNL_INITIAL && + pnl_bytes2size(pnl_size2bytes(MDBX_PNL_INITIAL)) < + MDBX_PNL_INITIAL * 3 / 2); + assert(MDBX_PNL_GETSIZE(*ppnl) <= PAGELIST_LIMIT && + MDBX_PNL_ALLOCLEN(*ppnl) >= MDBX_PNL_GETSIZE(*ppnl)); + MDBX_PNL_SETSIZE(*ppnl, 0); + if (unlikely(MDBX_PNL_ALLOCLEN(*ppnl) > + MDBX_PNL_INITIAL * (MDBX_PNL_PREALLOC_FOR_RADIXSORT ? 
8 : 4) - + MDBX_CACHELINE_SIZE / sizeof(pgno_t))) { + size_t bytes = pnl_size2bytes(MDBX_PNL_INITIAL * 2); + pnl_t pnl = osal_realloc(*ppnl - 1, bytes); + if (likely(pnl)) { +#if __GLIBC_PREREQ(2, 12) || defined(__FreeBSD__) || defined(malloc_usable_size) + bytes = malloc_usable_size(pnl); +#endif /* malloc_usable_size */ + *pnl = pnl_bytes2size(bytes); + *ppnl = pnl + 1; } - item = ior_next(item, i); } +} - assert(ior->async_waiting > ior->async_completed && - ior->async_waiting == INT_MAX); - ior->async_waiting = async_started; - if (async_started > ior->async_completed && end_wait_for == wait_for) { - assert(wait_for > ior->event_pool + ior->event_stack); - *--wait_for = ior->async_done; +MDBX_INTERNAL int pnl_reserve(pnl_t __restrict *__restrict ppnl, + const size_t wanna) { + const size_t allocated = MDBX_PNL_ALLOCLEN(*ppnl); + assert(MDBX_PNL_GETSIZE(*ppnl) <= PAGELIST_LIMIT && + MDBX_PNL_ALLOCLEN(*ppnl) >= MDBX_PNL_GETSIZE(*ppnl)); + if (likely(allocated >= wanna)) + return MDBX_SUCCESS; + + if (unlikely(wanna > /* paranoia */ PAGELIST_LIMIT)) { + ERROR("PNL too long (%zu > %zu)", wanna, (size_t)PAGELIST_LIMIT); + return MDBX_TXN_FULL; } - const size_t pending_count = end_wait_for - wait_for; - if (pending_count) { - /* Ждем до MAXIMUM_WAIT_OBJECTS (64) последних хендлов, а после избирательно - * ждем посредством GetOverlappedResult(), если какие-то более ранние - * элементы еще не завершены. В целом, так получается меньше системных - * вызовов, т.е. меньше накладных расходов. Однако, не факт что эта экономия - * не будет перекрыта неэффективностью реализации - * WaitForMultipleObjectsEx(), но тогда это проблемы на стороне M$. */ - DWORD madness; - do - madness = WaitForMultipleObjectsEx((pending_count < MAXIMUM_WAIT_OBJECTS) - ? 
(DWORD)pending_count - : MAXIMUM_WAIT_OBJECTS, - wait_for, true, - /* сутки */ 86400000ul, true); - while (madness == WAIT_IO_COMPLETION); - STATIC_ASSERT(WAIT_OBJECT_0 == 0); - if (/* madness >= WAIT_OBJECT_0 && */ - madness < WAIT_OBJECT_0 + MAXIMUM_WAIT_OBJECTS) - r.err = MDBX_SUCCESS; - else if (madness >= WAIT_ABANDONED_0 && - madness < WAIT_ABANDONED_0 + MAXIMUM_WAIT_OBJECTS) { - r.err = ERROR_ABANDONED_WAIT_0; - goto bailout_rc; - } else if (madness == WAIT_TIMEOUT) { - r.err = WAIT_TIMEOUT; - goto bailout_rc; - } else { - r.err = /* madness == WAIT_FAILED */ MDBX_PROBLEM; - goto bailout_rc; - } + const size_t size = (wanna + wanna - allocated < PAGELIST_LIMIT) + ? wanna + wanna - allocated + : PAGELIST_LIMIT; + size_t bytes = pnl_size2bytes(size); + pnl_t pnl = osal_realloc(*ppnl - 1, bytes); + if (likely(pnl)) { +#if __GLIBC_PREREQ(2, 12) || defined(__FreeBSD__) || defined(malloc_usable_size) + bytes = malloc_usable_size(pnl); +#endif /* malloc_usable_size */ + *pnl = pnl_bytes2size(bytes); + assert(*pnl >= wanna); + *ppnl = pnl + 1; + return MDBX_SUCCESS; + } + return MDBX_ENOMEM; +} - assert(ior->async_waiting == ior->async_completed); - for (ior_item_t *item = ior->pool; item <= ior->last;) { - size_t i = 1, bytes = item->single.iov_len - ior_WriteFile_flag; - if (bytes & ior_WriteFile_flag) { - bytes = ior->pagesize; - /* Zap: Reading invalid data from 'item->sgv' */ - MDBX_SUPPRESS_GOOFY_MSVC_ANALYZER(6385); - while (item->sgv[i].Buffer) { - bytes += ior->pagesize; - ++i; - } - if (!HasOverlappedIoCompleted(&item->ov)) { - DWORD written = 0; - if (unlikely(!GetOverlappedResult(fd, &item->ov, &written, true))) { - ERROR("%s: item %p (%zu), pgno %u, bytes %zu, offset %" PRId64 - ", err %d", - "GetOverlappedResult", __Wpedantic_format_voidptr(item), - item - ior->pool, - ((MDBX_page *)item->single.iov_base)->mp_pgno, bytes, - item->ov.Offset + ((uint64_t)item->ov.OffsetHigh << 32), - (int)GetLastError()); - goto bailout_geterr; - } - assert(MDBX_SUCCESS 
== item->ov.Internal); - assert(written == item->ov.InternalHigh); - } - } else { - assert(HasOverlappedIoCompleted(&item->ov)); - } - assert(item->ov.Internal != ERROR_IO_PENDING); - if (unlikely(item->ov.Internal != MDBX_SUCCESS)) { - DWORD written = 0; - r.err = (int)item->ov.Internal; - if ((r.err & 0x80000000) && - GetOverlappedResult(NULL, &item->ov, &written, true)) - r.err = (int)GetLastError(); - ERROR("%s: item %p (%zu), pgno %u, bytes %zu, offset %" PRId64 - ", err %d", - "Result", __Wpedantic_format_voidptr(item), item - ior->pool, - ((MDBX_page *)item->single.iov_base)->mp_pgno, bytes, - item->ov.Offset + ((uint64_t)item->ov.OffsetHigh << 32), - (int)GetLastError()); - goto bailout_rc; - } - if (unlikely(item->ov.InternalHigh != bytes)) { - r.err = ERROR_WRITE_FAULT; - goto bailout_rc; - } - item = ior_next(item, i); +static __always_inline int __must_check_result pnl_append_stepped( + unsigned step, __restrict pnl_t *ppnl, pgno_t pgno, size_t n) { + assert(n > 0); + int rc = pnl_need(ppnl, n); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; + + const pnl_t pnl = *ppnl; + if (likely(n == 1)) { + pnl_append_prereserved(pnl, pgno); + return MDBX_SUCCESS; + } + +#if MDBX_PNL_ASCENDING + size_t w = MDBX_PNL_GETSIZE(pnl); + do { + pnl[++w] = pgno; + pgno += step; + } while (--n); + MDBX_PNL_SETSIZE(pnl, w); +#else + size_t w = MDBX_PNL_GETSIZE(pnl) + n; + MDBX_PNL_SETSIZE(pnl, w); + do { + pnl[w--] = pgno; + pgno += step; + } while (--n); +#endif + return MDBX_SUCCESS; +} + +__hot MDBX_INTERNAL int __must_check_result +spill_append_span(__restrict pnl_t *ppnl, pgno_t pgno, size_t n) { + return pnl_append_stepped(2, ppnl, pgno << 1, n); +} + +__hot MDBX_INTERNAL int __must_check_result +pnl_append_span(__restrict pnl_t *ppnl, pgno_t pgno, size_t n) { + return pnl_append_stepped(1, ppnl, pgno, n); +} + +__hot MDBX_INTERNAL int __must_check_result +pnl_insert_span(__restrict pnl_t *ppnl, pgno_t pgno, size_t n) { + assert(n > 0); + int rc = pnl_need(ppnl, n); 
+ if (unlikely(rc != MDBX_SUCCESS)) + return rc; + + const pnl_t pnl = *ppnl; + size_t r = MDBX_PNL_GETSIZE(pnl), w = r + n; + MDBX_PNL_SETSIZE(pnl, w); + while (r && MDBX_PNL_DISORDERED(pnl[r], pgno)) + pnl[w--] = pnl[r--]; + + for (pgno_t fill = MDBX_PNL_ASCENDING ? pgno + n : pgno; w > r; --w) + pnl[w] = MDBX_PNL_ASCENDING ? --fill : fill++; + + return MDBX_SUCCESS; +} + +__hot __noinline MDBX_INTERNAL bool pnl_check(const const_pnl_t pnl, + const size_t limit) { + assert(limit >= MIN_PAGENO - MDBX_ENABLE_REFUND); + if (likely(MDBX_PNL_GETSIZE(pnl))) { + if (unlikely(MDBX_PNL_GETSIZE(pnl) > PAGELIST_LIMIT)) + return false; + if (unlikely(MDBX_PNL_LEAST(pnl) < MIN_PAGENO)) + return false; + if (unlikely(MDBX_PNL_MOST(pnl) >= limit)) + return false; + + if ((!MDBX_DISABLE_VALIDATION || AUDIT_ENABLED()) && + likely(MDBX_PNL_GETSIZE(pnl) > 1)) { + const pgno_t *scan = MDBX_PNL_BEGIN(pnl); + const pgno_t *const end = MDBX_PNL_END(pnl); + pgno_t prev = *scan++; + do { + if (unlikely(!MDBX_PNL_ORDERED(prev, *scan))) + return false; + prev = *scan; + } while (likely(++scan != end)); } - assert(ior->async_waiting == ior->async_completed); - } else { - assert(r.err == MDBX_SUCCESS); } - assert(ior->async_waiting == ior->async_completed); + return true; +} +static __always_inline void +pnl_merge_inner(pgno_t *__restrict dst, const pgno_t *__restrict src_a, + const pgno_t *__restrict src_b, + const pgno_t *__restrict const src_b_detent) { + do { +#if MDBX_HAVE_CMOV + const bool flag = MDBX_PNL_ORDERED(*src_b, *src_a); +#if defined(__LCC__) || __CLANG_PREREQ(13, 0) + // lcc 1.26: 13ШК (подготовка и первая итерация) + 7ШК (цикл), БЕЗ loop-mode + // gcc>=7: cmp+jmp с возвратом в тело цикла (WTF?) + // gcc<=6: cmov×3 + // clang<=12: cmov×3 + // clang>=13: cmov, set+add/sub + *dst = flag ? 
*src_a-- : *src_b--; #else - STATIC_ASSERT_MSG(sizeof(off_t) >= sizeof(size_t), - "libmdbx requires 64-bit file I/O on 64-bit systems"); - for (ior_item_t *item = ior->pool; item <= ior->last;) { -#if MDBX_HAVE_PWRITEV - assert(item->sgvcnt > 0); - if (item->sgvcnt == 1) - r.err = osal_pwrite(fd, item->sgv[0].iov_base, item->sgv[0].iov_len, - item->offset); - else - r.err = osal_pwritev(fd, item->sgv, item->sgvcnt, item->offset); + // gcc: cmov, cmp+set+add/sub + // clang<=5: cmov×2, set+add/sub + // clang>=6: cmov, set+add/sub + *dst = flag ? *src_a : *src_b; + src_b += (ptrdiff_t)flag - 1; + src_a -= flag; +#endif + --dst; +#else /* MDBX_HAVE_CMOV */ + while (MDBX_PNL_ORDERED(*src_b, *src_a)) + *dst-- = *src_a--; + *dst-- = *src_b--; +#endif /* !MDBX_HAVE_CMOV */ + } while (likely(src_b > src_b_detent)); +} - // TODO: io_uring_prep_write(sqe, fd, ...); +__hot MDBX_INTERNAL size_t pnl_merge(pnl_t dst, const pnl_t src) { + assert(pnl_check_allocated(dst, MAX_PAGENO + 1)); + assert(pnl_check(src, MAX_PAGENO + 1)); + const size_t src_len = MDBX_PNL_GETSIZE(src); + const size_t dst_len = MDBX_PNL_GETSIZE(dst); + size_t total = dst_len; + assert(MDBX_PNL_ALLOCLEN(dst) >= total); + if (likely(src_len > 0)) { + total += src_len; + if (!MDBX_DEBUG && total < (MDBX_HAVE_CMOV ? 21 : 12)) + goto avoid_call_libc_for_short_cases; + if (dst_len == 0 || + MDBX_PNL_ORDERED(MDBX_PNL_LAST(dst), MDBX_PNL_FIRST(src))) + memcpy(MDBX_PNL_END(dst), MDBX_PNL_BEGIN(src), src_len * sizeof(pgno_t)); + else if (MDBX_PNL_ORDERED(MDBX_PNL_LAST(src), MDBX_PNL_FIRST(dst))) { + memmove(MDBX_PNL_BEGIN(dst) + src_len, MDBX_PNL_BEGIN(dst), + dst_len * sizeof(pgno_t)); + memcpy(MDBX_PNL_BEGIN(dst), MDBX_PNL_BEGIN(src), + src_len * sizeof(pgno_t)); + } else { + avoid_call_libc_for_short_cases: + dst[0] = /* the detent */ (MDBX_PNL_ASCENDING ? 
0 : P_INVALID); + pnl_merge_inner(dst + total, dst + dst_len, src + src_len, src); + } + MDBX_PNL_SETSIZE(dst, total); + } + assert(pnl_check_allocated(dst, MAX_PAGENO + 1)); + return total; +} - item = ior_next(item, item->sgvcnt); +#if MDBX_PNL_ASCENDING +#define MDBX_PNL_EXTRACT_KEY(ptr) (*(ptr)) #else - r.err = osal_pwrite(fd, item->single.iov_base, item->single.iov_len, - item->offset); - item = ior_next(item, 1); +#define MDBX_PNL_EXTRACT_KEY(ptr) (P_INVALID - *(ptr)) #endif - r.wops += 1; - if (unlikely(r.err != MDBX_SUCCESS)) - break; - } +RADIXSORT_IMPL(pgno, pgno_t, MDBX_PNL_EXTRACT_KEY, + MDBX_PNL_PREALLOC_FOR_RADIXSORT, 0) - // TODO: io_uring_submit(&ring) - // TODO: err = io_uring_wait_cqe(&ring, &cqe); - // TODO: io_uring_cqe_seen(&ring, cqe); +SORT_IMPL(pgno_sort, false, pgno_t, MDBX_PNL_ORDERED) -#endif /* !Windows */ - return r; +__hot __noinline MDBX_INTERNAL void pnl_sort_nochk(pnl_t pnl) { + if (likely(MDBX_PNL_GETSIZE(pnl) < MDBX_RADIXSORT_THRESHOLD) || + unlikely(!pgno_radixsort(&MDBX_PNL_FIRST(pnl), MDBX_PNL_GETSIZE(pnl)))) + pgno_sort(MDBX_PNL_BEGIN(pnl), MDBX_PNL_END(pnl)); } -MDBX_INTERNAL_FUNC void osal_ioring_reset(osal_ioring_t *ior) { -#if defined(_WIN32) || defined(_WIN64) - if (ior->last) { - for (ior_item_t *item = ior->pool; item <= ior->last;) { - if (!HasOverlappedIoCompleted(&item->ov)) { - assert(ior->overlapped_fd); - CancelIoEx(ior->overlapped_fd, &item->ov); - } - if (item->ov.hEvent && item->ov.hEvent != ior) - ior_put_event(ior, item->ov.hEvent); - size_t i = 1; - if ((item->single.iov_len & ior_WriteFile_flag) == 0) { - /* Zap: Reading invalid data from 'item->sgv' */ - MDBX_SUPPRESS_GOOFY_MSVC_ANALYZER(6385); - while (item->sgv[i].Buffer) - ++i; - } - item = ior_next(item, i); +SEARCH_IMPL(pgno_bsearch, pgno_t, pgno_t, MDBX_PNL_ORDERED) + +__hot __noinline MDBX_INTERNAL size_t pnl_search_nochk(const pnl_t pnl, + pgno_t pgno) { + const pgno_t *begin = MDBX_PNL_BEGIN(pnl); + const pgno_t *it = pgno_bsearch(begin, 
MDBX_PNL_GETSIZE(pnl), pgno); + const pgno_t *end = begin + MDBX_PNL_GETSIZE(pnl); + assert(it >= begin && it <= end); + if (it != begin) + assert(MDBX_PNL_ORDERED(it[-1], pgno)); + if (it != end) + assert(!MDBX_PNL_ORDERED(it[0], pgno)); + return it - begin + 1; +} +/// \copyright SPDX-License-Identifier: Apache-2.0 +/// \author Леонид Юрьев aka Leonid Yuriev \date 2015-2024 + + +typedef struct diff_result { + ptrdiff_t diff; + intptr_t level; + ptrdiff_t root_nkeys; +} diff_t; + +/* calculates: r = x - y */ +__hot static int cursor_diff(const MDBX_cursor *const __restrict x, + const MDBX_cursor *const __restrict y, + diff_t *const __restrict r) { + r->diff = 0; + r->level = 0; + r->root_nkeys = 0; + + if (unlikely(x->signature != cur_signature_live)) + return (x->signature == cur_signature_ready4dispose) ? MDBX_EINVAL + : MDBX_EBADSIGN; + + if (unlikely(y->signature != cur_signature_live)) + return (y->signature == cur_signature_ready4dispose) ? MDBX_EINVAL + : MDBX_EBADSIGN; + + int rc = check_txn(x->txn, MDBX_TXN_BLOCKED); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; + + if (unlikely(x->txn != y->txn)) + return MDBX_BAD_TXN; + + if (unlikely(y->dbi_state != x->dbi_state)) + return MDBX_EINVAL; + + const intptr_t depth = (x->top < y->top) ? 
x->top : y->top; + if (unlikely(depth < 0)) + return MDBX_ENODATA; + + r->root_nkeys = page_numkeys(x->pg[0]); + intptr_t nkeys = r->root_nkeys; + for (;;) { + if (unlikely(y->pg[r->level] != x->pg[r->level])) { + ERROR("Mismatch cursors's pages at %zu level", r->level); + return MDBX_PROBLEM; + } + r->diff = x->ki[r->level] - y->ki[r->level]; + if (r->diff) + break; + r->level += 1; + if (r->level > depth) { + r->diff = + CMP2INT(x->flags & (z_eof | z_hollow), y->flags & (z_eof | z_hollow)); + return MDBX_SUCCESS; } + nkeys = page_numkeys(x->pg[r->level]); } - ior->async_waiting = INT_MAX; - ior->async_completed = 0; - ResetEvent(ior->async_done); -#endif /* !Windows */ - ior->slots_left = ior->allocated; - ior->last = nullptr; -} -static void ior_cleanup(osal_ioring_t *ior, const size_t since) { - osal_ioring_reset(ior); -#if defined(_WIN32) || defined(_WIN64) - for (size_t i = since; i < ior->event_stack; ++i) { - /* Zap: Using uninitialized memory '**ior.event_pool' */ - MDBX_SUPPRESS_GOOFY_MSVC_ANALYZER(6001); - CloseHandle(ior->event_pool[i]); + while (unlikely(r->diff == 1) && likely(r->level < depth)) { + r->level += 1; + /* DB'PAGEs: 0------------------>MAX + * + * CURSORs: y < x + * STACK[i ]: | + * STACK[+1]: ...y++N|0++x... + */ + nkeys = page_numkeys(y->pg[r->level]); + r->diff = (nkeys - y->ki[r->level]) + x->ki[r->level]; + assert(r->diff > 0); } - ior->event_stack = 0; -#else - (void)since; -#endif /* Windows */ + + while (unlikely(r->diff == -1) && likely(r->level < depth)) { + r->level += 1; + /* DB'PAGEs: 0------------------>MAX + * + * CURSORs: x < y + * STACK[i ]: | + * STACK[+1]: ...x--N|0--y... 
+ */ + nkeys = page_numkeys(x->pg[r->level]); + r->diff = -(nkeys - x->ki[r->level]) - y->ki[r->level]; + assert(r->diff < 0); + } + + return MDBX_SUCCESS; } -MDBX_INTERNAL_FUNC int osal_ioring_resize(osal_ioring_t *ior, size_t items) { - assert(items > 0 && items < INT_MAX / sizeof(ior_item_t)); -#if defined(_WIN32) || defined(_WIN64) - if (ior->state & IOR_STATE_LOCKED) - return MDBX_SUCCESS; - const bool useSetFileIoOverlappedRange = - ior->overlapped_fd && mdbx_SetFileIoOverlappedRange && items > 42; - const size_t ceiling = - useSetFileIoOverlappedRange - ? ((items < 65536 / 2 / sizeof(ior_item_t)) ? 65536 : 65536 * 4) - : 1024; - const size_t bytes = ceil_powerof2(sizeof(ior_item_t) * items, ceiling); - items = bytes / sizeof(ior_item_t); -#endif /* Windows */ +__hot static ptrdiff_t estimate(const tree_t *tree, + diff_t *const __restrict dr) { + /* root: branch-page => scale = leaf-factor * branch-factor^(N-1) + * level-1: branch-page(s) => scale = leaf-factor * branch-factor^2 + * level-2: branch-page(s) => scale = leaf-factor * branch-factor + * level-N: branch-page(s) => scale = leaf-factor + * leaf-level: leaf-page(s) => scale = 1 + */ + ptrdiff_t btree_power = (ptrdiff_t)tree->height - 2 - (ptrdiff_t)dr->level; + if (btree_power < 0) + return dr->diff; - if (items != ior->allocated) { - assert(items >= osal_ioring_used(ior)); - if (items < ior->allocated) - ior_cleanup(ior, items); -#if defined(_WIN32) || defined(_WIN64) - void *ptr = osal_realloc( - ior->event_pool, - (items + /* extra for waiting the async_done */ 1) * sizeof(HANDLE)); - if (unlikely(!ptr)) - return MDBX_ENOMEM; - ior->event_pool = ptr; + ptrdiff_t estimated = + (ptrdiff_t)tree->items * dr->diff / (ptrdiff_t)tree->leaf_pages; + if (btree_power == 0) + return estimated; - int err = osal_memalign_alloc(ceiling, bytes, &ptr); - if (unlikely(err != MDBX_SUCCESS)) - return err; - if (ior->pool) { - memcpy(ptr, ior->pool, ior->allocated * sizeof(ior_item_t)); - 
osal_memalign_free(ior->pool); - } -#else - void *ptr = osal_realloc(ior->pool, sizeof(ior_item_t) * items); - if (unlikely(!ptr)) - return MDBX_ENOMEM; -#endif - ior->pool = ptr; + if (tree->height < 4) { + assert(dr->level == 0 && btree_power == 1); + return (ptrdiff_t)tree->items * dr->diff / (ptrdiff_t)dr->root_nkeys; + } - if (items > ior->allocated) - memset(ior->pool + ior->allocated, 0, - sizeof(ior_item_t) * (items - ior->allocated)); - ior->allocated = (unsigned)items; - ior->boundary = ptr_disp(ior->pool, ior->allocated); -#if defined(_WIN32) || defined(_WIN64) - if (useSetFileIoOverlappedRange) { - if (mdbx_SetFileIoOverlappedRange(ior->overlapped_fd, ptr, (ULONG)bytes)) - ior->state += IOR_STATE_LOCKED; - else - return GetLastError(); + /* average_branchpage_fillfactor = total(branch_entries) / branch_pages + total(branch_entries) = leaf_pages + branch_pages - 1 (root page) */ + const size_t log2_fixedpoint = sizeof(size_t) - 1; + const size_t half = UINT64_C(1) << (log2_fixedpoint - 1); + const size_t factor = + ((tree->leaf_pages + tree->branch_pages - 1) << log2_fixedpoint) / + tree->branch_pages; + while (1) { + switch ((size_t)btree_power) { + default: { + const size_t square = (factor * factor + half) >> log2_fixedpoint; + const size_t quad = (square * square + half) >> log2_fixedpoint; + do { + estimated = estimated * quad + half; + estimated >>= log2_fixedpoint; + btree_power -= 4; + } while (btree_power >= 4); + continue; + } + case 3: + estimated = estimated * factor + half; + estimated >>= log2_fixedpoint; + __fallthrough /* fall through */; + case 2: + estimated = estimated * factor + half; + estimated >>= log2_fixedpoint; + __fallthrough /* fall through */; + case 1: + estimated = estimated * factor + half; + estimated >>= log2_fixedpoint; + __fallthrough /* fall through */; + case 0: + if (unlikely(estimated > (ptrdiff_t)tree->items)) + return (ptrdiff_t)tree->items; + if (unlikely(estimated < -(ptrdiff_t)tree->items)) + return 
-(ptrdiff_t)tree->items; + return estimated; } -#endif /* Windows */ } - return MDBX_SUCCESS; -} - -MDBX_INTERNAL_FUNC void osal_ioring_destroy(osal_ioring_t *ior) { - if (ior->allocated) - ior_cleanup(ior, 0); -#if defined(_WIN32) || defined(_WIN64) - osal_memalign_free(ior->pool); - osal_free(ior->event_pool); - CloseHandle(ior->async_done); - if (ior->overlapped_fd) - CloseHandle(ior->overlapped_fd); -#else - osal_free(ior->pool); -#endif - memset(ior, 0, sizeof(osal_ioring_t)); } -/*----------------------------------------------------------------------------*/ - -MDBX_INTERNAL_FUNC int osal_removefile(const pathchar_t *pathname) { -#if defined(_WIN32) || defined(_WIN64) - return DeleteFileW(pathname) ? MDBX_SUCCESS : (int)GetLastError(); -#else - return unlink(pathname) ? errno : MDBX_SUCCESS; -#endif -} +__hot int mdbx_estimate_distance(const MDBX_cursor *first, + const MDBX_cursor *last, + ptrdiff_t *distance_items) { + if (unlikely(first == nullptr || last == nullptr || + distance_items == nullptr)) + return MDBX_EINVAL; -#if !(defined(_WIN32) || defined(_WIN64)) -static bool is_valid_fd(int fd) { return !(isatty(fd) < 0 && errno == EBADF); } -#endif /*! Windows */ + *distance_items = 0; + diff_t dr; + int rc = cursor_diff(last, first, &dr); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; -MDBX_INTERNAL_FUNC int osal_removedirectory(const pathchar_t *pathname) { -#if defined(_WIN32) || defined(_WIN64) - return RemoveDirectoryW(pathname) ? MDBX_SUCCESS : (int)GetLastError(); -#else - return rmdir(pathname) ? 
errno : MDBX_SUCCESS; -#endif -} + cASSERT(first, dr.diff || inner_pointed(first) == inner_pointed(last)); + if (unlikely(dr.diff == 0) && inner_pointed(first)) { + first = &first->subcur->cursor; + last = &last->subcur->cursor; + rc = cursor_diff(first, last, &dr); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; + } -MDBX_INTERNAL_FUNC int osal_fileexists(const pathchar_t *pathname) { -#if defined(_WIN32) || defined(_WIN64) - if (GetFileAttributesW(pathname) != INVALID_FILE_ATTRIBUTES) - return MDBX_RESULT_TRUE; - int err = GetLastError(); - return (err == ERROR_FILE_NOT_FOUND || err == ERROR_PATH_NOT_FOUND) - ? MDBX_RESULT_FALSE - : err; -#else - if (access(pathname, F_OK) == 0) - return MDBX_RESULT_TRUE; - int err = errno; - return (err == ENOENT || err == ENOTDIR) ? MDBX_RESULT_FALSE : err; -#endif -} + if (likely(dr.diff != 0)) + *distance_items = estimate(first->tree, &dr); -MDBX_INTERNAL_FUNC pathchar_t *osal_fileext(const pathchar_t *pathname, - size_t len) { - const pathchar_t *ext = nullptr; - for (size_t i = 0; i < len && pathname[i]; i++) - if (pathname[i] == '.') - ext = pathname + i; - else if (osal_isdirsep(pathname[i])) - ext = nullptr; - return (pathchar_t *)ext; + return MDBX_SUCCESS; } -MDBX_INTERNAL_FUNC bool osal_pathequal(const pathchar_t *l, const pathchar_t *r, - size_t len) { -#if defined(_WIN32) || defined(_WIN64) - for (size_t i = 0; i < len; ++i) { - pathchar_t a = l[i]; - pathchar_t b = r[i]; - a = (a == '\\') ? '/' : a; - b = (b == '\\') ? 
'/' : b; - if (a != b) - return false; - } - return true; -#else - return memcmp(l, r, len * sizeof(pathchar_t)) == 0; -#endif -} +__hot int mdbx_estimate_move(const MDBX_cursor *cursor, MDBX_val *key, + MDBX_val *data, MDBX_cursor_op move_op, + ptrdiff_t *distance_items) { + if (unlikely(cursor == nullptr || distance_items == nullptr || + move_op == MDBX_GET_CURRENT || move_op == MDBX_GET_MULTIPLE)) + return MDBX_EINVAL; -MDBX_INTERNAL_FUNC int osal_openfile(const enum osal_openfile_purpose purpose, - const MDBX_env *env, - const pathchar_t *pathname, - mdbx_filehandle_t *fd, - mdbx_mode_t unix_mode_bits) { - *fd = INVALID_HANDLE_VALUE; + if (unlikely(cursor->signature != cur_signature_live)) + return (cursor->signature == cur_signature_ready4dispose) ? MDBX_EINVAL + : MDBX_EBADSIGN; -#if defined(_WIN32) || defined(_WIN64) - DWORD CreationDisposition = unix_mode_bits ? OPEN_ALWAYS : OPEN_EXISTING; - DWORD FlagsAndAttributes = - FILE_FLAG_POSIX_SEMANTICS | FILE_ATTRIBUTE_NOT_CONTENT_INDEXED; - DWORD DesiredAccess = FILE_READ_ATTRIBUTES; - DWORD ShareMode = (env->me_flags & MDBX_EXCLUSIVE) - ? 
0 - : (FILE_SHARE_READ | FILE_SHARE_WRITE); + int rc = check_txn(cursor->txn, MDBX_TXN_BLOCKED); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; - switch (purpose) { - default: - return ERROR_INVALID_PARAMETER; - case MDBX_OPEN_LCK: - CreationDisposition = OPEN_ALWAYS; - DesiredAccess |= GENERIC_READ | GENERIC_WRITE; - FlagsAndAttributes |= FILE_ATTRIBUTE_HIDDEN | FILE_ATTRIBUTE_TEMPORARY; - break; - case MDBX_OPEN_DXB_READ: - CreationDisposition = OPEN_EXISTING; - DesiredAccess |= GENERIC_READ; - ShareMode |= FILE_SHARE_READ; - break; - case MDBX_OPEN_DXB_LAZY: - DesiredAccess |= GENERIC_READ | GENERIC_WRITE; - break; - case MDBX_OPEN_DXB_OVERLAPPED_DIRECT: - FlagsAndAttributes |= FILE_FLAG_NO_BUFFERING; - /* fall through */ - __fallthrough; - case MDBX_OPEN_DXB_OVERLAPPED: - FlagsAndAttributes |= FILE_FLAG_OVERLAPPED; - /* fall through */ - __fallthrough; - case MDBX_OPEN_DXB_DSYNC: - CreationDisposition = OPEN_EXISTING; - DesiredAccess |= GENERIC_WRITE | GENERIC_READ; - FlagsAndAttributes |= FILE_FLAG_WRITE_THROUGH; - break; - case MDBX_OPEN_COPY: - CreationDisposition = CREATE_NEW; - ShareMode = 0; - DesiredAccess |= GENERIC_WRITE; - if (env->me_psize >= env->me_os_psize) - FlagsAndAttributes |= FILE_FLAG_NO_BUFFERING; - break; - case MDBX_OPEN_DELETE: - CreationDisposition = OPEN_EXISTING; - ShareMode |= FILE_SHARE_DELETE; - DesiredAccess = - FILE_READ_ATTRIBUTES | FILE_WRITE_ATTRIBUTES | DELETE | SYNCHRONIZE; - break; - } + if (unlikely(!is_pointed(cursor))) + return MDBX_ENODATA; - *fd = CreateFileW(pathname, DesiredAccess, ShareMode, NULL, - CreationDisposition, FlagsAndAttributes, NULL); - if (*fd == INVALID_HANDLE_VALUE) { - int err = (int)GetLastError(); - if (err == ERROR_ACCESS_DENIED && purpose == MDBX_OPEN_LCK) { - if (GetFileAttributesW(pathname) == INVALID_FILE_ATTRIBUTES && - GetLastError() == ERROR_FILE_NOT_FOUND) - err = ERROR_FILE_NOT_FOUND; - } - return err; - } + cursor_couple_t next; + rc = cursor_init(&next.outer, cursor->txn, 
cursor_dbi(cursor)); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; - BY_HANDLE_FILE_INFORMATION info; - if (!GetFileInformationByHandle(*fd, &info)) { - int err = (int)GetLastError(); - CloseHandle(*fd); - *fd = INVALID_HANDLE_VALUE; - return err; + cursor_cpstk(cursor, &next.outer); + if (cursor->tree->flags & MDBX_DUPSORT) { + subcur_t *mx = &container_of(cursor, cursor_couple_t, outer)->inner; + cursor_cpstk(&mx->cursor, &next.inner.cursor); } - const DWORD AttributesDiff = - (info.dwFileAttributes ^ FlagsAndAttributes) & - (FILE_ATTRIBUTE_HIDDEN | FILE_ATTRIBUTE_NOT_CONTENT_INDEXED | - FILE_ATTRIBUTE_TEMPORARY | FILE_ATTRIBUTE_COMPRESSED); - if (AttributesDiff) - (void)SetFileAttributesW(pathname, info.dwFileAttributes ^ AttributesDiff); -#else - int flags = unix_mode_bits ? O_CREAT : 0; - switch (purpose) { - default: - return EINVAL; - case MDBX_OPEN_LCK: - flags |= O_RDWR; - break; - case MDBX_OPEN_DXB_READ: - flags = O_RDONLY; - break; - case MDBX_OPEN_DXB_LAZY: - flags |= O_RDWR; - break; - case MDBX_OPEN_COPY: - flags = O_CREAT | O_WRONLY | O_EXCL; - break; - case MDBX_OPEN_DXB_DSYNC: - flags |= O_WRONLY; -#if defined(O_DSYNC) - flags |= O_DSYNC; -#elif defined(O_SYNC) - flags |= O_SYNC; -#elif defined(O_FSYNC) - flags |= O_FSYNC; -#endif - break; - case MDBX_OPEN_DELETE: - flags = O_RDWR; - break; + MDBX_val stub_data; + if (data == nullptr) { + const unsigned mask = + 1 << MDBX_GET_BOTH | 1 << MDBX_GET_BOTH_RANGE | 1 << MDBX_SET_KEY; + if (unlikely(mask & (1 << move_op))) + return MDBX_EINVAL; + stub_data.iov_base = nullptr; + stub_data.iov_len = 0; + data = &stub_data; } - const bool direct_nocache_for_copy = - env->me_psize >= env->me_os_psize && purpose == MDBX_OPEN_COPY; - if (direct_nocache_for_copy) { -#if defined(O_DIRECT) - flags |= O_DIRECT; -#endif /* O_DIRECT */ -#if defined(O_NOCACHE) - flags |= O_NOCACHE; -#endif /* O_NOCACHE */ + MDBX_val stub_key; + if (key == nullptr) { + const unsigned mask = 1 << MDBX_GET_BOTH | 1 << 
MDBX_GET_BOTH_RANGE | + 1 << MDBX_SET_KEY | 1 << MDBX_SET | + 1 << MDBX_SET_RANGE; + if (unlikely(mask & (1 << move_op))) + return MDBX_EINVAL; + stub_key.iov_base = nullptr; + stub_key.iov_len = 0; + key = &stub_key; } -#ifdef O_CLOEXEC - flags |= O_CLOEXEC; -#endif /* O_CLOEXEC */ - - /* Safeguard for https://libmdbx.dqdkfa.ru/dead-github/issues/144 */ -#if STDIN_FILENO == 0 && STDOUT_FILENO == 1 && STDERR_FILENO == 2 - int stub_fd0 = -1, stub_fd1 = -1, stub_fd2 = -1; - static const char dev_null[] = "/dev/null"; - if (!is_valid_fd(STDIN_FILENO)) { - WARNING("STD%s_FILENO/%d is invalid, open %s for temporary stub", "IN", - STDIN_FILENO, dev_null); - stub_fd0 = open(dev_null, O_RDONLY | O_NOCTTY); - } - if (!is_valid_fd(STDOUT_FILENO)) { - WARNING("STD%s_FILENO/%d is invalid, open %s for temporary stub", "OUT", - STDOUT_FILENO, dev_null); - stub_fd1 = open(dev_null, O_WRONLY | O_NOCTTY); - } - if (!is_valid_fd(STDERR_FILENO)) { - WARNING("STD%s_FILENO/%d is invalid, open %s for temporary stub", "ERR", - STDERR_FILENO, dev_null); - stub_fd2 = open(dev_null, O_WRONLY | O_NOCTTY); - } -#else -#error "Unexpected or unsupported UNIX or POSIX system" -#endif /* STDIN_FILENO == 0 && STDERR_FILENO == 2 */ + next.outer.signature = cur_signature_live; + rc = cursor_ops(&next.outer, key, data, move_op); + if (unlikely(rc != MDBX_SUCCESS && + (rc != MDBX_NOTFOUND || !is_pointed(&next.outer)))) + return rc; - *fd = open(pathname, flags, unix_mode_bits); -#if defined(O_DIRECT) - if (*fd < 0 && (flags & O_DIRECT) && - (errno == EINVAL || errno == EAFNOSUPPORT)) { - flags &= ~(O_DIRECT | O_EXCL); - *fd = open(pathname, flags, unix_mode_bits); + if (move_op == MDBX_LAST) { + next.outer.flags |= z_eof; + next.inner.cursor.flags |= z_eof; } -#endif /* O_DIRECT */ + return mdbx_estimate_distance(cursor, &next.outer, distance_items); +} - if (*fd < 0 && errno == EACCES && purpose == MDBX_OPEN_LCK) { - struct stat unused; - if (stat(pathname, &unused) == 0 || errno != ENOENT) - errno = 
EACCES /* restore errno if file exists */; +__hot int mdbx_estimate_range(const MDBX_txn *txn, MDBX_dbi dbi, + const MDBX_val *begin_key, + const MDBX_val *begin_data, + const MDBX_val *end_key, const MDBX_val *end_data, + ptrdiff_t *size_items) { + int rc = check_txn(txn, MDBX_TXN_BLOCKED); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; + + if (unlikely(!size_items)) + return MDBX_EINVAL; + + if (unlikely(begin_data && + (begin_key == nullptr || begin_key == MDBX_EPSILON))) + return MDBX_EINVAL; + + if (unlikely(end_data && (end_key == nullptr || end_key == MDBX_EPSILON))) + return MDBX_EINVAL; + + if (unlikely(begin_key == MDBX_EPSILON && end_key == MDBX_EPSILON)) + return MDBX_EINVAL; + + cursor_couple_t begin; + /* LY: first, initialize cursor to refresh a DB in case it have DB_STALE */ + rc = cursor_init(&begin.outer, txn, dbi); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; + + if (unlikely(begin.outer.tree->items == 0)) { + *size_items = 0; + return MDBX_SUCCESS; } - /* Safeguard for https://libmdbx.dqdkfa.ru/dead-github/issues/144 */ -#if STDIN_FILENO == 0 && STDOUT_FILENO == 1 && STDERR_FILENO == 2 - if (*fd == STDIN_FILENO) { - WARNING("Got STD%s_FILENO/%d, avoid using it by dup(fd)", "IN", - STDIN_FILENO); - assert(stub_fd0 == -1); - *fd = dup(stub_fd0 = *fd); + if (!begin_key) { + if (unlikely(!end_key)) { + /* LY: FIRST..LAST case */ + *size_items = (ptrdiff_t)begin.outer.tree->items; + return MDBX_SUCCESS; + } + rc = outer_first(&begin.outer, nullptr, nullptr); + if (unlikely(end_key == MDBX_EPSILON)) { + /* LY: FIRST..+epsilon case */ + return (rc == MDBX_SUCCESS) + ? mdbx_cursor_count(&begin.outer, (size_t *)size_items) + : rc; + } + } else { + if (unlikely(begin_key == MDBX_EPSILON)) { + if (end_key == nullptr) { + /* LY: -epsilon..LAST case */ + rc = outer_last(&begin.outer, nullptr, nullptr); + return (rc == MDBX_SUCCESS) + ? 
mdbx_cursor_count(&begin.outer, (size_t *)size_items) + : rc; + } + /* LY: -epsilon..value case */ + assert(end_key != MDBX_EPSILON); + begin_key = end_key; + } else if (unlikely(end_key == MDBX_EPSILON)) { + /* LY: value..+epsilon case */ + assert(begin_key != MDBX_EPSILON); + end_key = begin_key; + } + if (end_key && !begin_data && !end_data && + (begin_key == end_key || + begin.outer.clc->k.cmp(begin_key, end_key) == 0)) { + /* LY: single key case */ + rc = cursor_seek(&begin.outer, (MDBX_val *)begin_key, nullptr, MDBX_SET) + .err; + if (unlikely(rc != MDBX_SUCCESS)) { + *size_items = 0; + return (rc == MDBX_NOTFOUND) ? MDBX_SUCCESS : rc; + } + *size_items = 1; + if (inner_pointed(&begin.outer)) + *size_items = + (sizeof(*size_items) >= sizeof(begin.inner.nested_tree.items) || + begin.inner.nested_tree.items <= PTRDIFF_MAX) + ? (size_t)begin.inner.nested_tree.items + : PTRDIFF_MAX; + + return MDBX_SUCCESS; + } else { + MDBX_val proxy_key = *begin_key; + MDBX_val proxy_data = {nullptr, 0}; + if (begin_data) + proxy_data = *begin_data; + rc = cursor_seek(&begin.outer, &proxy_key, &proxy_data, + MDBX_SET_LOWERBOUND) + .err; + } } - if (*fd == STDOUT_FILENO) { - WARNING("Got STD%s_FILENO/%d, avoid using it by dup(fd)", "OUT", - STDOUT_FILENO); - assert(stub_fd1 == -1); - *fd = dup(stub_fd1 = *fd); + + if (unlikely(rc != MDBX_SUCCESS)) { + if (rc != MDBX_NOTFOUND || !is_pointed(&begin.outer)) + return rc; } - if (*fd == STDERR_FILENO) { - WARNING("Got STD%s_FILENO/%d, avoid using it by dup(fd)", "ERR", - STDERR_FILENO); - assert(stub_fd2 == -1); - *fd = dup(stub_fd2 = *fd); + + cursor_couple_t end; + rc = cursor_init(&end.outer, txn, dbi); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; + if (!end_key) { + rc = outer_last(&end.outer, nullptr, nullptr); + end.outer.flags |= z_eof | z_hollow; + end.inner.cursor.flags |= z_eof | z_hollow; + } else { + MDBX_val proxy_key = *end_key; + MDBX_val proxy_data = {nullptr, 0}; + if (end_data) + proxy_data = *end_data; + rc = 
cursor_seek(&end.outer, &proxy_key, &proxy_data, MDBX_SET_LOWERBOUND) + .err; } - if (stub_fd0 != -1) - close(stub_fd0); - if (stub_fd1 != -1) - close(stub_fd1); - if (stub_fd2 != -1) - close(stub_fd2); - if (*fd >= STDIN_FILENO && *fd <= STDERR_FILENO) { - ERROR("Rejecting the use of a FD in the range " - "STDIN_FILENO/%d..STDERR_FILENO/%d to prevent database corruption", - STDIN_FILENO, STDERR_FILENO); - close(*fd); - return EBADF; + if (unlikely(rc != MDBX_SUCCESS)) { + if (rc != MDBX_NOTFOUND || !is_pointed(&end.outer)) + return rc; } -#else -#error "Unexpected or unsupported UNIX or POSIX system" -#endif /* STDIN_FILENO == 0 && STDERR_FILENO == 2 */ - if (*fd < 0) - return errno; + rc = mdbx_estimate_distance(&begin.outer, &end.outer, size_items); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; + assert(*size_items >= -(ptrdiff_t)begin.outer.tree->items && + *size_items <= (ptrdiff_t)begin.outer.tree->items); -#if defined(FD_CLOEXEC) && !defined(O_CLOEXEC) - const int fd_flags = fcntl(*fd, F_GETFD); - if (fd_flags != -1) - (void)fcntl(*fd, F_SETFD, fd_flags | FD_CLOEXEC); -#endif /* FD_CLOEXEC && !O_CLOEXEC */ +#if 0 /* LY: Was decided to returns as-is (i.e. negative) the estimation \ + * results for an inverted ranges. 
*/ - if (direct_nocache_for_copy) { -#if defined(F_NOCACHE) && !defined(O_NOCACHE) - (void)fcntl(*fd, F_NOCACHE, 1); -#endif /* F_NOCACHE */ - } + /* Commit 8ddfd1f34ad7cf7a3c4aa75d2e248ca7e639ed63 + Change-Id: If59eccf7311123ab6384c4b93f9b1fed5a0a10d1 */ + if (*size_items < 0) { + /* LY: inverted range case */ + *size_items += (ptrdiff_t)begin.outer.tree->items; + } else if (*size_items == 0 && begin_key && end_key) { + int cmp = begin.outer.kvx->cmp(&origin_begin_key, &origin_end_key); + if (cmp == 0 && cursor_pointed(begin.inner.cursor.flags) && + begin_data && end_data) + cmp = begin.outer.kvx->v.cmp(&origin_begin_data, &origin_end_data); + if (cmp > 0) { + /* LY: inverted range case with empty scope */ + *size_items = (ptrdiff_t)begin.outer.tree->items; + } + } + assert(*size_items >= 0 && + *size_items <= (ptrdiff_t)begin.outer.tree->items); #endif + return MDBX_SUCCESS; } +/// \copyright SPDX-License-Identifier: Apache-2.0 +/// \author Леонид Юрьев aka Leonid Yuriev \date 2015-2024 -MDBX_INTERNAL_FUNC int osal_closefile(mdbx_filehandle_t fd) { -#if defined(_WIN32) || defined(_WIN64) - return CloseHandle(fd) ? MDBX_SUCCESS : (int)GetLastError(); + +#if MDBX_ENABLE_REFUND +static void refund_reclaimed(MDBX_txn *txn) { + /* Scanning in descend order */ + pgno_t first_unallocated = txn->geo.first_unallocated; + const pnl_t pnl = txn->tw.relist; + tASSERT(txn, + MDBX_PNL_GETSIZE(pnl) && MDBX_PNL_MOST(pnl) == first_unallocated - 1); +#if MDBX_PNL_ASCENDING + size_t i = MDBX_PNL_GETSIZE(pnl); + tASSERT(txn, pnl[i] == first_unallocated - 1); + while (--first_unallocated, --i > 0 && pnl[i] == first_unallocated - 1) + ; + MDBX_PNL_SETSIZE(pnl, i); #else - assert(fd > STDERR_FILENO); - return (close(fd) == 0) ? 
MDBX_SUCCESS : errno; + size_t i = 1; + tASSERT(txn, pnl[i] == first_unallocated - 1); + size_t len = MDBX_PNL_GETSIZE(pnl); + while (--first_unallocated, ++i <= len && pnl[i] == first_unallocated - 1) + ; + MDBX_PNL_SETSIZE(pnl, len -= i - 1); + for (size_t move = 0; move < len; ++move) + pnl[1 + move] = pnl[i + move]; #endif + VERBOSE("refunded %" PRIaPGNO " pages: %" PRIaPGNO " -> %" PRIaPGNO, + txn->geo.first_unallocated - first_unallocated, + txn->geo.first_unallocated, first_unallocated); + txn->geo.first_unallocated = first_unallocated; + tASSERT(txn, + pnl_check_allocated(txn->tw.relist, txn->geo.first_unallocated - 1)); } -MDBX_INTERNAL_FUNC int osal_pread(mdbx_filehandle_t fd, void *buf, size_t bytes, - uint64_t offset) { - if (bytes > MAX_WRITE) - return MDBX_EINVAL; -#if defined(_WIN32) || defined(_WIN64) - OVERLAPPED ov; - ov.hEvent = 0; - ov.Offset = (DWORD)offset; - ov.OffsetHigh = HIGH_DWORD(offset); +static void refund_loose(MDBX_txn *txn) { + tASSERT(txn, txn->tw.loose_pages != nullptr); + tASSERT(txn, txn->tw.loose_count > 0); - DWORD read = 0; - if (unlikely(!ReadFile(fd, buf, (DWORD)bytes, &read, &ov))) { - int rc = (int)GetLastError(); - return (rc == MDBX_SUCCESS) ? /* paranoia */ ERROR_READ_FAULT : rc; - } -#else - STATIC_ASSERT_MSG(sizeof(off_t) >= sizeof(size_t), - "libmdbx requires 64-bit file I/O on 64-bit systems"); - intptr_t read = pread(fd, buf, bytes, offset); - if (read < 0) { - int rc = errno; - return (rc == MDBX_SUCCESS) ? /* paranoia */ MDBX_EIO : rc; + dpl_t *const dl = txn->tw.dirtylist; + if (dl) { + tASSERT(txn, dl->length >= txn->tw.loose_count); + tASSERT(txn, (txn->flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC); + } else { + tASSERT(txn, (txn->flags & MDBX_WRITEMAP) != 0 && !MDBX_AVOID_MSYNC); } -#endif - return (bytes == (size_t)read) ? 
MDBX_SUCCESS : MDBX_ENODATA; -} -MDBX_INTERNAL_FUNC int osal_pwrite(mdbx_filehandle_t fd, const void *buf, - size_t bytes, uint64_t offset) { - while (true) { -#if defined(_WIN32) || defined(_WIN64) - OVERLAPPED ov; - ov.hEvent = 0; - ov.Offset = (DWORD)offset; - ov.OffsetHigh = HIGH_DWORD(offset); + pgno_t onstack[MDBX_CACHELINE_SIZE * 8 / sizeof(pgno_t)]; + pnl_t suitable = onstack; + + if (!dl || dl->length - dl->sorted > txn->tw.loose_count) { + /* Dirty list is useless since unsorted. */ + if (pnl_bytes2size(sizeof(onstack)) < txn->tw.loose_count) { + suitable = pnl_alloc(txn->tw.loose_count); + if (unlikely(!suitable)) + return /* this is not a reason for transaction fail */; + } + + /* Collect loose-pages which may be refunded. */ + tASSERT(txn, + txn->geo.first_unallocated >= MIN_PAGENO + txn->tw.loose_count); + pgno_t most = MIN_PAGENO; + size_t w = 0; + for (const page_t *lp = txn->tw.loose_pages; lp; lp = page_next(lp)) { + tASSERT(txn, lp->flags == P_LOOSE); + tASSERT(txn, txn->geo.first_unallocated > lp->pgno); + if (likely(txn->geo.first_unallocated - txn->tw.loose_count <= + lp->pgno)) { + tASSERT(txn, + w < ((suitable == onstack) ? pnl_bytes2size(sizeof(onstack)) + : MDBX_PNL_ALLOCLEN(suitable))); + suitable[++w] = lp->pgno; + most = (lp->pgno > most) ? lp->pgno : most; + } + MDBX_ASAN_UNPOISON_MEMORY_REGION(&page_next(lp), sizeof(page_t *)); + VALGRIND_MAKE_MEM_DEFINED(&page_next(lp), sizeof(page_t *)); + } + + if (most + 1 == txn->geo.first_unallocated) { + /* Sort suitable list and refund pages at the tail. */ + MDBX_PNL_SETSIZE(suitable, w); + pnl_sort(suitable, MAX_PAGENO + 1); + + /* Scanning in descend order */ + const intptr_t step = MDBX_PNL_ASCENDING ? -1 : 1; + const intptr_t begin = + MDBX_PNL_ASCENDING ? MDBX_PNL_GETSIZE(suitable) : 1; + const intptr_t end = + MDBX_PNL_ASCENDING ? 
0 : MDBX_PNL_GETSIZE(suitable) + 1; + tASSERT(txn, suitable[begin] >= suitable[end - step]); + tASSERT(txn, most == suitable[begin]); + + for (intptr_t i = begin + step; i != end; i += step) { + if (suitable[i] != most - 1) + break; + most -= 1; + } + const size_t refunded = txn->geo.first_unallocated - most; + DEBUG("refund-suitable %zu pages %" PRIaPGNO " -> %" PRIaPGNO, refunded, + most, txn->geo.first_unallocated); + txn->geo.first_unallocated = most; + txn->tw.loose_count -= refunded; + if (dl) { + txn->tw.dirtyroom += refunded; + dl->pages_including_loose -= refunded; + assert(txn->tw.dirtyroom <= txn->env->options.dp_limit); + + /* Filter-out dirty list */ + size_t r = 0; + w = 0; + if (dl->sorted) { + do { + if (dl->items[++r].pgno < most) { + if (++w != r) + dl->items[w] = dl->items[r]; + } + } while (r < dl->sorted); + dl->sorted = w; + } + while (r < dl->length) { + if (dl->items[++r].pgno < most) { + if (++w != r) + dl->items[w] = dl->items[r]; + } + } + dpl_setlen(dl, w); + tASSERT(txn, txn->tw.dirtyroom + txn->tw.dirtylist->length == + (txn->parent ? txn->parent->tw.dirtyroom + : txn->env->options.dp_limit)); + } + goto unlink_loose; + } + } else { + /* Dirtylist is mostly sorted, just refund loose pages at the end. */ + dpl_sort(txn); + tASSERT(txn, + dl->length < 2 || dl->items[1].pgno < dl->items[dl->length].pgno); + tASSERT(txn, dl->sorted == dl->length); - DWORD written; - if (unlikely(!WriteFile( - fd, buf, likely(bytes <= MAX_WRITE) ? (DWORD)bytes : MAX_WRITE, - &written, &ov))) - return (int)GetLastError(); - if (likely(bytes == written)) - return MDBX_SUCCESS; -#else - STATIC_ASSERT_MSG(sizeof(off_t) >= sizeof(size_t), - "libmdbx requires 64-bit file I/O on 64-bit systems"); - const intptr_t written = - pwrite(fd, buf, likely(bytes <= MAX_WRITE) ? 
bytes : MAX_WRITE, offset); - if (likely(bytes == (size_t)written)) - return MDBX_SUCCESS; - if (written < 0) { - const int rc = errno; - if (rc != EINTR) - return rc; - continue; + /* Scan dirtylist tail-forward and cutoff suitable pages. */ + size_t n; + for (n = dl->length; dl->items[n].pgno == txn->geo.first_unallocated - 1 && + dl->items[n].ptr->flags == P_LOOSE; + --n) { + tASSERT(txn, n > 0); + page_t *dp = dl->items[n].ptr; + DEBUG("refund-sorted page %" PRIaPGNO, dp->pgno); + tASSERT(txn, dp->pgno == dl->items[n].pgno); + txn->geo.first_unallocated -= 1; } -#endif - bytes -= written; - offset += written; - buf = ptr_disp(buf, written); - } -} + dpl_setlen(dl, n); -MDBX_INTERNAL_FUNC int osal_write(mdbx_filehandle_t fd, const void *buf, - size_t bytes) { - while (true) { -#if defined(_WIN32) || defined(_WIN64) - DWORD written; - if (unlikely(!WriteFile( - fd, buf, likely(bytes <= MAX_WRITE) ? (DWORD)bytes : MAX_WRITE, - &written, nullptr))) - return (int)GetLastError(); - if (likely(bytes == written)) - return MDBX_SUCCESS; -#else - STATIC_ASSERT_MSG(sizeof(off_t) >= sizeof(size_t), - "libmdbx requires 64-bit file I/O on 64-bit systems"); - const intptr_t written = - write(fd, buf, likely(bytes <= MAX_WRITE) ? bytes : MAX_WRITE); - if (likely(bytes == (size_t)written)) - return MDBX_SUCCESS; - if (written < 0) { - const int rc = errno; - if (rc != EINTR) - return rc; - continue; + if (dl->sorted != dl->length) { + const size_t refunded = dl->sorted - dl->length; + dl->sorted = dl->length; + txn->tw.loose_count -= refunded; + txn->tw.dirtyroom += refunded; + dl->pages_including_loose -= refunded; + tASSERT(txn, txn->tw.dirtyroom + txn->tw.dirtylist->length == + (txn->parent ? txn->parent->tw.dirtyroom + : txn->env->options.dp_limit)); + + /* Filter-out loose chain & dispose refunded pages. 
*/ + unlink_loose: + for (page_t *__restrict *__restrict link = &txn->tw.loose_pages; *link;) { + page_t *dp = *link; + tASSERT(txn, dp->flags == P_LOOSE); + MDBX_ASAN_UNPOISON_MEMORY_REGION(&page_next(dp), sizeof(page_t *)); + VALGRIND_MAKE_MEM_DEFINED(&page_next(dp), sizeof(page_t *)); + if (txn->geo.first_unallocated > dp->pgno) { + link = &page_next(dp); + } else { + *link = page_next(dp); + if ((txn->flags & MDBX_WRITEMAP) == 0) + page_shadow_release(txn->env, dp, 1); + } + } } -#endif - bytes -= written; - buf = ptr_disp(buf, written); } -} -int osal_pwritev(mdbx_filehandle_t fd, struct iovec *iov, size_t sgvcnt, - uint64_t offset) { - size_t expected = 0; - for (size_t i = 0; i < sgvcnt; ++i) - expected += iov[i].iov_len; -#if !MDBX_HAVE_PWRITEV - size_t written = 0; - for (size_t i = 0; i < sgvcnt; ++i) { - int rc = osal_pwrite(fd, iov[i].iov_base, iov[i].iov_len, offset); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; - written += iov[i].iov_len; - offset += iov[i].iov_len; - } - return (expected == written) ? MDBX_SUCCESS - : MDBX_EIO /* ERROR_WRITE_FAULT */; -#else - int rc; - intptr_t written; - do { - STATIC_ASSERT_MSG(sizeof(off_t) >= sizeof(size_t), - "libmdbx requires 64-bit file I/O on 64-bit systems"); - written = pwritev(fd, iov, sgvcnt, offset); - if (likely(expected == (size_t)written)) - return MDBX_SUCCESS; - rc = errno; - } while (rc == EINTR); - return (written < 0) ? rc : MDBX_EIO /* Use which error code? 
*/; -#endif + tASSERT(txn, dpl_check(txn)); + if (suitable != onstack) + pnl_free(suitable); + txn->tw.loose_refund_wl = txn->geo.first_unallocated; } -MDBX_INTERNAL_FUNC int osal_fsync(mdbx_filehandle_t fd, - enum osal_syncmode_bits mode_bits) { -#if defined(_WIN32) || defined(_WIN64) - if ((mode_bits & (MDBX_SYNC_DATA | MDBX_SYNC_IODQ)) && !FlushFileBuffers(fd)) - return (int)GetLastError(); - return MDBX_SUCCESS; -#else +bool txn_refund(MDBX_txn *txn) { + const pgno_t before = txn->geo.first_unallocated; -#if defined(__APPLE__) && \ - MDBX_OSX_SPEED_INSTEADOF_DURABILITY == MDBX_OSX_WANNA_DURABILITY - if (mode_bits & MDBX_SYNC_IODQ) - return likely(fcntl(fd, F_FULLFSYNC) != -1) ? MDBX_SUCCESS : errno; -#endif /* MacOS */ + if (txn->tw.loose_pages && + txn->tw.loose_refund_wl > txn->geo.first_unallocated) + refund_loose(txn); - /* LY: This approach is always safe and without appreciable performance - * degradation, even on a kernel with fdatasync's bug. - * - * For more info about of a corresponding fdatasync() bug - * see http://www.spinics.net/lists/linux-ext4/msg33714.html */ - while (1) { - switch (mode_bits & (MDBX_SYNC_DATA | MDBX_SYNC_SIZE)) { - case MDBX_SYNC_NONE: - case MDBX_SYNC_KICK: - return MDBX_SUCCESS /* nothing to do */; -#if defined(_POSIX_SYNCHRONIZED_IO) && _POSIX_SYNCHRONIZED_IO > 0 - case MDBX_SYNC_DATA: - if (likely(fdatasync(fd) == 0)) - return MDBX_SUCCESS; - break /* error */; -#if defined(__linux__) || defined(__gnu_linux__) - case MDBX_SYNC_SIZE: - assert(linux_kernel_version >= 0x03060000); - return MDBX_SUCCESS; -#endif /* Linux */ -#endif /* _POSIX_SYNCHRONIZED_IO > 0 */ - default: - if (likely(fsync(fd) == 0)) - return MDBX_SUCCESS; - } + while (true) { + if (MDBX_PNL_GETSIZE(txn->tw.relist) == 0 || + MDBX_PNL_MOST(txn->tw.relist) != txn->geo.first_unallocated - 1) + break; - int rc = errno; - if (rc != EINTR) - return rc; + refund_reclaimed(txn); + if (!txn->tw.loose_pages || + txn->tw.loose_refund_wl <= txn->geo.first_unallocated) 
+ break; + + const pgno_t memo = txn->geo.first_unallocated; + refund_loose(txn); + if (memo == txn->geo.first_unallocated) + break; } -#endif -} -int osal_filesize(mdbx_filehandle_t fd, uint64_t *length) { -#if defined(_WIN32) || defined(_WIN64) - BY_HANDLE_FILE_INFORMATION info; - if (!GetFileInformationByHandle(fd, &info)) - return (int)GetLastError(); - *length = info.nFileSizeLow | (uint64_t)info.nFileSizeHigh << 32; -#else - struct stat st; + if (before == txn->geo.first_unallocated) + return false; - STATIC_ASSERT_MSG(sizeof(off_t) <= sizeof(uint64_t), - "libmdbx requires 64-bit file I/O on 64-bit systems"); - if (fstat(fd, &st)) - return errno; + if (txn->tw.spilled.list) + /* Squash deleted pagenums if we refunded any */ + spill_purge(txn); - *length = st.st_size; -#endif - return MDBX_SUCCESS; + return true; } -MDBX_INTERNAL_FUNC int osal_is_pipe(mdbx_filehandle_t fd) { -#if defined(_WIN32) || defined(_WIN64) - switch (GetFileType(fd)) { - case FILE_TYPE_DISK: - return MDBX_RESULT_FALSE; - case FILE_TYPE_CHAR: - case FILE_TYPE_PIPE: - return MDBX_RESULT_TRUE; - default: - return (int)GetLastError(); - } -#else - struct stat info; - if (fstat(fd, &info)) - return errno; - switch (info.st_mode & S_IFMT) { - case S_IFBLK: - case S_IFREG: - return MDBX_RESULT_FALSE; - case S_IFCHR: - case S_IFIFO: - case S_IFSOCK: - return MDBX_RESULT_TRUE; - case S_IFDIR: - case S_IFLNK: - default: - return MDBX_INCOMPATIBLE; - } -#endif +#else /* MDBX_ENABLE_REFUND */ + +bool txn_refund(MDBX_txn *txn) { + (void)txn; + /* No online auto-compactification. */ + return false; } -MDBX_INTERNAL_FUNC int osal_ftruncate(mdbx_filehandle_t fd, uint64_t length) { -#if defined(_WIN32) || defined(_WIN64) - if (mdbx_SetFileInformationByHandle) { - FILE_END_OF_FILE_INFO EndOfFileInfo; - EndOfFileInfo.EndOfFile.QuadPart = length; - return mdbx_SetFileInformationByHandle(fd, FileEndOfFileInfo, - &EndOfFileInfo, - sizeof(FILE_END_OF_FILE_INFO)) - ? 
MDBX_SUCCESS - : (int)GetLastError(); - } else { - LARGE_INTEGER li; - li.QuadPart = length; - return (SetFilePointerEx(fd, li, NULL, FILE_BEGIN) && SetEndOfFile(fd)) - ? MDBX_SUCCESS - : (int)GetLastError(); +#endif /* MDBX_ENABLE_REFUND */ +/// \copyright SPDX-License-Identifier: Apache-2.0 +/// \author Леонид Юрьев aka Leonid Yuriev \date 2015-2024 + + +void spill_remove(MDBX_txn *txn, size_t idx, size_t npages) { + tASSERT(txn, idx > 0 && idx <= MDBX_PNL_GETSIZE(txn->tw.spilled.list) && + txn->tw.spilled.least_removed > 0); + txn->tw.spilled.least_removed = (idx < txn->tw.spilled.least_removed) + ? idx + : txn->tw.spilled.least_removed; + txn->tw.spilled.list[idx] |= 1; + MDBX_PNL_SETSIZE(txn->tw.spilled.list, + MDBX_PNL_GETSIZE(txn->tw.spilled.list) - + (idx == MDBX_PNL_GETSIZE(txn->tw.spilled.list))); + + while (unlikely(npages > 1)) { + const pgno_t pgno = (txn->tw.spilled.list[idx] >> 1) + 1; + if (MDBX_PNL_ASCENDING) { + if (++idx > MDBX_PNL_GETSIZE(txn->tw.spilled.list) || + (txn->tw.spilled.list[idx] >> 1) != pgno) + return; + } else { + if (--idx < 1 || (txn->tw.spilled.list[idx] >> 1) != pgno) + return; + txn->tw.spilled.least_removed = (idx < txn->tw.spilled.least_removed) + ? idx + : txn->tw.spilled.least_removed; + } + txn->tw.spilled.list[idx] |= 1; + MDBX_PNL_SETSIZE(txn->tw.spilled.list, + MDBX_PNL_GETSIZE(txn->tw.spilled.list) - + (idx == MDBX_PNL_GETSIZE(txn->tw.spilled.list))); + --npages; } -#else - STATIC_ASSERT_MSG(sizeof(off_t) >= sizeof(size_t), - "libmdbx requires 64-bit file I/O on 64-bit systems"); - return ftruncate(fd, length) == 0 ? MDBX_SUCCESS : errno; -#endif } -MDBX_INTERNAL_FUNC int osal_fseek(mdbx_filehandle_t fd, uint64_t pos) { -#if defined(_WIN32) || defined(_WIN64) - LARGE_INTEGER li; - li.QuadPart = pos; - return SetFilePointerEx(fd, li, NULL, FILE_BEGIN) ? 
MDBX_SUCCESS - : (int)GetLastError(); -#else - STATIC_ASSERT_MSG(sizeof(off_t) >= sizeof(size_t), - "libmdbx requires 64-bit file I/O on 64-bit systems"); - return (lseek(fd, pos, SEEK_SET) < 0) ? errno : MDBX_SUCCESS; -#endif +pnl_t spill_purge(MDBX_txn *txn) { + tASSERT(txn, txn->tw.spilled.least_removed > 0); + const pnl_t sl = txn->tw.spilled.list; + if (txn->tw.spilled.least_removed != INT_MAX) { + size_t len = MDBX_PNL_GETSIZE(sl), r, w; + for (w = r = txn->tw.spilled.least_removed; r <= len; ++r) { + sl[w] = sl[r]; + w += 1 - (sl[r] & 1); + } + for (size_t i = 1; i < w; ++i) + tASSERT(txn, (sl[i] & 1) == 0); + MDBX_PNL_SETSIZE(sl, w - 1); + txn->tw.spilled.least_removed = INT_MAX; + } else { + for (size_t i = 1; i <= MDBX_PNL_GETSIZE(sl); ++i) + tASSERT(txn, (sl[i] & 1) == 0); + } + return sl; } /*----------------------------------------------------------------------------*/ -MDBX_INTERNAL_FUNC int -osal_thread_create(osal_thread_t *thread, - THREAD_RESULT(THREAD_CALL *start_routine)(void *), - void *arg) { -#if defined(_WIN32) || defined(_WIN64) - *thread = CreateThread(NULL, 0, start_routine, arg, 0, NULL); - return *thread ? 
MDBX_SUCCESS : (int)GetLastError(); -#else - return pthread_create(thread, NULL, start_routine, arg); -#endif +static int spill_page(MDBX_txn *txn, iov_ctx_t *ctx, page_t *dp, + const size_t npages) { + tASSERT(txn, !(txn->flags & MDBX_WRITEMAP)); +#if MDBX_ENABLE_PGOP_STAT + txn->env->lck->pgops.spill.weak += npages; +#endif /* MDBX_ENABLE_PGOP_STAT */ + const pgno_t pgno = dp->pgno; + int err = iov_page(txn, ctx, dp, npages); + if (likely(err == MDBX_SUCCESS)) + err = spill_append_span(&txn->tw.spilled.list, pgno, npages); + return err; } -MDBX_INTERNAL_FUNC int osal_thread_join(osal_thread_t thread) { -#if defined(_WIN32) || defined(_WIN64) - DWORD code = WaitForSingleObject(thread, INFINITE); - return waitstatus2errcode(code); -#else - void *unused_retval = &unused_retval; - return pthread_join(thread, &unused_retval); -#endif +/* Set unspillable LRU-label for dirty pages watched by txn. + * Returns the number of pages marked as unspillable. */ +static size_t spill_cursor_keep(const MDBX_txn *const txn, + const MDBX_cursor *mc) { + tASSERT(txn, (txn->flags & (MDBX_TXN_RDONLY | MDBX_WRITEMAP)) == 0); + size_t keep = 0; + while (!is_poor(mc)) { + tASSERT(txn, mc->top >= 0); + const page_t *mp; + intptr_t i = 0; + do { + mp = mc->pg[i]; + tASSERT(txn, !is_subpage(mp)); + if (is_modifable(txn, mp)) { + size_t const n = dpl_search(txn, mp->pgno); + if (txn->tw.dirtylist->items[n].pgno == mp->pgno && + /* не считаем дважды */ dpl_age(txn, n)) { + size_t *const ptr = ptr_disp(txn->tw.dirtylist->items[n].ptr, + -(ptrdiff_t)sizeof(size_t)); + *ptr = txn->tw.dirtylru; + tASSERT(txn, dpl_age(txn, n) == 0); + ++keep; + } + } + } while (++i <= mc->top); + + tASSERT(txn, is_leaf(mp)); + if (!mc->subcur || mc->ki[mc->top] >= page_numkeys(mp)) + break; + if (!(node_flags(page_node(mp, mc->ki[mc->top])) & N_SUBDATA)) + break; + mc = &mc->subcur->cursor; + } + return keep; } -/*----------------------------------------------------------------------------*/ +static size_t 
spill_txn_keep(MDBX_txn *txn, MDBX_cursor *m0) { + tASSERT(txn, (txn->flags & (MDBX_TXN_RDONLY | MDBX_WRITEMAP)) == 0); + dpl_lru_turn(txn); + size_t keep = m0 ? spill_cursor_keep(txn, m0) : 0; -MDBX_INTERNAL_FUNC int osal_msync(const osal_mmap_t *map, size_t offset, - size_t length, - enum osal_syncmode_bits mode_bits) { - if (!MDBX_MMAP_USE_MS_ASYNC && mode_bits == MDBX_SYNC_NONE) - return MDBX_SUCCESS; + TXN_FOREACH_DBI_ALL(txn, dbi) { + if (F_ISSET(txn->dbi_state[dbi], DBI_DIRTY | DBI_VALID) && + txn->dbs[dbi].root != P_INVALID) + for (MDBX_cursor *mc = txn->cursors[dbi]; mc; mc = mc->next) + if (mc != m0) + keep += spill_cursor_keep(txn, mc); + } - void *ptr = ptr_disp(map->base, offset); -#if defined(_WIN32) || defined(_WIN64) - if (!FlushViewOfFile(ptr, length)) - return (int)GetLastError(); - if ((mode_bits & (MDBX_SYNC_DATA | MDBX_SYNC_IODQ)) && - !FlushFileBuffers(map->fd)) - return (int)GetLastError(); -#else -#if defined(__linux__) || defined(__gnu_linux__) - /* Since Linux 2.6.19, MS_ASYNC is in fact a no-op. The kernel properly - * tracks dirty pages and flushes ones as necessary. */ - // - // However, this behavior may be changed in custom kernels, - // so just leave such optimization to the libc discretion. - // NOTE: The MDBX_MMAP_USE_MS_ASYNC must be defined to 1 for such cases. - // - // assert(linux_kernel_version > 0x02061300); - // if (mode_bits <= MDBX_SYNC_KICK) - // return MDBX_SUCCESS; -#endif /* Linux */ - if (msync(ptr, length, (mode_bits & MDBX_SYNC_DATA) ? 
MS_SYNC : MS_ASYNC)) - return errno; - if ((mode_bits & MDBX_SYNC_SIZE) && fsync(map->fd)) - return errno; -#endif - return MDBX_SUCCESS; + return keep; } -MDBX_INTERNAL_FUNC int osal_check_fs_rdonly(mdbx_filehandle_t handle, - const pathchar_t *pathname, - int err) { -#if defined(_WIN32) || defined(_WIN64) - (void)pathname; - (void)err; - if (!mdbx_GetVolumeInformationByHandleW) - return MDBX_ENOSYS; - DWORD unused, flags; - if (!mdbx_GetVolumeInformationByHandleW(handle, nullptr, 0, nullptr, &unused, - &flags, nullptr, 0)) - return (int)GetLastError(); - if ((flags & FILE_READ_ONLY_VOLUME) == 0) - return MDBX_EACCESS; -#else - struct statvfs info; - if (err != MDBX_ENOFILE) { - if (statvfs(pathname, &info) == 0) - return (info.f_flag & ST_RDONLY) ? MDBX_SUCCESS : err; - if (errno != MDBX_ENOFILE) - return errno; +/* Returns the spilling priority (0..255) for a dirty page: + * 0 = should be spilled; + * ... + * > 255 = must not be spilled. */ +MDBX_NOTHROW_PURE_FUNCTION static unsigned +spill_prio(const MDBX_txn *txn, const size_t i, const uint32_t reciprocal) { + dpl_t *const dl = txn->tw.dirtylist; + const uint32_t age = dpl_age(txn, i); + const size_t npages = dpl_npages(dl, i); + const pgno_t pgno = dl->items[i].pgno; + if (age == 0) { + DEBUG("skip %s %zu page %" PRIaPGNO, "keep", npages, pgno); + return 256; } - if (fstatvfs(handle, &info)) - return errno; - if ((info.f_flag & ST_RDONLY) == 0) - return (err == MDBX_ENOFILE) ? MDBX_EACCESS : err; -#endif /* !Windows */ - return MDBX_SUCCESS; + + page_t *const dp = dl->items[i].ptr; + if (dp->flags & (P_LOOSE | P_SPILLED)) { + DEBUG("skip %s %zu page %" PRIaPGNO, + (dp->flags & P_LOOSE) ? "loose" : "parent-spilled", npages, pgno); + return 256; + } + + /* Can't spill twice, + * make sure it's not already in a parent's spill list(s). 
*/ + MDBX_txn *parent = txn->parent; + if (parent && (parent->flags & MDBX_TXN_SPILLS)) { + do + if (spill_intersect(parent, pgno, npages)) { + DEBUG("skip-2 parent-spilled %zu page %" PRIaPGNO, npages, pgno); + dp->flags |= P_SPILLED; + return 256; + } + while ((parent = parent->parent) != nullptr); + } + + tASSERT(txn, age * (uint64_t)reciprocal < UINT32_MAX); + unsigned prio = age * reciprocal >> 24; + tASSERT(txn, prio < 256); + if (likely(npages == 1)) + return prio = 256 - prio; + + /* make a large/overflow pages be likely to spill */ + size_t factor = npages | npages >> 1; + factor |= factor >> 2; + factor |= factor >> 4; + factor |= factor >> 8; + factor |= factor >> 16; + factor = (size_t)prio * log2n_powerof2(factor + 1) + /* golden ratio */ 157; + factor = (factor < 256) ? 255 - factor : 0; + tASSERT(txn, factor < 256 && factor < (256 - prio)); + return prio = (unsigned)factor; } -MDBX_INTERNAL_FUNC int osal_check_fs_incore(mdbx_filehandle_t handle) { -#if defined(_WIN32) || defined(_WIN64) - (void)handle; -#else - struct statfs statfs_info; - if (fstatfs(handle, &statfs_info)) - return errno; +static size_t spill_gate(const MDBX_env *env, intptr_t part, + const size_t total) { + const intptr_t spill_min = + env->options.spill_min_denominator + ? (total + env->options.spill_min_denominator - 1) / + env->options.spill_min_denominator + : 1; + const intptr_t spill_max = + total - (env->options.spill_max_denominator + ? total / env->options.spill_max_denominator + : 0); + part = (part < spill_max) ? part : spill_max; + part = (part > spill_min) ? 
part : spill_min; + eASSERT(env, part >= 0 && (size_t)part <= total); + return (size_t)part; +} -#if defined(__OpenBSD__) - const unsigned type = 0; +__cold int spill_slowpath(MDBX_txn *const txn, MDBX_cursor *const m0, + const intptr_t wanna_spill_entries, + const intptr_t wanna_spill_npages, + const size_t need) { + tASSERT(txn, (txn->flags & MDBX_TXN_RDONLY) == 0); + + int rc = MDBX_SUCCESS; + if (unlikely(txn->tw.loose_count >= + (txn->tw.dirtylist ? txn->tw.dirtylist->pages_including_loose + : txn->tw.writemap_dirty_npages))) + goto done; + + const size_t dirty_entries = + txn->tw.dirtylist ? (txn->tw.dirtylist->length - txn->tw.loose_count) : 1; + const size_t dirty_npages = + (txn->tw.dirtylist ? txn->tw.dirtylist->pages_including_loose + : txn->tw.writemap_dirty_npages) - + txn->tw.loose_count; + const size_t need_spill_entries = + spill_gate(txn->env, wanna_spill_entries, dirty_entries); + const size_t need_spill_npages = + spill_gate(txn->env, wanna_spill_npages, dirty_npages); + + const size_t need_spill = (need_spill_entries > need_spill_npages) + ? 
need_spill_entries + : need_spill_npages; + if (!need_spill) + goto done; + + if (txn->flags & MDBX_WRITEMAP) { + NOTICE("%s-spilling %zu dirty-entries, %zu dirty-npages", "msync", + dirty_entries, dirty_npages); + const MDBX_env *env = txn->env; + tASSERT(txn, txn->tw.spilled.list == nullptr); + rc = osal_msync(&txn->env->dxb_mmap, 0, + pgno_align2os_bytes(env, txn->geo.first_unallocated), + MDBX_SYNC_KICK); + if (unlikely(rc != MDBX_SUCCESS)) + goto bailout; +#if MDBX_AVOID_MSYNC + MDBX_ANALYSIS_ASSUME(txn->tw.dirtylist != nullptr); + tASSERT(txn, dpl_check(txn)); + env->lck->unsynced_pages.weak += + txn->tw.dirtylist->pages_including_loose - txn->tw.loose_count; + dpl_clear(txn->tw.dirtylist); + txn->tw.dirtyroom = env->options.dp_limit - txn->tw.loose_count; + for (page_t *lp = txn->tw.loose_pages; lp != nullptr; lp = page_next(lp)) { + tASSERT(txn, lp->flags == P_LOOSE); + rc = dpl_append(txn, lp->pgno, lp, 1); + if (unlikely(rc != MDBX_SUCCESS)) + goto bailout; + MDBX_ASAN_UNPOISON_MEMORY_REGION(&page_next(lp), sizeof(page_t *)); + VALGRIND_MAKE_MEM_DEFINED(&page_next(lp), sizeof(page_t *)); + } + tASSERT(txn, dpl_check(txn)); #else - const unsigned type = statfs_info.f_type; -#endif - switch (type) { - case 0x28cd3d45 /* CRAMFS_MAGIC */: - case 0x858458f6 /* RAMFS_MAGIC */: - case 0x01021994 /* TMPFS_MAGIC */: - case 0x73717368 /* SQUASHFS_MAGIC */: - case 0x7275 /* ROMFS_MAGIC */: - return MDBX_RESULT_TRUE; + tASSERT(txn, txn->tw.dirtylist == nullptr); + env->lck->unsynced_pages.weak += txn->tw.writemap_dirty_npages; + txn->tw.writemap_spilled_npages += txn->tw.writemap_dirty_npages; + txn->tw.writemap_dirty_npages = 0; +#endif /* MDBX_AVOID_MSYNC */ + goto done; + } + + NOTICE("%s-spilling %zu dirty-entries, %zu dirty-npages", "write", + need_spill_entries, need_spill_npages); + MDBX_ANALYSIS_ASSUME(txn->tw.dirtylist != nullptr); + tASSERT(txn, txn->tw.dirtylist->length - txn->tw.loose_count >= 1); + tASSERT(txn, txn->tw.dirtylist->pages_including_loose - 
txn->tw.loose_count >= + need_spill_npages); + if (!txn->tw.spilled.list) { + txn->tw.spilled.least_removed = INT_MAX; + txn->tw.spilled.list = pnl_alloc(need_spill); + if (unlikely(!txn->tw.spilled.list)) { + rc = MDBX_ENOMEM; + bailout: + txn->flags |= MDBX_TXN_ERROR; + return rc; + } + } else { + /* purge deleted slots */ + spill_purge(txn); + rc = pnl_reserve(&txn->tw.spilled.list, need_spill); + (void)rc /* ignore since the resulting list may be shorter + and pnl_append() will increase pnl on demand */ + ; } -#if defined(__FreeBSD__) || defined(__NetBSD__) || defined(__OpenBSD__) || \ - defined(__BSD__) || defined(__bsdi__) || defined(__DragonFly__) || \ - defined(__APPLE__) || defined(__MACH__) || defined(MFSNAMELEN) || \ - defined(MFSTYPENAMELEN) || defined(VFS_NAMELEN) - const char *const name = statfs_info.f_fstypename; - const size_t name_len = sizeof(statfs_info.f_fstypename); -#else - const char *const name = ""; - const size_t name_len = 0; -#endif - if (name_len) { - if (strncasecmp("tmpfs", name, 6) == 0 || - strncasecmp("mfs", name, 4) == 0 || - strncasecmp("ramfs", name, 6) == 0 || - strncasecmp("romfs", name, 6) == 0) - return MDBX_RESULT_TRUE; + /* Сортируем чтобы запись на диск была полее последовательна */ + dpl_t *const dl = dpl_sort(txn); + + /* Preserve pages which may soon be dirtied again */ + const size_t unspillable = spill_txn_keep(txn, m0); + if (unspillable + txn->tw.loose_count >= dl->length) { +#if xMDBX_DEBUG_SPILLING == 1 /* avoid false failure in debug mode */ + if (likely(txn->tw.dirtyroom + txn->tw.loose_count >= need)) + return MDBX_SUCCESS; +#endif /* xMDBX_DEBUG_SPILLING */ + ERROR("all %zu dirty pages are unspillable since referenced " + "by a cursor(s), use fewer cursors or increase " + "MDBX_opt_txn_dp_limit", + unspillable); + goto done; } -#endif /* !Windows */ - return MDBX_RESULT_FALSE; -} + /* Подзадача: Вытолкнуть часть страниц на диск в соответствии с LRU, + * но при этом учесть важные поправки: + * - лучше 
выталкивать старые large/overflow страницы, так будет освобождено + * больше памяти, а также так как они (в текущем понимании) гораздо реже + * повторно изменяются; + * - при прочих равных лучше выталкивать смежные страницы, так будет + * меньше I/O операций; + * - желательно потратить на это меньше времени чем std::partial_sort_copy; + * + * Решение: + * - Квантуем весь диапазон lru-меток до 256 значений и задействуем один + * проход 8-битного radix-sort. В результате получаем 256 уровней + * "свежести", в том числе значение lru-метки, старее которой страницы + * должны быть выгружены; + * - Двигаемся последовательно в сторону увеличения номеров страниц + * и выталкиваем страницы с lru-меткой старее отсекающего значения, + * пока не вытолкнем достаточно; + * - Встречая страницы смежные с выталкиваемыми для уменьшения кол-ва + * I/O операций выталкиваем и их, если они попадают в первую половину + * между выталкиваемыми и самыми свежими lru-метками; + * - дополнительно при сортировке умышленно старим large/overflow страницы, + * тем самым повышая их шансы на выталкивание. */ -static int osal_check_fs_local(mdbx_filehandle_t handle, int flags) { -#if defined(_WIN32) || defined(_WIN64) - if (mdbx_RunningUnderWine() && !(flags & MDBX_EXCLUSIVE)) - return ERROR_NOT_CAPABLE /* workaround for Wine */; + /* get min/max of LRU-labels */ + uint32_t age_max = 0; + for (size_t i = 1; i <= dl->length; ++i) { + const uint32_t age = dpl_age(txn, i); + age_max = (age_max >= age) ? 
age_max : age; + } - if (GetFileType(handle) != FILE_TYPE_DISK) - return ERROR_FILE_OFFLINE; + VERBOSE("lru-head %u, age-max %u", txn->tw.dirtylru, age_max); - if (mdbx_GetFileInformationByHandleEx) { - FILE_REMOTE_PROTOCOL_INFO RemoteProtocolInfo; - if (mdbx_GetFileInformationByHandleEx(handle, FileRemoteProtocolInfo, - &RemoteProtocolInfo, - sizeof(RemoteProtocolInfo))) { - if ((RemoteProtocolInfo.Flags & REMOTE_PROTOCOL_INFO_FLAG_OFFLINE) && - !(flags & MDBX_RDONLY)) - return ERROR_FILE_OFFLINE; - if (!(RemoteProtocolInfo.Flags & REMOTE_PROTOCOL_INFO_FLAG_LOOPBACK) && - !(flags & MDBX_EXCLUSIVE)) - return ERROR_REMOTE_STORAGE_MEDIA_ERROR; + /* half of 8-bit radix-sort */ + pgno_t radix_entries[256], radix_npages[256]; + memset(&radix_entries, 0, sizeof(radix_entries)); + memset(&radix_npages, 0, sizeof(radix_npages)); + size_t spillable_entries = 0, spillable_npages = 0; + const uint32_t reciprocal = (UINT32_C(255) << 24) / (age_max + 1); + for (size_t i = 1; i <= dl->length; ++i) { + const unsigned prio = spill_prio(txn, i, reciprocal); + size_t *const ptr = ptr_disp(dl->items[i].ptr, -(ptrdiff_t)sizeof(size_t)); + TRACE("page %" PRIaPGNO + ", lru %zu, is_multi %c, npages %u, age %u of %u, prio %u", + dl->items[i].pgno, *ptr, (dl->items[i].npages > 1) ? 
'Y' : 'N', + dpl_npages(dl, i), dpl_age(txn, i), age_max, prio); + if (prio < 256) { + radix_entries[prio] += 1; + spillable_entries += 1; + const pgno_t npages = dpl_npages(dl, i); + radix_npages[prio] += npages; + spillable_npages += npages; } } - if (mdbx_NtFsControlFile) { - NTSTATUS rc; - struct { - WOF_EXTERNAL_INFO wof_info; - union { - WIM_PROVIDER_EXTERNAL_INFO wim_info; - FILE_PROVIDER_EXTERNAL_INFO_V1 file_info; - }; - size_t reserved_for_microsoft_madness[42]; - } GetExternalBacking_OutputBuffer; - IO_STATUS_BLOCK StatusBlock; - rc = mdbx_NtFsControlFile(handle, NULL, NULL, NULL, &StatusBlock, - FSCTL_GET_EXTERNAL_BACKING, NULL, 0, - &GetExternalBacking_OutputBuffer, - sizeof(GetExternalBacking_OutputBuffer)); - if (NT_SUCCESS(rc)) { - if (!(flags & MDBX_EXCLUSIVE)) - return ERROR_REMOTE_STORAGE_MEDIA_ERROR; - } else if (rc != STATUS_OBJECT_NOT_EXTERNALLY_BACKED && - rc != STATUS_INVALID_DEVICE_REQUEST && - rc != STATUS_NOT_SUPPORTED) - return ntstatus2errcode(rc); - } + tASSERT(txn, spillable_npages >= spillable_entries); + pgno_t spilled_entries = 0, spilled_npages = 0; + if (likely(spillable_entries > 0)) { + size_t prio2spill = 0, prio2adjacent = 128, + amount_entries = radix_entries[0], amount_npages = radix_npages[0]; + for (size_t i = 1; i < 256; i++) { + if (amount_entries < need_spill_entries || + amount_npages < need_spill_npages) { + prio2spill = i; + prio2adjacent = i + (257 - i) / 2; + amount_entries += radix_entries[i]; + amount_npages += radix_npages[i]; + } else if (amount_entries + amount_entries < + spillable_entries + need_spill_entries + /* РАВНОЗНАЧНО: amount - need_spill < spillable - amount */ + || amount_npages + amount_npages < + spillable_npages + need_spill_npages) { + prio2adjacent = i; + amount_entries += radix_entries[i]; + amount_npages += radix_npages[i]; + } else + break; + } - if (mdbx_GetVolumeInformationByHandleW && mdbx_GetFinalPathNameByHandleW) { - WCHAR *PathBuffer = osal_malloc(sizeof(WCHAR) * INT16_MAX); - if 
(!PathBuffer) - return MDBX_ENOMEM; + VERBOSE("prio2spill %zu, prio2adjacent %zu, spillable %zu/%zu," + " wanna-spill %zu/%zu, amount %zu/%zu", + prio2spill, prio2adjacent, spillable_entries, spillable_npages, + need_spill_entries, need_spill_npages, amount_entries, + amount_npages); + tASSERT(txn, prio2spill < prio2adjacent && prio2adjacent <= 256); - int rc = MDBX_SUCCESS; - DWORD VolumeSerialNumber, FileSystemFlags; - if (!mdbx_GetVolumeInformationByHandleW(handle, PathBuffer, INT16_MAX, - &VolumeSerialNumber, NULL, - &FileSystemFlags, NULL, 0)) { - rc = (int)GetLastError(); + iov_ctx_t ctx; + rc = iov_init( + txn, &ctx, amount_entries, amount_npages, +#if defined(_WIN32) || defined(_WIN64) + txn->env->ioring.overlapped_fd ? txn->env->ioring.overlapped_fd : +#endif + txn->env->lazy_fd, + true); + if (unlikely(rc != MDBX_SUCCESS)) goto bailout; - } - if ((flags & MDBX_RDONLY) == 0) { - if (FileSystemFlags & - (FILE_SEQUENTIAL_WRITE_ONCE | FILE_READ_ONLY_VOLUME | - FILE_VOLUME_IS_COMPRESSED)) { - rc = ERROR_REMOTE_STORAGE_MEDIA_ERROR; - goto bailout; - } - } + size_t r = 0, w = 0; + pgno_t last = 0; + while (r < dl->length && (spilled_entries < need_spill_entries || + spilled_npages < need_spill_npages)) { + dl->items[++w] = dl->items[++r]; + unsigned prio = spill_prio(txn, w, reciprocal); + if (prio > prio2spill && + (prio >= prio2adjacent || last != dl->items[w].pgno)) + continue; - if (mdbx_GetFinalPathNameByHandleW(handle, PathBuffer, INT16_MAX, - FILE_NAME_NORMALIZED | VOLUME_NAME_NT)) { - if (_wcsnicmp(PathBuffer, L"\\Device\\Mup\\", 12) == 0) { - if (!(flags & MDBX_EXCLUSIVE)) { - rc = ERROR_REMOTE_STORAGE_MEDIA_ERROR; - goto bailout; - } + const size_t e = w; + last = dpl_endpgno(dl, w); + while (--w && dpl_endpgno(dl, w) == dl->items[w + 1].pgno && + spill_prio(txn, w, reciprocal) < prio2adjacent) + ; + + for (size_t i = w; ++i <= e;) { + const unsigned npages = dpl_npages(dl, i); + prio = spill_prio(txn, i, reciprocal); + DEBUG("%sspill[%zu] %u page %" 
PRIaPGNO " (age %d, prio %u)", + (prio > prio2spill) ? "co-" : "", i, npages, dl->items[i].pgno, + dpl_age(txn, i), prio); + tASSERT(txn, prio < 256); + ++spilled_entries; + spilled_npages += npages; + rc = spill_page(txn, &ctx, dl->items[i].ptr, npages); + if (unlikely(rc != MDBX_SUCCESS)) + goto failed; } } - if (F_ISSET(flags, MDBX_RDONLY | MDBX_EXCLUSIVE) && - (FileSystemFlags & FILE_READ_ONLY_VOLUME)) { - /* without-LCK (exclusive readonly) mode for DB on a read-only volume */ + VERBOSE("spilled entries %u, spilled npages %u", spilled_entries, + spilled_npages); + tASSERT(txn, spillable_entries == 0 || spilled_entries > 0); + tASSERT(txn, spilled_npages >= spilled_entries); + + failed: + while (r < dl->length) + dl->items[++w] = dl->items[++r]; + tASSERT(txn, r - w == spilled_entries || rc != MDBX_SUCCESS); + + dl->sorted = dpl_setlen(dl, w); + txn->tw.dirtyroom += spilled_entries; + txn->tw.dirtylist->pages_including_loose -= spilled_npages; + tASSERT(txn, dpl_check(txn)); + + if (!iov_empty(&ctx)) { + tASSERT(txn, rc == MDBX_SUCCESS); + rc = iov_write(&ctx); + } + if (unlikely(rc != MDBX_SUCCESS)) goto bailout; + + txn->env->lck->unsynced_pages.weak += spilled_npages; + pnl_sort(txn->tw.spilled.list, (size_t)txn->geo.first_unallocated << 1); + txn->flags |= MDBX_TXN_SPILLS; + NOTICE("spilled %u dirty-entries, %u dirty-npages, now have %zu dirty-room", + spilled_entries, spilled_npages, txn->tw.dirtyroom); + } else { + tASSERT(txn, rc == MDBX_SUCCESS); + for (size_t i = 1; i <= dl->length; ++i) { + page_t *dp = dl->items[i].ptr; + VERBOSE( + "unspillable[%zu]: pgno %u, npages %u, flags 0x%04X, age %u, prio %u", + i, dp->pgno, dpl_npages(dl, i), dp->flags, dpl_age(txn, i), + spill_prio(txn, i, reciprocal)); } + } - if (mdbx_GetFinalPathNameByHandleW(handle, PathBuffer, INT16_MAX, - FILE_NAME_NORMALIZED | - VOLUME_NAME_DOS)) { - UINT DriveType = GetDriveTypeW(PathBuffer); - if (DriveType == DRIVE_NO_ROOT_DIR && - _wcsnicmp(PathBuffer, L"\\\\?\\", 4) == 0 && - 
_wcsnicmp(PathBuffer + 5, L":\\", 2) == 0) { - PathBuffer[7] = 0; - DriveType = GetDriveTypeW(PathBuffer + 4); - } - switch (DriveType) { - case DRIVE_CDROM: - if (flags & MDBX_RDONLY) - break; - // fall through - case DRIVE_UNKNOWN: - case DRIVE_NO_ROOT_DIR: - case DRIVE_REMOTE: - default: - if (!(flags & MDBX_EXCLUSIVE)) - rc = ERROR_REMOTE_STORAGE_MEDIA_ERROR; - // fall through - case DRIVE_REMOVABLE: - case DRIVE_FIXED: - case DRIVE_RAMDISK: - break; - } +#if xMDBX_DEBUG_SPILLING == 2 + if (txn->tw.loose_count + txn->tw.dirtyroom <= need / 2 + 1) + ERROR("dirty-list length: before %zu, after %zu, parent %zi, loose %zu; " + "needed %zu, spillable %zu; " + "spilled %u dirty-entries, now have %zu dirty-room", + dl->length + spilled_entries, dl->length, + (txn->parent && txn->parent->tw.dirtylist) + ? (intptr_t)txn->parent->tw.dirtylist->length + : -1, + txn->tw.loose_count, need, spillable_entries, spilled_entries, + txn->tw.dirtyroom); + ENSURE(txn->env, txn->tw.loose_count + txn->tw.dirtyroom > need / 2); +#endif /* xMDBX_DEBUG_SPILLING */ + +done: + return likely(txn->tw.dirtyroom + txn->tw.loose_count > + ((need > CURSOR_STACK_SIZE) ? CURSOR_STACK_SIZE : need)) + ? 
MDBX_SUCCESS + : MDBX_TXN_FULL; +} +/// \copyright SPDX-License-Identifier: Apache-2.0 +/// \author Леонид Юрьев aka Leonid Yuriev \date 2015-2024 + + +int sdb_setup(const MDBX_env *env, kvx_t *const kvx, const tree_t *const db) { + if (unlikely(!check_sdb_flags(db->flags))) { + ERROR("incompatible or invalid db.flags (0x%x) ", db->flags); + return MDBX_INCOMPATIBLE; + } + if (unlikely(!kvx->clc.k.cmp)) { + kvx->clc.k.cmp = builtin_keycmp(db->flags); + kvx->clc.v.cmp = builtin_datacmp(db->flags); + } + + kvx->clc.k.lmin = keysize_min(db->flags); + kvx->clc.k.lmax = env_keysize_max(env, db->flags); + kvx->clc.v.lmin = valsize_min(db->flags); + kvx->clc.v.lmax = env_valsize_max(env, db->flags); + + if ((db->flags & (MDBX_DUPFIXED | MDBX_INTEGERDUP)) != 0 && db->dupfix_size) { + if (!MDBX_DISABLE_VALIDATION && + unlikely(db->dupfix_size < kvx->clc.v.lmin || + db->dupfix_size > kvx->clc.v.lmax)) { + ERROR("db.dupfix_size (%u) <> min/max value-length (%zu/%zu)", + db->dupfix_size, kvx->clc.v.lmin, kvx->clc.v.lmax); + return MDBX_CORRUPTED; } + kvx->clc.v.lmin = kvx->clc.v.lmax = db->dupfix_size; + } + return MDBX_SUCCESS; +} +int sdb_fetch(MDBX_txn *txn, size_t dbi) { + cursor_couple_t couple; + int rc = cursor_init(&couple.outer, txn, MAIN_DBI); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; + + kvx_t *const kvx = &txn->env->kvs[dbi]; + rc = tree_search(&couple.outer, &kvx->name, 0); + if (unlikely(rc != MDBX_SUCCESS)) { bailout: - osal_free(PathBuffer); + NOTICE("dbi %zu refs to inaccessible subDB `%*s` for txn %" PRIaTXN + " (err %d)", + dbi, (int)kvx->name.iov_len, (const char *)kvx->name.iov_base, + txn->txnid, rc); + return (rc == MDBX_NOTFOUND) ? 
MDBX_BAD_DBI : rc; + } + + MDBX_val data; + struct node_search_result nsr = node_search(&couple.outer, &kvx->name); + if (unlikely(!nsr.exact)) { + rc = MDBX_NOTFOUND; + goto bailout; + } + if (unlikely((node_flags(nsr.node) & (N_DUPDATA | N_SUBDATA)) != N_SUBDATA)) { + NOTICE("dbi %zu refs to not a named subDB `%*s` for txn %" PRIaTXN " (%s)", + dbi, (int)kvx->name.iov_len, (const char *)kvx->name.iov_base, + txn->txnid, "wrong flags"); + return MDBX_INCOMPATIBLE; /* not a named DB */ + } + + rc = node_read(&couple.outer, nsr.node, &data, + couple.outer.pg[couple.outer.top]); + if (unlikely(rc != MDBX_SUCCESS)) return rc; + + if (unlikely(data.iov_len != sizeof(tree_t))) { + NOTICE("dbi %zu refs to not a named subDB `%*s` for txn %" PRIaTXN " (%s)", + dbi, (int)kvx->name.iov_len, (const char *)kvx->name.iov_base, + txn->txnid, "wrong rec-size"); + return MDBX_INCOMPATIBLE; /* not a named DB */ } -#else + uint16_t flags = UNALIGNED_PEEK_16(data.iov_base, tree_t, flags); + /* The txn may not know this DBI, or another process may + * have dropped and recreated the DB with other flags. 
*/ + tree_t *const db = &txn->dbs[dbi]; + if (unlikely((db->flags & DB_PERSISTENT_FLAGS) != flags)) { + NOTICE("dbi %zu refs to the re-created subDB `%*s` for txn %" PRIaTXN + " with different flags (present 0x%X != wanna 0x%X)", + dbi, (int)kvx->name.iov_len, (const char *)kvx->name.iov_base, + txn->txnid, db->flags & DB_PERSISTENT_FLAGS, flags); + return MDBX_INCOMPATIBLE; + } - struct statvfs statvfs_info; - if (fstatvfs(handle, &statvfs_info)) - return errno; -#if defined(ST_LOCAL) || defined(ST_EXPORTED) - const unsigned long st_flags = statvfs_info.f_flag; -#endif /* ST_LOCAL || ST_EXPORTED */ + memcpy(db, data.iov_base, sizeof(tree_t)); +#if !MDBX_DISABLE_VALIDATION + const txnid_t pp_txnid = couple.outer.pg[couple.outer.top]->txnid; + tASSERT(txn, txn->front_txnid >= pp_txnid); + if (unlikely(db->mod_txnid > pp_txnid)) { + ERROR("db.mod_txnid (%" PRIaTXN ") > page-txnid (%" PRIaTXN ")", + db->mod_txnid, pp_txnid); + return MDBX_CORRUPTED; + } +#endif /* !MDBX_DISABLE_VALIDATION */ + rc = sdb_setup(txn->env, kvx, db); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; -#if defined(__NetBSD__) - const unsigned type = 0; - const char *const name = statvfs_info.f_fstypename; - const size_t name_len = VFS_NAMELEN; -#elif defined(_AIX) || defined(__OS400__) - const char *const name = statvfs_info.f_basetype; - const size_t name_len = sizeof(statvfs_info.f_basetype); - struct stat st; - if (fstat(handle, &st)) - return errno; - const unsigned type = st.st_vfstype; - if ((st.st_flag & FS_REMOTE) != 0 && !(flags & MDBX_EXCLUSIVE)) - return MDBX_EREMOTE; -#elif defined(FSTYPSZ) || defined(_FSTYPSZ) - const unsigned type = 0; - const char *const name = statvfs_info.f_basetype; - const size_t name_len = sizeof(statvfs_info.f_basetype); -#elif defined(__sun) || defined(__SVR4) || defined(__svr4__) || \ - defined(ST_FSTYPSZ) || defined(_ST_FSTYPSZ) - const unsigned type = 0; - struct stat st; - if (fstat(handle, &st)) - return errno; - const char *const name = 
st.st_fstype; - const size_t name_len = strlen(name); -#else - struct statfs statfs_info; - if (fstatfs(handle, &statfs_info)) - return errno; -#if defined(__OpenBSD__) - const unsigned type = 0; -#else - const unsigned type = statfs_info.f_type; -#endif -#if defined(MNT_LOCAL) || defined(MNT_EXPORTED) - const unsigned long mnt_flags = statfs_info.f_flags; -#endif /* MNT_LOCAL || MNT_EXPORTED */ -#if defined(__FreeBSD__) || defined(__NetBSD__) || defined(__OpenBSD__) || \ - defined(__BSD__) || defined(__bsdi__) || defined(__DragonFly__) || \ - defined(__APPLE__) || defined(__MACH__) || defined(MFSNAMELEN) || \ - defined(MFSTYPENAMELEN) || defined(VFS_NAMELEN) - const char *const name = statfs_info.f_fstypename; - const size_t name_len = sizeof(statfs_info.f_fstypename); -#elif defined(__ANDROID_API__) && __ANDROID_API__ < 21 - const char *const name = ""; - const unsigned name_len = 0; -#else + txn->dbi_state[dbi] &= ~DBI_STALE; + return MDBX_SUCCESS; +} +/// \copyright SPDX-License-Identifier: Apache-2.0 +/// \author Леонид Юрьев aka Leonid Yuriev \date 2015-2024 - const char *name = ""; - unsigned name_len = 0; - struct stat st; - if (fstat(handle, &st)) - return errno; +typedef struct rthc_entry { + MDBX_env *env; +} rthc_entry_t; - char pathbuf[PATH_MAX]; - FILE *mounted = nullptr; -#if defined(__linux__) || defined(__gnu_linux__) - mounted = setmntent("/proc/mounts", "r"); -#endif /* Linux */ - if (!mounted) - mounted = setmntent("/etc/mtab", "r"); - if (mounted) { - const struct mntent *ent; -#if defined(_BSD_SOURCE) || defined(_SVID_SOURCE) || defined(__BIONIC__) || \ - (defined(_DEFAULT_SOURCE) && __GLIBC_PREREQ(2, 19)) - struct mntent entbuf; - const bool should_copy = false; - while (nullptr != - (ent = getmntent_r(mounted, &entbuf, pathbuf, sizeof(pathbuf)))) +#if MDBX_DEBUG +#define RTHC_INITIAL_LIMIT 1 #else - const bool should_copy = true; - while (nullptr != (ent = getmntent(mounted))) +#define RTHC_INITIAL_LIMIT 16 #endif - { - struct stat mnt; - if 
(!stat(ent->mnt_dir, &mnt) && mnt.st_dev == st.st_dev) { - if (should_copy) { - name = - strncpy(pathbuf, ent->mnt_fsname, name_len = sizeof(pathbuf) - 1); - pathbuf[name_len] = 0; - } else { - name = ent->mnt_fsname; - name_len = strlen(name); - } - break; + +static unsigned rthc_count, rthc_limit = RTHC_INITIAL_LIMIT; +static rthc_entry_t rthc_table_static[RTHC_INITIAL_LIMIT]; +static rthc_entry_t *rthc_table = rthc_table_static; + +static int uniq_peek(const osal_mmap_t *pending, osal_mmap_t *scan) { + int rc; + uint64_t bait; + lck_t *const pending_lck = pending->lck; + lck_t *const scan_lck = scan->lck; + if (pending_lck) { + bait = atomic_load64(&pending_lck->bait_uniqueness, mo_AcquireRelease); + rc = MDBX_SUCCESS; + } else { + bait = 0 /* hush MSVC warning */; + rc = osal_msync(scan, 0, sizeof(lck_t), MDBX_SYNC_DATA); + if (rc == MDBX_SUCCESS) + rc = osal_pread(pending->fd, &bait, sizeof(scan_lck->bait_uniqueness), + offsetof(lck_t, bait_uniqueness)); + } + if (likely(rc == MDBX_SUCCESS) && + bait == atomic_load64(&scan_lck->bait_uniqueness, mo_AcquireRelease)) + rc = MDBX_RESULT_TRUE; + + TRACE("uniq-peek: %s, bait 0x%016" PRIx64 ",%s rc %d", + pending_lck ? "mem" : "file", bait, + (rc == MDBX_RESULT_TRUE) ? " found," : (rc ? " FAILED," : ""), rc); + return rc; +} + +static int uniq_poke(const osal_mmap_t *pending, osal_mmap_t *scan, + uint64_t *abra) { + if (*abra == 0) { + const uintptr_t tid = osal_thread_self(); + uintptr_t uit = 0; + memcpy(&uit, &tid, (sizeof(tid) < sizeof(uit)) ? 
sizeof(tid) : sizeof(uit)); + *abra = rrxmrrxmsx_0(osal_monotime() + UINT64_C(5873865991930747) * uit); + } + const uint64_t cadabra = + rrxmrrxmsx_0(*abra + UINT64_C(7680760450171793) * (unsigned)osal_getpid()) + << 24 | + *abra >> 40; + lck_t *const scan_lck = scan->lck; + atomic_store64(&scan_lck->bait_uniqueness, cadabra, mo_AcquireRelease); + *abra = *abra * UINT64_C(6364136223846793005) + 1; + return uniq_peek(pending, scan); +} + +__cold int rthc_uniq_check(const osal_mmap_t *pending, MDBX_env **found) { + *found = nullptr; + uint64_t salt = 0; + for (size_t i = 0; i < rthc_count; ++i) { + MDBX_env *const scan = rthc_table[i].env; + if (!scan->lck_mmap.lck || &scan->lck_mmap == pending) + continue; + int err = + atomic_load64(&scan->lck_mmap.lck->bait_uniqueness, mo_AcquireRelease) + ? uniq_peek(pending, &scan->lck_mmap) + : uniq_poke(pending, &scan->lck_mmap, &salt); + if (err == MDBX_ENODATA) { + uint64_t length = 0; + if (likely(osal_filesize(pending->fd, &length) == MDBX_SUCCESS && + length == 0)) { + /* LY: skip checking since LCK-file is empty, i.e. just created. 
*/ + DEBUG("%s", "unique (new/empty lck)"); + return MDBX_SUCCESS; } } - endmntent(mounted); + if (err == MDBX_RESULT_TRUE) + err = uniq_poke(pending, &scan->lck_mmap, &salt); + if (err == MDBX_RESULT_TRUE) { + (void)osal_msync(&scan->lck_mmap, 0, sizeof(lck_t), MDBX_SYNC_KICK); + err = uniq_poke(pending, &scan->lck_mmap, &salt); + } + if (err == MDBX_RESULT_TRUE) { + err = uniq_poke(pending, &scan->lck_mmap, &salt); + *found = scan; + DEBUG("found %p", __Wpedantic_format_voidptr(*found)); + return MDBX_SUCCESS; + } + if (unlikely(err != MDBX_SUCCESS)) { + DEBUG("failed rc %d", err); + return err; + } } -#endif /* !xBSD && !Android/Bionic */ -#endif - if (name_len) { - if (((name_len > 2 && strncasecmp("nfs", name, 3) == 0) || - strncasecmp("cifs", name, name_len) == 0 || - strncasecmp("ncpfs", name, name_len) == 0 || - strncasecmp("smbfs", name, name_len) == 0 || - strcasecmp("9P" /* WSL2 */, name) == 0 || - ((name_len > 3 && strncasecmp("fuse", name, 4) == 0) && - strncasecmp("fuseblk", name, name_len) != 0)) && - !(flags & MDBX_EXCLUSIVE)) - return MDBX_EREMOTE; - if (strcasecmp("ftp", name) == 0 || strcasecmp("http", name) == 0 || - strcasecmp("sshfs", name) == 0) - return MDBX_EREMOTE; - } + DEBUG("%s", "unique"); + return MDBX_SUCCESS; +} -#ifdef ST_LOCAL - if ((st_flags & ST_LOCAL) == 0 && !(flags & MDBX_EXCLUSIVE)) - return MDBX_EREMOTE; -#elif defined(MNT_LOCAL) - if ((mnt_flags & MNT_LOCAL) == 0 && !(flags & MDBX_EXCLUSIVE)) - return MDBX_EREMOTE; -#endif /* ST/MNT_LOCAL */ +//------------------------------------------------------------------------------ -#ifdef ST_EXPORTED - if ((st_flags & ST_EXPORTED) != 0 && !(flags & MDBX_RDONLY)) - return MDBX_EREMOTE; -#elif defined(MNT_EXPORTED) - if ((mnt_flags & MNT_EXPORTED) != 0 && !(flags & MDBX_RDONLY)) - return MDBX_EREMOTE; -#endif /* ST/MNT_EXPORTED */ +#if defined(_WIN32) || defined(_WIN64) +static CRITICAL_SECTION rthc_critical_section; +#else + +static pthread_mutex_t rthc_mutex = 
PTHREAD_MUTEX_INITIALIZER; +static pthread_cond_t rthc_cond = PTHREAD_COND_INITIALIZER; +static osal_thread_key_t rthc_key; +static mdbx_atomic_uint32_t rthc_pending; + +static inline uint64_t rthc_signature(const void *addr, uint8_t kind) { + uint64_t salt = osal_thread_self() * UINT64_C(0xA2F0EEC059629A17) ^ + UINT64_C(0x01E07C6FDB596497) * (uintptr_t)(addr); +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + return salt << 8 | kind; +#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ + return (uint64_t)kind << 56 | salt >> 8; +#else +#error "FIXME: Unsupported byte order" +#endif /* __BYTE_ORDER__ */ +} - switch (type) { - case 0xFF534D42 /* CIFS_MAGIC_NUMBER */: - case 0x6969 /* NFS_SUPER_MAGIC */: - case 0x564c /* NCP_SUPER_MAGIC */: - case 0x517B /* SMB_SUPER_MAGIC */: -#if defined(__digital__) || defined(__osf__) || defined(__osf) - case 0x0E /* Tru64 NFS */: +#define MDBX_THREAD_RTHC_REGISTERED(addr) rthc_signature(addr, 0x0D) +#define MDBX_THREAD_RTHC_COUNTED(addr) rthc_signature(addr, 0xC0) +static __thread uint64_t rthc_thread_state +#if __has_attribute(tls_model) && \ + (defined(__PIC__) || defined(__pic__) || MDBX_BUILD_SHARED_LIBRARY) + __attribute__((tls_model("local-dynamic"))) #endif -#ifdef ST_FST_NFS - case ST_FST_NFS: + ; + +#if defined(__APPLE__) && defined(__SANITIZE_ADDRESS__) && \ + !defined(MDBX_ATTRIBUTE_NO_SANITIZE_ADDRESS) +/* Avoid ASAN-trap due the target TLS-variable feed by Darwin's tlv_free() */ +#define MDBX_ATTRIBUTE_NO_SANITIZE_ADDRESS \ + __attribute__((__no_sanitize_address__, __noinline__)) +#else +#define MDBX_ATTRIBUTE_NO_SANITIZE_ADDRESS inline #endif - if ((flags & MDBX_EXCLUSIVE) == 0) - return MDBX_EREMOTE; - case 0: - default: - break; - } -#endif /* Unix */ - return MDBX_SUCCESS; +MDBX_ATTRIBUTE_NO_SANITIZE_ADDRESS static uint64_t rthc_read(const void *rthc) { + return *(volatile uint64_t *)rthc; } -static int check_mmap_limit(const size_t limit) { - const bool should_check = -#if defined(__SANITIZE_ADDRESS__) - true; 
+MDBX_ATTRIBUTE_NO_SANITIZE_ADDRESS static uint64_t +rthc_compare_and_clean(const void *rthc, const uint64_t signature) { +#if MDBX_64BIT_CAS + return atomic_cas64((mdbx_atomic_uint64_t *)rthc, signature, 0); +#elif __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + return atomic_cas32((mdbx_atomic_uint32_t *)rthc, (uint32_t)signature, 0); +#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ + return atomic_cas32((mdbx_atomic_uint32_t *)rthc, (uint32_t)(signature >> 32), + 0); #else - RUNNING_ON_VALGRIND; -#endif /* __SANITIZE_ADDRESS__ */ +#error "FIXME: Unsupported byte order" +#endif +} - if (should_check) { - intptr_t pagesize, total_ram_pages, avail_ram_pages; - int err = - mdbx_get_sysraminfo(&pagesize, &total_ram_pages, &avail_ram_pages); - if (unlikely(err != MDBX_SUCCESS)) - return err; +static inline int rthc_atexit(void (*dtor)(void *), void *obj, + void *dso_symbol) { +#ifndef MDBX_HAVE_CXA_THREAD_ATEXIT_IMPL +#if defined(LIBCXXABI_HAS_CXA_THREAD_ATEXIT_IMPL) || \ + defined(HAVE___CXA_THREAD_ATEXIT_IMPL) || __GLIBC_PREREQ(2, 18) || \ + defined(BIONIC) +#define MDBX_HAVE_CXA_THREAD_ATEXIT_IMPL 1 +#else +#define MDBX_HAVE_CXA_THREAD_ATEXIT_IMPL 0 +#endif +#endif /* MDBX_HAVE_CXA_THREAD_ATEXIT_IMPL */ - const int log2page = log2n_powerof2(pagesize); - if ((limit >> (log2page + 7)) > (size_t)total_ram_pages || - (limit >> (log2page + 6)) > (size_t)avail_ram_pages) { - ERROR("%s (%zu pages) is too large for available (%zu pages) or total " - "(%zu pages) system RAM", - "database upper size limit", limit >> log2page, avail_ram_pages, - total_ram_pages); - return MDBX_TOO_LARGE; - } +#ifndef MDBX_HAVE_CXA_THREAD_ATEXIT +#if defined(LIBCXXABI_HAS_CXA_THREAD_ATEXIT) || \ + defined(HAVE___CXA_THREAD_ATEXIT) +#define MDBX_HAVE_CXA_THREAD_ATEXIT 1 +#elif !MDBX_HAVE_CXA_THREAD_ATEXIT_IMPL && \ + (defined(__linux__) || defined(__gnu_linux__)) +#define MDBX_HAVE_CXA_THREAD_ATEXIT 1 +#else +#define MDBX_HAVE_CXA_THREAD_ATEXIT 0 +#endif +#endif /* MDBX_HAVE_CXA_THREAD_ATEXIT */ + + 
int rc = MDBX_ENOSYS; +#if MDBX_HAVE_CXA_THREAD_ATEXIT_IMPL && !MDBX_HAVE_CXA_THREAD_ATEXIT +#define __cxa_thread_atexit __cxa_thread_atexit_impl +#endif +#if MDBX_HAVE_CXA_THREAD_ATEXIT || defined(__cxa_thread_atexit) + extern int __cxa_thread_atexit(void (*dtor)(void *), void *obj, + void *dso_symbol) MDBX_WEAK_IMPORT_ATTRIBUTE; + if (&__cxa_thread_atexit) + rc = __cxa_thread_atexit(dtor, obj, dso_symbol); +#elif defined(__APPLE__) || defined(_DARWIN_C_SOURCE) + extern void _tlv_atexit(void (*termfunc)(void *objAddr), void *objAddr) + MDBX_WEAK_IMPORT_ATTRIBUTE; + if (&_tlv_atexit) { + (void)dso_symbol; + _tlv_atexit(dtor, obj); + rc = 0; } +#else + (void)dtor; + (void)obj; + (void)dso_symbol; +#endif + return rc; +} - return MDBX_SUCCESS; +__cold void workaround_glibc_bug21031(void) { + /* Workaround for https://sourceware.org/bugzilla/show_bug.cgi?id=21031 + * + * Due race between pthread_key_delete() and __nptl_deallocate_tsd() + * The destructor(s) of thread-local-storage object(s) may be running + * in another thread(s) and be blocked or not finished yet. + * In such case we get a SEGFAULT after unload this library DSO. + * + * So just by yielding a few timeslices we give a chance + * to such destructor(s) for completion and avoids segfault. 
*/ + sched_yield(); + sched_yield(); + sched_yield(); } +#endif /* !Windows */ -MDBX_INTERNAL_FUNC int osal_mmap(const int flags, osal_mmap_t *map, size_t size, - const size_t limit, const unsigned options) { - assert(size <= limit); - map->limit = 0; - map->current = 0; - map->base = nullptr; - map->filesize = 0; +void rthc_lock(void) { #if defined(_WIN32) || defined(_WIN64) - map->section = NULL; -#endif /* Windows */ + EnterCriticalSection(&rthc_critical_section); +#else + ENSURE(nullptr, osal_pthread_mutex_lock(&rthc_mutex) == 0); +#endif +} - int err = osal_check_fs_local(map->fd, flags); - if (unlikely(err != MDBX_SUCCESS)) - return err; +void rthc_unlock(void) { +#if defined(_WIN32) || defined(_WIN64) + LeaveCriticalSection(&rthc_critical_section); +#else + ENSURE(nullptr, pthread_mutex_unlock(&rthc_mutex) == 0); +#endif +} - err = check_mmap_limit(limit); - if (unlikely(err != MDBX_SUCCESS)) - return err; +static inline int thread_key_create(osal_thread_key_t *key) { + int rc; +#if defined(_WIN32) || defined(_WIN64) + *key = TlsAlloc(); + rc = (*key != TLS_OUT_OF_INDEXES) ? 
MDBX_SUCCESS : GetLastError(); +#else + rc = pthread_key_create(key, nullptr); +#endif + TRACE("&key = %p, value %" PRIuPTR ", rc %d", __Wpedantic_format_voidptr(key), + (uintptr_t)*key, rc); + return rc; +} - if ((flags & MDBX_RDONLY) == 0 && (options & MMAP_OPTION_TRUNCATE) != 0) { - err = osal_ftruncate(map->fd, size); - VERBOSE("ftruncate %zu, err %d", size, err); - if (err != MDBX_SUCCESS) - return err; - map->filesize = size; -#if !(defined(_WIN32) || defined(_WIN64)) - map->current = size; -#endif /* !Windows */ - } else { - err = osal_filesize(map->fd, &map->filesize); - VERBOSE("filesize %" PRIu64 ", err %d", map->filesize, err); - if (err != MDBX_SUCCESS) - return err; +void thread_rthc_set(osal_thread_key_t key, const void *value) { #if defined(_WIN32) || defined(_WIN64) - if (map->filesize < size) { - WARNING("file size (%zu) less than requested for mapping (%zu)", - (size_t)map->filesize, size); - size = (size_t)map->filesize; - } + ENSURE(nullptr, TlsSetValue(key, (void *)value)); #else - map->current = (map->filesize > limit) ? 
limit : (size_t)map->filesize; -#endif /* !Windows */ + const uint64_t sign_registered = + MDBX_THREAD_RTHC_REGISTERED(&rthc_thread_state); + const uint64_t sign_counted = MDBX_THREAD_RTHC_COUNTED(&rthc_thread_state); + if (value && unlikely(rthc_thread_state != sign_registered && + rthc_thread_state != sign_counted)) { + rthc_thread_state = sign_registered; + TRACE("thread registered 0x%" PRIxPTR, osal_thread_self()); + if (rthc_atexit(rthc_thread_dtor, &rthc_thread_state, + (void *)&mdbx_version /* dso_anchor */)) { + ENSURE(nullptr, pthread_setspecific(rthc_key, &rthc_thread_state) == 0); + rthc_thread_state = sign_counted; + const unsigned count_before = atomic_add32(&rthc_pending, 1); + ENSURE(nullptr, count_before < INT_MAX); + NOTICE("fallback to pthreads' tsd, key %" PRIuPTR ", count %u", + (uintptr_t)rthc_key, count_before); + (void)count_before; + } } + ENSURE(nullptr, pthread_setspecific(key, value) == 0); +#endif +} +/* dtor called for thread, i.e. for all mdbx's environment objects */ +__cold void rthc_thread_dtor(void *rthc) { + rthc_lock(); + const uint32_t current_pid = osal_getpid(); #if defined(_WIN32) || defined(_WIN64) - LARGE_INTEGER SectionSize; - SectionSize.QuadPart = size; - err = NtCreateSection( - &map->section, - /* DesiredAccess */ - (flags & MDBX_WRITEMAP) - ? SECTION_QUERY | SECTION_MAP_READ | SECTION_EXTEND_SIZE | - SECTION_MAP_WRITE - : SECTION_QUERY | SECTION_MAP_READ | SECTION_EXTEND_SIZE, - /* ObjectAttributes */ NULL, /* MaximumSize (InitialSize) */ &SectionSize, - /* SectionPageProtection */ - (flags & MDBX_RDONLY) ? PAGE_READONLY : PAGE_READWRITE, - /* AllocationAttributes */ SEC_RESERVE, map->fd); - if (!NT_SUCCESS(err)) - return ntstatus2errcode(err); + TRACE(">> pid %d, thread 0x%" PRIxPTR ", module %p", current_pid, + osal_thread_self(), rthc); +#else + TRACE(">> pid %d, thread 0x%" PRIxPTR ", rthc %p", current_pid, + osal_thread_self(), rthc); +#endif - SIZE_T ViewSize = (flags & MDBX_RDONLY) ? 
0 - : mdbx_RunningUnderWine() ? size - : limit; - err = NtMapViewOfSection( - map->section, GetCurrentProcess(), &map->base, - /* ZeroBits */ 0, - /* CommitSize */ 0, - /* SectionOffset */ NULL, &ViewSize, - /* InheritDisposition */ ViewUnmap, - /* AllocationType */ (flags & MDBX_RDONLY) ? 0 : MEM_RESERVE, - /* Win32Protect */ - (flags & MDBX_WRITEMAP) ? PAGE_READWRITE : PAGE_READONLY); - if (!NT_SUCCESS(err)) { - NtClose(map->section); - map->section = 0; - map->base = nullptr; - return ntstatus2errcode(err); + for (size_t i = 0; i < rthc_count; ++i) { + MDBX_env *const env = rthc_table[i].env; + if (env->pid != current_pid) + continue; + if (!(env->flags & ENV_TXKEY)) + continue; + reader_slot_t *const reader = thread_rthc_get(env->me_txkey); + reader_slot_t *const begin = &env->lck_mmap.lck->rdt[0]; + reader_slot_t *const end = &env->lck_mmap.lck->rdt[env->max_readers]; + if (reader < begin || reader >= end) + continue; +#if !defined(_WIN32) && !defined(_WIN64) + if (pthread_setspecific(env->me_txkey, nullptr) != 0) { + TRACE("== thread 0x%" PRIxPTR + ", rthc %p: ignore race with tsd-key deletion", + osal_thread_self(), __Wpedantic_format_voidptr(reader)); + continue /* ignore race with tsd-key deletion by mdbx_env_close() */; + } +#endif + + TRACE("== thread 0x%" PRIxPTR + ", rthc %p, [%zi], %p ... 
%p (%+i), rtch-pid %i, " + "current-pid %i", + osal_thread_self(), __Wpedantic_format_voidptr(reader), i, + __Wpedantic_format_voidptr(begin), __Wpedantic_format_voidptr(end), + (int)(reader - begin), reader->pid.weak, current_pid); + if (atomic_load32(&reader->pid, mo_Relaxed) == current_pid) { + TRACE("==== thread 0x%" PRIxPTR ", rthc %p, cleanup", osal_thread_self(), + __Wpedantic_format_voidptr(reader)); + (void)atomic_cas32(&reader->pid, current_pid, 0); + atomic_store32(&env->lck->rdt_refresh_flag, true, mo_Relaxed); + } } - assert(map->base != MAP_FAILED); - map->current = (size_t)SectionSize.QuadPart; - map->limit = ViewSize; +#if defined(_WIN32) || defined(_WIN64) + TRACE("<< thread 0x%" PRIxPTR ", module %p", osal_thread_self(), rthc); + rthc_unlock(); +#else + const uint64_t sign_registered = MDBX_THREAD_RTHC_REGISTERED(rthc); + const uint64_t sign_counted = MDBX_THREAD_RTHC_COUNTED(rthc); + const uint64_t state = rthc_read(rthc); + if (state == sign_registered && + rthc_compare_and_clean(rthc, sign_registered)) { + TRACE("== thread 0x%" PRIxPTR + ", rthc %p, pid %d, self-status %s (0x%08" PRIx64 ")", + osal_thread_self(), rthc, osal_getpid(), "registered", state); + } else if (state == sign_counted && + rthc_compare_and_clean(rthc, sign_counted)) { + TRACE("== thread 0x%" PRIxPTR + ", rthc %p, pid %d, self-status %s (0x%08" PRIx64 ")", + osal_thread_self(), rthc, osal_getpid(), "counted", state); + ENSURE(nullptr, atomic_sub32(&rthc_pending, 1) > 0); + } else { + WARNING("thread 0x%" PRIxPTR + ", rthc %p, pid %d, self-status %s (0x%08" PRIx64 ")", + osal_thread_self(), rthc, osal_getpid(), "wrong", state); + } -#else /* Windows */ + if (atomic_load32(&rthc_pending, mo_AcquireRelease) == 0) { + TRACE("== thread 0x%" PRIxPTR ", rthc %p, pid %d, wake", osal_thread_self(), + rthc, osal_getpid()); + ENSURE(nullptr, pthread_cond_broadcast(&rthc_cond) == 0); + } -#ifndef MAP_TRYFIXED -#define MAP_TRYFIXED 0 + TRACE("<< thread 0x%" PRIxPTR ", rthc %p", 
osal_thread_self(), rthc); + /* Allow tail call optimization, i.e. gcc should generate the jmp instruction + * instead of a call for pthread_mutex_unlock() and therefore CPU could not + * return to current DSO's code section, which may be unloaded immediately + * after the mutex got released. */ + pthread_mutex_unlock(&rthc_mutex); #endif +} -#ifndef MAP_HASSEMAPHORE -#define MAP_HASSEMAPHORE 0 -#endif +__cold int rthc_register(MDBX_env *const env) { + TRACE(">> env %p, rthc_count %u, rthc_limit %u", + __Wpedantic_format_voidptr(env), rthc_count, rthc_limit); -#ifndef MAP_CONCEAL -#define MAP_CONCEAL 0 -#endif + int rc = MDBX_SUCCESS; + for (size_t i = 0; i < rthc_count; ++i) + if (unlikely(rthc_table[i].env == env)) { + rc = MDBX_PANIC; + goto bailout; + } -#ifndef MAP_NOSYNC -#define MAP_NOSYNC 0 -#endif + env->me_txkey = 0; + if (unlikely(rthc_count == rthc_limit)) { + rthc_entry_t *new_table = + osal_realloc((rthc_table == rthc_table_static) ? nullptr : rthc_table, + sizeof(rthc_entry_t) * rthc_limit * 2); + if (unlikely(new_table == nullptr)) { + rc = MDBX_ENOMEM; + goto bailout; + } + if (rthc_table == rthc_table_static) + memcpy(new_table, rthc_table, sizeof(rthc_entry_t) * rthc_limit); + rthc_table = new_table; + rthc_limit *= 2; + } -#ifndef MAP_FIXED_NOREPLACE -#define MAP_FIXED_NOREPLACE 0 -#endif + if ((env->flags & MDBX_NOSTICKYTHREADS) == 0) { + rc = thread_key_create(&env->me_txkey); + if (unlikely(rc != MDBX_SUCCESS)) + goto bailout; + env->flags |= ENV_TXKEY; + } -#ifndef MAP_NORESERVE -#define MAP_NORESERVE 0 -#endif + rthc_table[rthc_count].env = env; + TRACE("== [%i] = env %p, key %" PRIuPTR, rthc_count, + __Wpedantic_format_voidptr(env), (uintptr_t)env->me_txkey); + ++rthc_count; - map->base = mmap( - NULL, limit, (flags & MDBX_WRITEMAP) ? PROT_READ | PROT_WRITE : PROT_READ, - MAP_SHARED | MAP_FILE | MAP_NORESERVE | - (F_ISSET(flags, MDBX_UTTERLY_NOSYNC) ? MAP_NOSYNC : 0) | - ((options & MMAP_OPTION_SEMAPHORE) ? 
MAP_HASSEMAPHORE | MAP_NOSYNC - : MAP_CONCEAL), - map->fd, 0); +bailout: + TRACE("<< env %p, key %" PRIuPTR ", rthc_count %u, rthc_limit %u, rc %d", + __Wpedantic_format_voidptr(env), (uintptr_t)env->me_txkey, rthc_count, + rthc_limit, rc); + return rc; +} - if (unlikely(map->base == MAP_FAILED)) { - map->limit = 0; - map->current = 0; - map->base = nullptr; - assert(errno != 0); - return errno; +__cold static int rthc_drown(MDBX_env *const env) { + const uint32_t current_pid = osal_getpid(); + int rc = MDBX_SUCCESS; + MDBX_env *inprocess_neighbor = nullptr; + if (likely(env->lck_mmap.lck && current_pid == env->pid)) { + reader_slot_t *const begin = &env->lck_mmap.lck->rdt[0]; + reader_slot_t *const end = &env->lck_mmap.lck->rdt[env->max_readers]; + TRACE("== %s env %p pid %d, readers %p ...%p, current-pid %d", + (current_pid == env->pid) ? "cleanup" : "skip", + __Wpedantic_format_voidptr(env), env->pid, + __Wpedantic_format_voidptr(begin), __Wpedantic_format_voidptr(end), + current_pid); + bool cleaned = false; + for (reader_slot_t *r = begin; r < end; ++r) { + if (atomic_load32(&r->pid, mo_Relaxed) == current_pid) { + atomic_store32(&r->pid, 0, mo_AcquireRelease); + TRACE("== cleanup %p", __Wpedantic_format_voidptr(r)); + cleaned = true; + } + } + if (cleaned) + atomic_store32(&env->lck_mmap.lck->rdt_refresh_flag, true, mo_Relaxed); + rc = rthc_uniq_check(&env->lck_mmap, &inprocess_neighbor); + if (!inprocess_neighbor && env->registered_reader_pid && + env->lck_mmap.fd != INVALID_HANDLE_VALUE) { + int err = lck_rpid_clear(env); + rc = rc ? 
rc : err; + } } - map->limit = limit; - -#if MDBX_ENABLE_MADVISE -#ifdef MADV_DONTFORK - if (unlikely(madvise(map->base, map->limit, MADV_DONTFORK) != 0)) - return errno; -#endif /* MADV_DONTFORK */ -#ifdef MADV_NOHUGEPAGE - (void)madvise(map->base, map->limit, MADV_NOHUGEPAGE); -#endif /* MADV_NOHUGEPAGE */ -#endif /* MDBX_ENABLE_MADVISE */ + int err = lck_destroy(env, inprocess_neighbor, current_pid); + env->pid = 0; + return rc ? rc : err; +} -#endif /* ! Windows */ +__cold int rthc_remove(MDBX_env *const env) { + TRACE(">>> env %p, key %zu, rthc_count %u, rthc_limit %u", + __Wpedantic_format_voidptr(env), (uintptr_t)env->me_txkey, rthc_count, + rthc_limit); - VALGRIND_MAKE_MEM_DEFINED(map->base, map->current); - MDBX_ASAN_UNPOISON_MEMORY_REGION(map->base, map->current); - return MDBX_SUCCESS; -} + int rc = MDBX_SUCCESS; + if (likely(env->pid)) + rc = rthc_drown(env); -MDBX_INTERNAL_FUNC int osal_munmap(osal_mmap_t *map) { - VALGRIND_MAKE_MEM_NOACCESS(map->base, map->current); - /* Unpoisoning is required for ASAN to avoid false-positive diagnostic - * when this memory will re-used by malloc or another mmapping. - * See https://libmdbx.dqdkfa.ru/dead-github/pull/93#issuecomment-613687203 */ - MDBX_ASAN_UNPOISON_MEMORY_REGION( - map->base, (map->filesize && map->filesize < map->limit) ? map->filesize - : map->limit); -#if defined(_WIN32) || defined(_WIN64) - if (map->section) - NtClose(map->section); - NTSTATUS rc = NtUnmapViewOfSection(GetCurrentProcess(), map->base); - if (!NT_SUCCESS(rc)) - ntstatus2errcode(rc); -#else - if (unlikely(munmap(map->base, map->limit))) { - assert(errno != 0); - return errno; + for (size_t i = 0; i < rthc_count; ++i) { + if (rthc_table[i].env == env) { + if (--rthc_count > 0) + rthc_table[i] = rthc_table[rthc_count]; + else if (rthc_table != rthc_table_static) { + void *tmp = rthc_table; + rthc_table = rthc_table_static; + rthc_limit = RTHC_INITIAL_LIMIT; + osal_memory_barrier(); + osal_free(tmp); + } + break; + } } -#endif /* ! 
Windows */ - map->limit = 0; - map->current = 0; - map->base = nullptr; - return MDBX_SUCCESS; + TRACE("<<< %p, key %zu, rthc_count %u, rthc_limit %u", + __Wpedantic_format_voidptr(env), (uintptr_t)env->me_txkey, rthc_count, + rthc_limit); + return rc; } -MDBX_INTERNAL_FUNC int osal_mresize(const int flags, osal_mmap_t *map, - size_t size, size_t limit) { - int rc = osal_filesize(map->fd, &map->filesize); - VERBOSE("flags 0x%x, size %zu, limit %zu, filesize %" PRIu64, flags, size, - limit, map->filesize); - assert(size <= limit); - if (rc != MDBX_SUCCESS) { - map->filesize = 0; - return rc; +#if !defined(_WIN32) && !defined(_WIN64) +__cold void rthc_afterfork(void) { + NOTICE("drown %d rthc entries", rthc_count); + for (size_t i = 0; i < rthc_count; ++i) { + MDBX_env *const env = rthc_table[i].env; + NOTICE("drown env %p", __Wpedantic_format_voidptr(env)); + if (env->lck_mmap.lck) + osal_munmap(&env->lck_mmap); + if (env->dxb_mmap.base) { + osal_munmap(&env->dxb_mmap); +#ifdef ENABLE_MEMCHECK + VALGRIND_DISCARD(env->valgrind_handle); + env->valgrind_handle = -1; +#endif /* ENABLE_MEMCHECK */ + } + env->lck = lckless_stub(env); + rthc_drown(env); } + if (rthc_table != rthc_table_static) + osal_free(rthc_table); + rthc_count = 0; + rthc_table = rthc_table_static; + rthc_limit = RTHC_INITIAL_LIMIT; + rthc_pending.weak = 0; +} +#endif /* ! 
Windows */ +__cold void rthc_ctor(void) { #if defined(_WIN32) || defined(_WIN64) - assert(size != map->current || limit != map->limit || size < map->filesize); - - NTSTATUS status; - LARGE_INTEGER SectionSize; - int err; + InitializeCriticalSection(&rthc_critical_section); +#else + ENSURE(nullptr, pthread_atfork(nullptr, nullptr, rthc_afterfork) == 0); + ENSURE(nullptr, pthread_key_create(&rthc_key, rthc_thread_dtor) == 0); + TRACE("pid %d, &mdbx_rthc_key = %p, value 0x%x", osal_getpid(), + __Wpedantic_format_voidptr(&rthc_key), (unsigned)rthc_key); +#endif +} - if (limit == map->limit && size > map->current) { - if ((flags & MDBX_RDONLY) && map->filesize >= size) { - map->current = size; - return MDBX_SUCCESS; - } else if (!(flags & MDBX_RDONLY) && - /* workaround for Wine */ mdbx_NtExtendSection) { - /* growth rw-section */ - SectionSize.QuadPart = size; - status = mdbx_NtExtendSection(map->section, &SectionSize); - if (!NT_SUCCESS(status)) - return ntstatus2errcode(status); - map->current = size; - if (map->filesize < size) - map->filesize = size; - return MDBX_SUCCESS; +__cold void rthc_dtor(const uint32_t current_pid) { + rthc_lock(); +#if !defined(_WIN32) && !defined(_WIN64) + uint64_t *rthc = pthread_getspecific(rthc_key); + TRACE("== thread 0x%" PRIxPTR ", rthc %p, pid %d, self-status 0x%08" PRIx64 + ", left %d", + osal_thread_self(), __Wpedantic_format_voidptr(rthc), current_pid, + rthc ? 
rthc_read(rthc) : ~UINT64_C(0), + atomic_load32(&rthc_pending, mo_Relaxed)); + if (rthc) { + const uint64_t sign_registered = MDBX_THREAD_RTHC_REGISTERED(rthc); + const uint64_t sign_counted = MDBX_THREAD_RTHC_COUNTED(rthc); + const uint64_t state = rthc_read(rthc); + if (state == sign_registered && + rthc_compare_and_clean(rthc, sign_registered)) { + TRACE("== thread 0x%" PRIxPTR + ", rthc %p, pid %d, self-status %s (0x%08" PRIx64 ")", + osal_thread_self(), __Wpedantic_format_voidptr(rthc), current_pid, + "registered", state); + } else if (state == sign_counted && + rthc_compare_and_clean(rthc, sign_counted)) { + TRACE("== thread 0x%" PRIxPTR + ", rthc %p, pid %d, self-status %s (0x%08" PRIx64 ")", + osal_thread_self(), __Wpedantic_format_voidptr(rthc), current_pid, + "counted", state); + ENSURE(nullptr, atomic_sub32(&rthc_pending, 1) > 0); + } else { + WARNING("thread 0x%" PRIxPTR + ", rthc %p, pid %d, self-status %s (0x%08" PRIx64 ")", + osal_thread_self(), __Wpedantic_format_voidptr(rthc), current_pid, + "wrong", state); } } - if (limit > map->limit) { - err = check_mmap_limit(limit); - if (unlikely(err != MDBX_SUCCESS)) - return err; - - /* check ability of address space for growth before unmap */ - PVOID BaseAddress = (PBYTE)map->base + map->limit; - SIZE_T RegionSize = limit - map->limit; - status = NtAllocateVirtualMemory(GetCurrentProcess(), &BaseAddress, 0, - &RegionSize, MEM_RESERVE, PAGE_NOACCESS); - if (status == (NTSTATUS) /* STATUS_CONFLICTING_ADDRESSES */ 0xC0000018) - return MDBX_UNABLE_EXTEND_MAPSIZE; - if (!NT_SUCCESS(status)) - return ntstatus2errcode(status); - - status = NtFreeVirtualMemory(GetCurrentProcess(), &BaseAddress, &RegionSize, - MEM_RELEASE); - if (!NT_SUCCESS(status)) - return ntstatus2errcode(status); + struct timespec abstime; + ENSURE(nullptr, clock_gettime(CLOCK_REALTIME, &abstime) == 0); + abstime.tv_nsec += 1000000000l / 10; + if (abstime.tv_nsec >= 1000000000l) { + abstime.tv_nsec -= 1000000000l; + abstime.tv_sec += 1; } 
+#if MDBX_DEBUG > 0 + abstime.tv_sec += 600; +#endif - /* Windows unable: - * - shrink a mapped file; - * - change size of mapped view; - * - extend read-only mapping; - * Therefore we should unmap/map entire section. */ - if ((flags & MDBX_MRESIZE_MAY_UNMAP) == 0) { - if (size <= map->current && limit == map->limit) - return MDBX_SUCCESS; - return MDBX_EPERM; + for (unsigned left; + (left = atomic_load32(&rthc_pending, mo_AcquireRelease)) > 0;) { + NOTICE("tls-cleanup: pid %d, pending %u, wait for...", current_pid, left); + const int rc = pthread_cond_timedwait(&rthc_cond, &rthc_mutex, &abstime); + if (rc && rc != EINTR) + break; } + thread_key_delete(rthc_key); +#endif - /* Unpoisoning is required for ASAN to avoid false-positive diagnostic - * when this memory will re-used by malloc or another mmapping. - * See https://libmdbx.dqdkfa.ru/dead-github/pull/93#issuecomment-613687203 */ - MDBX_ASAN_UNPOISON_MEMORY_REGION(map->base, map->limit); - status = NtUnmapViewOfSection(GetCurrentProcess(), map->base); - if (!NT_SUCCESS(status)) - return ntstatus2errcode(status); - status = NtClose(map->section); - map->section = NULL; - PVOID ReservedAddress = NULL; - SIZE_T ReservedSize = limit; - - if (!NT_SUCCESS(status)) { - bailout_ntstatus: - err = ntstatus2errcode(status); - map->base = NULL; - map->current = map->limit = 0; - if (ReservedAddress) { - ReservedSize = 0; - status = NtFreeVirtualMemory(GetCurrentProcess(), &ReservedAddress, - &ReservedSize, MEM_RELEASE); - assert(NT_SUCCESS(status)); - (void)status; + for (size_t i = 0; i < rthc_count; ++i) { + MDBX_env *const env = rthc_table[i].env; + if (env->pid != current_pid) + continue; + if (!(env->flags & ENV_TXKEY)) + continue; + reader_slot_t *const begin = &env->lck_mmap.lck->rdt[0]; + reader_slot_t *const end = &env->lck_mmap.lck->rdt[env->max_readers]; + thread_key_delete(env->me_txkey); + bool cleaned = false; + for (reader_slot_t *reader = begin; reader < end; ++reader) { + TRACE("== [%zi] = key %" PRIuPTR 
", %p ... %p, rthc %p (%+i), " + "rthc-pid %i, current-pid %i", + i, (uintptr_t)env->me_txkey, __Wpedantic_format_voidptr(begin), + __Wpedantic_format_voidptr(end), __Wpedantic_format_voidptr(reader), + (int)(reader - begin), reader->pid.weak, current_pid); + if (atomic_load32(&reader->pid, mo_Relaxed) == current_pid) { + (void)atomic_cas32(&reader->pid, current_pid, 0); + TRACE("== cleanup %p", __Wpedantic_format_voidptr(reader)); + cleaned = true; + } } - return err; + if (cleaned) + atomic_store32(&env->lck->rdt_refresh_flag, true, mo_Relaxed); } -retry_file_and_section: - /* resizing of the file may take a while, - * therefore we reserve address space to avoid occupy it by other threads */ - ReservedAddress = map->base; - status = NtAllocateVirtualMemory(GetCurrentProcess(), &ReservedAddress, 0, - &ReservedSize, MEM_RESERVE, PAGE_NOACCESS); - if (!NT_SUCCESS(status)) { - ReservedAddress = NULL; - if (status != (NTSTATUS) /* STATUS_CONFLICTING_ADDRESSES */ 0xC0000018) - goto bailout_ntstatus /* no way to recovery */; - - if (flags & MDBX_MRESIZE_MAY_MOVE) - /* the base address could be changed */ - map->base = NULL; - } + rthc_limit = rthc_count = 0; + if (rthc_table != rthc_table_static) + osal_free(rthc_table); + rthc_table = nullptr; + rthc_unlock(); - if ((flags & MDBX_RDONLY) == 0 && map->filesize != size) { - err = osal_ftruncate(map->fd, size); - if (err == MDBX_SUCCESS) - map->filesize = size; - /* ignore error, because Windows unable shrink file - * that already mapped (by another process) */ - } +#if defined(_WIN32) || defined(_WIN64) + DeleteCriticalSection(&rthc_critical_section); +#else + /* LY: yielding a few timeslices to give a more chance + * to racing destructor(s) for completion. */ + workaround_glibc_bug21031(); +#endif +} +/// \copyright SPDX-License-Identifier: Apache-2.0 +/// \note Please refer to the COPYRIGHT file for explanations license change, +/// credits and acknowledgments. 
+/// \author Леонид Юрьев aka Leonid Yuriev \date 2015-2024 - SectionSize.QuadPart = size; - status = NtCreateSection( - &map->section, - /* DesiredAccess */ - (flags & MDBX_WRITEMAP) - ? SECTION_QUERY | SECTION_MAP_READ | SECTION_EXTEND_SIZE | - SECTION_MAP_WRITE - : SECTION_QUERY | SECTION_MAP_READ | SECTION_EXTEND_SIZE, - /* ObjectAttributes */ NULL, - /* MaximumSize (InitialSize) */ &SectionSize, - /* SectionPageProtection */ - (flags & MDBX_RDONLY) ? PAGE_READONLY : PAGE_READWRITE, - /* AllocationAttributes */ SEC_RESERVE, map->fd); - if (!NT_SUCCESS(status)) - goto bailout_ntstatus; +static MDBX_cursor *cursor_clone(const MDBX_cursor *csrc, + cursor_couple_t *couple) { + cASSERT(csrc, csrc->txn->txnid >= csrc->txn->env->lck->cached_oldest.weak); + couple->outer.next = nullptr; + couple->outer.backup = nullptr; + couple->outer.subcur = nullptr; + couple->outer.clc = nullptr; + couple->outer.txn = csrc->txn; + couple->outer.dbi_state = csrc->dbi_state; + couple->outer.checking = z_pagecheck; + couple->outer.tree = nullptr; + couple->outer.top_and_flags = 0; - if (ReservedAddress) { - /* release reserved address space */ - ReservedSize = 0; - status = NtFreeVirtualMemory(GetCurrentProcess(), &ReservedAddress, - &ReservedSize, MEM_RELEASE); - ReservedAddress = NULL; - if (!NT_SUCCESS(status)) - goto bailout_ntstatus; + MDBX_cursor *cdst = &couple->outer; + if (is_inner(csrc)) { + couple->inner.cursor.next = nullptr; + couple->inner.cursor.backup = nullptr; + couple->inner.cursor.subcur = nullptr; + couple->inner.cursor.txn = csrc->txn; + couple->inner.cursor.dbi_state = csrc->dbi_state; + couple->outer.subcur = &couple->inner; + cdst = &couple->inner.cursor; } -retry_mapview:; - SIZE_T ViewSize = (flags & MDBX_RDONLY) ? 
size : limit; - status = NtMapViewOfSection( - map->section, GetCurrentProcess(), &map->base, - /* ZeroBits */ 0, - /* CommitSize */ 0, - /* SectionOffset */ NULL, &ViewSize, - /* InheritDisposition */ ViewUnmap, - /* AllocationType */ (flags & MDBX_RDONLY) ? 0 : MEM_RESERVE, - /* Win32Protect */ - (flags & MDBX_WRITEMAP) ? PAGE_READWRITE : PAGE_READONLY); - - if (!NT_SUCCESS(status)) { - if (status == (NTSTATUS) /* STATUS_CONFLICTING_ADDRESSES */ 0xC0000018 && - map->base && (flags & MDBX_MRESIZE_MAY_MOVE) != 0) { - /* try remap at another base address */ - map->base = NULL; - goto retry_mapview; - } - NtClose(map->section); - map->section = NULL; - - if (map->base && (size != map->current || limit != map->limit)) { - /* try remap with previously size and limit, - * but will return MDBX_UNABLE_EXTEND_MAPSIZE on success */ - rc = (limit > map->limit) ? MDBX_UNABLE_EXTEND_MAPSIZE : MDBX_EPERM; - size = map->current; - ReservedSize = limit = map->limit; - goto retry_file_and_section; - } - - /* no way to recovery */ - goto bailout_ntstatus; - } - assert(map->base != MAP_FAILED); + cdst->checking = csrc->checking; + cdst->tree = csrc->tree; + cdst->clc = csrc->clc; + cursor_cpstk(csrc, cdst); + return cdst; +} - map->current = (size_t)SectionSize.QuadPart; - map->limit = ViewSize; +/*----------------------------------------------------------------------------*/ -#else /* Windows */ +void recalculate_merge_thresholds(MDBX_env *env) { + const size_t bytes = page_space(env); + env->merge_threshold = + (uint16_t)(bytes - + (bytes * env->options.merge_threshold_16dot16_percent >> 16)); + env->merge_threshold_gc = + (uint16_t)(bytes - ((env->options.merge_threshold_16dot16_percent > 19005) + ? bytes / 3 /* 33 % */ + : bytes / 4 /* 25 % */)); +} - if (flags & MDBX_RDONLY) { - if (size > map->filesize) - rc = MDBX_UNABLE_EXTEND_MAPSIZE; - else if (size < map->filesize && map->filesize > limit) - rc = MDBX_EPERM; - map->current = (map->filesize > limit) ? 
limit : (size_t)map->filesize; - } else { - if (size > map->filesize || - (size < map->filesize && (flags & MDBX_SHRINK_ALLOWED))) { - rc = osal_ftruncate(map->fd, size); - VERBOSE("ftruncate %zu, err %d", size, rc); - if (rc != MDBX_SUCCESS) - return rc; - map->filesize = size; - } +int tree_drop(MDBX_cursor *mc, const bool may_have_subDBs) { + MDBX_txn *txn = mc->txn; + int rc = tree_search(mc, nullptr, Z_FIRST); + if (likely(rc == MDBX_SUCCESS)) { + /* DUPSORT sub-DBs have no large-pages/subDBs. Omit scanning leaves. + * This also avoids any P_DUPFIX pages, which have no nodes. + * Also if the DB doesn't have sub-DBs and has no large/overflow + * pages, omit scanning leaves. */ + if (!(may_have_subDBs | mc->tree->large_pages)) + cursor_pop(mc); - if (map->current > size) { - /* Clearing asan's bitmask for the region which released in shrinking, - * since: - * - after the shrinking we will get an exception when accessing - * this region and (therefore) do not need the help of ASAN. - * - this allows us to clear the mask only within the file size - * when closing the mapping. */ - MDBX_ASAN_UNPOISON_MEMORY_REGION( - ptr_disp(map->base, size), - ((map->current < map->limit) ? map->current : map->limit) - size); - } - map->current = (size < map->limit) ? size : map->limit; - } + rc = pnl_need(&txn->tw.retired_pages, (size_t)mc->tree->branch_pages + + (size_t)mc->tree->leaf_pages + + (size_t)mc->tree->large_pages); + if (unlikely(rc != MDBX_SUCCESS)) + goto bailout; - if (limit == map->limit) - return rc; + page_t *stack[CURSOR_STACK_SIZE]; + for (intptr_t i = 0; i <= mc->top; ++i) + stack[i] = mc->pg[i]; - if (limit < map->limit) { - /* unmap an excess at end of mapping. 
*/ - // coverity[offset_free : FALSE] - if (unlikely(munmap(ptr_disp(map->base, limit), map->limit - limit))) { - assert(errno != 0); - return errno; - } - map->limit = limit; - return rc; + while (mc->top >= 0) { + page_t *const mp = mc->pg[mc->top]; + const size_t nkeys = page_numkeys(mp); + if (is_leaf(mp)) { + cASSERT(mc, mc->top + 1 == mc->tree->height); + for (size_t i = 0; i < nkeys; i++) { + node_t *node = page_node(mp, i); + if (node_flags(node) & N_BIGDATA) { + rc = page_retire_ex(mc, node_largedata_pgno(node), nullptr, 0); + if (unlikely(rc != MDBX_SUCCESS)) + goto bailout; + if (!(may_have_subDBs | mc->tree->large_pages)) + goto pop; + } else if (node_flags(node) & N_SUBDATA) { + if (unlikely((node_flags(node) & N_DUPDATA) == 0)) { + rc = /* disallowing implicit subDB deletion */ MDBX_INCOMPATIBLE; + goto bailout; + } + rc = cursor_dupsort_setup(mc, node, mp); + if (unlikely(rc != MDBX_SUCCESS)) + goto bailout; + rc = tree_drop(&mc->subcur->cursor, false); + if (unlikely(rc != MDBX_SUCCESS)) + goto bailout; + } + } + } else { + cASSERT(mc, mc->top + 1 < mc->tree->height); + mc->checking |= z_retiring; + const unsigned pagetype = (is_frozen(txn, mp) ? P_FROZEN : 0) + + ((mc->top + 2 == mc->tree->height) + ? (mc->checking & (P_LEAF | P_DUPFIX)) + : P_BRANCH); + for (size_t i = 0; i < nkeys; i++) { + node_t *node = page_node(mp, i); + tASSERT(txn, (node_flags(node) & + (N_BIGDATA | N_SUBDATA | N_DUPDATA)) == 0); + const pgno_t pgno = node_pgno(node); + rc = page_retire_ex(mc, pgno, nullptr, pagetype); + if (unlikely(rc != MDBX_SUCCESS)) + goto bailout; + } + mc->checking -= z_retiring; + } + if (!mc->top) + break; + cASSERT(mc, nkeys > 0); + mc->ki[mc->top] = (indx_t)nkeys; + rc = cursor_sibling_right(mc); + if (unlikely(rc != MDBX_SUCCESS)) { + if (unlikely(rc != MDBX_NOTFOUND)) + goto bailout; + /* no more siblings, go back to beginning + * of previous level. 
*/ + pop: + cursor_pop(mc); + mc->ki[0] = 0; + for (intptr_t i = 1; i <= mc->top; i++) { + mc->pg[i] = stack[i]; + mc->ki[i] = 0; + } + } + } + rc = page_retire(mc, mc->pg[0]); } - int err = check_mmap_limit(limit); - if (unlikely(err != MDBX_SUCCESS)) - return err; +bailout: + be_poor(mc); + if (unlikely(rc != MDBX_SUCCESS)) + txn->flags |= MDBX_TXN_ERROR; + return rc; +} - assert(limit > map->limit); - void *ptr = MAP_FAILED; +static int node_move(MDBX_cursor *csrc, MDBX_cursor *cdst, bool fromleft) { + int rc; + DKBUF_DEBUG; -#if (defined(__linux__) || defined(__gnu_linux__)) && defined(_GNU_SOURCE) - ptr = mremap(map->base, map->limit, limit, -#if defined(MREMAP_MAYMOVE) - (flags & MDBX_MRESIZE_MAY_MOVE) ? MREMAP_MAYMOVE : -#endif /* MREMAP_MAYMOVE */ - 0); - if (ptr == MAP_FAILED) { - err = errno; - assert(err != 0); - switch (err) { - default: - return err; - case 0 /* paranoia */: - case EAGAIN: - case ENOMEM: - return MDBX_UNABLE_EXTEND_MAPSIZE; - case EFAULT /* MADV_DODUMP / MADV_DONTDUMP are mixed for mmap-range */: - break; - } + page_t *psrc = csrc->pg[csrc->top]; + page_t *pdst = cdst->pg[cdst->top]; + cASSERT(csrc, page_type(psrc) == page_type(pdst)); + cASSERT(csrc, csrc->tree == cdst->tree); + cASSERT(csrc, csrc->top == cdst->top); + if (unlikely(page_type(psrc) != page_type(pdst))) { + bailout: + ERROR("Wrong or mismatch pages's types (src %d, dst %d) to move node", + page_type(psrc), page_type(pdst)); + csrc->txn->flags |= MDBX_TXN_ERROR; + return MDBX_PROBLEM; } -#endif /* Linux & _GNU_SOURCE */ - const unsigned mmap_flags = - MAP_CONCEAL | MAP_SHARED | MAP_FILE | MAP_NORESERVE | - (F_ISSET(flags, MDBX_UTTERLY_NOSYNC) ? MAP_NOSYNC : 0); - const unsigned mmap_prot = - (flags & MDBX_WRITEMAP) ? 
PROT_READ | PROT_WRITE : PROT_READ; + MDBX_val key4move; + switch (page_type(psrc)) { + case P_BRANCH: { + const node_t *srcnode = page_node(psrc, csrc->ki[csrc->top]); + cASSERT(csrc, node_flags(srcnode) == 0); + const pgno_t srcpg = node_pgno(srcnode); + key4move.iov_len = node_ks(srcnode); + key4move.iov_base = node_key(srcnode); - if (ptr == MAP_FAILED) { - /* Try to mmap additional space beyond the end of mapping. */ - ptr = mmap(ptr_disp(map->base, map->limit), limit - map->limit, mmap_prot, - mmap_flags | MAP_FIXED_NOREPLACE, map->fd, map->limit); - if (ptr == ptr_disp(map->base, map->limit)) - /* успешно прилепили отображение в конец */ - ptr = map->base; - else if (ptr != MAP_FAILED) { - /* the desired address is busy, unmap unsuitable one */ - if (unlikely(munmap(ptr, limit - map->limit))) { - assert(errno != 0); - return errno; - } - ptr = MAP_FAILED; - } else { - err = errno; - assert(err != 0); - switch (err) { - default: - return err; - case 0 /* paranoia */: - case EAGAIN: - case ENOMEM: - return MDBX_UNABLE_EXTEND_MAPSIZE; - case EEXIST: /* address busy */ - case EINVAL: /* kernel don't support MAP_FIXED_NOREPLACE */ - break; + if (csrc->ki[csrc->top] == 0) { + const int8_t top = csrc->top; + cASSERT(csrc, top >= 0); + /* must find the lowest key below src */ + rc = tree_search_lowest(csrc); + page_t *lowest_page = csrc->pg[csrc->top]; + if (unlikely(rc != MDBX_SUCCESS)) + return rc; + cASSERT(csrc, is_leaf(lowest_page)); + if (unlikely(!is_leaf(lowest_page))) + goto bailout; + if (is_dupfix_leaf(lowest_page)) + key4move = page_dupfix_key(lowest_page, 0, csrc->tree->dupfix_size); + else { + const node_t *lowest_node = page_node(lowest_page, 0); + key4move.iov_len = node_ks(lowest_node); + key4move.iov_base = node_key(lowest_node); } - } - } - if (ptr == MAP_FAILED) { - /* unmap and map again whole region */ - if ((flags & MDBX_MRESIZE_MAY_UNMAP) == 0) { - /* TODO: Perhaps here it is worth to implement suspend/resume threads - * and perform unmap/map 
as like for Windows. */ - return MDBX_UNABLE_EXTEND_MAPSIZE; - } + /* restore cursor after mdbx_page_search_lowest() */ + csrc->top = top; + csrc->ki[csrc->top] = 0; - if (unlikely(munmap(map->base, map->limit))) { - assert(errno != 0); - return errno; + /* paranoia */ + cASSERT(csrc, psrc == csrc->pg[csrc->top]); + cASSERT(csrc, is_branch(psrc)); + if (unlikely(!is_branch(psrc))) + goto bailout; } - // coverity[pass_freed_arg : FALSE] - ptr = mmap(map->base, limit, mmap_prot, - (flags & MDBX_MRESIZE_MAY_MOVE) - ? mmap_flags - : mmap_flags | (MAP_FIXED_NOREPLACE ? MAP_FIXED_NOREPLACE - : MAP_FIXED), - map->fd, 0); - if (MAP_FIXED_NOREPLACE != 0 && MAP_FIXED_NOREPLACE != MAP_FIXED && - unlikely(ptr == MAP_FAILED) && !(flags & MDBX_MRESIZE_MAY_MOVE) && - errno == /* kernel don't support MAP_FIXED_NOREPLACE */ EINVAL) - // coverity[pass_freed_arg : FALSE] - ptr = - mmap(map->base, limit, mmap_prot, mmap_flags | MAP_FIXED, map->fd, 0); + if (cdst->ki[cdst->top] == 0) { + cursor_couple_t couple; + MDBX_cursor *const mn = cursor_clone(cdst, &couple); + const int8_t top = cdst->top; + cASSERT(csrc, top >= 0); - if (unlikely(ptr == MAP_FAILED)) { - /* try to restore prev mapping */ - // coverity[pass_freed_arg : FALSE] - ptr = mmap(map->base, map->limit, mmap_prot, - (flags & MDBX_MRESIZE_MAY_MOVE) - ? mmap_flags - : mmap_flags | (MAP_FIXED_NOREPLACE ? MAP_FIXED_NOREPLACE - : MAP_FIXED), - map->fd, 0); - if (MAP_FIXED_NOREPLACE != 0 && MAP_FIXED_NOREPLACE != MAP_FIXED && - unlikely(ptr == MAP_FAILED) && !(flags & MDBX_MRESIZE_MAY_MOVE) && - errno == /* kernel don't support MAP_FIXED_NOREPLACE */ EINVAL) - // coverity[pass_freed_arg : FALSE] - ptr = mmap(map->base, map->limit, mmap_prot, mmap_flags | MAP_FIXED, - map->fd, 0); - if (unlikely(ptr == MAP_FAILED)) { - VALGRIND_MAKE_MEM_NOACCESS(map->base, map->current); - /* Unpoisoning is required for ASAN to avoid false-positive diagnostic - * when this memory will re-used by malloc or another mmapping. 
- * See - * https://libmdbx.dqdkfa.ru/dead-github/pull/93#issuecomment-613687203 - */ - MDBX_ASAN_UNPOISON_MEMORY_REGION( - map->base, (map->current < map->limit) ? map->current : map->limit); - map->limit = 0; - map->current = 0; - map->base = nullptr; - assert(errno != 0); - return errno; + /* must find the lowest key below dst */ + rc = tree_search_lowest(mn); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; + page_t *const lowest_page = mn->pg[mn->top]; + cASSERT(cdst, is_leaf(lowest_page)); + if (unlikely(!is_leaf(lowest_page))) + goto bailout; + MDBX_val key; + if (is_dupfix_leaf(lowest_page)) + key = page_dupfix_key(lowest_page, 0, mn->tree->dupfix_size); + else { + node_t *lowest_node = page_node(lowest_page, 0); + key.iov_len = node_ks(lowest_node); + key.iov_base = node_key(lowest_node); } - rc = MDBX_UNABLE_EXTEND_MAPSIZE; - limit = map->limit; + + /* restore cursor after mdbx_page_search_lowest() */ + mn->top = top; + mn->ki[mn->top] = 0; + + const intptr_t delta = EVEN_CEIL(key.iov_len) - + EVEN_CEIL(node_ks(page_node(mn->pg[mn->top], 0))); + const intptr_t needed = branch_size(cdst->txn->env, &key4move) + delta; + const intptr_t have = page_room(pdst); + if (unlikely(needed > have)) + return MDBX_RESULT_TRUE; + + if (unlikely((rc = page_touch(csrc)) || (rc = page_touch(cdst)))) + return rc; + psrc = csrc->pg[csrc->top]; + pdst = cdst->pg[cdst->top]; + + couple.outer.next = mn->txn->cursors[cursor_dbi(mn)]; + mn->txn->cursors[cursor_dbi(mn)] = &couple.outer; + rc = tree_propagate_key(mn, &key); + mn->txn->cursors[cursor_dbi(mn)] = couple.outer.next; + if (unlikely(rc != MDBX_SUCCESS)) + return rc; + } else { + const size_t needed = branch_size(cdst->txn->env, &key4move); + const size_t have = page_room(pdst); + if (unlikely(needed > have)) + return MDBX_RESULT_TRUE; + + if (unlikely((rc = page_touch(csrc)) || (rc = page_touch(cdst)))) + return rc; + psrc = csrc->pg[csrc->top]; + pdst = cdst->pg[cdst->top]; } - } - assert(ptr && ptr != MAP_FAILED); - if 
(map->base != ptr) { - VALGRIND_MAKE_MEM_NOACCESS(map->base, map->current); - /* Unpoisoning is required for ASAN to avoid false-positive diagnostic - * when this memory will re-used by malloc or another mmapping. - * See - * https://libmdbx.dqdkfa.ru/dead-github/pull/93#issuecomment-613687203 - */ - MDBX_ASAN_UNPOISON_MEMORY_REGION( - map->base, (map->current < map->limit) ? map->current : map->limit); + DEBUG("moving %s-node %u [%s] on page %" PRIaPGNO + " to node %u on page %" PRIaPGNO, + "branch", csrc->ki[csrc->top], DKEY_DEBUG(&key4move), psrc->pgno, + cdst->ki[cdst->top], pdst->pgno); + /* Add the node to the destination page. */ + rc = node_add_branch(cdst, cdst->ki[cdst->top], &key4move, srcpg); + } break; - VALGRIND_MAKE_MEM_DEFINED(ptr, map->current); - MDBX_ASAN_UNPOISON_MEMORY_REGION(ptr, map->current); - map->base = ptr; - } - map->limit = limit; - map->current = size; + case P_LEAF: { + /* Mark src and dst as dirty. */ + if (unlikely((rc = page_touch(csrc)) || (rc = page_touch(cdst)))) + return rc; + psrc = csrc->pg[csrc->top]; + pdst = cdst->pg[cdst->top]; + const node_t *srcnode = page_node(psrc, csrc->ki[csrc->top]); + MDBX_val data; + data.iov_len = node_ds(srcnode); + data.iov_base = node_data(srcnode); + key4move.iov_len = node_ks(srcnode); + key4move.iov_base = node_key(srcnode); + DEBUG("moving %s-node %u [%s] on page %" PRIaPGNO + " to node %u on page %" PRIaPGNO, + "leaf", csrc->ki[csrc->top], DKEY_DEBUG(&key4move), psrc->pgno, + cdst->ki[cdst->top], pdst->pgno); + /* Add the node to the destination page. */ + rc = node_add_leaf(cdst, cdst->ki[cdst->top], &key4move, &data, + node_flags(srcnode)); + } break; -#if MDBX_ENABLE_MADVISE -#ifdef MADV_DONTFORK - if (unlikely(madvise(map->base, map->limit, MADV_DONTFORK) != 0)) { - assert(errno != 0); - return errno; + case P_LEAF | P_DUPFIX: { + /* Mark src and dst as dirty. 
*/ + if (unlikely((rc = page_touch(csrc)) || (rc = page_touch(cdst)))) + return rc; + psrc = csrc->pg[csrc->top]; + pdst = cdst->pg[cdst->top]; + key4move = + page_dupfix_key(psrc, csrc->ki[csrc->top], csrc->tree->dupfix_size); + DEBUG("moving %s-node %u [%s] on page %" PRIaPGNO + " to node %u on page %" PRIaPGNO, + "leaf2", csrc->ki[csrc->top], DKEY_DEBUG(&key4move), psrc->pgno, + cdst->ki[cdst->top], pdst->pgno); + /* Add the node to the destination page. */ + rc = node_add_dupfix(cdst, cdst->ki[cdst->top], &key4move); + } break; + + default: + assert(false); + goto bailout; } -#endif /* MADV_DONTFORK */ -#ifdef MADV_NOHUGEPAGE - (void)madvise(map->base, map->limit, MADV_NOHUGEPAGE); -#endif /* MADV_NOHUGEPAGE */ -#endif /* MDBX_ENABLE_MADVISE */ -#endif /* POSIX / Windows */ + if (unlikely(rc != MDBX_SUCCESS)) + return rc; - /* Zap: Redundant code */ - MDBX_SUPPRESS_GOOFY_MSVC_ANALYZER(6287); - assert(rc != MDBX_SUCCESS || - (map->base != nullptr && map->base != MAP_FAILED && - map->current == size && map->limit == limit && - map->filesize >= size)); - return rc; -} + /* Delete the node from the source page. */ + node_del(csrc, key4move.iov_len); + + cASSERT(csrc, psrc == csrc->pg[csrc->top]); + cASSERT(cdst, pdst == cdst->pg[cdst->top]); + cASSERT(csrc, page_type(psrc) == page_type(pdst)); + + /* csrc курсор тут всегда временный, на стеке внутри tree_rebalance(), + * и его нет необходимости корректировать. */ + { + /* Adjust other cursors pointing to mp */ + MDBX_cursor *m2, *m3; + const size_t dbi = cursor_dbi(csrc); + cASSERT(csrc, csrc->top == cdst->top); + if (fromleft) { + /* If we're adding on the left, bump others up */ + for (m2 = csrc->txn->cursors[dbi]; m2; m2 = m2->next) { + m3 = (csrc->flags & z_inner) ? 
&m2->subcur->cursor : m2; + if (!is_related(csrc, m3)) + continue; + + if (m3 != cdst && m3->pg[csrc->top] == pdst && + m3->ki[csrc->top] >= cdst->ki[csrc->top]) { + m3->ki[csrc->top] += 1; + } + + if (/* m3 != csrc && */ m3->pg[csrc->top] == psrc && + m3->ki[csrc->top] == csrc->ki[csrc->top]) { + m3->pg[csrc->top] = pdst; + m3->ki[csrc->top] = cdst->ki[cdst->top]; + cASSERT(csrc, csrc->top > 0); + m3->ki[csrc->top - 1] += 1; + } + + if (is_leaf(psrc) && inner_pointed(m3)) { + cASSERT(csrc, csrc->top == m3->top); + size_t nkeys = page_numkeys(m3->pg[csrc->top]); + if (likely(nkeys > m3->ki[csrc->top])) + cursor_inner_refresh(m3, m3->pg[csrc->top], m3->ki[csrc->top]); + } + } + } else { + /* Adding on the right, bump others down */ + for (m2 = csrc->txn->cursors[dbi]; m2; m2 = m2->next) { + m3 = (csrc->flags & z_inner) ? &m2->subcur->cursor : m2; + if (!is_related(csrc, m3)) + continue; + if (m3->pg[csrc->top] == psrc) { + if (!m3->ki[csrc->top]) { + m3->pg[csrc->top] = pdst; + m3->ki[csrc->top] = cdst->ki[cdst->top]; + cASSERT(csrc, csrc->top > 0 && m3->ki[csrc->top - 1] > 0); + m3->ki[csrc->top - 1] -= 1; + } else + m3->ki[csrc->top] -= 1; + + if (is_leaf(psrc) && inner_pointed(m3)) { + cASSERT(csrc, csrc->top == m3->top); + size_t nkeys = page_numkeys(m3->pg[csrc->top]); + if (likely(nkeys > m3->ki[csrc->top])) + cursor_inner_refresh(m3, m3->pg[csrc->top], m3->ki[csrc->top]); + } + } + } + } + } -/*----------------------------------------------------------------------------*/ + /* Update the parent separators. 
*/ + if (csrc->ki[csrc->top] == 0) { + cASSERT(csrc, csrc->top > 0); + if (csrc->ki[csrc->top - 1] != 0) { + MDBX_val key; + if (is_dupfix_leaf(psrc)) + key = page_dupfix_key(psrc, 0, csrc->tree->dupfix_size); + else { + node_t *srcnode = page_node(psrc, 0); + key.iov_len = node_ks(srcnode); + key.iov_base = node_key(srcnode); + } + DEBUG("update separator for source page %" PRIaPGNO " to [%s]", + psrc->pgno, DKEY_DEBUG(&key)); -__cold MDBX_INTERNAL_FUNC void osal_jitter(bool tiny) { - for (;;) { -#if defined(_M_IX86) || defined(_M_X64) || defined(__i386__) || \ - defined(__x86_64__) - const unsigned salt = 277u * (unsigned)__rdtsc(); -#elif (defined(_WIN32) || defined(_WIN64)) && MDBX_WITHOUT_MSVC_CRT - static ULONG state; - const unsigned salt = (unsigned)RtlRandomEx(&state); -#else - const unsigned salt = rand(); -#endif + cursor_couple_t couple; + MDBX_cursor *const mn = cursor_clone(csrc, &couple); + cASSERT(csrc, mn->top > 0); + mn->top -= 1; - const unsigned coin = salt % (tiny ? 
29u : 43u); - if (coin < 43 / 3) - break; -#if defined(_WIN32) || defined(_WIN64) - SwitchToThread(); - if (coin > 43 * 2 / 3) - Sleep(1); -#else - sched_yield(); - if (coin > 43 * 2 / 3) - usleep(coin); -#endif + couple.outer.next = mn->txn->cursors[cursor_dbi(mn)]; + mn->txn->cursors[cursor_dbi(mn)] = &couple.outer; + rc = tree_propagate_key(mn, &key); + mn->txn->cursors[cursor_dbi(mn)] = couple.outer.next; + if (unlikely(rc != MDBX_SUCCESS)) + return rc; + } + if (is_branch(psrc)) { + const MDBX_val nullkey = {0, 0}; + const indx_t ix = csrc->ki[csrc->top]; + csrc->ki[csrc->top] = 0; + rc = tree_propagate_key(csrc, &nullkey); + csrc->ki[csrc->top] = ix; + cASSERT(csrc, rc == MDBX_SUCCESS); + } } -} -/*----------------------------------------------------------------------------*/ + if (cdst->ki[cdst->top] == 0) { + cASSERT(cdst, cdst->top > 0); + if (cdst->ki[cdst->top - 1] != 0) { + MDBX_val key; + if (is_dupfix_leaf(pdst)) + key = page_dupfix_key(pdst, 0, cdst->tree->dupfix_size); + else { + node_t *srcnode = page_node(pdst, 0); + key.iov_len = node_ks(srcnode); + key.iov_base = node_key(srcnode); + } + DEBUG("update separator for destination page %" PRIaPGNO " to [%s]", + pdst->pgno, DKEY_DEBUG(&key)); + cursor_couple_t couple; + MDBX_cursor *const mn = cursor_clone(cdst, &couple); + cASSERT(cdst, mn->top > 0); + mn->top -= 1; + + couple.outer.next = mn->txn->cursors[cursor_dbi(mn)]; + mn->txn->cursors[cursor_dbi(mn)] = &couple.outer; + rc = tree_propagate_key(mn, &key); + mn->txn->cursors[cursor_dbi(mn)] = couple.outer.next; + if (unlikely(rc != MDBX_SUCCESS)) + return rc; + } + if (is_branch(pdst)) { + const MDBX_val nullkey = {0, 0}; + const indx_t ix = cdst->ki[cdst->top]; + cdst->ki[cdst->top] = 0; + rc = tree_propagate_key(cdst, &nullkey); + cdst->ki[cdst->top] = ix; + cASSERT(cdst, rc == MDBX_SUCCESS); + } + } -#if defined(_WIN32) || defined(_WIN64) -static LARGE_INTEGER performance_frequency; -#elif defined(__APPLE__) || defined(__MACH__) -#include 
-static uint64_t ratio_16dot16_to_monotine; -#elif defined(__linux__) || defined(__gnu_linux__) -static clockid_t posix_clockid; -__cold static clockid_t choice_monoclock(void) { - struct timespec probe; -#if defined(CLOCK_BOOTTIME) - if (clock_gettime(CLOCK_BOOTTIME, &probe) == 0) - return CLOCK_BOOTTIME; -#elif defined(CLOCK_MONOTONIC_RAW) - if (clock_gettime(CLOCK_MONOTONIC_RAW, &probe) == 0) - return CLOCK_MONOTONIC_RAW; -#elif defined(CLOCK_MONOTONIC_COARSE) - if (clock_gettime(CLOCK_MONOTONIC_COARSE, &probe) == 0) - return CLOCK_MONOTONIC_COARSE; -#endif - return CLOCK_MONOTONIC; + return MDBX_SUCCESS; } -#elif defined(CLOCK_MONOTONIC) -#define posix_clockid CLOCK_MONOTONIC -#else -#define posix_clockid CLOCK_REALTIME -#endif -MDBX_INTERNAL_FUNC uint64_t osal_16dot16_to_monotime(uint32_t seconds_16dot16) { -#if defined(_WIN32) || defined(_WIN64) - const uint64_t ratio = performance_frequency.QuadPart; -#elif defined(__APPLE__) || defined(__MACH__) - const uint64_t ratio = ratio_16dot16_to_monotine; -#else - const uint64_t ratio = UINT64_C(1000000000); -#endif - const uint64_t ret = (ratio * seconds_16dot16 + 32768) >> 16; - return likely(ret || seconds_16dot16 == 0) ? 
ret : /* fix underflow */ 1; -} +static int page_merge(MDBX_cursor *csrc, MDBX_cursor *cdst) { + MDBX_val key; + int rc; -static uint64_t monotime_limit; -MDBX_INTERNAL_FUNC uint32_t osal_monotime_to_16dot16(uint64_t monotime) { - if (unlikely(monotime > monotime_limit)) - return UINT32_MAX; + cASSERT(csrc, csrc != cdst); + cASSERT(csrc, cursor_is_tracked(csrc)); + cASSERT(cdst, cursor_is_tracked(cdst)); + const page_t *const psrc = csrc->pg[csrc->top]; + page_t *pdst = cdst->pg[cdst->top]; + DEBUG("merging page %" PRIaPGNO " into %" PRIaPGNO, psrc->pgno, pdst->pgno); + + cASSERT(csrc, page_type(psrc) == page_type(pdst)); + cASSERT(csrc, csrc->clc == cdst->clc && csrc->tree == cdst->tree); + cASSERT(csrc, csrc->top > 0); /* can't merge root page */ + cASSERT(cdst, cdst->top > 0); + cASSERT(cdst, cdst->top + 1 < cdst->tree->height || + is_leaf(cdst->pg[cdst->tree->height - 1])); + cASSERT(csrc, csrc->top + 1 < csrc->tree->height || + is_leaf(csrc->pg[csrc->tree->height - 1])); + cASSERT(cdst, csrc->txn->env->options.prefer_waf_insteadof_balance || + page_room(pdst) >= page_used(cdst->txn->env, psrc)); + const int pagetype = page_type(psrc); - const uint32_t ret = -#if defined(_WIN32) || defined(_WIN64) - (uint32_t)((monotime << 16) / performance_frequency.QuadPart); -#elif defined(__APPLE__) || defined(__MACH__) - (uint32_t)((monotime << 16) / ratio_16dot16_to_monotine); -#else - (uint32_t)((monotime << 7) / 1953125); -#endif - return ret; -} + /* Move all nodes from src to dst */ + const size_t dst_nkeys = page_numkeys(pdst); + const size_t src_nkeys = page_numkeys(psrc); + cASSERT(cdst, dst_nkeys + src_nkeys >= (is_leaf(psrc) ? 1u : 2u)); + if (likely(src_nkeys)) { + size_t ii = dst_nkeys; + if (unlikely(pagetype & P_DUPFIX)) { + /* Mark dst as dirty. 
*/ + rc = page_touch(cdst); + cASSERT(cdst, rc != MDBX_RESULT_TRUE); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; -MDBX_INTERNAL_FUNC uint64_t osal_monotime(void) { -#if defined(_WIN32) || defined(_WIN64) - LARGE_INTEGER counter; - if (QueryPerformanceCounter(&counter)) - return counter.QuadPart; -#elif defined(__APPLE__) || defined(__MACH__) - return mach_absolute_time(); -#else - struct timespec ts; - if (likely(clock_gettime(posix_clockid, &ts) == 0)) - return ts.tv_sec * UINT64_C(1000000000) + ts.tv_nsec; -#endif - return 0; -} + key.iov_len = csrc->tree->dupfix_size; + key.iov_base = page_data(psrc); + size_t i = 0; + do { + rc = node_add_dupfix(cdst, ii++, &key); + cASSERT(cdst, rc != MDBX_RESULT_TRUE); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; + key.iov_base = ptr_disp(key.iov_base, key.iov_len); + } while (++i != src_nkeys); + } else { + node_t *srcnode = page_node(psrc, 0); + key.iov_len = node_ks(srcnode); + key.iov_base = node_key(srcnode); + if (pagetype & P_BRANCH) { + cursor_couple_t couple; + MDBX_cursor *const mn = cursor_clone(csrc, &couple); -MDBX_INTERNAL_FUNC uint64_t osal_cputime(size_t *optional_page_faults) { -#if defined(_WIN32) || defined(_WIN64) - if (optional_page_faults) { - PROCESS_MEMORY_COUNTERS pmc; - *optional_page_faults = - GetProcessMemoryInfo(GetCurrentProcess(), &pmc, sizeof(pmc)) - ? 
pmc.PageFaultCount - : 0; - } - FILETIME unused, usermode; - if (GetThreadTimes(GetCurrentThread(), - /* CreationTime */ &unused, - /* ExitTime */ &unused, - /* KernelTime */ &unused, - /* UserTime */ &usermode)) { - /* one second = 10_000_000 * 100ns = 78125 * (1 << 7) * 100ns; - * result = (h * f / 10_000_000) << 32) + l * f / 10_000_000 = - * = ((h * f) >> 7) / 78125) << 32) + ((l * f) >> 7) / 78125; - * 1) {h, l} *= f; - * 2) {h, l} >>= 7; - * 3) result = ((h / 78125) << 32) + l / 78125; */ - uint64_t l = usermode.dwLowDateTime * performance_frequency.QuadPart; - uint64_t h = usermode.dwHighDateTime * performance_frequency.QuadPart; - l = h << (64 - 7) | l >> 7; - h = h >> 7; - return ((h / 78125) << 32) + l / 78125; - } -#elif defined(RUSAGE_THREAD) || defined(RUSAGE_LWP) -#ifndef RUSAGE_THREAD -#define RUSAGE_THREAD RUSAGE_LWP /* Solaris */ -#endif - struct rusage usage; - if (getrusage(RUSAGE_THREAD, &usage) == 0) { - if (optional_page_faults) - *optional_page_faults = usage.ru_majflt; - return usage.ru_utime.tv_sec * UINT64_C(1000000000) + - usage.ru_utime.tv_usec * 1000u; - } - if (optional_page_faults) - *optional_page_faults = 0; -#elif defined(CLOCK_THREAD_CPUTIME_ID) - if (optional_page_faults) - *optional_page_faults = 0; - struct timespec ts; - if (likely(clock_gettime(CLOCK_THREAD_CPUTIME_ID, &ts) == 0)) - return ts.tv_sec * UINT64_C(1000000000) + ts.tv_nsec; -#else - /* FIXME */ - if (optional_page_faults) - *optional_page_faults = 0; -#endif - return 0; -} + /* must find the lowest key below src */ + rc = tree_search_lowest(mn); + cASSERT(csrc, rc != MDBX_RESULT_TRUE); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; -/*----------------------------------------------------------------------------*/ + const page_t *mp = mn->pg[mn->top]; + if (likely(!is_dupfix_leaf(mp))) { + cASSERT(mn, is_leaf(mp)); + const node_t *lowest = page_node(mp, 0); + key.iov_len = node_ks(lowest); + key.iov_base = node_key(lowest); + } else { + cASSERT(mn, mn->top > 
csrc->top); + key = page_dupfix_key(mp, mn->ki[mn->top], csrc->tree->dupfix_size); + } + cASSERT(mn, key.iov_len >= csrc->clc->k.lmin); + cASSERT(mn, key.iov_len <= csrc->clc->k.lmax); -static void bootid_shake(bin128_t *p) { - /* Bob Jenkins's PRNG: https://burtleburtle.net/bob/rand/smallprng.html */ - const uint32_t e = p->a - (p->b << 23 | p->b >> 9); - p->a = p->b ^ (p->c << 16 | p->c >> 16); - p->b = p->c + (p->d << 11 | p->d >> 21); - p->c = p->d + e; - p->d = e + p->a; -} + const size_t dst_room = page_room(pdst); + const size_t src_used = page_used(cdst->txn->env, psrc); + const size_t space_needed = src_used - node_ks(srcnode) + key.iov_len; + if (unlikely(space_needed > dst_room)) + return MDBX_RESULT_TRUE; + } -__cold static void bootid_collect(bin128_t *p, const void *s, size_t n) { - p->y += UINT64_C(64526882297375213); - bootid_shake(p); - for (size_t i = 0; i < n; ++i) { - bootid_shake(p); - p->y ^= UINT64_C(48797879452804441) * ((const uint8_t *)s)[i]; - bootid_shake(p); - p->y += 14621231; - } - bootid_shake(p); + /* Mark dst as dirty. 
*/ + rc = page_touch(cdst); + cASSERT(cdst, rc != MDBX_RESULT_TRUE); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; - /* minor non-linear tomfoolery */ - const unsigned z = p->x % 61; - p->y = p->y << z | p->y >> (64 - z); - bootid_shake(p); - bootid_shake(p); - const unsigned q = p->x % 59; - p->y = p->y << q | p->y >> (64 - q); - bootid_shake(p); - bootid_shake(p); - bootid_shake(p); -} + size_t i = 0; + while (true) { + if (pagetype & P_LEAF) { + MDBX_val data; + data.iov_len = node_ds(srcnode); + data.iov_base = node_data(srcnode); + rc = node_add_leaf(cdst, ii++, &key, &data, node_flags(srcnode)); + } else { + cASSERT(csrc, node_flags(srcnode) == 0); + rc = node_add_branch(cdst, ii++, &key, node_pgno(srcnode)); + } + cASSERT(cdst, rc != MDBX_RESULT_TRUE); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; -#if defined(_WIN32) || defined(_WIN64) + if (++i == src_nkeys) + break; + srcnode = page_node(psrc, i); + key.iov_len = node_ks(srcnode); + key.iov_base = node_key(srcnode); + } + } -__cold static uint64_t windows_systemtime_ms() { - FILETIME ft; - GetSystemTimeAsFileTime(&ft); - return ((uint64_t)ft.dwHighDateTime << 32 | ft.dwLowDateTime) / 10000ul; -} + pdst = cdst->pg[cdst->top]; + DEBUG("dst page %" PRIaPGNO " now has %zu keys (%u.%u%% filled)", + pdst->pgno, page_numkeys(pdst), + page_fill_percentum_x10(cdst->txn->env, pdst) / 10, + page_fill_percentum_x10(cdst->txn->env, pdst) % 10); -__cold static uint64_t windows_bootime(void) { - unsigned confirmed = 0; - uint64_t boottime = 0; - uint64_t up0 = mdbx_GetTickCount64(); - uint64_t st0 = windows_systemtime_ms(); - for (uint64_t fuse = st0; up0 && st0 < fuse + 1000 * 1000u / 42;) { - YieldProcessor(); - const uint64_t up1 = mdbx_GetTickCount64(); - const uint64_t st1 = windows_systemtime_ms(); - if (st1 > fuse && st1 == st0 && up1 == up0) { - uint64_t diff = st1 - up1; - if (boottime == diff) { - if (++confirmed > 4) - return boottime; - } else { - confirmed = 0; - boottime = diff; - } - fuse = st1; - 
Sleep(1); + cASSERT(csrc, psrc == csrc->pg[csrc->top]); + cASSERT(cdst, pdst == cdst->pg[cdst->top]); + } + + /* Unlink the src page from parent and add to free list. */ + csrc->top -= 1; + node_del(csrc, 0); + if (csrc->ki[csrc->top] == 0) { + const MDBX_val nullkey = {0, 0}; + rc = tree_propagate_key(csrc, &nullkey); + cASSERT(csrc, rc != MDBX_RESULT_TRUE); + if (unlikely(rc != MDBX_SUCCESS)) { + csrc->top += 1; + return rc; } - st0 = st1; - up0 = up1; } - return 0; -} + csrc->top += 1; -__cold static LSTATUS mdbx_RegGetValue(HKEY hKey, LPCSTR lpSubKey, - LPCSTR lpValue, PVOID pvData, - LPDWORD pcbData) { - LSTATUS rc; - if (!mdbx_RegGetValueA) { - /* an old Windows 2000/XP */ - HKEY hSubKey; - rc = RegOpenKeyA(hKey, lpSubKey, &hSubKey); - if (rc == ERROR_SUCCESS) { - rc = RegQueryValueExA(hSubKey, lpValue, NULL, NULL, pvData, pcbData); - RegCloseKey(hSubKey); + cASSERT(csrc, psrc == csrc->pg[csrc->top]); + cASSERT(cdst, pdst == cdst->pg[cdst->top]); + + { + /* Adjust other cursors pointing to mp */ + MDBX_cursor *m2, *m3; + const size_t dbi = cursor_dbi(csrc); + for (m2 = csrc->txn->cursors[dbi]; m2; m2 = m2->next) { + m3 = (csrc->flags & z_inner) ? 
&m2->subcur->cursor : m2; + if (!is_related(csrc, m3)) + continue; + if (m3->pg[csrc->top] == psrc) { + m3->pg[csrc->top] = pdst; + m3->ki[csrc->top] += (indx_t)dst_nkeys; + m3->ki[csrc->top - 1] = cdst->ki[csrc->top - 1]; + } else if (m3->pg[csrc->top - 1] == csrc->pg[csrc->top - 1] && + m3->ki[csrc->top - 1] > csrc->ki[csrc->top - 1]) { + cASSERT(m3, m3->ki[csrc->top - 1] > 0 && + m3->ki[csrc->top - 1] <= + page_numkeys(m3->pg[csrc->top - 1])); + m3->ki[csrc->top - 1] -= 1; + } + + if (is_leaf(psrc) && inner_pointed(m3)) { + cASSERT(csrc, csrc->top == m3->top); + size_t nkeys = page_numkeys(m3->pg[csrc->top]); + if (likely(nkeys > m3->ki[csrc->top])) + cursor_inner_refresh(m3, m3->pg[csrc->top], m3->ki[csrc->top]); + } } - return rc; } - rc = mdbx_RegGetValueA(hKey, lpSubKey, lpValue, RRF_RT_ANY, NULL, pvData, - pcbData); - if (rc != ERROR_FILE_NOT_FOUND) + rc = page_retire(csrc, (page_t *)psrc); + cASSERT(csrc, rc != MDBX_RESULT_TRUE); + if (unlikely(rc != MDBX_SUCCESS)) return rc; - rc = mdbx_RegGetValueA(hKey, lpSubKey, lpValue, - RRF_RT_ANY | 0x00010000 /* RRF_SUBKEY_WOW6464KEY */, - NULL, pvData, pcbData); - if (rc != ERROR_FILE_NOT_FOUND) + cASSERT(cdst, cdst->tree->items > 0); + cASSERT(cdst, cdst->top + 1 <= cdst->tree->height); + cASSERT(cdst, cdst->top > 0); + page_t *const top_page = cdst->pg[cdst->top]; + const indx_t top_indx = cdst->ki[cdst->top]; + const int save_top = cdst->top; + const uint16_t save_height = cdst->tree->height; + cursor_pop(cdst); + rc = tree_rebalance(cdst); + if (unlikely(rc != MDBX_SUCCESS)) return rc; - return mdbx_RegGetValueA(hKey, lpSubKey, lpValue, - RRF_RT_ANY | 0x00020000 /* RRF_SUBKEY_WOW6432KEY */, - NULL, pvData, pcbData); -} -#endif -__cold MDBX_MAYBE_UNUSED static bool -bootid_parse_uuid(bin128_t *s, const void *p, const size_t n) { - if (n > 31) { - unsigned bits = 0; - for (unsigned i = 0; i < n; ++i) /* try parse an UUID in text form */ { - uint8_t c = ((const uint8_t *)p)[i]; - if (c >= '0' && c <= '9') - c -= 
'0'; - else if (c >= 'a' && c <= 'f') - c -= 'a' - 10; - else if (c >= 'A' && c <= 'F') - c -= 'A' - 10; - else - continue; - assert(c <= 15); - c ^= s->y >> 60; - s->y = s->y << 4 | s->x >> 60; - s->x = s->x << 4 | c; - bits += 4; - } - if (bits > 42 * 3) - /* UUID parsed successfully */ - return true; - } + cASSERT(cdst, cdst->tree->items > 0); + cASSERT(cdst, cdst->top + 1 <= cdst->tree->height); - if (n > 15) /* is enough handle it as a binary? */ { - if (n == sizeof(bin128_t)) { - bin128_t aligned; - memcpy(&aligned, p, sizeof(bin128_t)); - s->x += aligned.x; - s->y += aligned.y; - } else - bootid_collect(s, p, n); - return true; +#if MDBX_ENABLE_PGOP_STAT + cdst->txn->env->lck->pgops.merge.weak += 1; +#endif /* MDBX_ENABLE_PGOP_STAT */ + + if (is_leaf(cdst->pg[cdst->top])) { + /* LY: don't touch cursor if top-page is a LEAF */ + cASSERT(cdst, is_leaf(cdst->pg[cdst->top]) || + page_type(cdst->pg[cdst->top]) == pagetype); + return MDBX_SUCCESS; } - if (n) - bootid_collect(s, p, n); - return false; -} + cASSERT(cdst, page_numkeys(top_page) == dst_nkeys + src_nkeys); -__cold MDBX_INTERNAL_FUNC bin128_t osal_bootid(void) { - bin128_t bin = {{0, 0}}; - bool got_machineid = false, got_boottime = false, got_bootseq = false; + if (unlikely(pagetype != page_type(top_page))) { + /* LY: LEAF-page becomes BRANCH, unable restore cursor's stack */ + goto bailout; + } -#if defined(__linux__) || defined(__gnu_linux__) - { - const int fd = - open("/proc/sys/kernel/random/boot_id", O_RDONLY | O_NOFOLLOW); - if (fd != -1) { - struct statfs fs; - char buf[42]; - const ssize_t len = - (fstatfs(fd, &fs) == 0 && fs.f_type == /* procfs */ 0x9FA0) - ? 
read(fd, buf, sizeof(buf)) - : -1; - const int err = close(fd); - assert(err == 0); - (void)err; - if (len > 0 && bootid_parse_uuid(&bin, buf, len)) - return bin; - } + if (top_page == cdst->pg[cdst->top]) { + /* LY: don't touch cursor if prev top-page already on the top */ + cASSERT(cdst, cdst->ki[cdst->top] == top_indx); + cASSERT(cdst, is_leaf(cdst->pg[cdst->top]) || + page_type(cdst->pg[cdst->top]) == pagetype); + return MDBX_SUCCESS; } -#endif /* Linux */ -#if defined(__APPLE__) || defined(__MACH__) - { - char buf[42]; - size_t len = sizeof(buf); - if (!sysctlbyname("kern.bootsessionuuid", buf, &len, nullptr, 0) && - bootid_parse_uuid(&bin, buf, len)) - return bin; + const int new_top = save_top - save_height + cdst->tree->height; + if (unlikely(new_top < 0 || new_top >= cdst->tree->height)) { + /* LY: out of range, unable restore cursor's stack */ + goto bailout; + } -#if defined(__MAC_OS_X_VERSION_MIN_REQUIRED) && \ - __MAC_OS_X_VERSION_MIN_REQUIRED > 1050 - uuid_t uuid; - struct timespec wait = {0, 1000000000u / 42}; - if (!gethostuuid(uuid, &wait) && - bootid_parse_uuid(&bin, uuid, sizeof(uuid))) - got_machineid = true; -#endif /* > 10.5 */ + if (top_page == cdst->pg[new_top]) { + cASSERT(cdst, cdst->ki[new_top] == top_indx); + /* LY: restore cursor stack */ + cdst->top = (int8_t)new_top; + cASSERT(cdst, cdst->top + 1 < cdst->tree->height || + is_leaf(cdst->pg[cdst->tree->height - 1])); + cASSERT(cdst, is_leaf(cdst->pg[cdst->top]) || + page_type(cdst->pg[cdst->top]) == pagetype); + return MDBX_SUCCESS; + } - struct timeval boottime; - len = sizeof(boottime); - if (!sysctlbyname("kern.boottime", &boottime, &len, nullptr, 0) && - len == sizeof(boottime) && boottime.tv_sec) - got_boottime = true; + page_t *const stub_page = (page_t *)(~(uintptr_t)top_page); + const indx_t stub_indx = top_indx; + if (save_height > cdst->tree->height && + ((cdst->pg[save_top] == top_page && cdst->ki[save_top] == top_indx) || + (cdst->pg[save_top] == stub_page && 
cdst->ki[save_top] == stub_indx))) { + /* LY: restore cursor stack */ + cdst->pg[new_top] = top_page; + cdst->ki[new_top] = top_indx; +#if MDBX_DEBUG + cdst->pg[new_top + 1] = nullptr; + cdst->ki[new_top + 1] = INT16_MAX; +#endif + cdst->top = (int8_t)new_top; + cASSERT(cdst, cdst->top + 1 < cdst->tree->height || + is_leaf(cdst->pg[cdst->tree->height - 1])); + cASSERT(cdst, is_leaf(cdst->pg[cdst->top]) || + page_type(cdst->pg[cdst->top]) == pagetype); + return MDBX_SUCCESS; } -#endif /* Apple/Darwin */ -#if defined(_WIN32) || defined(_WIN64) - { - union buf { - DWORD BootId; - DWORD BaseTime; - SYSTEM_TIMEOFDAY_INFORMATION SysTimeOfDayInfo; - struct { - LARGE_INTEGER BootTime; - LARGE_INTEGER CurrentTime; - LARGE_INTEGER TimeZoneBias; - ULONG TimeZoneId; - ULONG Reserved; - ULONGLONG BootTimeBias; - ULONGLONG SleepTimeBias; - } SysTimeOfDayInfoHacked; - wchar_t MachineGuid[42]; - char DigitalProductId[248]; - } buf; +bailout: + /* LY: unable restore cursor's stack */ + be_poor(cdst); + return MDBX_CURSOR_FULL; +} - static const char HKLM_MicrosoftCryptography[] = - "SOFTWARE\\Microsoft\\Cryptography"; - DWORD len = sizeof(buf); - /* Windows is madness and must die */ - if (mdbx_RegGetValue(HKEY_LOCAL_MACHINE, HKLM_MicrosoftCryptography, - "MachineGuid", &buf.MachineGuid, - &len) == ERROR_SUCCESS && - len < sizeof(buf)) - got_machineid = bootid_parse_uuid(&bin, &buf.MachineGuid, len); +int tree_rebalance(MDBX_cursor *mc) { + cASSERT(mc, cursor_is_tracked(mc)); + cASSERT(mc, mc->top >= 0); + cASSERT(mc, mc->top + 1 < mc->tree->height || + is_leaf(mc->pg[mc->tree->height - 1])); + const page_t *const tp = mc->pg[mc->top]; + const uint8_t pagetype = page_type(tp); - if (!got_machineid) { - /* again, Windows is madness */ - static const char HKLM_WindowsNT[] = - "SOFTWARE\\Microsoft\\Windows NT\\CurrentVersion"; - static const char HKLM_WindowsNT_DPK[] = - "SOFTWARE\\Microsoft\\Windows " - "NT\\CurrentVersion\\DefaultProductKey"; - static const char 
HKLM_WindowsNT_DPK2[] = - "SOFTWARE\\Microsoft\\Windows " - "NT\\CurrentVersion\\DefaultProductKey2"; + STATIC_ASSERT(P_BRANCH == 1); + const size_t minkeys = (pagetype & P_BRANCH) + (size_t)1; - len = sizeof(buf); - if (mdbx_RegGetValue(HKEY_LOCAL_MACHINE, HKLM_WindowsNT, - "DigitalProductId", &buf.DigitalProductId, - &len) == ERROR_SUCCESS && - len > 42 && len < sizeof(buf)) { - bootid_collect(&bin, &buf.DigitalProductId, len); - got_machineid = true; - } - len = sizeof(buf); - if (mdbx_RegGetValue(HKEY_LOCAL_MACHINE, HKLM_WindowsNT_DPK, - "DigitalProductId", &buf.DigitalProductId, - &len) == ERROR_SUCCESS && - len > 42 && len < sizeof(buf)) { - bootid_collect(&bin, &buf.DigitalProductId, len); - got_machineid = true; + /* Pages emptier than this are candidates for merging. */ + size_t room_threshold = likely(mc->tree != &mc->txn->dbs[FREE_DBI]) + ? mc->txn->env->merge_threshold + : mc->txn->env->merge_threshold_gc; + + const size_t numkeys = page_numkeys(tp); + const size_t room = page_room(tp); + DEBUG("rebalancing %s page %" PRIaPGNO + " (has %zu keys, fill %u.%u%%, used %zu, room %zu bytes)", + is_leaf(tp) ? 
"leaf" : "branch", tp->pgno, numkeys, + page_fill_percentum_x10(mc->txn->env, tp) / 10, + page_fill_percentum_x10(mc->txn->env, tp) % 10, + page_used(mc->txn->env, tp), room); + cASSERT(mc, is_modifable(mc->txn, tp)); + + if (unlikely(numkeys < minkeys)) { + DEBUG("page %" PRIaPGNO " must be merged due keys < %zu threshold", + tp->pgno, minkeys); + } else if (unlikely(room > room_threshold)) { + DEBUG("page %" PRIaPGNO " should be merged due room %zu > %zu threshold", + tp->pgno, room, room_threshold); + } else { + DEBUG("no need to rebalance page %" PRIaPGNO ", room %zu < %zu threshold", + tp->pgno, room, room_threshold); + cASSERT(mc, mc->tree->items > 0); + return MDBX_SUCCESS; + } + + int rc; + if (mc->top == 0) { + page_t *const mp = mc->pg[0]; + const size_t nkeys = page_numkeys(mp); + cASSERT(mc, (mc->tree->items == 0) == (nkeys == 0)); + if (nkeys == 0) { + DEBUG("%s", "tree is completely empty"); + cASSERT(mc, is_leaf(mp)); + cASSERT(mc, (*cursor_dbi_state(mc) & DBI_DIRTY) != 0); + cASSERT(mc, mc->tree->branch_pages == 0 && mc->tree->large_pages == 0 && + mc->tree->leaf_pages == 1); + /* Adjust cursors pointing to mp */ + for (MDBX_cursor *m2 = mc->txn->cursors[cursor_dbi(mc)]; m2; + m2 = m2->next) { + MDBX_cursor *m3 = (mc->flags & z_inner) ? 
&m2->subcur->cursor : m2; + if (!is_poor(m3) && m3->pg[0] == mp) { + be_poor(m3); + m3->flags |= z_after_delete; + } } - len = sizeof(buf); - if (mdbx_RegGetValue(HKEY_LOCAL_MACHINE, HKLM_WindowsNT_DPK2, - "DigitalProductId", &buf.DigitalProductId, - &len) == ERROR_SUCCESS && - len > 42 && len < sizeof(buf)) { - bootid_collect(&bin, &buf.DigitalProductId, len); - got_machineid = true; + if (is_subpage(mp)) { + return MDBX_SUCCESS; + } else { + mc->tree->root = P_INVALID; + mc->tree->height = 0; + return page_retire(mc, mp); } } - - static const char HKLM_PrefetcherParams[] = - "SYSTEM\\CurrentControlSet\\Control\\Session Manager\\Memory " - "Management\\PrefetchParameters"; - len = sizeof(buf); - if (mdbx_RegGetValue(HKEY_LOCAL_MACHINE, HKLM_PrefetcherParams, "BootId", - &buf.BootId, &len) == ERROR_SUCCESS && - len > 1 && len < sizeof(buf)) { - bootid_collect(&bin, &buf.BootId, len); - got_bootseq = true; - } - - len = sizeof(buf); - if (mdbx_RegGetValue(HKEY_LOCAL_MACHINE, HKLM_PrefetcherParams, "BaseTime", - &buf.BaseTime, &len) == ERROR_SUCCESS && - len >= sizeof(buf.BaseTime) && buf.BaseTime) { - bootid_collect(&bin, &buf.BaseTime, len); - got_boottime = true; + if (is_subpage(mp)) { + DEBUG("%s", "Can't rebalance a subpage, ignoring"); + cASSERT(mc, is_leaf(tp)); + return MDBX_SUCCESS; } - - /* BootTime from SYSTEM_TIMEOFDAY_INFORMATION */ - NTSTATUS status = NtQuerySystemInformation( - 0x03 /* SystemTmeOfDayInformation */, &buf.SysTimeOfDayInfo, - sizeof(buf.SysTimeOfDayInfo), &len); - if (NT_SUCCESS(status) && - len >= offsetof(union buf, SysTimeOfDayInfoHacked.BootTimeBias) + - sizeof(buf.SysTimeOfDayInfoHacked.BootTimeBias) && - buf.SysTimeOfDayInfoHacked.BootTime.QuadPart) { - const uint64_t UnbiasedBootTime = - buf.SysTimeOfDayInfoHacked.BootTime.QuadPart - - buf.SysTimeOfDayInfoHacked.BootTimeBias; - if (UnbiasedBootTime) { - bootid_collect(&bin, &UnbiasedBootTime, sizeof(UnbiasedBootTime)); - got_boottime = true; + if (is_branch(mp) && nkeys == 1) { + 
DEBUG("%s", "collapsing root page!"); + mc->tree->root = node_pgno(page_node(mp, 0)); + rc = page_get(mc, mc->tree->root, &mc->pg[0], mp->txnid); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; + mc->tree->height--; + mc->ki[0] = mc->ki[1]; + for (intptr_t i = 1; i < mc->tree->height; i++) { + mc->pg[i] = mc->pg[i + 1]; + mc->ki[i] = mc->ki[i + 1]; } - } - if (!got_boottime) { - uint64_t boottime = windows_bootime(); - if (boottime) { - bootid_collect(&bin, &boottime, sizeof(boottime)); - got_boottime = true; + /* Adjust other cursors pointing to mp */ + for (MDBX_cursor *m2 = mc->txn->cursors[cursor_dbi(mc)]; m2; + m2 = m2->next) { + MDBX_cursor *m3 = (mc->flags & z_inner) ? &m2->subcur->cursor : m2; + if (is_related(mc, m3) && m3->pg[0] == mp) { + for (intptr_t i = 0; i < mc->tree->height; i++) { + m3->pg[i] = m3->pg[i + 1]; + m3->ki[i] = m3->ki[i + 1]; + } + m3->top -= 1; + } } + cASSERT(mc, is_leaf(mc->pg[mc->top]) || + page_type(mc->pg[mc->top]) == pagetype); + cASSERT(mc, mc->top + 1 < mc->tree->height || + is_leaf(mc->pg[mc->tree->height - 1])); + return page_retire(mc, mp); } + DEBUG("root page %" PRIaPGNO " doesn't need rebalancing (flags 0x%x)", + mp->pgno, mp->flags); + return MDBX_SUCCESS; } -#endif /* Windows */ -#if defined(CTL_HW) && defined(HW_UUID) - if (!got_machineid) { - static const int mib[] = {CTL_HW, HW_UUID}; - char buf[42]; - size_t len = sizeof(buf); - if (sysctl( -#ifdef SYSCTL_LEGACY_NONCONST_MIB - (int *) -#endif - mib, - ARRAY_LENGTH(mib), &buf, &len, NULL, 0) == 0) - got_machineid = bootid_parse_uuid(&bin, buf, len); - } -#endif /* CTL_HW && HW_UUID */ + /* The parent (branch page) must have at least 2 pointers, + * otherwise the tree is invalid. 
*/ + const size_t pre_top = mc->top - 1; + cASSERT(mc, is_branch(mc->pg[pre_top])); + cASSERT(mc, !is_subpage(mc->pg[0])); + cASSERT(mc, page_numkeys(mc->pg[pre_top]) > 1); -#if defined(CTL_KERN) && defined(KERN_HOSTUUID) - if (!got_machineid) { - static const int mib[] = {CTL_KERN, KERN_HOSTUUID}; - char buf[42]; - size_t len = sizeof(buf); - if (sysctl( -#ifdef SYSCTL_LEGACY_NONCONST_MIB - (int *) -#endif - mib, - ARRAY_LENGTH(mib), &buf, &len, NULL, 0) == 0) - got_machineid = bootid_parse_uuid(&bin, buf, len); + /* Leaf page fill factor is below the threshold. + * Try to move keys from left or right neighbor, or + * merge with a neighbor page. */ + + /* Find neighbors. */ + cursor_couple_t couple; + MDBX_cursor *const mn = cursor_clone(mc, &couple); + + page_t *left = nullptr, *right = nullptr; + if (mn->ki[pre_top] > 0) { + rc = + page_get(mn, node_pgno(page_node(mn->pg[pre_top], mn->ki[pre_top] - 1)), + &left, mc->pg[mc->top]->txnid); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; + cASSERT(mc, page_type(left) == page_type(mc->pg[mc->top])); } -#endif /* CTL_KERN && KERN_HOSTUUID */ + if (mn->ki[pre_top] + (size_t)1 < page_numkeys(mn->pg[pre_top])) { + rc = page_get( + mn, node_pgno(page_node(mn->pg[pre_top], mn->ki[pre_top] + (size_t)1)), + &right, mc->pg[mc->top]->txnid); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; + cASSERT(mc, page_type(right) == page_type(mc->pg[mc->top])); + } + cASSERT(mc, left || right); -#if defined(__NetBSD__) - if (!got_machineid) { - char buf[42]; - size_t len = sizeof(buf); - if (sysctlbyname("machdep.dmi.system-uuid", buf, &len, NULL, 0) == 0) - got_machineid = bootid_parse_uuid(&bin, buf, len); + const size_t ki_top = mc->ki[mc->top]; + const size_t ki_pre_top = mn->ki[pre_top]; + const size_t nkeys = page_numkeys(mn->pg[mn->top]); + + const size_t left_room = left ? page_room(left) : 0; + const size_t right_room = right ? page_room(right) : 0; + const size_t left_nkeys = left ? 
page_numkeys(left) : 0; + const size_t right_nkeys = right ? page_numkeys(right) : 0; + bool involve = false; +retry: + cASSERT(mc, mc->top > 0); + if (left_room > room_threshold && left_room >= right_room && + (is_modifable(mc->txn, left) || involve)) { + /* try merge with left */ + cASSERT(mc, left_nkeys >= minkeys); + mn->pg[mn->top] = left; + mn->ki[mn->top - 1] = (indx_t)(ki_pre_top - 1); + mn->ki[mn->top] = (indx_t)(left_nkeys - 1); + mc->ki[mc->top] = 0; + const size_t new_ki = ki_top + left_nkeys; + mn->ki[mn->top] += mc->ki[mn->top] + 1; + couple.outer.next = mn->txn->cursors[cursor_dbi(mn)]; + mn->txn->cursors[cursor_dbi(mn)] = &couple.outer; + rc = page_merge(mc, mn); + mn->txn->cursors[cursor_dbi(mn)] = couple.outer.next; + if (likely(rc != MDBX_RESULT_TRUE)) { + cursor_cpstk(mn, mc); + mc->ki[mc->top] = (indx_t)new_ki; + cASSERT(mc, rc || page_numkeys(mc->pg[mc->top]) >= minkeys); + return rc; + } + } + if (right_room > room_threshold && + (is_modifable(mc->txn, right) || involve)) { + /* try merge with right */ + cASSERT(mc, right_nkeys >= minkeys); + mn->pg[mn->top] = right; + mn->ki[mn->top - 1] = (indx_t)(ki_pre_top + 1); + mn->ki[mn->top] = 0; + mc->ki[mc->top] = (indx_t)nkeys; + couple.outer.next = mn->txn->cursors[cursor_dbi(mn)]; + mn->txn->cursors[cursor_dbi(mn)] = &couple.outer; + rc = page_merge(mn, mc); + mn->txn->cursors[cursor_dbi(mn)] = couple.outer.next; + if (likely(rc != MDBX_RESULT_TRUE)) { + mc->ki[mc->top] = (indx_t)ki_top; + cASSERT(mc, rc || page_numkeys(mc->pg[mc->top]) >= minkeys); + return rc; + } } -#endif /* __NetBSD__ */ -#if _XOPEN_SOURCE_EXTENDED - if (!got_machineid) { - const int hostid = gethostid(); - if (hostid > 0) { - bootid_collect(&bin, &hostid, sizeof(hostid)); - got_machineid = true; + if (left_nkeys > minkeys && + (right_nkeys <= left_nkeys || right_room >= left_room) && + (is_modifable(mc->txn, left) || involve)) { + /* try move from left */ + mn->pg[mn->top] = left; + mn->ki[mn->top - 1] = 
(indx_t)(ki_pre_top - 1); + mn->ki[mn->top] = (indx_t)(left_nkeys - 1); + mc->ki[mc->top] = 0; + couple.outer.next = mn->txn->cursors[cursor_dbi(mn)]; + mn->txn->cursors[cursor_dbi(mn)] = &couple.outer; + rc = node_move(mn, mc, true); + mn->txn->cursors[cursor_dbi(mn)] = couple.outer.next; + if (likely(rc != MDBX_RESULT_TRUE)) { + mc->ki[mc->top] = (indx_t)(ki_top + 1); + cASSERT(mc, rc || page_numkeys(mc->pg[mc->top]) >= minkeys); + return rc; + } + } + if (right_nkeys > minkeys && (is_modifable(mc->txn, right) || involve)) { + /* try move from right */ + mn->pg[mn->top] = right; + mn->ki[mn->top - 1] = (indx_t)(ki_pre_top + 1); + mn->ki[mn->top] = 0; + mc->ki[mc->top] = (indx_t)nkeys; + couple.outer.next = mn->txn->cursors[cursor_dbi(mn)]; + mn->txn->cursors[cursor_dbi(mn)] = &couple.outer; + rc = node_move(mn, mc, false); + mn->txn->cursors[cursor_dbi(mn)] = couple.outer.next; + if (likely(rc != MDBX_RESULT_TRUE)) { + mc->ki[mc->top] = (indx_t)ki_top; + cASSERT(mc, rc || page_numkeys(mc->pg[mc->top]) >= minkeys); + return rc; } } -#endif /* _XOPEN_SOURCE_EXTENDED */ - if (!got_machineid) { - lack: - bin.x = bin.y = 0; - return bin; + if (nkeys >= minkeys) { + mc->ki[mc->top] = (indx_t)ki_top; + if (AUDIT_ENABLED()) + return cursor_check_updating(mc); + return MDBX_SUCCESS; } - /*--------------------------------------------------------------------------*/ + if (mc->txn->env->options.prefer_waf_insteadof_balance && + likely(room_threshold > 0)) { + room_threshold = 0; + goto retry; + } + if (likely(!involve) && + (likely(mc->tree != &mc->txn->dbs[FREE_DBI]) || mc->txn->tw.loose_pages || + MDBX_PNL_GETSIZE(mc->txn->tw.relist) || + (mc->flags & z_gcu_preparation) || (mc->txn->flags & txn_gc_drained) || + room_threshold)) { + involve = true; + goto retry; + } + if (likely(room_threshold > 0)) { + room_threshold = 0; + goto retry; + } -#if defined(CTL_KERN) && defined(KERN_BOOTTIME) - if (!got_boottime) { - static const int mib[] = {CTL_KERN, KERN_BOOTTIME}; - struct 
timeval boottime; - size_t len = sizeof(boottime); - if (sysctl( -#ifdef SYSCTL_LEGACY_NONCONST_MIB - (int *) -#endif - mib, - ARRAY_LENGTH(mib), &boottime, &len, NULL, 0) == 0 && - len == sizeof(boottime) && boottime.tv_sec) { - bootid_collect(&bin, &boottime, len); - got_boottime = true; + ERROR("Unable to merge/rebalance %s page %" PRIaPGNO + " (has %zu keys, fill %u.%u%%, used %zu, room %zu bytes)", + is_leaf(tp) ? "leaf" : "branch", tp->pgno, numkeys, + page_fill_percentum_x10(mc->txn->env, tp) / 10, + page_fill_percentum_x10(mc->txn->env, tp) % 10, + page_used(mc->txn->env, tp), room); + return MDBX_PROBLEM; +} + +int page_split(MDBX_cursor *mc, const MDBX_val *const newkey, + MDBX_val *const newdata, pgno_t newpgno, const unsigned naf) { + unsigned flags; + int rc = MDBX_SUCCESS, foliage = 0; + MDBX_env *const env = mc->txn->env; + MDBX_val rkey, xdata; + page_t *tmp_ki_copy = nullptr; + DKBUF; + + page_t *const mp = mc->pg[mc->top]; + cASSERT(mc, (mp->flags & P_ILL_BITS) == 0); + + const size_t newindx = mc->ki[mc->top]; + size_t nkeys = page_numkeys(mp); + if (AUDIT_ENABLED()) { + rc = cursor_check_updating(mc); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; + } + STATIC_ASSERT(P_BRANCH == 1); + const size_t minkeys = (mp->flags & P_BRANCH) + (size_t)1; + + DEBUG(">> splitting %s-page %" PRIaPGNO + " and adding %zu+%zu [%s] at %i, nkeys %zi", + is_leaf(mp) ? "leaf" : "branch", mp->pgno, newkey->iov_len, + newdata ? newdata->iov_len : 0, DKEY_DEBUG(newkey), mc->ki[mc->top], + nkeys); + cASSERT(mc, nkeys + 1 >= minkeys * 2); + + /* Create a new sibling page. */ + pgr_t npr = page_new(mc, mp->flags); + if (unlikely(npr.err != MDBX_SUCCESS)) + return npr.err; + page_t *const sister = npr.page; + sister->dupfix_ksize = mp->dupfix_ksize; + DEBUG("new sibling: page %" PRIaPGNO, sister->pgno); + + /* Usually when splitting the root page, the cursor + * height is 1. 
But when called from tree_propagate_key, + * the cursor height may be greater because it walks + * up the stack while finding the branch slot to update. */ + intptr_t prev_top = mc->top - 1; + if (mc->top == 0) { + npr = page_new(mc, P_BRANCH); + rc = npr.err; + if (unlikely(rc != MDBX_SUCCESS)) + goto done; + page_t *const pp = npr.page; + /* shift current top to make room for new parent */ + cASSERT(mc, mc->tree->height > 0); +#if MDBX_DEBUG + memset(mc->pg + 3, 0, sizeof(mc->pg) - sizeof(mc->pg[0]) * 3); + memset(mc->ki + 3, -1, sizeof(mc->ki) - sizeof(mc->ki[0]) * 3); +#endif + mc->pg[2] = mc->pg[1]; + mc->ki[2] = mc->ki[1]; + mc->pg[1] = mc->pg[0]; + mc->ki[1] = mc->ki[0]; + mc->pg[0] = pp; + mc->ki[0] = 0; + mc->tree->root = pp->pgno; + DEBUG("root split! new root = %" PRIaPGNO, pp->pgno); + foliage = mc->tree->height++; + + /* Add left (implicit) pointer. */ + rc = node_add_branch(mc, 0, nullptr, mp->pgno); + if (unlikely(rc != MDBX_SUCCESS)) { + /* undo the pre-push */ + mc->pg[0] = mc->pg[1]; + mc->ki[0] = mc->ki[1]; + mc->tree->root = mp->pgno; + mc->tree->height--; + goto done; + } + mc->top = 1; + prev_top = 0; + if (AUDIT_ENABLED()) { + rc = cursor_check_updating(mc); + if (unlikely(rc != MDBX_SUCCESS)) + goto done; } + } else { + DEBUG("parent branch page is %" PRIaPGNO, mc->pg[prev_top]->pgno); } -#endif /* CTL_KERN && KERN_BOOTTIME */ -#if defined(__sun) || defined(__SVR4) || defined(__svr4__) - if (!got_boottime) { - kstat_ctl_t *kc = kstat_open(); - if (kc) { - kstat_t *kp = kstat_lookup(kc, "unix", 0, "system_misc"); - if (kp && kstat_read(kc, kp, 0) != -1) { - kstat_named_t *kn = (kstat_named_t *)kstat_data_lookup(kp, "boot_time"); - if (kn) { - switch (kn->data_type) { - case KSTAT_DATA_INT32: - case KSTAT_DATA_UINT32: - bootid_collect(&bin, &kn->value, sizeof(int32_t)); - got_boottime = true; - case KSTAT_DATA_INT64: - case KSTAT_DATA_UINT64: - bootid_collect(&bin, &kn->value, sizeof(int64_t)); - got_boottime = true; - } + cursor_couple_t 
couple; + MDBX_cursor *const mn = cursor_clone(mc, &couple); + mn->pg[mn->top] = sister; + mn->ki[mn->top] = 0; + mn->ki[prev_top] = mc->ki[prev_top] + 1; + + size_t split_indx = + (newindx < nkeys) + ? /* split at the middle */ (nkeys + 1) >> 1 + : /* split at the end (i.e. like append-mode ) */ nkeys - minkeys + 1; + eASSERT(env, split_indx >= minkeys && split_indx <= nkeys - minkeys + 1); + + cASSERT(mc, !is_branch(mp) || newindx > 0); + MDBX_val sepkey = {nullptr, 0}; + /* It is reasonable and possible to split the page at the begin */ + if (unlikely(newindx < minkeys)) { + split_indx = minkeys; + if (newindx == 0 && !(naf & MDBX_SPLIT_REPLACE)) { + split_indx = 0; + /* Checking for ability of splitting by the left-side insertion + * of a pure page with the new key */ + for (intptr_t i = 0; i < mc->top; ++i) + if (mc->ki[i]) { + sepkey = get_key(page_node(mc->pg[i], mc->ki[i])); + if (mc->clc->k.cmp(newkey, &sepkey) >= 0) + split_indx = minkeys; + break; } + if (split_indx == 0) { + /* Save the current first key which was omitted on the parent branch + * page and should be updated if the new first entry will be added */ + if (is_dupfix_leaf(mp)) + sepkey = page_dupfix_key(mp, 0, mc->tree->dupfix_size); + else + sepkey = get_key(page_node(mp, 0)); + cASSERT(mc, mc->clc->k.cmp(newkey, &sepkey) < 0); + /* Avoiding rare complex cases of nested split the parent page(s) */ + if (page_room(mc->pg[prev_top]) < branch_size(env, &sepkey)) + split_indx = minkeys; + } + if (foliage) { + TRACE("pure-left: foliage %u, top %i, ptop %zu, split_indx %zi, " + "minkeys %zi, sepkey %s, parent-room %zu, need4split %zu", + foliage, mc->top, prev_top, split_indx, minkeys, + DKEY_DEBUG(&sepkey), page_room(mc->pg[prev_top]), + branch_size(env, &sepkey)); + TRACE("pure-left: newkey %s, newdata %s, newindx %zu", + DKEY_DEBUG(newkey), DVAL_DEBUG(newdata), newindx); } - kstat_close(kc); } } -#endif /* SunOS / Solaris */ -#if _XOPEN_SOURCE_EXTENDED && defined(BOOT_TIME) - if (!got_boottime) 
{ - setutxent(); - const struct utmpx id = {.ut_type = BOOT_TIME}; - const struct utmpx *entry = getutxid(&id); - if (entry) { - bootid_collect(&bin, entry, sizeof(*entry)); - got_boottime = true; - while (unlikely((entry = getutxid(&id)) != nullptr)) { - /* have multiple reboot records, assuming we can distinguish next - * bootsession even if RTC is wrong or absent */ - bootid_collect(&bin, entry, sizeof(*entry)); - got_bootseq = true; + const bool pure_right = split_indx == nkeys; + const bool pure_left = split_indx == 0; + if (unlikely(pure_right)) { + /* newindx == split_indx == nkeys */ + TRACE("no-split, but add new pure page at the %s", "right/after"); + cASSERT(mc, newindx == nkeys && split_indx == nkeys && minkeys == 1); + sepkey = *newkey; + } else if (unlikely(pure_left)) { + /* newindx == split_indx == 0 */ + TRACE("pure-left: no-split, but add new pure page at the %s", + "left/before"); + cASSERT(mc, newindx == 0 && split_indx == 0 && minkeys == 1); + TRACE("pure-left: old-first-key is %s", DKEY_DEBUG(&sepkey)); + } else { + if (is_dupfix_leaf(sister)) { + /* Move half of the keys to the right sibling */ + const intptr_t distance = mc->ki[mc->top] - split_indx; + size_t ksize = mc->tree->dupfix_size; + void *const split = page_dupfix_ptr(mp, split_indx, ksize); + size_t rsize = (nkeys - split_indx) * ksize; + size_t lsize = (nkeys - split_indx) * sizeof(indx_t); + cASSERT(mc, mp->lower >= lsize); + mp->lower -= (indx_t)lsize; + cASSERT(mc, sister->lower + lsize <= UINT16_MAX); + sister->lower += (indx_t)lsize; + cASSERT(mc, mp->upper + rsize - lsize <= UINT16_MAX); + mp->upper += (indx_t)(rsize - lsize); + cASSERT(mc, sister->upper >= rsize - lsize); + sister->upper -= (indx_t)(rsize - lsize); + sepkey.iov_len = ksize; + sepkey.iov_base = (newindx != split_indx) ? 
split : newkey->iov_base; + if (distance < 0) { + cASSERT(mc, ksize >= sizeof(indx_t)); + void *const ins = page_dupfix_ptr(mp, mc->ki[mc->top], ksize); + memcpy(sister->entries, split, rsize); + sepkey.iov_base = sister->entries; + memmove(ptr_disp(ins, ksize), ins, + (split_indx - mc->ki[mc->top]) * ksize); + memcpy(ins, newkey->iov_base, ksize); + cASSERT(mc, UINT16_MAX - mp->lower >= (int)sizeof(indx_t)); + mp->lower += sizeof(indx_t); + cASSERT(mc, mp->upper >= ksize - sizeof(indx_t)); + mp->upper -= (indx_t)(ksize - sizeof(indx_t)); + cASSERT(mc, (((ksize & page_numkeys(mp)) ^ mp->upper) & 1) == 0); + } else { + memcpy(sister->entries, split, distance * ksize); + void *const ins = page_dupfix_ptr(sister, distance, ksize); + memcpy(ins, newkey->iov_base, ksize); + memcpy(ptr_disp(ins, ksize), ptr_disp(split, distance * ksize), + rsize - distance * ksize); + cASSERT(mc, UINT16_MAX - sister->lower >= (int)sizeof(indx_t)); + sister->lower += sizeof(indx_t); + cASSERT(mc, sister->upper >= ksize - sizeof(indx_t)); + sister->upper -= (indx_t)(ksize - sizeof(indx_t)); + cASSERT(mc, distance <= (int)UINT16_MAX); + mc->ki[mc->top] = (indx_t)distance; + cASSERT(mc, + (((ksize & page_numkeys(sister)) ^ sister->upper) & 1) == 0); } - } - endutxent(); - } -#endif /* _XOPEN_SOURCE_EXTENDED && BOOT_TIME */ - if (!got_bootseq) { - if (!got_boottime || !MDBX_TRUST_RTC) - goto lack; + if (AUDIT_ENABLED()) { + rc = cursor_check_updating(mc); + if (unlikely(rc != MDBX_SUCCESS)) + goto done; + rc = cursor_check_updating(mn); + if (unlikely(rc != MDBX_SUCCESS)) + goto done; + } + } else { + /* grab a page to hold a temporary copy */ + tmp_ki_copy = page_shadow_alloc(mc->txn, 1); + if (unlikely(tmp_ki_copy == nullptr)) { + rc = MDBX_ENOMEM; + goto done; + } -#if defined(_WIN32) || defined(_WIN64) - FILETIME now; - GetSystemTimeAsFileTime(&now); - if (0x1CCCCCC > now.dwHighDateTime) -#else - struct timespec mono, real; - if (clock_gettime(CLOCK_MONOTONIC, &mono) || - 
clock_gettime(CLOCK_REALTIME, &real) || - /* wrong time, RTC is mad or absent */ - 1555555555l > real.tv_sec || - /* seems no adjustment by RTC/NTP, i.e. a fake time */ - real.tv_sec < mono.tv_sec || 1234567890l > real.tv_sec - mono.tv_sec || - (real.tv_sec - mono.tv_sec) % 900u == 0) -#endif - goto lack; - } + const size_t max_space = page_space(env); + const size_t new_size = is_leaf(mp) ? leaf_size(env, newkey, newdata) + : branch_size(env, newkey); - return bin; -} + /* prepare to insert */ + size_t i = 0; + while (i < newindx) { + tmp_ki_copy->entries[i] = mp->entries[i]; + ++i; + } + tmp_ki_copy->entries[i] = (indx_t)-1; + while (++i <= nkeys) + tmp_ki_copy->entries[i] = mp->entries[i - 1]; + tmp_ki_copy->pgno = mp->pgno; + tmp_ki_copy->flags = mp->flags; + tmp_ki_copy->txnid = INVALID_TXNID; + tmp_ki_copy->lower = 0; + tmp_ki_copy->upper = (indx_t)max_space; -__cold int mdbx_get_sysraminfo(intptr_t *page_size, intptr_t *total_pages, - intptr_t *avail_pages) { - if (!page_size && !total_pages && !avail_pages) - return MDBX_EINVAL; - if (total_pages) - *total_pages = -1; - if (avail_pages) - *avail_pages = -1; + /* Добавляемый узел может не поместиться в страницу-половину вместе + * с количественной половиной узлов из исходной страницы. В худшем случае, + * в страницу-половину с добавляемым узлом могут попасть самые больше узлы + * из исходной страницы, а другую половину только узлы с самыми короткими + * ключами и с пустыми данными. Поэтому, чтобы найти подходящую границу + * разреза требуется итерировать узлы и считая их объем. + * + * Однако, при простом количественном делении (без учета размера ключей + * и данных) на страницах-половинах будет примерно вдвое меньше узлов. + * Поэтому добавляемый узел точно поместится, если его размер не больше + * чем место "освобождающееся" от заголовков узлов, которые переедут + * в другую страницу-половину. 
Кроме этого, как минимум по одному байту + * будет в каждом ключе, в худшем случае кроме одного, который может быть + * нулевого размера. */ - const intptr_t pagesize = osal_syspagesize(); - if (page_size) - *page_size = pagesize; - if (unlikely(pagesize < MIN_PAGESIZE || !is_powerof2(pagesize))) - return MDBX_INCOMPATIBLE; + if (newindx == split_indx && nkeys >= 5) { + STATIC_ASSERT(P_BRANCH == 1); + split_indx += mp->flags & P_BRANCH; + } + eASSERT(env, split_indx >= minkeys && split_indx <= nkeys + 1 - minkeys); + const size_t dim_nodes = + (newindx >= split_indx) ? split_indx : nkeys - split_indx; + const size_t dim_used = (sizeof(indx_t) + NODESIZE + 1) * dim_nodes; + if (new_size >= dim_used) { + /* Search for best acceptable split point */ + i = (newindx < split_indx) ? 0 : nkeys; + intptr_t dir = (newindx < split_indx) ? 1 : -1; + size_t before = 0, after = new_size + page_used(env, mp); + size_t best_split = split_indx; + size_t best_shift = INT_MAX; - MDBX_MAYBE_UNUSED const int log2page = log2n_powerof2(pagesize); - assert(pagesize == (INT64_C(1) << log2page)); - (void)log2page; + TRACE("seek separator from %zu, step %zi, default %zu, new-idx %zu, " + "new-size %zu", + i, dir, split_indx, newindx, new_size); + do { + cASSERT(mc, i <= nkeys); + size_t size = new_size; + if (i != newindx) { + node_t *node = ptr_disp(mp, tmp_ki_copy->entries[i] + PAGEHDRSZ); + size = NODESIZE + node_ks(node) + sizeof(indx_t); + if (is_leaf(mp)) + size += (node_flags(node) & N_BIGDATA) ? 
sizeof(pgno_t) + : node_ds(node); + size = EVEN_CEIL(size); + } -#if defined(_WIN32) || defined(_WIN64) - MEMORYSTATUSEX info; - memset(&info, 0, sizeof(info)); - info.dwLength = sizeof(info); - if (!GlobalMemoryStatusEx(&info)) - return (int)GetLastError(); -#endif + before += size; + after -= size; + TRACE("step %zu, size %zu, before %zu, after %zu, max %zu", i, size, + before, after, max_space); - if (total_pages) { -#if defined(_WIN32) || defined(_WIN64) - const intptr_t total_ram_pages = (intptr_t)(info.ullTotalPhys >> log2page); -#elif defined(_SC_PHYS_PAGES) - const intptr_t total_ram_pages = sysconf(_SC_PHYS_PAGES); - if (total_ram_pages == -1) - return errno; -#elif defined(_SC_AIX_REALMEM) - const intptr_t total_ram_Kb = sysconf(_SC_AIX_REALMEM); - if (total_ram_Kb == -1) - return errno; - const intptr_t total_ram_pages = (total_ram_Kb << 10) >> log2page; -#elif defined(HW_USERMEM) || defined(HW_PHYSMEM64) || defined(HW_MEMSIZE) || \ - defined(HW_PHYSMEM) - size_t ram, len = sizeof(ram); - static const int mib[] = { - CTL_HW, -#if defined(HW_USERMEM) - HW_USERMEM -#elif defined(HW_PHYSMEM64) - HW_PHYSMEM64 -#elif defined(HW_MEMSIZE) - HW_MEMSIZE -#else - HW_PHYSMEM -#endif - }; - if (sysctl( -#ifdef SYSCTL_LEGACY_NONCONST_MIB - (int *) -#endif - mib, - ARRAY_LENGTH(mib), &ram, &len, NULL, 0) != 0) - return errno; - if (len != sizeof(ram)) - return MDBX_ENOSYS; - const intptr_t total_ram_pages = (intptr_t)(ram >> log2page); -#else -#error "FIXME: Get User-accessible or physical RAM" -#endif - *total_pages = total_ram_pages; - if (total_ram_pages < 1) - return MDBX_ENOSYS; - } + if (before <= max_space && after <= max_space) { + const size_t split = i + (dir > 0); + if (split >= minkeys && split <= nkeys + 1 - minkeys) { + const size_t shift = branchless_abs(split_indx - split); + if (shift >= best_shift) + break; + best_shift = shift; + best_split = split; + if (!best_shift) + break; + } + } + i += dir; + } while (i < nkeys); - if (avail_pages) { -#if 
defined(_WIN32) || defined(_WIN64) - const intptr_t avail_ram_pages = (intptr_t)(info.ullAvailPhys >> log2page); -#elif defined(_SC_AVPHYS_PAGES) - const intptr_t avail_ram_pages = sysconf(_SC_AVPHYS_PAGES); - if (avail_ram_pages == -1) - return errno; -#elif defined(__MACH__) - mach_msg_type_number_t count = HOST_VM_INFO_COUNT; - vm_statistics_data_t vmstat; - mach_port_t mport = mach_host_self(); - kern_return_t kerr = host_statistics(mach_host_self(), HOST_VM_INFO, - (host_info_t)&vmstat, &count); - mach_port_deallocate(mach_task_self(), mport); - if (unlikely(kerr != KERN_SUCCESS)) - return MDBX_ENOSYS; - const intptr_t avail_ram_pages = vmstat.free_count; -#elif defined(VM_TOTAL) || defined(VM_METER) - struct vmtotal info; - size_t len = sizeof(info); - static const int mib[] = { - CTL_VM, -#if defined(VM_TOTAL) - VM_TOTAL -#elif defined(VM_METER) - VM_METER -#endif - }; - if (sysctl( -#ifdef SYSCTL_LEGACY_NONCONST_MIB - (int *) -#endif - mib, - ARRAY_LENGTH(mib), &info, &len, NULL, 0) != 0) - return errno; - if (len != sizeof(info)) - return MDBX_ENOSYS; - const intptr_t avail_ram_pages = info.t_free; -#else -#error "FIXME: Get Available RAM" -#endif - *avail_pages = avail_ram_pages; - if (avail_ram_pages < 1) - return MDBX_ENOSYS; + split_indx = best_split; + TRACE("chosen %zu", split_indx); + } + eASSERT(env, split_indx >= minkeys && split_indx <= nkeys + 1 - minkeys); + + sepkey = *newkey; + if (split_indx != newindx) { + node_t *node = + ptr_disp(mp, tmp_ki_copy->entries[split_indx] + PAGEHDRSZ); + sepkey.iov_len = node_ks(node); + sepkey.iov_base = node_key(node); + } + } } + DEBUG("separator is %zd [%s]", split_indx, DKEY_DEBUG(&sepkey)); - return MDBX_SUCCESS; -} + bool did_split_parent = false; + /* Copy separator key to the parent. 
*/ + if (page_room(mn->pg[prev_top]) < branch_size(env, &sepkey)) { + TRACE("need split parent branch-page for key %s", DKEY_DEBUG(&sepkey)); + cASSERT(mc, page_numkeys(mn->pg[prev_top]) > 2); + cASSERT(mc, !pure_left); + const int top = mc->top; + const int height = mc->tree->height; + mn->top -= 1; + did_split_parent = true; + couple.outer.next = mn->txn->cursors[cursor_dbi(mn)]; + mn->txn->cursors[cursor_dbi(mn)] = &couple.outer; + rc = page_split(mn, &sepkey, nullptr, sister->pgno, 0); + mn->txn->cursors[cursor_dbi(mn)] = couple.outer.next; + if (unlikely(rc != MDBX_SUCCESS)) + goto done; + cASSERT(mc, mc->top - top == mc->tree->height - height); + if (AUDIT_ENABLED()) { + rc = cursor_check_updating(mc); + if (unlikely(rc != MDBX_SUCCESS)) + goto done; + } -MDBX_INTERNAL_VAR_INSTA unsigned sys_pagesize, sys_pagesize_ln2, - sys_allocation_granularity; + /* root split? */ + prev_top += mc->top - top; -void osal_ctor(void) { -#if MDBX_HAVE_PWRITEV && defined(_SC_IOV_MAX) - osal_iov_max = sysconf(_SC_IOV_MAX); - if (RUNNING_ON_VALGRIND && osal_iov_max > 64) - /* чтобы не описывать все 1024 исключения в valgrind_suppress.txt */ - osal_iov_max = 64; -#endif /* MDBX_HAVE_PWRITEV && _SC_IOV_MAX */ + /* Right page might now have changed parent. + * Check if left page also changed parent. 
*/ + if (mn->pg[prev_top] != mc->pg[prev_top] && + mc->ki[prev_top] >= page_numkeys(mc->pg[prev_top])) { + for (intptr_t i = 0; i < prev_top; i++) { + mc->pg[i] = mn->pg[i]; + mc->ki[i] = mn->ki[i]; + } + mc->pg[prev_top] = mn->pg[prev_top]; + if (mn->ki[prev_top]) { + mc->ki[prev_top] = mn->ki[prev_top] - 1; + } else { + /* find right page's left sibling */ + mc->ki[prev_top] = mn->ki[prev_top]; + rc = cursor_sibling_left(mc); + if (unlikely(rc != MDBX_SUCCESS)) { + if (rc == MDBX_NOTFOUND) /* improper mdbx_cursor_sibling() result */ { + ERROR("unexpected %i error going left sibling", rc); + rc = MDBX_PROBLEM; + } + goto done; + } + } + } + } else if (unlikely(pure_left)) { + page_t *ptop_page = mc->pg[prev_top]; + TRACE("pure-left: adding to parent page %u node[%u] left-leaf page #%u key " + "%s", + ptop_page->pgno, mc->ki[prev_top], sister->pgno, + DKEY(mc->ki[prev_top] ? newkey : nullptr)); + assert(mc->top == prev_top + 1); + mc->top = (uint8_t)prev_top; + rc = node_add_branch(mc, mc->ki[prev_top], + mc->ki[prev_top] ? 
newkey : nullptr, sister->pgno); + cASSERT(mc, mp == mc->pg[prev_top + 1] && newindx == mc->ki[prev_top + 1] && + prev_top == mc->top); + + if (likely(rc == MDBX_SUCCESS) && mc->ki[prev_top] == 0) { + node_t *node = page_node(mc->pg[prev_top], 1); + TRACE("pure-left: update prev-first key on parent to %s", DKEY(&sepkey)); + cASSERT(mc, node_ks(node) == 0 && node_pgno(node) == mp->pgno); + cASSERT(mc, mc->top == prev_top && mc->ki[prev_top] == 0); + mc->ki[prev_top] = 1; + rc = tree_propagate_key(mc, &sepkey); + cASSERT(mc, mc->top == prev_top && mc->ki[prev_top] == 1); + cASSERT(mc, + mp == mc->pg[prev_top + 1] && newindx == mc->ki[prev_top + 1]); + mc->ki[prev_top] = 0; + } else { + TRACE("pure-left: no-need-update prev-first key on parent %s", + DKEY(&sepkey)); + } -#if defined(_WIN32) || defined(_WIN64) - SYSTEM_INFO si; - GetSystemInfo(&si); - sys_pagesize = si.dwPageSize; - sys_allocation_granularity = si.dwAllocationGranularity; -#else - sys_pagesize = sysconf(_SC_PAGE_SIZE); - sys_allocation_granularity = (MDBX_WORDBITS > 32) ? 65536 : 4096; - sys_allocation_granularity = (sys_allocation_granularity > sys_pagesize) - ? 
sys_allocation_granularity - : sys_pagesize; -#endif - assert(sys_pagesize > 0 && (sys_pagesize & (sys_pagesize - 1)) == 0); - assert(sys_allocation_granularity >= sys_pagesize && - sys_allocation_granularity % sys_pagesize == 0); - sys_pagesize_ln2 = log2n_powerof2(sys_pagesize); + mc->top++; + if (unlikely(rc != MDBX_SUCCESS)) + goto done; -#if defined(__linux__) || defined(__gnu_linux__) - posix_clockid = choice_monoclock(); -#endif + node_t *node = page_node(mc->pg[prev_top], mc->ki[prev_top] + (size_t)1); + cASSERT(mc, node_pgno(node) == mp->pgno && mc->pg[prev_top] == ptop_page); + } else { + mn->top -= 1; + TRACE("add-to-parent the right-entry[%u] for new sibling-page", + mn->ki[prev_top]); + rc = node_add_branch(mn, mn->ki[prev_top], &sepkey, sister->pgno); + mn->top += 1; + if (unlikely(rc != MDBX_SUCCESS)) + goto done; + } -#if defined(_WIN32) || defined(_WIN64) - QueryPerformanceFrequency(&performance_frequency); -#elif defined(__APPLE__) || defined(__MACH__) - mach_timebase_info_data_t ti; - mach_timebase_info(&ti); - ratio_16dot16_to_monotine = UINT64_C(1000000000) * ti.denom / ti.numer; -#endif - monotime_limit = osal_16dot16_to_monotime(UINT32_MAX - 1); -} + if (unlikely(pure_left | pure_right)) { + mc->pg[mc->top] = sister; + mc->ki[mc->top] = 0; + switch (page_type(sister)) { + case P_LEAF: { + cASSERT(mc, newpgno == 0 || newpgno == P_INVALID); + rc = node_add_leaf(mc, 0, newkey, newdata, naf); + } break; + case P_LEAF | P_DUPFIX: { + cASSERT(mc, (naf & (N_BIGDATA | N_SUBDATA | N_DUPDATA)) == 0); + cASSERT(mc, newpgno == 0 || newpgno == P_INVALID); + rc = node_add_dupfix(mc, 0, newkey); + } break; + default: + rc = bad_page(sister, "wrong page-type %u\n", page_type(sister)); + } + if (unlikely(rc != MDBX_SUCCESS)) + goto done; -void osal_dtor(void) {} -/* This is CMake-template for libmdbx's version.c - ******************************************************************************/ + if (pure_right) { + for (intptr_t i = 0; i < mc->top; i++) + 
mc->ki[i] = mn->ki[i]; + } else if (mc->ki[mc->top - 1] == 0) { + for (intptr_t i = 2; i <= mc->top; ++i) + if (mc->ki[mc->top - i]) { + sepkey = get_key(page_node(mc->pg[mc->top - i], mc->ki[mc->top - i])); + if (mc->clc->k.cmp(newkey, &sepkey) < 0) { + mc->top -= (int8_t)i; + DEBUG("pure-left: update new-first on parent [%i] page %u key %s", + mc->ki[mc->top], mc->pg[mc->top]->pgno, DKEY(newkey)); + rc = tree_propagate_key(mc, newkey); + mc->top += (int8_t)i; + if (unlikely(rc != MDBX_SUCCESS)) + goto done; + } + break; + } + } + } else if (tmp_ki_copy) { /* !is_dupfix_leaf(mp) */ + /* Move nodes */ + mc->pg[mc->top] = sister; + size_t n = 0, ii = split_indx; + do { + TRACE("i %zu, nkeys %zu => n %zu, rp #%u", ii, nkeys, n, sister->pgno); + pgno_t pgno = 0; + MDBX_val *rdata = nullptr; + if (ii == newindx) { + rkey = *newkey; + if (is_leaf(mp)) + rdata = newdata; + else + pgno = newpgno; + flags = naf; + /* Update index for the new key. */ + mc->ki[mc->top] = (indx_t)n; + } else { + node_t *node = ptr_disp(mp, tmp_ki_copy->entries[ii] + PAGEHDRSZ); + rkey.iov_base = node_key(node); + rkey.iov_len = node_ks(node); + if (is_leaf(mp)) { + xdata.iov_base = node_data(node); + xdata.iov_len = node_ds(node); + rdata = &xdata; + } else + pgno = node_pgno(node); + flags = node_flags(node); + } + + switch (page_type(sister)) { + case P_BRANCH: { + cASSERT(mc, 0 == (uint16_t)flags); + /* First branch index doesn't need key data. */ + rc = node_add_branch(mc, n, n ? 
&rkey : nullptr, pgno); + } break; + case P_LEAF: { + cASSERT(mc, pgno == 0); + cASSERT(mc, rdata != nullptr); + rc = node_add_leaf(mc, n, &rkey, rdata, flags); + } break; + /* case P_LEAF | P_DUPFIX: { + cASSERT(mc, (nflags & (N_BIGDATA | N_SUBDATA | N_DUPDATA)) == 0); + cASSERT(mc, gno == 0); + rc = mdbx_node_add_dupfix(mc, n, &rkey); + } break; */ + default: + rc = bad_page(sister, "wrong page-type %u\n", page_type(sister)); + } + if (unlikely(rc != MDBX_SUCCESS)) + goto done; + ++n; + if (++ii > nkeys) { + ii = 0; + n = 0; + mc->pg[mc->top] = tmp_ki_copy; + TRACE("switch to mp #%u", tmp_ki_copy->pgno); + } + } while (ii != split_indx); -#if MDBX_VERSION_MAJOR != 0 || \ - MDBX_VERSION_MINOR != 13 -#error "API version mismatch! Had `git fetch --tags` done?" -#endif + TRACE("ii %zu, nkeys %zu, n %zu, pgno #%u", ii, nkeys, n, + mc->pg[mc->top]->pgno); -static const char sourcery[] = MDBX_STRINGIFY(MDBX_BUILD_SOURCERY); + nkeys = page_numkeys(tmp_ki_copy); + for (size_t i = 0; i < nkeys; i++) + mp->entries[i] = tmp_ki_copy->entries[i]; + mp->lower = tmp_ki_copy->lower; + mp->upper = tmp_ki_copy->upper; + memcpy(page_node(mp, nkeys - 1), page_node(tmp_ki_copy, nkeys - 1), + env->ps - tmp_ki_copy->upper - PAGEHDRSZ); -__dll_export -#ifdef __attribute_used__ - __attribute_used__ -#elif defined(__GNUC__) || __has_attribute(__used__) - __attribute__((__used__)) -#endif -#ifdef __attribute_externally_visible__ - __attribute_externally_visible__ -#elif (defined(__GNUC__) && !defined(__clang__)) || \ - __has_attribute(__externally_visible__) - __attribute__((__externally_visible__)) -#endif - const struct MDBX_version_info mdbx_version = { - 0, - 13, - 0, - 38, - {"2024-04-04T22:31:03+03:00", "a0fc2d938419aa82764beae00e1472f412d5a4d5", "f19753636d2364c43125f972b8d3f29dc9e244b4", - "v0.13.0-38-gf1975363"}, - sourcery}; + /* reset back to original page */ + if (newindx < split_indx) { + mc->pg[mc->top] = mp; + } else { + mc->pg[mc->top] = sister; + mc->ki[prev_top]++; + /* 
Make sure ki is still valid. */ + if (mn->pg[prev_top] != mc->pg[prev_top] && + mc->ki[prev_top] >= page_numkeys(mc->pg[prev_top])) { + for (intptr_t i = 0; i <= prev_top; i++) { + mc->pg[i] = mn->pg[i]; + mc->ki[i] = mn->ki[i]; + } + } + } + } else if (newindx >= split_indx) { + mc->pg[mc->top] = sister; + mc->ki[prev_top]++; + /* Make sure ki is still valid. */ + if (mn->pg[prev_top] != mc->pg[prev_top] && + mc->ki[prev_top] >= page_numkeys(mc->pg[prev_top])) { + for (intptr_t i = 0; i <= prev_top; i++) { + mc->pg[i] = mn->pg[i]; + mc->ki[i] = mn->ki[i]; + } + } + } -__dll_export -#ifdef __attribute_used__ - __attribute_used__ -#elif defined(__GNUC__) || __has_attribute(__used__) - __attribute__((__used__)) -#endif -#ifdef __attribute_externally_visible__ - __attribute_externally_visible__ -#elif (defined(__GNUC__) && !defined(__clang__)) || \ - __has_attribute(__externally_visible__) - __attribute__((__externally_visible__)) -#endif - const char *const mdbx_sourcery_anchor = sourcery; -/* - * Copyright 2015-2024 Leonid Yuriev - * and other libmdbx authors: please see AUTHORS file. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted only as authorized by the OpenLDAP - * Public License. - * - * A copy of this license is available in the file LICENSE in the - * top-level directory of the distribution or, alternatively, at - * . - */ + /* Adjust other cursors pointing to mp and/or to parent page */ + nkeys = page_numkeys(mp); + for (MDBX_cursor *m2 = mc->txn->cursors[cursor_dbi(mc)]; m2; m2 = m2->next) { + MDBX_cursor *m3 = (mc->flags & z_inner) ? 
&m2->subcur->cursor : m2; + if (!is_pointed(m3) || m3 == mc) + continue; + if (foliage) { + /* sub cursors may be on different DB */ + if (m3->pg[0] != mp) + continue; + /* root split */ + for (intptr_t k = foliage; k >= 0; k--) { + m3->ki[k + 1] = m3->ki[k]; + m3->pg[k + 1] = m3->pg[k]; + } + m3->ki[0] = m3->ki[0] >= nkeys + pure_left; + m3->pg[0] = mc->pg[0]; + m3->top += 1; + } + + if (m3->top >= mc->top && m3->pg[mc->top] == mp && !pure_left) { + if (m3->ki[mc->top] >= newindx) + m3->ki[mc->top] += !(naf & MDBX_SPLIT_REPLACE); + if (m3->ki[mc->top] >= nkeys) { + m3->pg[mc->top] = sister; + cASSERT(mc, m3->ki[mc->top] >= nkeys); + m3->ki[mc->top] -= (indx_t)nkeys; + for (intptr_t i = 0; i < mc->top; i++) { + m3->ki[i] = mn->ki[i]; + m3->pg[i] = mn->pg[i]; + } + } + } else if (!did_split_parent && m3->top >= prev_top && + m3->pg[prev_top] == mc->pg[prev_top] && + m3->ki[prev_top] >= mc->ki[prev_top]) { + m3->ki[prev_top]++; /* also for the `pure-left` case */ + } + if (inner_pointed(m3) && is_leaf(mp)) + cursor_inner_refresh(m3, m3->pg[mc->top], m3->ki[mc->top]); + } + TRACE("mp #%u left: %zd, sister #%u left: %zd", mp->pgno, page_room(mp), + sister->pgno, page_room(sister)); -#if defined(_WIN32) || defined(_WIN64) /* Windows LCK-implementation */ +done: + if (tmp_ki_copy) + page_shadow_release(env, tmp_ki_copy, 1); -/* PREAMBLE FOR WINDOWS: - * - * We are not concerned for performance here. - * If you are running Windows a performance could NOT be the goal. - * Otherwise please use Linux. 
*/ + if (unlikely(rc != MDBX_SUCCESS)) + mc->txn->flags |= MDBX_TXN_ERROR; + else { + if (AUDIT_ENABLED()) + rc = cursor_check_updating(mc); + if (unlikely(naf & MDBX_RESERVE)) { + node_t *node = page_node(mc->pg[mc->top], mc->ki[mc->top]); + if (!(node_flags(node) & N_BIGDATA)) + newdata->iov_base = node_data(node); + } +#if MDBX_ENABLE_PGOP_STAT + env->lck->pgops.split.weak += 1; +#endif /* MDBX_ENABLE_PGOP_STAT */ + } + DEBUG("<< mp #%u, rc %d", mp->pgno, rc); + return rc; +} -static void mdbx_winnt_import(void); +int tree_propagate_key(MDBX_cursor *mc, const MDBX_val *key) { + page_t *mp; + node_t *node; + size_t len; + ptrdiff_t delta, ksize, oksize; + intptr_t ptr, i, nkeys, indx; + DKBUF_DEBUG; -#if MDBX_BUILD_SHARED_LIBRARY -#if MDBX_WITHOUT_MSVC_CRT && defined(NDEBUG) -/* DEBUG/CHECKED builds still require MSVC's CRT for runtime checks. - * - * Define dll's entry point only for Release build when NDEBUG is defined and - * MDBX_WITHOUT_MSVC_CRT=ON. if the entry point isn't defined then MSVC's will - * automatically use DllMainCRTStartup() from CRT library, which also - * automatically call DllMain() from our mdbx.dll */ -#pragma comment(linker, "/ENTRY:DllMain") -#endif /* MDBX_WITHOUT_MSVC_CRT */ + cASSERT(mc, cursor_is_tracked(mc)); + indx = mc->ki[mc->top]; + mp = mc->pg[mc->top]; + node = page_node(mp, indx); + ptr = mp->entries[indx]; +#if MDBX_DEBUG + MDBX_val k2; + k2.iov_base = node_key(node); + k2.iov_len = node_ks(node); + DEBUG("update key %zi (offset %zu) [%s] to [%s] on page %" PRIaPGNO, indx, + ptr, DVAL_DEBUG(&k2), DKEY_DEBUG(key), mp->pgno); +#endif /* MDBX_DEBUG */ + + /* Sizes must be 2-byte aligned. */ + ksize = EVEN_CEIL(key->iov_len); + oksize = EVEN_CEIL(node_ks(node)); + delta = ksize - oksize; + + /* Shift node contents if EVEN_CEIL(key length) changed. 
*/ + if (delta) { + if (delta > (int)page_room(mp)) { + /* not enough space left, do a delete and split */ + DEBUG("Not enough room, delta = %zd, splitting...", delta); + pgno_t pgno = node_pgno(node); + node_del(mc, 0); + int err = page_split(mc, key, nullptr, pgno, MDBX_SPLIT_REPLACE); + if (err == MDBX_SUCCESS && AUDIT_ENABLED()) + err = cursor_check_updating(mc); + return err; + } + + nkeys = page_numkeys(mp); + for (i = 0; i < nkeys; i++) { + if (mp->entries[i] <= ptr) { + cASSERT(mc, mp->entries[i] >= delta); + mp->entries[i] -= (indx_t)delta; + } + } -BOOL APIENTRY DllMain(HANDLE module, DWORD reason, LPVOID reserved) -#else -#if !MDBX_MANUAL_MODULE_HANDLER -static -#endif /* !MDBX_MANUAL_MODULE_HANDLER */ - void NTAPI - mdbx_module_handler(PVOID module, DWORD reason, PVOID reserved) -#endif /* MDBX_BUILD_SHARED_LIBRARY */ -{ - (void)reserved; - switch (reason) { - case DLL_PROCESS_ATTACH: - mdbx_winnt_import(); - global_ctor(); - break; - case DLL_PROCESS_DETACH: - global_dtor(); - break; + void *const base = ptr_disp(mp, mp->upper + PAGEHDRSZ); + len = ptr - mp->upper + NODESIZE; + memmove(ptr_disp(base, -delta), base, len); + cASSERT(mc, mp->upper >= delta); + mp->upper -= (indx_t)delta; - case DLL_THREAD_ATTACH: - break; - case DLL_THREAD_DETACH: - thread_dtor(module); - break; + node = page_node(mp, indx); } -#if MDBX_BUILD_SHARED_LIBRARY - return TRUE; -#endif -} -#if !MDBX_BUILD_SHARED_LIBRARY && !MDBX_MANUAL_MODULE_HANDLER -#if defined(_MSC_VER) -# pragma const_seg(push) -# pragma data_seg(push) + /* But even if no shift was needed, update ksize */ + node_set_ks(node, key->iov_len); -# ifndef _M_IX86 - /* kick a linker to create the TLS directory if not already done */ -# pragma comment(linker, "/INCLUDE:_tls_used") - /* Force some symbol references. 
*/ -# pragma comment(linker, "/INCLUDE:mdbx_tls_anchor") - /* specific const-segment for WIN64 */ -# pragma const_seg(".CRT$XLB") - const -# else - /* kick a linker to create the TLS directory if not already done */ -# pragma comment(linker, "/INCLUDE:__tls_used") - /* Force some symbol references. */ -# pragma comment(linker, "/INCLUDE:_mdbx_tls_anchor") - /* specific data-segment for WIN32 */ -# pragma data_seg(".CRT$XLB") -# endif + if (likely(key->iov_len /* to avoid UBSAN traps*/ != 0)) + memcpy(node_key(node), key->iov_base, key->iov_len); + return MDBX_SUCCESS; +} +/// \copyright SPDX-License-Identifier: Apache-2.0 +/// \author Леонид Юрьев aka Leonid Yuriev \date 2015-2024 - __declspec(allocate(".CRT$XLB")) PIMAGE_TLS_CALLBACK mdbx_tls_anchor = mdbx_module_handler; -# pragma data_seg(pop) -# pragma const_seg(pop) -#elif defined(__GNUC__) -# ifndef _M_IX86 - const -# endif - PIMAGE_TLS_CALLBACK mdbx_tls_anchor __attribute__((__section__(".CRT$XLB"), used)) = mdbx_module_handler; -#else -# error FIXME -#endif -#endif /* !MDBX_BUILD_SHARED_LIBRARY && !MDBX_MANUAL_MODULE_HANDLER */ +static inline size_t txl_size2bytes(const size_t size) { + assert(size > 0 && size <= txl_max * 2); + size_t bytes = + ceil_powerof2(MDBX_ASSUME_MALLOC_OVERHEAD + sizeof(txnid_t) * (size + 2), + txl_granulate * sizeof(txnid_t)) - + MDBX_ASSUME_MALLOC_OVERHEAD; + return bytes; +} -/*----------------------------------------------------------------------------*/ +static inline size_t txl_bytes2size(const size_t bytes) { + size_t size = bytes / sizeof(txnid_t); + assert(size > 2 && size <= txl_max * 2); + return size - 2; +} -#define LCK_SHARED 0 -#define LCK_EXCLUSIVE LOCKFILE_EXCLUSIVE_LOCK -#define LCK_WAITFOR 0 -#define LCK_DONTWAIT LOCKFILE_FAIL_IMMEDIATELY +MDBX_INTERNAL txl_t txl_alloc(void) { + size_t bytes = txl_size2bytes(txl_initial); + txl_t txl = osal_malloc(bytes); + if (likely(txl)) { +#if __GLIBC_PREREQ(2, 12) || defined(__FreeBSD__) || defined(malloc_usable_size) + 
bytes = malloc_usable_size(txl); +#endif /* malloc_usable_size */ + txl[0] = txl_bytes2size(bytes); + assert(txl[0] >= txl_initial); + txl += 1; + *txl = 0; + } + return txl; +} -static int flock_with_event(HANDLE fd, HANDLE event, unsigned flags, - size_t offset, size_t bytes) { - TRACE("lock>>: fd %p, event %p, flags 0x%x offset %zu, bytes %zu >>", fd, - event, flags, offset, bytes); - OVERLAPPED ov; - ov.Internal = 0; - ov.InternalHigh = 0; - ov.hEvent = event; - ov.Offset = (DWORD)offset; - ov.OffsetHigh = HIGH_DWORD(offset); - if (LockFileEx(fd, flags, 0, (DWORD)bytes, HIGH_DWORD(bytes), &ov)) { - TRACE("lock<<: fd %p, event %p, flags 0x%x offset %zu, bytes %zu << %s", fd, - event, flags, offset, bytes, "done"); +MDBX_INTERNAL void txl_free(txl_t txl) { + if (likely(txl)) + osal_free(txl - 1); +} + +MDBX_INTERNAL int txl_reserve(txl_t __restrict *__restrict ptxl, + const size_t wanna) { + const size_t allocated = (size_t)MDBX_PNL_ALLOCLEN(*ptxl); + assert(MDBX_PNL_GETSIZE(*ptxl) <= txl_max && + MDBX_PNL_ALLOCLEN(*ptxl) >= MDBX_PNL_GETSIZE(*ptxl)); + if (likely(allocated >= wanna)) return MDBX_SUCCESS; + + if (unlikely(wanna > /* paranoia */ txl_max)) { + ERROR("TXL too long (%zu > %zu)", wanna, (size_t)txl_max); + return MDBX_TXN_FULL; } - DWORD rc = GetLastError(); - if (rc == ERROR_IO_PENDING) { - if (event) { - if (GetOverlappedResult(fd, &ov, &rc, true)) { - TRACE("lock<<: fd %p, event %p, flags 0x%x offset %zu, bytes %zu << %s", - fd, event, flags, offset, bytes, "overlapped-done"); - return MDBX_SUCCESS; - } - rc = GetLastError(); - } else - CancelIo(fd); + const size_t size = (wanna + wanna - allocated < txl_max) + ? 
wanna + wanna - allocated + : txl_max; + size_t bytes = txl_size2bytes(size); + txl_t txl = osal_realloc(*ptxl - 1, bytes); + if (likely(txl)) { +#if __GLIBC_PREREQ(2, 12) || defined(__FreeBSD__) || defined(malloc_usable_size) + bytes = malloc_usable_size(txl); +#endif /* malloc_usable_size */ + *txl = txl_bytes2size(bytes); + assert(*txl >= wanna); + *ptxl = txl + 1; + return MDBX_SUCCESS; } - TRACE("lock<<: fd %p, event %p, flags 0x%x offset %zu, bytes %zu << err %d", - fd, event, flags, offset, bytes, (int)rc); - return (int)rc; + return MDBX_ENOMEM; } -static __inline int flock(HANDLE fd, unsigned flags, size_t offset, - size_t bytes) { - return flock_with_event(fd, 0, flags, offset, bytes); +static __always_inline int __must_check_result +txl_need(txl_t __restrict *__restrict ptxl, size_t num) { + assert(MDBX_PNL_GETSIZE(*ptxl) <= txl_max && + MDBX_PNL_ALLOCLEN(*ptxl) >= MDBX_PNL_GETSIZE(*ptxl)); + assert(num <= PAGELIST_LIMIT); + const size_t wanna = (size_t)MDBX_PNL_GETSIZE(*ptxl) + num; + return likely(MDBX_PNL_ALLOCLEN(*ptxl) >= wanna) ? MDBX_SUCCESS + : txl_reserve(ptxl, wanna); } -static __inline int flock_data(const MDBX_env *env, unsigned flags, - size_t offset, size_t bytes) { - const HANDLE fd4data = - env->me_overlapped_fd ? env->me_overlapped_fd : env->me_lazy_fd; - return flock_with_event(fd4data, env->me_data_lock_event, flags, offset, - bytes); +static __always_inline void txl_xappend(txl_t __restrict txl, txnid_t id) { + assert(MDBX_PNL_GETSIZE(txl) < MDBX_PNL_ALLOCLEN(txl)); + txl[0] += 1; + MDBX_PNL_LAST(txl) = id; } -static int funlock(mdbx_filehandle_t fd, size_t offset, size_t bytes) { - TRACE("unlock: fd %p, offset %zu, bytes %zu", fd, offset, bytes); - return UnlockFile(fd, (DWORD)offset, HIGH_DWORD(offset), (DWORD)bytes, - HIGH_DWORD(bytes)) - ? 
MDBX_SUCCESS - : (int)GetLastError(); +#define TXNID_SORT_CMP(first, last) ((first) > (last)) +SORT_IMPL(txnid_sort, false, txnid_t, TXNID_SORT_CMP) +MDBX_INTERNAL void txl_sort(txl_t txl) { + txnid_sort(MDBX_PNL_BEGIN(txl), MDBX_PNL_END(txl)); } -/*----------------------------------------------------------------------------*/ -/* global `write` lock for write-txt processing, - * exclusive locking both meta-pages) */ +MDBX_INTERNAL int __must_check_result txl_append(txl_t __restrict *ptxl, + txnid_t id) { + if (unlikely(MDBX_PNL_GETSIZE(*ptxl) == MDBX_PNL_ALLOCLEN(*ptxl))) { + int rc = txl_need(ptxl, txl_granulate); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; + } + txl_xappend(*ptxl, id); + return MDBX_SUCCESS; +} +/// \copyright SPDX-License-Identifier: Apache-2.0 +/// \author Леонид Юрьев aka Leonid Yuriev \date 2015-2024 -#ifdef _WIN64 -#define DXB_MAXLEN UINT64_C(0x7fffFFFFfff00000) -#else -#define DXB_MAXLEN UINT32_C(0x7ff00000) -#endif -#define DXB_BODY (env->me_psize * (size_t)NUM_METAS), DXB_MAXLEN -#define DXB_WHOLE 0, DXB_MAXLEN -int osal_txn_lock(MDBX_env *env, bool dontwait) { - if (dontwait) { - if (!TryEnterCriticalSection(&env->me_windowsbug_lock)) - return MDBX_BUSY; - } else { - __try { - EnterCriticalSection(&env->me_windowsbug_lock); +__hot txnid_t txn_snapshot_oldest(const MDBX_txn *const txn) { + return mvcc_shapshot_oldest( + txn->env, txn->tw.troika.txnid[txn->tw.troika.prefer_steady]); +} + +static void done_cursors(MDBX_txn *txn, const bool merge) { + tASSERT(txn, txn->cursors[FREE_DBI] == nullptr); + TXN_FOREACH_DBI_FROM(txn, i, /* skip FREE_DBI */ 1) { + MDBX_cursor *mc = txn->cursors[i]; + if (mc) { + txn->cursors[i] = nullptr; + do { + MDBX_cursor *const next = mc->next; + cursor_eot(mc, merge); + mc = next; + } while (mc); } - __except ((GetExceptionCode() == - 0xC0000194 /* STATUS_POSSIBLE_DEADLOCK / EXCEPTION_POSSIBLE_DEADLOCK */) - ? 
EXCEPTION_EXECUTE_HANDLER - : EXCEPTION_CONTINUE_SEARCH) { - return MDBX_EDEADLK; + } +} + +int txn_write(MDBX_txn *txn, iov_ctx_t *ctx) { + tASSERT(txn, (txn->flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC); + dpl_t *const dl = dpl_sort(txn); + int rc = MDBX_SUCCESS; + size_t r, w, total_npages = 0; + for (w = 0, r = 1; r <= dl->length; ++r) { + page_t *dp = dl->items[r].ptr; + if (dp->flags & P_LOOSE) { + dl->items[++w] = dl->items[r]; + continue; } + unsigned npages = dpl_npages(dl, r); + total_npages += npages; + rc = iov_page(txn, ctx, dp, npages); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; } - eASSERT(env, !env->me_txn0->mt_owner); - if (env->me_flags & MDBX_EXCLUSIVE) - goto done; + if (!iov_empty(ctx)) { + tASSERT(txn, rc == MDBX_SUCCESS); + rc = iov_write(ctx); + } - const HANDLE fd4data = - env->me_overlapped_fd ? env->me_overlapped_fd : env->me_lazy_fd; - int rc = flock_with_event(fd4data, env->me_data_lock_event, - dontwait ? (LCK_EXCLUSIVE | LCK_DONTWAIT) - : (LCK_EXCLUSIVE | LCK_WAITFOR), - DXB_BODY); - if (rc == ERROR_LOCK_VIOLATION && dontwait) { - SleepEx(0, true); - rc = flock_with_event(fd4data, env->me_data_lock_event, - LCK_EXCLUSIVE | LCK_DONTWAIT, DXB_BODY); - if (rc == ERROR_LOCK_VIOLATION) { - SleepEx(0, true); - rc = flock_with_event(fd4data, env->me_data_lock_event, - LCK_EXCLUSIVE | LCK_DONTWAIT, DXB_BODY); + if (likely(rc == MDBX_SUCCESS) && ctx->fd == txn->env->lazy_fd) { + txn->env->lck->unsynced_pages.weak += total_npages; + if (!txn->env->lck->eoos_timestamp.weak) + txn->env->lck->eoos_timestamp.weak = osal_monotime(); + } + + txn->tw.dirtylist->pages_including_loose -= total_npages; + while (r <= dl->length) + dl->items[++w] = dl->items[r++]; + + dl->sorted = dpl_setlen(dl, w); + txn->tw.dirtyroom += r - 1 - w; + tASSERT(txn, txn->tw.dirtyroom + txn->tw.dirtylist->length == + (txn->parent ? 
txn->parent->tw.dirtyroom + : txn->env->options.dp_limit)); + tASSERT(txn, txn->tw.dirtylist->length == txn->tw.loose_count); + tASSERT(txn, txn->tw.dirtylist->pages_including_loose == txn->tw.loose_count); + return rc; +} + +/* Merge child txn into parent */ +static void txn_merge(MDBX_txn *const parent, MDBX_txn *const txn, + const size_t parent_retired_len) { + tASSERT(txn, (txn->flags & MDBX_WRITEMAP) == 0); + dpl_t *const src = dpl_sort(txn); + + /* Remove refunded pages from parent's dirty list */ + dpl_t *const dst = dpl_sort(parent); + if (MDBX_ENABLE_REFUND) { + size_t n = dst->length; + while (n && dst->items[n].pgno >= parent->geo.first_unallocated) { + const unsigned npages = dpl_npages(dst, n); + page_shadow_release(txn->env, dst->items[n].ptr, npages); + --n; + } + parent->tw.dirtyroom += dst->sorted - n; + dst->sorted = dpl_setlen(dst, n); + tASSERT(parent, parent->tw.dirtyroom + parent->tw.dirtylist->length == + (parent->parent ? parent->parent->tw.dirtyroom + : parent->env->options.dp_limit)); + } + + /* Remove reclaimed pages from parent's dirty list */ + const pnl_t reclaimed_list = parent->tw.relist; + dpl_sift(parent, reclaimed_list, false); + + /* Move retired pages from parent's dirty & spilled list to reclaimed */ + size_t r, w, d, s, l; + for (r = w = parent_retired_len; + ++r <= MDBX_PNL_GETSIZE(parent->tw.retired_pages);) { + const pgno_t pgno = parent->tw.retired_pages[r]; + const size_t di = dpl_exist(parent, pgno); + const size_t si = !di ? spill_search(parent, pgno) : 0; + unsigned npages; + const char *kind; + if (di) { + page_t *dp = dst->items[di].ptr; + tASSERT(parent, (dp->flags & ~(P_LEAF | P_DUPFIX | P_BRANCH | P_LARGE | + P_SPILLED)) == 0); + npages = dpl_npages(dst, di); + page_wash(parent, di, dp, npages); + kind = "dirty"; + l = 1; + if (unlikely(npages > l)) { + /* OVERFLOW-страница могла быть переиспользована по частям. 
Тогда + * в retired-списке может быть только начало последовательности, + * а остаток растащен по dirty, spilled и reclaimed спискам. Поэтому + * переносим в reclaimed с проверкой на обрыв последовательности. + * В любом случае, все осколки будут учтены и отфильтрованы, т.е. если + * страница была разбита на части, то важно удалить dirty-элемент, + * а все осколки будут учтены отдельно. */ + + /* Список retired страниц не сортирован, но для ускорения сортировки + * дополняется в соответствии с MDBX_PNL_ASCENDING */ +#if MDBX_PNL_ASCENDING + const size_t len = MDBX_PNL_GETSIZE(parent->tw.retired_pages); + while (r < len && parent->tw.retired_pages[r + 1] == pgno + l) { + ++r; + if (++l == npages) + break; + } +#else + while (w > parent_retired_len && + parent->tw.retired_pages[w - 1] == pgno + l) { + --w; + if (++l == npages) + break; + } +#endif + } + } else if (unlikely(si)) { + l = npages = 1; + spill_remove(parent, si, 1); + kind = "spilled"; + } else { + parent->tw.retired_pages[++w] = pgno; + continue; } + + DEBUG("reclaim retired parent's %u -> %zu %s page %" PRIaPGNO, npages, l, + kind, pgno); + int err = pnl_insert_span(&parent->tw.relist, pgno, l); + ENSURE(txn->env, err == MDBX_SUCCESS); } - if (rc == MDBX_SUCCESS) { - done: - /* Zap: Failing to release lock 'env->me_windowsbug_lock' - * in function 'mdbx_txn_lock' */ - MDBX_SUPPRESS_GOOFY_MSVC_ANALYZER(26115); - env->me_txn0->mt_owner = osal_thread_self(); - return MDBX_SUCCESS; - } + MDBX_PNL_SETSIZE(parent->tw.retired_pages, w); - LeaveCriticalSection(&env->me_windowsbug_lock); - return (!dontwait || rc != ERROR_LOCK_VIOLATION) ? 
rc : MDBX_BUSY; -} + /* Filter-out parent spill list */ + if (parent->tw.spilled.list && + MDBX_PNL_GETSIZE(parent->tw.spilled.list) > 0) { + const pnl_t sl = spill_purge(parent); + size_t len = MDBX_PNL_GETSIZE(sl); + if (len) { + /* Remove refunded pages from parent's spill list */ + if (MDBX_ENABLE_REFUND && + MDBX_PNL_MOST(sl) >= (parent->geo.first_unallocated << 1)) { +#if MDBX_PNL_ASCENDING + size_t i = MDBX_PNL_GETSIZE(sl); + assert(MDBX_PNL_MOST(sl) == MDBX_PNL_LAST(sl)); + do { + if ((sl[i] & 1) == 0) + DEBUG("refund parent's spilled page %" PRIaPGNO, sl[i] >> 1); + i -= 1; + } while (i && sl[i] >= (parent->geo.first_unallocated << 1)); + MDBX_PNL_SETSIZE(sl, i); +#else + assert(MDBX_PNL_MOST(sl) == MDBX_PNL_FIRST(sl)); + size_t i = 0; + do { + ++i; + if ((sl[i] & 1) == 0) + DEBUG("refund parent's spilled page %" PRIaPGNO, sl[i] >> 1); + } while (i < len && sl[i + 1] >= (parent->geo.first_unallocated << 1)); + MDBX_PNL_SETSIZE(sl, len -= i); + memmove(sl + 1, sl + 1 + i, len * sizeof(sl[0])); +#endif + } + tASSERT(txn, pnl_check_allocated(sl, (size_t)parent->geo.first_unallocated + << 1)); -void osal_txn_unlock(MDBX_env *env) { - eASSERT(env, env->me_txn0->mt_owner == osal_thread_self()); - if ((env->me_flags & MDBX_EXCLUSIVE) == 0) { - const HANDLE fd4data = - env->me_overlapped_fd ? 
env->me_overlapped_fd : env->me_lazy_fd; - int err = funlock(fd4data, DXB_BODY); - if (err != MDBX_SUCCESS) - mdbx_panic("%s failed: err %u", __func__, err); - } - env->me_txn0->mt_owner = 0; - LeaveCriticalSection(&env->me_windowsbug_lock); -} + /* Remove reclaimed pages from parent's spill list */ + s = MDBX_PNL_GETSIZE(sl), r = MDBX_PNL_GETSIZE(reclaimed_list); + /* Scanning from end to begin */ + while (s && r) { + if (sl[s] & 1) { + --s; + continue; + } + const pgno_t spilled_pgno = sl[s] >> 1; + const pgno_t reclaimed_pgno = reclaimed_list[r]; + if (reclaimed_pgno != spilled_pgno) { + const bool cmp = MDBX_PNL_ORDERED(spilled_pgno, reclaimed_pgno); + s -= !cmp; + r -= cmp; + } else { + DEBUG("remove reclaimed parent's spilled page %" PRIaPGNO, + reclaimed_pgno); + spill_remove(parent, s, 1); + --s; + --r; + } + } -/*----------------------------------------------------------------------------*/ -/* global `read` lock for readers registration, - * exclusive locking `mti_numreaders` (second) cacheline */ + /* Remove anything in our dirty list from parent's spill list */ + /* Scanning spill list in descend order */ + const intptr_t step = MDBX_PNL_ASCENDING ? -1 : 1; + s = MDBX_PNL_ASCENDING ? MDBX_PNL_GETSIZE(sl) : 1; + d = src->length; + while (d && (MDBX_PNL_ASCENDING ? 
s > 0 : s <= MDBX_PNL_GETSIZE(sl))) { + if (sl[s] & 1) { + s += step; + continue; + } + const pgno_t spilled_pgno = sl[s] >> 1; + const pgno_t dirty_pgno_form = src->items[d].pgno; + const unsigned npages = dpl_npages(src, d); + const pgno_t dirty_pgno_to = dirty_pgno_form + npages; + if (dirty_pgno_form > spilled_pgno) { + --d; + continue; + } + if (dirty_pgno_to <= spilled_pgno) { + s += step; + continue; + } -#define LCK_LO_OFFSET 0 -#define LCK_LO_LEN offsetof(MDBX_lockinfo, mti_numreaders) -#define LCK_UP_OFFSET LCK_LO_LEN -#define LCK_UP_LEN (sizeof(MDBX_lockinfo) - LCK_UP_OFFSET) -#define LCK_LOWER LCK_LO_OFFSET, LCK_LO_LEN -#define LCK_UPPER LCK_UP_OFFSET, LCK_UP_LEN + DEBUG("remove dirtied parent's spilled %u page %" PRIaPGNO, npages, + dirty_pgno_form); + spill_remove(parent, s, 1); + s += step; + } -MDBX_INTERNAL_FUNC int osal_rdt_lock(MDBX_env *env) { - osal_srwlock_AcquireShared(&env->me_remap_guard); - if (env->me_lfd == INVALID_HANDLE_VALUE) - return MDBX_SUCCESS; /* readonly database in readonly filesystem */ + /* Squash deleted pagenums if we deleted any */ + spill_purge(parent); + } + } - /* transition from S-? (used) to S-E (locked), - * e.g. exclusive lock upper-part */ - if (env->me_flags & MDBX_EXCLUSIVE) - return MDBX_SUCCESS; + /* Remove anything in our spill list from parent's dirty list */ + if (txn->tw.spilled.list) { + tASSERT(txn, + pnl_check_allocated(txn->tw.spilled.list, + (size_t)parent->geo.first_unallocated << 1)); + dpl_sift(parent, txn->tw.spilled.list, true); + tASSERT(parent, parent->tw.dirtyroom + parent->tw.dirtylist->length == + (parent->parent ? 
parent->parent->tw.dirtyroom + : parent->env->options.dp_limit)); + } - int rc = flock(env->me_lfd, LCK_EXCLUSIVE | LCK_WAITFOR, LCK_UPPER); - if (rc == MDBX_SUCCESS) - return MDBX_SUCCESS; + /* Find length of merging our dirty list with parent's and release + * filter-out pages */ + for (l = 0, d = dst->length, s = src->length; d > 0 && s > 0;) { + page_t *sp = src->items[s].ptr; + tASSERT(parent, (sp->flags & ~(P_LEAF | P_DUPFIX | P_BRANCH | P_LARGE | + P_LOOSE | P_SPILLED)) == 0); + const unsigned s_npages = dpl_npages(src, s); + const pgno_t s_pgno = src->items[s].pgno; - osal_srwlock_ReleaseShared(&env->me_remap_guard); - return rc; -} + page_t *dp = dst->items[d].ptr; + tASSERT(parent, (dp->flags & ~(P_LEAF | P_DUPFIX | P_BRANCH | P_LARGE | + P_SPILLED)) == 0); + const unsigned d_npages = dpl_npages(dst, d); + const pgno_t d_pgno = dst->items[d].pgno; -MDBX_INTERNAL_FUNC void osal_rdt_unlock(MDBX_env *env) { - if (env->me_lfd != INVALID_HANDLE_VALUE && - (env->me_flags & MDBX_EXCLUSIVE) == 0) { - /* transition from S-E (locked) to S-? (used), e.g. unlock upper-part */ - int err = funlock(env->me_lfd, LCK_UPPER); - if (err != MDBX_SUCCESS) - mdbx_panic("%s failed: err %u", __func__, err); + if (d_pgno >= s_pgno + s_npages) { + --d; + ++l; + } else if (d_pgno + d_npages <= s_pgno) { + if (sp->flags != P_LOOSE) { + sp->txnid = parent->front_txnid; + sp->flags &= ~P_SPILLED; + } + --s; + ++l; + } else { + dst->items[d--].ptr = nullptr; + page_shadow_release(txn->env, dp, d_npages); + } } - osal_srwlock_ReleaseShared(&env->me_remap_guard); -} + assert(dst->sorted == dst->length); + tASSERT(parent, dst->detent >= l + d + s); + dst->sorted = l + d + s; /* the merged length */ -MDBX_INTERNAL_FUNC int osal_lockfile(mdbx_filehandle_t fd, bool wait) { - return flock( - fd, wait ? 
LCK_EXCLUSIVE | LCK_WAITFOR : LCK_EXCLUSIVE | LCK_DONTWAIT, 0, - DXB_MAXLEN); -} + while (s > 0) { + page_t *sp = src->items[s].ptr; + tASSERT(parent, (sp->flags & ~(P_LEAF | P_DUPFIX | P_BRANCH | P_LARGE | + P_LOOSE | P_SPILLED)) == 0); + if (sp->flags != P_LOOSE) { + sp->txnid = parent->front_txnid; + sp->flags &= ~P_SPILLED; + } + --s; + } -static int suspend_and_append(mdbx_handle_array_t **array, - const DWORD ThreadId) { - const unsigned limit = (*array)->limit; - if ((*array)->count == limit) { - mdbx_handle_array_t *const ptr = - osal_realloc((limit > ARRAY_LENGTH((*array)->handles)) - ? *array - : /* don't free initial array on the stack */ NULL, - sizeof(mdbx_handle_array_t) + - sizeof(HANDLE) * (limit * (size_t)2 - - ARRAY_LENGTH((*array)->handles))); - if (!ptr) - return MDBX_ENOMEM; - if (limit == ARRAY_LENGTH((*array)->handles)) - *ptr = **array; - *array = ptr; - (*array)->limit = limit * 2; + /* Merge our dirty list into parent's, i.e. merge(dst, src) -> dst */ + if (dst->sorted >= dst->length) { + /* from end to begin with dst extending */ + for (l = dst->sorted, s = src->length, d = dst->length; s > 0 && d > 0;) { + if (unlikely(l <= d)) { + /* squash to get a gap of free space for merge */ + for (r = w = 1; r <= d; ++r) + if (dst->items[r].ptr) { + if (w != r) { + dst->items[w] = dst->items[r]; + dst->items[r].ptr = nullptr; + } + ++w; + } + VERBOSE("squash to begin for extending-merge %zu -> %zu", d, w - 1); + d = w - 1; + continue; + } + assert(l > d); + if (dst->items[d].ptr) { + dst->items[l--] = (dst->items[d].pgno > src->items[s].pgno) + ? 
dst->items[d--] + : src->items[s--]; + } else + --d; + } + if (s > 0) { + assert(l == s); + while (d > 0) { + assert(dst->items[d].ptr == nullptr); + --d; + } + do { + assert(l > 0); + dst->items[l--] = src->items[s--]; + } while (s > 0); + } else { + assert(l == d); + while (l > 0) { + assert(dst->items[l].ptr != nullptr); + --l; + } + } + } else { + /* from begin to end with shrinking (a lot of new large/overflow pages) */ + for (l = s = d = 1; s <= src->length && d <= dst->length;) { + if (unlikely(l >= d)) { + /* squash to get a gap of free space for merge */ + for (r = w = dst->length; r >= d; --r) + if (dst->items[r].ptr) { + if (w != r) { + dst->items[w] = dst->items[r]; + dst->items[r].ptr = nullptr; + } + --w; + } + VERBOSE("squash to end for shrinking-merge %zu -> %zu", d, w + 1); + d = w + 1; + continue; + } + assert(l < d); + if (dst->items[d].ptr) { + dst->items[l++] = (dst->items[d].pgno < src->items[s].pgno) + ? dst->items[d++] + : src->items[s++]; + } else + ++d; + } + if (s <= src->length) { + assert(dst->sorted - l == src->length - s); + while (d <= dst->length) { + assert(dst->items[d].ptr == nullptr); + --d; + } + do { + assert(l <= dst->sorted); + dst->items[l++] = src->items[s++]; + } while (s <= src->length); + } else { + assert(dst->sorted - l == dst->length - d); + while (l <= dst->sorted) { + assert(l <= d && d <= dst->length && dst->items[d].ptr); + dst->items[l++] = dst->items[d++]; + } + } } + parent->tw.dirtyroom -= dst->sorted - dst->length; + assert(parent->tw.dirtyroom <= parent->env->options.dp_limit); + dpl_setlen(dst, dst->sorted); + parent->tw.dirtylru = txn->tw.dirtylru; + + /* В текущем понимании выгоднее пересчитать кол-во страниц, + * чем подмешивать лишние ветвления и вычисления в циклы выше. 
*/ + dst->pages_including_loose = 0; + for (r = 1; r <= dst->length; ++r) + dst->pages_including_loose += dpl_npages(dst, r); - HANDLE hThread = OpenThread(THREAD_SUSPEND_RESUME | THREAD_QUERY_INFORMATION, - FALSE, ThreadId); - if (hThread == NULL) - return (int)GetLastError(); + tASSERT(parent, dpl_check(parent)); + dpl_free(txn); - if (SuspendThread(hThread) == (DWORD)-1) { - int err = (int)GetLastError(); - DWORD ExitCode; - if (err == /* workaround for Win10 UCRT bug */ ERROR_ACCESS_DENIED || - !GetExitCodeThread(hThread, &ExitCode) || ExitCode != STILL_ACTIVE) - err = MDBX_SUCCESS; - CloseHandle(hThread); - return err; + if (txn->tw.spilled.list) { + if (parent->tw.spilled.list) { + /* Must not fail since space was preserved above. */ + pnl_merge(parent->tw.spilled.list, txn->tw.spilled.list); + pnl_free(txn->tw.spilled.list); + } else { + parent->tw.spilled.list = txn->tw.spilled.list; + parent->tw.spilled.least_removed = txn->tw.spilled.least_removed; + } + tASSERT(parent, dpl_check(parent)); } - (*array)->handles[(*array)->count++] = hThread; - return MDBX_SUCCESS; + parent->flags &= ~MDBX_TXN_HAS_CHILD; + if (parent->tw.spilled.list) { + assert(pnl_check_allocated(parent->tw.spilled.list, + (size_t)parent->geo.first_unallocated << 1)); + if (MDBX_PNL_GETSIZE(parent->tw.spilled.list)) + parent->flags |= MDBX_TXN_SPILLS; + } } -MDBX_INTERNAL_FUNC int -osal_suspend_threads_before_remap(MDBX_env *env, mdbx_handle_array_t **array) { - eASSERT(env, (env->me_flags & MDBX_NOSTICKYTHREADS) == 0); - const uintptr_t CurrentTid = GetCurrentThreadId(); - int rc; - if (env->me_lck_mmap.lck) { - /* Scan LCK for threads of the current process */ - const MDBX_reader *const begin = env->me_lck_mmap.lck->mti_readers; - const MDBX_reader *const end = - begin + - atomic_load32(&env->me_lck_mmap.lck->mti_numreaders, mo_AcquireRelease); - const uintptr_t WriteTxnOwner = env->me_txn0 ? 
env->me_txn0->mt_owner : 0; - for (const MDBX_reader *reader = begin; reader < end; ++reader) { - if (reader->mr_pid.weak != env->me_pid || !reader->mr_tid.weak) { - skip_lck: - continue; - } - if (reader->mr_tid.weak == CurrentTid || - reader->mr_tid.weak == WriteTxnOwner) - goto skip_lck; - - rc = suspend_and_append(array, (mdbx_tid_t)reader->mr_tid.weak); - if (rc != MDBX_SUCCESS) { - bailout_lck: - (void)osal_resume_threads_after_remap(*array); - return rc; - } - } - if (WriteTxnOwner && WriteTxnOwner != CurrentTid) { - rc = suspend_and_append(array, (mdbx_tid_t)WriteTxnOwner); - if (rc != MDBX_SUCCESS) - goto bailout_lck; - } - } else { - /* Without LCK (i.e. read-only mode). - * Walk through a snapshot of all running threads */ - eASSERT(env, env->me_flags & (MDBX_EXCLUSIVE | MDBX_RDONLY)); - const HANDLE hSnapshot = CreateToolhelp32Snapshot(TH32CS_SNAPTHREAD, 0); - if (hSnapshot == INVALID_HANDLE_VALUE) - return (int)GetLastError(); +static void take_gcprof(MDBX_txn *txn, MDBX_commit_latency *latency) { + MDBX_env *const env = txn->env; + if (MDBX_ENABLE_PROFGC) { + pgop_stat_t *const ptr = &env->lck->pgops; + latency->gc_prof.work_counter = ptr->gc_prof.work.spe_counter; + latency->gc_prof.work_rtime_monotonic = + osal_monotime_to_16dot16(ptr->gc_prof.work.rtime_monotonic); + latency->gc_prof.work_xtime_cpu = + osal_monotime_to_16dot16(ptr->gc_prof.work.xtime_cpu); + latency->gc_prof.work_rsteps = ptr->gc_prof.work.rsteps; + latency->gc_prof.work_xpages = ptr->gc_prof.work.xpages; + latency->gc_prof.work_majflt = ptr->gc_prof.work.majflt; - THREADENTRY32 entry; - entry.dwSize = sizeof(THREADENTRY32); + latency->gc_prof.self_counter = ptr->gc_prof.self.spe_counter; + latency->gc_prof.self_rtime_monotonic = + osal_monotime_to_16dot16(ptr->gc_prof.self.rtime_monotonic); + latency->gc_prof.self_xtime_cpu = + osal_monotime_to_16dot16(ptr->gc_prof.self.xtime_cpu); + latency->gc_prof.self_rsteps = ptr->gc_prof.self.rsteps; + latency->gc_prof.self_xpages = 
ptr->gc_prof.self.xpages; + latency->gc_prof.self_majflt = ptr->gc_prof.self.majflt; - if (!Thread32First(hSnapshot, &entry)) { - rc = (int)GetLastError(); - bailout_toolhelp: - CloseHandle(hSnapshot); - (void)osal_resume_threads_after_remap(*array); - return rc; - } + latency->gc_prof.wloops = ptr->gc_prof.wloops; + latency->gc_prof.coalescences = ptr->gc_prof.coalescences; + latency->gc_prof.wipes = ptr->gc_prof.wipes; + latency->gc_prof.flushes = ptr->gc_prof.flushes; + latency->gc_prof.kicks = ptr->gc_prof.kicks; + if (txn == env->basal_txn) + memset(&ptr->gc_prof, 0, sizeof(ptr->gc_prof)); + } else + memset(&latency->gc_prof, 0, sizeof(latency->gc_prof)); +} - do { - if (entry.th32OwnerProcessID != env->me_pid || - entry.th32ThreadID == CurrentTid) - continue; +int mdbx_txn_commit_ex(MDBX_txn *txn, MDBX_commit_latency *latency) { + STATIC_ASSERT(MDBX_TXN_FINISHED == + MDBX_TXN_BLOCKED - MDBX_TXN_HAS_CHILD - MDBX_TXN_ERROR); + const uint64_t ts_0 = latency ? osal_monotime() : 0; + uint64_t ts_1 = 0, ts_2 = 0, ts_3 = 0, ts_4 = 0, ts_5 = 0, gc_cputime = 0; - rc = suspend_and_append(array, entry.th32ThreadID); - if (rc != MDBX_SUCCESS) - goto bailout_toolhelp; + int rc = check_txn(txn, MDBX_TXN_FINISHED); + if (unlikely(rc != MDBX_SUCCESS)) { + if (latency) + memset(latency, 0, sizeof(*latency)); + return rc; + } - } while (Thread32Next(hSnapshot, &entry)); + MDBX_env *const env = txn->env; +#if MDBX_ENV_CHECKPID + if (unlikely(env->pid != osal_getpid())) { + env->flags |= ENV_FATAL_ERROR; + if (latency) + memset(latency, 0, sizeof(*latency)); + return MDBX_PANIC; + } +#endif /* MDBX_ENV_CHECKPID */ - rc = (int)GetLastError(); - if (rc != ERROR_NO_MORE_FILES) - goto bailout_toolhelp; - CloseHandle(hSnapshot); + if (unlikely(txn->flags & MDBX_TXN_ERROR)) { + rc = MDBX_RESULT_TRUE; + goto fail; } - return MDBX_SUCCESS; -} + /* txn_end() mode for a commit which writes nothing */ + unsigned end_mode = + TXN_END_PURE_COMMIT | TXN_END_UPDATE | TXN_END_SLOT | 
TXN_END_FREE; + if (unlikely(txn->flags & MDBX_TXN_RDONLY)) + goto done; -MDBX_INTERNAL_FUNC int -osal_resume_threads_after_remap(mdbx_handle_array_t *array) { - int rc = MDBX_SUCCESS; - for (unsigned i = 0; i < array->count; ++i) { - const HANDLE hThread = array->handles[i]; - if (ResumeThread(hThread) == (DWORD)-1) { - const int err = (int)GetLastError(); - DWORD ExitCode; - if (err != /* workaround for Win10 UCRT bug */ ERROR_ACCESS_DENIED && - GetExitCodeThread(hThread, &ExitCode) && ExitCode == STILL_ACTIVE) - rc = err; - } - CloseHandle(hThread); + if ((txn->flags & MDBX_NOSTICKYTHREADS) && + unlikely(txn->owner != osal_thread_self())) { + rc = MDBX_THREAD_MISMATCH; + goto fail; } - return rc; -} -/*----------------------------------------------------------------------------*/ -/* global `initial` lock for lockfile initialization, - * exclusive/shared locking first cacheline */ + if (txn->nested) { + rc = mdbx_txn_commit_ex(txn->nested, nullptr); + tASSERT(txn, txn->nested == nullptr); + if (unlikely(rc != MDBX_SUCCESS)) + goto fail; + } -/* Briefly description of locking schema/algorithm: - * - Windows does not support upgrading or downgrading for file locking. - * - Therefore upgrading/downgrading is emulated by shared and exclusive - * locking of upper and lower halves. - * - In other words, we have FSM with possible 9 states, - * i.e. free/shared/exclusive x free/shared/exclusive == 9. - * Only 6 states of FSM are used, which 2 of ones are transitive. - * - * States: - * ?-? = free, i.e. unlocked - * S-? = used, i.e. shared lock - * E-? = exclusive-read, i.e. operational exclusive - * ?-S - * ?-E = middle (transitive state) - * S-S - * S-E = locked (transitive state) - * E-S - * E-E = exclusive-write, i.e. exclusive due (re)initialization - * - * The osal_lck_seize() moves the locking-FSM from the initial free/unlocked - * state to the "exclusive write" (and returns MDBX_RESULT_TRUE) if possible, - * or to the "used" (and returns MDBX_RESULT_FALSE). 
- * - * The osal_lck_downgrade() moves the locking-FSM from "exclusive write" - * state to the "used" (i.e. shared) state. - * - * The osal_lck_upgrade() moves the locking-FSM from "used" (i.e. shared) - * state to the "exclusive write" state. - */ + if (unlikely(txn != env->txn)) { + DEBUG("%s", "attempt to commit unknown transaction"); + rc = MDBX_EINVAL; + goto fail; + } -static void lck_unlock(MDBX_env *env) { - int err; + if (txn->parent) { + tASSERT(txn, audit_ex(txn, 0, false) == 0); + eASSERT(env, txn != env->basal_txn); + MDBX_txn *const parent = txn->parent; + eASSERT(env, parent->signature == txn_signature); + eASSERT(env, + parent->nested == txn && (parent->flags & MDBX_TXN_HAS_CHILD) != 0); + eASSERT(env, dpl_check(txn)); - if (env->me_lfd != INVALID_HANDLE_VALUE) { - /* double `unlock` for robustly remove overlapped shared/exclusive locks */ - do - err = funlock(env->me_lfd, LCK_LOWER); - while (err == MDBX_SUCCESS); - assert(err == ERROR_NOT_LOCKED || - (mdbx_RunningUnderWine() && err == ERROR_LOCK_VIOLATION)); - SetLastError(ERROR_SUCCESS); + if (txn->tw.dirtylist->length == 0 && !(txn->flags & MDBX_TXN_DIRTY) && + parent->n_dbi == txn->n_dbi) { + TXN_FOREACH_DBI_ALL(txn, i) { + tASSERT(txn, (txn->dbi_state[i] & DBI_DIRTY) == 0); + if ((txn->dbi_state[i] & DBI_STALE) && + !(parent->dbi_state[i] & DBI_STALE)) + tASSERT(txn, + memcmp(&parent->dbs[i], &txn->dbs[i], sizeof(tree_t)) == 0); + } - do - err = funlock(env->me_lfd, LCK_UPPER); - while (err == MDBX_SUCCESS); - assert(err == ERROR_NOT_LOCKED || - (mdbx_RunningUnderWine() && err == ERROR_LOCK_VIOLATION)); - SetLastError(ERROR_SUCCESS); - } + tASSERT(txn, memcmp(&parent->geo, &txn->geo, sizeof(parent->geo)) == 0); + tASSERT(txn, memcmp(&parent->canary, &txn->canary, + sizeof(parent->canary)) == 0); + tASSERT(txn, !txn->tw.spilled.list || + MDBX_PNL_GETSIZE(txn->tw.spilled.list) == 0); + tASSERT(txn, txn->tw.loose_count == 0); - const HANDLE fd4data = - env->me_overlapped_fd ? 
env->me_overlapped_fd : env->me_lazy_fd; - if (fd4data != INVALID_HANDLE_VALUE) { - /* explicitly unlock to avoid latency for other processes (windows kernel - * releases such locks via deferred queues) */ - do - err = funlock(fd4data, DXB_BODY); - while (err == MDBX_SUCCESS); - assert(err == ERROR_NOT_LOCKED || - (mdbx_RunningUnderWine() && err == ERROR_LOCK_VIOLATION)); - SetLastError(ERROR_SUCCESS); + /* fast completion of pure nested transaction */ + VERBOSE("fast-complete pure nested txn %" PRIaTXN, txn->txnid); + end_mode = TXN_END_PURE_COMMIT | TXN_END_SLOT | TXN_END_FREE; + goto done; + } - do - err = funlock(fd4data, DXB_WHOLE); - while (err == MDBX_SUCCESS); - assert(err == ERROR_NOT_LOCKED || - (mdbx_RunningUnderWine() && err == ERROR_LOCK_VIOLATION)); - SetLastError(ERROR_SUCCESS); - } -} + /* Preserve space for spill list to avoid parent's state corruption + * if allocation fails. */ + const size_t parent_retired_len = (uintptr_t)parent->tw.retired_pages; + tASSERT(txn, parent_retired_len <= MDBX_PNL_GETSIZE(txn->tw.retired_pages)); + const size_t retired_delta = + MDBX_PNL_GETSIZE(txn->tw.retired_pages) - parent_retired_len; + if (retired_delta) { + rc = pnl_need(&txn->tw.relist, retired_delta); + if (unlikely(rc != MDBX_SUCCESS)) + goto fail; + } -/* Seize state as 'exclusive-write' (E-E and returns MDBX_RESULT_TRUE) - * or as 'used' (S-? and returns MDBX_RESULT_FALSE). - * Otherwise returns an error. */ -static int internal_seize_lck(HANDLE lfd) { - assert(lfd != INVALID_HANDLE_VALUE); + if (txn->tw.spilled.list) { + if (parent->tw.spilled.list) { + rc = pnl_need(&parent->tw.spilled.list, + MDBX_PNL_GETSIZE(txn->tw.spilled.list)); + if (unlikely(rc != MDBX_SUCCESS)) + goto fail; + } + spill_purge(txn); + } - /* 1) now on ?-? 
(free), get ?-E (middle) */ - jitter4testing(false); - int rc = flock(lfd, LCK_EXCLUSIVE | LCK_WAITFOR, LCK_UPPER); - if (rc != MDBX_SUCCESS) { - /* 2) something went wrong, give up */; - ERROR("%s, err %u", "?-?(free) >> ?-E(middle)", rc); - return rc; - } + if (unlikely(txn->tw.dirtylist->length + parent->tw.dirtylist->length > + parent->tw.dirtylist->detent && + !dpl_reserve(parent, txn->tw.dirtylist->length + + parent->tw.dirtylist->length))) { + rc = MDBX_ENOMEM; + goto fail; + } - /* 3) now on ?-E (middle), try E-E (exclusive-write) */ - jitter4testing(false); - rc = flock(lfd, LCK_EXCLUSIVE | LCK_DONTWAIT, LCK_LOWER); - if (rc == MDBX_SUCCESS) - return MDBX_RESULT_TRUE /* 4) got E-E (exclusive-write), done */; + //------------------------------------------------------------------------- - /* 5) still on ?-E (middle) */ - jitter4testing(false); - if (rc != ERROR_SHARING_VIOLATION && rc != ERROR_LOCK_VIOLATION) { - /* 6) something went wrong, give up */ - rc = funlock(lfd, LCK_UPPER); - if (rc != MDBX_SUCCESS) - mdbx_panic("%s(%s) failed: err %u", __func__, "?-E(middle) >> ?-?(free)", - rc); - return rc; - } + parent->tw.gc.reclaimed = txn->tw.gc.reclaimed; + txn->tw.gc.reclaimed = nullptr; - /* 7) still on ?-E (middle), try S-E (locked) */ - jitter4testing(false); - rc = flock(lfd, LCK_SHARED | LCK_DONTWAIT, LCK_LOWER); + parent->tw.retired_pages = txn->tw.retired_pages; + txn->tw.retired_pages = nullptr; - jitter4testing(false); - if (rc != MDBX_SUCCESS) - ERROR("%s, err %u", "?-E(middle) >> S-E(locked)", rc); + pnl_free(parent->tw.relist); + parent->tw.relist = txn->tw.relist; + txn->tw.relist = nullptr; + parent->tw.gc.time_acc = txn->tw.gc.time_acc; + parent->tw.gc.last_reclaimed = txn->tw.gc.last_reclaimed; - /* 8) now on S-E (locked) or still on ?-E (middle), - * transition to S-? (used) or ?-? 
(free) */ - int err = funlock(lfd, LCK_UPPER); - if (err != MDBX_SUCCESS) - mdbx_panic("%s(%s) failed: err %u", __func__, - "X-E(locked/middle) >> X-?(used/free)", err); + parent->geo = txn->geo; + parent->canary = txn->canary; + parent->flags |= txn->flags & MDBX_TXN_DIRTY; - /* 9) now on S-? (used, DONE) or ?-? (free, FAILURE) */ - return rc; -} + /* Move loose pages to parent */ +#if MDBX_ENABLE_REFUND + parent->tw.loose_refund_wl = txn->tw.loose_refund_wl; +#endif /* MDBX_ENABLE_REFUND */ + parent->tw.loose_count = txn->tw.loose_count; + parent->tw.loose_pages = txn->tw.loose_pages; -MDBX_INTERNAL_FUNC int osal_lck_seize(MDBX_env *env) { - const HANDLE fd4data = - env->me_overlapped_fd ? env->me_overlapped_fd : env->me_lazy_fd; - assert(fd4data != INVALID_HANDLE_VALUE); - if (env->me_flags & MDBX_EXCLUSIVE) - return MDBX_RESULT_TRUE /* nope since files were must be opened - non-shareable */ - ; + /* Merge our cursors into parent's and close them */ + done_cursors(txn, true); + end_mode |= TXN_END_EOTDONE; - if (env->me_lfd == INVALID_HANDLE_VALUE) { - /* LY: without-lck mode (e.g. on read-only filesystem) */ - jitter4testing(false); - int rc = flock_data(env, LCK_SHARED | LCK_DONTWAIT, DXB_WHOLE); - if (rc != MDBX_SUCCESS) - ERROR("%s, err %u", "without-lck", rc); - return rc; - } + /* Update parent's DBs array */ + eASSERT(env, parent->n_dbi == txn->n_dbi); + TXN_FOREACH_DBI_ALL(txn, dbi) { + if (txn->dbi_state[dbi] & (DBI_CREAT | DBI_FRESH | DBI_DIRTY)) { + parent->dbs[dbi] = txn->dbs[dbi]; + /* preserve parent's status */ + const uint8_t state = + txn->dbi_state[dbi] | + (parent->dbi_state[dbi] & (DBI_CREAT | DBI_FRESH | DBI_DIRTY)); + DEBUG("dbi %zu dbi-state %s 0x%02x -> 0x%02x", dbi, + (parent->dbi_state[dbi] != state) ? 
"update" : "still", + parent->dbi_state[dbi], state); + parent->dbi_state[dbi] = state; + } else { + eASSERT(env, + txn->dbi_state[dbi] == (parent->dbi_state[dbi] & + ~(DBI_FRESH | DBI_CREAT | DBI_DIRTY))); + } + } - int rc = internal_seize_lck(env->me_lfd); - jitter4testing(false); - if (rc == MDBX_RESULT_TRUE && (env->me_flags & MDBX_RDONLY) == 0) { - /* Check that another process don't operates in without-lck mode. - * Doing such check by exclusive locking the body-part of db. Should be - * noted: - * - we need an exclusive lock for do so; - * - we can't lock meta-pages, otherwise other process could get an error - * while opening db in valid (non-conflict) mode. */ - int err = flock_data(env, LCK_EXCLUSIVE | LCK_DONTWAIT, DXB_WHOLE); - if (err != MDBX_SUCCESS) { - ERROR("%s, err %u", "lock-against-without-lck", err); - jitter4testing(false); - lck_unlock(env); - return err; + if (latency) { + ts_1 = osal_monotime(); + ts_2 = /* no gc-update */ ts_1; + ts_3 = /* no audit */ ts_2; + ts_4 = /* no write */ ts_3; + ts_5 = /* no sync */ ts_4; } - jitter4testing(false); - err = funlock(fd4data, DXB_WHOLE); - if (err != MDBX_SUCCESS) - mdbx_panic("%s(%s) failed: err %u", __func__, - "unlock-against-without-lck", err); + txn_merge(parent, txn, parent_retired_len); + env->txn = parent; + parent->nested = nullptr; + tASSERT(parent, dpl_check(parent)); + +#if MDBX_ENABLE_REFUND + txn_refund(parent); + if (ASSERT_ENABLED()) { + /* Check parent's loose pages not suitable for refund */ + for (page_t *lp = parent->tw.loose_pages; lp; lp = page_next(lp)) { + tASSERT(parent, lp->pgno < parent->tw.loose_refund_wl && + lp->pgno + 1 < parent->geo.first_unallocated); + MDBX_ASAN_UNPOISON_MEMORY_REGION(&page_next(lp), sizeof(page_t *)); + VALGRIND_MAKE_MEM_DEFINED(&page_next(lp), sizeof(page_t *)); + } + /* Check parent's reclaimed pages not suitable for refund */ + if (MDBX_PNL_GETSIZE(parent->tw.relist)) + tASSERT(parent, MDBX_PNL_MOST(parent->tw.relist) + 1 < + 
parent->geo.first_unallocated); + } +#endif /* MDBX_ENABLE_REFUND */ + + txn->signature = 0; + osal_free(txn); + tASSERT(parent, audit_ex(parent, 0, false) == 0); + rc = MDBX_SUCCESS; + goto provide_latency; } - return rc; -} + if (!txn->tw.dirtylist) { + tASSERT(txn, (txn->flags & MDBX_WRITEMAP) != 0 && !MDBX_AVOID_MSYNC); + } else { + tASSERT(txn, (txn->flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC); + tASSERT(txn, txn->tw.dirtyroom + txn->tw.dirtylist->length == + (txn->parent ? txn->parent->tw.dirtyroom + : env->options.dp_limit)); + } + done_cursors(txn, false); + end_mode |= TXN_END_EOTDONE; -MDBX_INTERNAL_FUNC int osal_lck_downgrade(MDBX_env *env) { - const HANDLE fd4data = - env->me_overlapped_fd ? env->me_overlapped_fd : env->me_lazy_fd; - /* Transite from exclusive-write state (E-E) to used (S-?) */ - assert(fd4data != INVALID_HANDLE_VALUE); - assert(env->me_lfd != INVALID_HANDLE_VALUE); + if ((!txn->tw.dirtylist || txn->tw.dirtylist->length == 0) && + (txn->flags & (MDBX_TXN_DIRTY | MDBX_TXN_SPILLS)) == 0) { + TXN_FOREACH_DBI_ALL(txn, i) { + tASSERT(txn, !(txn->dbi_state[i] & DBI_DIRTY)); + } +#if defined(MDBX_NOSUCCESS_EMPTY_COMMIT) && MDBX_NOSUCCESS_EMPTY_COMMIT + rc = txn_end(txn, end_mode); + if (unlikely(rc != MDBX_SUCCESS)) + goto fail; + rc = MDBX_RESULT_TRUE; + goto provide_latency; +#else + goto done; +#endif /* MDBX_NOSUCCESS_EMPTY_COMMIT */ + } - if (env->me_flags & MDBX_EXCLUSIVE) - return MDBX_SUCCESS /* nope since files were must be opened non-shareable */ - ; - /* 1) now at E-E (exclusive-write), transition to ?_E (middle) */ - int rc = funlock(env->me_lfd, LCK_LOWER); - if (rc != MDBX_SUCCESS) - mdbx_panic("%s(%s) failed: err %u", __func__, - "E-E(exclusive-write) >> ?-E(middle)", rc); + DEBUG("committing txn %" PRIaTXN " %p on env %p, root page %" PRIaPGNO + "/%" PRIaPGNO, + txn->txnid, (void *)txn, (void *)env, txn->dbs[MAIN_DBI].root, + txn->dbs[FREE_DBI].root); - /* 2) now at ?-E (middle), transition to S-E (locked) */ - rc = 
flock(env->me_lfd, LCK_SHARED | LCK_DONTWAIT, LCK_LOWER); - if (rc != MDBX_SUCCESS) { - /* 3) something went wrong, give up */; - ERROR("%s, err %u", "?-E(middle) >> S-E(locked)", rc); - return rc; + if (txn->n_dbi > CORE_DBS) { + /* Update subDB root pointers */ + cursor_couple_t cx; + rc = cursor_init(&cx.outer, txn, MAIN_DBI); + if (unlikely(rc != MDBX_SUCCESS)) + goto fail; + cx.outer.next = txn->cursors[MAIN_DBI]; + txn->cursors[MAIN_DBI] = &cx.outer; + TXN_FOREACH_DBI_USER(txn, i) { + if ((txn->dbi_state[i] & DBI_DIRTY) == 0) + continue; + tree_t *const db = &txn->dbs[i]; + DEBUG("update main's entry for sub-db %zu, mod_txnid %" PRIaTXN + " -> %" PRIaTXN, + i, db->mod_txnid, txn->txnid); + /* Может быть mod_txnid > front после коммита вложенных тразакций */ + db->mod_txnid = txn->txnid; + MDBX_val data = {db, sizeof(tree_t)}; + rc = cursor_put(&cx.outer, &env->kvs[i].name, &data, N_SUBDATA); + if (unlikely(rc != MDBX_SUCCESS)) { + txn->cursors[MAIN_DBI] = cx.outer.next; + goto fail; + } + } + txn->cursors[MAIN_DBI] = cx.outer.next; } - /* 4) got S-E (locked), continue transition to S-? (used) */ - rc = funlock(env->me_lfd, LCK_UPPER); - if (rc != MDBX_SUCCESS) - mdbx_panic("%s(%s) failed: err %u", __func__, "S-E(locked) >> S-?(used)", - rc); + ts_1 = latency ? osal_monotime() : 0; - return MDBX_SUCCESS /* 5) now at S-? (used), done */; -} + gcu_t gcu_ctx; + gc_cputime = latency ? osal_cputime(nullptr) : 0; + rc = gc_update_init(txn, &gcu_ctx); + if (unlikely(rc != MDBX_SUCCESS)) + goto fail; + rc = gc_update(txn, &gcu_ctx); + gc_cputime = latency ? osal_cputime(nullptr) - gc_cputime : 0; + if (unlikely(rc != MDBX_SUCCESS)) + goto fail; -MDBX_INTERNAL_FUNC int osal_lck_upgrade(MDBX_env *env, bool dont_wait) { - /* Transite from used state (S-?) to exclusive-write (E-E) */ - assert(env->me_lfd != INVALID_HANDLE_VALUE); + tASSERT(txn, txn->tw.loose_count == 0); + txn->dbs[FREE_DBI].mod_txnid = (txn->dbi_state[FREE_DBI] & DBI_DIRTY) + ? 
txn->txnid + : txn->dbs[FREE_DBI].mod_txnid; - if (env->me_flags & MDBX_EXCLUSIVE) - return MDBX_SUCCESS /* nope since files were must be opened non-shareable */ - ; + txn->dbs[MAIN_DBI].mod_txnid = (txn->dbi_state[MAIN_DBI] & DBI_DIRTY) + ? txn->txnid + : txn->dbs[MAIN_DBI].mod_txnid; - /* 1) now on S-? (used), try S-E (locked) */ - jitter4testing(false); - int rc = flock(env->me_lfd, - dont_wait ? LCK_EXCLUSIVE | LCK_DONTWAIT : LCK_EXCLUSIVE, - LCK_UPPER); - if (rc != MDBX_SUCCESS) { - /* 2) something went wrong, give up */; - VERBOSE("%s, err %u", "S-?(used) >> S-E(locked)", rc); - return rc; + ts_2 = latency ? osal_monotime() : 0; + ts_3 = ts_2; + if (AUDIT_ENABLED()) { + rc = audit_ex(txn, MDBX_PNL_GETSIZE(txn->tw.retired_pages), true); + ts_3 = osal_monotime(); + if (unlikely(rc != MDBX_SUCCESS)) + goto fail; } - /* 3) now on S-E (locked), transition to ?-E (middle) */ - rc = funlock(env->me_lfd, LCK_LOWER); - if (rc != MDBX_SUCCESS) - mdbx_panic("%s(%s) failed: err %u", __func__, "S-E(locked) >> ?-E(middle)", - rc); + bool need_flush_for_nometasync = false; + const meta_ptr_t head = meta_recent(env, &txn->tw.troika); + const uint32_t meta_sync_txnid = + atomic_load32(&env->lck->meta_sync_txnid, mo_Relaxed); + /* sync prev meta */ + if (head.is_steady && meta_sync_txnid != (uint32_t)head.txnid) { + /* Исправление унаследованного от LMDB недочета: + * + * Всё хорошо, если все процессы работающие с БД не используют WRITEMAP. + * Тогда мета-страница (обновленная, но не сброшенная на диск) будет + * сохранена в результате fdatasync() при записи данных этой транзакции. + * + * Всё хорошо, если все процессы работающие с БД используют WRITEMAP + * без MDBX_AVOID_MSYNC. + * Тогда мета-страница (обновленная, но не сброшенная на диск) будет + * сохранена в результате msync() при записи данных этой транзакции. 
+ * + * Если же в процессах работающих с БД используется оба метода, как sync() + * в режиме MDBX_WRITEMAP, так и записи через файловый дескриптор, то + * становится невозможным обеспечить фиксацию на диске мета-страницы + * предыдущей транзакции и данных текущей транзакции, за счет одной + * sync-операцией выполняемой после записи данных текущей транзакции. + * Соответственно, требуется явно обновлять мета-страницу, что полностью + * уничтожает выгоду от NOMETASYNC. */ + const uint32_t txnid_dist = + ((txn->flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC) + ? MDBX_NOMETASYNC_LAZY_FD + : MDBX_NOMETASYNC_LAZY_WRITEMAP; + /* Смысл "магии" в том, чтобы избежать отдельного вызова fdatasync() + * или msync() для гарантированной фиксации на диске мета-страницы, + * которая была "лениво" отправлена на запись в предыдущей транзакции, + * но не сброшена на диск из-за активного режима MDBX_NOMETASYNC. */ + if ( +#if defined(_WIN32) || defined(_WIN64) + !env->ioring.overlapped_fd && +#endif + meta_sync_txnid == (uint32_t)head.txnid - txnid_dist) + need_flush_for_nometasync = true; + else { + rc = meta_sync(env, head); + if (unlikely(rc != MDBX_SUCCESS)) { + ERROR("txn-%s: error %d", "presync-meta", rc); + goto fail; + } + } + } - /* 4) now on ?-E (middle), try E-E (exclusive-write) */ - jitter4testing(false); - rc = flock(env->me_lfd, - dont_wait ? LCK_EXCLUSIVE | LCK_DONTWAIT : LCK_EXCLUSIVE, - LCK_LOWER); - if (rc != MDBX_SUCCESS) { - /* 5) something went wrong, give up */; - VERBOSE("%s, err %u", "?-E(middle) >> E-E(exclusive-write)", rc); - return rc; + if (txn->tw.dirtylist) { + tASSERT(txn, (txn->flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC); + tASSERT(txn, txn->tw.loose_count == 0); + + mdbx_filehandle_t fd = +#if defined(_WIN32) || defined(_WIN64) + env->ioring.overlapped_fd ? 
env->ioring.overlapped_fd : env->lazy_fd; + (void)need_flush_for_nometasync; +#else + (need_flush_for_nometasync || env->dsync_fd == INVALID_HANDLE_VALUE || + txn->tw.dirtylist->length > env->options.writethrough_threshold || + atomic_load64(&env->lck->unsynced_pages, mo_Relaxed)) + ? env->lazy_fd + : env->dsync_fd; +#endif /* Windows */ + + iov_ctx_t write_ctx; + rc = iov_init(txn, &write_ctx, txn->tw.dirtylist->length, + txn->tw.dirtylist->pages_including_loose, fd, false); + if (unlikely(rc != MDBX_SUCCESS)) { + ERROR("txn-%s: error %d", "iov-init", rc); + goto fail; + } + + rc = txn_write(txn, &write_ctx); + if (unlikely(rc != MDBX_SUCCESS)) { + ERROR("txn-%s: error %d", "write", rc); + goto fail; + } + } else { + tASSERT(txn, (txn->flags & MDBX_WRITEMAP) != 0 && !MDBX_AVOID_MSYNC); + env->lck->unsynced_pages.weak += txn->tw.writemap_dirty_npages; + if (!env->lck->eoos_timestamp.weak) + env->lck->eoos_timestamp.weak = osal_monotime(); } - return MDBX_SUCCESS /* 6) now at E-E (exclusive-write), done */; -} + /* TODO: use ctx.flush_begin & ctx.flush_end for range-sync */ + ts_4 = latency ? 
osal_monotime() : 0; -MDBX_INTERNAL_FUNC int osal_lck_init(MDBX_env *env, - MDBX_env *inprocess_neighbor, - int global_uniqueness_flag) { - (void)env; - (void)inprocess_neighbor; - (void)global_uniqueness_flag; - if (mdbx_SetFileIoOverlappedRange && !(env->me_flags & MDBX_RDONLY)) { - HANDLE token = INVALID_HANDLE_VALUE; - TOKEN_PRIVILEGES privileges; - privileges.PrivilegeCount = 1; - privileges.Privileges[0].Attributes = SE_PRIVILEGE_ENABLED; - if (!OpenProcessToken(GetCurrentProcess(), TOKEN_ADJUST_PRIVILEGES, - &token) || - !LookupPrivilegeValue(NULL, SE_LOCK_MEMORY_NAME, - &privileges.Privileges[0].Luid) || - !AdjustTokenPrivileges(token, FALSE, &privileges, sizeof(privileges), - nullptr, nullptr) || - GetLastError() != ERROR_SUCCESS) - mdbx_SetFileIoOverlappedRange = NULL; + meta_t meta; + memcpy(meta.magic_and_version, head.ptr_c->magic_and_version, 8); + meta.reserve16 = head.ptr_c->reserve16; + meta.validator_id = head.ptr_c->validator_id; + meta.extra_pagehdr = head.ptr_c->extra_pagehdr; + unaligned_poke_u64(4, meta.pages_retired, + unaligned_peek_u64(4, head.ptr_c->pages_retired) + + MDBX_PNL_GETSIZE(txn->tw.retired_pages)); + meta.geometry = txn->geo; + meta.trees.gc = txn->dbs[FREE_DBI]; + meta.trees.main = txn->dbs[MAIN_DBI]; + meta.canary = txn->canary; - if (token != INVALID_HANDLE_VALUE) - CloseHandle(token); + txnid_t commit_txnid = txn->txnid; +#if MDBX_ENABLE_BIGFOOT + if (gcu_ctx.bigfoot > txn->txnid) { + commit_txnid = gcu_ctx.bigfoot; + TRACE("use @%" PRIaTXN " (+%zu) for commit bigfoot-txn", commit_txnid, + (size_t)(commit_txnid - txn->txnid)); } - return MDBX_SUCCESS; -} +#endif + meta.unsafe_sign = DATASIGN_NONE; + meta_set_txnid(env, &meta, commit_txnid); -MDBX_INTERNAL_FUNC int osal_lck_destroy(MDBX_env *env, - MDBX_env *inprocess_neighbor, - const uint32_t current_pid) { - (void)current_pid; - /* LY: should unmap before releasing the locks to avoid race condition and - * STATUS_USER_MAPPED_FILE/ERROR_USER_MAPPED_FILE */ - if 
(env->me_map) - osal_munmap(&env->me_dxb_mmap); - if (env->me_lck_mmap.lck) { - const bool synced = env->me_lck_mmap.lck->mti_unsynced_pages.weak == 0; - osal_munmap(&env->me_lck_mmap); - if (synced && !inprocess_neighbor && env->me_lfd != INVALID_HANDLE_VALUE && - osal_lck_upgrade(env, true) == MDBX_SUCCESS) - /* this will fail if LCK is used/mmapped by other process(es) */ - osal_ftruncate(env->me_lfd, 0); - } - lck_unlock(env); - return MDBX_SUCCESS; -} + rc = dxb_sync_locked(env, env->flags | txn->flags | txn_shrink_allowed, &meta, + &txn->tw.troika); -/*----------------------------------------------------------------------------*/ -/* reader checking (by pid) */ + ts_5 = latency ? osal_monotime() : 0; + if (unlikely(rc != MDBX_SUCCESS)) { + env->flags |= ENV_FATAL_ERROR; + ERROR("txn-%s: error %d", "sync", rc); + goto fail; + } -MDBX_INTERNAL_FUNC int osal_rpid_set(MDBX_env *env) { - (void)env; - return MDBX_SUCCESS; -} + end_mode = TXN_END_COMMITTED | TXN_END_UPDATE | TXN_END_EOTDONE; -MDBX_INTERNAL_FUNC int osal_rpid_clear(MDBX_env *env) { - (void)env; - return MDBX_SUCCESS; -} +done: + if (latency) + take_gcprof(txn, latency); + rc = txn_end(txn, end_mode); -/* Checks reader by pid. - * - * Returns: - * MDBX_RESULT_TRUE, if pid is live (unable to acquire lock) - * MDBX_RESULT_FALSE, if pid is dead (lock acquired) - * or otherwise the errcode. */ -MDBX_INTERNAL_FUNC int osal_rpid_check(MDBX_env *env, uint32_t pid) { - (void)env; - HANDLE hProcess = OpenProcess(SYNCHRONIZE, FALSE, pid); - int rc; - if (likely(hProcess)) { - rc = WaitForSingleObject(hProcess, 0); - if (unlikely(rc == (int)WAIT_FAILED)) - rc = (int)GetLastError(); - CloseHandle(hProcess); - } else { - rc = (int)GetLastError(); +provide_latency: + if (latency) { + latency->preparation = ts_1 ? osal_monotime_to_16dot16(ts_1 - ts_0) : 0; + latency->gc_wallclock = + (ts_2 > ts_1) ? osal_monotime_to_16dot16(ts_2 - ts_1) : 0; + latency->gc_cputime = gc_cputime ? 
osal_monotime_to_16dot16(gc_cputime) : 0; + latency->audit = (ts_3 > ts_2) ? osal_monotime_to_16dot16(ts_3 - ts_2) : 0; + latency->write = (ts_4 > ts_3) ? osal_monotime_to_16dot16(ts_4 - ts_3) : 0; + latency->sync = (ts_5 > ts_4) ? osal_monotime_to_16dot16(ts_5 - ts_4) : 0; + const uint64_t ts_6 = osal_monotime(); + latency->ending = ts_5 ? osal_monotime_to_16dot16(ts_6 - ts_5) : 0; + latency->whole = osal_monotime_to_16dot16_noUnderflow(ts_6 - ts_0); } + return rc; - switch (rc) { - case ERROR_INVALID_PARAMETER: - /* pid seems invalid */ - return MDBX_RESULT_FALSE; - case WAIT_OBJECT_0: - /* process just exited */ - return MDBX_RESULT_FALSE; - case ERROR_ACCESS_DENIED: - /* The ERROR_ACCESS_DENIED would be returned for CSRSS-processes, etc. - * assume pid exists */ - return MDBX_RESULT_TRUE; - case WAIT_TIMEOUT: - /* pid running */ - return MDBX_RESULT_TRUE; - default: - /* failure */ - return rc; - } +fail: + txn->flags |= MDBX_TXN_ERROR; + if (latency) + take_gcprof(txn, latency); + txn_abort(txn); + goto provide_latency; } -//---------------------------------------------------------------------------- -// Stub for slim read-write lock -// Copyright (C) 1995-2002 Brad Wilson - -static void WINAPI stub_srwlock_Init(osal_srwlock_t *srwl) { - srwl->readerCount = srwl->writerCount = 0; -} +int txn_abort(MDBX_txn *txn) { + if (txn->flags & MDBX_TXN_RDONLY) + /* LY: don't close DBI-handles */ + return txn_end(txn, TXN_END_ABORT | TXN_END_UPDATE | TXN_END_SLOT | + TXN_END_FREE); -static void WINAPI stub_srwlock_AcquireShared(osal_srwlock_t *srwl) { - while (true) { - assert(srwl->writerCount >= 0 && srwl->readerCount >= 0); + if (unlikely(txn->flags & MDBX_TXN_FINISHED)) + return MDBX_BAD_TXN; - // If there's a writer already, spin without unnecessarily - // interlocking the CPUs - if (srwl->writerCount != 0) { - SwitchToThread(); - continue; - } + if (txn->nested) + txn_abort(txn->nested); - // Add to the readers list - _InterlockedIncrement(&srwl->readerCount); + 
tASSERT(txn, (txn->flags & MDBX_TXN_ERROR) || dpl_check(txn)); + return txn_end(txn, TXN_END_ABORT | TXN_END_SLOT | TXN_END_FREE); +} - // Check for writers again (we may have been preempted). If - // there are no writers writing or waiting, then we're done. - if (srwl->writerCount == 0) - break; +int txn_renew(MDBX_txn *txn, unsigned flags) { + MDBX_env *const env = txn->env; + int rc; - // Remove from the readers list, spin, try again - _InterlockedDecrement(&srwl->readerCount); - SwitchToThread(); +#if MDBX_ENV_CHECKPID + if (unlikely(env->pid != osal_getpid())) { + env->flags |= ENV_FATAL_ERROR; + return MDBX_PANIC; } -} - -static void WINAPI stub_srwlock_ReleaseShared(osal_srwlock_t *srwl) { - assert(srwl->readerCount > 0); - _InterlockedDecrement(&srwl->readerCount); -} +#endif /* MDBX_ENV_CHECKPID */ -static void WINAPI stub_srwlock_AcquireExclusive(osal_srwlock_t *srwl) { - while (true) { - assert(srwl->writerCount >= 0 && srwl->readerCount >= 0); + const uintptr_t tid = osal_thread_self(); + flags |= env->flags & (MDBX_NOSTICKYTHREADS | MDBX_WRITEMAP); + if (flags & MDBX_TXN_RDONLY) { + eASSERT(env, (flags & ~(txn_ro_begin_flags | MDBX_WRITEMAP | + MDBX_NOSTICKYTHREADS)) == 0); + txn->flags = flags; + reader_slot_t *r = txn->to.reader; + STATIC_ASSERT(sizeof(uintptr_t) <= sizeof(r->tid)); + if (likely(env->flags & ENV_TXKEY)) { + eASSERT(env, !(env->flags & MDBX_NOSTICKYTHREADS)); + r = thread_rthc_get(env->me_txkey); + if (likely(r)) { + if (unlikely(!r->pid.weak) && + (globals.runtime_flags & MDBX_DBG_LEGACY_MULTIOPEN)) { + thread_rthc_set(env->me_txkey, nullptr); + r = nullptr; + } else { + eASSERT(env, r->pid.weak == env->pid); + eASSERT(env, r->tid.weak == osal_thread_self()); + } + } + } else { + eASSERT(env, !env->lck_mmap.lck || (env->flags & MDBX_NOSTICKYTHREADS)); + } - // If there's a writer already, spin without unnecessarily - // interlocking the CPUs - if (srwl->writerCount != 0) { - SwitchToThread(); - continue; + if (likely(r)) { + if 
(unlikely(r->pid.weak != env->pid || + r->txnid.weak < SAFE64_INVALID_THRESHOLD)) + return MDBX_BAD_RSLOT; + } else if (env->lck_mmap.lck) { + bsr_t brs = mvcc_bind_slot(env, tid); + if (unlikely(brs.err != MDBX_SUCCESS)) + return brs.err; + r = brs.rslot; + } + txn->to.reader = r; + STATIC_ASSERT(MDBX_TXN_RDONLY_PREPARE > MDBX_TXN_RDONLY); + if (flags & (MDBX_TXN_RDONLY_PREPARE - MDBX_TXN_RDONLY)) { + eASSERT(env, txn->txnid == 0); + eASSERT(env, txn->owner == 0); + eASSERT(env, txn->n_dbi == 0); + if (likely(r)) { + eASSERT(env, r->snapshot_pages_used.weak == 0); + eASSERT(env, r->txnid.weak >= SAFE64_INVALID_THRESHOLD); + atomic_store32(&r->snapshot_pages_used, 0, mo_Relaxed); + } + txn->flags = MDBX_TXN_RDONLY | MDBX_TXN_FINISHED; + return MDBX_SUCCESS; } + txn->owner = tid; - // See if we can become the writer (expensive, because it inter- - // locks the CPUs, so writing should be an infrequent process) - if (_InterlockedExchange(&srwl->writerCount, 1) == 0) - break; - } + /* Seek & fetch the last meta */ + uint64_t timestamp = 0; + size_t loop = 0; + troika_t troika = meta_tap(env); + while (1) { + const meta_ptr_t head = + likely(env->stuck_meta < 0) + ? /* regular */ meta_recent(env, &troika) + : /* recovery mode */ meta_ptr(env, env->stuck_meta); + if (likely(r)) { + safe64_reset(&r->txnid, false); + atomic_store32(&r->snapshot_pages_used, + head.ptr_v->geometry.first_unallocated, mo_Relaxed); + atomic_store64( + &r->snapshot_pages_retired, + unaligned_peek_u64_volatile(4, head.ptr_v->pages_retired), + mo_Relaxed); + safe64_write(&r->txnid, head.txnid); + eASSERT(env, r->pid.weak == osal_getpid()); + eASSERT(env, r->tid.weak == ((env->flags & MDBX_NOSTICKYTHREADS) + ? 
0 + : osal_thread_self())); + eASSERT(env, r->txnid.weak == head.txnid || + (r->txnid.weak >= SAFE64_INVALID_THRESHOLD && + head.txnid < env->lck->cached_oldest.weak)); + atomic_store32(&env->lck->rdt_refresh_flag, true, mo_AcquireRelease); + } else { + /* exclusive mode without lck */ + eASSERT(env, !env->lck_mmap.lck && env->lck == lckless_stub(env)); + } + jitter4testing(true); - // Now we're the writer, but there may be outstanding readers. - // Spin until there aren't any more; new readers will wait now - // that we're the writer. - while (srwl->readerCount != 0) { - assert(srwl->writerCount >= 0 && srwl->readerCount >= 0); - SwitchToThread(); - } -} + /* Snap the state from current meta-head */ + txn->txnid = head.txnid; + if (likely(env->stuck_meta < 0) && + unlikely(meta_should_retry(env, &troika) || + head.txnid < atomic_load64(&env->lck->cached_oldest, + mo_AcquireRelease))) { + if (unlikely(++loop > 42)) { + ERROR("bailout waiting for valid snapshot (%s)", + "metapages are too volatile"); + rc = MDBX_PROBLEM; + txn->txnid = INVALID_TXNID; + if (likely(r)) + safe64_reset(&r->txnid, false); + goto bailout; + } + timestamp = 0; + continue; + } -static void WINAPI stub_srwlock_ReleaseExclusive(osal_srwlock_t *srwl) { - assert(srwl->writerCount == 1 && srwl->readerCount >= 0); - srwl->writerCount = 0; -} + rc = coherency_check_head(txn, head, ×tamp); + jitter4testing(false); + if (likely(rc == MDBX_SUCCESS)) + break; -static uint64_t WINAPI stub_GetTickCount64(void) { - LARGE_INTEGER Counter, Frequency; - return (QueryPerformanceFrequency(&Frequency) && - QueryPerformanceCounter(&Counter)) - ? 
Counter.QuadPart * 1000ul / Frequency.QuadPart - : 0; -} + if (unlikely(rc != MDBX_RESULT_TRUE)) { + txn->txnid = INVALID_TXNID; + if (likely(r)) + safe64_reset(&r->txnid, false); + goto bailout; + } + } -/*----------------------------------------------------------------------------*/ + if (unlikely(txn->txnid < MIN_TXNID || txn->txnid > MAX_TXNID)) { + ERROR("%s", "environment corrupted by died writer, must shutdown!"); + if (likely(r)) + safe64_reset(&r->txnid, false); + txn->txnid = INVALID_TXNID; + rc = MDBX_CORRUPTED; + goto bailout; + } + ENSURE(env, + txn->txnid >= + /* paranoia is appropriate here */ env->lck->cached_oldest.weak); + tASSERT(txn, txn->dbs[FREE_DBI].flags == MDBX_INTEGERKEY); + tASSERT(txn, check_sdb_flags(txn->dbs[MAIN_DBI].flags)); + } else { + eASSERT(env, (flags & ~(txn_rw_begin_flags | MDBX_TXN_SPILLS | + MDBX_WRITEMAP | MDBX_NOSTICKYTHREADS)) == 0); + if (unlikely(txn->owner == tid || + /* not recovery mode */ env->stuck_meta >= 0)) + return MDBX_BUSY; + lck_t *const lck = env->lck_mmap.lck; + if (lck && (env->flags & MDBX_NOSTICKYTHREADS) == 0 && + (globals.runtime_flags & MDBX_DBG_LEGACY_OVERLAP) == 0) { + const size_t snap_nreaders = + atomic_load32(&lck->rdt_length, mo_AcquireRelease); + for (size_t i = 0; i < snap_nreaders; ++i) { + if (atomic_load32(&lck->rdt[i].pid, mo_Relaxed) == env->pid && + unlikely(atomic_load64(&lck->rdt[i].tid, mo_Relaxed) == tid)) { + const txnid_t txnid = safe64_read(&lck->rdt[i].txnid); + if (txnid >= MIN_TXNID && txnid <= MAX_TXNID) + return MDBX_TXN_OVERLAPPING; + } + } + } -#ifndef xMDBX_ALLOY -osal_srwlock_t_function osal_srwlock_Init, osal_srwlock_AcquireShared, - osal_srwlock_ReleaseShared, osal_srwlock_AcquireExclusive, - osal_srwlock_ReleaseExclusive; - -MDBX_NtExtendSection mdbx_NtExtendSection; -MDBX_GetFileInformationByHandleEx mdbx_GetFileInformationByHandleEx; -MDBX_GetVolumeInformationByHandleW mdbx_GetVolumeInformationByHandleW; -MDBX_GetFinalPathNameByHandleW 
mdbx_GetFinalPathNameByHandleW; -MDBX_SetFileInformationByHandle mdbx_SetFileInformationByHandle; -MDBX_NtFsControlFile mdbx_NtFsControlFile; -MDBX_PrefetchVirtualMemory mdbx_PrefetchVirtualMemory; -MDBX_GetTickCount64 mdbx_GetTickCount64; -MDBX_RegGetValueA mdbx_RegGetValueA; -MDBX_SetFileIoOverlappedRange mdbx_SetFileIoOverlappedRange; -#endif /* xMDBX_ALLOY */ + /* Not yet touching txn == env->basal_txn, it may be active */ + jitter4testing(false); + rc = lck_txn_lock(env, !!(flags & MDBX_TXN_TRY)); + if (unlikely(rc)) + return rc; + if (unlikely(env->flags & ENV_FATAL_ERROR)) { + lck_txn_unlock(env); + return MDBX_PANIC; + } +#if defined(_WIN32) || defined(_WIN64) + if (unlikely(!env->dxb_mmap.base)) { + lck_txn_unlock(env); + return MDBX_EPERM; + } +#endif /* Windows */ -#if __GNUC_PREREQ(8, 0) -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wcast-function-type" -#endif /* GCC/MINGW */ + txn->tw.troika = meta_tap(env); + const meta_ptr_t head = meta_recent(env, &txn->tw.troika); + uint64_t timestamp = 0; + while ("workaround for https://libmdbx.dqdkfa.ru/dead-github/issues/269") { + rc = coherency_check_head(txn, head, ×tamp); + if (likely(rc == MDBX_SUCCESS)) + break; + if (unlikely(rc != MDBX_RESULT_TRUE)) + goto bailout; + } + eASSERT(env, meta_txnid(head.ptr_v) == head.txnid); + txn->txnid = safe64_txnid_next(head.txnid); + if (unlikely(txn->txnid > MAX_TXNID)) { + rc = MDBX_TXN_FULL; + ERROR("txnid overflow, raise %d", rc); + goto bailout; + } -static void mdbx_winnt_import(void) { -#define GET_PROC_ADDR(dll, ENTRY) \ - mdbx_##ENTRY = (MDBX_##ENTRY)GetProcAddress(dll, #ENTRY) + tASSERT(txn, txn->dbs[FREE_DBI].flags == MDBX_INTEGERKEY); + tASSERT(txn, check_sdb_flags(txn->dbs[MAIN_DBI].flags)); + txn->flags = flags; + txn->nested = nullptr; + txn->tw.loose_pages = nullptr; + txn->tw.loose_count = 0; +#if MDBX_ENABLE_REFUND + txn->tw.loose_refund_wl = 0; +#endif /* MDBX_ENABLE_REFUND */ + MDBX_PNL_SETSIZE(txn->tw.retired_pages, 0); + 
txn->tw.spilled.list = nullptr; + txn->tw.spilled.least_removed = 0; + txn->tw.gc.time_acc = 0; + txn->tw.gc.last_reclaimed = 0; + if (txn->tw.gc.reclaimed) + MDBX_PNL_SETSIZE(txn->tw.gc.reclaimed, 0); + env->txn = txn; - const HINSTANCE hNtdll = GetModuleHandleA("ntdll.dll"); - if (hNtdll) { - if (GetProcAddress(hNtdll, "wine_get_version")) { - assert(mdbx_RunningUnderWine()); + if ((txn->flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC) { + rc = dpl_alloc(txn); + if (unlikely(rc != MDBX_SUCCESS)) + goto bailout; + txn->tw.dirtyroom = txn->env->options.dp_limit; + txn->tw.dirtylru = MDBX_DEBUG ? UINT32_MAX / 3 - 42 : 0; } else { - GET_PROC_ADDR(hNtdll, NtFsControlFile); - GET_PROC_ADDR(hNtdll, NtExtendSection); - assert(!mdbx_RunningUnderWine()); + tASSERT(txn, txn->tw.dirtylist == nullptr); + txn->tw.dirtylist = nullptr; + txn->tw.dirtyroom = MAX_PAGENO; + txn->tw.dirtylru = 0; } + eASSERT(env, txn->tw.writemap_dirty_npages == 0); + eASSERT(env, txn->tw.writemap_spilled_npages == 0); } - const HINSTANCE hKernel32dll = GetModuleHandleA("kernel32.dll"); - if (hKernel32dll) { - GET_PROC_ADDR(hKernel32dll, GetFileInformationByHandleEx); - GET_PROC_ADDR(hKernel32dll, GetTickCount64); - if (!mdbx_GetTickCount64) - mdbx_GetTickCount64 = stub_GetTickCount64; - if (!mdbx_RunningUnderWine()) { - GET_PROC_ADDR(hKernel32dll, SetFileInformationByHandle); - GET_PROC_ADDR(hKernel32dll, GetVolumeInformationByHandleW); - GET_PROC_ADDR(hKernel32dll, GetFinalPathNameByHandleW); - GET_PROC_ADDR(hKernel32dll, PrefetchVirtualMemory); - GET_PROC_ADDR(hKernel32dll, SetFileIoOverlappedRange); + txn->front_txnid = + txn->txnid + ((flags & (MDBX_WRITEMAP | MDBX_RDONLY)) == 0); + + /* Setup db info */ + tASSERT(txn, txn->dbs[FREE_DBI].flags == MDBX_INTEGERKEY); + tASSERT(txn, check_sdb_flags(txn->dbs[MAIN_DBI].flags)); + VALGRIND_MAKE_MEM_UNDEFINED(txn->dbi_state, env->max_dbi); +#if MDBX_ENABLE_DBI_SPARSE + txn->n_dbi = CORE_DBS; + VALGRIND_MAKE_MEM_UNDEFINED( + txn->dbi_sparse, + 
ceil_powerof2(env->max_dbi, CHAR_BIT * sizeof(txn->dbi_sparse[0])) / + CHAR_BIT); + txn->dbi_sparse[0] = (1 << CORE_DBS) - 1; +#else + txn->n_dbi = (env->n_dbi < 8) ? env->n_dbi : 8; + if (txn->n_dbi > CORE_DBS) + memset(txn->dbi_state + CORE_DBS, 0, txn->n_dbi - CORE_DBS); +#endif /* MDBX_ENABLE_DBI_SPARSE */ + txn->dbi_state[FREE_DBI] = DBI_LINDO | DBI_VALID; + txn->dbi_state[MAIN_DBI] = DBI_LINDO | DBI_VALID; + txn->cursors[FREE_DBI] = nullptr; + txn->cursors[MAIN_DBI] = nullptr; + txn->dbi_seqs[FREE_DBI] = 0; + txn->dbi_seqs[MAIN_DBI] = + atomic_load32(&env->dbi_seqs[MAIN_DBI], mo_AcquireRelease); + + if (unlikely(env->dbs_flags[MAIN_DBI] != + (DB_VALID | txn->dbs[MAIN_DBI].flags))) { + const bool need_txn_lock = env->basal_txn && env->basal_txn->owner != tid; + bool should_unlock = false; + if (need_txn_lock) { + rc = lck_txn_lock(env, true); + if (rc == MDBX_SUCCESS) + should_unlock = true; + else if (rc != MDBX_BUSY && rc != MDBX_EDEADLK) + goto bailout; + } + rc = osal_fastmutex_acquire(&env->dbi_lock); + if (likely(rc == MDBX_SUCCESS)) { + uint32_t seq = dbi_seq_next(env, MAIN_DBI); + /* проверяем повторно после захвата блокировки */ + if (env->dbs_flags[MAIN_DBI] != (DB_VALID | txn->dbs[MAIN_DBI].flags)) { + if (!need_txn_lock || should_unlock || + /* если нет активной пишущей транзакции, + * то следующая будет ждать на dbi_lock */ + !env->txn) { + if (env->dbs_flags[MAIN_DBI] != 0 || MDBX_DEBUG) + NOTICE("renew MainDB for %s-txn %" PRIaTXN + " since db-flags changes 0x%x -> 0x%x", + (txn->flags & MDBX_TXN_RDONLY) ? 
"ro" : "rw", txn->txnid, + env->dbs_flags[MAIN_DBI] & ~DB_VALID, + txn->dbs[MAIN_DBI].flags); + env->dbs_flags[MAIN_DBI] = DB_POISON; + atomic_store32(&env->dbi_seqs[MAIN_DBI], seq, mo_AcquireRelease); + rc = sdb_setup(env, &env->kvs[MAIN_DBI], &txn->dbs[MAIN_DBI]); + if (likely(rc == MDBX_SUCCESS)) { + seq = dbi_seq_next(env, MAIN_DBI); + env->dbs_flags[MAIN_DBI] = DB_VALID | txn->dbs[MAIN_DBI].flags; + txn->dbi_seqs[MAIN_DBI] = atomic_store32(&env->dbi_seqs[MAIN_DBI], + seq, mo_AcquireRelease); + } + } else { + ERROR("MainDB db-flags changes 0x%x -> 0x%x ahead of read-txn " + "%" PRIaTXN, + txn->dbs[MAIN_DBI].flags, env->dbs_flags[MAIN_DBI] & ~DB_VALID, + txn->txnid); + rc = MDBX_INCOMPATIBLE; + } + } + ENSURE(env, osal_fastmutex_release(&env->dbi_lock) == MDBX_SUCCESS); + } else { + DEBUG("dbi_lock failed, err %d", rc); } + if (should_unlock) + lck_txn_unlock(env); + if (unlikely(rc != MDBX_SUCCESS)) + goto bailout; } - const osal_srwlock_t_function init = - (osal_srwlock_t_function)(hKernel32dll - ? 
GetProcAddress(hKernel32dll, - "InitializeSRWLock") - : nullptr); - if (init != NULL) { - osal_srwlock_Init = init; - osal_srwlock_AcquireShared = (osal_srwlock_t_function)GetProcAddress( - hKernel32dll, "AcquireSRWLockShared"); - osal_srwlock_ReleaseShared = (osal_srwlock_t_function)GetProcAddress( - hKernel32dll, "ReleaseSRWLockShared"); - osal_srwlock_AcquireExclusive = (osal_srwlock_t_function)GetProcAddress( - hKernel32dll, "AcquireSRWLockExclusive"); - osal_srwlock_ReleaseExclusive = (osal_srwlock_t_function)GetProcAddress( - hKernel32dll, "ReleaseSRWLockExclusive"); - } else { - osal_srwlock_Init = stub_srwlock_Init; - osal_srwlock_AcquireShared = stub_srwlock_AcquireShared; - osal_srwlock_ReleaseShared = stub_srwlock_ReleaseShared; - osal_srwlock_AcquireExclusive = stub_srwlock_AcquireExclusive; - osal_srwlock_ReleaseExclusive = stub_srwlock_ReleaseExclusive; + if (unlikely(txn->dbs[FREE_DBI].flags != MDBX_INTEGERKEY)) { + ERROR("unexpected/invalid db-flags 0x%u for GC/FreeDB", + txn->dbs[FREE_DBI].flags); + rc = MDBX_INCOMPATIBLE; + goto bailout; } - const HINSTANCE hAdvapi32dll = GetModuleHandleA("advapi32.dll"); - if (hAdvapi32dll) { - GET_PROC_ADDR(hAdvapi32dll, RegGetValueA); + tASSERT(txn, txn->dbs[FREE_DBI].flags == MDBX_INTEGERKEY); + tASSERT(txn, check_sdb_flags(txn->dbs[MAIN_DBI].flags)); + if (unlikely(env->flags & ENV_FATAL_ERROR)) { + WARNING("%s", "environment had fatal error, must shutdown!"); + rc = MDBX_PANIC; + } else { + const size_t size_bytes = pgno2bytes(env, txn->geo.end_pgno); + const size_t used_bytes = pgno2bytes(env, txn->geo.first_unallocated); + const size_t required_bytes = + (txn->flags & MDBX_TXN_RDONLY) ? used_bytes : size_bytes; + eASSERT(env, env->dxb_mmap.limit >= env->dxb_mmap.current); + if (unlikely(required_bytes > env->dxb_mmap.current)) { + /* Размер БД (для пишущих транзакций) или используемых данных (для + * читающих транзакций) больше предыдущего/текущего размера внутри + * процесса, увеличиваем. 
Сюда также попадает случай увеличения верхней + * границы размера БД и отображения. В читающих транзакциях нельзя + * изменять размер файла, который может быть больше необходимого этой + * транзакции. */ + if (txn->geo.upper > MAX_PAGENO + 1 || + bytes2pgno(env, pgno2bytes(env, txn->geo.upper)) != txn->geo.upper) { + rc = MDBX_UNABLE_EXTEND_MAPSIZE; + goto bailout; + } + rc = dxb_resize(env, txn->geo.first_unallocated, txn->geo.end_pgno, + txn->geo.upper, implicit_grow); + if (unlikely(rc != MDBX_SUCCESS)) + goto bailout; + eASSERT(env, env->dxb_mmap.limit >= env->dxb_mmap.current); + } else if (unlikely(size_bytes < env->dxb_mmap.current)) { + /* Размер БД меньше предыдущего/текущего размера внутри процесса, можно + * уменьшить, но всё сложнее: + * - размер файла согласован со всеми читаемыми снимками на момент + * коммита последней транзакции; + * - в читающей транзакции размер файла может быть больше и него нельзя + * изменять, в том числе менять madvise (меньша размера файла нельзя, + * а за размером нет смысла). + * - в пишущей транзакции уменьшать размер файла можно только после + * проверки размера читаемых снимков, но в этом нет смысла, так как + * это будет сделано при фиксации транзакции. + * + * В сухом остатке, можно только установить dxb_mmap.current равным + * размеру файла, а это проще сделать без вызова dxb_resize() и усложения + * внутренней логики. + * + * В этой тактике есть недостаток: если пишущите транзакции не регулярны, + * и при завершении такой транзакции файл БД остаётся не-уменьшеным из-за + * читающих транзакций использующих предыдущие снимки. 
*/ +#if defined(_WIN32) || defined(_WIN64) + imports.srwl_AcquireShared(&env->remap_guard); +#else + rc = osal_fastmutex_acquire(&env->remap_guard); +#endif + if (likely(rc == MDBX_SUCCESS)) { + eASSERT(env, env->dxb_mmap.limit >= env->dxb_mmap.current); + rc = osal_filesize(env->dxb_mmap.fd, &env->dxb_mmap.filesize); + if (likely(rc == MDBX_SUCCESS)) { + eASSERT(env, env->dxb_mmap.filesize >= required_bytes); + if (env->dxb_mmap.current > env->dxb_mmap.filesize) + env->dxb_mmap.current = + (env->dxb_mmap.limit < env->dxb_mmap.filesize) + ? env->dxb_mmap.limit + : (size_t)env->dxb_mmap.filesize; + } +#if defined(_WIN32) || defined(_WIN64) + imports.srwl_ReleaseShared(&env->remap_guard); +#else + int err = osal_fastmutex_release(&env->remap_guard); + if (unlikely(err) && likely(rc == MDBX_SUCCESS)) + rc = err; +#endif + } + if (unlikely(rc != MDBX_SUCCESS)) + goto bailout; + } + eASSERT(env, pgno2bytes(env, txn->geo.first_unallocated) <= + env->dxb_mmap.current); + eASSERT(env, env->dxb_mmap.limit >= env->dxb_mmap.current); + if (txn->flags & MDBX_TXN_RDONLY) { +#if defined(_WIN32) || defined(_WIN64) + if (((used_bytes > env->geo_in_bytes.lower && env->geo_in_bytes.shrink) || + (globals.running_under_Wine && + /* under Wine acquisition of remap_guard is always required, + * since Wine don't support section extending, + * i.e. in both cases unmap+map are required. 
*/ + used_bytes < env->geo_in_bytes.upper && env->geo_in_bytes.grow)) && + /* avoid recursive use SRW */ (txn->flags & MDBX_NOSTICKYTHREADS) == + 0) { + txn->flags |= txn_shrink_allowed; + imports.srwl_AcquireShared(&env->remap_guard); + } +#endif /* Windows */ + } else { + tASSERT(txn, txn == env->basal_txn); + MDBX_cursor *const gc = ptr_disp(txn, sizeof(MDBX_txn)); + rc = cursor_init(gc, txn, FREE_DBI); + if (rc != MDBX_SUCCESS) + goto bailout; + } + dxb_sanitize_tail(env, txn); + return MDBX_SUCCESS; } -#undef GET_PROC_ADDR +bailout: + tASSERT(txn, rc != MDBX_SUCCESS); + txn_end(txn, TXN_END_SLOT | TXN_END_FAIL_BEGIN); + return rc; } -#if __GNUC_PREREQ(8, 0) -#pragma GCC diagnostic pop -#endif /* GCC/MINGW */ - -#endif /* Windows LCK-implementation */ -/* - * Copyright 2015-2024 Leonid Yuriev - * and other libmdbx authors: please see AUTHORS file. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted only as authorized by the OpenLDAP - * Public License. - * - * A copy of this license is available in the file LICENSE in the - * top-level directory of the distribution or, alternatively, at - * . 
- */ - -#if !(defined(_WIN32) || defined(_WIN64)) /* !Windows LCK-implementation */ - - -#if MDBX_LOCKING == MDBX_LOCKING_SYSV -#include -#endif /* MDBX_LOCKING == MDBX_LOCKING_SYSV */ - -/*----------------------------------------------------------------------------*/ -/* global constructor/destructor */ - -#if defined(__linux__) || defined(__gnu_linux__) - -#include - -MDBX_INTERNAL_VAR_INSTA uint32_t linux_kernel_version; -MDBX_INTERNAL_VAR_INSTA bool - mdbx_RunningOnWSL1 /* Windows Subsystem 1 for Linux */; +int txn_end(MDBX_txn *txn, const unsigned mode) { + MDBX_env *env = txn->env; + static const char *const names[] = TXN_END_NAMES; -MDBX_EXCLUDE_FOR_GPROF -__cold static uint8_t probe_for_WSL(const char *tag) { - const char *const WSL = strstr(tag, "WSL"); - if (WSL && WSL[3] >= '2' && WSL[3] <= '9') - return WSL[3] - '0'; - const char *const wsl = strstr(tag, "wsl"); - if (wsl && wsl[3] >= '2' && wsl[3] <= '9') - return wsl[3] - '0'; - if (WSL || wsl || strcasestr(tag, "Microsoft")) - /* Expecting no new kernel within WSL1, either it will explicitly - * marked by an appropriate WSL-version hint. */ - return (linux_kernel_version < /* 4.19.x */ 0x04130000) ? 1 : 2; - return 0; -} + DEBUG("%s txn %" PRIaTXN "%c %p on env %p, root page %" PRIaPGNO + "/%" PRIaPGNO, + names[mode & TXN_END_OPMASK], txn->txnid, + (txn->flags & MDBX_TXN_RDONLY) ? 
'r' : 'w', (void *)txn, (void *)env, + txn->dbs[MAIN_DBI].root, txn->dbs[FREE_DBI].root); -#endif /* Linux */ + if (!(mode & TXN_END_EOTDONE)) /* !(already closed cursors) */ + done_cursors(txn, false); -#ifdef ENABLE_GPROF -extern void _mcleanup(void); -extern void monstartup(unsigned long, unsigned long); -extern void _init(void); -extern void _fini(void); -extern void __gmon_start__(void) __attribute__((__weak__)); -#endif /* ENABLE_GPROF */ + int rc = MDBX_SUCCESS; + if (txn->flags & MDBX_TXN_RDONLY) { + if (txn->to.reader) { + reader_slot_t *slot = txn->to.reader; + eASSERT(env, slot->pid.weak == env->pid); + if (likely(!(txn->flags & MDBX_TXN_FINISHED))) { + ENSURE(env, txn->txnid >= + /* paranoia is appropriate here */ env->lck + ->cached_oldest.weak); + eASSERT(env, txn->txnid == slot->txnid.weak && + slot->txnid.weak >= env->lck->cached_oldest.weak); + dxb_sanitize_tail(env, nullptr); + atomic_store32(&slot->snapshot_pages_used, 0, mo_Relaxed); + safe64_reset(&slot->txnid, false); + atomic_store32(&env->lck->rdt_refresh_flag, true, mo_Relaxed); + } else { + eASSERT(env, slot->pid.weak == env->pid); + eASSERT(env, slot->txnid.weak >= SAFE64_INVALID_THRESHOLD); + } + if (mode & TXN_END_SLOT) { + if ((env->flags & ENV_TXKEY) == 0) + atomic_store32(&slot->pid, 0, mo_Relaxed); + txn->to.reader = nullptr; + } + } +#if defined(_WIN32) || defined(_WIN64) + if (txn->flags & txn_shrink_allowed) + imports.srwl_ReleaseShared(&env->remap_guard); +#endif + txn->n_dbi = 0; /* prevent further DBI activity */ + txn->flags = MDBX_TXN_RDONLY | MDBX_TXN_FINISHED; + txn->owner = 0; + } else if (!(txn->flags & MDBX_TXN_FINISHED)) { + ENSURE(env, + txn->txnid >= + /* paranoia is appropriate here */ env->lck->cached_oldest.weak); + if (txn == env->basal_txn) + dxb_sanitize_tail(env, nullptr); + + txn->flags = MDBX_TXN_FINISHED; + env->txn = txn->parent; + pnl_free(txn->tw.spilled.list); + txn->tw.spilled.list = nullptr; + if (txn == env->basal_txn) { + eASSERT(env, txn->parent == 
nullptr); + /* Export or close DBI handles created in this txn */ + rc = dbi_update(txn, mode & TXN_END_UPDATE); + pnl_shrink(&txn->tw.retired_pages); + pnl_shrink(&txn->tw.relist); + if (!(env->flags & MDBX_WRITEMAP)) + dpl_release_shadows(txn); + /* The writer mutex was locked in mdbx_txn_begin. */ + lck_txn_unlock(env); + } else { + eASSERT(env, txn->parent != nullptr); + MDBX_txn *const parent = txn->parent; + eASSERT(env, parent->signature == txn_signature); + eASSERT(env, parent->nested == txn && + (parent->flags & MDBX_TXN_HAS_CHILD) != 0); + eASSERT(env, + pnl_check_allocated(txn->tw.relist, txn->geo.first_unallocated - + MDBX_ENABLE_REFUND)); + eASSERT(env, memcmp(&txn->tw.troika, &parent->tw.troika, + sizeof(troika_t)) == 0); -MDBX_EXCLUDE_FOR_GPROF -__cold static __attribute__((__constructor__)) void -mdbx_global_constructor(void) { -#ifdef ENABLE_GPROF - if (!&__gmon_start__) - monstartup((uintptr_t)&_init, (uintptr_t)&_fini); -#endif /* ENABLE_GPROF */ + txn->owner = 0; + if (txn->tw.gc.reclaimed) { + eASSERT(env, MDBX_PNL_GETSIZE(txn->tw.gc.reclaimed) >= + (uintptr_t)parent->tw.gc.reclaimed); + MDBX_PNL_SETSIZE(txn->tw.gc.reclaimed, + (uintptr_t)parent->tw.gc.reclaimed); + parent->tw.gc.reclaimed = txn->tw.gc.reclaimed; + } -#if defined(__linux__) || defined(__gnu_linux__) - struct utsname buffer; - if (uname(&buffer) == 0) { - int i = 0; - char *p = buffer.release; - while (*p && i < 4) { - if (*p >= '0' && *p <= '9') { - long number = strtol(p, &p, 10); - if (number > 0) { - if (number > 255) - number = 255; - linux_kernel_version += number << (24 - i * 8); + if (txn->tw.retired_pages) { + eASSERT(env, MDBX_PNL_GETSIZE(txn->tw.retired_pages) >= + (uintptr_t)parent->tw.retired_pages); + MDBX_PNL_SETSIZE(txn->tw.retired_pages, + (uintptr_t)parent->tw.retired_pages); + parent->tw.retired_pages = txn->tw.retired_pages; + } + + parent->nested = nullptr; + parent->flags &= ~MDBX_TXN_HAS_CHILD; + parent->tw.dirtylru = txn->tw.dirtylru; + tASSERT(parent, 
dpl_check(parent)); + tASSERT(parent, audit_ex(parent, 0, false) == 0); + dpl_release_shadows(txn); + dpl_free(txn); + pnl_free(txn->tw.relist); + + if (parent->geo.upper != txn->geo.upper || + parent->geo.now != txn->geo.now) { + /* undo resize performed by child txn */ + rc = dxb_resize(env, parent->geo.first_unallocated, parent->geo.now, + parent->geo.upper, impilict_shrink); + if (rc == MDBX_EPERM) { + /* unable undo resize (it is regular for Windows), + * therefore promote size changes from child to the parent txn */ + WARNING("unable undo resize performed by child txn, promote to " + "the parent (%u->%u, %u->%u)", + txn->geo.now, parent->geo.now, txn->geo.upper, + parent->geo.upper); + parent->geo.now = txn->geo.now; + parent->geo.upper = txn->geo.upper; + parent->flags |= MDBX_TXN_DIRTY; + rc = MDBX_SUCCESS; + } else if (unlikely(rc != MDBX_SUCCESS)) { + ERROR("error %d while undo resize performed by child txn, fail " + "the parent", + rc); + parent->flags |= MDBX_TXN_ERROR; + if (!env->dxb_mmap.base) + env->flags |= ENV_FATAL_ERROR; } - ++i; - } else { - ++p; } } - /* "Official" way of detecting WSL1 but not WSL2 - * https://github.com/Microsoft/WSL/issues/423#issuecomment-221627364 - * - * WARNING: False negative detection of WSL1 will result in DATA LOSS! - * So, the REQUIREMENTS for this code: - * 1. MUST detect WSL1 without false-negatives. - * 2. DESIRABLE detect WSL2 but without the risk of violating the first. 
*/ - mdbx_RunningOnWSL1 = probe_for_WSL(buffer.version) == 1 || - probe_for_WSL(buffer.sysname) == 1 || - probe_for_WSL(buffer.release) == 1; } -#endif /* Linux */ - global_ctor(); -} + eASSERT(env, txn == env->basal_txn || txn->owner == 0); + if ((mode & TXN_END_FREE) != 0 && txn != env->basal_txn) { + txn->signature = 0; + osal_free(txn); + } -MDBX_EXCLUDE_FOR_GPROF -__cold static __attribute__((__destructor__)) void -mdbx_global_destructor(void) { - global_dtor(); -#ifdef ENABLE_GPROF - if (!&__gmon_start__) - _mcleanup(); -#endif /* ENABLE_GPROF */ + return rc; } /*----------------------------------------------------------------------------*/ -/* lck */ -/* Описание реализации блокировок для POSIX & Linux: - * - * lck-файл отображается в память, в нём организуется таблица читателей и - * размещаются совместно используемые posix-мьютексы (futex). Посредством - * этих мьютексов (см struct MDBX_lockinfo) реализуются: - * - Блокировка таблицы читателей для регистрации, - * т.е. функции osal_rdt_lock() и osal_rdt_unlock(). - * - Блокировка БД для пишущих транзакций, - * т.е. функции osal_txn_lock() и osal_txn_unlock(). - * - * Остальной функционал реализуется отдельно посредством файловых блокировок: - * - Первоначальный захват БД в режиме exclusive/shared и последующий перевод - * в операционный режим, функции osal_lck_seize() и osal_lck_downgrade(). - * - Проверка присутствие процессов-читателей, - * т.е. функции osal_rpid_set(), osal_rpid_clear() и osal_rpid_check(). - * - * Для блокировки файлов используется fcntl(F_SETLK), так как: - * - lockf() оперирует только эксклюзивной блокировкой и требует - * открытия файла в RW-режиме. - * - flock() не гарантирует атомарности при смене блокировок - * и оперирует только всем файлом целиком. - * - Для контроля процессов-читателей используются однобайтовые - * range-блокировки lck-файла посредством fcntl(F_SETLK). При этом - * в качестве позиции используется pid процесса-читателя. 
- * - Для первоначального захвата и shared/exclusive выполняется блокировка - * основного файла БД и при успехе lck-файла. - * - * ---------------------------------------------------------------------------- - * УДЕРЖИВАЕМЫЕ БЛОКИРОВКИ В ЗАВИСИМОСТИ ОТ РЕЖИМА И СОСТОЯНИЯ - * - * Эксклюзивный режим без lck-файла: - * = заблокирован весь dxb-файл посредством F_RDLCK или F_WRLCK, - * в зависимости от MDBX_RDONLY. - * - * Не-операционный режим на время пере-инициализации и разрушении lck-файла: - * = F_WRLCK блокировка первого байта lck-файла, другие процессы ждут её - * снятия при получении F_RDLCK через F_SETLKW. - * - блокировки dxb-файла могут меняться до снятие эксклюзивной блокировки - * lck-файла: - * + для НЕ-эксклюзивного режима блокировка pid-байта в dxb-файле - * посредством F_RDLCK или F_WRLCK, в зависимости от MDBX_RDONLY. - * + для ЭКСКЛЮЗИВНОГО режима блокировка всего dxb-файла - * посредством F_RDLCK или F_WRLCK, в зависимости от MDBX_RDONLY. - * - * ОПЕРАЦИОННЫЙ режим с lck-файлом: - * = F_RDLCK блокировка первого байта lck-файла, другие процессы не могут - * получить F_WRLCK и таким образом видят что БД используется. - * + F_WRLCK блокировка pid-байта в clk-файле после первой транзакции чтения. - * + для НЕ-эксклюзивного режима блокировка pid-байта в dxb-файле - * посредством F_RDLCK или F_WRLCK, в зависимости от MDBX_RDONLY. - * + для ЭКСКЛЮЗИВНОГО режима блокировка pid-байта всего dxb-файла - * посредством F_RDLCK или F_WRLCK, в зависимости от MDBX_RDONLY. - */ +int mdbx_txn_renew(MDBX_txn *txn) { + if (unlikely(!txn)) + return MDBX_EINVAL; -#if MDBX_USE_OFDLOCKS -static int op_setlk, op_setlkw, op_getlk; -__cold static void choice_fcntl(void) { - assert(!op_setlk && !op_setlkw && !op_getlk); - if ((mdbx_static.flags & MDBX_DBG_LEGACY_MULTIOPEN) == 0 -#if defined(__linux__) || defined(__gnu_linux__) - && linux_kernel_version > - 0x030f0000 /* OFD locks are available since 3.15, but engages here - only for 3.16 and later kernels (i.e. 
LTS) because - of reliability reasons */ -#endif /* linux */ - ) { - op_setlk = MDBX_F_OFD_SETLK; - op_setlkw = MDBX_F_OFD_SETLKW; - op_getlk = MDBX_F_OFD_GETLK; - return; + if (unlikely(txn->signature != txn_signature)) + return MDBX_EBADSIGN; + + if (unlikely((txn->flags & MDBX_TXN_RDONLY) == 0)) + return MDBX_EINVAL; + + int rc; + if (unlikely(txn->owner != 0 || !(txn->flags & MDBX_TXN_FINISHED))) { + rc = mdbx_txn_reset(txn); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; } - op_setlk = MDBX_F_SETLK; - op_setlkw = MDBX_F_SETLKW; - op_getlk = MDBX_F_GETLK; + + rc = txn_renew(txn, MDBX_TXN_RDONLY); + if (rc == MDBX_SUCCESS) { + tASSERT(txn, txn->owner == osal_thread_self()); + DEBUG("renew txn %" PRIaTXN "%c %p on env %p, root page %" PRIaPGNO + "/%" PRIaPGNO, + txn->txnid, (txn->flags & MDBX_TXN_RDONLY) ? 'r' : 'w', (void *)txn, + (void *)txn->env, txn->dbs[MAIN_DBI].root, txn->dbs[FREE_DBI].root); + } + return rc; } -#else -#define op_setlk MDBX_F_SETLK -#define op_setlkw MDBX_F_SETLKW -#define op_getlk MDBX_F_GETLK -#endif /* MDBX_USE_OFDLOCKS */ -static int lck_op(const mdbx_filehandle_t fd, int cmd, const int lck, - const off_t offset, off_t len) { - STATIC_ASSERT(sizeof(off_t) >= sizeof(void *) && - sizeof(off_t) >= sizeof(size_t)); -#ifdef __ANDROID_API__ - STATIC_ASSERT_MSG((sizeof(off_t) * 8 == MDBX_WORDBITS), - "The bitness of system `off_t` type is mismatch. 
Please " - "fix build and/or NDK configuration."); -#endif /* Android */ - jitter4testing(true); - assert(offset >= 0 && len > 0); - assert((uint64_t)offset < (uint64_t)INT64_MAX && - (uint64_t)len < (uint64_t)INT64_MAX && - (uint64_t)(offset + len) > (uint64_t)offset); +int mdbx_txn_set_userctx(MDBX_txn *txn, void *ctx) { + int rc = check_txn(txn, MDBX_TXN_FINISHED); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; - assert((uint64_t)offset < (uint64_t)OFF_T_MAX && - (uint64_t)len <= (uint64_t)OFF_T_MAX && - (uint64_t)(offset + len) <= (uint64_t)OFF_T_MAX); + txn->userctx = ctx; + return MDBX_SUCCESS; +} - assert((uint64_t)((off_t)((uint64_t)offset + (uint64_t)len)) == - ((uint64_t)offset + (uint64_t)len)); - for (;;) { - MDBX_STRUCT_FLOCK lock_op; - STATIC_ASSERT_MSG(sizeof(off_t) <= sizeof(lock_op.l_start) && - sizeof(off_t) <= sizeof(lock_op.l_len) && - OFF_T_MAX == (off_t)OFF_T_MAX, - "Support for large/64-bit-sized files is misconfigured " - "for the target system and/or toolchain. " - "Please fix it or at least disable it completely."); - memset(&lock_op, 0, sizeof(lock_op)); - lock_op.l_type = lck; - lock_op.l_whence = SEEK_SET; - lock_op.l_start = offset; - lock_op.l_len = len; - int rc = MDBX_FCNTL(fd, cmd, &lock_op); - jitter4testing(true); - if (rc != -1) { - if (cmd == op_getlk) { - /* Checks reader by pid. Returns: - * MDBX_RESULT_TRUE - if pid is live (reader holds a lock). - * MDBX_RESULT_FALSE - if pid is dead (a lock could be placed). */ - return (lock_op.l_type == F_UNLCK) ? MDBX_RESULT_FALSE - : MDBX_RESULT_TRUE; - } - return MDBX_SUCCESS; +void *mdbx_txn_get_userctx(const MDBX_txn *txn) { + return check_txn(txn, MDBX_TXN_FINISHED) ? 
nullptr : txn->userctx; +} + +int mdbx_txn_begin_ex(MDBX_env *env, MDBX_txn *parent, MDBX_txn_flags_t flags, + MDBX_txn **ret, void *context) { + if (unlikely(!ret)) + return MDBX_EINVAL; + *ret = nullptr; + + if (unlikely((flags & ~txn_rw_begin_flags) && (flags & ~txn_ro_begin_flags))) + return MDBX_EINVAL; + + int rc = check_env(env, true); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; + + if (unlikely(env->flags & MDBX_RDONLY & ~flags)) /* write txn in RDONLY env */ + return MDBX_EACCESS; + + MDBX_txn *txn = nullptr; + if (parent) { + /* Nested transactions: Max 1 child, write txns only, no writemap */ + rc = check_txn_rw(parent, + MDBX_TXN_RDONLY | MDBX_WRITEMAP | MDBX_TXN_BLOCKED); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; + + if (env->options.spill_parent4child_denominator) { + /* Spill dirty-pages of parent to provide dirtyroom for child txn */ + rc = txn_spill(parent, nullptr, + parent->tw.dirtylist->length / + env->options.spill_parent4child_denominator); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; } - rc = errno; -#if MDBX_USE_OFDLOCKS - if (rc == EINVAL && (cmd == MDBX_F_OFD_SETLK || cmd == MDBX_F_OFD_SETLKW || - cmd == MDBX_F_OFD_GETLK)) { - /* fallback to non-OFD locks */ - if (cmd == MDBX_F_OFD_SETLK) - cmd = MDBX_F_SETLK; - else if (cmd == MDBX_F_OFD_SETLKW) - cmd = MDBX_F_SETLKW; - else - cmd = MDBX_F_GETLK; - op_setlk = MDBX_F_SETLK; - op_setlkw = MDBX_F_SETLKW; - op_getlk = MDBX_F_GETLK; - continue; + tASSERT(parent, audit_ex(parent, 0, false) == 0); + + flags |= parent->flags & (txn_rw_begin_flags | MDBX_TXN_SPILLS | + MDBX_NOSTICKYTHREADS | MDBX_WRITEMAP); + } else if (flags & MDBX_TXN_RDONLY) { + if ((env->flags & MDBX_NOSTICKYTHREADS) == 0 && env->txn && + unlikely(env->basal_txn->owner == osal_thread_self()) && + (globals.runtime_flags & MDBX_DBG_LEGACY_OVERLAP) == 0) + return MDBX_TXN_OVERLAPPING; + } else { + /* Reuse preallocated write txn. 
However, do not touch it until + * txn_renew() succeeds, since it currently may be active. */ + txn = env->basal_txn; + goto renew; + } + + const intptr_t bitmap_bytes = +#if MDBX_ENABLE_DBI_SPARSE + ceil_powerof2(env->max_dbi, CHAR_BIT * sizeof(txn->dbi_sparse[0])) / + CHAR_BIT; +#else + 0; +#endif /* MDBX_ENABLE_DBI_SPARSE */ + STATIC_ASSERT(sizeof(txn->tw) > sizeof(txn->to)); + const size_t base = (flags & MDBX_TXN_RDONLY) + ? sizeof(MDBX_txn) - sizeof(txn->tw) + sizeof(txn->to) + : sizeof(MDBX_txn); + const size_t size = + base + + ((flags & MDBX_TXN_RDONLY) + ? (size_t)bitmap_bytes + env->max_dbi * sizeof(txn->dbi_seqs[0]) + : 0) + + env->max_dbi * (sizeof(txn->dbs[0]) + sizeof(txn->cursors[0]) + + sizeof(txn->dbi_state[0])); + txn = osal_malloc(size); + if (unlikely(txn == nullptr)) { + DEBUG("calloc: %s", "failed"); + return MDBX_ENOMEM; + } +#if MDBX_DEBUG + memset(txn, 0xCD, size); + VALGRIND_MAKE_MEM_UNDEFINED(txn, size); +#endif /* MDBX_DEBUG */ + MDBX_ANALYSIS_ASSUME(size > base); + memset(txn, 0, + (MDBX_GOOFY_MSVC_STATIC_ANALYZER && base > size) ? size : base); + txn->dbs = ptr_disp(txn, base); + txn->cursors = ptr_disp(txn->dbs, env->max_dbi * sizeof(txn->dbs[0])); +#if MDBX_DEBUG + txn->cursors[FREE_DBI] = nullptr; /* avoid SIGSEGV in an assertion later */ +#endif + txn->dbi_state = + ptr_disp(txn, size - env->max_dbi * sizeof(txn->dbi_state[0])); + txn->flags = flags; + txn->env = env; + + if (parent) { + tASSERT(parent, dpl_check(parent)); +#if MDBX_ENABLE_DBI_SPARSE + txn->dbi_sparse = parent->dbi_sparse; +#endif /* MDBX_ENABLE_DBI_SPARSE */ + txn->dbi_seqs = parent->dbi_seqs; + txn->geo = parent->geo; + rc = dpl_alloc(txn); + if (likely(rc == MDBX_SUCCESS)) { + const size_t len = + MDBX_PNL_GETSIZE(parent->tw.relist) + parent->tw.loose_count; + txn->tw.relist = + pnl_alloc((len > MDBX_PNL_INITIAL) ? 
len : MDBX_PNL_INITIAL); + if (unlikely(!txn->tw.relist)) + rc = MDBX_ENOMEM; } -#endif /* MDBX_USE_OFDLOCKS */ - if (rc != EINTR || cmd == op_setlkw) { - assert(MDBX_IS_ERROR(rc)); + if (unlikely(rc != MDBX_SUCCESS)) { + nested_failed: + pnl_free(txn->tw.relist); + dpl_free(txn); + osal_free(txn); return rc; } - } -} - -MDBX_INTERNAL_FUNC int osal_lockfile(mdbx_filehandle_t fd, bool wait) { -#if MDBX_USE_OFDLOCKS - if (unlikely(op_setlk == 0)) - choice_fcntl(); -#endif /* MDBX_USE_OFDLOCKS */ - return lck_op(fd, wait ? op_setlkw : op_setlk, F_WRLCK, 0, OFF_T_MAX); -} - -MDBX_INTERNAL_FUNC int osal_rpid_set(MDBX_env *env) { - assert(env->me_lfd != INVALID_HANDLE_VALUE); - assert(env->me_pid > 0); - if (unlikely(osal_getpid() != env->me_pid)) - return MDBX_PANIC; - return lck_op(env->me_lfd, op_setlk, F_WRLCK, env->me_pid, 1); -} - -MDBX_INTERNAL_FUNC int osal_rpid_clear(MDBX_env *env) { - assert(env->me_lfd != INVALID_HANDLE_VALUE); - assert(env->me_pid > 0); - return lck_op(env->me_lfd, op_setlk, F_UNLCK, env->me_pid, 1); -} -MDBX_INTERNAL_FUNC int osal_rpid_check(MDBX_env *env, uint32_t pid) { - assert(env->me_lfd != INVALID_HANDLE_VALUE); - assert(pid > 0); - return lck_op(env->me_lfd, op_getlk, F_WRLCK, pid, 1); -} + /* Move loose pages to reclaimed list */ + if (parent->tw.loose_count) { + do { + page_t *lp = parent->tw.loose_pages; + tASSERT(parent, lp->flags == P_LOOSE); + rc = pnl_insert_span(&parent->tw.relist, lp->pgno, 1); + if (unlikely(rc != MDBX_SUCCESS)) + goto nested_failed; + MDBX_ASAN_UNPOISON_MEMORY_REGION(&page_next(lp), sizeof(page_t *)); + VALGRIND_MAKE_MEM_DEFINED(&page_next(lp), sizeof(page_t *)); + parent->tw.loose_pages = page_next(lp); + /* Remove from dirty list */ + page_wash(parent, dpl_exist(parent, lp->pgno), lp, 1); + } while (parent->tw.loose_pages); + parent->tw.loose_count = 0; +#if MDBX_ENABLE_REFUND + parent->tw.loose_refund_wl = 0; +#endif /* MDBX_ENABLE_REFUND */ + tASSERT(parent, dpl_check(parent)); + } + txn->tw.dirtyroom = 
parent->tw.dirtyroom; + txn->tw.dirtylru = parent->tw.dirtylru; -/*---------------------------------------------------------------------------*/ + dpl_sort(parent); + if (parent->tw.spilled.list) + spill_purge(parent); -#if MDBX_LOCKING > MDBX_LOCKING_SYSV -MDBX_INTERNAL_FUNC int osal_ipclock_stubinit(osal_ipclock_t *ipc) { -#if MDBX_LOCKING == MDBX_LOCKING_POSIX1988 - return sem_init(ipc, false, 1) ? errno : 0; -#elif MDBX_LOCKING == MDBX_LOCKING_POSIX2001 || \ - MDBX_LOCKING == MDBX_LOCKING_POSIX2008 - return pthread_mutex_init(ipc, nullptr); -#else -#error "FIXME" -#endif -} + tASSERT(txn, MDBX_PNL_ALLOCLEN(txn->tw.relist) >= + MDBX_PNL_GETSIZE(parent->tw.relist)); + memcpy(txn->tw.relist, parent->tw.relist, + MDBX_PNL_SIZEOF(parent->tw.relist)); + eASSERT(env, pnl_check_allocated( + txn->tw.relist, + (txn->geo.first_unallocated /* LY: intentional assignment + here, only for assertion */ + = parent->geo.first_unallocated) - + MDBX_ENABLE_REFUND)); -MDBX_INTERNAL_FUNC int osal_ipclock_destroy(osal_ipclock_t *ipc) { -#if MDBX_LOCKING == MDBX_LOCKING_POSIX1988 - return sem_destroy(ipc) ? 
errno : 0; -#elif MDBX_LOCKING == MDBX_LOCKING_POSIX2001 || \ - MDBX_LOCKING == MDBX_LOCKING_POSIX2008 - return pthread_mutex_destroy(ipc); -#else -#error "FIXME" -#endif -} -#endif /* MDBX_LOCKING > MDBX_LOCKING_SYSV */ + txn->tw.gc.time_acc = parent->tw.gc.time_acc; + txn->tw.gc.last_reclaimed = parent->tw.gc.last_reclaimed; + if (parent->tw.gc.reclaimed) { + txn->tw.gc.reclaimed = parent->tw.gc.reclaimed; + parent->tw.gc.reclaimed = + (void *)(intptr_t)MDBX_PNL_GETSIZE(parent->tw.gc.reclaimed); + } -static int check_fstat(MDBX_env *env) { - struct stat st; + txn->tw.retired_pages = parent->tw.retired_pages; + parent->tw.retired_pages = + (void *)(intptr_t)MDBX_PNL_GETSIZE(parent->tw.retired_pages); - int rc = MDBX_SUCCESS; - if (fstat(env->me_lazy_fd, &st)) { - rc = errno; - ERROR("fstat(%s), err %d", "DXB", rc); - return rc; - } + txn->txnid = parent->txnid; + txn->front_txnid = parent->front_txnid + 1; +#if MDBX_ENABLE_REFUND + txn->tw.loose_refund_wl = 0; +#endif /* MDBX_ENABLE_REFUND */ + txn->canary = parent->canary; + parent->flags |= MDBX_TXN_HAS_CHILD; + parent->nested = txn; + txn->parent = parent; + txn->owner = parent->owner; + txn->tw.troika = parent->tw.troika; - if (!S_ISREG(st.st_mode) || st.st_nlink < 1) { -#ifdef EBADFD - rc = EBADFD; -#else - rc = EPERM; -#endif - ERROR("%s %s, err %d", "DXB", - (st.st_nlink < 1) ? "file was removed" : "not a regular file", rc); - return rc; + txn->cursors[FREE_DBI] = nullptr; + txn->cursors[MAIN_DBI] = nullptr; + txn->dbi_state[FREE_DBI] = + parent->dbi_state[FREE_DBI] & ~(DBI_FRESH | DBI_CREAT | DBI_DIRTY); + txn->dbi_state[MAIN_DBI] = + parent->dbi_state[MAIN_DBI] & ~(DBI_FRESH | DBI_CREAT | DBI_DIRTY); + memset(txn->dbi_state + CORE_DBS, 0, + (txn->n_dbi = parent->n_dbi) - CORE_DBS); + memcpy(txn->dbs, parent->dbs, sizeof(txn->dbs[0]) * CORE_DBS); + + tASSERT(parent, parent->tw.dirtyroom + parent->tw.dirtylist->length == + (parent->parent ? 
parent->parent->tw.dirtyroom + : parent->env->options.dp_limit)); + tASSERT(txn, txn->tw.dirtyroom + txn->tw.dirtylist->length == + (txn->parent ? txn->parent->tw.dirtyroom + : txn->env->options.dp_limit)); + env->txn = txn; + tASSERT(parent, parent->cursors[FREE_DBI] == nullptr); + rc = parent->cursors[MAIN_DBI] + ? cursor_shadow(parent->cursors[MAIN_DBI], txn, MAIN_DBI) + : MDBX_SUCCESS; + if (AUDIT_ENABLED() && ASSERT_ENABLED()) { + txn->signature = txn_signature; + tASSERT(txn, audit_ex(txn, 0, false) == 0); + } + if (unlikely(rc != MDBX_SUCCESS)) + txn_end(txn, TXN_END_FAIL_BEGINCHILD); + } else { /* MDBX_TXN_RDONLY */ + txn->dbi_seqs = + ptr_disp(txn->cursors, env->max_dbi * sizeof(txn->cursors[0])); +#if MDBX_ENABLE_DBI_SPARSE + txn->dbi_sparse = ptr_disp(txn->dbi_state, -bitmap_bytes); +#endif /* MDBX_ENABLE_DBI_SPARSE */ + renew: + rc = txn_renew(txn, flags); } - if (st.st_size < (off_t)(MDBX_MIN_PAGESIZE * NUM_METAS)) { - VERBOSE("dxb-file is too short (%u), exclusive-lock needed", - (unsigned)st.st_size); - rc = MDBX_RESULT_TRUE; + if (unlikely(rc != MDBX_SUCCESS)) { + if (txn != env->basal_txn) + osal_free(txn); + } else { + if (flags & (MDBX_TXN_RDONLY_PREPARE - MDBX_TXN_RDONLY)) + eASSERT(env, txn->flags == (MDBX_TXN_RDONLY | MDBX_TXN_FINISHED)); + else if (flags & MDBX_TXN_RDONLY) + eASSERT(env, (txn->flags & + ~(MDBX_NOSTICKYTHREADS | MDBX_TXN_RDONLY | MDBX_WRITEMAP | + /* Win32: SRWL flag */ txn_shrink_allowed)) == 0); + else { + eASSERT(env, (txn->flags & ~(MDBX_NOSTICKYTHREADS | MDBX_WRITEMAP | + txn_shrink_allowed | MDBX_NOMETASYNC | + MDBX_SAFE_NOSYNC | MDBX_TXN_SPILLS)) == 0); + assert(!txn->tw.spilled.list && !txn->tw.spilled.least_removed); + } + txn->signature = txn_signature; + txn->userctx = context; + *ret = txn; + DEBUG("begin txn %" PRIaTXN "%c %p on env %p, root page %" PRIaPGNO + "/%" PRIaPGNO, + txn->txnid, (flags & MDBX_TXN_RDONLY) ? 
'r' : 'w', (void *)txn, + (void *)env, txn->dbs[MAIN_DBI].root, txn->dbs[FREE_DBI].root); } - //---------------------------------------------------------------------------- + return rc; +} - if (fstat(env->me_lfd, &st)) { - rc = errno; - ERROR("fstat(%s), err %d", "LCK", rc); +int mdbx_txn_info(const MDBX_txn *txn, MDBX_txn_info *info, bool scan_rlt) { + int rc = check_txn(txn, MDBX_TXN_FINISHED); + if (unlikely(rc != MDBX_SUCCESS)) return rc; - } - if (!S_ISREG(st.st_mode) || st.st_nlink < 1) { -#ifdef EBADFD - rc = EBADFD; -#else - rc = EPERM; -#endif - ERROR("%s %s, err %d", "LCK", - (st.st_nlink < 1) ? "file was removed" : "not a regular file", rc); - return rc; - } + if (unlikely(!info)) + return MDBX_EINVAL; - /* Checking file size for detect the situation when we got the shared lock - * immediately after osal_lck_destroy(). */ - if (st.st_size < (off_t)(sizeof(MDBX_lockinfo) + sizeof(MDBX_reader))) { - VERBOSE("lck-file is too short (%u), exclusive-lock needed", - (unsigned)st.st_size); - rc = MDBX_RESULT_TRUE; + MDBX_env *const env = txn->env; +#if MDBX_ENV_CHECKPID + if (unlikely(env->pid != osal_getpid())) { + env->flags |= ENV_FATAL_ERROR; + return MDBX_PANIC; } +#endif /* MDBX_ENV_CHECKPID */ - return rc; -} + info->txn_id = txn->txnid; + info->txn_space_used = pgno2bytes(env, txn->geo.first_unallocated); -__cold MDBX_INTERNAL_FUNC int osal_lck_seize(MDBX_env *env) { - assert(env->me_lazy_fd != INVALID_HANDLE_VALUE); - if (unlikely(osal_getpid() != env->me_pid)) - return MDBX_PANIC; -#if MDBX_USE_OFDLOCKS - if (unlikely(op_setlk == 0)) - choice_fcntl(); -#endif /* MDBX_USE_OFDLOCKS */ + if (txn->flags & MDBX_TXN_RDONLY) { + meta_ptr_t head; + uint64_t head_retired; + troika_t troika = meta_tap(env); + do { + /* fetch info from volatile head */ + head = meta_recent(env, &troika); + head_retired = unaligned_peek_u64_volatile(4, head.ptr_v->pages_retired); + info->txn_space_limit_soft = pgno2bytes(env, head.ptr_v->geometry.now); + 
info->txn_space_limit_hard = pgno2bytes(env, head.ptr_v->geometry.upper); + info->txn_space_leftover = + pgno2bytes(env, head.ptr_v->geometry.now - + head.ptr_v->geometry.first_unallocated); + } while (unlikely(meta_should_retry(env, &troika))); - int rc = MDBX_SUCCESS; -#if defined(__linux__) || defined(__gnu_linux__) - if (unlikely(mdbx_RunningOnWSL1)) { - rc = ENOLCK /* No record locks available */; - ERROR("%s, err %u", - "WSL1 (Windows Subsystem for Linux) is mad and trouble-full, " - "injecting failure to avoid data loss", - rc); - return rc; - } -#endif /* Linux */ + info->txn_reader_lag = head.txnid - info->txn_id; + info->txn_space_dirty = info->txn_space_retired = 0; + uint64_t reader_snapshot_pages_retired; + if (txn->to.reader && + head_retired > + (reader_snapshot_pages_retired = atomic_load64( + &txn->to.reader->snapshot_pages_retired, mo_Relaxed))) { + info->txn_space_dirty = info->txn_space_retired = pgno2bytes( + env, (pgno_t)(head_retired - reader_snapshot_pages_retired)); - if (env->me_lfd == INVALID_HANDLE_VALUE) { - /* LY: without-lck mode (e.g. exclusive or on read-only filesystem) */ - rc = - lck_op(env->me_lazy_fd, op_setlk, - (env->me_flags & MDBX_RDONLY) ? 
F_RDLCK : F_WRLCK, 0, OFF_T_MAX); - if (rc != MDBX_SUCCESS) { - ERROR("%s, err %u", "without-lck", rc); - eASSERT(env, MDBX_IS_ERROR(rc)); - return rc; + size_t retired_next_reader = 0; + lck_t *const lck = env->lck_mmap.lck; + if (scan_rlt && info->txn_reader_lag > 1 && lck) { + /* find next more recent reader */ + txnid_t next_reader = head.txnid; + const size_t snap_nreaders = + atomic_load32(&lck->rdt_length, mo_AcquireRelease); + for (size_t i = 0; i < snap_nreaders; ++i) { + retry: + if (atomic_load32(&lck->rdt[i].pid, mo_AcquireRelease)) { + jitter4testing(true); + const txnid_t snap_txnid = safe64_read(&lck->rdt[i].txnid); + const uint64_t snap_retired = atomic_load64( + &lck->rdt[i].snapshot_pages_retired, mo_AcquireRelease); + if (unlikely(snap_retired != + atomic_load64(&lck->rdt[i].snapshot_pages_retired, + mo_Relaxed)) || + snap_txnid != safe64_read(&lck->rdt[i].txnid)) + goto retry; + if (snap_txnid <= txn->txnid) { + retired_next_reader = 0; + break; + } + if (snap_txnid < next_reader) { + next_reader = snap_txnid; + retired_next_reader = pgno2bytes( + env, (pgno_t)(snap_retired - + atomic_load64( + &txn->to.reader->snapshot_pages_retired, + mo_Relaxed))); + } + } + } + } + info->txn_space_dirty = retired_next_reader; } - return MDBX_RESULT_TRUE /* Done: return with exclusive locking. */; - } -#if defined(_POSIX_PRIORITY_SCHEDULING) && _POSIX_PRIORITY_SCHEDULING > 0 - sched_yield(); -#endif - -retry: - if (rc == MDBX_RESULT_TRUE) { - rc = lck_op(env->me_lfd, op_setlk, F_UNLCK, 0, 1); - if (rc != MDBX_SUCCESS) { - ERROR("%s, err %u", "unlock-before-retry", rc); - eASSERT(env, MDBX_IS_ERROR(rc)); - return rc; + } else { + info->txn_space_limit_soft = pgno2bytes(env, txn->geo.now); + info->txn_space_limit_hard = pgno2bytes(env, txn->geo.upper); + info->txn_space_retired = + pgno2bytes(env, txn->nested ? 
(size_t)txn->tw.retired_pages + : MDBX_PNL_GETSIZE(txn->tw.retired_pages)); + info->txn_space_leftover = pgno2bytes(env, txn->tw.dirtyroom); + info->txn_space_dirty = pgno2bytes( + env, txn->tw.dirtylist ? txn->tw.dirtylist->pages_including_loose + : (txn->tw.writemap_dirty_npages + + txn->tw.writemap_spilled_npages)); + info->txn_reader_lag = INT64_MAX; + lck_t *const lck = env->lck_mmap.lck; + if (scan_rlt && lck) { + txnid_t oldest_snapshot = txn->txnid; + const size_t snap_nreaders = + atomic_load32(&lck->rdt_length, mo_AcquireRelease); + if (snap_nreaders) { + oldest_snapshot = txn_snapshot_oldest(txn); + if (oldest_snapshot == txn->txnid - 1) { + /* check if there is at least one reader */ + bool exists = false; + for (size_t i = 0; i < snap_nreaders; ++i) { + if (atomic_load32(&lck->rdt[i].pid, mo_Relaxed) && + txn->txnid > safe64_read(&lck->rdt[i].txnid)) { + exists = true; + break; + } + } + oldest_snapshot += !exists; + } + } + info->txn_reader_lag = txn->txnid - oldest_snapshot; } } - /* Firstly try to get exclusive locking. */ - rc = lck_op(env->me_lfd, op_setlk, F_WRLCK, 0, 1); - if (rc == MDBX_SUCCESS) { - rc = check_fstat(env); - if (MDBX_IS_ERROR(rc)) - return rc; + return MDBX_SUCCESS; +} - continue_dxb_exclusive: - rc = - lck_op(env->me_lazy_fd, op_setlk, - (env->me_flags & MDBX_RDONLY) ? F_RDLCK : F_WRLCK, 0, OFF_T_MAX); - if (rc == MDBX_SUCCESS) - return MDBX_RESULT_TRUE /* Done: return with exclusive locking. */; +MDBX_env *mdbx_txn_env(const MDBX_txn *txn) { + if (unlikely(!txn || txn->signature != txn_signature || + txn->env->signature.weak != env_signature)) + return nullptr; + return txn->env; +} - int err = check_fstat(env); - if (MDBX_IS_ERROR(err)) - return err; +uint64_t mdbx_txn_id(const MDBX_txn *txn) { + if (unlikely(!txn || txn->signature != txn_signature)) + return 0; + return txn->txnid; +} - /* the cause may be a collision with POSIX's file-lock recovery. 
*/ - if (!(rc == EAGAIN || rc == EACCES || rc == EBUSY || rc == EWOULDBLOCK || - rc == EDEADLK)) { - ERROR("%s, err %u", "dxb-exclusive", rc); - eASSERT(env, MDBX_IS_ERROR(rc)); - return rc; - } +int mdbx_txn_flags(const MDBX_txn *txn) { + STATIC_ASSERT( + (MDBX_TXN_INVALID & + (MDBX_TXN_FINISHED | MDBX_TXN_ERROR | MDBX_TXN_DIRTY | MDBX_TXN_SPILLS | + MDBX_TXN_HAS_CHILD | txn_gc_drained | txn_shrink_allowed | + txn_rw_begin_flags | txn_ro_begin_flags)) == 0); + if (unlikely(!txn || txn->signature != txn_signature)) + return MDBX_TXN_INVALID; + assert(0 == (int)(txn->flags & MDBX_TXN_INVALID)); + return txn->flags; +} - /* Fallback to lck-shared */ - } else if (!(rc == EAGAIN || rc == EACCES || rc == EBUSY || - rc == EWOULDBLOCK || rc == EDEADLK)) { - ERROR("%s, err %u", "try-exclusive", rc); - eASSERT(env, MDBX_IS_ERROR(rc)); +int mdbx_txn_reset(MDBX_txn *txn) { + int rc = check_txn(txn, 0); + if (unlikely(rc != MDBX_SUCCESS)) return rc; - } - - /* Here could be one of two: - * - osal_lck_destroy() from the another process was hold the lock - * during a destruction. - * - either osal_lck_seize() from the another process was got the exclusive - * lock and doing initialization. - * For distinguish these cases will use size of the lck-file later. */ - /* Wait for lck-shared now. */ - /* Here may be await during transient processes, for instance until another - * competing process doesn't call lck_downgrade(). 
*/ - rc = lck_op(env->me_lfd, op_setlkw, F_RDLCK, 0, 1); - if (rc != MDBX_SUCCESS) { - ERROR("%s, err %u", "try-shared", rc); - eASSERT(env, MDBX_IS_ERROR(rc)); - return rc; - } + /* This call is only valid for read-only txns */ + if (unlikely((txn->flags & MDBX_TXN_RDONLY) == 0)) + return MDBX_EINVAL; - rc = check_fstat(env); - if (rc == MDBX_RESULT_TRUE) - goto retry; - if (rc != MDBX_SUCCESS) { - ERROR("%s, err %u", "lck_fstat", rc); - return rc; + /* LY: don't close DBI-handles */ + rc = txn_end(txn, TXN_END_RESET | TXN_END_UPDATE); + if (rc == MDBX_SUCCESS) { + tASSERT(txn, txn->signature == txn_signature); + tASSERT(txn, txn->owner == 0); } + return rc; +} - /* got shared, retry exclusive */ - rc = lck_op(env->me_lfd, op_setlk, F_WRLCK, 0, 1); - if (rc == MDBX_SUCCESS) - goto continue_dxb_exclusive; +int mdbx_txn_break(MDBX_txn *txn) { + do { + int rc = check_txn(txn, 0); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; + txn->flags |= MDBX_TXN_ERROR; + if (txn->flags & MDBX_TXN_RDONLY) + break; + txn = txn->nested; + } while (txn); + return MDBX_SUCCESS; +} - if (!(rc == EAGAIN || rc == EACCES || rc == EBUSY || rc == EWOULDBLOCK || - rc == EDEADLK)) { - ERROR("%s, err %u", "try-exclusive", rc); - eASSERT(env, MDBX_IS_ERROR(rc)); +int mdbx_txn_abort(MDBX_txn *txn) { + int rc = check_txn(txn, 0); + if (unlikely(rc != MDBX_SUCCESS)) return rc; - } - /* Lock against another process operating in without-lck or exclusive mode. */ - rc = - lck_op(env->me_lazy_fd, op_setlk, - (env->me_flags & MDBX_RDONLY) ? 
F_RDLCK : F_WRLCK, env->me_pid, 1); - if (rc != MDBX_SUCCESS) { - ERROR("%s, err %u", "lock-against-without-lck", rc); - eASSERT(env, MDBX_IS_ERROR(rc)); + rc = check_env(txn->env, true); + if (unlikely(rc != MDBX_SUCCESS)) return rc; + + if ((txn->flags & (MDBX_TXN_RDONLY | MDBX_NOSTICKYTHREADS)) == + MDBX_NOSTICKYTHREADS && + unlikely(txn->owner != osal_thread_self())) { + mdbx_txn_break(txn); + return MDBX_THREAD_MISMATCH; } - /* Done: return with shared locking. */ - return MDBX_RESULT_FALSE; + return txn_abort(txn); } +/// \copyright SPDX-License-Identifier: Apache-2.0 +/// \author Леонид Юрьев aka Leonid Yuriev \date 2015-2024 -MDBX_INTERNAL_FUNC int osal_lck_downgrade(MDBX_env *env) { - assert(env->me_lfd != INVALID_HANDLE_VALUE); - if (unlikely(osal_getpid() != env->me_pid)) - return MDBX_PANIC; - int rc = MDBX_SUCCESS; - if ((env->me_flags & MDBX_EXCLUSIVE) == 0) { - rc = lck_op(env->me_lazy_fd, op_setlk, F_UNLCK, 0, env->me_pid); - if (rc == MDBX_SUCCESS) - rc = lck_op(env->me_lazy_fd, op_setlk, F_UNLCK, env->me_pid + 1, - OFF_T_MAX - env->me_pid - 1); - } - if (rc == MDBX_SUCCESS) - rc = lck_op(env->me_lfd, op_setlk, F_RDLCK, 0, 1); - if (unlikely(rc != 0)) { - ERROR("%s, err %u", "lck", rc); - assert(MDBX_IS_ERROR(rc)); - } - return rc; +MDBX_MAYBE_UNUSED MDBX_NOTHROW_CONST_FUNCTION MDBX_INTERNAL unsigned +log2n_powerof2(size_t value_uintptr) { + assert(value_uintptr > 0 && value_uintptr < INT32_MAX && + is_powerof2(value_uintptr)); + assert((value_uintptr & -(intptr_t)value_uintptr) == value_uintptr); + const uint32_t value_uint32 = (uint32_t)value_uintptr; +#if __GNUC_PREREQ(4, 1) || __has_builtin(__builtin_ctz) + STATIC_ASSERT(sizeof(value_uint32) <= sizeof(unsigned)); + return __builtin_ctz(value_uint32); +#elif defined(_MSC_VER) + unsigned long index; + STATIC_ASSERT(sizeof(value_uint32) <= sizeof(long)); + _BitScanForward(&index, value_uint32); + return index; +#else + static const uint8_t debruijn_ctz32[32] = { + 0, 1, 28, 2, 29, 14, 24, 3, 30, 
22, 20, 15, 25, 17, 4, 8, + 31, 27, 13, 23, 21, 19, 16, 7, 26, 12, 18, 6, 11, 5, 10, 9}; + return debruijn_ctz32[(uint32_t)(value_uint32 * 0x077CB531ul) >> 27]; +#endif } -MDBX_INTERNAL_FUNC int osal_lck_upgrade(MDBX_env *env, bool dont_wait) { - assert(env->me_lfd != INVALID_HANDLE_VALUE); - if (unlikely(osal_getpid() != env->me_pid)) - return MDBX_PANIC; - - const int cmd = dont_wait ? op_setlk : op_setlkw; - int rc = lck_op(env->me_lfd, cmd, F_WRLCK, 0, 1); - if (rc == MDBX_SUCCESS && (env->me_flags & MDBX_EXCLUSIVE) == 0) { - rc = (env->me_pid > 1) - ? lck_op(env->me_lazy_fd, cmd, F_WRLCK, 0, env->me_pid - 1) - : MDBX_SUCCESS; - if (rc == MDBX_SUCCESS) { - rc = lck_op(env->me_lazy_fd, cmd, F_WRLCK, env->me_pid + 1, - OFF_T_MAX - env->me_pid - 1); - if (rc != MDBX_SUCCESS && env->me_pid > 1 && - lck_op(env->me_lazy_fd, op_setlk, F_UNLCK, 0, env->me_pid - 1)) - rc = MDBX_PANIC; - } - if (rc != MDBX_SUCCESS && lck_op(env->me_lfd, op_setlk, F_RDLCK, 0, 1)) - rc = MDBX_PANIC; - } - if (unlikely(rc != 0)) { - ERROR("%s, err %u", "lck", rc); - assert(MDBX_IS_ERROR(rc)); - } - return rc; +MDBX_NOTHROW_CONST_FUNCTION MDBX_INTERNAL uint64_t rrxmrrxmsx_0(uint64_t v) { + /* Pelle Evensen's mixer, https://bit.ly/2HOfynt */ + v ^= (v << 39 | v >> 25) ^ (v << 14 | v >> 50); + v *= UINT64_C(0xA24BAED4963EE407); + v ^= (v << 40 | v >> 24) ^ (v << 15 | v >> 49); + v *= UINT64_C(0x9FB21C651E98DF25); + return v ^ v >> 28; } +/// \copyright SPDX-License-Identifier: Apache-2.0 +/// \author Леонид Юрьев aka Leonid Yuriev \date 2015-2024 -__cold MDBX_INTERNAL_FUNC int osal_lck_destroy(MDBX_env *env, - MDBX_env *inprocess_neighbor, - const uint32_t current_pid) { - eASSERT(env, osal_getpid() == current_pid); - int rc = MDBX_SUCCESS; - struct stat lck_info; - MDBX_lockinfo *lck = env->me_lck; - if (lck && lck == env->me_lck_mmap.lck && !inprocess_neighbor && - /* try get exclusive access */ - lck_op(env->me_lfd, op_setlk, F_WRLCK, 0, OFF_T_MAX) == 0 && - /* if LCK was not removed */ - 
fstat(env->me_lfd, &lck_info) == 0 && lck_info.st_nlink > 0 && - lck_op(env->me_lazy_fd, op_setlk, - (env->me_flags & MDBX_RDONLY) ? F_RDLCK : F_WRLCK, 0, - OFF_T_MAX) == 0) { - VERBOSE("%p got exclusive, drown ipc-locks", (void *)env); - eASSERT(env, current_pid == env->me_pid); -#if MDBX_LOCKING == MDBX_LOCKING_SYSV - if (env->me_sysv_ipc.semid != -1) - rc = semctl(env->me_sysv_ipc.semid, 2, IPC_RMID) ? errno : 0; -#else - rc = osal_ipclock_destroy(&lck->mti_rlock); - if (rc == 0) - rc = osal_ipclock_destroy(&lck->mti_wlock); -#endif /* MDBX_LOCKING */ +typedef struct walk_ctx { + void *userctx; + walk_options_t options; + int deep; + walk_func *visitor; + MDBX_txn *txn; + MDBX_cursor *cursor; +} walk_ctx_t; - eASSERT(env, rc == 0); - if (rc == 0) { - const bool synced = lck->mti_unsynced_pages.weak == 0; - osal_munmap(&env->me_lck_mmap); - if (synced && env->me_lfd != INVALID_HANDLE_VALUE) - rc = ftruncate(env->me_lfd, 0) ? errno : 0; - } +__cold static int walk_sdb(walk_ctx_t *ctx, walk_sdb_t *sdb); - jitter4testing(false); - } +static page_type_t walk_page_type(const page_t *mp) { + if (mp) + switch (mp->flags & ~P_SPILLED) { + case P_BRANCH: + return page_branch; + case P_LEAF: + return page_leaf; + case P_LEAF | P_DUPFIX: + return page_dupfix_leaf; + case P_LARGE: + return page_large; + } + return page_broken; +} - if (current_pid != env->me_pid) { - eASSERT(env, !inprocess_neighbor); - NOTICE("drown env %p after-fork pid %d -> %d", - __Wpedantic_format_voidptr(env), env->me_pid, current_pid); - inprocess_neighbor = nullptr; +static page_type_t walk_subpage_type(const page_t *sp) { + switch (sp->flags & /* ignore legacy P_DIRTY flag */ ~P_LEGACY_DIRTY) { + case P_LEAF | P_SUBP: + return page_sub_leaf; + case P_LEAF | P_DUPFIX | P_SUBP: + return page_sub_dupfix_leaf; + default: + return page_sub_broken; } +} - /* 1) POSIX's fcntl() locks (i.e. when op_setlk == F_SETLK) should be restored - * after file was closed. 
- * - * 2) File locks would be released (by kernel) while the file-descriptors will - * be closed. But to avoid false-positive EACCESS and EDEADLK from the kernel, - * locks should be released here explicitly with properly order. */ +/* Depth-first tree traversal. */ +__cold static int walk_pgno(walk_ctx_t *ctx, walk_sdb_t *sdb, const pgno_t pgno, + txnid_t parent_txnid) { + assert(pgno != P_INVALID); + page_t *mp = nullptr; + int err = page_get(ctx->cursor, pgno, &mp, parent_txnid); - /* close dxb and restore lock */ - if (env->me_dsync_fd != INVALID_HANDLE_VALUE) { - if (unlikely(close(env->me_dsync_fd) != 0) && rc == MDBX_SUCCESS) - rc = errno; - env->me_dsync_fd = INVALID_HANDLE_VALUE; - } - if (env->me_lazy_fd != INVALID_HANDLE_VALUE) { - if (unlikely(close(env->me_lazy_fd) != 0) && rc == MDBX_SUCCESS) - rc = errno; - env->me_lazy_fd = INVALID_HANDLE_VALUE; - if (op_setlk == F_SETLK && inprocess_neighbor && rc == MDBX_SUCCESS) { - /* restore file-lock */ - rc = lck_op( - inprocess_neighbor->me_lazy_fd, F_SETLKW, - (inprocess_neighbor->me_flags & MDBX_RDONLY) ? F_RDLCK : F_WRLCK, - (inprocess_neighbor->me_flags & MDBX_EXCLUSIVE) - ? 0 - : inprocess_neighbor->me_pid, - (inprocess_neighbor->me_flags & MDBX_EXCLUSIVE) ? OFF_T_MAX : 1); - } - } + const page_type_t type = walk_page_type(mp); + const size_t nentries = mp ? page_numkeys(mp) : 0; + size_t header_size = + (mp && !is_dupfix_leaf(mp)) ? PAGEHDRSZ + mp->lower : PAGEHDRSZ; + size_t payload_size = 0; + size_t unused_size = + (mp ? 
page_room(mp) : ctx->txn->env->ps - header_size) - payload_size; + size_t align_bytes = 0; - /* close clk and restore locks */ - if (env->me_lfd != INVALID_HANDLE_VALUE) { - if (unlikely(close(env->me_lfd) != 0) && rc == MDBX_SUCCESS) - rc = errno; - env->me_lfd = INVALID_HANDLE_VALUE; - if (op_setlk == F_SETLK && inprocess_neighbor && rc == MDBX_SUCCESS) { - /* restore file-locks */ - rc = lck_op(inprocess_neighbor->me_lfd, F_SETLKW, F_RDLCK, 0, 1); - if (rc == MDBX_SUCCESS && inprocess_neighbor->me_live_reader) - rc = osal_rpid_set(inprocess_neighbor); + for (size_t i = 0; err == MDBX_SUCCESS && i < nentries; ++i) { + if (type == page_dupfix_leaf) { + /* DUPFIX pages have no entries[] or node headers */ + payload_size += mp->dupfix_ksize; + continue; } - } - - if (inprocess_neighbor && rc != MDBX_SUCCESS) - inprocess_neighbor->me_flags |= MDBX_FATAL_ERROR; - return rc; -} - -/*---------------------------------------------------------------------------*/ -__cold MDBX_INTERNAL_FUNC int osal_lck_init(MDBX_env *env, - MDBX_env *inprocess_neighbor, - int global_uniqueness_flag) { -#if MDBX_LOCKING == MDBX_LOCKING_SYSV - int semid = -1; - /* don't initialize semaphores twice */ - (void)inprocess_neighbor; - if (global_uniqueness_flag == MDBX_RESULT_TRUE) { - struct stat st; - if (fstat(env->me_lazy_fd, &st)) - return errno; - sysv_retry_create: - semid = semget(env->me_sysv_ipc.key, 2, - IPC_CREAT | IPC_EXCL | - (st.st_mode & (S_IRWXU | S_IRWXG | S_IRWXO))); - if (unlikely(semid == -1)) { - int err = errno; - if (err != EEXIST) - return err; + const node_t *node = page_node(mp, i); + header_size += NODESIZE; + const size_t node_key_size = node_ks(node); + payload_size += node_key_size; - /* remove and re-create semaphore set */ - semid = semget(env->me_sysv_ipc.key, 2, 0); - if (semid == -1) { - err = errno; - if (err != ENOENT) - return err; - goto sysv_retry_create; - } - if (semctl(semid, 2, IPC_RMID)) { - err = errno; - if (err != EIDRM) - return err; - } - goto 
sysv_retry_create; + if (type == page_branch) { + assert(i > 0 || node_ks(node) == 0); + align_bytes += node_key_size & 1; + continue; } - unsigned short val_array[2] = {1, 1}; - if (semctl(semid, 2, SETALL, val_array)) - return errno; - } else { - semid = semget(env->me_sysv_ipc.key, 2, 0); - if (semid == -1) - return errno; + const size_t node_data_size = node_ds(node); + assert(type == page_leaf); + switch (node_flags(node)) { + case 0 /* usual node */: + payload_size += node_data_size; + align_bytes += (node_key_size + node_data_size) & 1; + break; + + case N_BIGDATA /* long data on the large/overflow page */: { + const pgno_t large_pgno = node_largedata_pgno(node); + const size_t over_payload = node_data_size; + const size_t over_header = PAGEHDRSZ; - /* check read & write access */ - struct semid_ds data[2]; - if (semctl(semid, 2, IPC_STAT, data) || semctl(semid, 2, IPC_SET, data)) - return errno; - } + assert(err == MDBX_SUCCESS); + pgr_t lp = page_get_large(ctx->cursor, large_pgno, mp->txnid); + const size_t npages = + ((err = lp.err) == MDBX_SUCCESS) ? lp.page->pages : 1; + const size_t pagesize = pgno2bytes(ctx->txn->env, npages); + const size_t over_unused = pagesize - over_payload - over_header; + const int rc = ctx->visitor(large_pgno, npages, ctx->userctx, ctx->deep, + sdb, pagesize, page_large, err, 1, + over_payload, over_header, over_unused); + if (unlikely(rc != MDBX_SUCCESS)) + return (rc == MDBX_RESULT_TRUE) ? 
MDBX_SUCCESS : rc; + payload_size += sizeof(pgno_t); + align_bytes += node_key_size & 1; + } break; - env->me_sysv_ipc.semid = semid; - return MDBX_SUCCESS; + case N_SUBDATA /* sub-db */: { + if (unlikely(node_data_size != sizeof(tree_t))) { + ERROR("%s/%d: %s %u", "MDBX_CORRUPTED", MDBX_CORRUPTED, + "invalid subDb node size", (unsigned)node_data_size); + assert(err == MDBX_CORRUPTED); + err = MDBX_CORRUPTED; + } + header_size += node_data_size; + align_bytes += (node_key_size + node_data_size) & 1; + } break; -#elif MDBX_LOCKING == MDBX_LOCKING_FUTEX - (void)inprocess_neighbor; - if (global_uniqueness_flag != MDBX_RESULT_TRUE) - return MDBX_SUCCESS; -#error "FIXME: Not implemented" -#elif MDBX_LOCKING == MDBX_LOCKING_POSIX1988 + case N_SUBDATA | N_DUPDATA /* dupsorted sub-tree */: + if (unlikely(node_data_size != sizeof(tree_t))) { + ERROR("%s/%d: %s %u", "MDBX_CORRUPTED", MDBX_CORRUPTED, + "invalid sub-tree node size", (unsigned)node_data_size); + assert(err == MDBX_CORRUPTED); + err = MDBX_CORRUPTED; + } + header_size += node_data_size; + align_bytes += (node_key_size + node_data_size) & 1; + break; - /* don't initialize semaphores twice */ - (void)inprocess_neighbor; - if (global_uniqueness_flag == MDBX_RESULT_TRUE) { - if (sem_init(&env->me_lck_mmap.lck->mti_rlock, true, 1)) - return errno; - if (sem_init(&env->me_lck_mmap.lck->mti_wlock, true, 1)) - return errno; - } - return MDBX_SUCCESS; + case N_DUPDATA /* short sub-page */: { + if (unlikely(node_data_size <= PAGEHDRSZ || (node_data_size & 1))) { + ERROR("%s/%d: %s %u", "MDBX_CORRUPTED", MDBX_CORRUPTED, + "invalid sub-page node size", (unsigned)node_data_size); + assert(err == MDBX_CORRUPTED); + err = MDBX_CORRUPTED; + break; + } -#elif MDBX_LOCKING == MDBX_LOCKING_POSIX2001 || \ - MDBX_LOCKING == MDBX_LOCKING_POSIX2008 - if (inprocess_neighbor) - return MDBX_SUCCESS /* don't need any initialization for mutexes - if LCK already opened/used inside current process */ - ; + const page_t *const sp = 
node_data(node); + const page_type_t subtype = walk_subpage_type(sp); + const size_t nsubkeys = page_numkeys(sp); + if (unlikely(subtype == page_sub_broken)) { + ERROR("%s/%d: %s 0x%x", "MDBX_CORRUPTED", MDBX_CORRUPTED, + "invalid sub-page flags", sp->flags); + assert(err == MDBX_CORRUPTED); + err = MDBX_CORRUPTED; + } - /* FIXME: Unfortunately, there is no other reliable way but to long testing - * on each platform. On the other hand, behavior like FreeBSD is incorrect - * and we can expect it to be rare. Moreover, even on FreeBSD without - * additional in-process initialization, the probability of an problem - * occurring is vanishingly small, and the symptom is a return of EINVAL - * while locking a mutex. In other words, in the worst case, the problem - * results in an EINVAL error at the start of the transaction, but NOT data - * loss, nor database corruption, nor other fatal troubles. Thus, the code - * below I am inclined to think the workaround for erroneous platforms (like - * FreeBSD), rather than a defect of libmdbx. */ -#if defined(__FreeBSD__) - /* seems that shared mutexes on FreeBSD required in-process initialization */ - (void)global_uniqueness_flag; -#else - /* shared mutexes on many other platforms (including Darwin and Linux's - * futexes) doesn't need any addition in-process initialization */ - if (global_uniqueness_flag != MDBX_RESULT_TRUE) - return MDBX_SUCCESS; -#endif + size_t subheader_size = + is_dupfix_leaf(sp) ? 
PAGEHDRSZ : PAGEHDRSZ + sp->lower; + size_t subunused_size = page_room(sp); + size_t subpayload_size = 0; + size_t subalign_bytes = 0; - pthread_mutexattr_t ma; - int rc = pthread_mutexattr_init(&ma); - if (rc) - return rc; + for (size_t ii = 0; err == MDBX_SUCCESS && ii < nsubkeys; ++ii) { + if (subtype == page_sub_dupfix_leaf) { + /* DUPFIX pages have no entries[] or node headers */ + subpayload_size += sp->dupfix_ksize; + } else { + assert(subtype == page_sub_leaf); + const node_t *subnode = page_node(sp, ii); + const size_t subnode_size = node_ks(subnode) + node_ds(subnode); + subheader_size += NODESIZE; + subpayload_size += subnode_size; + subalign_bytes += subnode_size & 1; + if (unlikely(node_flags(subnode) != 0)) { + ERROR("%s/%d: %s 0x%x", "MDBX_CORRUPTED", MDBX_CORRUPTED, + "unexpected sub-node flags", node_flags(subnode)); + assert(err == MDBX_CORRUPTED); + err = MDBX_CORRUPTED; + } + } + } - rc = pthread_mutexattr_setpshared(&ma, PTHREAD_PROCESS_SHARED); - if (rc) - goto bailout; + const int rc = + ctx->visitor(pgno, 0, ctx->userctx, ctx->deep + 1, sdb, + node_data_size, subtype, err, nsubkeys, subpayload_size, + subheader_size, subunused_size + subalign_bytes); + if (unlikely(rc != MDBX_SUCCESS)) + return (rc == MDBX_RESULT_TRUE) ? 
MDBX_SUCCESS : rc; + header_size += subheader_size; + unused_size += subunused_size; + payload_size += subpayload_size; + align_bytes += subalign_bytes + (node_key_size & 1); + } break; -#if MDBX_LOCKING == MDBX_LOCKING_POSIX2008 -#if defined(PTHREAD_MUTEX_ROBUST) || defined(pthread_mutexattr_setrobust) - rc = pthread_mutexattr_setrobust(&ma, PTHREAD_MUTEX_ROBUST); -#elif defined(PTHREAD_MUTEX_ROBUST_NP) || \ - defined(pthread_mutexattr_setrobust_np) - rc = pthread_mutexattr_setrobust_np(&ma, PTHREAD_MUTEX_ROBUST_NP); -#elif _POSIX_THREAD_PROCESS_SHARED < 200809L - rc = pthread_mutexattr_setrobust_np(&ma, PTHREAD_MUTEX_ROBUST_NP); -#else - rc = pthread_mutexattr_setrobust(&ma, PTHREAD_MUTEX_ROBUST); -#endif - if (rc) - goto bailout; -#endif /* MDBX_LOCKING == MDBX_LOCKING_POSIX2008 */ + default: + ERROR("%s/%d: %s 0x%x", "MDBX_CORRUPTED", MDBX_CORRUPTED, + "invalid node flags", node_flags(node)); + assert(err == MDBX_CORRUPTED); + err = MDBX_CORRUPTED; + } + } -#if defined(_POSIX_THREAD_PRIO_INHERIT) && _POSIX_THREAD_PRIO_INHERIT >= 0 && \ - !defined(MDBX_SAFE4QEMU) - rc = pthread_mutexattr_setprotocol(&ma, PTHREAD_PRIO_INHERIT); - if (rc == ENOTSUP) - rc = pthread_mutexattr_setprotocol(&ma, PTHREAD_PRIO_NONE); - if (rc && rc != ENOTSUP) - goto bailout; -#endif /* PTHREAD_PRIO_INHERIT */ + const int rc = ctx->visitor( + pgno, 1, ctx->userctx, ctx->deep, sdb, ctx->txn->env->ps, type, err, + nentries, payload_size, header_size, unused_size + align_bytes); + if (unlikely(rc != MDBX_SUCCESS)) + return (rc == MDBX_RESULT_TRUE) ? 
MDBX_SUCCESS : rc; - rc = pthread_mutexattr_settype(&ma, PTHREAD_MUTEX_ERRORCHECK); - if (rc && rc != ENOTSUP) - goto bailout; + for (size_t i = 0; err == MDBX_SUCCESS && i < nentries; ++i) { + if (type == page_dupfix_leaf) + continue; - rc = pthread_mutex_init(&env->me_lck_mmap.lck->mti_rlock, &ma); - if (rc) - goto bailout; - rc = pthread_mutex_init(&env->me_lck_mmap.lck->mti_wlock, &ma); + node_t *node = page_node(mp, i); + if (type == page_branch) { + assert(err == MDBX_SUCCESS); + ctx->deep += 1; + err = walk_pgno(ctx, sdb, node_pgno(node), mp->txnid); + ctx->deep -= 1; + if (unlikely(err != MDBX_SUCCESS)) { + if (err == MDBX_RESULT_TRUE) + break; + return err; + } + continue; + } -bailout: - pthread_mutexattr_destroy(&ma); - return rc; -#else -#error "FIXME" -#endif /* MDBX_LOCKING > 0 */ -} + assert(type == page_leaf); + switch (node_flags(node)) { + default: + continue; -__cold static int osal_ipclock_failed(MDBX_env *env, osal_ipclock_t *ipc, - const int err) { - int rc = err; -#if MDBX_LOCKING == MDBX_LOCKING_POSIX2008 || MDBX_LOCKING == MDBX_LOCKING_SYSV - if (err == EOWNERDEAD) { - /* We own the mutex. Clean up after dead previous owner. 
*/ + case N_SUBDATA /* sub-db */: + if (unlikely(node_ds(node) != sizeof(tree_t))) { + ERROR("%s/%d: %s %u", "MDBX_CORRUPTED", MDBX_CORRUPTED, + "invalid sub-tree node size", (unsigned)node_ds(node)); + assert(err == MDBX_CORRUPTED); + err = MDBX_CORRUPTED; + } else { + tree_t aligned_db; + memcpy(&aligned_db, node_data(node), sizeof(aligned_db)); + walk_sdb_t subdb = {{node_key(node), node_ks(node)}, nullptr, nullptr}; + subdb.internal = &aligned_db; + assert(err == MDBX_SUCCESS); + ctx->deep += 1; + err = walk_sdb(ctx, &subdb); + ctx->deep -= 1; + } + break; - const bool rlocked = ipc == &env->me_lck->mti_rlock; - rc = MDBX_SUCCESS; - if (!rlocked) { - if (unlikely(env->me_txn)) { - /* env is hosed if the dead thread was ours */ - env->me_flags |= MDBX_FATAL_ERROR; - env->me_txn = NULL; - rc = MDBX_PANIC; + case N_SUBDATA | N_DUPDATA /* dupsorted sub-tree */: + if (unlikely(node_ds(node) != sizeof(tree_t))) { + ERROR("%s/%d: %s %u", "MDBX_CORRUPTED", MDBX_CORRUPTED, + "invalid dupsort sub-tree node size", (unsigned)node_ds(node)); + assert(err == MDBX_CORRUPTED); + err = MDBX_CORRUPTED; + } else { + tree_t aligned_db; + memcpy(&aligned_db, node_data(node), sizeof(aligned_db)); + assert(err == MDBX_SUCCESS); + err = cursor_dupsort_setup(ctx->cursor, node, mp); + if (likely(err == MDBX_SUCCESS)) { + assert(ctx->cursor->subcur == + &container_of(ctx->cursor, cursor_couple_t, outer)->inner); + ctx->cursor = &ctx->cursor->subcur->cursor; + ctx->deep += 1; + sdb->nested = &aligned_db; + err = walk_pgno(ctx, sdb, aligned_db.root, mp->txnid); + sdb->nested = nullptr; + ctx->deep -= 1; + subcur_t *inner_xcursor = container_of(ctx->cursor, subcur_t, cursor); + cursor_couple_t *couple = + container_of(inner_xcursor, cursor_couple_t, inner); + ctx->cursor = &couple->outer; + } } + break; } - WARNING("%clock owner died, %s", (rlocked ? 'r' : 'w'), - (rc ? 
"this process' env is hosed" : "recovering")); - - int check_rc = cleanup_dead_readers(env, rlocked, NULL); - check_rc = (check_rc == MDBX_SUCCESS) ? MDBX_RESULT_TRUE : check_rc; + } -#if MDBX_LOCKING == MDBX_LOCKING_SYSV - rc = (rc == MDBX_SUCCESS) ? check_rc : rc; -#else -#if defined(PTHREAD_MUTEX_ROBUST) || defined(pthread_mutex_consistent) - int mreco_rc = pthread_mutex_consistent(ipc); -#elif defined(PTHREAD_MUTEX_ROBUST_NP) || defined(pthread_mutex_consistent_np) - int mreco_rc = pthread_mutex_consistent_np(ipc); -#elif _POSIX_THREAD_PROCESS_SHARED < 200809L - int mreco_rc = pthread_mutex_consistent_np(ipc); -#else - int mreco_rc = pthread_mutex_consistent(ipc); -#endif - check_rc = (mreco_rc == 0) ? check_rc : mreco_rc; + return MDBX_SUCCESS; +} - if (unlikely(mreco_rc)) - ERROR("lock recovery failed, %s", mdbx_strerror(mreco_rc)); +__cold static int walk_sdb(walk_ctx_t *ctx, walk_sdb_t *sdb) { + tree_t *const db = sdb->internal; + if (unlikely(db->root == P_INVALID)) + return MDBX_SUCCESS; /* empty db */ - rc = (rc == MDBX_SUCCESS) ? check_rc : rc; - if (MDBX_IS_ERROR(rc)) - pthread_mutex_unlock(ipc); -#endif /* MDBX_LOCKING == MDBX_LOCKING_POSIX2008 */ - return rc; - } -#elif MDBX_LOCKING == MDBX_LOCKING_POSIX2001 - (void)ipc; -#elif MDBX_LOCKING == MDBX_LOCKING_POSIX1988 - (void)ipc; -#elif MDBX_LOCKING == MDBX_LOCKING_FUTEX -#ifdef _MSC_VER -#pragma message("warning: TODO") -#else -#warning "TODO" -#endif - (void)ipc; -#else -#error "FIXME" -#endif /* MDBX_LOCKING */ + kvx_t kvx = {.clc = {.k = {.lmin = INT_MAX}, .v = {.lmin = INT_MAX}}}; + cursor_couple_t couple; + int rc = cursor_init4walk(&couple, ctx->txn, db, &kvx); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; - ERROR("mutex (un)lock failed, %s", mdbx_strerror(err)); - if (rc != EDEADLK) - env->me_flags |= MDBX_FATAL_ERROR; + const uint8_t cursor_checking = (ctx->options & dont_check_keys_ordering) + ? 
z_pagecheck | z_ignord + : z_pagecheck; + couple.outer.checking |= cursor_checking; + couple.inner.cursor.checking |= cursor_checking; + couple.outer.next = ctx->cursor; + couple.outer.top_and_flags = z_disable_tree_search_fastpath; + ctx->cursor = &couple.outer; + rc = walk_pgno(ctx, sdb, db->root, + db->mod_txnid ? db->mod_txnid : ctx->txn->txnid); + ctx->cursor = couple.outer.next; return rc; } -#if defined(__ANDROID_API__) || defined(ANDROID) || defined(BIONIC) -MDBX_INTERNAL_FUNC int osal_check_tid4bionic(void) { - /* avoid 32-bit Bionic bug/hang with 32-pit TID */ - if (sizeof(pthread_mutex_t) < sizeof(pid_t) + sizeof(unsigned)) { - pid_t tid = gettid(); - if (unlikely(tid > 0xffff)) { - FATAL("Raise the ENOSYS(%d) error to avoid hang due " - "the 32-bit Bionic/Android bug with tid/thread_id 0x%08x(%i) " - "that don’t fit in 16 bits, see " - "https://android.googlesource.com/platform/bionic/+/master/" - "docs/32-bit-abi.md#is-too-small-for-large-pids", - ENOSYS, tid, tid); - return ENOSYS; - } +__cold int walk_pages(MDBX_txn *txn, walk_func *visitor, void *user, + walk_options_t options) { + int rc = check_txn(txn, MDBX_TXN_BLOCKED); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; + + walk_ctx_t ctx = { + .txn = txn, .userctx = user, .visitor = visitor, .options = options}; + walk_sdb_t sdb = {.name = {.iov_base = MDBX_CHK_GC}, + .internal = &txn->dbs[FREE_DBI]}; + rc = walk_sdb(&ctx, &sdb); + if (!MDBX_IS_ERROR(rc)) { + sdb.name.iov_base = MDBX_CHK_MAIN; + sdb.internal = &txn->dbs[MAIN_DBI]; + rc = walk_sdb(&ctx, &sdb); } - return 0; + return rc; } -#endif /* __ANDROID_API__ || ANDROID) || BIONIC */ +/// \copyright SPDX-License-Identifier: Apache-2.0 +/// \author Леонид Юрьев aka Leonid Yuriev \date 2015-2024 -static int osal_ipclock_lock(MDBX_env *env, osal_ipclock_t *ipc, - const bool dont_wait) { -#if MDBX_LOCKING == MDBX_LOCKING_POSIX2001 || \ - MDBX_LOCKING == MDBX_LOCKING_POSIX2008 - int rc = osal_check_tid4bionic(); - if (likely(rc == 0)) - rc = 
dont_wait ? pthread_mutex_trylock(ipc) : pthread_mutex_lock(ipc); - rc = (rc == EBUSY && dont_wait) ? MDBX_BUSY : rc; -#elif MDBX_LOCKING == MDBX_LOCKING_POSIX1988 - int rc = MDBX_SUCCESS; - if (dont_wait) { - if (sem_trywait(ipc)) { - rc = errno; - if (rc == EAGAIN) - rc = MDBX_BUSY; +#if defined(_WIN32) || defined(_WIN64) + + +//------------------------------------------------------------------------------ +// Stub for slim read-write lock +// Portion Copyright (C) 1995-2002 Brad Wilson + +static void WINAPI stub_srwlock_Init(osal_srwlock_t *srwl) { + srwl->readerCount = srwl->writerCount = 0; +} + +static void WINAPI stub_srwlock_AcquireShared(osal_srwlock_t *srwl) { + while (true) { + assert(srwl->writerCount >= 0 && srwl->readerCount >= 0); + + // If there's a writer already, spin without unnecessarily + // interlocking the CPUs + if (srwl->writerCount != 0) { + SwitchToThread(); + continue; } - } else if (sem_wait(ipc)) - rc = errno; -#elif MDBX_LOCKING == MDBX_LOCKING_SYSV - struct sembuf op = {.sem_num = (ipc != &env->me_lck->mti_wlock), - .sem_op = -1, - .sem_flg = dont_wait ? IPC_NOWAIT | SEM_UNDO : SEM_UNDO}; - int rc; - if (semop(env->me_sysv_ipc.semid, &op, 1)) { - rc = errno; - if (dont_wait && rc == EAGAIN) - rc = MDBX_BUSY; - } else { - rc = *ipc ? EOWNERDEAD : MDBX_SUCCESS; - *ipc = env->me_pid; + + // Add to the readers list + _InterlockedIncrement(&srwl->readerCount); + + // Check for writers again (we may have been preempted). If + // there are no writers writing or waiting, then we're done. 
+ if (srwl->writerCount == 0) + break; + + // Remove from the readers list, spin, try again + _InterlockedDecrement(&srwl->readerCount); + SwitchToThread(); } -#else -#error "FIXME" -#endif /* MDBX_LOCKING */ +} - if (unlikely(rc != MDBX_SUCCESS && rc != MDBX_BUSY)) - rc = osal_ipclock_failed(env, ipc, rc); - return rc; +static void WINAPI stub_srwlock_ReleaseShared(osal_srwlock_t *srwl) { + assert(srwl->readerCount > 0); + _InterlockedDecrement(&srwl->readerCount); } -int osal_ipclock_unlock(MDBX_env *env, osal_ipclock_t *ipc) { - int err = MDBX_ENOSYS; -#if MDBX_LOCKING == MDBX_LOCKING_POSIX2001 || \ - MDBX_LOCKING == MDBX_LOCKING_POSIX2008 - err = pthread_mutex_unlock(ipc); -#elif MDBX_LOCKING == MDBX_LOCKING_POSIX1988 - err = sem_post(ipc) ? errno : MDBX_SUCCESS; -#elif MDBX_LOCKING == MDBX_LOCKING_SYSV - if (unlikely(*ipc != (pid_t)env->me_pid)) - err = EPERM; - else { - *ipc = 0; - struct sembuf op = {.sem_num = (ipc != &env->me_lck->mti_wlock), - .sem_op = 1, - .sem_flg = SEM_UNDO}; - err = semop(env->me_sysv_ipc.semid, &op, 1) ? errno : MDBX_SUCCESS; +static void WINAPI stub_srwlock_AcquireExclusive(osal_srwlock_t *srwl) { + while (true) { + assert(srwl->writerCount >= 0 && srwl->readerCount >= 0); + + // If there's a writer already, spin without unnecessarily + // interlocking the CPUs + if (srwl->writerCount != 0) { + SwitchToThread(); + continue; + } + + // See if we can become the writer (expensive, because it inter- + // locks the CPUs, so writing should be an infrequent process) + if (_InterlockedExchange(&srwl->writerCount, 1) == 0) + break; } -#else -#error "FIXME" -#endif /* MDBX_LOCKING */ - int rc = err; - if (unlikely(rc != MDBX_SUCCESS)) { - const uint32_t current_pid = osal_getpid(); - if (current_pid == env->me_pid || LOG_ENABLED(MDBX_LOG_NOTICE)) - debug_log((current_pid == env->me_pid) - ? 
MDBX_LOG_FATAL - : (rc = MDBX_SUCCESS, MDBX_LOG_NOTICE), - "ipc-unlock()", __LINE__, "failed: env %p, lck-%s %p, err %d\n", - __Wpedantic_format_voidptr(env), - (env->me_lck == env->me_lck_mmap.lck) ? "mmap" : "stub", - __Wpedantic_format_voidptr(env->me_lck), err); + + // Now we're the writer, but there may be outstanding readers. + // Spin until there aren't any more; new readers will wait now + // that we're the writer. + while (srwl->readerCount != 0) { + assert(srwl->writerCount >= 0 && srwl->readerCount >= 0); + SwitchToThread(); } - return rc; } -MDBX_INTERNAL_FUNC int osal_rdt_lock(MDBX_env *env) { - TRACE("%s", ">>"); - jitter4testing(true); - int rc = osal_ipclock_lock(env, &env->me_lck->mti_rlock, false); - TRACE("<< rc %d", rc); - return rc; +static void WINAPI stub_srwlock_ReleaseExclusive(osal_srwlock_t *srwl) { + assert(srwl->writerCount == 1 && srwl->readerCount >= 0); + srwl->writerCount = 0; } -MDBX_INTERNAL_FUNC void osal_rdt_unlock(MDBX_env *env) { - TRACE("%s", ">>"); - int err = osal_ipclock_unlock(env, &env->me_lck->mti_rlock); - TRACE("<< err %d", err); - if (unlikely(err != MDBX_SUCCESS)) - mdbx_panic("%s() failed: err %d\n", __func__, err); - jitter4testing(true); +static uint64_t WINAPI stub_GetTickCount64(void) { + LARGE_INTEGER Counter, Frequency; + return (QueryPerformanceFrequency(&Frequency) && + QueryPerformanceCounter(&Counter)) + ? Counter.QuadPart * 1000ul / Frequency.QuadPart + : 0; } -int osal_txn_lock(MDBX_env *env, bool dont_wait) { - TRACE("%swait %s", dont_wait ? 
"dont-" : "", ">>"); - jitter4testing(true); - const int err = osal_ipclock_lock(env, &env->me_lck->mti_wlock, dont_wait); - int rc = err; - if (likely(!MDBX_IS_ERROR(err))) { - eASSERT(env, !env->me_txn0->mt_owner || - err == /* если другой поток в этом-же процессе завершился - не освободив блокировку */ - MDBX_RESULT_TRUE); - env->me_txn0->mt_owner = osal_thread_self(); - rc = MDBX_SUCCESS; +//------------------------------------------------------------------------------ + +struct libmdbx_imports imports; + +#if __GNUC_PREREQ(8, 0) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wcast-function-type" +#endif /* GCC/MINGW */ + +#define MDBX_IMPORT(HANDLE, ENTRY) \ + imports.ENTRY = (MDBX_##ENTRY)GetProcAddress(HANDLE, #ENTRY) + +void windows_import(void) { + const HINSTANCE hNtdll = GetModuleHandleA("ntdll.dll"); + if (hNtdll) { + globals.running_under_Wine = !!GetProcAddress(hNtdll, "wine_get_version"); + if (!globals.running_under_Wine) { + MDBX_IMPORT(hNtdll, NtFsControlFile); + MDBX_IMPORT(hNtdll, NtExtendSection); + ENSURE(nullptr, imports.NtExtendSection); + } } - TRACE("<< err %d, rc %d", err, rc); - return rc; -} -void osal_txn_unlock(MDBX_env *env) { - TRACE("%s", ">>"); - eASSERT(env, env->me_txn0->mt_owner == osal_thread_self()); - env->me_txn0->mt_owner = 0; - int err = osal_ipclock_unlock(env, &env->me_lck->mti_wlock); - TRACE("<< err %d", err); - if (unlikely(err != MDBX_SUCCESS)) - mdbx_panic("%s() failed: err %d\n", __func__, err); - jitter4testing(true); + const HINSTANCE hKernel32dll = GetModuleHandleA("kernel32.dll"); + if (hKernel32dll) { + MDBX_IMPORT(hKernel32dll, GetFileInformationByHandleEx); + MDBX_IMPORT(hKernel32dll, GetTickCount64); + if (!imports.GetTickCount64) + imports.GetTickCount64 = stub_GetTickCount64; + if (!globals.running_under_Wine) { + MDBX_IMPORT(hKernel32dll, SetFileInformationByHandle); + MDBX_IMPORT(hKernel32dll, GetVolumeInformationByHandleW); + MDBX_IMPORT(hKernel32dll, GetFinalPathNameByHandleW); + 
MDBX_IMPORT(hKernel32dll, PrefetchVirtualMemory); + MDBX_IMPORT(hKernel32dll, SetFileIoOverlappedRange); + } + } + + const osal_srwlock_t_function srwlock_init = + (osal_srwlock_t_function)(hKernel32dll + ? GetProcAddress(hKernel32dll, + "InitializeSRWLock") + : nullptr); + if (srwlock_init) { + imports.srwl_Init = srwlock_init; + imports.srwl_AcquireShared = (osal_srwlock_t_function)GetProcAddress( + hKernel32dll, "AcquireSRWLockShared"); + imports.srwl_ReleaseShared = (osal_srwlock_t_function)GetProcAddress( + hKernel32dll, "ReleaseSRWLockShared"); + imports.srwl_AcquireExclusive = (osal_srwlock_t_function)GetProcAddress( + hKernel32dll, "AcquireSRWLockExclusive"); + imports.srwl_ReleaseExclusive = (osal_srwlock_t_function)GetProcAddress( + hKernel32dll, "ReleaseSRWLockExclusive"); + } else { + imports.srwl_Init = stub_srwlock_Init; + imports.srwl_AcquireShared = stub_srwlock_AcquireShared; + imports.srwl_ReleaseShared = stub_srwlock_ReleaseShared; + imports.srwl_AcquireExclusive = stub_srwlock_AcquireExclusive; + imports.srwl_ReleaseExclusive = stub_srwlock_ReleaseExclusive; + } + + const HINSTANCE hAdvapi32dll = GetModuleHandleA("advapi32.dll"); + if (hAdvapi32dll) { + MDBX_IMPORT(hAdvapi32dll, RegGetValueA); + } } -#else -#ifdef _MSC_VER -#pragma warning(disable : 4206) /* nonstandard extension used: translation \ - unit is empty */ -#endif /* _MSC_VER (warnings) */ -#endif /* !Windows LCK-implementation */ +#undef MDBX_IMPORT + +#if __GNUC_PREREQ(8, 0) +#pragma GCC diagnostic pop +#endif /* GCC/MINGW */ + +#endif /* Windows */ +/* This is CMake-template for libmdbx's version.c + ******************************************************************************/ + + +#if MDBX_VERSION_MAJOR != 0 || \ + MDBX_VERSION_MINOR != 13 +#error "API version mismatch! Had `git fetch --tags` done?" 
+#endif + +static const char sourcery[] = MDBX_STRINGIFY(MDBX_BUILD_SOURCERY); + +__dll_export +#ifdef __attribute_used__ + __attribute_used__ +#elif defined(__GNUC__) || __has_attribute(__used__) + __attribute__((__used__)) +#endif +#ifdef __attribute_externally_visible__ + __attribute_externally_visible__ +#elif (defined(__GNUC__) && !defined(__clang__)) || \ + __has_attribute(__externally_visible__) + __attribute__((__externally_visible__)) +#endif + const struct MDBX_version_info mdbx_version = { + 0, + 13, + 0, + 61, + {"2024-06-15T20:28:34+03:00", "54f4583bbd35cf3647dc4065446a781cabedcabd", "45b204f5daeff2e4db336b99db2af50e8e5c7c8e", + "v0.13.0-61-g45b204f5"}, + sourcery}; + +__dll_export +#ifdef __attribute_used__ + __attribute_used__ +#elif defined(__GNUC__) || __has_attribute(__used__) + __attribute__((__used__)) +#endif +#ifdef __attribute_externally_visible__ + __attribute_externally_visible__ +#elif (defined(__GNUC__) && !defined(__clang__)) || \ + __has_attribute(__externally_visible__) + __attribute__((__externally_visible__)) +#endif + const char *const mdbx_sourcery_anchor = sourcery; diff --git a/mdbx/mdbx.h b/mdbx/mdbx.h index 98f9a86..7a7b25e 100644 --- a/mdbx/mdbx.h +++ b/mdbx/mdbx.h @@ -1,11 +1,10 @@ /** -_libmdbx_ is an extremely fast, compact, powerful, embedded, +_libmdbx_ (aka MDBX) is an extremely fast, compact, powerful, embeddable, transactional [key-value -store](https://en.wikipedia.org/wiki/Key-value_database) database, with -[permissive license](./LICENSE). _MDBX_ has a specific set of properties and -capabilities, focused on creating unique lightweight solutions with -extraordinary performance. +store](https://en.wikipedia.org/wiki/Key-value_database), with [Apache 2.0 +license](./LICENSE). _MDBX_ has a specific set of properties and capabilities, +focused on creating unique lightweight solutions with extraordinary performance. 
_libmdbx_ is superior to [LMDB](https://bit.ly/26ts7tL) in terms of features and reliability, not inferior in performance. In comparison to LMDB, _libmdbx_ @@ -14,60 +13,24 @@ break down. _libmdbx_ supports Linux, Windows, MacOS, OSX, iOS, Android, FreeBSD, DragonFly, Solaris, OpenSolaris, OpenIndiana, NetBSD, OpenBSD and other systems compliant with POSIX.1-2008. -The origin has been migrated to -[GitFlic](https://gitflic.ru/project/erthink/libmdbx) since on 2022-04-15 -the Github administration, without any warning nor explanation, deleted libmdbx -along with a lot of other projects, simultaneously blocking access for many -developers. For the same reason ~~Github~~ is blacklisted forever. +Please visit https://libmdbx.dqdkfa.ru for more information, documentation, +C++ API description and links to the origin git repo with the source code. +Questions, feedback and suggestions are welcome to the Telegram' group +https://t.me/libmdbx. _The Future will (be) [Positive](https://www.ptsecurity.com). Всё будет хорошо._ +\note The origin has been migrated to +[GitFlic](https://gitflic.ru/project/erthink/libmdbx) since on 2022-04-15 the +Github administration, without any warning nor explanation, deleted libmdbx +along with a lot of other projects, simultaneously blocking access for many +developers. For the same reason ~~Github~~ is blacklisted forever. \section copyright LICENSE & COPYRIGHT - -\authors Copyright (c) 2015-2024, Leonid Yuriev -and other _libmdbx_ authors: please see [AUTHORS](./AUTHORS) file. - -\copyright Redistribution and use in source and binary forms, with or without -modification, are permitted only as authorized by the OpenLDAP Public License. - -A copy of this license is available in the file LICENSE in the -top-level directory of the distribution or, alternatively, at -. - - --- - -This code is derived from "LMDB engine" written by -Howard Chu (Symas Corporation), which itself derived from btree.c -written by Martin Hedenfalk. 
- - --- - -Portions Copyright 2011-2015 Howard Chu, Symas Corp. All rights reserved. - -Redistribution and use in source and binary forms, with or without -modification, are permitted only as authorized by the OpenLDAP -Public License. - -A copy of this license is available in the file LICENSE in the -top-level directory of the distribution or, alternatively, at -. - - --- - -Portions Copyright (c) 2009, 2010 Martin Hedenfalk - -Permission to use, copy, modify, and distribute this software for any -purpose with or without fee is hereby granted, provided that the above -copyright notice and this permission notice appear in all copies. - -THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES -WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF -MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR -ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES -WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN -ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF -OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. +\copyright SPDX-License-Identifier: Apache-2.0 +\note Please refer to the COPYRIGHT file for explanations license change, +credits and acknowledgments. +\author Леонид Юрьев aka Leonid Yuriev \date 2015-2024 *******************************************************************************/ @@ -98,7 +61,7 @@ OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. /* clang-format off */ /** \file mdbx.h - \brief The libmdbx C API header file + \brief The libmdbx C API header file. 
\defgroup c_api C API @{ @@ -359,6 +322,14 @@ typedef mode_t mdbx_mode_t; #endif #endif /* MDBX_DEPRECATED */ +#ifndef MDBX_DEPRECATED_ENUM +#if !defined(DOXYGEN) && (!defined(_MSC_VER) || _MSC_VER >= 1930) +#define MDBX_DEPRECATED_ENUM MDBX_DEPRECATED +#else +#define MDBX_DEPRECATED_ENUM /* avoid madness MSVC */ +#endif +#endif /* MDBX_DEPRECATED_ENUM */ + #ifndef __dll_export #if defined(_WIN32) || defined(_WIN64) || defined(__CYGWIN__) || \ defined(__MINGW__) || defined(__MINGW32__) || defined(__MINGW64__) @@ -393,7 +364,8 @@ typedef mode_t mdbx_mode_t; /** \brief Auxiliary macro for robustly define the both inline version of API * function and non-inline fallback dll-exported version for applications linked - * with old version of libmdbx, with a strictly ODR-common implementation. */ + * with old version of libmdbx, with a strictly ODR-common implementation. Thus, + * we emulate __extern_inline for all compilers, including non-GNU ones. */ #if defined(LIBMDBX_INTERNALS) && !defined(LIBMDBX_NO_EXPORTS_LEGACY_API) #define LIBMDBX_INLINE_API(TYPE, NAME, ARGS) \ /* proto of exported which uses common impl */ LIBMDBX_API TYPE NAME ARGS; \ @@ -888,7 +860,7 @@ enum MDBX_constants { /** Log level * \note Levels detailed than (great than) \ref MDBX_LOG_NOTICE * requires build libmdbx with \ref MDBX_DEBUG option. */ -enum MDBX_log_level_t { +typedef enum MDBX_log_level { /** Critical conditions, i.e. assertion failures. * \note libmdbx always produces such messages regardless * of \ref MDBX_DEBUG build option. 
*/ @@ -938,17 +910,14 @@ enum MDBX_log_level_t { /** for \ref mdbx_setup_debug() only: Don't change current settings */ MDBX_LOG_DONTCHANGE = -1 -}; -#ifndef __cplusplus -typedef enum MDBX_log_level_t MDBX_log_level_t; -#endif +} MDBX_log_level_t; /** \brief Runtime debug flags * * \details `MDBX_DBG_DUMP` and `MDBX_DBG_LEGACY_MULTIOPEN` always have an * effect, but `MDBX_DBG_ASSERT`, `MDBX_DBG_AUDIT` and `MDBX_DBG_JITTER` only if * libmdbx built with \ref MDBX_DEBUG. */ -enum MDBX_debug_flags_t { +typedef enum MDBX_debug_flags { MDBX_DBG_NONE = 0, /** Enable assertion checks. @@ -986,12 +955,8 @@ enum MDBX_debug_flags_t { /** for mdbx_setup_debug() only: Don't change current settings */ MDBX_DBG_DONTCHANGE = -1 -}; -#ifndef __cplusplus -typedef enum MDBX_debug_flags_t MDBX_debug_flags_t; -#else -DEFINE_ENUM_FLAG_OPERATORS(MDBX_debug_flags_t) -#endif +} MDBX_debug_flags_t; +DEFINE_ENUM_FLAG_OPERATORS(MDBX_debug_flags) /** \brief A debug-logger callback function, * called before printing the message and aborting. @@ -1086,7 +1051,7 @@ MDBX_NORETURN LIBMDBX_API void mdbx_assert_fail(const MDBX_env *env, * \ingroup c_opening * \anchor env_flags * \see mdbx_env_open() \see mdbx_env_set_flags() */ -enum MDBX_env_flags_t { +typedef enum MDBX_env_flags { MDBX_ENV_DEFAULTS = 0, /** Extra validation of DB structure and pages content. @@ -1210,7 +1175,7 @@ enum MDBX_env_flags_t { /** Отвязывает транзакции от потоков/threads насколько это возможно. * - * Эта опция предназначена для приложений, которые мультиплексируют множество + * Опция предназначена для приложений, которые мультиплексируют множество * пользовательских легковесных потоков выполнения по отдельным потокам * операционной системы, например как это происходит в средах выполнения * GoLang и Rust. Таким приложениям также рекомендуется сериализовать @@ -1278,10 +1243,9 @@ enum MDBX_env_flags_t { * Этот флаг вступает в силу при открытии среды и не может быть изменен после. 
*/ MDBX_NOSTICKYTHREADS = UINT32_C(0x200000), -#ifndef _MSC_VER /* avoid madness MSVC */ + /** \deprecated Please use \ref MDBX_NOSTICKYTHREADS instead. */ - MDBX_NOTLS MDBX_DEPRECATED = MDBX_NOSTICKYTHREADS, -#endif /* avoid madness MSVC */ + MDBX_NOTLS MDBX_DEPRECATED_ENUM = MDBX_NOSTICKYTHREADS, /** Don't do readahead. * @@ -1327,7 +1291,6 @@ enum MDBX_env_flags_t { * This flag may be changed at any time using `mdbx_env_set_flags()`. */ MDBX_NOMEMINIT = UINT32_C(0x1000000), -#ifndef _MSC_VER /* avoid madness MSVC */ /** Aims to coalesce a Garbage Collection items. * \deprecated Always enabled since v0.12 and deprecated since v0.13. * @@ -1339,8 +1302,7 @@ enum MDBX_env_flags_t { * Unallocated space and reducing the database file. * * This flag may be changed at any time using mdbx_env_set_flags(). */ - MDBX_COALESCE MDBX_DEPRECATED = UINT32_C(0x2000000), -#endif /* avoid madness MSVC */ + MDBX_COALESCE MDBX_DEPRECATED_ENUM = UINT32_C(0x2000000), /** LIFO policy for recycling a Garbage Collection items. * @@ -1543,19 +1505,14 @@ enum MDBX_env_flags_t { MDBX_UTTERLY_NOSYNC = MDBX_SAFE_NOSYNC | UINT32_C(0x100000), /** end of sync_modes @} */ -}; -#ifndef __cplusplus -/** \ingroup c_opening */ -typedef enum MDBX_env_flags_t MDBX_env_flags_t; -#else -DEFINE_ENUM_FLAG_OPERATORS(MDBX_env_flags_t) -#endif +} MDBX_env_flags_t; +DEFINE_ENUM_FLAG_OPERATORS(MDBX_env_flags) /** Transaction flags * \ingroup c_transactions * \anchor txn_flags * \see mdbx_txn_begin() \see mdbx_txn_flags() */ -enum MDBX_txn_flags_t { +typedef enum MDBX_txn_flags { /** Start read-write transaction. * * Only one write transaction may be active at a time. Writes are fully @@ -1627,18 +1584,14 @@ enum MDBX_txn_flags_t { * \note Transaction state flag. Returned from \ref mdbx_txn_flags() * but can't be used with \ref mdbx_txn_begin(). 
*/ MDBX_TXN_BLOCKED = MDBX_TXN_FINISHED | MDBX_TXN_ERROR | MDBX_TXN_HAS_CHILD -}; -#ifndef __cplusplus -typedef enum MDBX_txn_flags_t MDBX_txn_flags_t; -#else -DEFINE_ENUM_FLAG_OPERATORS(MDBX_txn_flags_t) -#endif +} MDBX_txn_flags_t; +DEFINE_ENUM_FLAG_OPERATORS(MDBX_txn_flags) /** \brief Database flags * \ingroup c_dbi * \anchor db_flags * \see mdbx_dbi_open() */ -enum MDBX_db_flags_t { +typedef enum MDBX_db_flags { /** Variable length unique keys with usual byte-by-byte string comparison. */ MDBX_DB_DEFAULTS = 0, @@ -1681,19 +1634,14 @@ enum MDBX_db_flags_t { * sub-database will be opened with flags which it was created, and then an * application could determine the actual flags by \ref mdbx_dbi_flags(). */ MDBX_DB_ACCEDE = MDBX_ACCEDE -}; -#ifndef __cplusplus -/** \ingroup c_dbi */ -typedef enum MDBX_db_flags_t MDBX_db_flags_t; -#else +} MDBX_db_flags_t; DEFINE_ENUM_FLAG_OPERATORS(MDBX_db_flags_t) -#endif /** \brief Data changing flags * \ingroup c_crud * \see \ref c_crud_hints "Quick reference for Insert/Update/Delete operations" * \see mdbx_put() \see mdbx_cursor_put() \see mdbx_replace() */ -enum MDBX_put_flags_t { +typedef enum MDBX_put_flags { /** Upsertion by default (without any other flags) */ MDBX_UPSERT = 0, @@ -1731,18 +1679,13 @@ enum MDBX_put_flags_t { /** Only for \ref MDBX_DUPFIXED. * Store multiple data items in one call. */ MDBX_MULTIPLE = UINT32_C(0x80000) -}; -#ifndef __cplusplus -/** \ingroup c_crud */ -typedef enum MDBX_put_flags_t MDBX_put_flags_t; -#else -DEFINE_ENUM_FLAG_OPERATORS(MDBX_put_flags_t) -#endif +} MDBX_put_flags_t; +DEFINE_ENUM_FLAG_OPERATORS(MDBX_put_flags) /** \brief Environment copy flags * \ingroup c_extra * \see mdbx_env_copy() \see mdbx_env_copy2fd() */ -enum MDBX_copy_flags_t { +typedef enum MDBX_copy_flags { MDBX_CP_DEFAULTS = 0, /** Copy with compactification: Omit free space from copy and renumber all @@ -1751,19 +1694,14 @@ enum MDBX_copy_flags_t { /** Force to make resizable copy, i.e. 
dynamic size instead of fixed */ MDBX_CP_FORCE_DYNAMIC_SIZE = 2u -}; -#ifndef __cplusplus -/** \ingroup c_extra */ -typedef enum MDBX_copy_flags_t MDBX_copy_flags_t; -#else -DEFINE_ENUM_FLAG_OPERATORS(MDBX_copy_flags_t) -#endif +} MDBX_copy_flags_t; +DEFINE_ENUM_FLAG_OPERATORS(MDBX_copy_flags) /** \brief Cursor operations * \ingroup c_cursors * This is the set of all operations for retrieving data using a cursor. * \see mdbx_cursor_get() */ -enum MDBX_cursor_op { +typedef enum MDBX_cursor_op { /** Position at first key/data item */ MDBX_FIRST, @@ -1875,18 +1813,14 @@ enum MDBX_cursor_op { MDBX_TO_PAIR_EQUAL, MDBX_TO_PAIR_GREATER_OR_EQUAL, MDBX_TO_PAIR_GREATER_THAN -}; -#ifndef __cplusplus -/** \ingroup c_cursors */ -typedef enum MDBX_cursor_op MDBX_cursor_op; -#endif +} MDBX_cursor_op; /** \brief Errors and return codes * \ingroup c_err * * BerkeleyDB uses -30800 to -30999, we'll go under them * \see mdbx_strerror() \see mdbx_strerror_r() \see mdbx_liberr2str() */ -enum MDBX_error_t { +typedef enum MDBX_error { /** Successful result */ MDBX_SUCCESS = 0, @@ -2062,11 +1996,7 @@ enum MDBX_error_t { MDBX_EREMOTE = ENOTBLK, MDBX_EDEADLK = EDEADLK #endif /* !Windows */ -}; -#ifndef __cplusplus -/** \ingroup c_err */ -typedef enum MDBX_error_t MDBX_error_t; -#endif +} MDBX_error_t; /** MDBX_MAP_RESIZED * \ingroup c_err @@ -2158,7 +2088,7 @@ LIBMDBX_API int mdbx_env_create(MDBX_env **penv); /** \brief MDBX environment extra runtime options. * \ingroup c_settings * \see mdbx_env_set_option() \see mdbx_env_get_option() */ -enum MDBX_option_t { +typedef enum MDBX_option { /** \brief Controls the maximum number of named databases for the environment. * * \details By default only unnamed key-value database could used and @@ -2320,14 +2250,15 @@ enum MDBX_option_t { MDBX_opt_spill_parent4child_denominator, /** \brief Controls the in-process threshold of semi-empty pages merge. - * \warning This is experimental option and subject for change or removal. 
* \details This option controls the in-process threshold of minimum page * fill, as used space of percentage of a page. Neighbour pages emptier than * this value are candidates for merging. The threshold value is specified - * in 1/65536 of percent, which is equivalent to the 16-dot-16 fixed point - * format. The specified value must be in the range from 12.5% (almost empty) - * to 50% (half empty) which corresponds to the range from 8192 and to 32768 - * in units respectively. */ + * in 1/65536 points of a whole page, which is equivalent to the 16-dot-16 + * fixed point format. + * The specified value must be in the range from 12.5% (almost empty page) + * to 50% (half empty page) which corresponds to the range from 8192 and + * to 32768 in units respectively. + * \see MDBX_opt_prefer_waf_insteadof_balance */ MDBX_opt_merge_threshold_16dot16_percent, /** \brief Controls the choosing between use write-through disk writes and @@ -2388,12 +2319,75 @@ enum MDBX_option_t { * С другой стороны, при минимальном значении (включая 0) * `MDBX_opt_rp_augment_limit` переработка GC будет ограничиваться * преимущественно затраченным временем. */ - MDBX_opt_gc_time_limit -}; -#ifndef __cplusplus -/** \ingroup c_settings */ -typedef enum MDBX_option_t MDBX_option_t; -#endif + MDBX_opt_gc_time_limit, + + /** \brief Управляет выбором между стремлением к равномерности наполнения + * страниц, либо уменьшением количества измененных и записанных страниц. + * + * \details После операций удаления страницы содержащие меньше минимума + * ключей, либо опустошенные до \ref MDBX_opt_merge_threshold_16dot16_percent + * подлежат слиянию с одной из соседних. Если страницы справа и слева от + * текущей обе «грязные» (были изменены в ходе транзакции и должны быть + * записаны на диск), либо обе «чисты» (не изменялись в текущей транзакции), + * то целью для слияния всегда выбирается менее заполненная страница. 
+ * Когда же только одна из соседствующих является «грязной», а другая + * «чистой», то возможны две тактики выбора цели для слияния: + * + * - Если `MDBX_opt_prefer_waf_insteadof_balance = True`, то будет выбрана + * уже измененная страница, что НЕ УВЕЛИЧИТ количество измененных страниц + * и объем записи на диск при фиксации текущей транзакции, но в среднем + * будет УВЕЛИЧИВАТЬ неравномерность заполнения страниц. + * + * - Если `MDBX_opt_prefer_waf_insteadof_balance = False`, то будет выбрана + * менее заполненная страница, что УВЕЛИЧИТ количество измененных страниц + * и объем записи на диск при фиксации текущей транзакции, но в среднем + * будет УМЕНЬШАТЬ неравномерность заполнения страниц. + * + * \see MDBX_opt_merge_threshold_16dot16_percent */ + MDBX_opt_prefer_waf_insteadof_balance, + + /** \brief Задаёт в % максимальный размер вложенных страниц, используемых для + * размещения небольшого количества мульти-значений связанных с одном ключем. + * + * Использование вложенных страниц, вместо выноса значений на отдельные + * страницы вложенного дерева, позволяет уменьшить объем неиспользуемого места + * и этим увеличить плотность размещения данных. + * + * Но с увеличением размера вложенных страниц требуется больше листовых + * страниц основного дерева, что также увеличивает высоту основного дерева. + * Кроме этого, изменение данных на вложенных страницах требует дополнительных + * копирований, поэтому стоимость может быть больше во многих сценариях. + * + * min 12.5% (8192), max 100% (65535), default = 100% */ + MDBX_opt_subpage_limit, + + /** \brief Задаёт в % минимальный объём свободного места на основной странице, + * при отсутствии которого вложенные страницы выносятся в отдельное дерево. + * + * min 0, max 100% (65535), default = 0 */ + MDBX_opt_subpage_room_threshold, + + /** \brief Задаёт в % минимальный объём свободного места на основной странице, + * при наличии которого, производится резервирование места во вложенной. 
+ * + * Если на основной странице свободного места недостаточно, то вложенная + * страница будет минимального размера. В свою очередь, при отсутствии резерва + * во вложенной странице, каждое добавлении в неё элементов будет требовать + * переформирования основной страниц с переносом всех узлов данных. + * + * Поэтому резервирование места, как правило, выгодно в сценариях с + * интенсивным добавлением коротких мульти-значений, например при + * индексировании. Но уменьшает плотность размещения данных, соответственно + * увеличивает объем БД и операций ввода-вывода. + * + * min 0, max 100% (65535), default = 42% (27525) */ + MDBX_opt_subpage_reserve_prereq, + + /** \brief Задаёт в % ограничение резервирования места на вложенных страницах. + * + * min 0, max 100% (65535), default = 4.2% (2753) */ + MDBX_opt_subpage_reserve_limit +} MDBX_option_t; /** \brief Sets the value of a extra runtime options for an environment. * \ingroup c_settings @@ -2508,7 +2502,7 @@ LIBMDBX_API int mdbx_env_openW(MDBX_env *env, const wchar_t *pathname, /** \brief Deletion modes for \ref mdbx_env_delete(). * \ingroup c_extra * \see mdbx_env_delete() */ -enum MDBX_env_delete_mode_t { +typedef enum MDBX_env_delete_mode { /** \brief Just delete the environment's files and directory if any. * \note On POSIX systems, processes already working with the database will * continue to work without interference until it close the environment. @@ -2522,11 +2516,7 @@ enum MDBX_env_delete_mode_t { /** \brief Wait until other processes closes the environment before deletion. */ MDBX_ENV_WAIT_FOR_UNUSED = 2, -}; -#ifndef __cplusplus -/** \ingroup c_extra */ -typedef enum MDBX_env_delete_mode_t MDBX_env_delete_mode_t; -#endif +} MDBX_env_delete_mode_t; /** \brief Delete the environment's files in a proper and multiprocess-safe way. 
* \ingroup c_extra @@ -2637,7 +2627,7 @@ struct MDBX_stat { uint32_t ms_depth; /**< Depth (height) of the B-tree */ uint64_t ms_branch_pages; /**< Number of internal (non-leaf) pages */ uint64_t ms_leaf_pages; /**< Number of leaf pages */ - uint64_t ms_overflow_pages; /**< Number of overflow pages */ + uint64_t ms_overflow_pages; /**< Number of large/overflow pages */ uint64_t ms_entries; /**< Number of data items */ uint64_t ms_mod_txnid; /**< Transaction ID of committed last modification */ }; @@ -3097,7 +3087,7 @@ LIBMDBX_API int mdbx_env_resurrect_after_fork(MDBX_env *env); * \ingroup c_settings * \anchor warmup_flags * \see mdbx_env_warmup() */ -enum MDBX_warmup_flags_t { +typedef enum MDBX_warmup_flags { /** By default \ref mdbx_env_warmup() just ask OS kernel to asynchronously * prefetch database pages. */ MDBX_warmup_default = 0, @@ -3140,12 +3130,8 @@ enum MDBX_warmup_flags_t { /** Release the lock that was performed before by \ref MDBX_warmup_lock. */ MDBX_warmup_release = 16, -}; -#ifndef __cplusplus -typedef enum MDBX_warmup_flags_t MDBX_warmup_flags_t; -#else -DEFINE_ENUM_FLAG_OPERATORS(MDBX_warmup_flags_t) -#endif +} MDBX_warmup_flags_t; +DEFINE_ENUM_FLAG_OPERATORS(MDBX_warmup_flags) /** \brief Warms up the database by loading pages into memory, optionally lock * ones. \ingroup c_settings @@ -3539,7 +3525,7 @@ MDBX_NOTHROW_CONST_FUNCTION LIBMDBX_API intptr_t mdbx_limits_pairsize4page_max(intptr_t pagesize, MDBX_db_flags_t flags); /** \brief Returns maximal data size in bytes to fit in a leaf-page or - * single overflow/large-page with the given page size and database flags, + * single large/overflow-page with the given page size and database flags, * or -1 if pagesize is invalid. 
* \ingroup c_statinfo * \see db_flags */ @@ -3715,7 +3701,7 @@ MDBX_NOTHROW_PURE_FUNCTION LIBMDBX_API int mdbx_env_get_pairsize4page_max(const MDBX_env *env, MDBX_db_flags_t flags); /** \brief Returns maximal data size in bytes to fit in a leaf-page or - * single overflow/large-page for specified database flags. + * single large/overflow-page for specified database flags. * \ingroup c_statinfo * * \param [in] env An environment handle returned by \ref mdbx_env_create(). @@ -4553,7 +4539,7 @@ LIBMDBX_API int mdbx_dbi_dupsort_depthmask(const MDBX_txn *txn, MDBX_dbi dbi, /** \brief DBI state bits returted by \ref mdbx_dbi_flags_ex() * \ingroup c_statinfo * \see mdbx_dbi_flags_ex() */ -enum MDBX_dbi_state_t { +typedef enum MDBX_dbi_state { /** DB was written in this txn */ MDBX_DBI_DIRTY = 0x01, /** Cached Named-DB record is older than txnID */ @@ -4562,13 +4548,8 @@ enum MDBX_dbi_state_t { MDBX_DBI_FRESH = 0x04, /** Named-DB handle created in this txn */ MDBX_DBI_CREAT = 0x08, -}; -#ifndef __cplusplus -/** \ingroup c_statinfo */ -typedef enum MDBX_dbi_state_t MDBX_dbi_state_t; -#else -DEFINE_ENUM_FLAG_OPERATORS(MDBX_dbi_state_t) -#endif +} MDBX_dbi_state_t; +DEFINE_ENUM_FLAG_OPERATORS(MDBX_dbi_state) /** \brief Retrieve the DB flags and status for a database handle. * \ingroup c_statinfo @@ -4980,6 +4961,7 @@ LIBMDBX_API int mdbx_cursor_bind(const MDBX_txn *txn, MDBX_cursor *cursor, * \see mdbx_cursor_renew() * \see mdbx_cursor_bind() * \see mdbx_cursor_close() + * \see mdbx_cursor_reset() * * \note In contrast to LMDB, the MDBX required that any opened cursors can be * reused and must be freed explicitly, regardless ones was opened in a @@ -4992,6 +4974,20 @@ LIBMDBX_API int mdbx_cursor_bind(const MDBX_txn *txn, MDBX_cursor *cursor, * \returns A non-zero error value on failure and 0 on success. */ LIBMDBX_API int mdbx_cursor_unbind(MDBX_cursor *cursor); +/** \brief Сбрасывает состояние курсора. 
+ * \ingroup c_cursors + * + * В результате сброса курсор становится неустановленным и не позволяет + * выполнять операции относительного позиционирования, получения или изменения + * данных, до установки на позицию не зависящую от текущей. Что позволяет + * приложению пресекать дальнейшие операции без предварительного + * позиционирования курсора. + * + * \param [in] cursor Указатель на курсор. + * + * \returns Результат операции сканирования, либо код ошибки. */ +LIBMDBX_API int mdbx_cursor_reset(MDBX_cursor *cursor); + /** \brief Create a cursor handle for the specified transaction and DBI handle. * \ingroup c_cursors * @@ -5172,6 +5168,21 @@ LIBMDBX_API int mdbx_cursor_compare(const MDBX_cursor *left, LIBMDBX_API int mdbx_cursor_get(MDBX_cursor *cursor, MDBX_val *key, MDBX_val *data, MDBX_cursor_op op); +/** \brief Служебная функция для использования в утилитах. + * \ingroup c_extra + * + * При использовании определяемых пользователем функций сравнения (aka custom + * comparison functions) проверка порядка ключей может приводить к неверным + * результатам и возврате ошибки \ref MDBX_CORRUPTED. + * + * Эта функция отключает контроль порядка следования ключей на страницах при + * чтении страниц БД для этого курсора, и таким образом, позволяет прочитать + * данные при отсутствии/недоступности использованных функций сравнения. + * \see avoid_custom_comparators + * + * \returns Результат операции сканирования, либо код ошибки. */ +LIBMDBX_API int mdbx_cursor_ignord(MDBX_cursor *cursor); + /** \brief Тип предикативных функций обратного вызова используемых * \ref mdbx_cursor_scan() и \ref mdbx_cursor_scan_from() для пробирования * пар ключ-значения. @@ -5399,18 +5410,16 @@ LIBMDBX_API int mdbx_cursor_scan_from(MDBX_cursor *cursor, * \param [in] limit The size of pairs buffer as the number of items, * but not a pairs. * \param [in] op A cursor operation \ref MDBX_cursor_op (only - * \ref MDBX_FIRST, \ref MDBX_NEXT, \ref MDBX_GET_CURRENT - * are supported). 
+ * \ref MDBX_FIRST and \ref MDBX_NEXT are supported). * * \returns A non-zero error value on failure and 0 on success, * some possible errors are: * \retval MDBX_THREAD_MISMATCH Given transaction is not owned * by current thread. - * \retval MDBX_NOTFOUND No more key-value pairs are available. + * \retval MDBX_NOTFOUND No any key-value pairs are available. * \retval MDBX_ENODATA The cursor is already at the end of data. - * \retval MDBX_RESULT_TRUE The specified limit is less than the available - * key-value pairs on the current page/position - * that the cursor points to. + * \retval MDBX_RESULT_TRUE The returned chunk is the last one, + * and there are no pairs left. * \retval MDBX_EINVAL An invalid parameter was specified. */ LIBMDBX_API int mdbx_cursor_get_batch(MDBX_cursor *cursor, size_t *count, MDBX_val *pairs, size_t limit, @@ -6141,7 +6150,7 @@ LIBMDBX_API int mdbx_preopen_snapinfoW(const wchar_t *pathname, * \note Данный API еще не зафиксирован, в последующих версиях могут быть * незначительные доработки и изменения. * \see mdbx_env_chk() */ -enum MDBX_chk_flags_t { +typedef enum MDBX_chk_flags { /** Режим проверки по-умолчанию, в том числе в режиме только-чтения. */ MDBX_CHK_DEFAULTS = 0, @@ -6159,18 +6168,13 @@ enum MDBX_chk_flags_t { * \note Требуется при проверке унаследованных БД созданных с использованием * нестандартных (пользовательских) функций сравнения ключей или значений. */ MDBX_CHK_IGNORE_ORDER = 8 -}; -#ifndef __cplusplus -/** \ingroup c_opening */ -typedef enum MDBX_chk_flags_t MDBX_chk_flags_t; -#else -DEFINE_ENUM_FLAG_OPERATORS(MDBX_chk_flags_t) -#endif +} MDBX_chk_flags_t; +DEFINE_ENUM_FLAG_OPERATORS(MDBX_chk_flags) /** \brief Уровни логирование/детализации информации, * поставляемой через обратные вызовы при проверке целостности базы данных. 
* \see mdbx_env_chk() */ -enum MDBX_chk_severity { +typedef enum MDBX_chk_severity { MDBX_chk_severity_prio_shift = 4, MDBX_chk_severity_kind_mask = 0xF, MDBX_chk_fatal = 0x00u, @@ -6184,25 +6188,25 @@ enum MDBX_chk_severity { MDBX_chk_verbose = 0x78u, MDBX_chk_details = 0x89u, MDBX_chk_extra = 0x9Au -}; +} MDBX_chk_severity_t; /** \brief Стадии проверки, * сообщаемые через обратные вызовы при проверке целостности базы данных. * \see mdbx_env_chk() */ -enum MDBX_chk_stage { +typedef enum MDBX_chk_stage { MDBX_chk_none, MDBX_chk_init, MDBX_chk_lock, MDBX_chk_meta, - MDBX_chk_traversal_tree, - MDBX_chk_traversal_freedb, + MDBX_chk_tree, + MDBX_chk_gc, MDBX_chk_space, - MDBX_chk_traversal_maindb, - MDBX_chk_traversal_subdbs, + MDBX_chk_maindb, + MDBX_chk_subdbs, MDBX_chk_conclude, MDBX_chk_unlock, MDBX_chk_finalize -}; +} MDBX_chk_stage_t; /** \brief Виртуальная строка отчета, формируемого при проверке целостности базы * данных. \see mdbx_env_chk() */ @@ -6226,8 +6230,8 @@ typedef struct MDBX_chk_scope { MDBX_chk_issue_t *issues; struct MDBX_chk_internal *internal; const void *object; - enum MDBX_chk_stage stage; - enum MDBX_chk_severity verbosity; + MDBX_chk_stage_t stage; + MDBX_chk_severity_t verbosity; size_t subtotal_issues; union { void *ptr; @@ -6348,11 +6352,11 @@ typedef struct MDBX_chk_callbacks { size_t entry_number, const MDBX_val *key, const MDBX_val *value); - int (*stage_begin)(MDBX_chk_context_t *ctx, enum MDBX_chk_stage); - int (*stage_end)(MDBX_chk_context_t *ctx, enum MDBX_chk_stage, int err); + int (*stage_begin)(MDBX_chk_context_t *ctx, MDBX_chk_stage_t); + int (*stage_end)(MDBX_chk_context_t *ctx, MDBX_chk_stage_t, int err); MDBX_chk_line_t *(*print_begin)(MDBX_chk_context_t *ctx, - enum MDBX_chk_severity severity); + MDBX_chk_severity_t severity); void (*print_flush)(MDBX_chk_line_t *); void (*print_done)(MDBX_chk_line_t *); void (*print_chars)(MDBX_chk_line_t *, const char *str, size_t len); @@ -6392,8 +6396,8 @@ typedef struct 
MDBX_chk_callbacks { * \returns Нулевое значение в случае успеха, иначе код ошибки. */ LIBMDBX_API int mdbx_env_chk(MDBX_env *env, const MDBX_chk_callbacks_t *cb, MDBX_chk_context_t *ctx, - const enum MDBX_chk_flags_t flags, - enum MDBX_chk_severity verbosity, + const MDBX_chk_flags_t flags, + MDBX_chk_severity_t verbosity, unsigned timeout_seconds_16dot16); /** \brief Вспомогательная функция для подсчета проблем детектируемых