diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 0e34a8e..5cc5e1d 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -41,6 +41,5 @@ jobs: with: go-version: '1.20' - - run: go env - - run: go test -p 1 ./mdbx - - run: go test -p 1 ./exp/mdbxpool + - run: go test ./mdbx + - run: go test ./exp/mdbxpool diff --git a/README.md b/README.md index caa1cdd..cbff009 100644 --- a/README.md +++ b/README.md @@ -2,6 +2,10 @@ Go bindings to the libmdbx: https://libmdbx.dqdkfa.ru +**Notice**: package `./mdbx` contains only `mdbx.h` and `mdbx.c` - to minimize go build time/size. +But full version of libmdbx (produced by its `make dist` command) is in `./../mdbxdist/`. +License is also there. + Most of articles in internet about LMDB are applicable to MDBX. But mdbx has more features. For deeper DB understanding please read through [mdbx.h](https://gitflic.ru/project/erthink/libmdbx/blob?file=mdbx.h) diff --git a/mdbx/Notes b/mdbx/Notes new file mode 100644 index 0000000..2f44821 --- /dev/null +++ b/mdbx/Notes @@ -0,0 +1,3 @@ +This package contains only mdbx.h and mdbx.c - to minimize go build time/size. +But full version of libmdbx (produced by its `make dist` command) is in `./../mdbxdist/`. License is also there. 
+ diff --git a/mdbx/error.go b/mdbx/error.go index 3a3eba6..44b8e73 100644 --- a/mdbx/error.go +++ b/mdbx/error.go @@ -87,10 +87,10 @@ var CorruptErrorMessage = CorruptErrorHardwareRecommendations + " " + CorruptErr func (e Errno) Error() string { if e == Corrupted { - return "MDBX_FATAL: " + CorruptErrorMessage + return fmt.Sprintf("MDBX_FATAL(%d): ", int(e)) + CorruptErrorMessage } if e == Panic { - return "MDBX_PANIC: " + CorruptErrorMessage + return fmt.Sprintf("MDBX_PANIC(%d): ", int(e)) + CorruptErrorMessage } return C.GoString(C.mdbx_strerror(C.int(e))) } diff --git a/mdbx/mdbx.c b/mdbx/mdbx.c index 4a6ea35..01303e0 100644 --- a/mdbx/mdbx.c +++ b/mdbx/mdbx.c @@ -1,5 +1,5 @@ /* - * Copyright 2015-2023 Leonid Yuriev + * Copyright 2015-2024 Leonid Yuriev * and other libmdbx authors: please see AUTHORS file. * All rights reserved. * @@ -12,7 +12,7 @@ * . */ #define xMDBX_ALLOY 1 -#define MDBX_BUILD_SOURCERY 0c86daff919dc09ab5e99c8e5bfb1d51e1367babca9c630c97af92a2aa988ea9_v0_12_9_16_gfff3fbd8 +#define MDBX_BUILD_SOURCERY 91ff5b5423830ee44fca4b70dcb298f233338a17a3185c44df67ec16d3034af3_v0_13_0_38_gf1975363 #ifdef MDBX_CONFIG_H #include MDBX_CONFIG_H #endif @@ -25,11 +25,13 @@ #ifdef xMDBX_ALLOY /* Amalgamated build */ #define MDBX_INTERNAL_FUNC static -#define MDBX_INTERNAL_VAR static +#define MDBX_INTERNAL_VAR_PROTO static +#define MDBX_INTERNAL_VAR_INSTA static #else /* Non-amalgamated build */ #define MDBX_INTERNAL_FUNC -#define MDBX_INTERNAL_VAR extern +#define MDBX_INTERNAL_VAR_PROTO extern +#define MDBX_INTERNAL_VAR_INSTA #endif /* xMDBX_ALLOY */ /*----------------------------------------------------------------------------*/ @@ -94,6 +96,10 @@ disable : 5105) /* winbase.h(9531): warning C5105: macro expansion \ producing 'defined' has undefined behavior */ #endif +#if _MSC_VER < 1920 +/* avoid "error C2219: syntax error: type qualifier must be after '*'" */ +#define __restrict +#endif #if _MSC_VER > 1930 #pragma warning(disable : 6235) /* is always a 
constant */ #pragma warning(disable : 6237) /* is never evaluated and might \ @@ -139,7 +145,7 @@ #include "mdbx.h" /* - * Copyright 2015-2023 Leonid Yuriev + * Copyright 2015-2024 Leonid Yuriev * and other libmdbx authors: please see AUTHORS file. * All rights reserved. * @@ -187,6 +193,7 @@ #include #include +#include #include #include #include @@ -824,7 +831,7 @@ __extern_C key_t ftok(const char *, int); /*----------------------------------------------------------------------------*/ -#if defined(MDBX_USE_VALGRIND) +#if defined(ENABLE_MEMCHECK) #include #ifndef VALGRIND_DISABLE_ADDR_ERROR_REPORTING_IN_RANGE /* LY: available since Valgrind 3.10 */ @@ -846,7 +853,7 @@ __extern_C key_t ftok(const char *, int); #define VALGRIND_CHECK_MEM_IS_ADDRESSABLE(a, s) (0) #define VALGRIND_CHECK_MEM_IS_DEFINED(a, s) (0) #define RUNNING_ON_VALGRIND (0) -#endif /* MDBX_USE_VALGRIND */ +#endif /* ENABLE_MEMCHECK */ #ifdef __SANITIZE_ADDRESS__ #include @@ -993,7 +1000,7 @@ extern "C" { /* https://en.wikipedia.org/wiki/Operating_system_abstraction_layer */ /* - * Copyright 2015-2023 Leonid Yuriev + * Copyright 2015-2024 Leonid Yuriev * and other libmdbx authors: please see AUTHORS file. * All rights reserved. * @@ -1197,8 +1204,8 @@ typedef pthread_mutex_t osal_fastmutex_t; /*----------------------------------------------------------------------------*/ /* OS abstraction layer stuff */ -MDBX_INTERNAL_VAR unsigned sys_pagesize; -MDBX_MAYBE_UNUSED MDBX_INTERNAL_VAR unsigned sys_pagesize_ln2, +MDBX_INTERNAL_VAR_PROTO unsigned sys_pagesize; +MDBX_MAYBE_UNUSED MDBX_INTERNAL_VAR_PROTO unsigned sys_pagesize_ln2, sys_allocation_granularity; /* Get the size of a memory page for the system. 
@@ -1462,8 +1469,9 @@ MDBX_MAYBE_UNUSED static __inline void jitter4testing(bool tiny); #endif #if defined(__linux__) || defined(__gnu_linux__) -MDBX_INTERNAL_VAR uint32_t linux_kernel_version; -MDBX_INTERNAL_VAR bool mdbx_RunningOnWSL1 /* Windows Subsystem 1 for Linux */; +MDBX_INTERNAL_VAR_PROTO uint32_t linux_kernel_version; +MDBX_INTERNAL_VAR_PROTO bool + mdbx_RunningOnWSL1 /* Windows Subsystem 1 for Linux */; #endif /* Linux */ #ifndef osal_strdup @@ -1677,7 +1685,8 @@ MDBX_INTERNAL_FUNC int osal_lck_init(MDBX_env *env, /// restore POSIX-fcntl locks after the closing of file descriptors. /// \return Error code (MDBX_PANIC) or zero on success. MDBX_INTERNAL_FUNC int osal_lck_destroy(MDBX_env *env, - MDBX_env *inprocess_neighbor); + MDBX_env *inprocess_neighbor, + const uint32_t current_pid); /// \brief Connects to shared interprocess locking objects and tries to acquire /// the maximum lock level (shared if exclusive is not available) @@ -1705,6 +1714,8 @@ MDBX_INTERNAL_FUNC int osal_lck_seize(MDBX_env *env); /// operational lock. /// \return Error code or zero on success MDBX_INTERNAL_FUNC int osal_lck_downgrade(MDBX_env *env); +MDBX_MAYBE_UNUSED MDBX_INTERNAL_FUNC int osal_lck_upgrade(MDBX_env *env, + bool dont_wait); /// \brief Locks LCK-file or/and table of readers for (de)registering. /// \return Error code or zero on success @@ -1713,16 +1724,12 @@ MDBX_INTERNAL_FUNC int osal_rdt_lock(MDBX_env *env); /// \brief Unlocks LCK-file or/and table of readers after (de)registering. MDBX_INTERNAL_FUNC void osal_rdt_unlock(MDBX_env *env); -/// \brief Acquires lock for DB change (on writing transaction start) -/// Reading transactions will not be blocked. -/// Declared as LIBMDBX_API because it is used in mdbx_chk. +/// \brief Acquires write-transaction lock. 
/// \return Error code or zero on success -LIBMDBX_API int mdbx_txn_lock(MDBX_env *env, bool dont_wait); +MDBX_INTERNAL_FUNC int osal_txn_lock(MDBX_env *env, bool dont_wait); -/// \brief Releases lock once DB changes is made (after writing transaction -/// has finished). -/// Declared as LIBMDBX_API because it is used in mdbx_chk. -LIBMDBX_API void mdbx_txn_unlock(MDBX_env *env); +/// \brief Releases write-transaction lock.. +MDBX_INTERNAL_FUNC void osal_txn_unlock(MDBX_env *env); /// \brief Sets alive-flag of reader presence (indicative lock) for PID of /// the current process. The function does no more than needed for @@ -1751,7 +1758,7 @@ MDBX_INTERNAL_FUNC int osal_rpid_check(MDBX_env *env, uint32_t pid); MDBX_INTERNAL_FUNC int osal_mb2w(const char *const src, wchar_t **const pdst); typedef void(WINAPI *osal_srwlock_t_function)(osal_srwlock_t *); -MDBX_INTERNAL_VAR osal_srwlock_t_function osal_srwlock_Init, +MDBX_INTERNAL_VAR_PROTO osal_srwlock_t_function osal_srwlock_Init, osal_srwlock_AcquireShared, osal_srwlock_ReleaseShared, osal_srwlock_AcquireExclusive, osal_srwlock_ReleaseExclusive; @@ -1804,7 +1811,7 @@ typedef struct _FILE_REMOTE_PROTOCOL_INFO { typedef BOOL(WINAPI *MDBX_GetFileInformationByHandleEx)( _In_ HANDLE hFile, _In_ FILE_INFO_BY_HANDLE_CLASS FileInformationClass, _Out_ LPVOID lpFileInformation, _In_ DWORD dwBufferSize); -MDBX_INTERNAL_VAR MDBX_GetFileInformationByHandleEx +MDBX_INTERNAL_VAR_PROTO MDBX_GetFileInformationByHandleEx mdbx_GetFileInformationByHandleEx; typedef BOOL(WINAPI *MDBX_GetVolumeInformationByHandleW)( @@ -1813,19 +1820,20 @@ typedef BOOL(WINAPI *MDBX_GetVolumeInformationByHandleW)( _Out_opt_ LPDWORD lpMaximumComponentLength, _Out_opt_ LPDWORD lpFileSystemFlags, _Out_opt_ LPWSTR lpFileSystemNameBuffer, _In_ DWORD nFileSystemNameSize); -MDBX_INTERNAL_VAR MDBX_GetVolumeInformationByHandleW +MDBX_INTERNAL_VAR_PROTO MDBX_GetVolumeInformationByHandleW mdbx_GetVolumeInformationByHandleW; typedef DWORD(WINAPI 
*MDBX_GetFinalPathNameByHandleW)(_In_ HANDLE hFile, _Out_ LPWSTR lpszFilePath, _In_ DWORD cchFilePath, _In_ DWORD dwFlags); -MDBX_INTERNAL_VAR MDBX_GetFinalPathNameByHandleW mdbx_GetFinalPathNameByHandleW; +MDBX_INTERNAL_VAR_PROTO MDBX_GetFinalPathNameByHandleW + mdbx_GetFinalPathNameByHandleW; typedef BOOL(WINAPI *MDBX_SetFileInformationByHandle)( _In_ HANDLE hFile, _In_ FILE_INFO_BY_HANDLE_CLASS FileInformationClass, _Out_ LPVOID lpFileInformation, _In_ DWORD dwBufferSize); -MDBX_INTERNAL_VAR MDBX_SetFileInformationByHandle +MDBX_INTERNAL_VAR_PROTO MDBX_SetFileInformationByHandle mdbx_SetFileInformationByHandle; typedef NTSTATUS(NTAPI *MDBX_NtFsControlFile)( @@ -1834,10 +1842,10 @@ typedef NTSTATUS(NTAPI *MDBX_NtFsControlFile)( OUT PIO_STATUS_BLOCK IoStatusBlock, IN ULONG FsControlCode, IN OUT PVOID InputBuffer, IN ULONG InputBufferLength, OUT OPTIONAL PVOID OutputBuffer, IN ULONG OutputBufferLength); -MDBX_INTERNAL_VAR MDBX_NtFsControlFile mdbx_NtFsControlFile; +MDBX_INTERNAL_VAR_PROTO MDBX_NtFsControlFile mdbx_NtFsControlFile; typedef uint64_t(WINAPI *MDBX_GetTickCount64)(void); -MDBX_INTERNAL_VAR MDBX_GetTickCount64 mdbx_GetTickCount64; +MDBX_INTERNAL_VAR_PROTO MDBX_GetTickCount64 mdbx_GetTickCount64; #if !defined(_WIN32_WINNT_WIN8) || _WIN32_WINNT < _WIN32_WINNT_WIN8 typedef struct _WIN32_MEMORY_RANGE_ENTRY { @@ -1849,13 +1857,13 @@ typedef struct _WIN32_MEMORY_RANGE_ENTRY { typedef BOOL(WINAPI *MDBX_PrefetchVirtualMemory)( HANDLE hProcess, ULONG_PTR NumberOfEntries, PWIN32_MEMORY_RANGE_ENTRY VirtualAddresses, ULONG Flags); -MDBX_INTERNAL_VAR MDBX_PrefetchVirtualMemory mdbx_PrefetchVirtualMemory; +MDBX_INTERNAL_VAR_PROTO MDBX_PrefetchVirtualMemory mdbx_PrefetchVirtualMemory; typedef enum _SECTION_INHERIT { ViewShare = 1, ViewUnmap = 2 } SECTION_INHERIT; typedef NTSTATUS(NTAPI *MDBX_NtExtendSection)(IN HANDLE SectionHandle, IN PLARGE_INTEGER NewSectionSize); -MDBX_INTERNAL_VAR MDBX_NtExtendSection mdbx_NtExtendSection; +MDBX_INTERNAL_VAR_PROTO 
MDBX_NtExtendSection mdbx_NtExtendSection; static __inline bool mdbx_RunningUnderWine(void) { return !mdbx_NtExtendSection; @@ -1865,14 +1873,15 @@ typedef LSTATUS(WINAPI *MDBX_RegGetValueA)(HKEY hkey, LPCSTR lpSubKey, LPCSTR lpValue, DWORD dwFlags, LPDWORD pdwType, PVOID pvData, LPDWORD pcbData); -MDBX_INTERNAL_VAR MDBX_RegGetValueA mdbx_RegGetValueA; +MDBX_INTERNAL_VAR_PROTO MDBX_RegGetValueA mdbx_RegGetValueA; NTSYSAPI ULONG RtlRandomEx(PULONG Seed); typedef BOOL(WINAPI *MDBX_SetFileIoOverlappedRange)(HANDLE FileHandle, PUCHAR OverlappedRangeStart, ULONG Length); -MDBX_INTERNAL_VAR MDBX_SetFileIoOverlappedRange mdbx_SetFileIoOverlappedRange; +MDBX_INTERNAL_VAR_PROTO MDBX_SetFileIoOverlappedRange + mdbx_SetFileIoOverlappedRange; #endif /* Windows */ @@ -2102,7 +2111,7 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor; #error MDBX_DPL_PREALLOC_FOR_RADIXSORT must be defined as 0 or 1 #endif /* MDBX_DPL_PREALLOC_FOR_RADIXSORT */ -/** Controls dirty pages tracking, spilling and persisting in MDBX_WRITEMAP +/** Controls dirty pages tracking, spilling and persisting in `MDBX_WRITEMAP` * mode. 0/OFF = Don't track dirty pages at all, don't spill ones, and use * msync() to persist data. This is by-default on Linux and other systems where * kernel provides properly LRU tracking and effective flushing on-demand. 1/ON @@ -2119,6 +2128,22 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor; #error MDBX_AVOID_MSYNC must be defined as 0 or 1 #endif /* MDBX_AVOID_MSYNC */ +/** Управляет механизмом поддержки разреженных наборов DBI-хендлов для снижения + * накладных расходов при запуске и обработке транзакций. 
*/ +#ifndef MDBX_ENABLE_DBI_SPARSE +#define MDBX_ENABLE_DBI_SPARSE 1 +#elif !(MDBX_ENABLE_DBI_SPARSE == 0 || MDBX_ENABLE_DBI_SPARSE == 1) +#error MDBX_ENABLE_DBI_SPARSE must be defined as 0 or 1 +#endif /* MDBX_ENABLE_DBI_SPARSE */ + +/** Управляет механизмом отложенного освобождения и поддержки пути быстрого + * открытия DBI-хендлов без захвата блокировок. */ +#ifndef MDBX_ENABLE_DBI_LOCKFREE +#define MDBX_ENABLE_DBI_LOCKFREE 1 +#elif !(MDBX_ENABLE_DBI_LOCKFREE == 0 || MDBX_ENABLE_DBI_LOCKFREE == 1) +#error MDBX_ENABLE_DBI_LOCKFREE must be defined as 0 or 1 +#endif /* MDBX_ENABLE_DBI_LOCKFREE */ + /** Controls sort order of internal page number lists. * This mostly experimental/advanced option with not for regular MDBX users. * \warning The database format depend on this option and libmdbx built with @@ -2166,8 +2191,8 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor; /** If defined then enables integration with Valgrind, * a memory analyzing tool. */ -#ifndef MDBX_USE_VALGRIND -#endif /* MDBX_USE_VALGRIND */ +#ifndef ENABLE_MEMCHECK +#endif /* ENABLE_MEMCHECK */ /** If defined then enables use C11 atomics, * otherwise detects ones availability automatically. 
*/ @@ -2490,13 +2515,23 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor; #define MDBX_RUNTIME_FLAGS_INIT \ ((MDBX_DEBUG) > 0) * MDBX_DBG_ASSERT + ((MDBX_DEBUG) > 1) * MDBX_DBG_AUDIT -extern uint8_t runtime_flags; -extern uint8_t loglevel; -extern MDBX_debug_func *debug_logger; +union logger_union { + void *ptr; + MDBX_debug_func *fmt; + MDBX_debug_func_nofmt *nofmt; +}; + +MDBX_INTERNAL_VAR_PROTO struct mdbx_static { + uint8_t flags; + uint8_t loglevel; + union logger_union logger; + size_t logger_buffer_size; + char *logger_buffer; +} mdbx_static; MDBX_MAYBE_UNUSED static __inline void jitter4testing(bool tiny) { #if MDBX_DEBUG - if (MDBX_DBG_JITTER & runtime_flags) + if (MDBX_DBG_JITTER & mdbx_static.flags) osal_jitter(tiny); #else (void)tiny; @@ -2510,17 +2545,17 @@ MDBX_INTERNAL_FUNC void debug_log_va(int level, const char *function, int line, const char *fmt, va_list args); #if MDBX_DEBUG -#define LOG_ENABLED(msg) unlikely(msg <= loglevel) -#define AUDIT_ENABLED() unlikely((runtime_flags & MDBX_DBG_AUDIT)) +#define LOG_ENABLED(msg) unlikely(msg <= mdbx_static.loglevel) +#define AUDIT_ENABLED() unlikely((mdbx_static.flags & MDBX_DBG_AUDIT)) #else /* MDBX_DEBUG */ -#define LOG_ENABLED(msg) (msg < MDBX_LOG_VERBOSE && msg <= loglevel) +#define LOG_ENABLED(msg) (msg < MDBX_LOG_VERBOSE && msg <= mdbx_static.loglevel) #define AUDIT_ENABLED() (0) #endif /* MDBX_DEBUG */ #if MDBX_FORCE_ASSERTIONS #define ASSERT_ENABLED() (1) #elif MDBX_DEBUG -#define ASSERT_ENABLED() likely((runtime_flags & MDBX_DBG_ASSERT)) +#define ASSERT_ENABLED() likely((mdbx_static.flags & MDBX_DBG_ASSERT)) #else #define ASSERT_ENABLED() (0) #endif /* assertions */ @@ -2955,7 +2990,8 @@ typedef struct MDBX_page { #define PAGETYPE_WHOLE(p) ((uint8_t)(p)->mp_flags) -/* Drop legacy P_DIRTY flag for sub-pages for compatilibity */ +/* Drop legacy P_DIRTY flag for sub-pages for compatilibity, + * for assertions only. */ #define PAGETYPE_COMPAT(p) \ (unlikely(PAGETYPE_WHOLE(p) & P_SUBP) \ ? 
PAGETYPE_WHOLE(p) & ~(P_SUBP | P_LEGACY_DIRTY) \ @@ -3064,7 +3100,7 @@ typedef sem_t osal_ipclock_t; #endif /* MDBX_LOCKING */ #if MDBX_LOCKING > MDBX_LOCKING_SYSV && !defined(__cplusplus) -MDBX_INTERNAL_FUNC int osal_ipclock_stub(osal_ipclock_t *ipc); +MDBX_INTERNAL_FUNC int osal_ipclock_stubinit(osal_ipclock_t *ipc); MDBX_INTERNAL_FUNC int osal_ipclock_destroy(osal_ipclock_t *ipc); #endif /* MDBX_LOCKING */ @@ -3077,8 +3113,9 @@ MDBX_INTERNAL_FUNC int osal_ipclock_destroy(osal_ipclock_t *ipc); * read transactions started by the same thread need no further locking to * proceed. * - * If MDBX_NOTLS is set, the slot address is not saved in thread-specific data. - * No reader table is used if the database is on a read-only filesystem. + * If MDBX_NOSTICKYTHREADS is set, the slot address is not saved in + * thread-specific data. No reader table is used if the database is on a + * read-only filesystem. * * Since the database uses multi-version concurrency control, readers don't * actually need any locking. 
This table is used to keep track of which @@ -3388,10 +3425,10 @@ typedef struct troika { #if MDBX_WORDBITS > 32 /* Workaround for false-positives from Valgrind */ uint32_t unused_pad; #endif -#define TROIKA_HAVE_STEADY(troika) ((troika)->fsm & 7) -#define TROIKA_STRICT_VALID(troika) ((troika)->tail_and_flags & 64) -#define TROIKA_VALID(troika) ((troika)->tail_and_flags & 128) -#define TROIKA_TAIL(troika) ((troika)->tail_and_flags & 3) +#define TROIKA_HAVE_STEADY(troika) ((troika)->fsm & 7u) +#define TROIKA_STRICT_VALID(troika) ((troika)->tail_and_flags & 64u) +#define TROIKA_VALID(troika) ((troika)->tail_and_flags & 128u) +#define TROIKA_TAIL(troika) ((troika)->tail_and_flags & 3u) txnid_t txnid[NUM_METAS]; } meta_troika_t; @@ -3421,6 +3458,8 @@ struct MDBX_txn { #error "Oops, some txn flags overlapped or wrong" #endif uint32_t mt_flags; + unsigned mt_numdbs; + size_t mt_owner; /* thread ID that owns this transaction */ MDBX_txn *mt_parent; /* parent of a nested txn */ /* Nested txn under this txn, set together with flag MDBX_TXN_HAS_CHILD */ @@ -3438,31 +3477,30 @@ struct MDBX_txn { txnid_t mt_front; MDBX_env *mt_env; /* the DB environment */ - /* Array of records for each DB known in the environment. 
*/ - MDBX_dbx *mt_dbxs; /* Array of MDBX_db records for each known DB */ MDBX_db *mt_dbs; - /* Array of sequence numbers for each DB handle */ - MDBX_atomic_uint32_t *mt_dbiseqs; - - /* Transaction DBI Flags */ -#define DBI_DIRTY MDBX_DBI_DIRTY /* DB was written in this txn */ -#define DBI_STALE MDBX_DBI_STALE /* Named-DB record is older than txnID */ -#define DBI_FRESH MDBX_DBI_FRESH /* Named-DB handle opened in this txn */ -#define DBI_CREAT MDBX_DBI_CREAT /* Named-DB handle created in this txn */ -#define DBI_VALID 0x10 /* DB handle is valid, see also DB_VALID */ -#define DBI_USRVALID 0x20 /* As DB_VALID, but not set for FREE_DBI */ -#define DBI_AUDITED 0x40 /* Internal flag for accounting during audit */ - /* Array of flags for each DB */ - uint8_t *mt_dbistate; - /* Number of DB records in use, or 0 when the txn is finished. - * This number only ever increments until the txn finishes; we - * don't decrement it when individual DB handles are closed. */ - MDBX_dbi mt_numdbs; - size_t mt_owner; /* thread ID that owns this transaction */ + +#if MDBX_ENABLE_DBI_SPARSE + unsigned *__restrict mt_dbi_sparse; +#endif /* MDBX_ENABLE_DBI_SPARSE */ + + /* Non-shared DBI state flags inside transaction */ +#define DBI_DIRTY 0x01 /* DB was written in this txn */ +#define DBI_STALE 0x02 /* Named-DB record is older than txnID */ +#define DBI_FRESH 0x04 /* Named-DB handle opened in this txn */ +#define DBI_CREAT 0x08 /* Named-DB handle created in this txn */ +#define DBI_VALID 0x10 /* Handle is valid, see also DB_VALID */ +#define DBI_OLDEN 0x40 /* Handle was closed/reopened outside txn */ +#define DBI_LINDO 0x80 /* Lazy initialization done for DBI-slot */ + /* Array of non-shared txn's flags of DBI */ + uint8_t *__restrict mt_dbi_state; + + /* Array of sequence numbers for each DB handle. 
*/ + uint32_t *__restrict mt_dbi_seqs; + MDBX_cursor **mt_cursors; + MDBX_canary mt_canary; void *mt_userctx; /* User-settable context */ - MDBX_cursor **mt_cursors; union { struct { @@ -3472,8 +3510,8 @@ struct MDBX_txn { struct { meta_troika_t troika; /* In write txns, array of cursors for each DB */ - MDBX_PNL relist; /* Reclaimed GC pages */ - txnid_t last_reclaimed; /* ID of last used record */ + MDBX_PNL __restrict relist; /* Reclaimed GC pages */ + txnid_t last_reclaimed; /* ID of last used record */ #if MDBX_ENABLE_REFUND pgno_t loose_refund_wl /* FIXME: describe */; #endif /* MDBX_ENABLE_REFUND */ @@ -3485,14 +3523,14 @@ struct MDBX_txn { * dirtylist into mt_parent after freeing hidden mt_parent pages. */ size_t dirtyroom; /* For write txns: Modified pages. Sorted when not MDBX_WRITEMAP. */ - MDBX_dpl *dirtylist; + MDBX_dpl *__restrict dirtylist; /* The list of reclaimed txns from GC */ - MDBX_TXL lifo_reclaimed; + MDBX_TXL __restrict lifo_reclaimed; /* The list of pages that became unused during this transaction. */ - MDBX_PNL retired_pages; + MDBX_PNL __restrict retired_pages; /* The list of loose pages that became unused and may be reused * in this transaction, linked through `mp_next`. */ - MDBX_page *loose_pages; + MDBX_page *__restrict loose_pages; /* Number of loose pages (tw.loose_pages) */ size_t loose_count; union { @@ -3501,11 +3539,12 @@ struct MDBX_txn { /* The sorted list of dirty pages we temporarily wrote to disk * because the dirty list was full. page numbers in here are * shifted left by 1, deleted slots have the LSB set. 
*/ - MDBX_PNL list; + MDBX_PNL __restrict list; } spilled; size_t writemap_dirty_npages; size_t writemap_spilled_npages; }; + uint64_t gc_time_acc; } tw; }; }; @@ -3544,8 +3583,8 @@ struct MDBX_cursor { MDBX_db *mc_db; /* The database auxiliary record for this cursor */ MDBX_dbx *mc_dbx; - /* The mt_dbistate for this database */ - uint8_t *mc_dbistate; + /* The mt_dbi_state[] for this DBI */ + uint8_t *__restrict mc_dbi_state; uint8_t mc_snum; /* number of pushed pages */ uint8_t mc_top; /* index of top page, normally mc_snum-1 */ @@ -3598,6 +3637,11 @@ typedef struct MDBX_cursor_couple { MDBX_xcursor inner; } MDBX_cursor_couple; +struct mdbx_defer_free_item { + struct mdbx_defer_free_item *next; + uint64_t timestamp; +}; + /* The database environment. */ struct MDBX_env { /* ----------------------------------------------------- mostly static part */ @@ -3615,6 +3659,7 @@ struct MDBX_env { #define MDBX_DEPRECATED_COALESCE UINT32_C(0x2000000) #define ENV_INTERNAL_FLAGS (MDBX_FATAL_ERROR | MDBX_ENV_ACTIVE | MDBX_ENV_TXKEY) uint32_t me_flags; + unsigned me_psize; /* DB page size, initialized from me_os_psize */ osal_mmap_t me_dxb_mmap; /* The main data file */ #define me_map me_dxb_mmap.base #define me_lazy_fd me_dxb_mmap.fd @@ -3627,7 +3672,6 @@ struct MDBX_env { #define me_lfd me_lck_mmap.fd struct MDBX_lockinfo *me_lck; - unsigned me_psize; /* DB page size, initialized from me_os_psize */ uint16_t me_leaf_nodemax; /* max size of a leaf-node */ uint16_t me_branch_nodemax; /* max size of a branch-node */ uint16_t me_subpage_limit; @@ -3645,13 +3689,15 @@ struct MDBX_env { MDBX_dbi me_maxdbs; /* size of the DB table */ uint32_t me_pid; /* process ID of this env */ osal_thread_key_t me_txkey; /* thread-key for readers */ - pathchar_t *me_pathname; /* path to the DB files */ - void *me_pbuf; /* scratch area for DUPSORT put() */ - MDBX_txn *me_txn0; /* preallocated write transaction */ - - MDBX_dbx *me_dbxs; /* array of static DB info */ - uint16_t *me_dbflags; /* array 
of flags from MDBX_db.md_flags */ - MDBX_atomic_uint32_t *me_dbiseqs; /* array of dbi sequence numbers */ + struct { /* path to the DB files */ + pathchar_t *lck, *dxb, *specified; + void *buffer; + } me_pathname; + void *me_pbuf; /* scratch area for DUPSORT put() */ + MDBX_txn *me_txn0; /* preallocated write transaction */ + MDBX_dbx *me_dbxs; /* array of static DB info */ + uint16_t *__restrict me_db_flags; /* array of flags from MDBX_db.md_flags */ + MDBX_atomic_uint32_t *me_dbi_seqs; /* array of dbi sequence numbers */ unsigned me_maxgc_ov1page; /* Number of pgno_t fit in a single overflow page */ unsigned me_maxgc_per_branch; @@ -3665,6 +3711,7 @@ struct MDBX_env { unsigned rp_augment_limit; unsigned dp_limit; unsigned dp_initial; + uint64_t gc_time_limit; uint8_t dp_loose_limit; uint8_t spill_max_denominator; uint8_t spill_min_denominator; @@ -3674,6 +3721,8 @@ struct MDBX_env { unsigned writethrough_threshold; #endif /* Windows */ bool prefault_write; + bool prefer_waf_insteadof_balance; /* Strive to minimize WAF instead of + balancing pages fullment */ union { unsigned all; /* tracks options with non-auto values but tuned by user */ @@ -3703,20 +3752,23 @@ struct MDBX_env { } me_sysv_ipc; #endif /* MDBX_LOCKING == MDBX_LOCKING_SYSV */ bool me_incore; + bool me_prefault_write; - MDBX_env *me_lcklist_next; +#if MDBX_ENABLE_DBI_LOCKFREE + struct mdbx_defer_free_item *me_defer_free; +#endif /* MDBX_ENABLE_DBI_LOCKFREE */ /* --------------------------------------------------- mostly volatile part */ MDBX_txn *me_txn; /* current write transaction */ osal_fastmutex_t me_dbi_lock; - MDBX_dbi me_numdbs; /* number of DBs opened */ - bool me_prefault_write; + unsigned me_numdbs; /* number of DBs opened */ - MDBX_page *me_dp_reserve; /* list of malloc'ed blocks for re-use */ unsigned me_dp_reserve_len; + MDBX_page *__restrict me_dp_reserve; /* list of malloc'ed blocks for re-use */ + /* PNL of pages that became unused in a write txn */ - MDBX_PNL me_retired_pages; + 
MDBX_PNL __restrict me_retired_pages; osal_ioring_t me_ioring; #if defined(_WIN32) || defined(_WIN64) @@ -3734,13 +3786,12 @@ struct MDBX_env { #if MDBX_DEBUG MDBX_assert_func *me_assert_func; /* Callback for assertion failures */ #endif -#ifdef MDBX_USE_VALGRIND +#ifdef ENABLE_MEMCHECK int me_valgrind_handle; #endif -#if defined(MDBX_USE_VALGRIND) || defined(__SANITIZE_ADDRESS__) - MDBX_atomic_uint32_t me_ignore_EDEADLK; +#if defined(ENABLE_MEMCHECK) || defined(__SANITIZE_ADDRESS__) pgno_t me_poison_edge; -#endif /* MDBX_USE_VALGRIND || __SANITIZE_ADDRESS__ */ +#endif /* ENABLE_MEMCHECK || __SANITIZE_ADDRESS__ */ #ifndef xMDBX_DEBUG_SPILLING #define xMDBX_DEBUG_SPILLING 0 @@ -3800,10 +3851,6 @@ osal_flush_incoherent_mmap(const void *addr, size_t nbytes, MDBX_INTERNAL_FUNC int cleanup_dead_readers(MDBX_env *env, int rlocked, int *dead); -MDBX_INTERNAL_FUNC int rthc_alloc(osal_thread_key_t *key, MDBX_reader *begin, - MDBX_reader *end); -MDBX_INTERNAL_FUNC void rthc_remove(const osal_thread_key_t key); - MDBX_INTERNAL_FUNC void global_ctor(void); MDBX_INTERNAL_FUNC void osal_ctor(void); MDBX_INTERNAL_FUNC void global_dtor(void); @@ -3918,7 +3965,8 @@ typedef struct MDBX_node { /* mdbx_dbi_open() flags */ #define DB_USABLE_FLAGS (DB_PERSISTENT_FLAGS | MDBX_CREATE | MDBX_DB_ACCEDE) -#define DB_VALID 0x8000 /* DB handle is valid, for me_dbflags */ +#define DB_VALID 0x8000u /* DB handle is valid, for me_db_flags */ +#define DB_POISON 0x7fffu /* update pending */ #define DB_INTERNAL_FLAGS DB_VALID #if DB_INTERNAL_FLAGS & DB_USABLE_FLAGS @@ -4007,11 +4055,11 @@ log2n_powerof2(size_t value_uintptr) { * environment and re-opening it with the new flags. 
*/ #define ENV_CHANGEABLE_FLAGS \ (MDBX_SAFE_NOSYNC | MDBX_NOMETASYNC | MDBX_DEPRECATED_MAPASYNC | \ - MDBX_NOMEMINIT | MDBX_COALESCE | MDBX_PAGEPERTURB | MDBX_ACCEDE | \ - MDBX_VALIDATION) + MDBX_NOMEMINIT | MDBX_DEPRECATED_COALESCE | MDBX_PAGEPERTURB | \ + MDBX_ACCEDE | MDBX_VALIDATION) #define ENV_CHANGELESS_FLAGS \ - (MDBX_NOSUBDIR | MDBX_RDONLY | MDBX_WRITEMAP | MDBX_NOTLS | MDBX_NORDAHEAD | \ - MDBX_LIFORECLAIM | MDBX_EXCLUSIVE) + (MDBX_NOSUBDIR | MDBX_RDONLY | MDBX_WRITEMAP | MDBX_NOSTICKYTHREADS | \ + MDBX_NORDAHEAD | MDBX_LIFORECLAIM | MDBX_EXCLUSIVE) #define ENV_USABLE_FLAGS (ENV_CHANGEABLE_FLAGS | ENV_CHANGELESS_FLAGS) #if !defined(__cplusplus) || CONSTEXPR_ENUM_FLAGS_OPERATIONS @@ -4044,8 +4092,38 @@ MDBX_MAYBE_UNUSED static void static_checks(void) { (size_t)(size), __LINE__); \ ASAN_UNPOISON_MEMORY_REGION(addr, size); \ } while (0) + +/******************************************************************************/ + +/** \brief Page types for traverse the b-tree. + * \see mdbx_env_pgwalk() \see MDBX_pgvisitor_func */ +enum MDBX_page_type_t { + MDBX_page_broken, + MDBX_page_large, + MDBX_page_branch, + MDBX_page_leaf, + MDBX_page_dupfixed_leaf, + MDBX_subpage_leaf, + MDBX_subpage_dupfixed_leaf, + MDBX_subpage_broken, +}; +typedef enum MDBX_page_type_t MDBX_page_type_t; + +typedef struct MDBX_walk_sdb { + MDBX_val name; + struct MDBX_db *internal, *nested; +} MDBX_walk_sdb_t; + +/** \brief Callback function for traverse the b-tree. \see mdbx_env_pgwalk() */ +typedef int +MDBX_pgvisitor_func(const size_t pgno, const unsigned number, void *const ctx, + const int deep, const MDBX_walk_sdb_t *subdb, + const size_t page_size, const MDBX_page_type_t page_type, + const MDBX_error_t err, const size_t nentries, + const size_t payload_bytes, const size_t header_bytes, + const size_t unused_bytes); /* - * Copyright 2015-2023 Leonid Yuriev . + * Copyright 2015-2024 Leonid Yuriev . * and other libmdbx authors: please see AUTHORS file. * All rights reserved. 
* @@ -4503,6 +4581,19 @@ static __inline size_t keysize_max(size_t pagesize, MDBX_db_flags_t flags) { return max_branch_key; } +static __inline size_t keysize_min(MDBX_db_flags_t flags) { + return (flags & MDBX_INTEGERKEY) ? 4 /* sizeof(uint32_t) */ : 0; +} + +static __inline size_t valsize_min(MDBX_db_flags_t flags) { + if (flags & MDBX_INTEGERDUP) + return 4 /* sizeof(uint32_t) */; + else if (flags & MDBX_DUPFIXED) + return sizeof(indx_t); + else + return 0; +} + static __inline size_t valsize_max(size_t pagesize, MDBX_db_flags_t flags) { assert(pagesize >= MIN_PAGESIZE && pagesize <= MAX_PAGESIZE && is_powerof2(pagesize)); @@ -4555,6 +4646,10 @@ __cold intptr_t mdbx_limits_keysize_max(intptr_t pagesize, return keysize_max(pagesize, flags); } +__cold intptr_t mdbx_limits_keysize_min(MDBX_db_flags_t flags) { + return keysize_min(flags); +} + __cold int mdbx_env_get_maxvalsize_ex(const MDBX_env *env, MDBX_db_flags_t flags) { if (unlikely(!env || env->me_signature.weak != MDBX_ME_SIGNATURE)) @@ -4575,6 +4670,10 @@ __cold intptr_t mdbx_limits_valsize_max(intptr_t pagesize, return valsize_max(pagesize, flags); } +__cold intptr_t mdbx_limits_valsize_min(MDBX_db_flags_t flags) { + return valsize_min(flags); +} + __cold intptr_t mdbx_limits_pairsize4page_max(intptr_t pagesize, MDBX_db_flags_t flags) { if (pagesize < 1) @@ -5173,10 +5272,12 @@ MDBX_MAYBE_UNUSED static /*----------------------------------------------------------------------------*/ /* rthc (tls keys and destructors) */ +static int rthc_register(MDBX_env *const env); +static int rthc_remove(MDBX_env *const env); +static int rthc_uniq_check(const osal_mmap_t *pending, MDBX_env **found); + typedef struct rthc_entry_t { - MDBX_reader *begin; - MDBX_reader *end; - osal_thread_key_t thr_tls_key; + MDBX_env *env; } rthc_entry_t; #if MDBX_DEBUG @@ -5189,10 +5290,8 @@ static bin128_t bootid; #if defined(_WIN32) || defined(_WIN64) static CRITICAL_SECTION rthc_critical_section; -static CRITICAL_SECTION 
lcklist_critical_section; #else -static pthread_mutex_t lcklist_mutex = PTHREAD_MUTEX_INITIALIZER; static pthread_mutex_t rthc_mutex = PTHREAD_MUTEX_INITIALIZER; static pthread_cond_t rthc_cond = PTHREAD_COND_INITIALIZER; static osal_thread_key_t rthc_key; @@ -5391,17 +5490,24 @@ static void thread_rthc_set(osal_thread_key_t key, const void *value) { /* dtor called for thread, i.e. for all mdbx's environment objects */ __cold void thread_dtor(void *rthc) { rthc_lock(); - TRACE(">> pid %d, thread 0x%" PRIxPTR ", rthc %p", osal_getpid(), + const uint32_t current_pid = osal_getpid(); + TRACE(">> pid %d, thread 0x%" PRIxPTR ", rthc %p", current_pid, osal_thread_self(), rthc); - const uint32_t self_pid = osal_getpid(); for (size_t i = 0; i < rthc_count; ++i) { - const osal_thread_key_t key = rthc_table[i].thr_tls_key; - MDBX_reader *const reader = thread_rthc_get(key); - if (reader < rthc_table[i].begin || reader >= rthc_table[i].end) + MDBX_env *const env = rthc_table[i].env; + if (env->me_pid != current_pid) + continue; + if (!(env->me_flags & MDBX_ENV_TXKEY)) + continue; + MDBX_reader *const reader = thread_rthc_get(env->me_txkey); + MDBX_reader *const begin = &env->me_lck_mmap.lck->mti_readers[0]; + MDBX_reader *const end = + &env->me_lck_mmap.lck->mti_readers[env->me_maxreaders]; + if (reader < begin || reader >= end) continue; #if !defined(_WIN32) && !defined(_WIN64) - if (pthread_setspecific(key, nullptr) != 0) { + if (pthread_setspecific(env->me_txkey, nullptr) != 0) { TRACE("== thread 0x%" PRIxPTR ", rthc %p: ignore race with tsd-key deletion", osal_thread_self(), __Wpedantic_format_voidptr(reader)); @@ -5413,13 +5519,13 @@ __cold void thread_dtor(void *rthc) { ", rthc %p, [%zi], %p ... 
%p (%+i), rtch-pid %i, " "current-pid %i", osal_thread_self(), __Wpedantic_format_voidptr(reader), i, - __Wpedantic_format_voidptr(rthc_table[i].begin), - __Wpedantic_format_voidptr(rthc_table[i].end), - (int)(reader - rthc_table[i].begin), reader->mr_pid.weak, self_pid); - if (atomic_load32(&reader->mr_pid, mo_Relaxed) == self_pid) { + __Wpedantic_format_voidptr(begin), __Wpedantic_format_voidptr(end), + (int)(reader - begin), reader->mr_pid.weak, current_pid); + if (atomic_load32(&reader->mr_pid, mo_Relaxed) == current_pid) { TRACE("==== thread 0x%" PRIxPTR ", rthc %p, cleanup", osal_thread_self(), __Wpedantic_format_voidptr(reader)); - (void)atomic_cas32(&reader->mr_pid, self_pid, 0); + (void)atomic_cas32(&reader->mr_pid, current_pid, 0); + atomic_store32(&env->me_lck->mti_readers_refresh_flag, true, mo_Relaxed); } } @@ -5462,16 +5568,21 @@ __cold void thread_dtor(void *rthc) { #endif } +MDBX_INTERNAL_VAR_INSTA struct mdbx_static mdbx_static = { + MDBX_RUNTIME_FLAGS_INIT, MDBX_LOG_FATAL, {nullptr}, 0, nullptr}; +static osal_fastmutex_t debug_lock; + MDBX_EXCLUDE_FOR_GPROF __cold void global_dtor(void) { - TRACE(">> pid %d", osal_getpid()); + const uint32_t current_pid = osal_getpid(); + TRACE(">> pid %d", current_pid); rthc_lock(); #if !defined(_WIN32) && !defined(_WIN64) uint64_t *rthc = pthread_getspecific(rthc_key); TRACE("== thread 0x%" PRIxPTR ", rthc %p, pid %d, self-status 0x%08" PRIx64 ", left %d", - osal_thread_self(), __Wpedantic_format_voidptr(rthc), osal_getpid(), + osal_thread_self(), __Wpedantic_format_voidptr(rthc), current_pid, rthc ? 
rthc_read(rthc) : ~UINT64_C(0), atomic_load32(&rthc_pending, mo_Relaxed)); if (rthc) { @@ -5482,20 +5593,20 @@ __cold void global_dtor(void) { rthc_compare_and_clean(rthc, sign_registered)) { TRACE("== thread 0x%" PRIxPTR ", rthc %p, pid %d, self-status %s (0x%08" PRIx64 ")", - osal_thread_self(), __Wpedantic_format_voidptr(rthc), osal_getpid(), + osal_thread_self(), __Wpedantic_format_voidptr(rthc), current_pid, "registered", state); } else if (state == sign_counted && rthc_compare_and_clean(rthc, sign_counted)) { TRACE("== thread 0x%" PRIxPTR ", rthc %p, pid %d, self-status %s (0x%08" PRIx64 ")", - osal_thread_self(), __Wpedantic_format_voidptr(rthc), osal_getpid(), + osal_thread_self(), __Wpedantic_format_voidptr(rthc), current_pid, "counted", state); ENSURE(nullptr, atomic_sub32(&rthc_pending, 1) > 0); } else { WARNING("thread 0x%" PRIxPTR ", rthc %p, pid %d, self-status %s (0x%08" PRIx64 ")", - osal_thread_self(), __Wpedantic_format_voidptr(rthc), - osal_getpid(), "wrong", state); + osal_thread_self(), __Wpedantic_format_voidptr(rthc), current_pid, + "wrong", state); } } @@ -5512,7 +5623,7 @@ __cold void global_dtor(void) { for (unsigned left; (left = atomic_load32(&rthc_pending, mo_AcquireRelease)) > 0;) { - NOTICE("tls-cleanup: pid %d, pending %u, wait for...", osal_getpid(), left); + NOTICE("tls-cleanup: pid %d, pending %u, wait for...", current_pid, left); const int rc = pthread_cond_timedwait(&rthc_cond, &rthc_mutex, &abstime); if (rc && rc != EINTR) break; @@ -5520,23 +5631,31 @@ __cold void global_dtor(void) { thread_key_delete(rthc_key); #endif - const uint32_t self_pid = osal_getpid(); for (size_t i = 0; i < rthc_count; ++i) { - const osal_thread_key_t key = rthc_table[i].thr_tls_key; - thread_key_delete(key); - for (MDBX_reader *rthc = rthc_table[i].begin; rthc < rthc_table[i].end; - ++rthc) { + MDBX_env *const env = rthc_table[i].env; + if (env->me_pid != current_pid) + continue; + if (!(env->me_flags & MDBX_ENV_TXKEY)) + continue; + MDBX_reader 
*const begin = &env->me_lck_mmap.lck->mti_readers[0]; + MDBX_reader *const end = + &env->me_lck_mmap.lck->mti_readers[env->me_maxreaders]; + thread_key_delete(env->me_txkey); + bool cleaned = false; + for (MDBX_reader *reader = begin; reader < end; ++reader) { TRACE("== [%zi] = key %" PRIuPTR ", %p ... %p, rthc %p (%+i), " "rthc-pid %i, current-pid %i", - i, (uintptr_t)key, __Wpedantic_format_voidptr(rthc_table[i].begin), - __Wpedantic_format_voidptr(rthc_table[i].end), - __Wpedantic_format_voidptr(rthc), (int)(rthc - rthc_table[i].begin), - rthc->mr_pid.weak, self_pid); - if (atomic_load32(&rthc->mr_pid, mo_Relaxed) == self_pid) { - atomic_store32(&rthc->mr_pid, 0, mo_AcquireRelease); - TRACE("== cleanup %p", __Wpedantic_format_voidptr(rthc)); + i, (uintptr_t)env->me_txkey, __Wpedantic_format_voidptr(begin), + __Wpedantic_format_voidptr(end), __Wpedantic_format_voidptr(reader), + (int)(reader - begin), reader->mr_pid.weak, current_pid); + if (atomic_load32(&reader->mr_pid, mo_Relaxed) == current_pid) { + (void)atomic_cas32(&reader->mr_pid, current_pid, 0); + TRACE("== cleanup %p", __Wpedantic_format_voidptr(reader)); + cleaned = true; } } + if (cleaned) + atomic_store32(&env->me_lck->mti_readers_refresh_flag, true, mo_Relaxed); } rthc_limit = rthc_count = 0; @@ -5546,7 +5665,6 @@ __cold void global_dtor(void) { rthc_unlock(); #if defined(_WIN32) || defined(_WIN64) - DeleteCriticalSection(&lcklist_critical_section); DeleteCriticalSection(&rthc_critical_section); #else /* LY: yielding a few timeslices to give a more chance @@ -5555,24 +5673,27 @@ __cold void global_dtor(void) { #endif osal_dtor(); - TRACE("<< pid %d\n", osal_getpid()); + TRACE("<< pid %d\n", current_pid); + ENSURE(nullptr, osal_fastmutex_destroy(&debug_lock) == 0); } -__cold int rthc_alloc(osal_thread_key_t *pkey, MDBX_reader *begin, - MDBX_reader *end) { - assert(pkey != NULL); -#ifndef NDEBUG - *pkey = (osal_thread_key_t)0xBADBADBAD; -#endif /* NDEBUG */ +__cold int rthc_register(MDBX_env *const 
env) { + TRACE(">> env %p, rthc_count %u, rthc_limit %u", + __Wpedantic_format_voidptr(env), rthc_count, rthc_limit); - rthc_lock(); - TRACE(">> rthc_count %u, rthc_limit %u", rthc_count, rthc_limit); - int rc; - if (rthc_count == rthc_limit) { + int rc = MDBX_SUCCESS; + for (size_t i = 0; i < rthc_count; ++i) + if (unlikely(rthc_table[i].env == env)) { + rc = MDBX_PANIC; + goto bailout; + } + + env->me_txkey = 0; + if (unlikely(rthc_count == rthc_limit)) { rthc_entry_t *new_table = osal_realloc((rthc_table == rthc_table_static) ? nullptr : rthc_table, sizeof(rthc_entry_t) * rthc_limit * 2); - if (new_table == nullptr) { + if (unlikely(new_table == nullptr)) { rc = MDBX_ENOMEM; goto bailout; } @@ -5582,84 +5703,92 @@ __cold int rthc_alloc(osal_thread_key_t *pkey, MDBX_reader *begin, rthc_limit *= 2; } - rc = thread_key_create(&rthc_table[rthc_count].thr_tls_key); - if (rc != MDBX_SUCCESS) - goto bailout; - - *pkey = rthc_table[rthc_count].thr_tls_key; - TRACE("== [%i] = key %" PRIuPTR ", %p ... 
%p", rthc_count, (uintptr_t)*pkey, - __Wpedantic_format_voidptr(begin), __Wpedantic_format_voidptr(end)); + if ((env->me_flags & MDBX_NOSTICKYTHREADS) == 0) { + rc = thread_key_create(&env->me_txkey); + if (unlikely(rc != MDBX_SUCCESS)) + goto bailout; + env->me_flags |= MDBX_ENV_TXKEY; + } - rthc_table[rthc_count].begin = begin; - rthc_table[rthc_count].end = end; + rthc_table[rthc_count].env = env; + TRACE("== [%i] = env %p, key %" PRIuPTR, rthc_count, + __Wpedantic_format_voidptr(env), (uintptr_t)env->me_txkey); ++rthc_count; - TRACE("<< key %" PRIuPTR ", rthc_count %u, rthc_limit %u", (uintptr_t)*pkey, - rthc_count, rthc_limit); - rthc_unlock(); - return MDBX_SUCCESS; bailout: - rthc_unlock(); + TRACE("<< env %p, key %" PRIuPTR ", rthc_count %u, rthc_limit %u, rc %d", + __Wpedantic_format_voidptr(env), (uintptr_t)env->me_txkey, rthc_count, + rthc_limit, rc); return rc; } +__cold static int rthc_drown(MDBX_env *const env) { + const uint32_t current_pid = osal_getpid(); + int rc = MDBX_SUCCESS; + MDBX_env *inprocess_neighbor = nullptr; + if (likely(env->me_lck_mmap.lck && current_pid == env->me_pid)) { + MDBX_reader *const begin = &env->me_lck_mmap.lck->mti_readers[0]; + MDBX_reader *const end = + &env->me_lck_mmap.lck->mti_readers[env->me_maxreaders]; + TRACE("== %s env %p pid %d, readers %p ...%p, current-pid %d", + (current_pid == env->me_pid) ? 
"cleanup" : "skip", + __Wpedantic_format_voidptr(env), env->me_pid, + __Wpedantic_format_voidptr(begin), __Wpedantic_format_voidptr(end), + current_pid); + bool cleaned = false; + for (MDBX_reader *r = begin; r < end; ++r) { + if (atomic_load32(&r->mr_pid, mo_Relaxed) == current_pid) { + atomic_store32(&r->mr_pid, 0, mo_AcquireRelease); + TRACE("== cleanup %p", __Wpedantic_format_voidptr(r)); + cleaned = true; + } + } + if (cleaned) + atomic_store32(&env->me_lck_mmap.lck->mti_readers_refresh_flag, true, + mo_Relaxed); + rc = rthc_uniq_check(&env->me_lck_mmap, &inprocess_neighbor); + if (!inprocess_neighbor && env->me_live_reader && + env->me_lfd != INVALID_HANDLE_VALUE) { + int err = osal_rpid_clear(env); + rc = rc ? rc : err; + } + } + int err = osal_lck_destroy(env, inprocess_neighbor, current_pid); + env->me_pid = 0; + return rc ? rc : err; +} -__cold void rthc_remove(const osal_thread_key_t key) { - thread_key_delete(key); - rthc_lock(); - TRACE(">> key %zu, rthc_count %u, rthc_limit %u", (uintptr_t)key, rthc_count, +__cold static int rthc_remove(MDBX_env *const env) { + TRACE(">>> env %p, key %zu, rthc_count %u, rthc_limit %u", + __Wpedantic_format_voidptr(env), (uintptr_t)env->me_txkey, rthc_count, rthc_limit); + int rc = MDBX_SUCCESS; + if (likely(env->me_pid)) + rc = rthc_drown(env); + for (size_t i = 0; i < rthc_count; ++i) { - if (key == rthc_table[i].thr_tls_key) { - const uint32_t self_pid = osal_getpid(); - TRACE("== [%zi], %p ...%p, current-pid %d", i, - __Wpedantic_format_voidptr(rthc_table[i].begin), - __Wpedantic_format_voidptr(rthc_table[i].end), self_pid); - - for (MDBX_reader *rthc = rthc_table[i].begin; rthc < rthc_table[i].end; - ++rthc) { - if (atomic_load32(&rthc->mr_pid, mo_Relaxed) == self_pid) { - atomic_store32(&rthc->mr_pid, 0, mo_AcquireRelease); - TRACE("== cleanup %p", __Wpedantic_format_voidptr(rthc)); - } - } + if (rthc_table[i].env == env) { if (--rthc_count > 0) rthc_table[i] = rthc_table[rthc_count]; else if (rthc_table != 
rthc_table_static) { - osal_free(rthc_table); + void *tmp = rthc_table; rthc_table = rthc_table_static; rthc_limit = RTHC_INITIAL_LIMIT; + osal_memory_barrier(); + osal_free(tmp); } break; } } - TRACE("<< key %zu, rthc_count %u, rthc_limit %u", (size_t)key, rthc_count, + TRACE("<<< %p, key %zu, rthc_count %u, rthc_limit %u", + __Wpedantic_format_voidptr(env), (uintptr_t)env->me_txkey, rthc_count, rthc_limit); - rthc_unlock(); + return rc; } //------------------------------------------------------------------------------ -#define RTHC_ENVLIST_END ((MDBX_env *)((uintptr_t)50459)) -static MDBX_env *inprocess_lcklist_head = RTHC_ENVLIST_END; - -static __inline void lcklist_lock(void) { -#if defined(_WIN32) || defined(_WIN64) - EnterCriticalSection(&lcklist_critical_section); -#else - ENSURE(nullptr, osal_pthread_mutex_lock(&lcklist_mutex) == 0); -#endif -} - -static __inline void lcklist_unlock(void) { -#if defined(_WIN32) || defined(_WIN64) - LeaveCriticalSection(&lcklist_critical_section); -#else - ENSURE(nullptr, pthread_mutex_unlock(&lcklist_mutex) == 0); -#endif -} - MDBX_NOTHROW_CONST_FUNCTION static uint64_t rrxmrrxmsx_0(uint64_t v) { /* Pelle Evensen's mixer, https://bit.ly/2HOfynt */ v ^= (v << 39 | v >> 25) ^ (v << 14 | v >> 50); @@ -5712,13 +5841,16 @@ static int uniq_poke(const osal_mmap_t *pending, osal_mmap_t *scan, return uniq_peek(pending, scan); } -__cold static int uniq_check(const osal_mmap_t *pending, MDBX_env **found) { +__cold static int rthc_uniq_check(const osal_mmap_t *pending, + MDBX_env **found) { *found = nullptr; uint64_t salt = 0; - for (MDBX_env *scan = inprocess_lcklist_head; scan != RTHC_ENVLIST_END; - scan = scan->me_lcklist_next) { - MDBX_lockinfo *const scan_lck = scan->me_lck_mmap.lck; - int err = atomic_load64(&scan_lck->mti_bait_uniqueness, mo_AcquireRelease) + for (size_t i = 0; i < rthc_count; ++i) { + MDBX_env *const scan = rthc_table[i].env; + if (!scan->me_lck_mmap.lck || &scan->me_lck_mmap == pending) + continue; + int err = 
atomic_load64(&scan->me_lck_mmap.lck->mti_bait_uniqueness, + mo_AcquireRelease) ? uniq_peek(pending, &scan->me_lck_mmap) : uniq_poke(pending, &scan->me_lck_mmap, &salt); if (err == MDBX_ENODATA) { @@ -5726,8 +5858,8 @@ __cold static int uniq_check(const osal_mmap_t *pending, MDBX_env **found) { if (likely(osal_filesize(pending->fd, &length) == MDBX_SUCCESS && length == 0)) { /* LY: skip checking since LCK-file is empty, i.e. just created. */ - DEBUG("uniq-probe: %s", "unique (new/empty lck)"); - return MDBX_RESULT_TRUE; + DEBUG("%s", "unique (new/empty lck)"); + return MDBX_SUCCESS; } } if (err == MDBX_RESULT_TRUE) @@ -5740,44 +5872,17 @@ __cold static int uniq_check(const osal_mmap_t *pending, MDBX_env **found) { if (err == MDBX_RESULT_TRUE) { err = uniq_poke(pending, &scan->me_lck_mmap, &salt); *found = scan; - DEBUG("uniq-probe: found %p", __Wpedantic_format_voidptr(*found)); - return MDBX_RESULT_FALSE; + DEBUG("found %p", __Wpedantic_format_voidptr(*found)); + return MDBX_SUCCESS; } if (unlikely(err != MDBX_SUCCESS)) { - DEBUG("uniq-probe: failed rc %d", err); + DEBUG("failed rc %d", err); return err; } } - DEBUG("uniq-probe: %s", "unique"); - return MDBX_RESULT_TRUE; -} - -static int lcklist_detach_locked(MDBX_env *env) { - MDBX_env *inprocess_neighbor = nullptr; - int rc = MDBX_SUCCESS; - if (env->me_lcklist_next != nullptr) { - ENSURE(env, env->me_lcklist_next != nullptr); - ENSURE(env, inprocess_lcklist_head != RTHC_ENVLIST_END); - for (MDBX_env **ptr = &inprocess_lcklist_head; *ptr != RTHC_ENVLIST_END; - ptr = &(*ptr)->me_lcklist_next) { - if (*ptr == env) { - *ptr = env->me_lcklist_next; - env->me_lcklist_next = nullptr; - break; - } - } - ENSURE(env, env->me_lcklist_next == nullptr); - } - - rc = likely(osal_getpid() == env->me_pid) - ? 
uniq_check(&env->me_lck_mmap, &inprocess_neighbor) - : MDBX_PANIC; - if (!inprocess_neighbor && env->me_live_reader) - (void)osal_rpid_clear(env); - if (!MDBX_IS_ERROR(rc)) - rc = osal_lck_destroy(env, inprocess_neighbor); - return rc; + DEBUG("%s", "unique"); + return MDBX_SUCCESS; } /*------------------------------------------------------------------------------ @@ -6306,7 +6411,7 @@ static void pnl_free(MDBX_PNL pl) { } /* Shrink the PNL to the default size if it has grown larger */ -static void pnl_shrink(MDBX_PNL *ppl) { +static void pnl_shrink(MDBX_PNL __restrict *__restrict ppl) { assert(pnl_bytes2size(pnl_size2bytes(MDBX_PNL_INITIAL)) >= MDBX_PNL_INITIAL && pnl_bytes2size(pnl_size2bytes(MDBX_PNL_INITIAL)) < MDBX_PNL_INITIAL * 3 / 2); @@ -6329,7 +6434,8 @@ static void pnl_shrink(MDBX_PNL *ppl) { } /* Grow the PNL to the size growed to at least given size */ -static int pnl_reserve(MDBX_PNL *ppl, const size_t wanna) { +static int pnl_reserve(MDBX_PNL __restrict *__restrict ppl, + const size_t wanna) { const size_t allocated = MDBX_PNL_ALLOCLEN(*ppl); assert(MDBX_PNL_GETSIZE(*ppl) <= MDBX_PGL_LIMIT && MDBX_PNL_ALLOCLEN(*ppl) >= MDBX_PNL_GETSIZE(*ppl)); @@ -6359,8 +6465,8 @@ static int pnl_reserve(MDBX_PNL *ppl, const size_t wanna) { } /* Make room for num additional elements in an PNL */ -static __always_inline int __must_check_result pnl_need(MDBX_PNL *ppl, - size_t num) { +static __always_inline int __must_check_result +pnl_need(MDBX_PNL __restrict *__restrict ppl, size_t num) { assert(MDBX_PNL_GETSIZE(*ppl) <= MDBX_PGL_LIMIT && MDBX_PNL_ALLOCLEN(*ppl) >= MDBX_PNL_GETSIZE(*ppl)); assert(num <= MDBX_PGL_LIMIT); @@ -6369,7 +6475,7 @@ static __always_inline int __must_check_result pnl_need(MDBX_PNL *ppl, : pnl_reserve(ppl, wanna); } -static __always_inline void pnl_xappend(MDBX_PNL pl, pgno_t pgno) { +static __always_inline void pnl_xappend(__restrict MDBX_PNL pl, pgno_t pgno) { assert(MDBX_PNL_GETSIZE(pl) < MDBX_PNL_ALLOCLEN(pl)); if (AUDIT_ENABLED()) { for 
(size_t i = MDBX_PNL_GETSIZE(pl); i > 0; --i) @@ -6380,10 +6486,8 @@ static __always_inline void pnl_xappend(MDBX_PNL pl, pgno_t pgno) { } /* Append an pgno range onto an unsorted PNL */ -__always_inline static int __must_check_result pnl_append_range(bool spilled, - MDBX_PNL *ppl, - pgno_t pgno, - size_t n) { +__always_inline static int __must_check_result pnl_append_range( + bool spilled, __restrict MDBX_PNL *ppl, pgno_t pgno, size_t n) { assert(n > 0); int rc = pnl_need(ppl, n); if (unlikely(rc != MDBX_SUCCESS)) @@ -6410,7 +6514,7 @@ __always_inline static int __must_check_result pnl_append_range(bool spilled, } /* Append an pgno range into the sorted PNL */ -__hot static int __must_check_result pnl_insert_range(MDBX_PNL *ppl, +__hot static int __must_check_result pnl_insert_range(__restrict MDBX_PNL *ppl, pgno_t pgno, size_t n) { assert(n > 0); int rc = pnl_need(ppl, n); @@ -6714,7 +6818,8 @@ static void txl_free(MDBX_TXL tl) { osal_free(tl - 1); } -static int txl_reserve(MDBX_TXL *ptl, const size_t wanna) { +static int txl_reserve(MDBX_TXL __restrict *__restrict ptl, + const size_t wanna) { const size_t allocated = (size_t)MDBX_PNL_ALLOCLEN(*ptl); assert(MDBX_PNL_GETSIZE(*ptl) <= MDBX_TXL_MAX && MDBX_PNL_ALLOCLEN(*ptl) >= MDBX_PNL_GETSIZE(*ptl)); @@ -6743,8 +6848,8 @@ static int txl_reserve(MDBX_TXL *ptl, const size_t wanna) { return MDBX_ENOMEM; } -static __always_inline int __must_check_result txl_need(MDBX_TXL *ptl, - size_t num) { +static __always_inline int __must_check_result +txl_need(MDBX_TXL __restrict *__restrict ptl, size_t num) { assert(MDBX_PNL_GETSIZE(*ptl) <= MDBX_TXL_MAX && MDBX_PNL_ALLOCLEN(*ptl) >= MDBX_PNL_GETSIZE(*ptl)); assert(num <= MDBX_PGL_LIMIT); @@ -6753,7 +6858,7 @@ static __always_inline int __must_check_result txl_need(MDBX_TXL *ptl, : txl_reserve(ptl, wanna); } -static __always_inline void txl_xappend(MDBX_TXL tl, txnid_t id) { +static __always_inline void txl_xappend(MDBX_TXL __restrict tl, txnid_t id) { 
assert(MDBX_PNL_GETSIZE(tl) < MDBX_PNL_ALLOCLEN(tl)); tl[0] += 1; MDBX_PNL_LAST(tl) = id; @@ -6765,7 +6870,8 @@ static void txl_sort(MDBX_TXL tl) { txnid_sort(MDBX_PNL_BEGIN(tl), MDBX_PNL_END(tl)); } -static int __must_check_result txl_append(MDBX_TXL *ptl, txnid_t id) { +static int __must_check_result txl_append(MDBX_TXL __restrict *ptl, + txnid_t id) { if (unlikely(MDBX_PNL_GETSIZE(*ptl) == MDBX_PNL_ALLOCLEN(*ptl))) { int rc = txl_need(ptl, MDBX_TXL_GRANULATE); if (unlikely(rc != MDBX_SUCCESS)) @@ -7251,10 +7357,6 @@ static __always_inline int __must_check_result dpl_append(MDBX_txn *txn, /*----------------------------------------------------------------------------*/ -uint8_t runtime_flags = MDBX_RUNTIME_FLAGS_INIT; -uint8_t loglevel = MDBX_LOG_FATAL; -MDBX_debug_func *debug_logger; - static __must_check_result __inline int page_retire(MDBX_cursor *mc, MDBX_page *mp); @@ -7273,26 +7375,26 @@ static int page_touch(MDBX_cursor *mc); static int cursor_touch(MDBX_cursor *const mc, const MDBX_val *key, const MDBX_val *data); -#define MDBX_END_NAMES \ +#define TXN_END_NAMES \ { \ "committed", "empty-commit", "abort", "reset", "reset-tmp", "fail-begin", \ "fail-beginchild" \ } enum { /* txn_end operation number, for logging */ - MDBX_END_COMMITTED, - MDBX_END_PURE_COMMIT, - MDBX_END_ABORT, - MDBX_END_RESET, - MDBX_END_RESET_TMP, - MDBX_END_FAIL_BEGIN, - MDBX_END_FAIL_BEGINCHILD + TXN_END_COMMITTED, + TXN_END_PURE_COMMIT, + TXN_END_ABORT, + TXN_END_RESET, + TXN_END_RESET_TMP, + TXN_END_FAIL_BEGIN, + TXN_END_FAIL_BEGINCHILD }; -#define MDBX_END_OPMASK 0x0F /* mask for txn_end() operation number */ -#define MDBX_END_UPDATE 0x10 /* update env state (DBIs) */ -#define MDBX_END_FREE 0x20 /* free txn unless it is MDBX_env.me_txn0 */ -#define MDBX_END_EOTDONE 0x40 /* txn's cursors already closed */ -#define MDBX_END_SLOT 0x80 /* release any reader slot if MDBX_NOTLS */ +#define TXN_END_OPMASK 0x0F /* mask for txn_end() operation number */ +#define TXN_END_UPDATE 0x10 /* update 
env state (DBIs) */ +#define TXN_END_FREE 0x20 /* free txn unless it is MDBX_env.me_txn0 */ +#define TXN_END_EOTDONE 0x40 /* txn's cursors already closed */ +#define TXN_END_SLOT 0x80 /* release any reader slot if NOSTICKYTHREADS */ static int txn_end(MDBX_txn *txn, const unsigned mode); static __always_inline pgr_t page_get_inline(const uint16_t ILL, @@ -7356,7 +7458,7 @@ static int __must_check_result read_header(MDBX_env *env, MDBX_meta *meta, static int __must_check_result sync_locked(MDBX_env *env, unsigned flags, MDBX_meta *const pending, meta_troika_t *const troika); -static int env_close(MDBX_env *env); +static int env_close(MDBX_env *env, bool resurrect_after_fork); struct node_result { MDBX_node *node; @@ -7430,7 +7532,8 @@ static int __must_check_result cursor_last(MDBX_cursor *mc, MDBX_val *key, static int __must_check_result cursor_init(MDBX_cursor *mc, const MDBX_txn *txn, size_t dbi); static int __must_check_result cursor_xinit0(MDBX_cursor *mc); -static int __must_check_result cursor_xinit1(MDBX_cursor *mc, MDBX_node *node, +static int __must_check_result cursor_xinit1(MDBX_cursor *mc, + const MDBX_node *node, const MDBX_page *mp); static int __must_check_result cursor_xinit2(MDBX_cursor *mc, MDBX_xcursor *src_mx, @@ -7440,7 +7543,7 @@ static void cursor_copy(const MDBX_cursor *csrc, MDBX_cursor *cdst); static int __must_check_result drop_tree(MDBX_cursor *mc, const bool may_have_subDBs); static int __must_check_result fetch_sdb(MDBX_txn *txn, size_t dbi); -static int __must_check_result setup_dbx(MDBX_dbx *const dbx, +static int __must_check_result setup_sdb(MDBX_dbx *const dbx, const MDBX_db *const db, const unsigned pagesize); @@ -7515,8 +7618,11 @@ __cold const char *mdbx_liberr2str(int errnum) { return "MDBX_TXN_OVERLAPPING: Overlapping read and write transactions for" " the current thread"; case MDBX_DUPLICATED_CLK: - return "MDBX_DUPLICATED_CLK: Alternative/Duplicate LCK-file is exists, " - "please keep one and remove unused other"; + return 
"MDBX_DUPLICATED_CLK: Alternative/Duplicate LCK-file is exists," + " please keep one and remove unused other"; + case MDBX_DANGLING_DBI: + return "MDBX_DANGLING_DBI: Some cursors and/or other resources should be" + " closed before subDb or corresponding DBI-handle could be (re)used"; default: return NULL; } @@ -7602,9 +7708,18 @@ const char *mdbx_strerror_ANSI2OEM(int errnum) { __cold void debug_log_va(int level, const char *function, int line, const char *fmt, va_list args) { - if (debug_logger) - debug_logger(level, function, line, fmt, args); - else { + ENSURE(nullptr, osal_fastmutex_acquire(&debug_lock) == 0); + if (mdbx_static.logger.ptr) { + if (mdbx_static.logger_buffer == nullptr) + mdbx_static.logger.fmt(level, function, line, fmt, args); + else { + const int len = vsnprintf(mdbx_static.logger_buffer, + mdbx_static.logger_buffer_size, fmt, args); + if (len > 0) + mdbx_static.logger.nofmt(level, function, line, + mdbx_static.logger_buffer, len); + } + } else { #if defined(_WIN32) || defined(_WIN64) if (IsDebuggerPresent()) { int prefix_len = 0; @@ -7637,6 +7752,7 @@ __cold void debug_log_va(int level, const char *function, int line, fflush(stderr); #endif } + ENSURE(nullptr, osal_fastmutex_release(&debug_lock) == 0); } __cold void debug_log(int level, const char *function, int line, @@ -7819,16 +7935,479 @@ MDBX_MAYBE_UNUSED static bool cursor_is_tracked(const MDBX_cursor *mc) { *tracking_head = tracked->mc_next; \ } while (0) +static int +env_defer_free_and_release(MDBX_env *const env, + struct mdbx_defer_free_item *const chain) { + size_t length = 0; + struct mdbx_defer_free_item *obsolete_chain = nullptr; +#if MDBX_ENABLE_DBI_LOCKFREE + const uint64_t now = osal_monotime(); + struct mdbx_defer_free_item **scan = &env->me_defer_free; + if (env->me_defer_free) { + const uint64_t threshold_1second = osal_16dot16_to_monotime(1 * 65536); + do { + struct mdbx_defer_free_item *item = *scan; + if (now - item->timestamp < threshold_1second) { + scan = 
&item->next; + length += 1; + } else { + *scan = item->next; + item->next = obsolete_chain; + obsolete_chain = item; + } + } while (*scan); + } + + eASSERT(env, *scan == nullptr); + if (chain) { + struct mdbx_defer_free_item *item = chain; + do { + item->timestamp = now; + item = item->next; + } while (item); + *scan = chain; + } +#else /* MDBX_ENABLE_DBI_LOCKFREE */ + obsolete_chain = chain; +#endif /* MDBX_ENABLE_DBI_LOCKFREE */ + + ENSURE(env, osal_fastmutex_release(&env->me_dbi_lock) == MDBX_SUCCESS); + if (length > 42) { +#if defined(_WIN32) || defined(_WIN64) + SwitchToThread(); +#else + sched_yield(); +#endif /* Windows */ + } + while (obsolete_chain) { + struct mdbx_defer_free_item *item = obsolete_chain; + obsolete_chain = obsolete_chain->next; + osal_free(item); + } + return chain ? MDBX_SUCCESS : MDBX_BAD_DBI; +} + +#if MDBX_ENABLE_DBI_SPARSE + +static __inline size_t dbi_bitmap_ctz(const MDBX_txn *txn, intptr_t bmi) { + tASSERT(txn, bmi > 0); + STATIC_ASSERT(sizeof(bmi) >= sizeof(txn->mt_dbi_sparse[0])); +#if __GNUC_PREREQ(4, 1) || __has_builtin(__builtin_ctzl) + if (sizeof(txn->mt_dbi_sparse[0]) <= sizeof(int)) + return __builtin_ctz((int)bmi); + if (sizeof(txn->mt_dbi_sparse[0]) == sizeof(long)) + return __builtin_ctzl((long)bmi); +#if (defined(__SIZEOF_LONG_LONG__) && __SIZEOF_LONG_LONG__ == 8) || \ + __has_builtin(__builtin_ctzll) + return __builtin_ctzll(bmi); +#endif /* have(long long) && long long == uint64_t */ +#endif /* GNU C */ + +#if defined(_MSC_VER) + unsigned long index; + if (sizeof(txn->mt_dbi_sparse[0]) > 4) { +#if defined(_M_AMD64) || defined(_M_ARM64) || defined(_M_X64) + _BitScanForward64(&index, bmi); + return index; +#else + if (bmi > UINT32_MAX) { + _BitScanForward(&index, (uint32_t)((uint64_t)bmi >> 32)); + return index; + } +#endif + } + _BitScanForward(&index, (uint32_t)bmi); + return index; +#endif /* MSVC */ + + bmi &= -bmi; + if (sizeof(txn->mt_dbi_sparse[0]) > 4) { + static const uint8_t debruijn_ctz64[64] = { + 0, 1, 2, 
53, 3, 7, 54, 27, 4, 38, 41, 8, 34, 55, 48, 28, + 62, 5, 39, 46, 44, 42, 22, 9, 24, 35, 59, 56, 49, 18, 29, 11, + 63, 52, 6, 26, 37, 40, 33, 47, 61, 45, 43, 21, 23, 58, 17, 10, + 51, 25, 36, 32, 60, 20, 57, 16, 50, 31, 19, 15, 30, 14, 13, 12}; + return debruijn_ctz64[(UINT64_C(0x022FDD63CC95386D) * (uint64_t)bmi) >> 58]; + } else { + static const uint8_t debruijn_ctz32[32] = { + 0, 1, 28, 2, 29, 14, 24, 3, 30, 22, 20, 15, 25, 17, 4, 8, + 31, 27, 13, 23, 21, 19, 16, 7, 26, 12, 18, 6, 11, 5, 10, 9}; + return debruijn_ctz32[(UINT32_C(0x077CB531) * (uint32_t)bmi) >> 27]; + } +} + +/* LY: Макрос целенаправленно сделан с одним циклом, чтобы сохранить возможность + * использования оператора break */ +#define TXN_FOREACH_DBI_FROM(TXN, I, FROM) \ + for (size_t bitmap_chunk = CHAR_BIT * sizeof(TXN->mt_dbi_sparse[0]), \ + bitmap_item = TXN->mt_dbi_sparse[0] >> FROM, I = FROM; \ + I < TXN->mt_numdbs; ++I) \ + if (bitmap_item == 0) { \ + I = (I - 1) | (bitmap_chunk - 1); \ + bitmap_item = TXN->mt_dbi_sparse[(1 + I) / bitmap_chunk]; \ + if (!bitmap_item) \ + I += bitmap_chunk; \ + continue; \ + } else if ((bitmap_item & 1) == 0) { \ + size_t bitmap_skip = dbi_bitmap_ctz(txn, bitmap_item); \ + bitmap_item >>= bitmap_skip; \ + I += bitmap_skip - 1; \ + continue; \ + } else if (bitmap_item >>= 1, TXN->mt_dbi_state[I]) +#else +#define TXN_FOREACH_DBI_FROM(TXN, I, SKIP) \ + for (size_t I = SKIP; I < TXN->mt_numdbs; ++I) \ + if (TXN->mt_dbi_state[I]) +#endif /* MDBX_ENABLE_DBI_SPARSE */ + +#define TXN_FOREACH_DBI_ALL(TXN, I) TXN_FOREACH_DBI_FROM(TXN, I, 0) +#define TXN_FOREACH_DBI_USER(TXN, I) TXN_FOREACH_DBI_FROM(TXN, I, CORE_DBS) + +/* Back up parent txn's cursor, then grab the original for tracking */ +static int cursor_shadow(MDBX_cursor *parent_cursor, MDBX_txn *nested_txn, + const size_t dbi) { + + tASSERT(nested_txn, dbi > FREE_DBI && dbi < nested_txn->mt_numdbs); + const size_t size = parent_cursor->mc_xcursor + ? 
sizeof(MDBX_cursor) + sizeof(MDBX_xcursor) + : sizeof(MDBX_cursor); + for (MDBX_cursor *bk; parent_cursor; parent_cursor = bk->mc_next) { + bk = parent_cursor; + if (parent_cursor->mc_signature != MDBX_MC_LIVE) + continue; + bk = osal_malloc(size); + if (unlikely(!bk)) + return MDBX_ENOMEM; +#if MDBX_DEBUG + memset(bk, 0xCD, size); + VALGRIND_MAKE_MEM_UNDEFINED(bk, size); +#endif /* MDBX_DEBUG */ + *bk = *parent_cursor; + parent_cursor->mc_backup = bk; + /* Kill pointers into src to reduce abuse: The + * user may not use mc until dst ends. But we need a valid + * txn pointer here for cursor fixups to keep working. */ + parent_cursor->mc_txn = nested_txn; + parent_cursor->mc_db = &nested_txn->mt_dbs[dbi]; + parent_cursor->mc_dbi_state = &nested_txn->mt_dbi_state[dbi]; + MDBX_xcursor *mx = parent_cursor->mc_xcursor; + if (mx != NULL) { + *(MDBX_xcursor *)(bk + 1) = *mx; + mx->mx_cursor.mc_txn = nested_txn; + } + parent_cursor->mc_next = nested_txn->mt_cursors[dbi]; + nested_txn->mt_cursors[dbi] = parent_cursor; + } + return MDBX_SUCCESS; +} + +/* Close this txn's cursors, give parent txn's cursors back to parent. + * + * [in] txn the transaction handle. + * [in] merge true to keep changes to parent cursors, false to revert. + * + * Returns 0 on success, non-zero on failure. */ +static void cursors_eot(MDBX_txn *txn, const bool merge) { + tASSERT(txn, txn->mt_cursors[FREE_DBI] == nullptr); + TXN_FOREACH_DBI_FROM(txn, i, /* skip FREE_DBI */ 1) { + MDBX_cursor *mc = txn->mt_cursors[i]; + if (!mc) + continue; + txn->mt_cursors[i] = nullptr; + do { + const unsigned stage = mc->mc_signature; + MDBX_cursor *const next = mc->mc_next; + MDBX_cursor *const bk = mc->mc_backup; + ENSURE(txn->mt_env, + stage == MDBX_MC_LIVE || (stage == MDBX_MC_WAIT4EOT && bk)); + cASSERT(mc, mc->mc_dbi == (MDBX_dbi)i); + if (bk) { + MDBX_xcursor *mx = mc->mc_xcursor; + tASSERT(txn, txn->mt_parent != NULL); + /* Zap: Using uninitialized memory '*mc->mc_backup'. 
*/ + MDBX_SUPPRESS_GOOFY_MSVC_ANALYZER(6001); + ENSURE(txn->mt_env, bk->mc_signature == MDBX_MC_LIVE); + tASSERT(txn, mx == bk->mc_xcursor); + if (stage == MDBX_MC_WAIT4EOT /* Cursor was closed by user */) + mc->mc_signature = stage /* Promote closed state to parent txn */; + else if (merge) { + /* Restore pointers to parent txn */ + mc->mc_next = bk->mc_next; + mc->mc_backup = bk->mc_backup; + mc->mc_txn = bk->mc_txn; + mc->mc_db = bk->mc_db; + mc->mc_dbi_state = bk->mc_dbi_state; + if (mx) { + if (mx != bk->mc_xcursor) { + *bk->mc_xcursor = *mx; + mx = bk->mc_xcursor; + } + mx->mx_cursor.mc_txn = bk->mc_txn; + } + } else { + /* Restore from backup, i.e. rollback/abort nested txn */ + *mc = *bk; + if (mx) + *mx = *(MDBX_xcursor *)(bk + 1); + } + bk->mc_signature = 0; + osal_free(bk); + } else { + ENSURE(txn->mt_env, stage == MDBX_MC_LIVE); + mc->mc_signature = MDBX_MC_READY4CLOSE /* Cursor may be reused */; + mc->mc_flags = 0 /* reset C_UNTRACK */; + } + mc = next; + } while (mc); + } +} + +static __noinline int dbi_import(MDBX_txn *txn, const size_t dbi); + +static __inline bool db_check_flags(uint16_t db_flags) { + switch (db_flags & ~(DB_VALID | MDBX_REVERSEKEY | MDBX_INTEGERKEY)) { + default: + NOTICE("invalid db-flags 0x%x", db_flags); + return false; + case MDBX_DUPSORT: + case MDBX_DUPSORT | MDBX_REVERSEDUP: + case MDBX_DUPSORT | MDBX_DUPFIXED: + case MDBX_DUPSORT | MDBX_DUPFIXED | MDBX_REVERSEDUP: + case MDBX_DUPSORT | MDBX_DUPFIXED | MDBX_INTEGERDUP: + case MDBX_DUPSORT | MDBX_DUPFIXED | MDBX_INTEGERDUP | MDBX_REVERSEDUP: + case MDBX_DB_DEFAULTS: + return (db_flags & (MDBX_REVERSEKEY | MDBX_INTEGERKEY)) != + (MDBX_REVERSEKEY | MDBX_INTEGERKEY); + } +} + +static __inline uint8_t dbi_state(const MDBX_txn *txn, const size_t dbi) { + STATIC_ASSERT(DBI_DIRTY == MDBX_DBI_DIRTY && DBI_STALE == MDBX_DBI_STALE && + DBI_FRESH == MDBX_DBI_FRESH && DBI_CREAT == MDBX_DBI_CREAT); + +#if MDBX_ENABLE_DBI_SPARSE + const size_t bitmap_chunk = CHAR_BIT * 
sizeof(txn->mt_dbi_sparse[0]); + const size_t bitmap_indx = dbi / bitmap_chunk; + const size_t bitmap_mask = (size_t)1 << dbi % bitmap_chunk; + return likely(dbi < txn->mt_numdbs && + (txn->mt_dbi_sparse[bitmap_indx] & bitmap_mask) != 0) + ? txn->mt_dbi_state[dbi] + : 0; +#else + return likely(dbi < txn->mt_numdbs) ? txn->mt_dbi_state[dbi] : 0; +#endif /* MDBX_ENABLE_DBI_SPARSE */ +} + +static __inline bool dbi_changed(const MDBX_txn *txn, const size_t dbi) { + const MDBX_env *const env = txn->mt_env; + eASSERT(env, dbi_state(txn, dbi) & DBI_LINDO); + const uint32_t snap_seq = + atomic_load32(&env->me_dbi_seqs[dbi], mo_AcquireRelease); + return snap_seq != txn->mt_dbi_seqs[dbi]; +} + +static __always_inline int dbi_check(const MDBX_txn *txn, const size_t dbi) { + const uint8_t state = dbi_state(txn, dbi); + if (likely((state & DBI_LINDO) != 0 && !dbi_changed(txn, dbi))) + return (state & DBI_VALID) ? MDBX_SUCCESS : MDBX_BAD_DBI; + + /* Медленный путь: ленивая до-инициализацяи и импорт */ + return dbi_import((MDBX_txn *)txn, dbi); +} + +static __inline uint32_t dbi_seq_next(const MDBX_env *const env, size_t dbi) { + uint32_t v = atomic_load32(&env->me_dbi_seqs[dbi], mo_AcquireRelease) + 1; + return v ? 
v : 1; +} + +struct dbi_snap_result { + uint32_t sequence; + unsigned flags; +}; + +static struct dbi_snap_result dbi_snap(const MDBX_env *env, const size_t dbi) { + eASSERT(env, dbi < env->me_numdbs); + struct dbi_snap_result r; + uint32_t snap = atomic_load32(&env->me_dbi_seqs[dbi], mo_AcquireRelease); + do { + r.sequence = snap; + r.flags = env->me_db_flags[dbi]; + snap = atomic_load32(&env->me_dbi_seqs[dbi], mo_AcquireRelease); + } while (unlikely(snap != r.sequence)); + return r; +} + +static __noinline int dbi_import(MDBX_txn *txn, const size_t dbi) { + const MDBX_env *const env = txn->mt_env; + if (dbi >= env->me_numdbs || !env->me_db_flags[dbi]) + return MDBX_BAD_DBI; + +#if MDBX_ENABLE_DBI_SPARSE + const size_t bitmap_chunk = CHAR_BIT * sizeof(txn->mt_dbi_sparse[0]); + const size_t bitmap_indx = dbi / bitmap_chunk; + const size_t bitmap_mask = (size_t)1 << dbi % bitmap_chunk; + if (dbi >= txn->mt_numdbs) { + for (size_t i = (txn->mt_numdbs + bitmap_chunk - 1) / bitmap_chunk; + bitmap_indx >= i; ++i) + txn->mt_dbi_sparse[i] = 0; + eASSERT(env, (txn->mt_dbi_sparse[bitmap_indx] & bitmap_mask) == 0); + MDBX_txn *scan = txn; + do { + eASSERT(env, scan->mt_dbi_sparse == txn->mt_dbi_sparse); + eASSERT(env, scan->mt_numdbs < dbi + 1); + scan->mt_numdbs = (unsigned)dbi + 1; + scan->mt_dbi_state[dbi] = 0; + scan = scan->mt_parent; + } while (scan /* && scan->mt_dbi_sparse == txn->mt_dbi_sparse */); + txn->mt_dbi_sparse[bitmap_indx] |= bitmap_mask; + goto lindo; + } + if ((txn->mt_dbi_sparse[bitmap_indx] & bitmap_mask) == 0) { + MDBX_txn *scan = txn; + do { + eASSERT(env, scan->mt_dbi_sparse == txn->mt_dbi_sparse); + eASSERT(env, scan->mt_numdbs == txn->mt_numdbs); + scan->mt_dbi_state[dbi] = 0; + scan = scan->mt_parent; + } while (scan /* && scan->mt_dbi_sparse == txn->mt_dbi_sparse */); + txn->mt_dbi_sparse[bitmap_indx] |= bitmap_mask; + goto lindo; + } +#else + if (dbi >= txn->mt_numdbs) { + size_t i = txn->mt_numdbs; + do + txn->mt_dbi_state[i] = 0; + while (dbi 
>= ++i); + txn->mt_numdbs = i; + goto lindo; + } +#endif /* MDBX_ENABLE_DBI_SPARSE */ + + if (!txn->mt_dbi_state[dbi]) { + lindo: + /* dbi-слот еще не инициализирован в транзакции, а хендл не использовался */ + txn->mt_cursors[dbi] = nullptr; + MDBX_txn *const parent = txn->mt_parent; + if (parent) { + /* вложенная пишущая транзакция */ + int rc = dbi_check(parent, dbi); + /* копируем состояние subDB очищая new-флаги. */ + eASSERT(env, txn->mt_dbi_seqs == parent->mt_dbi_seqs); + txn->mt_dbi_state[dbi] = + parent->mt_dbi_state[dbi] & ~(DBI_FRESH | DBI_CREAT | DBI_DIRTY); + if (likely(rc == MDBX_SUCCESS)) { + txn->mt_dbs[dbi] = parent->mt_dbs[dbi]; + if (parent->mt_cursors[dbi]) { + rc = cursor_shadow(parent->mt_cursors[dbi], txn, dbi); + if (unlikely(rc != MDBX_SUCCESS)) { + /* не получилось забекапить курсоры */ + txn->mt_dbi_state[dbi] = DBI_OLDEN | DBI_LINDO | DBI_STALE; + txn->mt_flags |= MDBX_TXN_ERROR; + } + } + } + return rc; + } + txn->mt_dbi_seqs[dbi] = 0; + txn->mt_dbi_state[dbi] = DBI_LINDO; + } else { + eASSERT(env, txn->mt_dbi_seqs[dbi] != env->me_dbi_seqs[dbi].weak); + if (unlikely((txn->mt_dbi_state[dbi] & (DBI_VALID | DBI_OLDEN)) || + txn->mt_cursors[dbi])) { + /* хендл уже использовался в транзакции, но был закрыт или переоткрыт, + * либо при явном пере-открытии хендла есть висячие курсоры */ + eASSERT(env, (txn->mt_dbi_state[dbi] & DBI_STALE) == 0); + txn->mt_dbi_seqs[dbi] = env->me_dbi_seqs[dbi].weak; + txn->mt_dbi_state[dbi] = DBI_OLDEN | DBI_LINDO; + return txn->mt_cursors[dbi] ? 
MDBX_DANGLING_DBI : MDBX_BAD_DBI; + } + } + + /* хендл не использовался в транзакции, либо явно пере-отрывается при + * отсутствии висячих курсоров */ + eASSERT(env, (txn->mt_dbi_state[dbi] & DBI_LINDO) && !txn->mt_cursors[dbi]); + + /* читаем актуальные флаги и sequence */ + struct dbi_snap_result snap = dbi_snap(env, dbi); + txn->mt_dbi_seqs[dbi] = snap.sequence; + if (snap.flags & DB_VALID) { + txn->mt_dbs[dbi].md_flags = snap.flags & DB_PERSISTENT_FLAGS; + txn->mt_dbi_state[dbi] = DBI_LINDO | DBI_VALID | DBI_STALE; + return MDBX_SUCCESS; + } + return MDBX_BAD_DBI; +} + +/* Export or close DBI handles opened in this txn. */ +static int dbi_update(MDBX_txn *txn, int keep) { + MDBX_env *const env = txn->mt_env; + tASSERT(txn, !txn->mt_parent && txn == env->me_txn0); + bool locked = false; + struct mdbx_defer_free_item *defer_chain = nullptr; + TXN_FOREACH_DBI_USER(txn, dbi) { + if (likely((txn->mt_dbi_state[dbi] & DBI_CREAT) == 0)) + continue; + if (!locked) { + int err = osal_fastmutex_acquire(&env->me_dbi_lock); + if (unlikely(err != MDBX_SUCCESS)) + return err; + locked = true; + if (dbi >= env->me_numdbs) + /* хендл был закрыт из другого потока пока захватывали блокировку */ + continue; + } + tASSERT(txn, dbi < env->me_numdbs); + if (keep) { + env->me_db_flags[dbi] = txn->mt_dbs[dbi].md_flags | DB_VALID; + } else { + uint32_t seq = dbi_seq_next(env, dbi); + struct mdbx_defer_free_item *item = env->me_dbxs[dbi].md_name.iov_base; + if (item) { + env->me_db_flags[dbi] = 0; + env->me_dbxs[dbi].md_name.iov_len = 0; + env->me_dbxs[dbi].md_name.iov_base = nullptr; + atomic_store32(&env->me_dbi_seqs[dbi], seq, mo_AcquireRelease); + osal_flush_incoherent_cpu_writeback(); + item->next = defer_chain; + defer_chain = item; + } else { + eASSERT(env, env->me_dbxs[dbi].md_name.iov_len == 0); + eASSERT(env, env->me_db_flags[dbi] == 0); + } + } + } + + if (locked) { + size_t i = env->me_numdbs; + while ((env->me_db_flags[i - 1] & DB_VALID) == 0) { + --i; + eASSERT(env, i >= 
CORE_DBS); + eASSERT(env, !env->me_db_flags[i] && !env->me_dbxs[i].md_name.iov_len && + !env->me_dbxs[i].md_name.iov_base); + } + env->me_numdbs = (unsigned)i; + env_defer_free_and_release(env, defer_chain); + } + return MDBX_SUCCESS; +} + int mdbx_cmp(const MDBX_txn *txn, MDBX_dbi dbi, const MDBX_val *a, const MDBX_val *b) { eASSERT(NULL, txn->mt_signature == MDBX_MT_SIGNATURE); - return txn->mt_dbxs[dbi].md_cmp(a, b); + tASSERT(txn, (dbi_state(txn, dbi) & DBI_VALID) && !dbi_changed(txn, dbi)); + tASSERT(txn, dbi < txn->mt_env->me_numdbs && + (txn->mt_env->me_db_flags[dbi] & DB_VALID) != 0); + return txn->mt_env->me_dbxs[dbi].md_cmp(a, b); } int mdbx_dcmp(const MDBX_txn *txn, MDBX_dbi dbi, const MDBX_val *a, const MDBX_val *b) { eASSERT(NULL, txn->mt_signature == MDBX_MT_SIGNATURE); - return txn->mt_dbxs[dbi].md_dcmp(a, b); + tASSERT(txn, (dbi_state(txn, dbi) & DBI_VALID) && !dbi_changed(txn, dbi)); + tASSERT(txn, dbi < txn->mt_env->me_numdbs && + (txn->mt_env->me_db_flags[dbi] & DB_VALID)); + return txn->mt_env->me_dbxs[dbi].md_dcmp(a, b); } /* Allocate memory for a page. @@ -8159,7 +8738,8 @@ static void refund_loose(MDBX_txn *txn) { /* Filter-out loose chain & dispose refunded pages. 
*/ unlink_loose: - for (MDBX_page **link = &txn->tw.loose_pages; *link;) { + for (MDBX_page *__restrict *__restrict link = &txn->tw.loose_pages; + *link;) { MDBX_page *dp = *link; tASSERT(txn, dp->mp_flags == P_LOOSE); MDBX_ASAN_UNPOISON_MEMORY_REGION(&mp_next(dp), sizeof(MDBX_page *)); @@ -8510,7 +9090,7 @@ static int page_retire_ex(MDBX_cursor *mc, const pgno_t pgno, return MDBX_SUCCESS; } -#if !MDBX_DEBUG && !defined(MDBX_USE_VALGRIND) && !defined(__SANITIZE_ADDRESS__) +#if !MDBX_DEBUG && !defined(ENABLE_MEMCHECK) && !defined(__SANITIZE_ADDRESS__) if (unlikely(txn->mt_env->me_flags & MDBX_PAGEPERTURB)) #endif { @@ -8527,7 +9107,7 @@ static int page_retire_ex(MDBX_cursor *mc, const pgno_t pgno, goto skip_invalidate; } -#if defined(MDBX_USE_VALGRIND) || defined(__SANITIZE_ADDRESS__) +#if defined(ENABLE_MEMCHECK) || defined(__SANITIZE_ADDRESS__) if (MDBX_DEBUG != 0 || unlikely(txn->mt_env->me_flags & MDBX_PAGEPERTURB)) #endif kill_page(txn, mp, pgno, npages); @@ -8870,12 +9450,15 @@ static size_t txn_keep(MDBX_txn *txn, MDBX_cursor *m0) { tASSERT(txn, (txn->mt_flags & (MDBX_TXN_RDONLY | MDBX_WRITEMAP)) == 0); txn_lru_turn(txn); size_t keep = m0 ? cursor_keep(txn, m0) : 0; - for (size_t i = FREE_DBI; i < txn->mt_numdbs; ++i) - if (F_ISSET(txn->mt_dbistate[i], DBI_DIRTY | DBI_VALID) && - txn->mt_dbs[i].md_root != P_INVALID) - for (MDBX_cursor *mc = txn->mt_cursors[i]; mc; mc = mc->mc_next) + + TXN_FOREACH_DBI_ALL(txn, dbi) { + if (F_ISSET(txn->mt_dbi_state[dbi], DBI_DIRTY | DBI_VALID) && + txn->mt_dbs[dbi].md_root != P_INVALID) + for (MDBX_cursor *mc = txn->mt_cursors[dbi]; mc; mc = mc->mc_next) if (mc != m0) keep += cursor_keep(txn, mc); + } + return keep; } @@ -8932,33 +9515,6 @@ spill_prio(const MDBX_txn *txn, const size_t i, const uint32_t reciprocal) { return prio = (unsigned)factor; } -/* Spill pages from the dirty list back to disk. 
- * This is intended to prevent running into MDBX_TXN_FULL situations, - * but note that they may still occur in a few cases: - * - * 1) our estimate of the txn size could be too small. Currently this - * seems unlikely, except with a large number of MDBX_MULTIPLE items. - * - * 2) child txns may run out of space if their parents dirtied a - * lot of pages and never spilled them. TODO: we probably should do - * a preemptive spill during mdbx_txn_begin() of a child txn, if - * the parent's dirtyroom is below a given threshold. - * - * Otherwise, if not using nested txns, it is expected that apps will - * not run into MDBX_TXN_FULL any more. The pages are flushed to disk - * the same way as for a txn commit, e.g. their dirty status is cleared. - * If the txn never references them again, they can be left alone. - * If the txn only reads them, they can be used without any fuss. - * If the txn writes them again, they can be dirtied immediately without - * going thru all of the work of page_touch(). Such references are - * handled by page_unspill(). - * - * Also note, we never spill DB root pages, nor pages of active cursors, - * because we'll need these back again soon anyway. And in nested txns, - * we can't spill a page in a child txn if it was already spilled in a - * parent txn. That would alter the parent txns' data even though - * the child hasn't committed yet, and we'd have no way to undo it if - * the child aborted. 
*/ __cold static int txn_spill_slowpath(MDBX_txn *const txn, MDBX_cursor *const m0, const intptr_t wanna_spill_entries, const intptr_t wanna_spill_npages, @@ -9613,7 +10169,7 @@ __cold static void meta_troika_dump(const MDBX_env *env, const meta_ptr_t recent = meta_recent(env, troika); const meta_ptr_t prefer_steady = meta_prefer_steady(env, troika); const meta_ptr_t tail = meta_tail(env, troika); - NOTICE("%" PRIaTXN ".%c:%" PRIaTXN ".%c:%" PRIaTXN ".%c, fsm=0x%02x, " + NOTICE("troika: %" PRIaTXN ".%c:%" PRIaTXN ".%c:%" PRIaTXN ".%c, fsm=0x%02x, " "head=%d-%" PRIaTXN ".%c, " "base=%d-%" PRIaTXN ".%c, " "tail=%d-%" PRIaTXN ".%c, " @@ -9630,6 +10186,14 @@ __cold static void meta_troika_dump(const MDBX_env *env, /*----------------------------------------------------------------------------*/ +static __inline MDBX_CONST_FUNCTION MDBX_lockinfo * +lckless_stub(const MDBX_env *env) { + uintptr_t stub = (uintptr_t)&env->x_lckless_stub; + /* align to avoid false-positive alarm from UndefinedBehaviorSanitizer */ + stub = (stub + MDBX_CACHELINE_SIZE - 1) & ~(MDBX_CACHELINE_SIZE - 1); + return (MDBX_lockinfo *)stub; +} + /* Find oldest txnid still referenced. 
*/ static txnid_t find_oldest_reader(MDBX_env *const env, const txnid_t steady) { const uint32_t nothing_changed = MDBX_STRING_TETRAD("None"); @@ -9637,7 +10201,7 @@ static txnid_t find_oldest_reader(MDBX_env *const env, const txnid_t steady) { MDBX_lockinfo *const lck = env->me_lck_mmap.lck; if (unlikely(lck == NULL /* exclusive without-lck mode */)) { - eASSERT(env, env->me_lck == (void *)&env->x_lckless_stub); + eASSERT(env, env->me_lck == lckless_stub(env)); env->me_lck->mti_readers_refresh_flag.weak = nothing_changed; return env->me_lck->mti_oldest_reader.weak = steady; } @@ -10020,11 +10584,21 @@ __cold static void munlock_all(const MDBX_env *env) { } __cold static unsigned default_rp_augment_limit(const MDBX_env *env) { - /* default rp_augment_limit = npages / 3 */ - const size_t augment = env->me_dbgeo.now / 3 >> env->me_psize2log; - eASSERT(env, augment < MDBX_PGL_LIMIT); - return pnl_bytes2size(pnl_size2bytes( - (augment > MDBX_PNL_INITIAL) ? augment : MDBX_PNL_INITIAL)); + const size_t timeframe = /* 16 секунд */ 16 << 16; + const size_t remain_1sec = + (env->me_options.gc_time_limit < timeframe) + ? timeframe - (size_t)env->me_options.gc_time_limit + : 0; + const size_t minimum = (env->me_maxgc_ov1page * 2 > MDBX_PNL_INITIAL) + ? env->me_maxgc_ov1page * 2 + : MDBX_PNL_INITIAL; + const size_t one_third = env->me_dbgeo.now / 3 >> env->me_psize2log; + const size_t augment_limit = + (one_third > minimum) + ? 
minimum + (one_third - minimum) / timeframe * remain_1sec + : minimum; + eASSERT(env, augment_limit < MDBX_PGL_LIMIT); + return pnl_bytes2size(pnl_size2bytes(augment_limit)); } static bool default_prefault_write(const MDBX_env *env) { @@ -10084,9 +10658,9 @@ __cold static int dxb_resize(MDBX_env *const env, const pgno_t used_pgno, } const size_t limit_bytes = pgno_align2os_bytes(env, limit_pgno); const size_t size_bytes = pgno_align2os_bytes(env, size_pgno); -#if MDBX_ENABLE_MADVISE || defined(MDBX_USE_VALGRIND) +#if MDBX_ENABLE_MADVISE || defined(ENABLE_MEMCHECK) const void *const prev_map = env->me_dxb_mmap.base; -#endif /* MDBX_ENABLE_MADVISE || MDBX_USE_VALGRIND */ +#endif /* MDBX_ENABLE_MADVISE || ENABLE_MEMCHECK */ VERBOSE("resize/%d datafile/mapping: " "present %" PRIuPTR " -> %" PRIuPTR ", " @@ -10107,60 +10681,63 @@ __cold static int dxb_resize(MDBX_env *const env, const pgno_t used_pgno, size_bytes == env->me_dxb_mmap.filesize) goto bailout; + /* При использовании MDBX_NOSTICKYTHREADS с транзакциями могут работать любые + * потоки и у нас нет информации о том, какие именно. Поэтому нет возможности + * выполнить remap-действия требующие приостановки работающих с БД потоков. */ + if ((env->me_flags & MDBX_NOSTICKYTHREADS) == 0) { #if defined(_WIN32) || defined(_WIN64) - if ((env->me_flags & MDBX_NOTLS) == 0 && - ((size_bytes < env->me_dxb_mmap.current && mode > implicit_grow) || - limit_bytes != env->me_dxb_mmap.limit)) { - /* 1) Windows allows only extending a read-write section, but not a - * corresponding mapped view. Therefore in other cases we must suspend - * the local threads for safe remap. - * 2) At least on Windows 10 1803 the entire mapped section is unavailable - * for short time during NtExtendSection() or VirtualAlloc() execution. - * 3) Under Wine runtime environment on Linux a section extending is not - * supported. - * - * THEREFORE LOCAL THREADS SUSPENDING IS ALWAYS REQUIRED! 
*/ - array_onstack.limit = ARRAY_LENGTH(array_onstack.handles); - array_onstack.count = 0; - suspended = &array_onstack; - rc = osal_suspend_threads_before_remap(env, &suspended); - if (rc != MDBX_SUCCESS) { - ERROR("failed suspend-for-remap: errcode %d", rc); - goto bailout; - } - mresize_flags |= (mode < explicit_resize) - ? MDBX_MRESIZE_MAY_UNMAP - : MDBX_MRESIZE_MAY_UNMAP | MDBX_MRESIZE_MAY_MOVE; - } -#else /* Windows */ - MDBX_lockinfo *const lck = env->me_lck_mmap.lck; - if (mode == explicit_resize && limit_bytes != env->me_dxb_mmap.limit && - !(env->me_flags & MDBX_NOTLS)) { - mresize_flags |= MDBX_MRESIZE_MAY_UNMAP | MDBX_MRESIZE_MAY_MOVE; - if (lck) { - int err = osal_rdt_lock(env) /* lock readers table until remap done */; - if (unlikely(MDBX_IS_ERROR(err))) { - rc = err; + if ((size_bytes < env->me_dxb_mmap.current && mode > implicit_grow) || + limit_bytes != env->me_dxb_mmap.limit) { + /* 1) Windows allows only extending a read-write section, but not a + * corresponding mapped view. Therefore in other cases we must suspend + * the local threads for safe remap. + * 2) At least on Windows 10 1803 the entire mapped section is unavailable + * for short time during NtExtendSection() or VirtualAlloc() execution. + * 3) Under Wine runtime environment on Linux a section extending is not + * supported. + * + * THEREFORE LOCAL THREADS SUSPENDING IS ALWAYS REQUIRED! */ + array_onstack.limit = ARRAY_LENGTH(array_onstack.handles); + array_onstack.count = 0; + suspended = &array_onstack; + rc = osal_suspend_threads_before_remap(env, &suspended); + if (rc != MDBX_SUCCESS) { + ERROR("failed suspend-for-remap: errcode %d", rc); goto bailout; } + mresize_flags |= (mode < explicit_resize) + ? 
MDBX_MRESIZE_MAY_UNMAP + : MDBX_MRESIZE_MAY_UNMAP | MDBX_MRESIZE_MAY_MOVE; + } +#else /* Windows */ + MDBX_lockinfo *const lck = env->me_lck_mmap.lck; + if (mode == explicit_resize && limit_bytes != env->me_dxb_mmap.limit) { + mresize_flags |= MDBX_MRESIZE_MAY_UNMAP | MDBX_MRESIZE_MAY_MOVE; + if (lck) { + int err = osal_rdt_lock(env) /* lock readers table until remap done */; + if (unlikely(MDBX_IS_ERROR(err))) { + rc = err; + goto bailout; + } - /* looking for readers from this process */ - const size_t snap_nreaders = - atomic_load32(&lck->mti_numreaders, mo_AcquireRelease); - eASSERT(env, mode == explicit_resize); - for (size_t i = 0; i < snap_nreaders; ++i) { - if (lck->mti_readers[i].mr_pid.weak == env->me_pid && - lck->mti_readers[i].mr_tid.weak != osal_thread_self()) { - /* the base address of the mapping can't be changed since - * the other reader thread from this process exists. */ - osal_rdt_unlock(env); - mresize_flags &= ~(MDBX_MRESIZE_MAY_UNMAP | MDBX_MRESIZE_MAY_MOVE); - break; + /* looking for readers from this process */ + const size_t snap_nreaders = + atomic_load32(&lck->mti_numreaders, mo_AcquireRelease); + eASSERT(env, mode == explicit_resize); + for (size_t i = 0; i < snap_nreaders; ++i) { + if (lck->mti_readers[i].mr_pid.weak == env->me_pid && + lck->mti_readers[i].mr_tid.weak != osal_thread_self()) { + /* the base address of the mapping can't be changed since + * the other reader thread from this process exists. */ + osal_rdt_unlock(env); + mresize_flags &= ~(MDBX_MRESIZE_MAY_UNMAP | MDBX_MRESIZE_MAY_MOVE); + break; + } } } } - } #endif /* ! 
Windows */ + } const pgno_t aligned_munlock_pgno = (mresize_flags & (MDBX_MRESIZE_MAY_UNMAP | MDBX_MRESIZE_MAY_MOVE)) @@ -10273,7 +10850,7 @@ __cold static int dxb_resize(MDBX_env *const env, const pgno_t used_pgno, env->me_dbgeo.now = env->me_dxb_mmap.current; env->me_dbgeo.upper = env->me_dxb_mmap.limit; adjust_defaults(env); -#ifdef MDBX_USE_VALGRIND +#ifdef ENABLE_MEMCHECK if (prev_limit != env->me_dxb_mmap.limit || prev_map != env->me_map) { VALGRIND_DISCARD(env->me_valgrind_handle); env->me_valgrind_handle = 0; @@ -10281,7 +10858,7 @@ __cold static int dxb_resize(MDBX_env *const env, const pgno_t used_pgno, env->me_valgrind_handle = VALGRIND_CREATE_BLOCK(env->me_map, env->me_dxb_mmap.limit, "mdbx"); } -#endif /* MDBX_USE_VALGRIND */ +#endif /* ENABLE_MEMCHECK */ } else { if (rc != MDBX_UNABLE_EXTEND_MAPSIZE && rc != MDBX_EPERM) { ERROR("failed resize datafile/mapping: " @@ -10581,9 +11158,9 @@ scan4seq_sse2(pgno_t *range, const size_t len, const size_t seq) { do { mask = (uint8_t)diffcmp2mask_sse2(range - 3, offset, pattern); if (mask) { -#ifndef __SANITIZE_ADDRESS__ +#if !defined(ENABLE_MEMCHECK) && !defined(__SANITIZE_ADDRESS__) found: -#endif /* __SANITIZE_ADDRESS__ */ +#endif /* !ENABLE_MEMCHECK && !__SANITIZE_ADDRESS__ */ return range + 28 - __builtin_clz(mask); } range -= 4; @@ -10596,7 +11173,7 @@ scan4seq_sse2(pgno_t *range, const size_t len, const size_t seq) { * только за пределами региона выделенного под PNL, но и пересекать границу * страницы памяти. Что может приводить как к ошибкам ASAN, так и к падению. * Поэтому проверяем смещение на странице, а с ASAN всегда страхуемся. 
*/ -#ifndef __SANITIZE_ADDRESS__ +#if !defined(ENABLE_MEMCHECK) && !defined(__SANITIZE_ADDRESS__) const unsigned on_page_safe_mask = 0xff0 /* enough for '-15' bytes offset */; if (likely(on_page_safe_mask & (uintptr_t)(range + offset)) && !RUNNING_ON_VALGRIND) { @@ -10608,7 +11185,7 @@ scan4seq_sse2(pgno_t *range, const size_t len, const size_t seq) { goto found; return nullptr; } -#endif /* __SANITIZE_ADDRESS__ */ +#endif /* !ENABLE_MEMCHECK && !__SANITIZE_ADDRESS__ */ do if (*range - range[offset] == target) return range; @@ -10652,9 +11229,9 @@ scan4seq_avx2(pgno_t *range, const size_t len, const size_t seq) { do { mask = (uint8_t)diffcmp2mask_avx2(range - 7, offset, pattern); if (mask) { -#ifndef __SANITIZE_ADDRESS__ +#if !defined(ENABLE_MEMCHECK) && !defined(__SANITIZE_ADDRESS__) found: -#endif /* __SANITIZE_ADDRESS__ */ +#endif /* !ENABLE_MEMCHECK && !__SANITIZE_ADDRESS__ */ return range + 24 - __builtin_clz(mask); } range -= 8; @@ -10667,7 +11244,7 @@ scan4seq_avx2(pgno_t *range, const size_t len, const size_t seq) { * только за пределами региона выделенного под PNL, но и пересекать границу * страницы памяти. Что может приводить как к ошибкам ASAN, так и к падению. * Поэтому проверяем смещение на странице, а с ASAN всегда страхуемся. 
*/ -#ifndef __SANITIZE_ADDRESS__ +#if !defined(ENABLE_MEMCHECK) && !defined(__SANITIZE_ADDRESS__) const unsigned on_page_safe_mask = 0xfe0 /* enough for '-31' bytes offset */; if (likely(on_page_safe_mask & (uintptr_t)(range + offset)) && !RUNNING_ON_VALGRIND) { @@ -10679,7 +11256,7 @@ scan4seq_avx2(pgno_t *range, const size_t len, const size_t seq) { goto found; return nullptr; } -#endif /* __SANITIZE_ADDRESS__ */ +#endif /* !ENABLE_MEMCHECK && !__SANITIZE_ADDRESS__ */ if (range - 3 > detent) { mask = diffcmp2mask_sse2avx(range - 3, offset, *(const __m128i *)&pattern); if (mask) @@ -10720,9 +11297,9 @@ scan4seq_avx512bw(pgno_t *range, const size_t len, const size_t seq) { do { mask = diffcmp2mask_avx512bw(range - 15, offset, pattern); if (mask) { -#ifndef __SANITIZE_ADDRESS__ +#if !defined(ENABLE_MEMCHECK) && !defined(__SANITIZE_ADDRESS__) found: -#endif /* __SANITIZE_ADDRESS__ */ +#endif /* !ENABLE_MEMCHECK && !__SANITIZE_ADDRESS__ */ return range + 16 - __builtin_clz(mask); } range -= 16; @@ -10735,7 +11312,7 @@ scan4seq_avx512bw(pgno_t *range, const size_t len, const size_t seq) { * только за пределами региона выделенного под PNL, но и пересекать границу * страницы памяти. Что может приводить как к ошибкам ASAN, так и к падению. * Поэтому проверяем смещение на странице, а с ASAN всегда страхуемся. 
*/ -#ifndef __SANITIZE_ADDRESS__ +#if !defined(ENABLE_MEMCHECK) && !defined(__SANITIZE_ADDRESS__) const unsigned on_page_safe_mask = 0xfc0 /* enough for '-63' bytes offset */; if (likely(on_page_safe_mask & (uintptr_t)(range + offset)) && !RUNNING_ON_VALGRIND) { @@ -10747,7 +11324,7 @@ scan4seq_avx512bw(pgno_t *range, const size_t len, const size_t seq) { goto found; return nullptr; } -#endif /* __SANITIZE_ADDRESS__ */ +#endif /* !ENABLE_MEMCHECK && !__SANITIZE_ADDRESS__ */ if (range - 7 > detent) { mask = diffcmp2mask_avx2(range - 7, offset, *(const __m256i *)&pattern); if (mask) @@ -10800,9 +11377,9 @@ __hot static pgno_t *scan4seq_neon(pgno_t *range, const size_t len, do { mask = diffcmp2mask_neon(range - 3, offset, pattern); if (mask) { -#ifndef __SANITIZE_ADDRESS__ +#if !defined(ENABLE_MEMCHECK) && !defined(__SANITIZE_ADDRESS__) found: -#endif /* __SANITIZE_ADDRESS__ */ +#endif /* !ENABLE_MEMCHECK && !__SANITIZE_ADDRESS__ */ return ptr_disp(range, -(__builtin_clzl(mask) >> sizeof(size_t) / 4)); } range -= 4; @@ -10815,7 +11392,7 @@ __hot static pgno_t *scan4seq_neon(pgno_t *range, const size_t len, * только за пределами региона выделенного под PNL, но и пересекать границу * страницы памяти. Что может приводить как к ошибкам ASAN, так и к падению. * Поэтому проверяем смещение на странице, а с ASAN всегда страхуемся. 
*/ -#ifndef __SANITIZE_ADDRESS__ +#if !defined(ENABLE_MEMCHECK) && !defined(__SANITIZE_ADDRESS__) const unsigned on_page_safe_mask = 0xff0 /* enough for '-15' bytes offset */; if (likely(on_page_safe_mask & (uintptr_t)(range + offset)) && !RUNNING_ON_VALGRIND) { @@ -10827,7 +11404,7 @@ __hot static pgno_t *scan4seq_neon(pgno_t *range, const size_t len, goto found; return nullptr; } -#endif /* __SANITIZE_ADDRESS__ */ +#endif /* !ENABLE_MEMCHECK && !__SANITIZE_ADDRESS__ */ do if (*range - range[offset] == target) return range; @@ -10898,22 +11475,6 @@ static pgno_t *scan4seq_resolver(pgno_t *range, const size_t len, //------------------------------------------------------------------------------ -/* Allocate page numbers and memory for writing. Maintain mt_last_reclaimed, - * mt_relist and mt_next_pgno. Set MDBX_TXN_ERROR on failure. - * - * If there are free pages available from older transactions, they - * are re-used first. Otherwise allocate a new page at mt_next_pgno. - * Do not modify the GC, just merge GC records into mt_relist - * and move mt_last_reclaimed to say which records were consumed. Only this - * function can create mt_relist and move - * mt_last_reclaimed/mt_next_pgno. - * - * [in] mc cursor A cursor handle identifying the transaction and - * database for which we are allocating. - * [in] num the number of pages to allocate. 
- * - * Returns 0 on success, non-zero on failure.*/ - #define MDBX_ALLOC_DEFAULT 0 #define MDBX_ALLOC_RESERVE 1 #define MDBX_ALLOC_UNIMPORTANT 2 @@ -11290,12 +11851,24 @@ static __inline pgr_t page_alloc_finalize(MDBX_env *const env, return ret; } +struct monotime_cache { + uint64_t value; + int expire_countdown; +}; + +static __inline uint64_t monotime_since_cached(uint64_t begin_timestamp, + struct monotime_cache *cache) { + if (cache->expire_countdown) + cache->expire_countdown -= 1; + else { + cache->value = osal_monotime(); + cache->expire_countdown = 42 / 3; + } + return cache->value - begin_timestamp; +} + static pgr_t page_alloc_slowpath(const MDBX_cursor *const mc, const size_t num, uint8_t flags) { -#if MDBX_ENABLE_PROFGC - const uint64_t monotime_before = osal_monotime(); -#endif /* MDBX_ENABLE_PROFGC */ - pgr_t ret; MDBX_txn *const txn = mc->mc_txn; MDBX_env *const env = txn->mt_env; @@ -11310,8 +11883,19 @@ static pgr_t page_alloc_slowpath(const MDBX_cursor *const mc, const size_t num, eASSERT(env, pnl_check_allocated(txn->tw.relist, txn->mt_next_pgno - MDBX_ENABLE_REFUND)); - pgno_t pgno = 0; size_t newnext; + const uint64_t monotime_begin = + (MDBX_ENABLE_PROFGC || (num > 1 && env->me_options.gc_time_limit)) + ? 
osal_monotime() + : 0; + struct monotime_cache now_cache; + now_cache.expire_countdown = + 1 /* старт с 1 позволяет избавиться как от лишних системных вызовов когда + лимит времени задан нулевой или уже исчерпан, так и от подсчета + времени при не-достижении rp_augment_limit */ + ; + now_cache.value = monotime_begin; + pgno_t pgno = 0; if (num > 1) { #if MDBX_ENABLE_PROFGC prof->xpages += 1; @@ -11425,6 +12009,8 @@ next_gc:; goto depleted_gc; } if (unlikely(key.iov_len != sizeof(txnid_t))) { + ERROR("%s/%d: %s", "MDBX_CORRUPTED", MDBX_CORRUPTED, + "invalid GC key-length"); ret.err = MDBX_CORRUPTED; goto fail; } @@ -11451,6 +12037,8 @@ next_gc:; if (unlikely(data.iov_len % sizeof(pgno_t) || data.iov_len < MDBX_PNL_SIZEOF(gc_pnl) || !pnl_check(gc_pnl, txn->mt_next_pgno))) { + ERROR("%s/%d: %s", "MDBX_CORRUPTED", MDBX_CORRUPTED, + "invalid GC value-length"); ret.err = MDBX_CORRUPTED; goto fail; } @@ -11487,7 +12075,10 @@ next_gc:; txn->tw.relist) >= env->me_options.rp_augment_limit) && ((/* not a slot-request from gc-update */ num && /* have enough unallocated space */ txn->mt_geo.upper >= - txn->mt_next_pgno + num) || + txn->mt_next_pgno + num && + monotime_since_cached(monotime_begin, &now_cache) + + txn->tw.gc_time_acc >= + env->me_options.gc_time_limit) || gc_len + MDBX_PNL_GETSIZE(txn->tw.relist) >= MDBX_PGL_LIMIT)) { /* Stop reclaiming to avoid large/overflow the page list. 
This is a rare * case while search for a continuously multi-page region in a @@ -11531,6 +12122,8 @@ next_gc:; flags |= MDBX_ALLOC_SHOULD_SCAN; if (AUDIT_ENABLED()) { if (unlikely(!pnl_check(txn->tw.relist, txn->mt_next_pgno))) { + ERROR("%s/%d: %s", "MDBX_CORRUPTED", MDBX_CORRUPTED, + "invalid txn retired-list"); ret.err = MDBX_CORRUPTED; goto fail; } @@ -11789,6 +12382,8 @@ next_gc:; (size_t)txn->mt_dbs[FREE_DBI].md_entries); ret.page = NULL; } + if (num > 1) + txn->tw.gc_time_acc += monotime_since_cached(monotime_begin, &now_cache); } else { early_exit: DEBUG("return NULL for %zu pages for ALLOC_%s, rc %d", num, @@ -11797,7 +12392,7 @@ next_gc:; } #if MDBX_ENABLE_PROFGC - prof->rtime_monotonic += osal_monotime() - monotime_before; + prof->rtime_monotonic += osal_monotime() - monotime_begin; #endif /* MDBX_ENABLE_PROFGC */ return ret; } @@ -11805,7 +12400,8 @@ next_gc:; __hot static pgr_t page_alloc(const MDBX_cursor *const mc) { MDBX_txn *const txn = mc->mc_txn; tASSERT(txn, mc->mc_txn->mt_flags & MDBX_TXN_DIRTY); - tASSERT(txn, F_ISSET(txn->mt_dbistate[mc->mc_dbi], DBI_DIRTY | DBI_VALID)); + tASSERT(txn, F_ISSET(dbi_state(txn, mc->mc_dbi), + DBI_LINDO | DBI_VALID | DBI_DIRTY)); /* If there are any loose pages, just use them */ while (likely(txn->tw.loose_pages)) { @@ -11945,7 +12541,7 @@ __hot static int page_touch(MDBX_cursor *mc) { int rc; tASSERT(txn, mc->mc_txn->mt_flags & MDBX_TXN_DIRTY); - tASSERT(txn, F_ISSET(*mc->mc_dbistate, DBI_DIRTY | DBI_VALID)); + tASSERT(txn, F_ISSET(*mc->mc_dbi_state, DBI_LINDO | DBI_VALID | DBI_DIRTY)); tASSERT(txn, !IS_OVERFLOW(mp)); if (ASSERT_ENABLED()) { if (mc->mc_flags & C_SUB) { @@ -11953,7 +12549,7 @@ __hot static int page_touch(MDBX_cursor *mc) { MDBX_cursor_couple *couple = container_of(mx, MDBX_cursor_couple, inner); tASSERT(txn, mc->mc_db == &couple->outer.mc_xcursor->mx_db); tASSERT(txn, mc->mc_dbx == &couple->outer.mc_xcursor->mx_dbx); - tASSERT(txn, *couple->outer.mc_dbistate & DBI_DIRTY); + tASSERT(txn, 
*couple->outer.mc_dbi_state & DBI_DIRTY); } tASSERT(txn, dirtylist_check(txn)); } @@ -12142,25 +12738,30 @@ static int meta_sync(const MDBX_env *env, const meta_ptr_t head) { return rc; } +static __inline bool env_txn0_owned(const MDBX_env *env) { + return (env->me_flags & MDBX_NOSTICKYTHREADS) + ? (env->me_txn0->mt_owner != 0) + : (env->me_txn0->mt_owner == osal_thread_self()); +} + __cold static int env_sync(MDBX_env *env, bool force, bool nonblock) { - bool locked = false; + if (unlikely(env->me_flags & MDBX_RDONLY)) + return MDBX_EACCESS; + + const bool txn0_owned = env_txn0_owned(env); + bool should_unlock = false; int rc = MDBX_RESULT_TRUE /* means "nothing to sync" */; retry:; unsigned flags = env->me_flags & ~(MDBX_NOMETASYNC | MDBX_SHRINK_ALLOWED); - if (unlikely((flags & (MDBX_RDONLY | MDBX_FATAL_ERROR | MDBX_ENV_ACTIVE)) != + if (unlikely((flags & (MDBX_FATAL_ERROR | MDBX_ENV_ACTIVE)) != MDBX_ENV_ACTIVE)) { - rc = MDBX_EACCESS; - if (!(flags & MDBX_ENV_ACTIVE)) - rc = MDBX_EPERM; - if (flags & MDBX_FATAL_ERROR) - rc = MDBX_PANIC; + rc = (flags & MDBX_FATAL_ERROR) ? MDBX_PANIC : MDBX_EPERM; goto bailout; } - const bool inside_txn = (env->me_txn0->mt_owner == osal_thread_self()); const meta_troika_t troika = - (inside_txn | locked) ? env->me_txn0->tw.troika : meta_tap(env); + (txn0_owned | should_unlock) ? 
env->me_txn0->tw.troika : meta_tap(env); const meta_ptr_t head = meta_recent(env, &troika); const uint64_t unsynced_pages = atomic_load64(&env->me_lck->mti_unsynced_pages, mo_Relaxed); @@ -12171,7 +12772,7 @@ retry:; goto bailout; } - if (!inside_txn && locked && (env->me_flags & MDBX_WRITEMAP) && + if (should_unlock && (env->me_flags & MDBX_WRITEMAP) && unlikely(head.ptr_c->mm_geo.next > bytes2pgno(env, env->me_dxb_mmap.current))) { @@ -12201,8 +12802,8 @@ retry:; osal_monotime() - eoos_timestamp >= autosync_period)) flags &= MDBX_WRITEMAP /* clear flags for full steady sync */; - if (!inside_txn) { - if (!locked) { + if (!txn0_owned) { + if (!should_unlock) { #if MDBX_ENABLE_PGOP_STAT unsigned wops = 0; #endif /* MDBX_ENABLE_PGOP_STAT */ @@ -12244,11 +12845,11 @@ retry:; rc = MDBX_SUCCESS /* means "some data was synced" */; } - err = mdbx_txn_lock(env, nonblock); + err = osal_txn_lock(env, nonblock); if (unlikely(err != MDBX_SUCCESS)) return err; - locked = true; + should_unlock = true; #if MDBX_ENABLE_PGOP_STAT env->me_lck->mti_pgop_stat.wops.weak += wops; #endif /* MDBX_ENABLE_PGOP_STAT */ @@ -12262,8 +12863,8 @@ retry:; flags |= MDBX_SHRINK_ALLOWED; } - eASSERT(env, inside_txn || locked); - eASSERT(env, !inside_txn || (flags & MDBX_SHRINK_ALLOWED) == 0); + eASSERT(env, txn0_owned || should_unlock); + eASSERT(env, !txn0_owned || (flags & MDBX_SHRINK_ALLOWED) == 0); if (!head.is_steady && unlikely(env->me_stuck_meta >= 0) && troika.recent != (uint8_t)env->me_stuck_meta) { @@ -12290,8 +12891,8 @@ retry:; rc = meta_sync(env, head); bailout: - if (locked) - mdbx_txn_unlock(env); + if (should_unlock) + osal_txn_unlock(env); return rc; } @@ -12307,7 +12908,7 @@ static __inline int check_env(const MDBX_env *env, const bool wanna_active) { if (wanna_active) { #if MDBX_ENV_CHECKPID - if (unlikely(env->me_pid != osal_getpid())) { + if (unlikely(env->me_pid != osal_getpid()) && env->me_pid) { ((MDBX_env *)env)->me_flags |= MDBX_FATAL_ERROR; return MDBX_PANIC; } @@ 
-12328,110 +12929,7 @@ __cold int mdbx_env_sync_ex(MDBX_env *env, bool force, bool nonblock) { return env_sync(env, force, nonblock); } -/* Back up parent txn's cursors, then grab the originals for tracking */ -static int cursor_shadow(MDBX_txn *parent, MDBX_txn *nested) { - tASSERT(parent, parent->mt_cursors[FREE_DBI] == nullptr); - nested->mt_cursors[FREE_DBI] = nullptr; - for (int i = parent->mt_numdbs; --i > FREE_DBI;) { - nested->mt_cursors[i] = NULL; - MDBX_cursor *mc = parent->mt_cursors[i]; - if (mc != NULL) { - size_t size = mc->mc_xcursor ? sizeof(MDBX_cursor) + sizeof(MDBX_xcursor) - : sizeof(MDBX_cursor); - for (MDBX_cursor *bk; mc; mc = bk->mc_next) { - bk = mc; - if (mc->mc_signature != MDBX_MC_LIVE) - continue; - bk = osal_malloc(size); - if (unlikely(!bk)) - return MDBX_ENOMEM; -#if MDBX_DEBUG - memset(bk, 0xCD, size); - VALGRIND_MAKE_MEM_UNDEFINED(bk, size); -#endif /* MDBX_DEBUG */ - *bk = *mc; - mc->mc_backup = bk; - /* Kill pointers into src to reduce abuse: The - * user may not use mc until dst ends. But we need a valid - * txn pointer here for cursor fixups to keep working. */ - mc->mc_txn = nested; - mc->mc_db = &nested->mt_dbs[i]; - mc->mc_dbistate = &nested->mt_dbistate[i]; - MDBX_xcursor *mx = mc->mc_xcursor; - if (mx != NULL) { - *(MDBX_xcursor *)(bk + 1) = *mx; - mx->mx_cursor.mc_txn = nested; - } - mc->mc_next = nested->mt_cursors[i]; - nested->mt_cursors[i] = mc; - } - } - } - return MDBX_SUCCESS; -} - -/* Close this txn's cursors, give parent txn's cursors back to parent. - * - * [in] txn the transaction handle. - * [in] merge true to keep changes to parent cursors, false to revert. - * - * Returns 0 on success, non-zero on failure. 
*/ -static void cursors_eot(MDBX_txn *txn, const bool merge) { - tASSERT(txn, txn->mt_cursors[FREE_DBI] == nullptr); - for (intptr_t i = txn->mt_numdbs; --i > FREE_DBI;) { - MDBX_cursor *mc = txn->mt_cursors[i]; - if (!mc) - continue; - txn->mt_cursors[i] = nullptr; - do { - const unsigned stage = mc->mc_signature; - MDBX_cursor *const next = mc->mc_next; - MDBX_cursor *const bk = mc->mc_backup; - ENSURE(txn->mt_env, - stage == MDBX_MC_LIVE || (stage == MDBX_MC_WAIT4EOT && bk)); - cASSERT(mc, mc->mc_dbi == (MDBX_dbi)i); - if (bk) { - MDBX_xcursor *mx = mc->mc_xcursor; - tASSERT(txn, txn->mt_parent != NULL); - /* Zap: Using uninitialized memory '*mc->mc_backup'. */ - MDBX_SUPPRESS_GOOFY_MSVC_ANALYZER(6001); - ENSURE(txn->mt_env, bk->mc_signature == MDBX_MC_LIVE); - tASSERT(txn, mx == bk->mc_xcursor); - if (stage == MDBX_MC_WAIT4EOT /* Cursor was closed by user */) - mc->mc_signature = stage /* Promote closed state to parent txn */; - else if (merge) { - /* Restore pointers to parent txn */ - mc->mc_next = bk->mc_next; - mc->mc_backup = bk->mc_backup; - mc->mc_txn = bk->mc_txn; - mc->mc_db = bk->mc_db; - mc->mc_dbistate = bk->mc_dbistate; - if (mx) { - if (mx != bk->mc_xcursor) { - *bk->mc_xcursor = *mx; - mx = bk->mc_xcursor; - } - mx->mx_cursor.mc_txn = bk->mc_txn; - } - } else { - /* Restore from backup, i.e. rollback/abort nested txn */ - *mc = *bk; - if (mx) - *mx = *(MDBX_xcursor *)(bk + 1); - } - bk->mc_signature = 0; - osal_free(bk); - } else { - ENSURE(txn->mt_env, stage == MDBX_MC_LIVE); - mc->mc_signature = MDBX_MC_READY4CLOSE /* Cursor may be reused */; - mc->mc_flags = 0 /* reset C_UNTRACK */; - } - mc = next; - } while (mc); - } -} - -#if defined(MDBX_USE_VALGRIND) || defined(__SANITIZE_ADDRESS__) +#if defined(ENABLE_MEMCHECK) || defined(__SANITIZE_ADDRESS__) /* Find largest mvcc-snapshot still referenced by this process. 
*/ static pgno_t find_largest_this(MDBX_env *env, pgno_t largest) { MDBX_lockinfo *const lck = env->me_lck_mmap.lck; @@ -12479,13 +12977,16 @@ static void txn_valgrind(MDBX_env *env, MDBX_txn *txn) { } else { /* transaction end */ bool should_unlock = false; pgno_t last = MAX_PAGENO + 1; - if (env->me_txn0 && env->me_txn0->mt_owner == osal_thread_self()) { + if (env->me_pid != osal_getpid()) { + /* resurrect after fork */ + return; + } else if (env->me_txn && env_txn0_owned(env)) { /* inside write-txn */ last = meta_recent(env, &env->me_txn0->tw.troika).ptr_v->mm_geo.next; } else if (env->me_flags & MDBX_RDONLY) { /* read-only mode, no write-txn, no wlock mutex */ last = NUM_METAS; - } else if (mdbx_txn_lock(env, true) == MDBX_SUCCESS) { + } else if (osal_txn_lock(env, true) == MDBX_SUCCESS) { /* no write-txn */ last = NUM_METAS; should_unlock = true; @@ -12506,10 +13007,10 @@ static void txn_valgrind(MDBX_env *env, MDBX_txn *txn) { pgno2bytes(env, edge - last)); } if (should_unlock) - mdbx_txn_unlock(env); + osal_txn_unlock(env); } } -#endif /* MDBX_USE_VALGRIND || __SANITIZE_ADDRESS__ */ +#endif /* ENABLE_MEMCHECK || __SANITIZE_ADDRESS__ */ typedef struct { int err; @@ -12575,7 +13076,7 @@ static bind_rslot_result bind_rslot(MDBX_env *env, const uintptr_t tid) { safe64_reset(&result.rslot->mr_txnid, true); if (slot == nreaders) env->me_lck->mti_numreaders.weak = (uint32_t)++nreaders; - result.rslot->mr_tid.weak = (env->me_flags & MDBX_NOTLS) ? 0 : tid; + result.rslot->mr_tid.weak = (env->me_flags & MDBX_NOSTICKYTHREADS) ? 0 : tid; atomic_store32(&result.rslot->mr_pid, env->me_pid, mo_AcquireRelease); osal_rdt_unlock(env); @@ -12595,12 +13096,12 @@ __cold int mdbx_thread_register(const MDBX_env *env) { return (env->me_flags & MDBX_EXCLUSIVE) ? 
MDBX_EINVAL : MDBX_EPERM; if (unlikely((env->me_flags & MDBX_ENV_TXKEY) == 0)) { - eASSERT(env, !env->me_lck_mmap.lck || (env->me_flags & MDBX_NOTLS)); - return MDBX_EINVAL /* MDBX_NOTLS mode */; + eASSERT(env, env->me_flags & MDBX_NOSTICKYTHREADS); + return MDBX_EINVAL /* MDBX_NOSTICKYTHREADS mode */; } - eASSERT(env, (env->me_flags & (MDBX_NOTLS | MDBX_ENV_TXKEY | - MDBX_EXCLUSIVE)) == MDBX_ENV_TXKEY); + eASSERT(env, (env->me_flags & (MDBX_NOSTICKYTHREADS | MDBX_ENV_TXKEY)) == + MDBX_ENV_TXKEY); MDBX_reader *r = thread_rthc_get(env->me_txkey); if (unlikely(r != NULL)) { eASSERT(env, r->mr_pid.weak == env->me_pid); @@ -12611,7 +13112,7 @@ __cold int mdbx_thread_register(const MDBX_env *env) { } const uintptr_t tid = osal_thread_self(); - if (env->me_txn0 && unlikely(env->me_txn0->mt_owner == tid)) + if (env->me_txn && unlikely(env->me_txn0->mt_owner == tid)) return MDBX_TXN_OVERLAPPING; return bind_rslot((MDBX_env *)env, tid).err; } @@ -12625,12 +13126,12 @@ __cold int mdbx_thread_unregister(const MDBX_env *env) { return MDBX_RESULT_TRUE; if (unlikely((env->me_flags & MDBX_ENV_TXKEY) == 0)) { - eASSERT(env, !env->me_lck_mmap.lck || (env->me_flags & MDBX_NOTLS)); - return MDBX_RESULT_TRUE /* MDBX_NOTLS mode */; + eASSERT(env, env->me_flags & MDBX_NOSTICKYTHREADS); + return MDBX_RESULT_TRUE /* MDBX_NOSTICKYTHREADS mode */; } - eASSERT(env, (env->me_flags & (MDBX_NOTLS | MDBX_ENV_TXKEY | - MDBX_EXCLUSIVE)) == MDBX_ENV_TXKEY); + eASSERT(env, (env->me_flags & (MDBX_NOSTICKYTHREADS | MDBX_ENV_TXKEY)) == + MDBX_ENV_TXKEY); MDBX_reader *r = thread_rthc_get(env->me_txkey); if (unlikely(r == NULL)) return MDBX_RESULT_TRUE /* not registered */; @@ -12798,11 +13299,16 @@ __hot static int coherency_check_head(MDBX_txn *txn, const meta_ptr_t head, /* Copy the DB info and flags */ txn->mt_geo = head.ptr_v->mm_geo; memcpy(txn->mt_dbs, head.ptr_c->mm_dbs, CORE_DBS * sizeof(MDBX_db)); + VALGRIND_MAKE_MEM_UNDEFINED(txn->mt_dbs + CORE_DBS, + txn->mt_env->me_maxdbs - CORE_DBS); 
txn->mt_canary = head.ptr_v->mm_canary; if (unlikely(!coherency_check(txn->mt_env, head.txnid, txn->mt_dbs, head.ptr_v, *timestamp == 0))) return coherency_timeout(timestamp, -1, txn->mt_env); + + tASSERT(txn, txn->mt_dbs[FREE_DBI].md_flags == MDBX_INTEGERKEY); + tASSERT(txn, db_check_flags(txn->mt_dbs[MAIN_DBI].md_flags)); return MDBX_SUCCESS; } @@ -12826,6 +13332,9 @@ static int coherency_check_written(const MDBX_env *env, const txnid_t txnid, } if (unlikely(!coherency_check(env, head_txnid, meta->mm_dbs, meta, report))) return coherency_timeout(timestamp, pgno, env); + + eASSERT(env, meta->mm_dbs[FREE_DBI].md_flags == MDBX_INTEGERKEY); + eASSERT(env, db_check_flags(meta->mm_dbs[MAIN_DBI].md_flags)); return MDBX_SUCCESS; } @@ -12837,7 +13346,7 @@ static bool check_meta_coherency(const MDBX_env *env, } /* Common code for mdbx_txn_begin() and mdbx_txn_renew(). */ -static int txn_renew(MDBX_txn *txn, const unsigned flags) { +static int txn_renew(MDBX_txn *txn, unsigned flags) { MDBX_env *env = txn->mt_env; int rc; @@ -12862,18 +13371,19 @@ static int txn_renew(MDBX_txn *txn, const unsigned flags) { 0); const uintptr_t tid = osal_thread_self(); + flags |= env->me_flags & (MDBX_NOSTICKYTHREADS | MDBX_WRITEMAP); if (flags & MDBX_TXN_RDONLY) { - eASSERT(env, (flags & ~(MDBX_TXN_RO_BEGIN_FLAGS | MDBX_WRITEMAP)) == 0); - txn->mt_flags = - MDBX_TXN_RDONLY | (env->me_flags & (MDBX_NOTLS | MDBX_WRITEMAP)); + eASSERT(env, (flags & ~(MDBX_TXN_RO_BEGIN_FLAGS | MDBX_WRITEMAP | + MDBX_NOSTICKYTHREADS)) == 0); + txn->mt_flags = flags; MDBX_reader *r = txn->to.reader; STATIC_ASSERT(sizeof(uintptr_t) <= sizeof(r->mr_tid)); if (likely(env->me_flags & MDBX_ENV_TXKEY)) { - eASSERT(env, !(env->me_flags & MDBX_NOTLS)); + eASSERT(env, !(env->me_flags & MDBX_NOSTICKYTHREADS)); r = thread_rthc_get(env->me_txkey); if (likely(r)) { if (unlikely(!r->mr_pid.weak) && - (runtime_flags & MDBX_DBG_LEGACY_MULTIOPEN)) { + (mdbx_static.flags & MDBX_DBG_LEGACY_MULTIOPEN)) { 
thread_rthc_set(env->me_txkey, nullptr); r = nullptr; } else { @@ -12882,7 +13392,8 @@ static int txn_renew(MDBX_txn *txn, const unsigned flags) { } } } else { - eASSERT(env, !env->me_lck_mmap.lck || (env->me_flags & MDBX_NOTLS)); + eASSERT(env, + !env->me_lck_mmap.lck || (env->me_flags & MDBX_NOSTICKYTHREADS)); } if (likely(r)) { @@ -12896,6 +13407,7 @@ static int txn_renew(MDBX_txn *txn, const unsigned flags) { r = brs.rslot; } txn->to.reader = r; + STATIC_ASSERT(MDBX_TXN_RDONLY_PREPARE > MDBX_TXN_RDONLY); if (flags & (MDBX_TXN_RDONLY_PREPARE - MDBX_TXN_RDONLY)) { eASSERT(env, txn->mt_txnid == 0); eASSERT(env, txn->mt_owner == 0); @@ -12908,6 +13420,7 @@ static int txn_renew(MDBX_txn *txn, const unsigned flags) { txn->mt_flags = MDBX_TXN_RDONLY | MDBX_TXN_FINISHED; return MDBX_SUCCESS; } + txn->mt_owner = tid; /* Seek & fetch the last meta */ uint64_t timestamp = 0; @@ -12928,9 +13441,9 @@ static int txn_renew(MDBX_txn *txn, const unsigned flags) { mo_Relaxed); safe64_write(&r->mr_txnid, head.txnid); eASSERT(env, r->mr_pid.weak == osal_getpid()); - eASSERT(env, - r->mr_tid.weak == - ((env->me_flags & MDBX_NOTLS) ? 0 : osal_thread_self())); + eASSERT(env, r->mr_tid.weak == ((env->me_flags & MDBX_NOSTICKYTHREADS) + ? 
0 + : osal_thread_self())); eASSERT(env, r->mr_txnid.weak == head.txnid || (r->mr_txnid.weak >= SAFE64_INVALID_THRESHOLD && head.txnid < env->me_lck->mti_oldest_reader.weak)); @@ -12938,8 +13451,7 @@ static int txn_renew(MDBX_txn *txn, const unsigned flags) { mo_AcquireRelease); } else { /* exclusive mode without lck */ - eASSERT(env, !env->me_lck_mmap.lck && - env->me_lck == (void *)&env->x_lckless_stub); + eASSERT(env, !env->me_lck_mmap.lck && env->me_lck == lckless_stub(env)); } jitter4testing(true); @@ -12983,21 +13495,20 @@ static int txn_renew(MDBX_txn *txn, const unsigned flags) { rc = MDBX_CORRUPTED; goto bailout; } - eASSERT(env, txn->mt_txnid >= env->me_lck->mti_oldest_reader.weak); - txn->mt_dbxs = env->me_dbxs; /* mostly static anyway */ ENSURE(env, txn->mt_txnid >= /* paranoia is appropriate here */ env->me_lck ->mti_oldest_reader.weak); - txn->mt_numdbs = env->me_numdbs; + tASSERT(txn, txn->mt_dbs[FREE_DBI].md_flags == MDBX_INTEGERKEY); + tASSERT(txn, db_check_flags(txn->mt_dbs[MAIN_DBI].md_flags)); } else { eASSERT(env, (flags & ~(MDBX_TXN_RW_BEGIN_FLAGS | MDBX_TXN_SPILLS | - MDBX_WRITEMAP)) == 0); + MDBX_WRITEMAP | MDBX_NOSTICKYTHREADS)) == 0); if (unlikely(txn->mt_owner == tid || /* not recovery mode */ env->me_stuck_meta >= 0)) return MDBX_BUSY; MDBX_lockinfo *const lck = env->me_lck_mmap.lck; - if (lck && (env->me_flags & MDBX_NOTLS) == 0 && - (runtime_flags & MDBX_DBG_LEGACY_OVERLAP) == 0) { + if (lck && (env->me_flags & MDBX_NOSTICKYTHREADS) == 0 && + (mdbx_static.flags & MDBX_DBG_LEGACY_OVERLAP) == 0) { const size_t snap_nreaders = atomic_load32(&lck->mti_numreaders, mo_AcquireRelease); for (size_t i = 0; i < snap_nreaders; ++i) { @@ -13014,16 +13525,16 @@ static int txn_renew(MDBX_txn *txn, const unsigned flags) { /* Not yet touching txn == env->me_txn0, it may be active */ jitter4testing(false); - rc = mdbx_txn_lock(env, !!(flags & MDBX_TXN_TRY)); + rc = osal_txn_lock(env, !!(flags & MDBX_TXN_TRY)); if (unlikely(rc)) return rc; if 
(unlikely(env->me_flags & MDBX_FATAL_ERROR)) { - mdbx_txn_unlock(env); + osal_txn_unlock(env); return MDBX_PANIC; } #if defined(_WIN32) || defined(_WIN64) if (unlikely(!env->me_map)) { - mdbx_txn_unlock(env); + osal_txn_unlock(env); return MDBX_EPERM; } #endif /* Windows */ @@ -13046,6 +13557,8 @@ static int txn_renew(MDBX_txn *txn, const unsigned flags) { goto bailout; } + tASSERT(txn, txn->mt_dbs[FREE_DBI].md_flags == MDBX_INTEGERKEY); + tASSERT(txn, db_check_flags(txn->mt_dbs[MAIN_DBI].md_flags)); txn->mt_flags = flags; txn->mt_child = NULL; txn->tw.loose_pages = NULL; @@ -13056,12 +13569,11 @@ static int txn_renew(MDBX_txn *txn, const unsigned flags) { MDBX_PNL_SETSIZE(txn->tw.retired_pages, 0); txn->tw.spilled.list = NULL; txn->tw.spilled.least_removed = 0; + txn->tw.gc_time_acc = 0; txn->tw.last_reclaimed = 0; if (txn->tw.lifo_reclaimed) MDBX_PNL_SETSIZE(txn->tw.lifo_reclaimed, 0); env->me_txn = txn; - txn->mt_numdbs = env->me_numdbs; - memcpy(txn->mt_dbiseqs, env->me_dbiseqs, txn->mt_numdbs * sizeof(unsigned)); if ((txn->mt_flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC) { rc = dpl_alloc(txn); @@ -13079,24 +13591,98 @@ static int txn_renew(MDBX_txn *txn, const unsigned flags) { eASSERT(env, txn->tw.writemap_spilled_npages == 0); } - /* Setup db info */ - osal_compiler_barrier(); - memset(txn->mt_cursors, 0, sizeof(MDBX_cursor *) * txn->mt_numdbs); - for (size_t i = CORE_DBS; i < txn->mt_numdbs; i++) { - const unsigned db_flags = env->me_dbflags[i]; - txn->mt_dbs[i].md_flags = db_flags & DB_PERSISTENT_FLAGS; - txn->mt_dbistate[i] = - (db_flags & DB_VALID) ? 
DBI_VALID | DBI_USRVALID | DBI_STALE : 0; - } - txn->mt_dbistate[MAIN_DBI] = DBI_VALID | DBI_USRVALID; - rc = - setup_dbx(&txn->mt_dbxs[MAIN_DBI], &txn->mt_dbs[MAIN_DBI], env->me_psize); - if (unlikely(rc != MDBX_SUCCESS)) - goto bailout; - txn->mt_dbistate[FREE_DBI] = DBI_VALID; txn->mt_front = txn->mt_txnid + ((flags & (MDBX_WRITEMAP | MDBX_RDONLY)) == 0); + /* Setup db info */ + tASSERT(txn, txn->mt_dbs[FREE_DBI].md_flags == MDBX_INTEGERKEY); + tASSERT(txn, db_check_flags(txn->mt_dbs[MAIN_DBI].md_flags)); + VALGRIND_MAKE_MEM_UNDEFINED(txn->mt_dbi_state, env->me_maxdbs); +#if MDBX_ENABLE_DBI_SPARSE + txn->mt_numdbs = CORE_DBS; + VALGRIND_MAKE_MEM_UNDEFINED( + txn->mt_dbi_sparse, + ceil_powerof2(env->me_maxdbs, CHAR_BIT * sizeof(txn->mt_dbi_sparse[0])) / + CHAR_BIT); + txn->mt_dbi_sparse[0] = (1 << CORE_DBS) - 1; +#else + txn->mt_numdbs = (env->me_numdbs < 8) ? env->me_numdbs : 8; + if (txn->mt_numdbs > CORE_DBS) + memset(txn->mt_dbi_state + CORE_DBS, 0, txn->mt_numdbs - CORE_DBS); +#endif /* MDBX_ENABLE_DBI_SPARSE */ + txn->mt_dbi_state[FREE_DBI] = DBI_LINDO | DBI_VALID; + txn->mt_dbi_state[MAIN_DBI] = DBI_LINDO | DBI_VALID; + txn->mt_cursors[FREE_DBI] = nullptr; + txn->mt_cursors[MAIN_DBI] = nullptr; + txn->mt_dbi_seqs[FREE_DBI] = 0; + txn->mt_dbi_seqs[MAIN_DBI] = + atomic_load32(&env->me_dbi_seqs[MAIN_DBI], mo_AcquireRelease); + + if (unlikely(env->me_db_flags[MAIN_DBI] != + (DB_VALID | txn->mt_dbs[MAIN_DBI].md_flags))) { + const bool need_txn_lock = env->me_txn0 && env->me_txn0->mt_owner != tid; + bool should_unlock = false; + if (need_txn_lock) { + rc = osal_txn_lock(env, true); + if (rc == MDBX_SUCCESS) + should_unlock = true; + else if (rc != MDBX_BUSY && rc != MDBX_EDEADLK) + goto bailout; + } + rc = osal_fastmutex_acquire(&env->me_dbi_lock); + if (likely(rc == MDBX_SUCCESS)) { + uint32_t seq = dbi_seq_next(env, MAIN_DBI); + /* проверяем повторно после захвата блокировки */ + if (env->me_db_flags[MAIN_DBI] != + (DB_VALID | txn->mt_dbs[MAIN_DBI].md_flags)) 
{ + if (!need_txn_lock || should_unlock || + /* если нет активной пишущей транзакции, + * то следующая будет ждать на me_dbi_lock */ + !env->me_txn) { + if (env->me_db_flags[MAIN_DBI] != 0 || MDBX_DEBUG) + NOTICE("renew MainDB for %s-txn %" PRIaTXN + " since db-flags changes 0x%x -> 0x%x", + (txn->mt_flags & MDBX_TXN_RDONLY) ? "ro" : "rw", + txn->mt_txnid, env->me_db_flags[MAIN_DBI] & ~DB_VALID, + txn->mt_dbs[MAIN_DBI].md_flags); + env->me_db_flags[MAIN_DBI] = DB_POISON; + atomic_store32(&env->me_dbi_seqs[MAIN_DBI], seq, mo_AcquireRelease); + rc = setup_sdb(&env->me_dbxs[MAIN_DBI], &txn->mt_dbs[MAIN_DBI], + env->me_psize); + if (likely(rc == MDBX_SUCCESS)) { + seq = dbi_seq_next(env, MAIN_DBI); + env->me_db_flags[MAIN_DBI] = + DB_VALID | txn->mt_dbs[MAIN_DBI].md_flags; + txn->mt_dbi_seqs[MAIN_DBI] = atomic_store32( + &env->me_dbi_seqs[MAIN_DBI], seq, mo_AcquireRelease); + } + } else { + ERROR("MainDB db-flags changes 0x%x -> 0x%x ahead of read-txn " + "%" PRIaTXN, + txn->mt_dbs[MAIN_DBI].md_flags, + env->me_db_flags[MAIN_DBI] & ~DB_VALID, txn->mt_txnid); + rc = MDBX_INCOMPATIBLE; + } + } + ENSURE(env, osal_fastmutex_release(&env->me_dbi_lock) == MDBX_SUCCESS); + } else { + DEBUG("me_dbi_lock failed, err %d", rc); + } + if (should_unlock) + osal_txn_unlock(env); + if (unlikely(rc != MDBX_SUCCESS)) + goto bailout; + } + + if (unlikely(txn->mt_dbs[FREE_DBI].md_flags != MDBX_INTEGERKEY)) { + ERROR("unexpected/invalid db-flags 0x%u for GC/FreeDB", + txn->mt_dbs[FREE_DBI].md_flags); + rc = MDBX_INCOMPATIBLE; + goto bailout; + } + + tASSERT(txn, txn->mt_dbs[FREE_DBI].md_flags == MDBX_INTEGERKEY); + tASSERT(txn, db_check_flags(txn->mt_dbs[MAIN_DBI].md_flags)); if (unlikely(env->me_flags & MDBX_FATAL_ERROR)) { WARNING("%s", "environment had fatal error, must shutdown!"); rc = MDBX_PANIC; @@ -13181,34 +13767,27 @@ static int txn_renew(MDBX_txn *txn, const unsigned flags) { * since Wine don't support section extending, * i.e. in both cases unmap+map are required. 
*/ used_bytes < env->me_dbgeo.upper && env->me_dbgeo.grow)) && - /* avoid recursive use SRW */ (txn->mt_flags & MDBX_NOTLS) == 0) { + /* avoid recursive use SRW */ (txn->mt_flags & + MDBX_NOSTICKYTHREADS) == 0) { txn->mt_flags |= MDBX_SHRINK_ALLOWED; osal_srwlock_AcquireShared(&env->me_remap_guard); } #endif /* Windows */ } else { - if (unlikely(txn->mt_dbs[FREE_DBI].md_flags != MDBX_INTEGERKEY)) { - ERROR("unexpected/invalid db-flags 0x%u for GC/FreeDB", - txn->mt_dbs[FREE_DBI].md_flags); - rc = MDBX_INCOMPATIBLE; - goto bailout; - } - tASSERT(txn, txn == env->me_txn0); MDBX_cursor *const gc = ptr_disp(txn, sizeof(MDBX_txn)); rc = cursor_init(gc, txn, FREE_DBI); if (rc != MDBX_SUCCESS) goto bailout; } -#if defined(MDBX_USE_VALGRIND) || defined(__SANITIZE_ADDRESS__) +#if defined(ENABLE_MEMCHECK) || defined(__SANITIZE_ADDRESS__) txn_valgrind(env, txn); -#endif /* MDBX_USE_VALGRIND || __SANITIZE_ADDRESS__ */ - txn->mt_owner = tid; +#endif /* ENABLE_MEMCHECK || __SANITIZE_ADDRESS__ */ return MDBX_SUCCESS; } bailout: tASSERT(txn, rc != MDBX_SUCCESS); - txn_end(txn, MDBX_END_SLOT | MDBX_END_FAIL_BEGIN); + txn_end(txn, TXN_END_SLOT | TXN_END_FAIL_BEGIN); return rc; } @@ -13223,15 +13802,13 @@ static __always_inline int check_txn(const MDBX_txn *txn, int bad_bits) { return MDBX_BAD_TXN; tASSERT(txn, (txn->mt_flags & MDBX_TXN_FINISHED) || - (txn->mt_flags & MDBX_NOTLS) == - ((txn->mt_flags & MDBX_TXN_RDONLY) - ? 
txn->mt_env->me_flags & MDBX_NOTLS - : 0)); + (txn->mt_flags & MDBX_NOSTICKYTHREADS) == + (txn->mt_env->me_flags & MDBX_NOSTICKYTHREADS)); #if MDBX_TXN_CHECKOWNER - STATIC_ASSERT(MDBX_NOTLS > MDBX_TXN_FINISHED + MDBX_TXN_RDONLY); - if (unlikely(txn->mt_owner != osal_thread_self()) && - (txn->mt_flags & (MDBX_NOTLS | MDBX_TXN_FINISHED | MDBX_TXN_RDONLY)) < - (MDBX_TXN_FINISHED | MDBX_TXN_RDONLY)) + STATIC_ASSERT((long)MDBX_NOSTICKYTHREADS > (long)MDBX_TXN_FINISHED); + if ((txn->mt_flags & (MDBX_NOSTICKYTHREADS | MDBX_TXN_FINISHED)) < + MDBX_TXN_FINISHED && + unlikely(txn->mt_owner != osal_thread_self())) return txn->mt_owner ? MDBX_THREAD_MISMATCH : MDBX_BAD_TXN; #endif /* MDBX_TXN_CHECKOWNER */ @@ -13312,8 +13889,6 @@ int mdbx_txn_begin_ex(MDBX_env *env, MDBX_txn *parent, MDBX_txn_flags_t flags, ~flags)) /* write txn in RDONLY env */ return MDBX_EACCESS; - flags |= env->me_flags & MDBX_WRITEMAP; - MDBX_txn *txn = nullptr; if (parent) { /* Nested transactions: Max 1 child, write txns only, no writemap */ @@ -13332,11 +13907,12 @@ int mdbx_txn_begin_ex(MDBX_env *env, MDBX_txn *parent, MDBX_txn_flags_t flags, } tASSERT(parent, audit_ex(parent, 0, false) == 0); - flags |= parent->mt_flags & (MDBX_TXN_RW_BEGIN_FLAGS | MDBX_TXN_SPILLS); + flags |= parent->mt_flags & (MDBX_TXN_RW_BEGIN_FLAGS | MDBX_TXN_SPILLS | + MDBX_NOSTICKYTHREADS | MDBX_WRITEMAP); } else if (flags & MDBX_TXN_RDONLY) { - if (env->me_txn0 && + if ((env->me_flags & MDBX_NOSTICKYTHREADS) == 0 && env->me_txn && unlikely(env->me_txn0->mt_owner == osal_thread_self()) && - (runtime_flags & MDBX_DBG_LEGACY_OVERLAP) == 0) + (mdbx_static.flags & MDBX_DBG_LEGACY_OVERLAP) == 0) return MDBX_TXN_OVERLAPPING; } else { /* Reuse preallocated write txn. 
However, do not touch it until @@ -13345,11 +13921,24 @@ int mdbx_txn_begin_ex(MDBX_env *env, MDBX_txn *parent, MDBX_txn_flags_t flags, goto renew; } + const intptr_t bitmap_bytes = +#if MDBX_ENABLE_DBI_SPARSE + ceil_powerof2(env->me_maxdbs, CHAR_BIT * sizeof(txn->mt_dbi_sparse[0])) / + CHAR_BIT; +#else + 0; +#endif /* MDBX_ENABLE_DBI_SPARSE */ + STATIC_ASSERT(sizeof(txn->tw) > sizeof(txn->to)); const size_t base = (flags & MDBX_TXN_RDONLY) ? sizeof(MDBX_txn) - sizeof(txn->tw) + sizeof(txn->to) : sizeof(MDBX_txn); const size_t size = - base + env->me_maxdbs * (sizeof(MDBX_db) + sizeof(MDBX_cursor *) + 1); + base + + ((flags & MDBX_TXN_RDONLY) + ? (size_t)bitmap_bytes + env->me_maxdbs * sizeof(txn->mt_dbi_seqs[0]) + : 0) + + env->me_maxdbs * (sizeof(txn->mt_dbs[0]) + sizeof(txn->mt_cursors[0]) + + sizeof(txn->mt_dbi_state[0])); txn = osal_malloc(size); if (unlikely(txn == nullptr)) { DEBUG("calloc: %s", "failed"); @@ -13363,18 +13952,22 @@ int mdbx_txn_begin_ex(MDBX_env *env, MDBX_txn *parent, MDBX_txn_flags_t flags, memset(txn, 0, (MDBX_GOOFY_MSVC_STATIC_ANALYZER && base > size) ? 
size : base); txn->mt_dbs = ptr_disp(txn, base); - txn->mt_cursors = ptr_disp(txn->mt_dbs, sizeof(MDBX_db) * env->me_maxdbs); + txn->mt_cursors = + ptr_disp(txn->mt_dbs, env->me_maxdbs * sizeof(txn->mt_dbs[0])); #if MDBX_DEBUG txn->mt_cursors[FREE_DBI] = nullptr; /* avoid SIGSEGV in an assertion later */ -#endif /* MDBX_DEBUG */ - txn->mt_dbistate = ptr_disp(txn, size - env->me_maxdbs); - txn->mt_dbxs = env->me_dbxs; /* static */ +#endif + txn->mt_dbi_state = + ptr_disp(txn, size - env->me_maxdbs * sizeof(txn->mt_dbi_state[0])); txn->mt_flags = flags; txn->mt_env = env; if (parent) { tASSERT(parent, dirtylist_check(parent)); - txn->mt_dbiseqs = parent->mt_dbiseqs; +#if MDBX_ENABLE_DBI_SPARSE + txn->mt_dbi_sparse = parent->mt_dbi_sparse; +#endif /* MDBX_ENABLE_DBI_SPARSE */ + txn->mt_dbi_seqs = parent->mt_dbi_seqs; txn->mt_geo = parent->mt_geo; rc = dpl_alloc(txn); if (likely(rc == MDBX_SUCCESS)) { @@ -13431,6 +14024,7 @@ int mdbx_txn_begin_ex(MDBX_env *env, MDBX_txn *parent, MDBX_txn_flags_t flags, = parent->mt_next_pgno) - MDBX_ENABLE_REFUND)); + txn->tw.gc_time_acc = parent->tw.gc_time_acc; txn->tw.last_reclaimed = parent->tw.last_reclaimed; if (parent->tw.lifo_reclaimed) { txn->tw.lifo_reclaimed = parent->tw.lifo_reclaimed; @@ -13451,14 +14045,19 @@ int mdbx_txn_begin_ex(MDBX_env *env, MDBX_txn *parent, MDBX_txn_flags_t flags, parent->mt_flags |= MDBX_TXN_HAS_CHILD; parent->mt_child = txn; txn->mt_parent = parent; - txn->mt_numdbs = parent->mt_numdbs; txn->mt_owner = parent->mt_owner; - memcpy(txn->mt_dbs, parent->mt_dbs, txn->mt_numdbs * sizeof(MDBX_db)); txn->tw.troika = parent->tw.troika; - /* Copy parent's mt_dbistate, but clear DB_NEW */ - for (size_t i = 0; i < txn->mt_numdbs; i++) - txn->mt_dbistate[i] = - parent->mt_dbistate[i] & ~(DBI_FRESH | DBI_CREAT | DBI_DIRTY); + + txn->mt_cursors[FREE_DBI] = nullptr; + txn->mt_cursors[MAIN_DBI] = nullptr; + txn->mt_dbi_state[FREE_DBI] = + parent->mt_dbi_state[FREE_DBI] & ~(DBI_FRESH | DBI_CREAT | DBI_DIRTY); + 
txn->mt_dbi_state[MAIN_DBI] = + parent->mt_dbi_state[MAIN_DBI] & ~(DBI_FRESH | DBI_CREAT | DBI_DIRTY); + memset(txn->mt_dbi_state + CORE_DBS, 0, + (txn->mt_numdbs = parent->mt_numdbs) - CORE_DBS); + memcpy(txn->mt_dbs, parent->mt_dbs, sizeof(txn->mt_dbs[0]) * CORE_DBS); + tASSERT(parent, parent->tw.dirtyroom + parent->tw.dirtylist->length == (parent->mt_parent ? parent->mt_parent->tw.dirtyroom @@ -13467,15 +14066,22 @@ int mdbx_txn_begin_ex(MDBX_env *env, MDBX_txn *parent, MDBX_txn_flags_t flags, (txn->mt_parent ? txn->mt_parent->tw.dirtyroom : txn->mt_env->me_options.dp_limit)); env->me_txn = txn; - rc = cursor_shadow(parent, txn); + tASSERT(parent, parent->mt_cursors[FREE_DBI] == nullptr); + rc = parent->mt_cursors[MAIN_DBI] + ? cursor_shadow(parent->mt_cursors[MAIN_DBI], txn, MAIN_DBI) + : MDBX_SUCCESS; if (AUDIT_ENABLED() && ASSERT_ENABLED()) { txn->mt_signature = MDBX_MT_SIGNATURE; tASSERT(txn, audit_ex(txn, 0, false) == 0); } if (unlikely(rc != MDBX_SUCCESS)) - txn_end(txn, MDBX_END_FAIL_BEGINCHILD); + txn_end(txn, TXN_END_FAIL_BEGINCHILD); } else { /* MDBX_TXN_RDONLY */ - txn->mt_dbiseqs = env->me_dbiseqs; + txn->mt_dbi_seqs = + ptr_disp(txn->mt_cursors, env->me_maxdbs * sizeof(txn->mt_cursors[0])); +#if MDBX_ENABLE_DBI_SPARSE + txn->mt_dbi_sparse = ptr_disp(txn->mt_dbi_state, -bitmap_bytes); +#endif /* MDBX_ENABLE_DBI_SPARSE */ renew: rc = txn_renew(txn, flags); } @@ -13488,12 +14094,13 @@ int mdbx_txn_begin_ex(MDBX_env *env, MDBX_txn *parent, MDBX_txn_flags_t flags, eASSERT(env, txn->mt_flags == (MDBX_TXN_RDONLY | MDBX_TXN_FINISHED)); else if (flags & MDBX_TXN_RDONLY) eASSERT(env, (txn->mt_flags & - ~(MDBX_NOTLS | MDBX_TXN_RDONLY | MDBX_WRITEMAP | + ~(MDBX_NOSTICKYTHREADS | MDBX_TXN_RDONLY | MDBX_WRITEMAP | /* Win32: SRWL flag */ MDBX_SHRINK_ALLOWED)) == 0); else { - eASSERT(env, (txn->mt_flags & - ~(MDBX_WRITEMAP | MDBX_SHRINK_ALLOWED | MDBX_NOMETASYNC | - MDBX_SAFE_NOSYNC | MDBX_TXN_SPILLS)) == 0); + eASSERT(env, + (txn->mt_flags & + 
~(MDBX_NOSTICKYTHREADS | MDBX_WRITEMAP | MDBX_SHRINK_ALLOWED | + MDBX_NOMETASYNC | MDBX_SAFE_NOSYNC | MDBX_TXN_SPILLS)) == 0); assert(!txn->tw.spilled.list && !txn->tw.spilled.least_removed); } txn->mt_signature = MDBX_MT_SIGNATURE; @@ -13645,141 +14252,17 @@ uint64_t mdbx_txn_id(const MDBX_txn *txn) { } int mdbx_txn_flags(const MDBX_txn *txn) { - if (unlikely(!txn || txn->mt_signature != MDBX_MT_SIGNATURE)) { - assert((-1 & (int)MDBX_TXN_INVALID) != 0); - return -1; - } + STATIC_ASSERT( + (MDBX_TXN_INVALID & + (MDBX_TXN_FINISHED | MDBX_TXN_ERROR | MDBX_TXN_DIRTY | MDBX_TXN_SPILLS | + MDBX_TXN_HAS_CHILD | MDBX_TXN_DRAINED_GC | MDBX_SHRINK_ALLOWED | + MDBX_TXN_RW_BEGIN_FLAGS | MDBX_TXN_RO_BEGIN_FLAGS)) == 0); + if (unlikely(!txn || txn->mt_signature != MDBX_MT_SIGNATURE)) + return MDBX_TXN_INVALID; assert(0 == (int)(txn->mt_flags & MDBX_TXN_INVALID)); return txn->mt_flags; } -/* Check for misused dbi handles */ -static __inline bool dbi_changed(const MDBX_txn *txn, size_t dbi) { - if (txn->mt_dbiseqs == txn->mt_env->me_dbiseqs) - return false; - if (likely( - txn->mt_dbiseqs[dbi].weak == - atomic_load32((MDBX_atomic_uint32_t *)&txn->mt_env->me_dbiseqs[dbi], - mo_AcquireRelease))) - return false; - return true; -} - -static __inline unsigned dbi_seq(const MDBX_env *const env, size_t slot) { - unsigned v = env->me_dbiseqs[slot].weak + 1; - return v + (v == 0); -} - -static void dbi_import_locked(MDBX_txn *txn) { - const MDBX_env *const env = txn->mt_env; - size_t n = env->me_numdbs; - for (size_t i = CORE_DBS; i < n; ++i) { - if (i >= txn->mt_numdbs) { - txn->mt_cursors[i] = NULL; - if (txn->mt_dbiseqs != env->me_dbiseqs) - txn->mt_dbiseqs[i].weak = 0; - txn->mt_dbistate[i] = 0; - } - if ((dbi_changed(txn, i) && - (txn->mt_dbistate[i] & (DBI_CREAT | DBI_DIRTY | DBI_FRESH)) == 0) || - ((env->me_dbflags[i] & DB_VALID) && - !(txn->mt_dbistate[i] & DBI_VALID))) { - tASSERT(txn, - (txn->mt_dbistate[i] & (DBI_CREAT | DBI_DIRTY | DBI_FRESH)) == 0); - txn->mt_dbiseqs[i] = 
env->me_dbiseqs[i]; - txn->mt_dbs[i].md_flags = env->me_dbflags[i] & DB_PERSISTENT_FLAGS; - txn->mt_dbistate[i] = 0; - if (env->me_dbflags[i] & DB_VALID) { - txn->mt_dbistate[i] = DBI_VALID | DBI_USRVALID | DBI_STALE; - tASSERT(txn, txn->mt_dbxs[i].md_cmp != NULL); - tASSERT(txn, txn->mt_dbxs[i].md_name.iov_base != NULL); - } - } - } - while (unlikely(n < txn->mt_numdbs)) - if (txn->mt_cursors[txn->mt_numdbs - 1] == NULL && - (txn->mt_dbistate[txn->mt_numdbs - 1] & DBI_USRVALID) == 0) - txn->mt_numdbs -= 1; - else { - if ((txn->mt_dbistate[n] & DBI_USRVALID) == 0) { - if (txn->mt_dbiseqs != env->me_dbiseqs) - txn->mt_dbiseqs[n].weak = 0; - txn->mt_dbistate[n] = 0; - } - ++n; - } - txn->mt_numdbs = (MDBX_dbi)n; -} - -/* Import DBI which opened after txn started into context */ -__cold static bool dbi_import(MDBX_txn *txn, MDBX_dbi dbi) { - if (dbi < CORE_DBS || - (dbi >= txn->mt_numdbs && dbi >= txn->mt_env->me_numdbs)) - return false; - - ENSURE(txn->mt_env, - osal_fastmutex_acquire(&txn->mt_env->me_dbi_lock) == MDBX_SUCCESS); - dbi_import_locked(txn); - ENSURE(txn->mt_env, - osal_fastmutex_release(&txn->mt_env->me_dbi_lock) == MDBX_SUCCESS); - return txn->mt_dbistate[dbi] & DBI_USRVALID; -} - -/* Export or close DBI handles opened in this txn. 
*/ -static void dbi_update(MDBX_txn *txn, int keep) { - tASSERT(txn, !txn->mt_parent && txn == txn->mt_env->me_txn0); - MDBX_dbi n = txn->mt_numdbs; - if (n) { - bool locked = false; - MDBX_env *const env = txn->mt_env; - - for (size_t i = n; --i >= CORE_DBS;) { - if (likely((txn->mt_dbistate[i] & DBI_CREAT) == 0)) - continue; - if (!locked) { - ENSURE(env, osal_fastmutex_acquire(&env->me_dbi_lock) == MDBX_SUCCESS); - locked = true; - } - if (env->me_numdbs <= i || - txn->mt_dbiseqs[i].weak != env->me_dbiseqs[i].weak) - continue /* dbi explicitly closed and/or then re-opened by other txn */; - if (keep) { - env->me_dbflags[i] = txn->mt_dbs[i].md_flags | DB_VALID; - } else { - const MDBX_val name = env->me_dbxs[i].md_name; - if (name.iov_base) { - env->me_dbxs[i].md_name.iov_base = nullptr; - eASSERT(env, env->me_dbflags[i] == 0); - atomic_store32(&env->me_dbiseqs[i], dbi_seq(env, i), - mo_AcquireRelease); - env->me_dbxs[i].md_name.iov_len = 0; - if (name.iov_len) - osal_free(name.iov_base); - } else { - eASSERT(env, name.iov_len == 0); - eASSERT(env, env->me_dbflags[i] == 0); - } - } - } - - n = env->me_numdbs; - if (n > CORE_DBS && unlikely(!(env->me_dbflags[n - 1] & DB_VALID))) { - if (!locked) { - ENSURE(env, osal_fastmutex_acquire(&env->me_dbi_lock) == MDBX_SUCCESS); - locked = true; - } - - n = env->me_numdbs; - while (n > CORE_DBS && !(env->me_dbflags[n - 1] & DB_VALID)) - --n; - env->me_numdbs = n; - } - - if (unlikely(locked)) - ENSURE(env, osal_fastmutex_release(&env->me_dbi_lock) == MDBX_SUCCESS); - } -} - /* Filter-out pgno list from transaction's dirty-page list */ static void dpl_sift(MDBX_txn *const txn, MDBX_PNL pl, const bool spilled) { tASSERT(txn, (txn->mt_flags & MDBX_TXN_RDONLY) == 0); @@ -13854,22 +14337,15 @@ static void dpl_sift(MDBX_txn *const txn, MDBX_PNL pl, const bool spilled) { * [in] mode why and how to end the transaction */ static int txn_end(MDBX_txn *txn, const unsigned mode) { MDBX_env *env = txn->mt_env; - static const char 
*const names[] = MDBX_END_NAMES; - -#if MDBX_ENV_CHECKPID - if (unlikely(txn->mt_env->me_pid != osal_getpid())) { - env->me_flags |= MDBX_FATAL_ERROR; - return MDBX_PANIC; - } -#endif /* MDBX_ENV_CHECKPID */ + static const char *const names[] = TXN_END_NAMES; - DEBUG("%s txn %" PRIaTXN "%c %p on mdbenv %p, root page %" PRIaPGNO + DEBUG("%s txn %" PRIaTXN "%c %p on env %p, root page %" PRIaPGNO "/%" PRIaPGNO, - names[mode & MDBX_END_OPMASK], txn->mt_txnid, + names[mode & TXN_END_OPMASK], txn->mt_txnid, (txn->mt_flags & MDBX_TXN_RDONLY) ? 'r' : 'w', (void *)txn, (void *)env, txn->mt_dbs[MAIN_DBI].md_root, txn->mt_dbs[FREE_DBI].md_root); - if (!(mode & MDBX_END_EOTDONE)) /* !(already closed cursors) */ + if (!(mode & TXN_END_EOTDONE)) /* !(already closed cursors) */ cursors_eot(txn, false); int rc = MDBX_SUCCESS; @@ -13884,11 +14360,9 @@ static int txn_end(MDBX_txn *txn, const unsigned mode) { eASSERT(env, txn->mt_txnid == slot->mr_txnid.weak && slot->mr_txnid.weak >= env->me_lck->mti_oldest_reader.weak); -#if defined(MDBX_USE_VALGRIND) || defined(__SANITIZE_ADDRESS__) - atomic_add32(&env->me_ignore_EDEADLK, 1); +#if defined(ENABLE_MEMCHECK) || defined(__SANITIZE_ADDRESS__) txn_valgrind(env, nullptr); - atomic_sub32(&env->me_ignore_EDEADLK, 1); -#endif /* MDBX_USE_VALGRIND || __SANITIZE_ADDRESS__ */ +#endif /* ENABLE_MEMCHECK || __SANITIZE_ADDRESS__ */ atomic_store32(&slot->mr_snapshot_pages_used, 0, mo_Relaxed); safe64_reset(&slot->mr_txnid, false); atomic_store32(&env->me_lck->mti_readers_refresh_flag, true, @@ -13897,7 +14371,7 @@ static int txn_end(MDBX_txn *txn, const unsigned mode) { eASSERT(env, slot->mr_pid.weak == env->me_pid); eASSERT(env, slot->mr_txnid.weak >= SAFE64_INVALID_THRESHOLD); } - if (mode & MDBX_END_SLOT) { + if (mode & TXN_END_SLOT) { if ((env->me_flags & MDBX_ENV_TXKEY) == 0) atomic_store32(&slot->mr_pid, 0, mo_Relaxed); txn->to.reader = NULL; @@ -13914,26 +14388,25 @@ static int txn_end(MDBX_txn *txn, const unsigned mode) { ENSURE(env, 
txn->mt_txnid >= /* paranoia is appropriate here */ env->me_lck ->mti_oldest_reader.weak); -#if defined(MDBX_USE_VALGRIND) || defined(__SANITIZE_ADDRESS__) +#if defined(ENABLE_MEMCHECK) || defined(__SANITIZE_ADDRESS__) if (txn == env->me_txn0) txn_valgrind(env, nullptr); -#endif /* MDBX_USE_VALGRIND || __SANITIZE_ADDRESS__ */ +#endif /* ENABLE_MEMCHECK || __SANITIZE_ADDRESS__ */ txn->mt_flags = MDBX_TXN_FINISHED; - txn->mt_owner = 0; env->me_txn = txn->mt_parent; pnl_free(txn->tw.spilled.list); txn->tw.spilled.list = nullptr; if (txn == env->me_txn0) { eASSERT(env, txn->mt_parent == NULL); /* Export or close DBI handles created in this txn */ - dbi_update(txn, mode & MDBX_END_UPDATE); + rc = dbi_update(txn, mode & TXN_END_UPDATE); pnl_shrink(&txn->tw.retired_pages); pnl_shrink(&txn->tw.relist); if (!(env->me_flags & MDBX_WRITEMAP)) dlist_free(txn); /* The writer mutex was locked in mdbx_txn_begin. */ - mdbx_txn_unlock(env); + osal_txn_unlock(env); } else { eASSERT(env, txn->mt_parent != NULL); MDBX_txn *const parent = txn->mt_parent; @@ -13945,6 +14418,7 @@ static int txn_end(MDBX_txn *txn, const unsigned mode) { eASSERT(env, memcmp(&txn->tw.troika, &parent->tw.troika, sizeof(meta_troika_t)) == 0); + txn->mt_owner = 0; if (txn->tw.lifo_reclaimed) { eASSERT(env, MDBX_PNL_GETSIZE(txn->tw.lifo_reclaimed) >= (uintptr_t)parent->tw.lifo_reclaimed); @@ -13999,7 +14473,7 @@ static int txn_end(MDBX_txn *txn, const unsigned mode) { } eASSERT(env, txn == env->me_txn0 || txn->mt_owner == 0); - if ((mode & MDBX_END_FREE) != 0 && txn != env->me_txn0) { + if ((mode & TXN_END_FREE) != 0 && txn != env->me_txn0) { txn->mt_signature = 0; osal_free(txn); } @@ -14017,7 +14491,7 @@ int mdbx_txn_reset(MDBX_txn *txn) { return MDBX_EINVAL; /* LY: don't close DBI-handles */ - rc = txn_end(txn, MDBX_END_RESET | MDBX_END_UPDATE); + rc = txn_end(txn, TXN_END_RESET | TXN_END_UPDATE); if (rc == MDBX_SUCCESS) { tASSERT(txn, txn->mt_signature == MDBX_MT_SIGNATURE); tASSERT(txn, txn->mt_owner == 
0); @@ -14038,30 +14512,76 @@ int mdbx_txn_break(MDBX_txn *txn) { return MDBX_SUCCESS; } -int mdbx_txn_abort(MDBX_txn *txn) { - int rc = check_txn(txn, 0); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; - +static int txn_abort(MDBX_txn *txn) { if (txn->mt_flags & MDBX_TXN_RDONLY) /* LY: don't close DBI-handles */ - return txn_end(txn, MDBX_END_ABORT | MDBX_END_UPDATE | MDBX_END_SLOT | - MDBX_END_FREE); + return txn_end(txn, TXN_END_ABORT | TXN_END_UPDATE | TXN_END_SLOT | + TXN_END_FREE); if (unlikely(txn->mt_flags & MDBX_TXN_FINISHED)) return MDBX_BAD_TXN; if (txn->mt_child) - mdbx_txn_abort(txn->mt_child); + txn_abort(txn->mt_child); tASSERT(txn, (txn->mt_flags & MDBX_TXN_ERROR) || dirtylist_check(txn)); - return txn_end(txn, MDBX_END_ABORT | MDBX_END_SLOT | MDBX_END_FREE); + return txn_end(txn, TXN_END_ABORT | TXN_END_SLOT | TXN_END_FREE); +} + +int mdbx_txn_abort(MDBX_txn *txn) { + int rc = check_txn(txn, 0); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; + + rc = check_env(txn->mt_env, true); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; + + if ((txn->mt_flags & (MDBX_TXN_RDONLY | MDBX_NOSTICKYTHREADS)) == + MDBX_NOSTICKYTHREADS && + unlikely(txn->mt_owner != osal_thread_self())) { + mdbx_txn_break(txn); + return MDBX_THREAD_MISMATCH; + } + + return txn_abort(txn); +} + +__cold static MDBX_db *audit_db_dig(const MDBX_txn *txn, const size_t dbi, + MDBX_db *fallback) { + const MDBX_txn *dig = txn; + do { + tASSERT(txn, txn->mt_numdbs == dig->mt_numdbs); + const uint8_t state = dbi_state(dig, dbi); + if (state & DBI_LINDO) + switch (state & (DBI_VALID | DBI_STALE | DBI_OLDEN)) { + case DBI_VALID: + case DBI_OLDEN: + return dig->mt_dbs + dbi; + case 0: + return nullptr; + case DBI_VALID | DBI_STALE: + case DBI_OLDEN | DBI_STALE: + break; + default: + tASSERT(txn, !!"unexpected dig->mt_dbi_state[dbi]"); + } + dig = dig->mt_parent; + } while (dig); + return fallback; +} + +static size_t audit_db_used(const MDBX_db *db) { + return db ? 
(size_t)db->md_branch_pages + (size_t)db->md_leaf_pages + + (size_t)db->md_overflow_pages + : 0; } /* Count all the pages in each DB and in the GC and make sure * it matches the actual number of pages being used. */ -__cold static int audit_ex(MDBX_txn *txn, size_t retired_stored, - bool dont_filter_gc) { +__cold static int audit_ex_locked(MDBX_txn *txn, size_t retired_stored, + bool dont_filter_gc) { + const MDBX_env *const env = txn->mt_env; size_t pending = 0; if ((txn->mt_flags & MDBX_TXN_RDONLY) == 0) pending = txn->tw.loose_count + MDBX_PNL_GETSIZE(txn->tw.relist) + @@ -14076,8 +14596,11 @@ __cold static int audit_ex(MDBX_txn *txn, size_t retired_stored, MDBX_val key, data; while ((rc = cursor_get(&cx.outer, &key, &data, MDBX_NEXT)) == 0) { if (!dont_filter_gc) { - if (unlikely(key.iov_len != sizeof(txnid_t))) + if (unlikely(key.iov_len != sizeof(txnid_t))) { + ERROR("%s/%d: %s %u", "MDBX_CORRUPTED", MDBX_CORRUPTED, + "invalid GC-key size", (unsigned)key.iov_len); return MDBX_CORRUPTED; + } txnid_t id = unaligned_peek_u64(4, key.iov_base); if (txn->tw.lifo_reclaimed) { for (size_t i = 1; i <= MDBX_PNL_GETSIZE(txn->tw.lifo_reclaimed); ++i) @@ -14092,79 +14615,69 @@ __cold static int audit_ex(MDBX_txn *txn, size_t retired_stored, } tASSERT(txn, rc == MDBX_NOTFOUND); - for (size_t i = FREE_DBI; i < txn->mt_numdbs; i++) - txn->mt_dbistate[i] &= ~DBI_AUDITED; + const size_t done_bitmap_size = (txn->mt_numdbs + CHAR_BIT - 1) / CHAR_BIT; + uint8_t *const done_bitmap = alloca(done_bitmap_size); + memset(done_bitmap, 0, done_bitmap_size); + if (txn->mt_parent) { + tASSERT(txn, txn->mt_numdbs == txn->mt_parent->mt_numdbs && + txn->mt_numdbs == txn->mt_env->me_txn->mt_numdbs); +#if MDBX_ENABLE_DBI_SPARSE + tASSERT(txn, txn->mt_dbi_sparse == txn->mt_parent->mt_dbi_sparse && + txn->mt_dbi_sparse == txn->mt_env->me_txn->mt_dbi_sparse); +#endif /* MDBX_ENABLE_DBI_SPARSE */ + } + + size_t used = NUM_METAS + + audit_db_used(audit_db_dig(txn, FREE_DBI, nullptr)) + + 
audit_db_used(audit_db_dig(txn, MAIN_DBI, nullptr)); + rc = cursor_init(&cx.outer, txn, MAIN_DBI); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; - size_t used = NUM_METAS; - for (size_t i = FREE_DBI; i <= MAIN_DBI; i++) { - if (!(txn->mt_dbistate[i] & DBI_VALID)) - continue; - rc = cursor_init(&cx.outer, txn, i); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; - txn->mt_dbistate[i] |= DBI_AUDITED; - if (txn->mt_dbs[i].md_root == P_INVALID) - continue; - used += (size_t)txn->mt_dbs[i].md_branch_pages + - (size_t)txn->mt_dbs[i].md_leaf_pages + - (size_t)txn->mt_dbs[i].md_overflow_pages; + for (rc = page_search(&cx.outer, NULL, MDBX_PS_FIRST); rc == MDBX_SUCCESS; + rc = cursor_sibling(&cx.outer, SIBLING_RIGHT)) { + MDBX_page *mp = cx.outer.mc_pg[cx.outer.mc_top]; + for (size_t k = 0; k < page_numkeys(mp); k++) { + MDBX_node *node = page_node(mp, k); + if (node_flags(node) != F_SUBDATA) + continue; + if (unlikely(node_ds(node) != sizeof(MDBX_db))) { + ERROR("%s/%d: %s %u", "MDBX_CORRUPTED", MDBX_CORRUPTED, + "invalid dupsort sub-tree node size", (unsigned)node_ds(node)); + return MDBX_CORRUPTED; + } - if (i != MAIN_DBI) - continue; - rc = page_search(&cx.outer, NULL, MDBX_PS_FIRST); - while (rc == MDBX_SUCCESS) { - MDBX_page *mp = cx.outer.mc_pg[cx.outer.mc_top]; - for (size_t j = 0; j < page_numkeys(mp); j++) { - MDBX_node *node = page_node(mp, j); - if (node_flags(node) == F_SUBDATA) { - if (unlikely(node_ds(node) != sizeof(MDBX_db))) - return MDBX_CORRUPTED; - MDBX_db db_copy, *db; - memcpy(db = &db_copy, node_data(node), sizeof(db_copy)); - if ((txn->mt_flags & MDBX_TXN_RDONLY) == 0) { - for (MDBX_dbi k = txn->mt_numdbs; --k > MAIN_DBI;) { - if ((txn->mt_dbistate[k] & DBI_VALID) && - /* txn->mt_dbxs[k].md_name.iov_base && */ - node_ks(node) == txn->mt_dbxs[k].md_name.iov_len && - memcmp(node_key(node), txn->mt_dbxs[k].md_name.iov_base, - node_ks(node)) == 0) { - txn->mt_dbistate[k] |= DBI_AUDITED; - if (!(txn->mt_dbistate[k] & MDBX_DBI_STALE)) - db = 
txn->mt_dbs + k; - break; - } - } - } - used += (size_t)db->md_branch_pages + (size_t)db->md_leaf_pages + - (size_t)db->md_overflow_pages; - } + MDBX_db reside; + const MDBX_db *db = memcpy(&reside, node_data(node), sizeof(reside)); + const MDBX_val name = {node_key(node), node_ks(node)}; + for (size_t dbi = CORE_DBS; dbi < env->me_numdbs; ++dbi) { + if (dbi >= txn->mt_numdbs || !(env->me_db_flags[dbi] & DB_VALID)) + continue; + if (env->me_dbxs[MAIN_DBI].md_cmp(&name, &env->me_dbxs[dbi].md_name)) + continue; + + done_bitmap[dbi / CHAR_BIT] |= 1 << dbi % CHAR_BIT; + db = audit_db_dig(txn, dbi, &reside); + break; } - rc = cursor_sibling(&cx.outer, SIBLING_RIGHT); + used += audit_db_used(db); } - tASSERT(txn, rc == MDBX_NOTFOUND); } + tASSERT(txn, rc == MDBX_NOTFOUND); - for (size_t i = FREE_DBI; i < txn->mt_numdbs; i++) { - if ((txn->mt_dbistate[i] & (DBI_VALID | DBI_AUDITED | DBI_STALE)) != - DBI_VALID) + for (size_t dbi = CORE_DBS; dbi < txn->mt_numdbs; ++dbi) { + if (done_bitmap[dbi / CHAR_BIT] & (1 << dbi % CHAR_BIT)) continue; - for (MDBX_txn *t = txn; t; t = t->mt_parent) - if (F_ISSET(t->mt_dbistate[i], DBI_DIRTY | DBI_CREAT)) { - used += (size_t)t->mt_dbs[i].md_branch_pages + - (size_t)t->mt_dbs[i].md_leaf_pages + - (size_t)t->mt_dbs[i].md_overflow_pages; - txn->mt_dbistate[i] |= DBI_AUDITED; - break; - } - MDBX_ANALYSIS_ASSUME(txn != nullptr); - if (!(txn->mt_dbistate[i] & DBI_AUDITED)) { + const MDBX_db *db = audit_db_dig(txn, dbi, nullptr); + if (db) + used += audit_db_used(db); + else if (dbi_state(txn, dbi)) WARNING("audit %s@%" PRIaTXN ": unable account dbi %zd / \"%*s\", state 0x%02x", - txn->mt_parent ? "nested-" : "", txn->mt_txnid, i, - (int)txn->mt_dbxs[i].md_name.iov_len, - (const char *)txn->mt_dbxs[i].md_name.iov_base, - txn->mt_dbistate[i]); - } + txn->mt_parent ? 
"nested-" : "", txn->mt_txnid, dbi, + (int)env->me_dbxs[dbi].md_name.iov_len, + (const char *)env->me_dbxs[dbi].md_name.iov_base, + dbi_state(txn, dbi)); } if (pending + gc + used == txn->mt_next_pgno) @@ -14185,9 +14698,22 @@ __cold static int audit_ex(MDBX_txn *txn, size_t retired_stored, return MDBX_PROBLEM; } +__cold static int audit_ex(MDBX_txn *txn, size_t retired_stored, + bool dont_filter_gc) { + MDBX_env *const env = txn->mt_env; + int rc = osal_fastmutex_acquire(&env->me_dbi_lock); + if (likely(rc == MDBX_SUCCESS)) { + rc = audit_ex_locked(txn, retired_stored, dont_filter_gc); + ENSURE(txn->mt_env, + osal_fastmutex_release(&env->me_dbi_lock) == MDBX_SUCCESS); + } + return rc; +} + typedef struct gc_update_context { - size_t retired_stored, loop; - size_t settled, cleaned_slot, reused_slot, filled_slot; + size_t loop, reserve_adj; + size_t retired_stored; + size_t reserved, cleaned_slot, reused_slot, fill_idx; txnid_t cleaned_id, rid; bool lifo, dense; #if MDBX_ENABLE_BIGFOOT @@ -14232,7 +14758,8 @@ static int gcu_clean_stored_retired(MDBX_txn *txn, gcu_context_t *ctx) { err = cursor_del(gc, 0); TRACE("== clear-4linear, backlog %zu, err %d", gcu_backlog_size(txn), err); - } + } else + err = (err == MDBX_NOTFOUND) ? MDBX_SUCCESS : err; } #if MDBX_ENABLE_BIGFOOT while (!err && --ctx->bigfoot >= txn->mt_txnid); @@ -14334,13 +14861,13 @@ static int gcu_prepare_backlog(MDBX_txn *txn, gcu_context_t *ctx) { } static __inline void gcu_clean_reserved(MDBX_env *env, MDBX_val pnl) { -#if MDBX_DEBUG && (defined(MDBX_USE_VALGRIND) || defined(__SANITIZE_ADDRESS__)) +#if MDBX_DEBUG && (defined(ENABLE_MEMCHECK) || defined(__SANITIZE_ADDRESS__)) /* Для предотвращения предупреждения Valgrind из mdbx_dump_val() * вызванное через макрос DVAL_DEBUG() на выходе * из cursor_set(MDBX_SET_KEY), которая вызывается ниже внутри update_gc() в * цикле очистки и цикле заполнения зарезервированных элементов. 
*/ memset(pnl.iov_base, 0xBB, pnl.iov_len); -#endif /* MDBX_DEBUG && (MDBX_USE_VALGRIND || __SANITIZE_ADDRESS__) */ +#endif /* MDBX_DEBUG && (ENABLE_MEMCHECK || __SANITIZE_ADDRESS__) */ /* PNL is initially empty, zero out at least the length */ memset(pnl.iov_base, 0, sizeof(pgno_t)); @@ -14371,7 +14898,8 @@ static int update_gc(MDBX_txn *txn, gcu_context_t *ctx) { /* txn->tw.relist[] can grow and shrink during this call. * txn->tw.last_reclaimed and txn->tw.retired_pages[] can only grow. * But page numbers cannot disappear from txn->tw.retired_pages[]. */ - +retry_clean_adj: + ctx->reserve_adj = 0; retry: if (ctx->loop++) TRACE("%s", " >> restart"); @@ -14391,10 +14919,10 @@ static int update_gc(MDBX_txn *txn, gcu_context_t *ctx) { goto bailout; } - ctx->settled = 0; + ctx->reserved = 0; ctx->cleaned_slot = 0; ctx->reused_slot = 0; - ctx->filled_slot = ~0u; + ctx->fill_idx = ~0u; ctx->cleaned_id = 0; ctx->rid = txn->tw.last_reclaimed; while (true) { @@ -14416,10 +14944,10 @@ static int update_gc(MDBX_txn *txn, gcu_context_t *ctx) { if (ctx->cleaned_slot < (txn->tw.lifo_reclaimed ? MDBX_PNL_GETSIZE(txn->tw.lifo_reclaimed) : 0)) { - ctx->settled = 0; + ctx->reserved = 0; ctx->cleaned_slot = 0; ctx->reused_slot = 0; - ctx->filled_slot = ~0u; + ctx->fill_idx = ~0u; /* LY: cleanup reclaimed records. 
*/ do { ctx->cleaned_id = txn->tw.lifo_reclaimed[++ctx->cleaned_slot]; @@ -14458,11 +14986,13 @@ static int update_gc(MDBX_txn *txn, gcu_context_t *ctx) { goto bailout; if (!MDBX_DISABLE_VALIDATION && unlikely(key.iov_len != sizeof(txnid_t))) { + ERROR("%s/%d: %s %u", "MDBX_CORRUPTED", MDBX_CORRUPTED, + "invalid GC-key size", (unsigned)key.iov_len); rc = MDBX_CORRUPTED; goto bailout; } ctx->rid = ctx->cleaned_id; - ctx->settled = 0; + ctx->reserved = 0; ctx->reused_slot = 0; ctx->cleaned_id = unaligned_peek_u64(4, key.iov_base); if (ctx->cleaned_id > txn->tw.last_reclaimed) @@ -14602,7 +15132,7 @@ static int update_gc(MDBX_txn *txn, gcu_context_t *ctx) { if (unlikely(!ctx->retired_stored)) { /* Make sure last page of GC is touched and on retired-list */ rc = cursor_last(&ctx->cursor, nullptr, nullptr); - if (likely(rc != MDBX_SUCCESS)) + if (likely(rc == MDBX_SUCCESS)) rc = gcu_touch(ctx); if (unlikely(rc != MDBX_SUCCESS) && rc != MDBX_NOTFOUND) goto bailout; @@ -14657,14 +15187,14 @@ static int update_gc(MDBX_txn *txn, gcu_context_t *ctx) { if (unlikely(rc != MDBX_SUCCESS)) goto bailout; -#if MDBX_DEBUG && (defined(MDBX_USE_VALGRIND) || defined(__SANITIZE_ADDRESS__)) +#if MDBX_DEBUG && (defined(ENABLE_MEMCHECK) || defined(__SANITIZE_ADDRESS__)) /* Для предотвращения предупреждения Valgrind из mdbx_dump_val() * вызванное через макрос DVAL_DEBUG() на выходе * из cursor_set(MDBX_SET_KEY), которая вызывается как выше в цикле * очистки, так и ниже в цикле заполнения зарезервированных элементов. 
*/ memset(data.iov_base, 0xBB, data.iov_len); -#endif /* MDBX_DEBUG && (MDBX_USE_VALGRIND || __SANITIZE_ADDRESS__) */ +#endif /* MDBX_DEBUG && (ENABLE_MEMCHECK || __SANITIZE_ADDRESS__) */ if (retired_pages_before == MDBX_PNL_GETSIZE(txn->tw.retired_pages)) { const size_t at = (ctx->lifo == MDBX_PNL_ASCENDING) @@ -14704,14 +15234,14 @@ static int update_gc(MDBX_txn *txn, gcu_context_t *ctx) { if (unlikely(rc != MDBX_SUCCESS)) goto bailout; -#if MDBX_DEBUG && (defined(MDBX_USE_VALGRIND) || defined(__SANITIZE_ADDRESS__)) +#if MDBX_DEBUG && (defined(ENABLE_MEMCHECK) || defined(__SANITIZE_ADDRESS__)) /* Для предотвращения предупреждения Valgrind из mdbx_dump_val() * вызванное через макрос DVAL_DEBUG() на выходе * из cursor_set(MDBX_SET_KEY), которая вызывается как выше в цикле * очистки, так и ниже в цикле заполнения зарезервированных элементов. */ memset(data.iov_base, 0xBB, data.iov_len); -#endif /* MDBX_DEBUG && (MDBX_USE_VALGRIND || __SANITIZE_ADDRESS__) */ +#endif /* MDBX_DEBUG && (ENABLE_MEMCHECK || __SANITIZE_ADDRESS__) */ /* Retry if tw.retired_pages[] grew during the Put() */ } while (data.iov_len < MDBX_PNL_SIZEOF(txn->tw.retired_pages)); @@ -14733,10 +15263,10 @@ static int update_gc(MDBX_txn *txn, gcu_context_t *ctx) { DEBUG_EXTRA_PRINT("%s\n", "."); } if (unlikely(amount != MDBX_PNL_GETSIZE(txn->tw.relist) && - ctx->settled)) { + ctx->reserved)) { TRACE("%s: reclaimed-list changed %zu -> %zu, retry", dbg_prefix_mode, amount, MDBX_PNL_GETSIZE(txn->tw.relist)); - goto retry /* rare case, but avoids GC fragmentation + goto retry_clean_adj /* rare case, but avoids GC fragmentation and one cycle. 
*/ ; } @@ -14754,10 +15284,11 @@ static int update_gc(MDBX_txn *txn, gcu_context_t *ctx) { if (unlikely(rc != MDBX_SUCCESS)) goto bailout; } - const size_t left = amount - ctx->settled; - TRACE("%s: amount %zu, settled %zd, left %zd, lifo-reclaimed-slots %zu, " + const size_t left = amount - ctx->reserved - ctx->reserve_adj; + TRACE("%s: amount %zu, settled %zd, reserve_adj %zu, left %zd, " + "lifo-reclaimed-slots %zu, " "reused-gc-slots %zu", - dbg_prefix_mode, amount, ctx->settled, left, + dbg_prefix_mode, amount, ctx->reserved, ctx->reserve_adj, left, txn->tw.lifo_reclaimed ? MDBX_PNL_GETSIZE(txn->tw.lifo_reclaimed) : 0, ctx->reused_slot); if (0 >= (intptr_t)left) @@ -14851,6 +15382,8 @@ static int update_gc(MDBX_txn *txn, gcu_context_t *ctx) { rc = cursor_first(&ctx->cursor, &key, nullptr); if (unlikely(rc != MDBX_SUCCESS || key.iov_len != sizeof(txnid_t))) { + ERROR("%s/%d: %s %u", "MDBX_CORRUPTED", MDBX_CORRUPTED, + "invalid GC-key size", (unsigned)key.iov_len); rc = MDBX_CORRUPTED; goto bailout; } @@ -14906,6 +15439,8 @@ static int update_gc(MDBX_txn *txn, gcu_context_t *ctx) { rc = cursor_first(&ctx->cursor, &key, nullptr); if (likely(rc == MDBX_SUCCESS)) { if (unlikely(key.iov_len != sizeof(txnid_t))) { + ERROR("%s/%d: %s %u", "MDBX_CORRUPTED", MDBX_CORRUPTED, + "invalid GC-key size", (unsigned)key.iov_len); rc = MDBX_CORRUPTED; goto bailout; } @@ -14915,7 +15450,7 @@ static int update_gc(MDBX_txn *txn, gcu_context_t *ctx) { if (unlikely(ctx->rid == 0)) { ERROR("%s", "** no GC tail-space to store (going dense-mode)"); ctx->dense = true; - goto retry; + goto retry_clean_adj; } } else if (rc != MDBX_NOTFOUND) goto bailout; @@ -15004,7 +15539,7 @@ static int update_gc(MDBX_txn *txn, gcu_context_t *ctx) { key.iov_base = &reservation_gc_id; data.iov_len = (chunk + 1) * sizeof(pgno_t); TRACE("%s: reserve %zu [%zu...%zu) @%" PRIaTXN, dbg_prefix_mode, chunk, - ctx->settled + 1, ctx->settled + chunk + 1, reservation_gc_id); + ctx->reserved + 1, ctx->reserved + 
chunk + 1, reservation_gc_id); gcu_prepare_backlog(txn, ctx); rc = cursor_put_nochecklen(&ctx->cursor, &key, &data, MDBX_RESERVE | MDBX_NOOVERWRITE); @@ -15014,17 +15549,17 @@ static int update_gc(MDBX_txn *txn, gcu_context_t *ctx) { goto bailout; gcu_clean_reserved(env, data); - ctx->settled += chunk; - TRACE("%s: settled %zu (+%zu), continue", dbg_prefix_mode, ctx->settled, + ctx->reserved += chunk; + TRACE("%s: settled %zu (+%zu), continue", dbg_prefix_mode, ctx->reserved, chunk); if (txn->tw.lifo_reclaimed && unlikely(amount < MDBX_PNL_GETSIZE(txn->tw.relist)) && - (ctx->loop < 5 || - MDBX_PNL_GETSIZE(txn->tw.relist) - amount > env->me_maxgc_ov1page)) { + (ctx->loop < 5 || MDBX_PNL_GETSIZE(txn->tw.relist) - amount > + env->me_maxgc_ov1page / 2)) { NOTICE("** restart: reclaimed-list growth %zu -> %zu", amount, MDBX_PNL_GETSIZE(txn->tw.relist)); - goto retry; + goto retry_clean_adj; } continue; @@ -15037,7 +15572,8 @@ static int update_gc(MDBX_txn *txn, gcu_context_t *ctx) { TRACE("%s", " >> filling"); /* Fill in the reserved records */ - ctx->filled_slot = + size_t excess_slots = 0; + ctx->fill_idx = txn->tw.lifo_reclaimed ? 
MDBX_PNL_GETSIZE(txn->tw.lifo_reclaimed) - ctx->reused_slot : ctx->reused_slot; @@ -15045,18 +15581,21 @@ static int update_gc(MDBX_txn *txn, gcu_context_t *ctx) { tASSERT(txn, pnl_check_allocated(txn->tw.relist, txn->mt_next_pgno - MDBX_ENABLE_REFUND)); tASSERT(txn, dirtylist_check(txn)); - if (MDBX_PNL_GETSIZE(txn->tw.relist)) { + if (ctx->reserved || MDBX_PNL_GETSIZE(txn->tw.relist)) { MDBX_val key, data; key.iov_len = data.iov_len = 0; /* avoid MSVC warning */ key.iov_base = data.iov_base = NULL; const size_t amount = MDBX_PNL_GETSIZE(txn->tw.relist); - size_t left = amount; + size_t left = amount, excess = 0; if (txn->tw.lifo_reclaimed == nullptr) { tASSERT(txn, ctx->lifo == 0); rc = cursor_first(&ctx->cursor, &key, &data); - if (unlikely(rc != MDBX_SUCCESS)) + if (unlikely(rc != MDBX_SUCCESS)) { + if (rc == MDBX_NOTFOUND && ctx->reserve_adj) + goto retry_clean_adj; goto bailout; + } } else { tASSERT(txn, ctx->lifo != 0); } @@ -15068,24 +15607,33 @@ static int update_gc(MDBX_txn *txn, gcu_context_t *ctx) { if (txn->tw.lifo_reclaimed == nullptr) { tASSERT(txn, ctx->lifo == 0); fill_gc_id = unaligned_peek_u64(4, key.iov_base); - if (ctx->filled_slot-- == 0 || fill_gc_id > txn->tw.last_reclaimed) { - NOTICE( - "** restart: reserve depleted (filled_slot %zu, fill_id %" PRIaTXN - " > last_reclaimed %" PRIaTXN, - ctx->filled_slot, fill_gc_id, txn->tw.last_reclaimed); + if (ctx->fill_idx == 0 || fill_gc_id > txn->tw.last_reclaimed) { + if (!left) + break; + NOTICE("** restart: reserve depleted (fill_idx %zu, fill_id %" PRIaTXN + " > last_reclaimed %" PRIaTXN ", left %zu", + ctx->fill_idx, fill_gc_id, txn->tw.last_reclaimed, left); + ctx->reserve_adj = + (ctx->reserve_adj > left) ? 
ctx->reserve_adj - left : 0; goto retry; } + ctx->fill_idx -= 1; } else { tASSERT(txn, ctx->lifo != 0); - if (++ctx->filled_slot > MDBX_PNL_GETSIZE(txn->tw.lifo_reclaimed)) { - NOTICE("** restart: reserve depleted (filled_gc_slot %zu > " - "lifo_reclaimed %zu" PRIaTXN, - ctx->filled_slot, MDBX_PNL_GETSIZE(txn->tw.lifo_reclaimed)); + if (ctx->fill_idx >= MDBX_PNL_GETSIZE(txn->tw.lifo_reclaimed)) { + if (!left) + break; + NOTICE("** restart: reserve depleted (fill_idx %zu >= " + "lifo_reclaimed %zu, left %zu", + ctx->fill_idx, MDBX_PNL_GETSIZE(txn->tw.lifo_reclaimed), left); + ctx->reserve_adj = + (ctx->reserve_adj > left) ? ctx->reserve_adj - left : 0; goto retry; } - fill_gc_id = txn->tw.lifo_reclaimed[ctx->filled_slot]; + ctx->fill_idx += 1; + fill_gc_id = txn->tw.lifo_reclaimed[ctx->fill_idx]; TRACE("%s: seek-reservation @%" PRIaTXN " at lifo_reclaimed[%zu]", - dbg_prefix_mode, fill_gc_id, ctx->filled_slot); + dbg_prefix_mode, fill_gc_id, ctx->fill_idx); key.iov_base = &fill_gc_id; key.iov_len = sizeof(fill_gc_id); rc = cursor_set(&ctx->cursor, &key, &data, MDBX_SET_KEY).err; @@ -15104,12 +15652,17 @@ static int update_gc(MDBX_txn *txn, gcu_context_t *ctx) { tASSERT(txn, data.iov_len >= sizeof(pgno_t) * 2); size_t chunk = data.iov_len / sizeof(pgno_t) - 1; if (unlikely(chunk > left)) { + const size_t delta = chunk - left; + excess += delta; + if (!left) { + excess_slots += 1; + goto next; + } TRACE("%s: chunk %zu > left %zu, @%" PRIaTXN, dbg_prefix_mode, chunk, left, fill_gc_id); - if ((ctx->loop < 5 && chunk - left > ctx->loop / 2) || - chunk - left > env->me_maxgc_ov1page) { + if ((ctx->loop < 5 && delta > (ctx->loop / 2)) || + delta > env->me_maxgc_ov1page) data.iov_len = (left + 1) * sizeof(pgno_t); - } chunk = left; } rc = cursor_put_nochecklen(&ctx->cursor, &key, &data, @@ -15122,14 +15675,14 @@ static int update_gc(MDBX_txn *txn, gcu_context_t *ctx) { amount != MDBX_PNL_GETSIZE(txn->tw.relist))) { NOTICE("** restart: reclaimed-list growth (%zu -> %zu, 
loose +%zu)", amount, MDBX_PNL_GETSIZE(txn->tw.relist), txn->tw.loose_count); - goto retry; + goto retry_clean_adj; } if (unlikely(txn->tw.lifo_reclaimed ? ctx->cleaned_slot < MDBX_PNL_GETSIZE(txn->tw.lifo_reclaimed) : ctx->cleaned_id < txn->tw.last_reclaimed)) { NOTICE("%s", "** restart: reclaimed-slots changed"); - goto retry; + goto retry_clean_adj; } if (unlikely(ctx->retired_stored != MDBX_PNL_GETSIZE(txn->tw.retired_pages))) { @@ -15137,7 +15690,7 @@ static int update_gc(MDBX_txn *txn, gcu_context_t *ctx) { ctx->retired_stored < MDBX_PNL_GETSIZE(txn->tw.retired_pages)); NOTICE("** restart: retired-list growth (%zu -> %zu)", ctx->retired_stored, MDBX_PNL_GETSIZE(txn->tw.retired_pages)); - goto retry; + goto retry_clean_adj; } pgno_t *dst = data.iov_base; @@ -15155,35 +15708,44 @@ static int update_gc(MDBX_txn *txn, gcu_context_t *ctx) { if (unlikely(rc != MDBX_SUCCESS)) goto bailout; } - if (left == 0) { - rc = MDBX_SUCCESS; - break; - } + next: if (txn->tw.lifo_reclaimed == nullptr) { tASSERT(txn, ctx->lifo == 0); rc = cursor_next(&ctx->cursor, &key, &data, MDBX_NEXT); - if (unlikely(rc != MDBX_SUCCESS)) - goto bailout; + if (unlikely(rc != MDBX_SUCCESS)) { + if (rc != MDBX_NOTFOUND) + goto bailout; + rc = MDBX_SUCCESS; + break; + } } else { tASSERT(txn, ctx->lifo != 0); } } + + if (excess) { + size_t n = excess, adj = excess; + while (n >= env->me_maxgc_ov1page) + adj -= n /= env->me_maxgc_ov1page; + ctx->reserve_adj += adj; + TRACE("%s: extra %zu reserved space, adj +%zu (%zu)", dbg_prefix_mode, + excess, adj, ctx->reserve_adj); + } } tASSERT(txn, rc == MDBX_SUCCESS); if (unlikely(txn->tw.loose_count != 0)) { NOTICE("** restart: got %zu loose pages", txn->tw.loose_count); - goto retry; + goto retry_clean_adj; } - if (unlikely(ctx->filled_slot != - (txn->tw.lifo_reclaimed - ? MDBX_PNL_GETSIZE(txn->tw.lifo_reclaimed) - : 0))) { - const bool will_retry = ctx->loop < 9; - NOTICE("** %s: reserve excess (filled-slot %zu, loop %zu)", - will_retry ? 
"restart" : "ignore", ctx->filled_slot, ctx->loop); + if (unlikely(excess_slots)) { + const bool will_retry = ctx->loop < 5 || excess_slots > 1; + NOTICE("** %s: reserve excess (excess-slots %zu, filled-slot %zu, adj %zu, " + "loop %zu)", + will_retry ? "restart" : "ignore", excess_slots, ctx->fill_idx, + ctx->reserve_adj, ctx->loop); if (will_retry) goto retry; } @@ -15246,21 +15808,6 @@ static int txn_write(MDBX_txn *txn, iov_ctx_t *ctx) { return rc; } -/* Check txn and dbi arguments to a function */ -static __always_inline bool check_dbi(const MDBX_txn *txn, MDBX_dbi dbi, - unsigned validity) { - if (likely(dbi < txn->mt_numdbs)) { - if (likely(!dbi_changed(txn, dbi))) { - if (likely(txn->mt_dbistate[dbi] & validity)) - return true; - if (likely(dbi < CORE_DBS || - (txn->mt_env->me_dbflags[dbi] & DB_VALID) == 0)) - return false; - } - } - return dbi_import((MDBX_txn *)txn, dbi); -} - /* Merge child txn into parent */ static __inline void txn_merge(MDBX_txn *const parent, MDBX_txn *const txn, const size_t parent_retired_len) { @@ -15677,10 +16224,16 @@ int mdbx_txn_commit_ex(MDBX_txn *txn, MDBX_commit_latency *latency) { /* txn_end() mode for a commit which writes nothing */ unsigned end_mode = - MDBX_END_PURE_COMMIT | MDBX_END_UPDATE | MDBX_END_SLOT | MDBX_END_FREE; + TXN_END_PURE_COMMIT | TXN_END_UPDATE | TXN_END_SLOT | TXN_END_FREE; if (unlikely(txn->mt_flags & MDBX_TXN_RDONLY)) goto done; + if ((txn->mt_flags & MDBX_NOSTICKYTHREADS) && + unlikely(txn->mt_owner != osal_thread_self())) { + rc = MDBX_THREAD_MISMATCH; + goto fail; + } + if (txn->mt_child) { rc = mdbx_txn_commit_ex(txn->mt_child, NULL); tASSERT(txn, txn->mt_child == NULL); @@ -15705,10 +16258,10 @@ int mdbx_txn_commit_ex(MDBX_txn *txn, MDBX_commit_latency *latency) { if (txn->tw.dirtylist->length == 0 && !(txn->mt_flags & MDBX_TXN_DIRTY) && parent->mt_numdbs == txn->mt_numdbs) { - for (int i = txn->mt_numdbs; --i >= 0;) { - tASSERT(txn, (txn->mt_dbistate[i] & DBI_DIRTY) == 0); - if 
((txn->mt_dbistate[i] & DBI_STALE) && - !(parent->mt_dbistate[i] & DBI_STALE)) + TXN_FOREACH_DBI_ALL(txn, i) { + tASSERT(txn, (txn->mt_dbi_state[i] & DBI_DIRTY) == 0); + if ((txn->mt_dbi_state[i] & DBI_STALE) && + !(parent->mt_dbi_state[i] & DBI_STALE)) tASSERT(txn, memcmp(&parent->mt_dbs[i], &txn->mt_dbs[i], sizeof(MDBX_db)) == 0); } @@ -15722,7 +16275,8 @@ int mdbx_txn_commit_ex(MDBX_txn *txn, MDBX_commit_latency *latency) { tASSERT(txn, txn->tw.loose_count == 0); /* fast completion of pure nested transaction */ - end_mode = MDBX_END_PURE_COMMIT | MDBX_END_SLOT | MDBX_END_FREE; + VERBOSE("fast-complete pure nested txn %" PRIaTXN, txn->mt_txnid); + end_mode = TXN_END_PURE_COMMIT | TXN_END_SLOT | TXN_END_FREE; goto done; } @@ -15767,6 +16321,7 @@ int mdbx_txn_commit_ex(MDBX_txn *txn, MDBX_commit_latency *latency) { pnl_free(parent->tw.relist); parent->tw.relist = txn->tw.relist; txn->tw.relist = NULL; + parent->tw.gc_time_acc = txn->tw.gc_time_acc; parent->tw.last_reclaimed = txn->tw.last_reclaimed; parent->mt_geo = txn->mt_geo; @@ -15782,20 +16337,26 @@ int mdbx_txn_commit_ex(MDBX_txn *txn, MDBX_commit_latency *latency) { /* Merge our cursors into parent's and close them */ cursors_eot(txn, true); - end_mode |= MDBX_END_EOTDONE; + end_mode |= TXN_END_EOTDONE; /* Update parent's DBs array */ - memcpy(parent->mt_dbs, txn->mt_dbs, txn->mt_numdbs * sizeof(MDBX_db)); - parent->mt_numdbs = txn->mt_numdbs; - for (size_t i = 0; i < txn->mt_numdbs; i++) { - /* preserve parent's status */ - const uint8_t state = - txn->mt_dbistate[i] | - (parent->mt_dbistate[i] & (DBI_CREAT | DBI_FRESH | DBI_DIRTY)); - DEBUG("dbi %zu dbi-state %s 0x%02x -> 0x%02x", i, - (parent->mt_dbistate[i] != state) ? 
"update" : "still", - parent->mt_dbistate[i], state); - parent->mt_dbistate[i] = state; + eASSERT(env, parent->mt_numdbs == txn->mt_numdbs); + TXN_FOREACH_DBI_ALL(txn, dbi) { + if (txn->mt_dbi_state[dbi] & (DBI_CREAT | DBI_FRESH | DBI_DIRTY)) { + parent->mt_dbs[dbi] = txn->mt_dbs[dbi]; + /* preserve parent's status */ + const uint8_t state = + txn->mt_dbi_state[dbi] | + (parent->mt_dbi_state[dbi] & (DBI_CREAT | DBI_FRESH | DBI_DIRTY)); + DEBUG("dbi %zu dbi-state %s 0x%02x -> 0x%02x", dbi, + (parent->mt_dbi_state[dbi] != state) ? "update" : "still", + parent->mt_dbi_state[dbi], state); + parent->mt_dbi_state[dbi] = state; + } else { + eASSERT(env, txn->mt_dbi_state[dbi] == + (parent->mt_dbi_state[dbi] & + ~(DBI_FRESH | DBI_CREAT | DBI_DIRTY))); + } } if (latency) { @@ -15840,15 +16401,16 @@ int mdbx_txn_commit_ex(MDBX_txn *txn, MDBX_commit_latency *latency) { tASSERT(txn, (txn->mt_flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC); tASSERT(txn, txn->tw.dirtyroom + txn->tw.dirtylist->length == (txn->mt_parent ? 
txn->mt_parent->tw.dirtyroom - : txn->mt_env->me_options.dp_limit)); + : env->me_options.dp_limit)); } cursors_eot(txn, false); - end_mode |= MDBX_END_EOTDONE; + end_mode |= TXN_END_EOTDONE; if ((!txn->tw.dirtylist || txn->tw.dirtylist->length == 0) && (txn->mt_flags & (MDBX_TXN_DIRTY | MDBX_TXN_SPILLS)) == 0) { - for (intptr_t i = txn->mt_numdbs; --i >= 0;) - tASSERT(txn, (txn->mt_dbistate[i] & DBI_DIRTY) == 0); + TXN_FOREACH_DBI_ALL(txn, i) { + tASSERT(txn, !(txn->mt_dbi_state[i] & DBI_DIRTY)); + } #if defined(MDBX_NOSUCCESS_EMPTY_COMMIT) && MDBX_NOSUCCESS_EMPTY_COMMIT rc = txn_end(txn, end_mode); if (unlikely(rc != MDBX_SUCCESS)) @@ -15860,37 +16422,37 @@ int mdbx_txn_commit_ex(MDBX_txn *txn, MDBX_commit_latency *latency) { #endif /* MDBX_NOSUCCESS_EMPTY_COMMIT */ } - DEBUG("committing txn %" PRIaTXN " %p on mdbenv %p, root page %" PRIaPGNO + DEBUG("committing txn %" PRIaTXN " %p on env %p, root page %" PRIaPGNO "/%" PRIaPGNO, txn->mt_txnid, (void *)txn, (void *)env, txn->mt_dbs[MAIN_DBI].md_root, txn->mt_dbs[FREE_DBI].md_root); - /* Update DB root pointers */ if (txn->mt_numdbs > CORE_DBS) { - MDBX_cursor_couple couple; - MDBX_val data; - data.iov_len = sizeof(MDBX_db); - - rc = cursor_init(&couple.outer, txn, MAIN_DBI); + /* Update subDB root pointers */ + MDBX_cursor_couple cx; + rc = cursor_init(&cx.outer, txn, MAIN_DBI); if (unlikely(rc != MDBX_SUCCESS)) goto fail; - for (MDBX_dbi i = CORE_DBS; i < txn->mt_numdbs; i++) { - if (txn->mt_dbistate[i] & DBI_DIRTY) { - MDBX_db *db = &txn->mt_dbs[i]; - DEBUG("update main's entry for sub-db %u, mod_txnid %" PRIaTXN - " -> %" PRIaTXN, - i, db->md_mod_txnid, txn->mt_txnid); - /* Может быть mod_txnid > front после коммита вложенных тразакций */ - db->md_mod_txnid = txn->mt_txnid; - data.iov_base = db; - WITH_CURSOR_TRACKING( - couple.outer, - rc = cursor_put_nochecklen(&couple.outer, &txn->mt_dbxs[i].md_name, - &data, F_SUBDATA)); - if (unlikely(rc != MDBX_SUCCESS)) - goto fail; + cx.outer.mc_next = 
txn->mt_cursors[MAIN_DBI]; + txn->mt_cursors[MAIN_DBI] = &cx.outer; + TXN_FOREACH_DBI_USER(txn, i) { + if ((txn->mt_dbi_state[i] & DBI_DIRTY) == 0) + continue; + MDBX_db *const db = &txn->mt_dbs[i]; + DEBUG("update main's entry for sub-db %zu, mod_txnid %" PRIaTXN + " -> %" PRIaTXN, + i, db->md_mod_txnid, txn->mt_txnid); + /* Может быть mod_txnid > front после коммита вложенных тразакций */ + db->md_mod_txnid = txn->mt_txnid; + MDBX_val data = {db, sizeof(MDBX_db)}; + rc = cursor_put_nochecklen(&cx.outer, &env->me_dbxs[i].md_name, &data, + F_SUBDATA); + if (unlikely(rc != MDBX_SUCCESS)) { + txn->mt_cursors[MAIN_DBI] = cx.outer.mc_next; + goto fail; } } + txn->mt_cursors[MAIN_DBI] = cx.outer.mc_next; } ts_1 = latency ? osal_monotime() : 0; @@ -15906,11 +16468,11 @@ int mdbx_txn_commit_ex(MDBX_txn *txn, MDBX_commit_latency *latency) { goto fail; tASSERT(txn, txn->tw.loose_count == 0); - txn->mt_dbs[FREE_DBI].md_mod_txnid = (txn->mt_dbistate[FREE_DBI] & DBI_DIRTY) + txn->mt_dbs[FREE_DBI].md_mod_txnid = (txn->mt_dbi_state[FREE_DBI] & DBI_DIRTY) ? txn->mt_txnid : txn->mt_dbs[FREE_DBI].md_mod_txnid; - txn->mt_dbs[MAIN_DBI].md_mod_txnid = (txn->mt_dbistate[MAIN_DBI] & DBI_DIRTY) + txn->mt_dbs[MAIN_DBI].md_mod_txnid = (txn->mt_dbi_state[MAIN_DBI] & DBI_DIRTY) ? txn->mt_txnid : txn->mt_dbs[MAIN_DBI].md_mod_txnid; @@ -16045,7 +16607,7 @@ int mdbx_txn_commit_ex(MDBX_txn *txn, MDBX_commit_latency *latency) { goto fail; } - end_mode = MDBX_END_COMMITTED | MDBX_END_UPDATE | MDBX_END_EOTDONE; + end_mode = TXN_END_COMMITTED | TXN_END_UPDATE | TXN_END_EOTDONE; done: if (latency) @@ -16223,6 +16785,10 @@ static __always_inline bool eq_fast(const MDBX_val *a, const MDBX_val *b) { eq_fast_slowpath(a->iov_base, b->iov_base, a->iov_len); } +static int cmp_equal_or_greater(const MDBX_val *a, const MDBX_val *b) { + return eq_fast(a, b) ? 
0 : 1; +} + static int validate_meta(MDBX_env *env, MDBX_meta *const meta, const MDBX_page *const page, const unsigned meta_number, unsigned *guess_pagesize) { @@ -16276,6 +16842,17 @@ static int validate_meta(MDBX_env *env, MDBX_meta *const meta, return MDBX_RESULT_TRUE; } + if (unlikely(meta->mm_dbs[FREE_DBI].md_flags != MDBX_INTEGERKEY)) { + WARNING("meta[%u] has invalid %s flags 0x%u, skip it", meta_number, + "GC/FreeDB", meta->mm_dbs[FREE_DBI].md_flags); + return MDBX_INCOMPATIBLE; + } + if (unlikely(!db_check_flags(meta->mm_dbs[MAIN_DBI].md_flags))) { + WARNING("meta[%u] has invalid %s flags 0x%u, skip it", meta_number, + "MainDB", meta->mm_dbs[MAIN_DBI].md_flags); + return MDBX_INCOMPATIBLE; + } + DEBUG("checking meta%" PRIaPGNO " = root %" PRIaPGNO "/%" PRIaPGNO ", geo %" PRIaPGNO "/%" PRIaPGNO "-%" PRIaPGNO "/%" PRIaPGNO " +%u -%u, txn_id %" PRIaTXN ", %s", @@ -16660,6 +17237,8 @@ __cold static MDBX_meta *init_metas(const MDBX_env *env, void *buffer) { static int sync_locked(MDBX_env *env, unsigned flags, MDBX_meta *const pending, meta_troika_t *const troika) { eASSERT(env, ((env->me_flags ^ flags) & MDBX_WRITEMAP) == 0); + eASSERT(env, pending->mm_dbs[FREE_DBI].md_flags == MDBX_INTEGERKEY); + eASSERT(env, db_check_flags(pending->mm_dbs[MAIN_DBI].md_flags)); const MDBX_meta *const meta0 = METAPAGE(env, 0); const MDBX_meta *const meta1 = METAPAGE(env, 1); const MDBX_meta *const meta2 = METAPAGE(env, 2); @@ -16703,7 +17282,7 @@ static int sync_locked(MDBX_env *env, unsigned flags, MDBX_meta *const pending, : pending->mm_geo.next); eASSERT(env, largest_pgno >= NUM_METAS); -#if defined(MDBX_USE_VALGRIND) || defined(__SANITIZE_ADDRESS__) +#if defined(ENABLE_MEMCHECK) || defined(__SANITIZE_ADDRESS__) const pgno_t edge = env->me_poison_edge; if (edge > largest_pgno) { env->me_poison_edge = largest_pgno; @@ -16714,7 +17293,7 @@ static int sync_locked(MDBX_env *env, unsigned flags, MDBX_meta *const pending, ptr_disp(env->me_map, pgno2bytes(env, largest_pgno)), 
pgno2bytes(env, edge - largest_pgno)); } -#endif /* MDBX_USE_VALGRIND || __SANITIZE_ADDRESS__ */ +#endif /* ENABLE_MEMCHECK || __SANITIZE_ADDRESS__ */ #if MDBX_ENABLE_MADVISE && \ (defined(MADV_DONTNEED) || defined(POSIX_MADV_DONTNEED)) @@ -16805,9 +17384,7 @@ static int sync_locked(MDBX_env *env, unsigned flags, MDBX_meta *const pending, const txnid_t txnid = safe64_txnid_next(pending->unsafe_txnid); NOTICE("force-forward pending-txn %" PRIaTXN " -> %" PRIaTXN, pending->unsafe_txnid, txnid); - ENSURE(env, !env->me_txn0 || - (env->me_txn0->mt_owner != osal_thread_self() && - !env->me_txn)); + ENSURE(env, !env->me_txn0 || !env->me_txn); if (unlikely(txnid > MAX_TXNID)) { rc = MDBX_TXN_FULL; ERROR("txnid overflow, raise %d", rc); @@ -16958,6 +17535,8 @@ static int sync_locked(MDBX_env *env, unsigned flags, MDBX_meta *const pending, target->mm_geo = pending->mm_geo; target->mm_dbs[FREE_DBI] = pending->mm_dbs[FREE_DBI]; target->mm_dbs[MAIN_DBI] = pending->mm_dbs[MAIN_DBI]; + eASSERT(env, target->mm_dbs[FREE_DBI].md_flags == MDBX_INTEGERKEY); + eASSERT(env, db_check_flags(target->mm_dbs[MAIN_DBI].md_flags)); target->mm_canary = pending->mm_canary; memcpy(target->mm_pages_retired, pending->mm_pages_retired, 8); jitter4testing(true); @@ -17012,6 +17591,8 @@ static int sync_locked(MDBX_env *env, unsigned flags, MDBX_meta *const pending, env->me_lck->mti_pgop_stat.wops.weak += 1; #endif /* MDBX_ENABLE_PGOP_STAT */ const MDBX_meta undo_meta = *target; + eASSERT(env, pending->mm_dbs[FREE_DBI].md_flags == MDBX_INTEGERKEY); + eASSERT(env, db_check_flags(pending->mm_dbs[MAIN_DBI].md_flags)); rc = osal_pwrite(env->me_fd4meta, pending, sizeof(MDBX_meta), ptr_dist(target, env->me_map)); if (unlikely(rc != MDBX_SUCCESS)) { @@ -17177,14 +17758,6 @@ __cold static void setup_pagesize(MDBX_env *env, const size_t pagesize) { env->me_options.dp_initial = env->me_options.dp_limit; } -static __inline MDBX_CONST_FUNCTION MDBX_lockinfo * -lckless_stub(const MDBX_env *env) { - uintptr_t stub = 
(uintptr_t)&env->x_lckless_stub; - /* align to avoid false-positive alarm from UndefinedBehaviorSanitizer */ - stub = (stub + MDBX_CACHELINE_SIZE - 1) & ~(MDBX_CACHELINE_SIZE - 1); - return (MDBX_lockinfo *)stub; -} - __cold int mdbx_env_create(MDBX_env **penv) { if (unlikely(!penv)) return MDBX_EINVAL; @@ -17228,7 +17801,6 @@ __cold int mdbx_env_create(MDBX_env **penv) { env->me_maxdbs = env->me_numdbs = CORE_DBS; env->me_lazy_fd = env->me_dsync_fd = env->me_fd4meta = env->me_lfd = INVALID_HANDLE_VALUE; - env->me_pid = osal_getpid(); env->me_stuck_meta = -1; env->me_options.rp_augment_limit = MDBX_PNL_INITIAL; @@ -17268,7 +17840,7 @@ __cold int mdbx_env_create(MDBX_env **penv) { #if MDBX_LOCKING > MDBX_LOCKING_SYSV MDBX_lockinfo *const stub = lckless_stub(env); - rc = osal_ipclock_stub(&stub->mti_wlock); + rc = osal_ipclock_stubinit(&stub->mti_wlock); #endif /* MDBX_LOCKING */ if (unlikely(rc != MDBX_SUCCESS)) { osal_fastmutex_destroy(&env->me_remap_guard); @@ -17326,8 +17898,9 @@ __cold int mdbx_env_set_geometry(MDBX_env *env, intptr_t size_lower, if (unlikely(rc != MDBX_SUCCESS)) return rc; - const bool inside_txn = - (env->me_txn0 && env->me_txn0->mt_owner == osal_thread_self()); + const bool txn0_owned = env->me_txn0 && env_txn0_owned(env); + const bool inside_txn = txn0_owned && env->me_txn; + bool should_unlock = false; #if MDBX_DEBUG if (growth_step < 0) { @@ -17338,17 +17911,16 @@ __cold int mdbx_env_set_geometry(MDBX_env *env, intptr_t size_lower, #endif /* MDBX_DEBUG */ intptr_t reasonable_maxsize = 0; - bool need_unlock = false; if (env->me_map) { /* env already mapped */ if (unlikely(env->me_flags & MDBX_RDONLY)) return MDBX_EACCESS; - if (!inside_txn) { - int err = mdbx_txn_lock(env, false); + if (!txn0_owned) { + int err = osal_txn_lock(env, false); if (unlikely(err != MDBX_SUCCESS)) return err; - need_unlock = true; + should_unlock = true; env->me_txn0->tw.troika = meta_tap(env); eASSERT(env, !env->me_txn && !env->me_txn0->mt_child); 
env->me_txn0->mt_txnid = @@ -17574,7 +18146,7 @@ __cold int mdbx_env_set_geometry(MDBX_env *env, intptr_t size_lower, MDBX_meta meta; memset(&meta, 0, sizeof(meta)); if (!inside_txn) { - eASSERT(env, need_unlock); + eASSERT(env, should_unlock); const meta_ptr_t head = meta_recent(env, &env->me_txn0->tw.troika); uint64_t timestamp = 0; @@ -17662,7 +18234,7 @@ __cold int mdbx_env_set_geometry(MDBX_env *env, intptr_t size_lower, if (unlikely(rc != MDBX_SUCCESS)) goto bailout; } -#endif +#endif /* Windows */ if (new_geo.now != current_geo->now || new_geo.upper != current_geo->upper) { @@ -17695,8 +18267,8 @@ __cold int mdbx_env_set_geometry(MDBX_env *env, intptr_t size_lower, } bailout: - if (need_unlock) - mdbx_txn_unlock(env); + if (should_unlock) + osal_txn_unlock(env); return rc; } @@ -17711,6 +18283,7 @@ __cold static int alloc_page_buf(MDBX_env *env) { __cold static int setup_dxb(MDBX_env *env, const int lck_rc, const mdbx_mode_t mode_bits) { MDBX_meta header; + eASSERT(env, !(env->me_flags & MDBX_ENV_ACTIVE)); int rc = MDBX_RESULT_FALSE; int err = read_header(env, &header, lck_rc, mode_bits); if (unlikely(err != MDBX_SUCCESS)) { @@ -17761,6 +18334,19 @@ __cold static int setup_dxb(MDBX_env *env, const int lck_rc, pv2pages(header.mm_geo.shrink_pv), unaligned_peek_u64(4, header.mm_txnid_a), durable_caption(&header)); + if (unlikely(header.mm_dbs[FREE_DBI].md_flags != MDBX_INTEGERKEY)) { + ERROR("unexpected/invalid db-flags 0x%u for GC/FreeDB", + header.mm_dbs[FREE_DBI].md_flags); + return MDBX_INCOMPATIBLE; + } + env->me_db_flags[FREE_DBI] = DB_VALID | MDBX_INTEGERKEY; + env->me_dbxs[FREE_DBI].md_cmp = cmp_int_align4; /* aligned MDBX_INTEGERKEY */ + env->me_dbxs[FREE_DBI].md_dcmp = cmp_lenfast; + env->me_dbxs[FREE_DBI].md_klen_max = env->me_dbxs[FREE_DBI].md_klen_min = 8; + env->me_dbxs[FREE_DBI].md_vlen_min = 4; + env->me_dbxs[FREE_DBI].md_vlen_max = + mdbx_env_get_maxvalsize_ex(env, MDBX_INTEGERKEY); + if (env->me_psize != header.mm_psize) setup_pagesize(env, 
header.mm_psize); const size_t used_bytes = pgno2bytes(env, header.mm_geo.next); @@ -17906,7 +18492,7 @@ __cold static int setup_dxb(MDBX_env *env, const int lck_rc, return err; #endif /* MADV_DONTDUMP */ #if defined(MADV_DODUMP) - if (runtime_flags & MDBX_DBG_DUMP) { + if (mdbx_static.flags & MDBX_DBG_DUMP) { const size_t meta_length_aligned2os = pgno_align2os_bytes(env, NUM_METAS); err = madvise(env->me_map, meta_length_aligned2os, MADV_DODUMP) ? ignore_enosys(errno) @@ -17917,14 +18503,14 @@ __cold static int setup_dxb(MDBX_env *env, const int lck_rc, #endif /* MADV_DODUMP */ #endif /* MDBX_ENABLE_MADVISE */ -#ifdef MDBX_USE_VALGRIND +#ifdef ENABLE_MEMCHECK env->me_valgrind_handle = VALGRIND_CREATE_BLOCK(env->me_map, env->me_dxb_mmap.limit, "mdbx"); -#endif /* MDBX_USE_VALGRIND */ +#endif /* ENABLE_MEMCHECK */ eASSERT(env, used_bytes >= pgno2bytes(env, NUM_METAS) && used_bytes <= env->me_dxb_mmap.limit); -#if defined(MDBX_USE_VALGRIND) || defined(__SANITIZE_ADDRESS__) +#if defined(ENABLE_MEMCHECK) || defined(__SANITIZE_ADDRESS__) if (env->me_dxb_mmap.filesize > used_bytes && env->me_dxb_mmap.filesize < env->me_dxb_mmap.limit) { VALGRIND_MAKE_MEM_NOACCESS(ptr_disp(env->me_map, used_bytes), @@ -17936,13 +18522,12 @@ __cold static int setup_dxb(MDBX_env *env, const int lck_rc, bytes2pgno(env, (env->me_dxb_mmap.filesize < env->me_dxb_mmap.limit) ? 
env->me_dxb_mmap.filesize : env->me_dxb_mmap.limit); -#endif /* MDBX_USE_VALGRIND || __SANITIZE_ADDRESS__ */ +#endif /* ENABLE_MEMCHECK || __SANITIZE_ADDRESS__ */ meta_troika_t troika = meta_tap(env); #if MDBX_DEBUG meta_troika_dump(env, &troika); #endif - eASSERT(env, !env->me_txn && !env->me_txn0); //-------------------------------- validate/rollback head & steady meta-pages if (unlikely(env->me_stuck_meta >= 0)) { /* recovery mode */ @@ -18157,7 +18742,7 @@ __cold static int setup_dxb(MDBX_env *env, const int lck_rc, bytes2pgno(env, used_aligned2os_bytes), mo_Relaxed); if ((env->me_flags & MDBX_RDONLY) == 0 && env->me_stuck_meta < 0 && - (runtime_flags & MDBX_DBG_DONT_UPGRADE) == 0) { + (mdbx_static.flags & MDBX_DBG_DONT_UPGRADE) == 0) { for (int n = 0; n < NUM_METAS; ++n) { MDBX_meta *const meta = METAPAGE(env, n); if (unlikely(unaligned_peek_u64(4, &meta->mm_magic_and_version) != @@ -18238,101 +18823,48 @@ __cold static int setup_dxb(MDBX_env *env, const int lck_rc, /******************************************************************************/ -/* Open and/or initialize the lock region for the environment. 
*/ -__cold static int setup_lck(MDBX_env *env, pathchar_t *lck_pathname, - mdbx_mode_t mode) { - eASSERT(env, env->me_lazy_fd != INVALID_HANDLE_VALUE); - eASSERT(env, env->me_lfd == INVALID_HANDLE_VALUE); - - int err = osal_openfile(MDBX_OPEN_LCK, env, lck_pathname, &env->me_lfd, mode); - if (err != MDBX_SUCCESS) { - switch (err) { - default: - return err; - case MDBX_ENOFILE: - case MDBX_EACCESS: - case MDBX_EPERM: - if (!F_ISSET(env->me_flags, MDBX_RDONLY | MDBX_EXCLUSIVE)) - return err; - break; - case MDBX_EROFS: - if ((env->me_flags & MDBX_RDONLY) == 0) - return err; - break; - } - - if (err != MDBX_ENOFILE) { - /* ENSURE the file system is read-only */ - err = osal_check_fs_rdonly(env->me_lazy_fd, lck_pathname, err); - if (err != MDBX_SUCCESS && - /* ignore ERROR_NOT_SUPPORTED for exclusive mode */ - !(err == MDBX_ENOSYS && (env->me_flags & MDBX_EXCLUSIVE))) - return err; - } +__cold static int setup_lck_locked(MDBX_env *env) { + int err = rthc_register(env); + if (unlikely(err != MDBX_SUCCESS)) + return err; - /* LY: without-lck mode (e.g. exclusive or on read-only filesystem) */ - /* beginning of a locked section ---------------------------------------- */ - lcklist_lock(); - eASSERT(env, env->me_lcklist_next == nullptr); - env->me_lfd = INVALID_HANDLE_VALUE; - const int rc = osal_lck_seize(env); - if (MDBX_IS_ERROR(rc)) { - /* Calling lcklist_detach_locked() is required to restore POSIX-filelock - * and this job will be done by env_close(). 
*/ - lcklist_unlock(); - return rc; - } - /* insert into inprocess lck-list */ - env->me_lcklist_next = inprocess_lcklist_head; - inprocess_lcklist_head = env; - lcklist_unlock(); - /* end of a locked section ---------------------------------------------- */ + int lck_seize_rc = osal_lck_seize(env); + if (unlikely(MDBX_IS_ERROR(lck_seize_rc))) + return lck_seize_rc; + if (env->me_lfd == INVALID_HANDLE_VALUE) { env->me_lck = lckless_stub(env); env->me_maxreaders = UINT_MAX; DEBUG("lck-setup:%s%s%s", " lck-less", (env->me_flags & MDBX_RDONLY) ? " readonly" : "", - (rc == MDBX_RESULT_TRUE) ? " exclusive" : " cooperative"); - return rc; + (lck_seize_rc == MDBX_RESULT_TRUE) ? " exclusive" : " cooperative"); + return lck_seize_rc; } - /* beginning of a locked section ------------------------------------------ */ - lcklist_lock(); - eASSERT(env, env->me_lcklist_next == nullptr); - - /* Try to get exclusive lock. If we succeed, then - * nobody is using the lock region and we should initialize it. */ - err = osal_lck_seize(env); - if (MDBX_IS_ERROR(err)) { - bailout: - /* Calling lcklist_detach_locked() is required to restore POSIX-filelock - * and this job will be done by env_close(). */ - lcklist_unlock(); - return err; - } + DEBUG("lck-setup:%s%s%s", " with-lck", + (env->me_flags & MDBX_RDONLY) ? " readonly" : "", + (lck_seize_rc == MDBX_RESULT_TRUE) ? 
" exclusive" : " cooperative"); MDBX_env *inprocess_neighbor = nullptr; - if (err == MDBX_RESULT_TRUE) { - err = uniq_check(&env->me_lck_mmap, &inprocess_neighbor); - if (MDBX_IS_ERROR(err)) - goto bailout; - if (inprocess_neighbor && - ((runtime_flags & MDBX_DBG_LEGACY_MULTIOPEN) == 0 || - (inprocess_neighbor->me_flags & MDBX_EXCLUSIVE) != 0)) { - err = MDBX_BUSY; - goto bailout; + err = rthc_uniq_check(&env->me_lck_mmap, &inprocess_neighbor); + if (unlikely(MDBX_IS_ERROR(err))) + return err; + if (inprocess_neighbor) { + if ((mdbx_static.flags & MDBX_DBG_LEGACY_MULTIOPEN) == 0 || + (inprocess_neighbor->me_flags & MDBX_EXCLUSIVE) != 0) + return MDBX_BUSY; + if (lck_seize_rc == MDBX_RESULT_TRUE) { + err = osal_lck_downgrade(env); + if (unlikely(err != MDBX_SUCCESS)) + return err; + lck_seize_rc = MDBX_RESULT_FALSE; } } - const int lck_seize_rc = err; - - DEBUG("lck-setup:%s%s%s", " with-lck", - (env->me_flags & MDBX_RDONLY) ? " readonly" : "", - (lck_seize_rc == MDBX_RESULT_TRUE) ? " exclusive" : " cooperative"); uint64_t size = 0; err = osal_filesize(env->me_lfd, &size); if (unlikely(err != MDBX_SUCCESS)) - goto bailout; + return err; if (lck_seize_rc == MDBX_RESULT_TRUE) { size = ceil_powerof2(env->me_maxreaders * sizeof(MDBX_reader) + @@ -18340,15 +18872,12 @@ __cold static int setup_lck(MDBX_env *env, pathchar_t *lck_pathname, env->me_os_psize); jitter4testing(false); } else { - if (env->me_flags & MDBX_EXCLUSIVE) { - err = MDBX_BUSY; - goto bailout; - } + if (env->me_flags & MDBX_EXCLUSIVE) + return MDBX_BUSY; if (size > INT_MAX || (size & (env->me_os_psize - 1)) != 0 || size < env->me_os_psize) { ERROR("lck-file has invalid size %" PRIu64 " bytes", size); - err = MDBX_PROBLEM; - goto bailout; + return MDBX_PROBLEM; } } @@ -18356,8 +18885,7 @@ __cold static int setup_lck(MDBX_env *env, pathchar_t *lck_pathname, ((size_t)size - sizeof(MDBX_lockinfo)) / sizeof(MDBX_reader); if (maxreaders < 4) { ERROR("lck-size too small (up to %" PRIuPTR " readers)", 
maxreaders); - err = MDBX_PROBLEM; - goto bailout; + return MDBX_PROBLEM; } env->me_maxreaders = (maxreaders <= MDBX_READERS_LIMIT) ? (unsigned)maxreaders @@ -18368,14 +18896,14 @@ __cold static int setup_lck(MDBX_env *env, pathchar_t *lck_pathname, lck_seize_rc ? MMAP_OPTION_TRUNCATE | MMAP_OPTION_SEMAPHORE : MMAP_OPTION_SEMAPHORE); if (unlikely(err != MDBX_SUCCESS)) - goto bailout; + return err; #if MDBX_ENABLE_MADVISE #ifdef MADV_DODUMP err = madvise(env->me_lck_mmap.lck, size, MADV_DODUMP) ? ignore_enosys(errno) : MDBX_SUCCESS; if (unlikely(MDBX_IS_ERROR(err))) - goto bailout; + return err; #endif /* MADV_DODUMP */ #ifdef MADV_WILLNEED @@ -18383,18 +18911,19 @@ __cold static int setup_lck(MDBX_env *env, pathchar_t *lck_pathname, ? ignore_enosys(errno) : MDBX_SUCCESS; if (unlikely(MDBX_IS_ERROR(err))) - goto bailout; + return err; #elif defined(POSIX_MADV_WILLNEED) err = ignore_enosys( posix_madvise(env->me_lck_mmap.lck, size, POSIX_MADV_WILLNEED)); if (unlikely(MDBX_IS_ERROR(err))) - goto bailout; + return err; #endif /* MADV_WILLNEED */ #endif /* MDBX_ENABLE_MADVISE */ - struct MDBX_lockinfo *const lck = env->me_lck_mmap.lck; + struct MDBX_lockinfo *lck = env->me_lck_mmap.lck; if (lck_seize_rc == MDBX_RESULT_TRUE) { - /* LY: exclusive mode, check and reset lck content */ + /* If we succeed got exclusive lock, then nobody is using the lock region + * and we should initialize it. */ memset(lck, 0, (size_t)size); jitter4testing(false); lck->mti_magic_and_version = MDBX_LOCK_MAGIC; @@ -18406,7 +18935,8 @@ __cold static int setup_lck(MDBX_env *env, pathchar_t *lck_pathname, MDBX_SYNC_DATA | MDBX_SYNC_SIZE); if (unlikely(err != MDBX_SUCCESS)) { ERROR("initial-%s for lck-file failed, err %d", "msync/fsync", err); - goto bailout; + eASSERT(env, MDBX_IS_ERROR(err)); + return err; } } else { if (lck->mti_magic_and_version != MDBX_LOCK_MAGIC) { @@ -18416,34 +18946,69 @@ __cold static int setup_lck(MDBX_env *env, pathchar_t *lck_pathname, ? 
"invalid magic" : "incompatible version (only applications with nearly or the " "same versions of libmdbx can share the same database)"); - err = invalid ? MDBX_INVALID : MDBX_VERSION_MISMATCH; - goto bailout; + return invalid ? MDBX_INVALID : MDBX_VERSION_MISMATCH; } if (lck->mti_os_and_format != MDBX_LOCK_FORMAT) { ERROR("lock region has os/format signature 0x%" PRIx32 ", expected 0x%" PRIx32, lck->mti_os_and_format, MDBX_LOCK_FORMAT); - err = MDBX_VERSION_MISMATCH; - goto bailout; + return MDBX_VERSION_MISMATCH; } } err = osal_lck_init(env, inprocess_neighbor, lck_seize_rc); - if (MDBX_IS_ERROR(err)) - goto bailout; - - ENSURE(env, env->me_lcklist_next == nullptr); - /* insert into inprocess lck-list */ - env->me_lcklist_next = inprocess_lcklist_head; - inprocess_lcklist_head = env; - lcklist_unlock(); - /* end of a locked section ------------------------------------------------ */ + if (unlikely(err != MDBX_SUCCESS)) { + eASSERT(env, MDBX_IS_ERROR(err)); + return err; + } - eASSERT(env, !MDBX_IS_ERROR(lck_seize_rc)); env->me_lck = lck; + eASSERT(env, !MDBX_IS_ERROR(lck_seize_rc)); return lck_seize_rc; } +/* Open and/or initialize the lock region for the environment. 
*/ +__cold static int setup_lck(MDBX_env *env, mdbx_mode_t mode) { + eASSERT(env, env->me_lazy_fd != INVALID_HANDLE_VALUE); + eASSERT(env, env->me_lfd == INVALID_HANDLE_VALUE); + + int err = osal_openfile(MDBX_OPEN_LCK, env, env->me_pathname.lck, + &env->me_lfd, mode); + if (err != MDBX_SUCCESS) { + switch (err) { + default: + return err; + case MDBX_ENOFILE: + case MDBX_EACCESS: + case MDBX_EPERM: + if (!F_ISSET(env->me_flags, MDBX_RDONLY | MDBX_EXCLUSIVE)) + return err; + break; + case MDBX_EROFS: + if ((env->me_flags & MDBX_RDONLY) == 0) + return err; + break; + } + + if (err != MDBX_ENOFILE) { + /* ENSURE the file system is read-only */ + err = osal_check_fs_rdonly(env->me_lazy_fd, env->me_pathname.lck, err); + if (err != MDBX_SUCCESS && + /* ignore ERROR_NOT_SUPPORTED for exclusive mode */ + !(err == MDBX_ENOSYS && (env->me_flags & MDBX_EXCLUSIVE))) + return err; + } + + /* LY: without-lck mode (e.g. exclusive or on read-only filesystem) */ + env->me_lfd = INVALID_HANDLE_VALUE; + } + + rthc_lock(); + err = setup_lck_locked(env); + rthc_unlock(); + return err; +} + __cold int mdbx_is_readahead_reasonable(size_t volume, intptr_t redundancy) { if (volume <= 1024 * 1024 * 4ul) return MDBX_RESULT_TRUE; @@ -18513,11 +19078,11 @@ __cold static int __must_check_result override_meta(MDBX_env *env, if (shape) { if (txnid && unlikely(!check_meta_coherency(env, shape, false))) { ERROR("bailout overriding meta-%zu since model failed " - "freedb/maindb %s-check for txnid #%" PRIaTXN, + "FreeDB/MainDB %s-check for txnid #%" PRIaTXN, target, "pre", constmeta_txnid(shape)); return MDBX_PROBLEM; } - if (runtime_flags & MDBX_DBG_DONT_UPGRADE) + if (mdbx_static.flags & MDBX_DBG_DONT_UPGRADE) memcpy(&model->mm_magic_and_version, &shape->mm_magic_and_version, sizeof(model->mm_magic_and_version)); model->mm_extra_flags = shape->mm_extra_flags; @@ -18537,7 +19102,7 @@ __cold static int __must_check_result override_meta(MDBX_env *env, sizeof(model->mm_magic_and_version)); if 
(unlikely(!check_meta_coherency(env, model, false))) { ERROR("bailout overriding meta-%zu since model failed " - "freedb/maindb %s-check for txnid #%" PRIaTXN, + "FreeDB/MainDB %s-check for txnid #%" PRIaTXN, target, "post", txnid); return MDBX_PROBLEM; } @@ -18669,12 +19234,6 @@ __cold int mdbx_env_open_for_recoveryW(MDBX_env *env, const wchar_t *pathname, 0); } -typedef struct { - void *buffer_for_free; - pathchar_t *lck, *dxb; - size_t ent_len; -} MDBX_handle_env_pathname; - __cold static int check_alternative_lck_absent(const pathchar_t *lck_pathname) { int err = osal_fileexists(lck_pathname); if (unlikely(err != MDBX_RESULT_FALSE)) { @@ -18686,11 +19245,9 @@ __cold static int check_alternative_lck_absent(const pathchar_t *lck_pathname) { return err; } -__cold static int handle_env_pathname(MDBX_handle_env_pathname *ctx, - const pathchar_t *pathname, - MDBX_env_flags_t *flags, +__cold static int env_handle_pathname(MDBX_env *env, const pathchar_t *pathname, const mdbx_mode_t mode) { - memset(ctx, 0, sizeof(*ctx)); + memset(&env->me_pathname, 0, sizeof(env->me_pathname)); if (unlikely(!pathname || !*pathname)) return MDBX_EINVAL; @@ -18701,21 +19258,22 @@ __cold static int handle_env_pathname(MDBX_handle_env_pathname *ctx, rc = GetLastError(); if (rc != MDBX_ENOFILE) return rc; - if (mode == 0 || (*flags & MDBX_RDONLY) != 0) + if (mode == 0 || (env->me_flags & MDBX_RDONLY) != 0) /* can't open existing */ return rc; /* auto-create directory if requested */ - if ((*flags & MDBX_NOSUBDIR) == 0 && !CreateDirectoryW(pathname, nullptr)) { + if ((env->me_flags & MDBX_NOSUBDIR) == 0 && + !CreateDirectoryW(pathname, nullptr)) { rc = GetLastError(); if (rc != ERROR_ALREADY_EXISTS) return rc; } } else { /* ignore passed MDBX_NOSUBDIR flag and set it automatically */ - *flags |= MDBX_NOSUBDIR; + env->me_flags |= MDBX_NOSUBDIR; if (dwAttrib & FILE_ATTRIBUTE_DIRECTORY) - *flags -= MDBX_NOSUBDIR; + env->me_flags -= MDBX_NOSUBDIR; } #else struct stat st; @@ -18723,7 +19281,7 @@ 
__cold static int handle_env_pathname(MDBX_handle_env_pathname *ctx, rc = errno; if (rc != MDBX_ENOFILE) return rc; - if (mode == 0 || (*flags & MDBX_RDONLY) != 0) + if (mode == 0 || (env->me_flags & MDBX_RDONLY) != 0) /* can't open non-existing */ return rc /* MDBX_ENOFILE */; @@ -18734,16 +19292,16 @@ __cold static int handle_env_pathname(MDBX_handle_env_pathname *ctx, /* always add read/write/search for owner */ S_IRWXU | ((mode & S_IRGRP) ? /* +search if readable by group */ S_IXGRP : 0) | ((mode & S_IROTH) ? /* +search if readable by others */ S_IXOTH : 0); - if ((*flags & MDBX_NOSUBDIR) == 0 && mkdir(pathname, dir_mode)) { + if ((env->me_flags & MDBX_NOSUBDIR) == 0 && mkdir(pathname, dir_mode)) { rc = errno; if (rc != EEXIST) return rc; } } else { /* ignore passed MDBX_NOSUBDIR flag and set it automatically */ - *flags |= MDBX_NOSUBDIR; + env->me_flags |= MDBX_NOSUBDIR; if (S_ISDIR(st.st_mode)) - *flags -= MDBX_NOSUBDIR; + env->me_flags -= MDBX_NOSUBDIR; } #endif @@ -18759,41 +19317,42 @@ __cold static int handle_env_pathname(MDBX_handle_env_pathname *ctx, const size_t pathname_len = strlen(pathname); #endif assert(!osal_isdirsep(lock_suffix[0])); - ctx->ent_len = pathname_len; + size_t base_len = pathname_len; static const size_t dxb_name_len = ARRAY_LENGTH(dxb_name) - 1; - if (*flags & MDBX_NOSUBDIR) { - if (ctx->ent_len > dxb_name_len && - osal_pathequal(pathname + ctx->ent_len - dxb_name_len, dxb_name, + if (env->me_flags & MDBX_NOSUBDIR) { + if (base_len > dxb_name_len && + osal_pathequal(pathname + base_len - dxb_name_len, dxb_name, dxb_name_len)) { - *flags -= MDBX_NOSUBDIR; - ctx->ent_len -= dxb_name_len; - } else if (ctx->ent_len == dxb_name_len - 1 && osal_isdirsep(dxb_name[0]) && + env->me_flags -= MDBX_NOSUBDIR; + base_len -= dxb_name_len; + } else if (base_len == dxb_name_len - 1 && osal_isdirsep(dxb_name[0]) && osal_isdirsep(lck_name[0]) && - osal_pathequal(pathname + ctx->ent_len - dxb_name_len + 1, + osal_pathequal(pathname + base_len - 
dxb_name_len + 1, dxb_name + 1, dxb_name_len - 1)) { - *flags -= MDBX_NOSUBDIR; - ctx->ent_len -= dxb_name_len - 1; + env->me_flags -= MDBX_NOSUBDIR; + base_len -= dxb_name_len - 1; } } const size_t suflen_with_NOSUBDIR = sizeof(lock_suffix) + sizeof(pathchar_t); const size_t suflen_without_NOSUBDIR = sizeof(lck_name) + sizeof(dxb_name); - const size_t enogh4any = (suflen_with_NOSUBDIR > suflen_without_NOSUBDIR) - ? suflen_with_NOSUBDIR - : suflen_without_NOSUBDIR; - const size_t bytes_needed = sizeof(pathchar_t) * ctx->ent_len * 2 + enogh4any; - ctx->buffer_for_free = osal_malloc(bytes_needed); - if (!ctx->buffer_for_free) + const size_t enough4any = (suflen_with_NOSUBDIR > suflen_without_NOSUBDIR) + ? suflen_with_NOSUBDIR + : suflen_without_NOSUBDIR; + const size_t bytes_needed = + sizeof(pathchar_t) * (base_len * 2 + pathname_len + 1) + enough4any; + env->me_pathname.buffer = osal_malloc(bytes_needed); + if (!env->me_pathname.buffer) return MDBX_ENOMEM; - ctx->dxb = ctx->buffer_for_free; - ctx->lck = ctx->dxb + ctx->ent_len + dxb_name_len + 1; - pathchar_t *const buf = ctx->buffer_for_free; + env->me_pathname.specified = env->me_pathname.buffer; + env->me_pathname.dxb = env->me_pathname.specified + pathname_len + 1; + env->me_pathname.lck = env->me_pathname.dxb + base_len + dxb_name_len + 1; rc = MDBX_SUCCESS; - if (ctx->ent_len) { - memcpy(buf + /* shutting up goofy MSVC static analyzer */ 0, pathname, - sizeof(pathchar_t) * pathname_len); - if (*flags & MDBX_NOSUBDIR) { + pathchar_t *const buf = env->me_pathname.buffer; + if (base_len) { + memcpy(buf, pathname, sizeof(pathchar_t) * pathname_len); + if (env->me_flags & MDBX_NOSUBDIR) { const pathchar_t *const lck_ext = osal_fileext(lck_name, ARRAY_LENGTH(lck_name)); if (lck_ext) { @@ -18803,32 +19362,33 @@ __cold static int handle_env_pathname(MDBX_handle_env_pathname *ctx, rc = check_alternative_lck_absent(buf); } } else { - memcpy(buf + ctx->ent_len, dxb_name, sizeof(dxb_name)); - memcpy(buf + ctx->ent_len + 
dxb_name_len, lock_suffix, - sizeof(lock_suffix)); + memcpy(buf + base_len, dxb_name, sizeof(dxb_name)); + memcpy(buf + base_len + dxb_name_len, lock_suffix, sizeof(lock_suffix)); rc = check_alternative_lck_absent(buf); } - memcpy(ctx->dxb + /* shutting up goofy MSVC static analyzer */ 0, pathname, - sizeof(pathchar_t) * (ctx->ent_len + 1)); - memcpy(ctx->lck, pathname, sizeof(pathchar_t) * ctx->ent_len); - if (*flags & MDBX_NOSUBDIR) { - memcpy(ctx->lck + ctx->ent_len, lock_suffix, sizeof(lock_suffix)); + memcpy(env->me_pathname.dxb, pathname, sizeof(pathchar_t) * (base_len + 1)); + memcpy(env->me_pathname.lck, pathname, sizeof(pathchar_t) * base_len); + if (env->me_flags & MDBX_NOSUBDIR) { + memcpy(env->me_pathname.lck + base_len, lock_suffix, sizeof(lock_suffix)); } else { - memcpy(ctx->dxb + ctx->ent_len, dxb_name, sizeof(dxb_name)); - memcpy(ctx->lck + ctx->ent_len, lck_name, sizeof(lck_name)); + memcpy(env->me_pathname.dxb + base_len, dxb_name, sizeof(dxb_name)); + memcpy(env->me_pathname.lck + base_len, lck_name, sizeof(lck_name)); } } else { - assert(!(*flags & MDBX_NOSUBDIR)); - memcpy(buf + /* shutting up goofy MSVC static analyzer */ 0, dxb_name + 1, - sizeof(dxb_name) - sizeof(pathchar_t)); + assert(!(env->me_flags & MDBX_NOSUBDIR)); + memcpy(buf, dxb_name + 1, sizeof(dxb_name) - sizeof(pathchar_t)); memcpy(buf + dxb_name_len - 1, lock_suffix, sizeof(lock_suffix)); rc = check_alternative_lck_absent(buf); - memcpy(ctx->dxb + /* shutting up goofy MSVC static analyzer */ 0, - dxb_name + 1, sizeof(dxb_name) - sizeof(pathchar_t)); - memcpy(ctx->lck, lck_name + 1, sizeof(lck_name) - sizeof(pathchar_t)); + memcpy(env->me_pathname.dxb, dxb_name + 1, + sizeof(dxb_name) - sizeof(pathchar_t)); + memcpy(env->me_pathname.lck, lck_name + 1, + sizeof(lck_name) - sizeof(pathchar_t)); } + + memcpy(env->me_pathname.specified, pathname, + sizeof(pathchar_t) * (pathname_len + 1)); return rc; } @@ -18866,23 +19426,19 @@ __cold int mdbx_env_deleteW(const wchar_t *pathname, 
(mode == MDBX_ENV_ENSURE_UNUSED) ? MDBX_EXCLUSIVE : MDBX_ENV_DEFAULTS; dummy_env->me_os_psize = (unsigned)osal_syspagesize(); dummy_env->me_psize = (unsigned)mdbx_default_pagesize(); - dummy_env->me_pathname = (pathchar_t *)pathname; - MDBX_handle_env_pathname env_pathname; STATIC_ASSERT(sizeof(dummy_env->me_flags) == sizeof(MDBX_env_flags_t)); - int rc = MDBX_RESULT_TRUE, - err = handle_env_pathname(&env_pathname, pathname, - (MDBX_env_flags_t *)&dummy_env->me_flags, 0); + int rc = MDBX_RESULT_TRUE, err = env_handle_pathname(dummy_env, pathname, 0); if (likely(err == MDBX_SUCCESS)) { mdbx_filehandle_t clk_handle = INVALID_HANDLE_VALUE, dxb_handle = INVALID_HANDLE_VALUE; if (mode > MDBX_ENV_JUST_DELETE) { - err = osal_openfile(MDBX_OPEN_DELETE, dummy_env, env_pathname.dxb, - &dxb_handle, 0); + err = osal_openfile(MDBX_OPEN_DELETE, dummy_env, + dummy_env->me_pathname.dxb, &dxb_handle, 0); err = (err == MDBX_ENOFILE) ? MDBX_SUCCESS : err; if (err == MDBX_SUCCESS) { - err = osal_openfile(MDBX_OPEN_DELETE, dummy_env, env_pathname.lck, - &clk_handle, 0); + err = osal_openfile(MDBX_OPEN_DELETE, dummy_env, + dummy_env->me_pathname.lck, &clk_handle, 0); err = (err == MDBX_ENOFILE) ? 
MDBX_SUCCESS : err; } if (err == MDBX_SUCCESS && clk_handle != INVALID_HANDLE_VALUE) @@ -18892,7 +19448,7 @@ __cold int mdbx_env_deleteW(const wchar_t *pathname, } if (err == MDBX_SUCCESS) { - err = osal_removefile(env_pathname.dxb); + err = osal_removefile(dummy_env->me_pathname.dxb); if (err == MDBX_SUCCESS) rc = MDBX_SUCCESS; else if (err == MDBX_ENOFILE) @@ -18900,14 +19456,17 @@ __cold int mdbx_env_deleteW(const wchar_t *pathname, } if (err == MDBX_SUCCESS) { - err = osal_removefile(env_pathname.lck); + err = osal_removefile(dummy_env->me_pathname.lck); if (err == MDBX_SUCCESS) rc = MDBX_SUCCESS; else if (err == MDBX_ENOFILE) err = MDBX_SUCCESS; } - if (err == MDBX_SUCCESS && !(dummy_env->me_flags & MDBX_NOSUBDIR)) { + if (err == MDBX_SUCCESS && !(dummy_env->me_flags & MDBX_NOSUBDIR) && + (/* pathname != "." */ pathname[0] != '.' || pathname[1] != 0) && + (/* pathname != ".." */ pathname[0] != '.' || pathname[1] != '.' || + pathname[2] != 0)) { err = osal_removedirectory(pathname); if (err == MDBX_SUCCESS) rc = MDBX_SUCCESS; @@ -18922,92 +19481,11 @@ __cold int mdbx_env_deleteW(const wchar_t *pathname, } else if (err == MDBX_ENOFILE) err = MDBX_SUCCESS; - osal_free(env_pathname.buffer_for_free); + osal_free(dummy_env->me_pathname.buffer); return (err == MDBX_SUCCESS) ? 
rc : err; } -__cold int mdbx_env_open(MDBX_env *env, const char *pathname, - MDBX_env_flags_t flags, mdbx_mode_t mode) { -#if defined(_WIN32) || defined(_WIN64) - wchar_t *pathnameW = nullptr; - int rc = osal_mb2w(pathname, &pathnameW); - if (likely(rc == MDBX_SUCCESS)) { - rc = mdbx_env_openW(env, pathnameW, flags, mode); - osal_free(pathnameW); - if (rc == MDBX_SUCCESS) - /* force to make cache of the multi-byte pathname representation */ - mdbx_env_get_path(env, &pathname); - } - return rc; -} - -__cold int mdbx_env_openW(MDBX_env *env, const wchar_t *pathname, - MDBX_env_flags_t flags, mdbx_mode_t mode) { -#endif /* Windows */ - - int rc = check_env(env, false); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; - - if (unlikely(flags & ~ENV_USABLE_FLAGS)) - return MDBX_EINVAL; - - if (unlikely(env->me_lazy_fd != INVALID_HANDLE_VALUE || - (env->me_flags & MDBX_ENV_ACTIVE) != 0 || env->me_map)) - return MDBX_EPERM; - - /* Pickup previously mdbx_env_set_flags(), - * but avoid MDBX_UTTERLY_NOSYNC by disjunction */ - const uint32_t saved_me_flags = env->me_flags; - flags = merge_sync_flags(flags | MDBX_DEPRECATED_COALESCE, env->me_flags); - - if (flags & MDBX_RDONLY) { - /* Silently ignore irrelevant flags when we're only getting read access */ - flags &= ~(MDBX_WRITEMAP | MDBX_DEPRECATED_MAPASYNC | MDBX_SAFE_NOSYNC | - MDBX_NOMETASYNC | MDBX_DEPRECATED_COALESCE | MDBX_LIFORECLAIM | - MDBX_NOMEMINIT | MDBX_ACCEDE); - mode = 0; - } else { -#if MDBX_MMAP_INCOHERENT_FILE_WRITE - /* Temporary `workaround` for OpenBSD kernel's flaw. - * See https://libmdbx.dqdkfa.ru/dead-github/issues/67 */ - if ((flags & MDBX_WRITEMAP) == 0) { - if (flags & MDBX_ACCEDE) - flags |= MDBX_WRITEMAP; - else { - debug_log(MDBX_LOG_ERROR, __func__, __LINE__, - "System (i.e. 
OpenBSD) requires MDBX_WRITEMAP because " - "of an internal flaw(s) in a file/buffer/page cache.\n"); - return 42 /* ENOPROTOOPT */; - } - } -#endif /* MDBX_MMAP_INCOHERENT_FILE_WRITE */ - } - - MDBX_handle_env_pathname env_pathname; - rc = handle_env_pathname(&env_pathname, pathname, &flags, mode); - if (unlikely(rc != MDBX_SUCCESS)) - goto bailout; - - env->me_flags = (flags & ~MDBX_FATAL_ERROR) | MDBX_ENV_ACTIVE; - env->me_pathname = osal_calloc(env_pathname.ent_len + 1, sizeof(pathchar_t)); - env->me_dbxs = osal_calloc(env->me_maxdbs, sizeof(MDBX_dbx)); - env->me_dbflags = osal_calloc(env->me_maxdbs, sizeof(env->me_dbflags[0])); - env->me_dbiseqs = osal_calloc(env->me_maxdbs, sizeof(env->me_dbiseqs[0])); - if (!(env->me_dbxs && env->me_pathname && env->me_dbflags && - env->me_dbiseqs)) { - rc = MDBX_ENOMEM; - goto bailout; - } - memcpy(env->me_pathname, env_pathname.dxb, - env_pathname.ent_len * sizeof(pathchar_t)); - env->me_dbxs[FREE_DBI].md_cmp = cmp_int_align4; /* aligned MDBX_INTEGERKEY */ - env->me_dbxs[FREE_DBI].md_dcmp = cmp_lenfast; - env->me_dbxs[FREE_DBI].md_klen_max = env->me_dbxs[FREE_DBI].md_klen_min = 8; - env->me_dbxs[FREE_DBI].md_vlen_min = 4; - env->me_dbxs[FREE_DBI].md_vlen_max = - mdbx_env_get_maxvalsize_ex(env, MDBX_INTEGERKEY); - +__cold static int env_open(MDBX_env *env, mdbx_mode_t mode) { /* Использование O_DSYNC или FILE_FLAG_WRITE_THROUGH: * * 0) Если размер страниц БД меньше системной страницы ОЗУ, то ядру ОС @@ -19095,18 +19573,17 @@ __cold int mdbx_env_openW(MDBX_env *env, const wchar_t *pathname, * при этом для записи мета требуется отдельный не-overlapped дескриптор. */ - rc = osal_openfile((flags & MDBX_RDONLY) ? MDBX_OPEN_DXB_READ - : MDBX_OPEN_DXB_LAZY, - env, env_pathname.dxb, &env->me_lazy_fd, mode); - if (rc != MDBX_SUCCESS) - goto bailout; + env->me_pid = osal_getpid(); + int rc = osal_openfile((env->me_flags & MDBX_RDONLY) ? 
MDBX_OPEN_DXB_READ + : MDBX_OPEN_DXB_LAZY, + env, env->me_pathname.dxb, &env->me_lazy_fd, mode); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; #if MDBX_LOCKING == MDBX_LOCKING_SYSV - env->me_sysv_ipc.key = ftok(env_pathname.dxb, 42); - if (env->me_sysv_ipc.key == -1) { - rc = errno; - goto bailout; - } + env->me_sysv_ipc.key = ftok(env->me_pathname.dxb, 42); + if (unlikely(env->me_sysv_ipc.key == -1)) + return errno; #endif /* MDBX_LOCKING */ /* Set the position in files outside of the data to avoid corruption @@ -19118,9 +19595,9 @@ __cold int mdbx_env_openW(MDBX_env *env, const wchar_t *pathname, #if defined(_WIN32) || defined(_WIN64) eASSERT(env, env->me_overlapped_fd == 0); bool ior_direct = false; - if (!(flags & + if (!(env->me_flags & (MDBX_RDONLY | MDBX_SAFE_NOSYNC | MDBX_NOMETASYNC | MDBX_EXCLUSIVE))) { - if (MDBX_AVOID_MSYNC && (flags & MDBX_WRITEMAP)) { + if (MDBX_AVOID_MSYNC && (env->me_flags & MDBX_WRITEMAP)) { /* Запрошен режим MDBX_SYNC_DURABLE | MDBX_WRITEMAP при активной опции * MDBX_AVOID_MSYNC. * @@ -19157,24 +19634,20 @@ __cold int mdbx_env_openW(MDBX_env *env, const wchar_t *pathname, rc = osal_openfile(ior_direct ? 
MDBX_OPEN_DXB_OVERLAPPED_DIRECT : MDBX_OPEN_DXB_OVERLAPPED, - env, env_pathname.dxb, &env->me_overlapped_fd, 0); - if (rc != MDBX_SUCCESS) - goto bailout; + env, env->me_pathname.dxb, &env->me_overlapped_fd, 0); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; env->me_data_lock_event = CreateEventW(nullptr, true, false, nullptr); - if (!env->me_data_lock_event) { - rc = (int)GetLastError(); - goto bailout; - } + if (unlikely(!env->me_data_lock_event)) + return (int)GetLastError(); osal_fseek(env->me_overlapped_fd, safe_parking_lot_offset); } #else if (mode == 0) { /* pickup mode for lck-file */ struct stat st; - if (fstat(env->me_lazy_fd, &st)) { - rc = errno; - goto bailout; - } + if (unlikely(fstat(env->me_lazy_fd, &st))) + return errno; mode = st.st_mode; } mode = (/* inherit read permissions for group and others */ mode & @@ -19183,25 +19656,25 @@ __cold int mdbx_env_openW(MDBX_env *env, const wchar_t *pathname, ((mode & S_IRGRP) ? /* +write if readable by group */ S_IWGRP : 0) | ((mode & S_IROTH) ? 
/* +write if readable by others */ S_IWOTH : 0); #endif /* !Windows */ - const int lck_rc = setup_lck(env, env_pathname.lck, mode); - if (MDBX_IS_ERROR(lck_rc)) { - rc = lck_rc; - goto bailout; - } - osal_fseek(env->me_lfd, safe_parking_lot_offset); + const int lck_rc = setup_lck(env, mode); + if (unlikely(MDBX_IS_ERROR(lck_rc))) + return lck_rc; + if (env->me_lfd != INVALID_HANDLE_VALUE) + osal_fseek(env->me_lfd, safe_parking_lot_offset); eASSERT(env, env->me_dsync_fd == INVALID_HANDLE_VALUE); - if (!(flags & (MDBX_RDONLY | MDBX_SAFE_NOSYNC | MDBX_DEPRECATED_MAPASYNC + if (!(env->me_flags & + (MDBX_RDONLY | MDBX_SAFE_NOSYNC | MDBX_DEPRECATED_MAPASYNC #if defined(_WIN32) || defined(_WIN64) - | MDBX_EXCLUSIVE + | MDBX_EXCLUSIVE #endif /* !Windows */ - ))) { - rc = osal_openfile(MDBX_OPEN_DXB_DSYNC, env, env_pathname.dxb, + ))) { + rc = osal_openfile(MDBX_OPEN_DXB_DSYNC, env, env->me_pathname.dxb, &env->me_dsync_fd, 0); - if (MDBX_IS_ERROR(rc)) - goto bailout; + if (unlikely(MDBX_IS_ERROR(rc))) + return rc; if (env->me_dsync_fd != INVALID_HANDLE_VALUE) { - if ((flags & MDBX_NOMETASYNC) == 0) + if ((env->me_flags & MDBX_NOMETASYNC) == 0) env->me_fd4meta = env->me_dsync_fd; osal_fseek(env->me_dsync_fd, safe_parking_lot_offset); } @@ -19276,37 +19749,35 @@ __cold int mdbx_env_openW(MDBX_env *env, const wchar_t *pathname, ERROR("current mode/flags 0x%X incompatible with requested 0x%X, " "rigorous diff 0x%X", env->me_flags, snap_flags, rigorous_diff); - rc = MDBX_INCOMPATIBLE; - goto bailout; + return MDBX_INCOMPATIBLE; } } mincore_clean_cache(env); const int dxb_rc = setup_dxb(env, lck_rc, mode); - if (MDBX_IS_ERROR(dxb_rc)) { - rc = dxb_rc; - goto bailout; - } + if (MDBX_IS_ERROR(dxb_rc)) + return dxb_rc; rc = osal_check_fs_incore(env->me_lazy_fd); env->me_incore = false; if (rc == MDBX_RESULT_TRUE) { env->me_incore = true; NOTICE("%s", "in-core database"); + rc = MDBX_SUCCESS; } else if (unlikely(rc != MDBX_SUCCESS)) { ERROR("check_fs_incore(), err %d", rc); - goto 
bailout; + return rc; } if (unlikely(/* recovery mode */ env->me_stuck_meta >= 0) && (lck_rc != /* exclusive */ MDBX_RESULT_TRUE || - (flags & MDBX_EXCLUSIVE) == 0)) { + (env->me_flags & MDBX_EXCLUSIVE) == 0)) { ERROR("%s", "recovery requires exclusive mode"); - rc = MDBX_BUSY; - goto bailout; + return MDBX_BUSY; } DEBUG("opened dbenv %p", (void *)env); + env->me_flags |= MDBX_ENV_ACTIVE; if (!lck || lck_rc == MDBX_RESULT_TRUE) { env->me_lck->mti_envmode.weak = env->me_flags & mode_flags; env->me_lck->mti_meta_sync_txnid.weak = @@ -19319,130 +19790,222 @@ __cold int mdbx_env_openW(MDBX_env *env, const wchar_t *pathname, DEBUG("lck-downgrade-%s: rc %i", (env->me_flags & MDBX_EXCLUSIVE) ? "partial" : "full", rc); if (rc != MDBX_SUCCESS) - goto bailout; + return rc; } else { rc = cleanup_dead_readers(env, false, NULL); if (MDBX_IS_ERROR(rc)) - goto bailout; + return rc; } + } - if ((env->me_flags & MDBX_NOTLS) == 0) { - rc = rthc_alloc(&env->me_txkey, &lck->mti_readers[0], - &lck->mti_readers[env->me_maxreaders]); - if (unlikely(rc != MDBX_SUCCESS)) - goto bailout; - env->me_flags |= MDBX_ENV_TXKEY; + rc = (env->me_flags & MDBX_RDONLY) + ? 
MDBX_SUCCESS + : osal_ioring_create(&env->me_ioring +#if defined(_WIN32) || defined(_WIN64) + , + ior_direct, env->me_overlapped_fd +#endif /* Windows */ + ); + return rc; +} + +__cold int mdbx_env_open(MDBX_env *env, const char *pathname, + MDBX_env_flags_t flags, mdbx_mode_t mode) { +#if defined(_WIN32) || defined(_WIN64) + wchar_t *pathnameW = nullptr; + int rc = osal_mb2w(pathname, &pathnameW); + if (likely(rc == MDBX_SUCCESS)) { + rc = mdbx_env_openW(env, pathnameW, flags, mode); + osal_free(pathnameW); + if (rc == MDBX_SUCCESS) + /* force to make cache of the multi-byte pathname representation */ + mdbx_env_get_path(env, &pathname); + } + return rc; +} + +__cold int mdbx_env_openW(MDBX_env *env, const wchar_t *pathname, + MDBX_env_flags_t flags, mdbx_mode_t mode) { +#endif /* Windows */ + + int rc = check_env(env, false); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; + + if (unlikely(flags & ~ENV_USABLE_FLAGS)) + return MDBX_EINVAL; + + if (unlikely(env->me_lazy_fd != INVALID_HANDLE_VALUE || + (env->me_flags & MDBX_ENV_ACTIVE) != 0 || env->me_map)) + return MDBX_EPERM; + + /* Pickup previously mdbx_env_set_flags(), + * but avoid MDBX_UTTERLY_NOSYNC by disjunction */ + const uint32_t saved_me_flags = env->me_flags; + flags = merge_sync_flags(flags | MDBX_DEPRECATED_COALESCE, env->me_flags); + + if (flags & MDBX_RDONLY) { + /* Silently ignore irrelevant flags when we're only getting read access */ + flags &= ~(MDBX_WRITEMAP | MDBX_DEPRECATED_MAPASYNC | MDBX_SAFE_NOSYNC | + MDBX_NOMETASYNC | MDBX_DEPRECATED_COALESCE | MDBX_LIFORECLAIM | + MDBX_NOMEMINIT | MDBX_ACCEDE); + mode = 0; + } else { +#if MDBX_MMAP_INCOHERENT_FILE_WRITE + /* Temporary `workaround` for OpenBSD kernel's flaw. + * See https://libmdbx.dqdkfa.ru/dead-github/issues/67 */ + if ((flags & MDBX_WRITEMAP) == 0) { + if (flags & MDBX_ACCEDE) + flags |= MDBX_WRITEMAP; + else { + debug_log(MDBX_LOG_ERROR, __func__, __LINE__, + "System (i.e. 
OpenBSD) requires MDBX_WRITEMAP because " + "of an internal flaw(s) in a file/buffer/page cache.\n"); + return 42 /* ENOPROTOOPT */; + } } +#endif /* MDBX_MMAP_INCOHERENT_FILE_WRITE */ + } + + env->me_flags = (flags & ~MDBX_FATAL_ERROR); + rc = env_handle_pathname(env, pathname, mode); + if (unlikely(rc != MDBX_SUCCESS)) + goto bailout; + + env->me_dbxs = osal_calloc(env->me_maxdbs, sizeof(env->me_dbxs[0])); + env->me_db_flags = osal_calloc(env->me_maxdbs, sizeof(env->me_db_flags[0])); + env->me_dbi_seqs = osal_calloc(env->me_maxdbs, sizeof(env->me_dbi_seqs[0])); + if (unlikely(!(env->me_dbxs && env->me_db_flags && env->me_dbi_seqs))) { + rc = MDBX_ENOMEM; + goto bailout; } if ((flags & MDBX_RDONLY) == 0) { - const size_t tsize = sizeof(MDBX_txn) + sizeof(MDBX_cursor), - size = tsize + env->me_maxdbs * - (sizeof(MDBX_db) + sizeof(MDBX_cursor *) + - sizeof(MDBX_atomic_uint32_t) + 1); + MDBX_txn *txn = nullptr; + const intptr_t bitmap_bytes = +#if MDBX_ENABLE_DBI_SPARSE + ceil_powerof2(env->me_maxdbs, + CHAR_BIT * sizeof(txn->mt_dbi_sparse[0])) / + CHAR_BIT; +#else + 0; +#endif /* MDBX_ENABLE_DBI_SPARSE */ + const size_t base = sizeof(MDBX_txn) + sizeof(MDBX_cursor); + const size_t size = + base + bitmap_bytes + + env->me_maxdbs * + (sizeof(txn->mt_dbs[0]) + sizeof(txn->mt_cursors[0]) + + sizeof(txn->mt_dbi_seqs[0]) + sizeof(txn->mt_dbi_state[0])); rc = alloc_page_buf(env); - if (rc == MDBX_SUCCESS) { - memset(env->me_pbuf, -1, env->me_psize * (size_t)2); - memset(ptr_disp(env->me_pbuf, env->me_psize * (size_t)2), 0, - env->me_psize); - MDBX_txn *txn = osal_calloc(1, size); - if (txn) { - txn->mt_dbs = ptr_disp(txn, tsize); - txn->mt_cursors = - ptr_disp(txn->mt_dbs, sizeof(MDBX_db) * env->me_maxdbs); - txn->mt_dbiseqs = - ptr_disp(txn->mt_cursors, sizeof(MDBX_cursor *) * env->me_maxdbs); - txn->mt_dbistate = ptr_disp( - txn->mt_dbiseqs, sizeof(MDBX_atomic_uint32_t) * env->me_maxdbs); - txn->mt_env = env; - txn->mt_dbxs = env->me_dbxs; - txn->mt_flags = 
MDBX_TXN_FINISHED; - env->me_txn0 = txn; - txn->tw.retired_pages = pnl_alloc(MDBX_PNL_INITIAL); - txn->tw.relist = pnl_alloc(MDBX_PNL_INITIAL); - if (unlikely(!txn->tw.retired_pages || !txn->tw.relist)) - rc = MDBX_ENOMEM; - } else - rc = MDBX_ENOMEM; + if (unlikely(rc != MDBX_SUCCESS)) + goto bailout; + + memset(env->me_pbuf, -1, env->me_psize * (size_t)2); + memset(ptr_disp(env->me_pbuf, env->me_psize * (size_t)2), 0, env->me_psize); + txn = osal_calloc(1, size); + if (unlikely(!txn)) { + rc = MDBX_ENOMEM; + goto bailout; } - if (rc == MDBX_SUCCESS) - rc = osal_ioring_create(&env->me_ioring -#if defined(_WIN32) || defined(_WIN64) - , - ior_direct, env->me_overlapped_fd -#endif /* Windows */ - ); - if (rc == MDBX_SUCCESS) - adjust_defaults(env); + txn->mt_dbs = ptr_disp(txn, base); + txn->mt_cursors = + ptr_disp(txn->mt_dbs, env->me_maxdbs * sizeof(txn->mt_dbs[0])); + txn->mt_dbi_seqs = + ptr_disp(txn->mt_cursors, env->me_maxdbs * sizeof(txn->mt_cursors[0])); + txn->mt_dbi_state = + ptr_disp(txn, size - env->me_maxdbs * sizeof(txn->mt_dbi_state[0])); +#if MDBX_ENABLE_DBI_SPARSE + txn->mt_dbi_sparse = ptr_disp(txn->mt_dbi_state, -bitmap_bytes); +#endif /* MDBX_ENABLE_DBI_SPARSE */ + txn->mt_env = env; + txn->mt_flags = MDBX_TXN_FINISHED; + env->me_txn0 = txn; + txn->tw.retired_pages = pnl_alloc(MDBX_PNL_INITIAL); + txn->tw.relist = pnl_alloc(MDBX_PNL_INITIAL); + if (unlikely(!txn->tw.retired_pages || !txn->tw.relist)) { + rc = MDBX_ENOMEM; + goto bailout; + } + adjust_defaults(env); } -#if MDBX_DEBUG - if (rc == MDBX_SUCCESS) { - const meta_troika_t troika = meta_tap(env); - const meta_ptr_t head = meta_recent(env, &troika); - const MDBX_db *db = &head.ptr_c->mm_dbs[MAIN_DBI]; + rc = env_open(env, mode); + if (unlikely(rc != MDBX_SUCCESS)) + goto bailout; - DEBUG("opened database version %u, pagesize %u", - (uint8_t)unaligned_peek_u64(4, head.ptr_c->mm_magic_and_version), - env->me_psize); - DEBUG("using meta page %" PRIaPGNO ", txn %" PRIaTXN, - 
data_page(head.ptr_c)->mp_pgno, head.txnid); - DEBUG("depth: %u", db->md_depth); - DEBUG("entries: %" PRIu64, db->md_entries); - DEBUG("branch pages: %" PRIaPGNO, db->md_branch_pages); - DEBUG("leaf pages: %" PRIaPGNO, db->md_leaf_pages); - DEBUG("large/overflow pages: %" PRIaPGNO, db->md_overflow_pages); - DEBUG("root: %" PRIaPGNO, db->md_root); - DEBUG("schema_altered: %" PRIaTXN, db->md_mod_txnid); - } -#endif +#if MDBX_DEBUG + const meta_troika_t troika = meta_tap(env); + const meta_ptr_t head = meta_recent(env, &troika); + const MDBX_db *db = &head.ptr_c->mm_dbs[MAIN_DBI]; + + DEBUG("opened database version %u, pagesize %u", + (uint8_t)unaligned_peek_u64(4, head.ptr_c->mm_magic_and_version), + env->me_psize); + DEBUG("using meta page %" PRIaPGNO ", txn %" PRIaTXN, + data_page(head.ptr_c)->mp_pgno, head.txnid); + DEBUG("depth: %u", db->md_depth); + DEBUG("entries: %" PRIu64, db->md_entries); + DEBUG("branch pages: %" PRIaPGNO, db->md_branch_pages); + DEBUG("leaf pages: %" PRIaPGNO, db->md_leaf_pages); + DEBUG("large/overflow pages: %" PRIaPGNO, db->md_overflow_pages); + DEBUG("root: %" PRIaPGNO, db->md_root); + DEBUG("schema_altered: %" PRIaTXN, db->md_mod_txnid); +#endif /* MDBX_DEBUG */ -bailout: - if (rc != MDBX_SUCCESS) { - rc = env_close(env) ? MDBX_PANIC : rc; - env->me_flags = - saved_me_flags | ((rc != MDBX_PANIC) ? 
0 : MDBX_FATAL_ERROR); - } else { -#if defined(MDBX_USE_VALGRIND) || defined(__SANITIZE_ADDRESS__) + if (likely(rc == MDBX_SUCCESS)) { +#if defined(ENABLE_MEMCHECK) || defined(__SANITIZE_ADDRESS__) txn_valgrind(env, nullptr); -#endif /* MDBX_USE_VALGRIND || __SANITIZE_ADDRESS__ */ +#endif /* ENABLE_MEMCHECK || __SANITIZE_ADDRESS__ */ + } else { + bailout: + if (likely(env_close(env, false) == MDBX_SUCCESS)) { + env->me_flags = saved_me_flags; + } else { + rc = MDBX_PANIC; + env->me_flags = saved_me_flags | MDBX_FATAL_ERROR; + } } - osal_free(env_pathname.buffer_for_free); return rc; } /* Destroy resources from mdbx_env_open(), clear our readers & DBIs */ -__cold static int env_close(MDBX_env *env) { +__cold static int env_close(MDBX_env *env, bool resurrect_after_fork) { const unsigned flags = env->me_flags; - if (!(flags & MDBX_ENV_ACTIVE)) { - ENSURE(env, env->me_lcklist_next == nullptr); - return MDBX_SUCCESS; - } - env->me_flags &= ~ENV_INTERNAL_FLAGS; if (flags & MDBX_ENV_TXKEY) { - rthc_remove(env->me_txkey); - env->me_txkey = (osal_thread_key_t)0; + thread_key_delete(env->me_txkey); + env->me_txkey = 0; + } + + if (env->me_lck) + munlock_all(env); + + rthc_lock(); + int rc = rthc_remove(env); + rthc_unlock(); + +#if MDBX_ENABLE_DBI_LOCKFREE + for (struct mdbx_defer_free_item *next, *ptr = env->me_defer_free; ptr; + ptr = next) { + next = ptr->next; + osal_free(ptr); } + env->me_defer_free = nullptr; +#endif /* MDBX_ENABLE_DBI_LOCKFREE */ - munlock_all(env); if (!(env->me_flags & MDBX_RDONLY)) osal_ioring_destroy(&env->me_ioring); - lcklist_lock(); - const int rc = lcklist_detach_locked(env); - lcklist_unlock(); - env->me_lck = nullptr; if (env->me_lck_mmap.lck) osal_munmap(&env->me_lck_mmap); if (env->me_map) { osal_munmap(&env->me_dxb_mmap); -#ifdef MDBX_USE_VALGRIND +#ifdef ENABLE_MEMCHECK VALGRIND_DISCARD(env->me_valgrind_handle); env->me_valgrind_handle = -1; -#endif +#endif /* ENABLE_MEMCHECK */ } #if defined(_WIN32) || defined(_WIN64) @@ -19452,6 
+20015,11 @@ __cold static int env_close(MDBX_env *env) { CloseHandle(env->me_data_lock_event); env->me_data_lock_event = INVALID_HANDLE_VALUE; } + eASSERT(env, !resurrect_after_fork); + if (env->me_pathname_char) { + osal_free(env->me_pathname_char); + env->me_pathname_char = nullptr; + } #endif /* Windows */ if (env->me_dsync_fd != INVALID_HANDLE_VALUE) { @@ -19469,49 +20037,82 @@ __cold static int env_close(MDBX_env *env) { env->me_lfd = INVALID_HANDLE_VALUE; } - if (env->me_dbxs) { - for (size_t i = CORE_DBS; i < env->me_numdbs; ++i) - if (env->me_dbxs[i].md_name.iov_len) - osal_free(env->me_dbxs[i].md_name.iov_base); - osal_free(env->me_dbxs); - env->me_numdbs = CORE_DBS; - env->me_dbxs = nullptr; - } - if (env->me_pbuf) { - osal_memalign_free(env->me_pbuf); - env->me_pbuf = nullptr; - } - if (env->me_dbiseqs) { - osal_free(env->me_dbiseqs); - env->me_dbiseqs = nullptr; - } - if (env->me_dbflags) { - osal_free(env->me_dbflags); - env->me_dbflags = nullptr; - } - if (env->me_pathname) { - osal_free(env->me_pathname); - env->me_pathname = nullptr; - } -#if defined(_WIN32) || defined(_WIN64) - if (env->me_pathname_char) { - osal_free(env->me_pathname_char); - env->me_pathname_char = nullptr; - } -#endif /* Windows */ - if (env->me_txn0) { - dpl_free(env->me_txn0); - txl_free(env->me_txn0->tw.lifo_reclaimed); - pnl_free(env->me_txn0->tw.retired_pages); - pnl_free(env->me_txn0->tw.spilled.list); - pnl_free(env->me_txn0->tw.relist); - osal_free(env->me_txn0); - env->me_txn0 = nullptr; + if (!resurrect_after_fork) { + if (env->me_dbxs) { + for (size_t i = CORE_DBS; i < env->me_numdbs; ++i) + if (env->me_dbxs[i].md_name.iov_len) + osal_free(env->me_dbxs[i].md_name.iov_base); + osal_free(env->me_dbxs); + env->me_numdbs = CORE_DBS; + env->me_dbxs = nullptr; + } + if (env->me_pbuf) { + osal_memalign_free(env->me_pbuf); + env->me_pbuf = nullptr; + } + if (env->me_dbi_seqs) { + osal_free(env->me_dbi_seqs); + env->me_dbi_seqs = nullptr; + } + if (env->me_db_flags) { + 
osal_free(env->me_db_flags); + env->me_db_flags = nullptr; + } + if (env->me_pathname.buffer) { + osal_free(env->me_pathname.buffer); + env->me_pathname.buffer = nullptr; + } + if (env->me_txn0) { + dpl_free(env->me_txn0); + txl_free(env->me_txn0->tw.lifo_reclaimed); + pnl_free(env->me_txn0->tw.retired_pages); + pnl_free(env->me_txn0->tw.spilled.list); + pnl_free(env->me_txn0->tw.relist); + osal_free(env->me_txn0); + env->me_txn0 = nullptr; + } } env->me_stuck_meta = -1; return rc; } +#if !(defined(_WIN32) || defined(_WIN64)) +__cold int mdbx_env_resurrect_after_fork(MDBX_env *env) { + if (unlikely(!env)) + return MDBX_EINVAL; + + if (unlikely(env->me_signature.weak != MDBX_ME_SIGNATURE)) + return MDBX_EBADSIGN; + + if (unlikely(env->me_flags & MDBX_FATAL_ERROR)) + return MDBX_PANIC; + + if (unlikely((env->me_flags & MDBX_ENV_ACTIVE) == 0)) + return MDBX_SUCCESS; + + const uint32_t new_pid = osal_getpid(); + if (unlikely(env->me_pid == new_pid)) + return MDBX_SUCCESS; + + if (!atomic_cas32(&env->me_signature, MDBX_ME_SIGNATURE, ~MDBX_ME_SIGNATURE)) + return MDBX_EBADSIGN; + + if (env->me_txn) + txn_abort(env->me_txn0); + env->me_live_reader = 0; + int rc = env_close(env, true); + env->me_signature.weak = MDBX_ME_SIGNATURE; + if (likely(rc == MDBX_SUCCESS)) { + rc = (env->me_flags & MDBX_EXCLUSIVE) ? MDBX_BUSY : env_open(env, 0); + if (unlikely(rc != MDBX_SUCCESS && env_close(env, false) != MDBX_SUCCESS)) { + rc = MDBX_PANIC; + env->me_flags |= MDBX_FATAL_ERROR; + } + } + return rc; +} +#endif /* Windows */ + __cold int mdbx_env_close_ex(MDBX_env *env, bool dont_sync) { MDBX_page *dp; int rc = MDBX_SUCCESS; @@ -19563,8 +20164,11 @@ __cold int mdbx_env_close_ex(MDBX_env *env, bool dont_sync) { #endif /* Windows */ } + if (env->me_txn0 && env->me_txn0->mt_owner == osal_thread_self()) + osal_txn_unlock(env); + eASSERT(env, env->me_signature.weak == 0); - rc = env_close(env) ? MDBX_PANIC : rc; + rc = env_close(env, false) ? 
MDBX_PANIC : rc; ENSURE(env, osal_fastmutex_destroy(&env->me_dbi_lock) == MDBX_SUCCESS); #if defined(_WIN32) || defined(_WIN64) /* me_remap_guard don't have destructor (Slim Reader/Writer Lock) */ @@ -19575,7 +20179,8 @@ __cold int mdbx_env_close_ex(MDBX_env *env, bool dont_sync) { #if MDBX_LOCKING > MDBX_LOCKING_SYSV MDBX_lockinfo *const stub = lckless_stub(env); - ENSURE(env, osal_ipclock_destroy(&stub->mti_wlock) == 0); + /* может вернуть ошибку в дочернем процессе после fork() */ + osal_ipclock_destroy(&stub->mti_wlock); #endif /* MDBX_LOCKING */ while ((dp = env->me_dp_reserve) != NULL) { @@ -19586,8 +20191,6 @@ __cold int mdbx_env_close_ex(MDBX_env *env, bool dont_sync) { osal_free(ptr); } VALGRIND_DESTROY_MEMPOOL(env); - ENSURE(env, env->me_lcklist_next == nullptr); - env->me_pid = 0; osal_free(env); return rc; @@ -19929,21 +20532,22 @@ __hot __noinline static int page_search_root(MDBX_cursor *mc, return MDBX_SUCCESS; } -static int setup_dbx(MDBX_dbx *const dbx, const MDBX_db *const db, +static int setup_sdb(MDBX_dbx *const dbx, const MDBX_db *const db, const unsigned pagesize) { + if (unlikely(!db_check_flags(db->md_flags))) { + ERROR("incompatible or invalid db.md_flags (%u) ", db->md_flags); + return MDBX_INCOMPATIBLE; + } if (unlikely(!dbx->md_cmp)) { dbx->md_cmp = get_default_keycmp(db->md_flags); dbx->md_dcmp = get_default_datacmp(db->md_flags); } - dbx->md_klen_min = - (db->md_flags & MDBX_INTEGERKEY) ? 4 /* sizeof(uint32_t) */ : 0; + dbx->md_klen_min = keysize_min(db->md_flags); dbx->md_klen_max = keysize_max(pagesize, db->md_flags); assert(dbx->md_klen_max != (unsigned)-1); - dbx->md_vlen_min = (db->md_flags & MDBX_INTEGERDUP) - ? 4 /* sizeof(uint32_t) */ - : ((db->md_flags & MDBX_DUPFIXED) ? 
1 : 0); + dbx->md_vlen_min = valsize_min(db->md_flags); dbx->md_vlen_max = valsize_max(pagesize, db->md_flags); assert(dbx->md_vlen_max != (size_t)-1); @@ -19961,18 +20565,14 @@ static int setup_dbx(MDBX_dbx *const dbx, const MDBX_db *const db, static int fetch_sdb(MDBX_txn *txn, size_t dbi) { MDBX_cursor_couple couple; - if (unlikely(dbi_changed(txn, dbi))) { - NOTICE("dbi %zu was changed for txn %" PRIaTXN, dbi, txn->mt_txnid); - return MDBX_BAD_DBI; - } int rc = cursor_init(&couple.outer, txn, MAIN_DBI); if (unlikely(rc != MDBX_SUCCESS)) return rc; - MDBX_dbx *const dbx = &txn->mt_dbxs[dbi]; + MDBX_dbx *const dbx = &txn->mt_env->me_dbxs[dbi]; rc = page_search(&couple.outer, &dbx->md_name, 0); if (unlikely(rc != MDBX_SUCCESS)) { - notfound: + bailout: NOTICE("dbi %zu refs to inaccessible subDB `%*s` for txn %" PRIaTXN " (err %d)", dbi, (int)dbx->md_name.iov_len, (const char *)dbx->md_name.iov_base, @@ -19984,7 +20584,7 @@ static int fetch_sdb(MDBX_txn *txn, size_t dbi) { struct node_result nsr = node_search(&couple.outer, &dbx->md_name); if (unlikely(!nsr.exact)) { rc = MDBX_NOTFOUND; - goto notfound; + goto bailout; } if (unlikely((node_flags(nsr.node) & (F_DUPDATA | F_SUBDATA)) != F_SUBDATA)) { NOTICE("dbi %zu refs to not a named subDB `%*s` for txn %" PRIaTXN " (%s)", @@ -20027,11 +20627,11 @@ static int fetch_sdb(MDBX_txn *txn, size_t dbi) { return MDBX_CORRUPTED; } #endif /* !MDBX_DISABLE_VALIDATION */ - rc = setup_dbx(dbx, db, txn->mt_env->me_psize); + rc = setup_sdb(dbx, db, txn->mt_env->me_psize); if (unlikely(rc != MDBX_SUCCESS)) return rc; - txn->mt_dbistate[dbi] &= ~DBI_STALE; + txn->mt_dbi_state[dbi] &= ~DBI_STALE; return MDBX_SUCCESS; } @@ -20062,8 +20662,8 @@ __hot static int page_search_lowest(MDBX_cursor *mc) { * [in] key the key to search for, or NULL for first/last page. * [in] flags If MDBX_PS_MODIFY is set, visited pages in the DB * are touched (updated with new page numbers). 
- * If MDBX_PS_FIRST or MDBX_PS_LAST is set, find first or last - * leaf. + * If MDBX_PS_FIRST or MDBX_PS_LAST is set, + * find first or last leaf. * This is used by mdbx_cursor_first() and mdbx_cursor_last(). * If MDBX_PS_ROOTONLY set, just fetch root node, no further * lookups. @@ -20081,7 +20681,7 @@ __hot static int page_search(MDBX_cursor *mc, const MDBX_val *key, int flags) { } /* Make sure we're using an up-to-date root */ - if (unlikely(*mc->mc_dbistate & DBI_STALE)) { + if (unlikely(*mc->mc_dbi_state & DBI_STALE)) { rc = fetch_sdb(mc->mc_txn, mc->mc_dbi); if (unlikely(rc != MDBX_SUCCESS)) return rc; @@ -20105,7 +20705,7 @@ __hot static int page_search(MDBX_cursor *mc, const MDBX_val *key, int flags) { do if ((scan->mt_flags & MDBX_TXN_DIRTY) && (mc->mc_dbi == MAIN_DBI || - (scan->mt_dbistate[mc->mc_dbi] & DBI_DIRTY))) { + (scan->mt_dbi_state[mc->mc_dbi] & DBI_DIRTY))) { /* После коммита вложенных тразакций может быть mod_txnid > front */ pp_txnid = scan->mt_front; break; @@ -20181,9 +20781,6 @@ int mdbx_get(const MDBX_txn *txn, MDBX_dbi dbi, const MDBX_val *key, if (unlikely(!key || !data)) return MDBX_EINVAL; - if (unlikely(!check_dbi(txn, dbi, DBI_USRVALID))) - return MDBX_BAD_DBI; - MDBX_cursor_couple cx; rc = cursor_init(&cx.outer, txn, dbi); if (unlikely(rc != MDBX_SUCCESS)) @@ -20201,9 +20798,6 @@ int mdbx_get_equal_or_great(const MDBX_txn *txn, MDBX_dbi dbi, MDBX_val *key, if (unlikely(!key || !data)) return MDBX_EINVAL; - if (unlikely(!check_dbi(txn, dbi, DBI_USRVALID))) - return MDBX_BAD_DBI; - if (unlikely(txn->mt_flags & MDBX_TXN_BLOCKED)) return MDBX_BAD_TXN; @@ -20227,9 +20821,6 @@ int mdbx_get_ex(const MDBX_txn *txn, MDBX_dbi dbi, MDBX_val *key, if (unlikely(!key || !data)) return MDBX_EINVAL; - if (unlikely(!check_dbi(txn, dbi, DBI_USRVALID))) - return MDBX_BAD_DBI; - MDBX_cursor_couple cx; rc = cursor_init(&cx.outer, txn, dbi); if (unlikely(rc != MDBX_SUCCESS)) @@ -20323,39 +20914,41 @@ static int cursor_sibling(MDBX_cursor *mc, int dir) { 
/* Move the cursor to the next data item. */ static int cursor_next(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, MDBX_cursor_op op) { - MDBX_page *mp; - MDBX_node *node; + assert(op == MDBX_NEXT || op == MDBX_NEXT_DUP || op == MDBX_NEXT_NODUP); int rc; if (unlikely(mc->mc_flags & C_DEL) && op == MDBX_NEXT_DUP) return MDBX_NOTFOUND; - if (unlikely(!(mc->mc_flags & C_INITIALIZED))) + if (unlikely(!(mc->mc_flags & C_INITIALIZED))) { + if (unlikely(mc->mc_flags & C_SUB)) + return MDBX_NOTFOUND; return cursor_first(mc, key, data); + } - mp = mc->mc_pg[mc->mc_top]; + const MDBX_page *mp = mc->mc_pg[mc->mc_top]; if (unlikely(mc->mc_flags & C_EOF)) { if (mc->mc_ki[mc->mc_top] + (size_t)1 >= page_numkeys(mp)) return MDBX_NOTFOUND; mc->mc_flags ^= C_EOF; } - if (mc->mc_db->md_flags & MDBX_DUPSORT) { - node = page_node(mp, mc->mc_ki[mc->mc_top]); - if (node_flags(node) & F_DUPDATA) { - if (op == MDBX_NEXT || op == MDBX_NEXT_DUP) { + if (mc->mc_xcursor) { + if (op != MDBX_NEXT_NODUP) { + const MDBX_node *node = page_node(mp, mc->mc_ki[mc->mc_top]); + if (node_flags(node) & F_DUPDATA) { rc = cursor_next(&mc->mc_xcursor->mx_cursor, data, NULL, MDBX_NEXT); - if (op != MDBX_NEXT || rc != MDBX_NOTFOUND) { - if (likely(rc == MDBX_SUCCESS)) - get_key_optional(node, key); - return rc; + if (likely(rc == MDBX_SUCCESS)) { + get_key_optional(node, key); + return MDBX_SUCCESS; } + if (unlikely(rc != MDBX_NOTFOUND)) + return rc; } - } else { - mc->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED | C_EOF); - if (op == MDBX_NEXT_DUP) + if (op != MDBX_NEXT) return MDBX_NOTFOUND; } + mc->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED | C_EOF); } DEBUG("cursor_next: top page is %" PRIaPGNO " in cursor %p", mp->mp_pgno, @@ -20399,7 +20992,7 @@ static int cursor_next(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, return MDBX_SUCCESS; } - node = page_node(mp, mc->mc_ki[mc->mc_top]); + const MDBX_node *node = page_node(mp, mc->mc_ki[mc->mc_top]); if (node_flags(node) & F_DUPDATA) { rc = 
cursor_xinit1(mc, node, mp); if (unlikely(rc != MDBX_SUCCESS)) @@ -20420,40 +21013,41 @@ static int cursor_next(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, /* Move the cursor to the previous data item. */ static int cursor_prev(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, MDBX_cursor_op op) { - MDBX_page *mp; - MDBX_node *node; + assert(op == MDBX_PREV || op == MDBX_PREV_DUP || op == MDBX_PREV_NODUP); int rc; if (unlikely(mc->mc_flags & C_DEL) && op == MDBX_PREV_DUP) return MDBX_NOTFOUND; if (unlikely(!(mc->mc_flags & C_INITIALIZED))) { + if (unlikely(mc->mc_flags & C_SUB)) + return MDBX_NOTFOUND; rc = cursor_last(mc, key, data); - if (unlikely(rc)) + if (unlikely(rc != MDBX_SUCCESS)) return rc; mc->mc_ki[mc->mc_top]++; } - mp = mc->mc_pg[mc->mc_top]; - if ((mc->mc_db->md_flags & MDBX_DUPSORT) && - mc->mc_ki[mc->mc_top] < page_numkeys(mp)) { - node = page_node(mp, mc->mc_ki[mc->mc_top]); - if (node_flags(node) & F_DUPDATA) { - if (op == MDBX_PREV || op == MDBX_PREV_DUP) { - rc = cursor_prev(&mc->mc_xcursor->mx_cursor, data, NULL, MDBX_PREV); - if (op != MDBX_PREV || rc != MDBX_NOTFOUND) { + const MDBX_page *mp = mc->mc_pg[mc->mc_top]; + if (mc->mc_xcursor) { + if (op != MDBX_PREV_NODUP) { + if (likely(mc->mc_ki[mc->mc_top] < page_numkeys(mp))) { + const MDBX_node *node = page_node(mp, mc->mc_ki[mc->mc_top]); + if (node_flags(node) & F_DUPDATA) { + rc = cursor_prev(&mc->mc_xcursor->mx_cursor, data, NULL, MDBX_PREV); if (likely(rc == MDBX_SUCCESS)) { get_key_optional(node, key); mc->mc_flags &= ~C_EOF; + return MDBX_SUCCESS; } - return rc; + if (unlikely(rc != MDBX_NOTFOUND)) + return rc; } } - } else { - mc->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED | C_EOF); - if (op == MDBX_PREV_DUP) + if (op != MDBX_PREV) return MDBX_NOTFOUND; } + mc->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED | C_EOF); } DEBUG("cursor_prev: top page is %" PRIaPGNO " in cursor %p", mp->mp_pgno, @@ -20489,8 +21083,7 @@ static int cursor_prev(MDBX_cursor *mc, MDBX_val *key, 
MDBX_val *data, return MDBX_SUCCESS; } - node = page_node(mp, mc->mc_ki[mc->mc_top]); - + const MDBX_node *node = page_node(mp, mc->mc_ki[mc->mc_top]); if (node_flags(node) & F_DUPDATA) { rc = cursor_xinit1(mc, node, mp); if (unlikely(rc != MDBX_SUCCESS)) @@ -20525,7 +21118,7 @@ cursor_set(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, MDBX_cursor_op op) { } MDBX_val aligned_key = *key; - uint64_t aligned_keybytes; + uint64_t aligned_key_buf; if (mc->mc_db->md_flags & MDBX_INTEGERKEY) { switch (aligned_key.iov_len) { default: @@ -20536,13 +21129,13 @@ cursor_set(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, MDBX_cursor_op op) { if (unlikely(3 & (uintptr_t)aligned_key.iov_base)) /* copy instead of return error to avoid break compatibility */ aligned_key.iov_base = - memcpy(&aligned_keybytes, aligned_key.iov_base, 4); + memcpy(&aligned_key_buf, aligned_key.iov_base, 4); break; case 8: if (unlikely(7 & (uintptr_t)aligned_key.iov_base)) /* copy instead of return error to avoid break compatibility */ aligned_key.iov_base = - memcpy(&aligned_keybytes, aligned_key.iov_base, 8); + memcpy(&aligned_key_buf, aligned_key.iov_base, 8); break; } } @@ -20581,7 +21174,7 @@ cursor_set(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, MDBX_cursor_op op) { } if (cmp > 0) { const size_t nkeys = page_numkeys(mp); - if (nkeys > 1) { + if (likely(nkeys > 1)) { if (IS_LEAF2(mp)) { nodekey.iov_base = page_leaf2key(mp, nkeys - 1, nodekey.iov_len); } else { @@ -20600,8 +21193,9 @@ cursor_set(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, MDBX_cursor_op op) { goto got_node; } if (cmp < 0) { - if (mc->mc_ki[mc->mc_top] < page_numkeys(mp)) { - /* This is definitely the right page, skip search_page */ + /* This is definitely the right page, skip search_page */ + if (mc->mc_ki[mc->mc_top] != 0 /* уже проверяли выше */ && + mc->mc_ki[mc->mc_top] < page_numkeys(mp)) { if (IS_LEAF2(mp)) { nodekey.iov_base = page_leaf2key(mp, mc->mc_ki[mc->mc_top], nodekey.iov_len); @@ -20625,23 +21219,22 @@ 
cursor_set(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, MDBX_cursor_op op) { } /* If any parents have right-sibs, search. * Otherwise, there's nothing further. */ - size_t i; - for (i = 0; i < mc->mc_top; i++) + for (size_t i = 0; i < mc->mc_top; i++) if (mc->mc_ki[i] < page_numkeys(mc->mc_pg[i]) - 1) - break; - if (i == mc->mc_top) { - /* There are no other pages */ - cASSERT(mc, nkeys <= UINT16_MAX); - mc->mc_ki[mc->mc_top] = (uint16_t)nkeys; - mc->mc_flags |= C_EOF; - ret.err = MDBX_NOTFOUND; - return ret; - } + goto continue_other_pages; + + /* There are no other pages */ + cASSERT(mc, nkeys <= UINT16_MAX); + mc->mc_ki[mc->mc_top] = (uint16_t)nkeys; + mc->mc_flags |= C_EOF; + ret.err = MDBX_NOTFOUND; + return ret; } + continue_other_pages: if (!mc->mc_top) { /* There are no other pages */ mc->mc_ki[mc->mc_top] = 0; - if (op == MDBX_SET_RANGE) + if (op >= MDBX_SET_RANGE) goto got_node; cASSERT(mc, mc->mc_ki[mc->mc_top] < page_numkeys(mc->mc_pg[mc->mc_top]) || @@ -20666,7 +21259,7 @@ search_node:; node = nsr.node; ret.exact = nsr.exact; if (!ret.exact) { - if (op != MDBX_SET_RANGE) { + if (op < MDBX_SET_RANGE) { /* MDBX_SET specified and not an exact match. 
*/ if (unlikely(mc->mc_ki[mc->mc_top] >= page_numkeys(mc->mc_pg[mc->mc_top]))) @@ -20703,7 +21296,7 @@ search_node:; } if (IS_LEAF2(mp)) { - if (op == MDBX_SET_RANGE || op == MDBX_SET_KEY) { + if (op >= MDBX_SET_KEY) { key->iov_len = mc->mc_db->md_xsize; key->iov_base = page_leaf2key(mp, mc->mc_ki[mc->mc_top], key->iov_len); } @@ -20715,7 +21308,7 @@ search_node:; ret.err = cursor_xinit1(mc, node, mp); if (unlikely(ret.err != MDBX_SUCCESS)) return ret; - if (op == MDBX_SET || op == MDBX_SET_KEY || op == MDBX_SET_RANGE) { + if (op >= MDBX_SET) { MDBX_ANALYSIS_ASSUME(mc->mc_xcursor != nullptr); ret.err = cursor_first(&mc->mc_xcursor->mx_cursor, data, NULL); if (unlikely(ret.err != MDBX_SUCCESS)) @@ -20731,7 +21324,7 @@ search_node:; } } } else if (likely(data)) { - if (op == MDBX_GET_BOTH || op == MDBX_GET_BOTH_RANGE) { + if (op <= MDBX_GET_BOTH_RANGE) { if (unlikely(data->iov_len < mc->mc_dbx->md_vlen_min || data->iov_len > mc->mc_dbx->md_vlen_max)) { cASSERT(mc, !"Invalid data-size"); @@ -20783,7 +21376,7 @@ search_node:; } /* The key already matches in all other cases */ - if (op == MDBX_SET_RANGE || op == MDBX_SET_KEY) + if (op >= MDBX_SET_KEY) get_key_optional(node, key); DEBUG("==> cursor placed on key [%s], data [%s]", DKEY_DEBUG(key), @@ -20942,6 +21535,8 @@ static __hot int cursor_get(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, return rc; } } else { + cASSERT(mc, !mc->mc_xcursor || !(mc->mc_xcursor->mx_cursor.mc_flags & + C_INITIALIZED)); rc = node_read(mc, node, data, mp); if (unlikely(rc)) return rc; @@ -21068,6 +21663,7 @@ static __hot int cursor_get(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, case MDBX_LAST_DUP: mfunc = cursor_last; goto move; + case MDBX_SET_UPPERBOUND: /* mostly same as MDBX_SET_LOWERBOUND */ case MDBX_SET_LOWERBOUND: { if (unlikely(key == NULL || data == NULL)) @@ -21111,6 +21707,153 @@ static __hot int cursor_get(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, } break; } + + /* Doubtless API to positioning of the cursor at a 
specified key. */ + case MDBX_TO_KEY_LESSER_THAN: + case MDBX_TO_KEY_LESSER_OR_EQUAL: + case MDBX_TO_KEY_EQUAL: + case MDBX_TO_KEY_GREATER_OR_EQUAL: + case MDBX_TO_KEY_GREATER_THAN: { + if (unlikely(key == NULL)) + return MDBX_EINVAL; + struct cursor_set_result csr = cursor_set(mc, key, data, MDBX_SET_RANGE); + rc = csr.err; + if (csr.exact) { + cASSERT(mc, csr.err == MDBX_SUCCESS); + if (op == MDBX_TO_KEY_LESSER_THAN) + rc = cursor_prev(mc, key, data, MDBX_PREV_NODUP); + else if (op == MDBX_TO_KEY_GREATER_THAN) + rc = cursor_next(mc, key, data, MDBX_NEXT_NODUP); + } else if (op < MDBX_TO_KEY_EQUAL && + (rc == MDBX_NOTFOUND || rc == MDBX_SUCCESS)) + rc = cursor_prev(mc, key, data, MDBX_PREV_NODUP); + else if (op == MDBX_TO_KEY_EQUAL && rc == MDBX_SUCCESS) + rc = MDBX_NOTFOUND; + break; + } + + /* Doubtless API to positioning of the cursor at a specified key-value pair + * for multi-value hives. */ + case MDBX_TO_EXACT_KEY_VALUE_LESSER_THAN: + case MDBX_TO_EXACT_KEY_VALUE_LESSER_OR_EQUAL: + case MDBX_TO_EXACT_KEY_VALUE_EQUAL: + case MDBX_TO_EXACT_KEY_VALUE_GREATER_OR_EQUAL: + case MDBX_TO_EXACT_KEY_VALUE_GREATER_THAN: { + if (unlikely(key == NULL || data == NULL)) + return MDBX_EINVAL; + MDBX_val save_data = *data; + struct cursor_set_result csr = cursor_set(mc, key, data, MDBX_SET_KEY); + rc = csr.err; + if (rc == MDBX_SUCCESS) { + cASSERT(mc, csr.exact); + MDBX_cursor *const mx = + (mc->mc_xcursor && + (mc->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED)) + ? 
&mc->mc_xcursor->mx_cursor + : nullptr; + if (mx) { + csr = cursor_set(mx, &save_data, NULL, MDBX_SET_RANGE); + rc = csr.err; + if (csr.exact) { + cASSERT(mc, csr.err == MDBX_SUCCESS); + if (op == MDBX_TO_EXACT_KEY_VALUE_LESSER_THAN) + rc = cursor_prev(mx, data, NULL, MDBX_PREV); + else if (op == MDBX_TO_EXACT_KEY_VALUE_GREATER_THAN) + rc = cursor_next(mx, data, NULL, MDBX_NEXT); + } else if (op < MDBX_TO_EXACT_KEY_VALUE_EQUAL && + (rc == MDBX_NOTFOUND || rc == MDBX_SUCCESS)) + rc = cursor_prev(mx, data, NULL, MDBX_PREV); + else if (op == MDBX_TO_EXACT_KEY_VALUE_EQUAL && rc == MDBX_SUCCESS) + rc = MDBX_NOTFOUND; + } else { + int cmp = mc->mc_dbx->md_dcmp(data, &save_data); + switch (op) { + default: + __unreachable(); + case MDBX_TO_EXACT_KEY_VALUE_LESSER_THAN: + rc = (cmp < 0) ? MDBX_SUCCESS : MDBX_NOTFOUND; + break; + case MDBX_TO_EXACT_KEY_VALUE_LESSER_OR_EQUAL: + rc = (cmp <= 0) ? MDBX_SUCCESS : MDBX_NOTFOUND; + break; + case MDBX_TO_EXACT_KEY_VALUE_EQUAL: + rc = (cmp == 0) ? MDBX_SUCCESS : MDBX_NOTFOUND; + break; + case MDBX_TO_EXACT_KEY_VALUE_GREATER_OR_EQUAL: + rc = (cmp >= 0) ? MDBX_SUCCESS : MDBX_NOTFOUND; + break; + case MDBX_TO_EXACT_KEY_VALUE_GREATER_THAN: + rc = (cmp > 0) ? MDBX_SUCCESS : MDBX_NOTFOUND; + break; + } + } + } + break; + } + case MDBX_TO_PAIR_LESSER_THAN: + case MDBX_TO_PAIR_LESSER_OR_EQUAL: + case MDBX_TO_PAIR_EQUAL: + case MDBX_TO_PAIR_GREATER_OR_EQUAL: + case MDBX_TO_PAIR_GREATER_THAN: { + if (unlikely(key == NULL || data == NULL)) + return MDBX_EINVAL; + MDBX_val save_data = *data; + struct cursor_set_result csr = cursor_set(mc, key, data, MDBX_SET_RANGE); + rc = csr.err; + if (csr.exact) { + cASSERT(mc, csr.err == MDBX_SUCCESS); + MDBX_cursor *const mx = + (mc->mc_xcursor && + (mc->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED)) + ? 
&mc->mc_xcursor->mx_cursor + : nullptr; + if (mx) { + csr = cursor_set(mx, &save_data, NULL, MDBX_SET_RANGE); + rc = csr.err; + if (csr.exact) { + cASSERT(mc, csr.err == MDBX_SUCCESS); + if (op == MDBX_TO_PAIR_LESSER_THAN) + rc = cursor_prev(mc, key, data, MDBX_PREV); + else if (op == MDBX_TO_PAIR_GREATER_THAN) + rc = cursor_next(mc, key, data, MDBX_NEXT); + } else if (op < MDBX_TO_PAIR_EQUAL && + (rc == MDBX_NOTFOUND || rc == MDBX_SUCCESS)) + rc = cursor_prev(mc, key, data, MDBX_PREV); + else if (op == MDBX_TO_PAIR_EQUAL && rc == MDBX_SUCCESS) + rc = MDBX_NOTFOUND; + else if (op > MDBX_TO_PAIR_EQUAL && rc == MDBX_NOTFOUND) + rc = cursor_next(mc, key, data, MDBX_NEXT); + } else { + int cmp = mc->mc_dbx->md_dcmp(data, &save_data); + switch (op) { + default: + __unreachable(); + case MDBX_TO_PAIR_LESSER_THAN: + rc = (cmp < 0) ? MDBX_SUCCESS : cursor_prev(mc, key, data, MDBX_PREV); + break; + case MDBX_TO_PAIR_LESSER_OR_EQUAL: + rc = + (cmp <= 0) ? MDBX_SUCCESS : cursor_prev(mc, key, data, MDBX_PREV); + break; + case MDBX_TO_PAIR_EQUAL: + rc = (cmp == 0) ? MDBX_SUCCESS : MDBX_NOTFOUND; + break; + case MDBX_TO_PAIR_GREATER_OR_EQUAL: + rc = + (cmp >= 0) ? MDBX_SUCCESS : cursor_next(mc, key, data, MDBX_NEXT); + break; + case MDBX_TO_PAIR_GREATER_THAN: + rc = (cmp > 0) ? 
MDBX_SUCCESS : cursor_next(mc, key, data, MDBX_NEXT); + break; + } + } + } else if (op < MDBX_TO_PAIR_EQUAL && + (rc == MDBX_NOTFOUND || rc == MDBX_SUCCESS)) + rc = cursor_prev(mc, key, data, MDBX_PREV_NODUP); + else if (op == MDBX_TO_PAIR_EQUAL && rc == MDBX_SUCCESS) + rc = MDBX_NOTFOUND; + break; + } default: DEBUG("unhandled/unimplemented cursor operation %u", op); return MDBX_EINVAL; @@ -21136,6 +21879,80 @@ int mdbx_cursor_get(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, return cursor_get(mc, key, data, op); } +int mdbx_cursor_scan(MDBX_cursor *mc, MDBX_predicate_func *predicate, + void *context, MDBX_cursor_op start_op, + MDBX_cursor_op turn_op, void *arg) { + if (unlikely(!predicate)) + return MDBX_EINVAL; + + const unsigned valid_start_mask = + 1 << MDBX_FIRST | 1 << MDBX_FIRST_DUP | 1 << MDBX_LAST | + 1 << MDBX_LAST_DUP | 1 << MDBX_GET_CURRENT | 1 << MDBX_GET_MULTIPLE; + if (unlikely(start_op > 30 || ((1 << start_op) & valid_start_mask) == 0)) + return MDBX_EINVAL; + + const unsigned valid_turn_mask = + 1 << MDBX_NEXT | 1 << MDBX_NEXT_DUP | 1 << MDBX_NEXT_NODUP | + 1 << MDBX_PREV | 1 << MDBX_PREV_DUP | 1 << MDBX_PREV_NODUP | + 1 << MDBX_NEXT_MULTIPLE | 1 << MDBX_PREV_MULTIPLE; + if (unlikely(turn_op > 30 || ((1 << turn_op) & valid_turn_mask) == 0)) + return MDBX_EINVAL; + + MDBX_val key = {nullptr, 0}, data = {nullptr, 0}; + int rc = mdbx_cursor_get(mc, &key, &data, start_op); + while (likely(rc == MDBX_SUCCESS)) { + rc = predicate(context, &key, &data, arg); + if (rc != MDBX_RESULT_FALSE) + return rc; + rc = cursor_get(mc, &key, &data, turn_op); + } + return (rc == MDBX_NOTFOUND) ? 
MDBX_RESULT_FALSE : rc; +} + +int mdbx_cursor_scan_from(MDBX_cursor *mc, MDBX_predicate_func *predicate, + void *context, MDBX_cursor_op from_op, MDBX_val *key, + MDBX_val *value, MDBX_cursor_op turn_op, void *arg) { + if (unlikely(!predicate)) + return MDBX_EINVAL; + + const unsigned valid_start_mask = + 1 << MDBX_GET_BOTH | 1 << MDBX_GET_BOTH_RANGE | 1 << MDBX_SET_KEY | + 1 << MDBX_GET_MULTIPLE | 1 << MDBX_SET_LOWERBOUND | + 1 << MDBX_SET_UPPERBOUND; + ; + if (unlikely(from_op < MDBX_TO_KEY_LESSER_THAN && + ((1 << from_op) & valid_start_mask) == 0)) + return MDBX_EINVAL; + + const unsigned valid_turn_mask = + 1 << MDBX_NEXT | 1 << MDBX_NEXT_DUP | 1 << MDBX_NEXT_NODUP | + 1 << MDBX_PREV | 1 << MDBX_PREV_DUP | 1 << MDBX_PREV_NODUP | + 1 << MDBX_NEXT_MULTIPLE | 1 << MDBX_PREV_MULTIPLE; + if (unlikely(turn_op > 30 || ((1 << turn_op) & valid_turn_mask) == 0)) + return MDBX_EINVAL; + + int rc = mdbx_cursor_get(mc, key, value, from_op); + if (unlikely(MDBX_IS_ERROR(rc))) + return rc; + + cASSERT(mc, key != nullptr); + MDBX_val stub; + if (!value) { + value = &stub; + rc = cursor_get(mc, key, value, MDBX_GET_CURRENT); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; + } + for (;;) { + rc = predicate(context, key, value, arg); + if (rc != MDBX_RESULT_FALSE) + return rc; + rc = cursor_get(mc, key, value, turn_op); + if (rc != MDBX_SUCCESS) + return (rc == MDBX_NOTFOUND) ? 
MDBX_RESULT_FALSE : rc; + } +} + static int cursor_first_batch(MDBX_cursor *mc) { if (!(mc->mc_flags & C_INITIALIZED) || mc->mc_top) { int err = page_search(mc, NULL, MDBX_PS_FIRST); @@ -21262,16 +22079,19 @@ int mdbx_cursor_get_batch(MDBX_cursor *mc, size_t *count, MDBX_val *pairs, } static int touch_dbi(MDBX_cursor *mc) { - cASSERT(mc, (*mc->mc_dbistate & DBI_DIRTY) == 0); - *mc->mc_dbistate |= DBI_DIRTY; + cASSERT(mc, (*mc->mc_dbi_state & DBI_DIRTY) == 0); + *mc->mc_dbi_state |= DBI_DIRTY; mc->mc_txn->mt_flags |= MDBX_TXN_DIRTY; if (mc->mc_dbi >= CORE_DBS) { /* Touch DB record of named DB */ MDBX_cursor_couple cx; - int rc = cursor_init(&cx.outer, mc->mc_txn, MAIN_DBI); + int rc = dbi_check(mc->mc_txn, MAIN_DBI); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; + rc = cursor_init(&cx.outer, mc->mc_txn, MAIN_DBI); if (unlikely(rc != MDBX_SUCCESS)) return rc; - mc->mc_txn->mt_dbistate[MAIN_DBI] |= DBI_DIRTY; + mc->mc_txn->mt_dbi_state[MAIN_DBI] |= DBI_DIRTY; rc = page_search(&cx.outer, &mc->mc_dbx->md_name, MDBX_PS_MODIFY); if (unlikely(rc != MDBX_SUCCESS)) return rc; @@ -21285,11 +22105,13 @@ static __hot int cursor_touch(MDBX_cursor *const mc, const MDBX_val *key, cASSERT(mc, (mc->mc_flags & C_INITIALIZED) || mc->mc_snum == 0); cASSERT(mc, cursor_is_tracked(mc)); + cASSERT(mc, F_ISSET(dbi_state(mc->mc_txn, FREE_DBI), DBI_LINDO | DBI_VALID)); + cASSERT(mc, F_ISSET(dbi_state(mc->mc_txn, MAIN_DBI), DBI_LINDO | DBI_VALID)); if ((mc->mc_flags & C_SUB) == 0) { MDBX_txn *const txn = mc->mc_txn; txn_lru_turn(txn); - if (unlikely((*mc->mc_dbistate & DBI_DIRTY) == 0)) { + if (unlikely((*mc->mc_dbi_state & DBI_DIRTY) == 0)) { int err = touch_dbi(mc); if (unlikely(err != MDBX_SUCCESS)) return err; @@ -21326,7 +22148,8 @@ static __hot int cursor_touch(MDBX_cursor *const mc, const MDBX_val *key, } int rc = MDBX_SUCCESS; - if (likely(mc->mc_snum)) { + if (likely(mc->mc_snum) && + !IS_MODIFIABLE(mc->mc_txn, mc->mc_pg[mc->mc_snum - 1])) { mc->mc_top = 0; do { rc = 
page_touch(mc); @@ -22456,7 +23279,7 @@ static pgr_t page_new(MDBX_cursor *mc, const unsigned flags) { DEBUG("db %u allocated new page %" PRIaPGNO, mc->mc_dbi, ret.page->mp_pgno); ret.page->mp_flags = (uint16_t)flags; - cASSERT(mc, *mc->mc_dbistate & DBI_DIRTY); + cASSERT(mc, *mc->mc_dbi_state & DBI_DIRTY); cASSERT(mc, mc->mc_txn->mt_flags & MDBX_TXN_DIRTY); #if MDBX_ENABLE_PGOP_STAT mc->mc_txn->mt_env->me_lck->mti_pgop_stat.newly.weak += 1; @@ -22487,7 +23310,7 @@ static pgr_t page_new_large(MDBX_cursor *mc, const size_t npages) { DEBUG("db %u allocated new large-page %" PRIaPGNO ", num %zu", mc->mc_dbi, ret.page->mp_pgno, npages); ret.page->mp_flags = P_OVERFLOW; - cASSERT(mc, *mc->mc_dbistate & DBI_DIRTY); + cASSERT(mc, *mc->mc_dbi_state & DBI_DIRTY); cASSERT(mc, mc->mc_txn->mt_flags & MDBX_TXN_DIRTY); #if MDBX_ENABLE_PGOP_STAT mc->mc_txn->mt_env->me_lck->mti_pgop_stat.newly.weak += npages; @@ -22809,7 +23632,7 @@ static int cursor_xinit0(MDBX_cursor *mc) { mx->mx_cursor.mc_db = &mx->mx_db; mx->mx_cursor.mc_dbx = &mx->mx_dbx; mx->mx_cursor.mc_dbi = mc->mc_dbi; - mx->mx_cursor.mc_dbistate = mc->mc_dbistate; + mx->mx_cursor.mc_dbi_state = mc->mc_dbi_state; mx->mx_cursor.mc_snum = 0; mx->mx_cursor.mc_top = 0; mx->mx_cursor.mc_flags = C_SUB; @@ -22831,7 +23654,7 @@ static int cursor_xinit0(MDBX_cursor *mc) { * [in] mc The main cursor whose sorted-dups cursor is to be initialized. * [in] node The data containing the MDBX_db record for the sorted-dup database. 
*/ -static int cursor_xinit1(MDBX_cursor *mc, MDBX_node *node, +static int cursor_xinit1(MDBX_cursor *mc, const MDBX_node *node, const MDBX_page *mp) { MDBX_xcursor *mx = mc->mc_xcursor; if (!MDBX_DISABLE_VALIDATION && unlikely(mx == nullptr)) { @@ -22848,7 +23671,8 @@ static int cursor_xinit1(MDBX_cursor *mc, MDBX_node *node, case F_DUPDATA | F_SUBDATA: if (!MDBX_DISABLE_VALIDATION && unlikely(node_ds(node) != sizeof(MDBX_db))) { - ERROR("invalid nested-db record size %zu", node_ds(node)); + ERROR("invalid nested-db record size (%zu, expect %zu)", node_ds(node), + sizeof(MDBX_db)); return MDBX_CORRUPTED; } memcpy(&mx->mx_db, node_data(node), sizeof(MDBX_db)); @@ -22955,7 +23779,8 @@ static int cursor_xinit2(MDBX_cursor *mc, MDBX_xcursor *src_mx, static __inline int couple_init(MDBX_cursor_couple *couple, const size_t dbi, const MDBX_txn *const txn, MDBX_db *const db, - MDBX_dbx *const dbx, uint8_t *const dbstate) { + MDBX_dbx *const dbx, uint8_t *const dbi_state) { + tASSERT(txn, F_ISSET(*dbi_state, DBI_VALID | DBI_LINDO)); couple->outer.mc_signature = MDBX_MC_LIVE; couple->outer.mc_next = NULL; couple->outer.mc_backup = NULL; @@ -22963,7 +23788,7 @@ static __inline int couple_init(MDBX_cursor_couple *couple, const size_t dbi, couple->outer.mc_txn = (MDBX_txn *)txn; couple->outer.mc_db = db; couple->outer.mc_dbx = dbx; - couple->outer.mc_dbistate = dbstate; + couple->outer.mc_dbi_state = dbi_state; couple->outer.mc_snum = 0; couple->outer.mc_top = 0; couple->outer.mc_pg[0] = 0; @@ -22978,11 +23803,11 @@ static __inline int couple_init(MDBX_cursor_couple *couple, const size_t dbi, couple->outer.mc_xcursor = NULL; int rc = MDBX_SUCCESS; - if (unlikely(*couple->outer.mc_dbistate & DBI_STALE)) { + if (unlikely(*couple->outer.mc_dbi_state & DBI_STALE)) { rc = page_search(&couple->outer, NULL, MDBX_PS_ROOTONLY); rc = (rc != MDBX_NOTFOUND) ? 
rc : MDBX_SUCCESS; } else if (unlikely(dbx->md_klen_max == 0)) { - rc = setup_dbx(dbx, db, txn->mt_env->me_psize); + rc = setup_sdb(dbx, db, txn->mt_env->me_psize); } if (couple->outer.mc_db->md_flags & MDBX_DUPSORT) { @@ -23000,9 +23825,12 @@ static __inline int couple_init(MDBX_cursor_couple *couple, const size_t dbi, /* Initialize a cursor for a given transaction and database. */ static int cursor_init(MDBX_cursor *mc, const MDBX_txn *txn, size_t dbi) { STATIC_ASSERT(offsetof(MDBX_cursor_couple, outer) == 0); - return couple_init(container_of(mc, MDBX_cursor_couple, outer), dbi, txn, - &txn->mt_dbs[dbi], &txn->mt_dbxs[dbi], - &txn->mt_dbistate[dbi]); + int rc = dbi_check(txn, dbi); + if (likely(rc == MDBX_SUCCESS)) + rc = couple_init(container_of(mc, MDBX_cursor_couple, outer), dbi, txn, + &txn->mt_dbs[dbi], &txn->mt_env->me_dbxs[dbi], + &txn->mt_dbi_state[dbi]); + return rc; } MDBX_cursor *mdbx_cursor_create(void *context) { @@ -23041,6 +23869,38 @@ void *mdbx_cursor_get_userctx(const MDBX_cursor *mc) { return couple->mc_userctx; } +int mdbx_cursor_unbind(MDBX_cursor *mc) { + if (unlikely(!mc)) + return MDBX_EINVAL; + + if (unlikely(mc->mc_signature != MDBX_MC_LIVE)) + return (mc->mc_signature == MDBX_MC_READY4CLOSE) ? MDBX_SUCCESS + : MDBX_EBADSIGN; + + if (unlikely(mc->mc_backup)) /* Cursor from parent transaction */ + return MDBX_EINVAL; + + eASSERT(nullptr, mc->mc_txn && mc->mc_txn->mt_signature == MDBX_MT_SIGNATURE); + cASSERT(mc, mc->mc_signature == MDBX_MC_LIVE); + cASSERT(mc, !mc->mc_backup); + if (unlikely(!mc->mc_txn || mc->mc_txn->mt_signature != MDBX_MT_SIGNATURE)) { + ERROR("Wrong cursor's transaction %p 0x%x", + __Wpedantic_format_voidptr(mc->mc_txn), + mc->mc_txn ? 
mc->mc_txn->mt_signature : 0); + return MDBX_PROBLEM; + } + if (mc->mc_flags & C_UNTRACK) { + MDBX_cursor **prev = &mc->mc_txn->mt_cursors[mc->mc_dbi]; + while (*prev && *prev != mc) + prev = &(*prev)->mc_next; + cASSERT(mc, *prev == mc); + *prev = mc->mc_next; + } + mc->mc_signature = MDBX_MC_READY4CLOSE; + mc->mc_flags = 0; + return MDBX_SUCCESS; +} + int mdbx_cursor_bind(const MDBX_txn *txn, MDBX_cursor *mc, MDBX_dbi dbi) { if (unlikely(!mc)) return MDBX_EINVAL; @@ -23053,8 +23913,9 @@ int mdbx_cursor_bind(const MDBX_txn *txn, MDBX_cursor *mc, MDBX_dbi dbi) { if (unlikely(rc != MDBX_SUCCESS)) return rc; - if (unlikely(!check_dbi(txn, dbi, DBI_VALID))) - return MDBX_BAD_DBI; + rc = dbi_check(txn, dbi); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; if (unlikely(dbi == FREE_DBI && !(txn->mt_flags & MDBX_TXN_RDONLY))) return MDBX_EACCESS; @@ -23066,10 +23927,10 @@ int mdbx_cursor_bind(const MDBX_txn *txn, MDBX_cursor *mc, MDBX_dbi dbi) { mc->mc_txn != txn)) return MDBX_EINVAL; - assert(mc->mc_db == &txn->mt_dbs[dbi]); - assert(mc->mc_dbx == &txn->mt_dbxs[dbi]); - assert(mc->mc_dbi == dbi); - assert(mc->mc_dbistate == &txn->mt_dbistate[dbi]); + cASSERT(mc, mc->mc_db == &txn->mt_dbs[dbi]); + cASSERT(mc, mc->mc_dbx == &txn->mt_env->me_dbxs[dbi]); + cASSERT(mc, mc->mc_dbi == dbi); + cASSERT(mc, mc->mc_dbi_state == &txn->mt_dbi_state[dbi]); return likely(mc->mc_dbi == dbi && /* paranoia */ mc->mc_signature == MDBX_MC_LIVE && mc->mc_txn == txn) @@ -23078,27 +23939,9 @@ int mdbx_cursor_bind(const MDBX_txn *txn, MDBX_cursor *mc, MDBX_dbi dbi) { } if (mc->mc_signature == MDBX_MC_LIVE) { - if (unlikely(!mc->mc_txn || - mc->mc_txn->mt_signature != MDBX_MT_SIGNATURE)) { - ERROR("Wrong cursor's transaction %p 0x%x", - __Wpedantic_format_voidptr(mc->mc_txn), - mc->mc_txn ? 
mc->mc_txn->mt_signature : 0); - return MDBX_PROBLEM; - } - if (mc->mc_flags & C_UNTRACK) { - MDBX_cursor **prev = &mc->mc_txn->mt_cursors[mc->mc_dbi]; - while (*prev && *prev != mc) - prev = &(*prev)->mc_next; - cASSERT(mc, *prev == mc); - *prev = mc->mc_next; - } - mc->mc_signature = MDBX_MC_READY4CLOSE; - mc->mc_flags = 0; - mc->mc_dbi = UINT_MAX; - mc->mc_next = NULL; - mc->mc_db = NULL; - mc->mc_dbx = NULL; - mc->mc_dbistate = NULL; + rc = mdbx_cursor_unbind(mc); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; } cASSERT(mc, !(mc->mc_flags & C_UNTRACK)); @@ -23136,6 +23979,83 @@ int mdbx_cursor_renew(const MDBX_txn *txn, MDBX_cursor *mc) { return likely(mc) ? mdbx_cursor_bind(txn, mc, mc->mc_dbi) : MDBX_EINVAL; } +int mdbx_cursor_compare(const MDBX_cursor *l, const MDBX_cursor *r, + bool ignore_multival) { + const int incomparable = INT16_MAX + 1; + if (unlikely(!l)) + return r ? -incomparable * 9 : 0; + else if (unlikely(!r)) + return incomparable * 9; + + if (unlikely(l->mc_signature != MDBX_MC_LIVE)) + return (r->mc_signature == MDBX_MC_LIVE) ? -incomparable * 8 : 0; + if (unlikely(r->mc_signature != MDBX_MC_LIVE)) + return (l->mc_signature == MDBX_MC_LIVE) ? incomparable * 8 : 0; + + if (unlikely(l->mc_dbx != r->mc_dbx)) { + if (l->mc_txn->mt_env != r->mc_txn->mt_env) + return (l->mc_txn->mt_env > r->mc_txn->mt_env) ? incomparable * 7 + : -incomparable * 7; + if (l->mc_txn->mt_txnid != r->mc_txn->mt_txnid) + return (l->mc_txn->mt_txnid > r->mc_txn->mt_txnid) ? incomparable * 6 + : -incomparable * 6; + return (l->mc_dbx > r->mc_dbx) ? incomparable * 5 : -incomparable * 5; + } + assert(l->mc_dbi == r->mc_dbi); + + int diff = (l->mc_flags & C_INITIALIZED) - (l->mc_flags & C_INITIALIZED); + if (unlikely(diff)) + return (diff > 0) ? incomparable * 4 : -incomparable * 4; + if (unlikely((l->mc_flags & C_INITIALIZED) == 0)) + return 0; + + size_t detent = (l->mc_snum <= r->mc_snum) ? 
l->mc_snum : r->mc_snum; + for (size_t i = 0; i < detent; ++i) { + diff = l->mc_ki[i] - r->mc_ki[i]; + if (diff) + return diff; + } + if (unlikely(l->mc_snum != r->mc_snum)) + return (l->mc_snum > r->mc_snum) ? incomparable * 3 : -incomparable * 3; + + assert((l->mc_xcursor != nullptr) == (r->mc_xcursor != nullptr)); + if (unlikely((l->mc_xcursor != nullptr) != (r->mc_xcursor != nullptr))) + return l->mc_xcursor ? incomparable * 2 : -incomparable * 2; + if (ignore_multival || !l->mc_xcursor) + return 0; + +#if MDBX_DEBUG + if (l->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED) { + const MDBX_page *mp = l->mc_pg[l->mc_top]; + const MDBX_node *node = page_node(mp, l->mc_ki[l->mc_top]); + assert(node_flags(node) & F_DUPDATA); + } + if (l->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED) { + const MDBX_page *mp = r->mc_pg[r->mc_top]; + const MDBX_node *node = page_node(mp, r->mc_ki[r->mc_top]); + assert(node_flags(node) & F_DUPDATA); + } +#endif /* MDBX_DEBUG */ + + l = &l->mc_xcursor->mx_cursor; + r = &r->mc_xcursor->mx_cursor; + diff = (l->mc_flags & C_INITIALIZED) - (l->mc_flags & C_INITIALIZED); + if (unlikely(diff)) + return (diff > 0) ? incomparable * 2 : -incomparable * 2; + if (unlikely((l->mc_flags & C_INITIALIZED) == 0)) + return 0; + + detent = (l->mc_snum <= r->mc_snum) ? l->mc_snum : r->mc_snum; + for (size_t i = 0; i < detent; ++i) { + diff = l->mc_ki[i] - r->mc_ki[i]; + if (diff) + return diff; + } + if (unlikely(l->mc_snum != r->mc_snum)) + return (l->mc_snum > r->mc_snum) ? 
incomparable : -incomparable; + return 0; +} + int mdbx_cursor_copy(const MDBX_cursor *src, MDBX_cursor *dest) { if (unlikely(!src)) return MDBX_EINVAL; @@ -23150,7 +24070,7 @@ int mdbx_cursor_copy(const MDBX_cursor *src, MDBX_cursor *dest) { assert(dest->mc_db == src->mc_db); assert(dest->mc_dbi == src->mc_dbi); assert(dest->mc_dbx == src->mc_dbx); - assert(dest->mc_dbistate == src->mc_dbistate); + assert(dest->mc_dbi_state == src->mc_dbi_state); again: assert(dest->mc_txn == src->mc_txn); dest->mc_flags ^= (dest->mc_flags ^ src->mc_flags) & ~C_UNTRACK; @@ -23200,6 +24120,32 @@ void mdbx_cursor_close(MDBX_cursor *mc) { } } +int mdbx_txn_release_all_cursors(const MDBX_txn *txn, bool unbind) { + int rc = check_txn(txn, MDBX_TXN_FINISHED | MDBX_TXN_HAS_CHILD); + if (likely(rc == MDBX_SUCCESS)) { + TXN_FOREACH_DBI_FROM(txn, i, MAIN_DBI) { + while (txn->mt_cursors[i]) { + MDBX_cursor *mc = txn->mt_cursors[i]; + ENSURE(NULL, mc->mc_signature == MDBX_MC_LIVE && + (mc->mc_flags & C_UNTRACK) && !mc->mc_backup); + rc = likely(rc < INT_MAX) ? 
rc + 1 : rc; + txn->mt_cursors[i] = mc->mc_next; + if (unbind) { + mc->mc_signature = MDBX_MC_READY4CLOSE; + mc->mc_flags = 0; + } else { + mc->mc_signature = 0; + mc->mc_next = mc; + osal_free(mc); + } + } + } + } else { + eASSERT(nullptr, rc < 0); + } + return rc; +} + MDBX_txn *mdbx_cursor_txn(const MDBX_cursor *mc) { if (unlikely(!mc || mc->mc_signature != MDBX_MC_LIVE)) return NULL; @@ -23654,6 +24600,8 @@ static int page_merge(MDBX_cursor *csrc, MDBX_cursor *cdst) { IS_LEAF(cdst->mc_pg[cdst->mc_db->md_depth - 1])); cASSERT(csrc, csrc->mc_snum < csrc->mc_db->md_depth || IS_LEAF(csrc->mc_pg[csrc->mc_db->md_depth - 1])); + cASSERT(cdst, csrc->mc_txn->mt_env->me_options.prefer_waf_insteadof_balance || + page_room(pdst) >= page_used(cdst->mc_txn->mt_env, psrc)); const int pagetype = PAGETYPE_WHOLE(psrc); /* Move all nodes from src to dst */ @@ -23890,7 +24838,7 @@ static void cursor_restore(const MDBX_cursor *csrc, MDBX_cursor *cdst) { cASSERT(cdst, cdst->mc_txn == csrc->mc_txn); cASSERT(cdst, cdst->mc_db == csrc->mc_db); cASSERT(cdst, cdst->mc_dbx == csrc->mc_dbx); - cASSERT(cdst, cdst->mc_dbistate == csrc->mc_dbistate); + cASSERT(cdst, cdst->mc_dbi_state == csrc->mc_dbi_state); cdst->mc_snum = csrc->mc_snum; cdst->mc_top = csrc->mc_top; cdst->mc_flags = csrc->mc_flags; @@ -23915,7 +24863,7 @@ static void cursor_copy(const MDBX_cursor *csrc, MDBX_cursor *cdst) { cdst->mc_txn = csrc->mc_txn; cdst->mc_db = csrc->mc_db; cdst->mc_dbx = csrc->mc_dbx; - cdst->mc_dbistate = csrc->mc_dbistate; + cdst->mc_dbi_state = csrc->mc_dbi_state; cursor_restore(csrc, cdst); } @@ -23973,7 +24921,7 @@ static int rebalance(MDBX_cursor *mc) { if (nkeys == 0) { cASSERT(mc, IS_LEAF(mp)); DEBUG("%s", "tree is completely empty"); - cASSERT(mc, (*mc->mc_dbistate & DBI_DIRTY) != 0); + cASSERT(mc, (*mc->mc_dbi_state & DBI_DIRTY) != 0); mc->mc_db->md_root = P_INVALID; mc->mc_db->md_depth = 0; cASSERT(mc, mc->mc_db->md_branch_pages == 0 && @@ -24154,14 +25102,11 @@ static int 
rebalance(MDBX_cursor *mc) { return MDBX_SUCCESS; } - /* Заглушено в ветке v0.12.x, будет работать в v0.13.1 и далее. - * - * if (mc->mc_txn->mt_env->me_options.prefer_waf_insteadof_balance && - * likely(room_threshold > 0)) { - * room_threshold = 0; - * goto retry; - * } - */ + if (mc->mc_txn->mt_env->me_options.prefer_waf_insteadof_balance && + likely(room_threshold > 0)) { + room_threshold = 0; + goto retry; + } if (likely(!involve) && (likely(mc->mc_dbi != FREE_DBI) || mc->mc_txn->tw.loose_pages || MDBX_PNL_GETSIZE(mc->mc_txn->tw.relist) || (mc->mc_flags & C_GCU) || @@ -24453,7 +25398,8 @@ __cold static int page_check(const MDBX_cursor *const mc, break; case F_SUBDATA | F_DUPDATA /* dupsorted sub-tree */: if (unlikely(dsize != sizeof(MDBX_db))) { - rc = bad_page(mp, "invalid nested-db record size (%zu)\n", dsize); + rc = bad_page(mp, "invalid nested-db record size (%zu, expect %zu)\n", + dsize, sizeof(MDBX_db)); continue; } break; @@ -24658,7 +25604,7 @@ int mdbx_del(MDBX_txn *txn, MDBX_dbi dbi, const MDBX_val *key, if (unlikely(!key)) return MDBX_EINVAL; - if (unlikely(!check_dbi(txn, dbi, DBI_USRVALID))) + if (unlikely(dbi <= FREE_DBI)) return MDBX_BAD_DBI; if (unlikely(txn->mt_flags & (MDBX_TXN_RDONLY | MDBX_TXN_BLOCKED))) @@ -25347,7 +26293,7 @@ int mdbx_put(MDBX_txn *txn, MDBX_dbi dbi, const MDBX_val *key, MDBX_val *data, if (unlikely(!key || !data)) return MDBX_EINVAL; - if (unlikely(!check_dbi(txn, dbi, DBI_USRVALID))) + if (unlikely(dbi <= FREE_DBI)) return MDBX_BAD_DBI; if (unlikely(flags & ~(MDBX_NOOVERWRITE | MDBX_NODUPDATA | MDBX_ALLDUPS | @@ -25602,6 +26548,9 @@ __cold static int compacting_walk_tree(mdbx_compacting_ctx *ctx, } else if (node_flags(node) & F_SUBDATA) { if (!MDBX_DISABLE_VALIDATION && unlikely(node_ds(node) != sizeof(MDBX_db))) { + ERROR("%s/%d: %s %u", "MDBX_CORRUPTED", MDBX_CORRUPTED, + "invalid dupsort sub-tree node size", + (unsigned)node_ds(node)); rc = MDBX_CORRUPTED; goto done; } @@ -25701,8 +26650,8 @@ __cold static int 
compacting_walk_sdb(mdbx_compacting_ctx *ctx, MDBX_db *sdb) { memset(&couple, 0, sizeof(couple)); couple.inner.mx_cursor.mc_signature = ~MDBX_MC_LIVE; MDBX_dbx dbx = {.md_klen_min = INT_MAX}; - uint8_t dbistate = DBI_VALID | DBI_AUDITED; - int rc = couple_init(&couple, ~0u, ctx->mc_txn, sdb, &dbx, &dbistate); + uint8_t dbi_state = DBI_LINDO | DBI_VALID; + int rc = couple_init(&couple, ~0u, ctx->mc_txn, sdb, &dbx, &dbi_state); if (unlikely(rc != MDBX_SUCCESS)) return rc; @@ -25799,9 +26748,16 @@ __cold static int env_compact(MDBX_env *env, MDBX_txn *read_txn, MDBX_SUCCESS) { const MDBX_PNL pnl = data.iov_base; if (unlikely(data.iov_len % sizeof(pgno_t) || - data.iov_len < MDBX_PNL_SIZEOF(pnl) || - !(pnl_check(pnl, read_txn->mt_next_pgno)))) + data.iov_len < MDBX_PNL_SIZEOF(pnl))) { + ERROR("%s/%d: %s %zu", "MDBX_CORRUPTED", MDBX_CORRUPTED, + "invalid GC-record length", data.iov_len); + return MDBX_CORRUPTED; + } + if (unlikely(!pnl_check(pnl, read_txn->mt_next_pgno))) { + ERROR("%s/%d: %s", "MDBX_CORRUPTED", MDBX_CORRUPTED, + "invalid GC-record content"); return MDBX_CORRUPTED; + } gc += MDBX_PNL_GETSIZE(pnl); } if (unlikely(rc != MDBX_NOTFOUND)) @@ -25909,18 +26865,18 @@ __cold static int env_copy_asis(MDBX_env *env, MDBX_txn *read_txn, const bool dest_is_pipe, const MDBX_copy_flags_t flags) { /* We must start the actual read txn after blocking writers */ - int rc = txn_end(read_txn, MDBX_END_RESET_TMP); + int rc = txn_end(read_txn, TXN_END_RESET_TMP); if (unlikely(rc != MDBX_SUCCESS)) return rc; /* Temporarily block writers until we snapshot the meta pages */ - rc = mdbx_txn_lock(env, false); + rc = osal_txn_lock(env, false); if (unlikely(rc != MDBX_SUCCESS)) return rc; rc = txn_renew(read_txn, MDBX_TXN_RDONLY); if (unlikely(rc != MDBX_SUCCESS)) { - mdbx_txn_unlock(env); + osal_txn_unlock(env); return rc; } @@ -25932,7 +26888,7 @@ __cold static int env_copy_asis(MDBX_env *env, MDBX_txn *read_txn, memcpy(buffer, env->me_map, meta_bytes); MDBX_meta *const headcopy = 
/* LY: get pointer to the snapshot copy */ ptr_disp(buffer, ptr_dist(meta_recent(env, &troika).ptr_c, env->me_map)); - mdbx_txn_unlock(env); + osal_txn_unlock(env); if (flags & MDBX_CP_FORCE_DYNAMIC_SIZE) meta_make_sizeable(headcopy); @@ -26184,15 +27140,11 @@ __cold int mdbx_env_set_flags(MDBX_env *env, MDBX_env_flags_t flags, if (unlikely(env->me_flags & MDBX_RDONLY)) return MDBX_EACCESS; - if ((env->me_flags & MDBX_ENV_ACTIVE) && - unlikely(env->me_txn0->mt_owner == osal_thread_self())) - return MDBX_BUSY; - - const bool lock_needed = (env->me_flags & MDBX_ENV_ACTIVE) && - env->me_txn0->mt_owner != osal_thread_self(); + const bool lock_needed = + (env->me_flags & MDBX_ENV_ACTIVE) && !env_txn0_owned(env); bool should_unlock = false; if (lock_needed) { - rc = mdbx_txn_lock(env, false); + rc = osal_txn_lock(env, false); if (unlikely(rc)) return rc; should_unlock = true; @@ -26204,7 +27156,7 @@ __cold int mdbx_env_set_flags(MDBX_env *env, MDBX_env_flags_t flags, env->me_flags &= ~flags; if (should_unlock) - mdbx_txn_unlock(env); + osal_txn_unlock(env); return MDBX_SUCCESS; } @@ -26256,7 +27208,7 @@ __cold int mdbx_env_get_pathW(const MDBX_env *env, const wchar_t **arg) { if (unlikely(!arg)) return MDBX_EINVAL; - *arg = env->me_pathname; + *arg = env->me_pathname.specified; return MDBX_SUCCESS; } #endif /* Windows */ @@ -26273,12 +27225,14 @@ __cold int mdbx_env_get_path(const MDBX_env *env, const char **arg) { if (!env->me_pathname_char) { *arg = nullptr; DWORD flags = /* WC_ERR_INVALID_CHARS */ 0x80; - size_t mb_len = WideCharToMultiByte(CP_THREAD_ACP, flags, env->me_pathname, - -1, nullptr, 0, nullptr, nullptr); + size_t mb_len = + WideCharToMultiByte(CP_THREAD_ACP, flags, env->me_pathname.specified, + -1, nullptr, 0, nullptr, nullptr); rc = mb_len ? 
MDBX_SUCCESS : (int)GetLastError(); if (rc == ERROR_INVALID_FLAGS) { - mb_len = WideCharToMultiByte(CP_THREAD_ACP, flags = 0, env->me_pathname, - -1, nullptr, 0, nullptr, nullptr); + mb_len = WideCharToMultiByte(CP_THREAD_ACP, flags = 0, + env->me_pathname.specified, -1, nullptr, 0, + nullptr, nullptr); rc = mb_len ? MDBX_SUCCESS : (int)GetLastError(); } if (unlikely(rc != MDBX_SUCCESS)) @@ -26287,9 +27241,9 @@ __cold int mdbx_env_get_path(const MDBX_env *env, const char **arg) { char *const mb_pathname = osal_malloc(mb_len); if (!mb_pathname) return MDBX_ENOMEM; - if (mb_len != (size_t)WideCharToMultiByte(CP_THREAD_ACP, flags, - env->me_pathname, -1, mb_pathname, - (int)mb_len, nullptr, nullptr)) { + if (mb_len != (size_t)WideCharToMultiByte( + CP_THREAD_ACP, flags, env->me_pathname.specified, -1, + mb_pathname, (int)mb_len, nullptr, nullptr)) { rc = (int)GetLastError(); osal_free(mb_pathname); return rc; @@ -26301,7 +27255,7 @@ __cold int mdbx_env_get_path(const MDBX_env *env, const char **arg) { } *arg = env->me_pathname_char; #else - *arg = env->me_pathname; + *arg = env->me_pathname.specified; #endif /* Windows */ return MDBX_SUCCESS; } @@ -26347,26 +27301,22 @@ __cold static int stat_acc(const MDBX_txn *txn, MDBX_stat *st, size_t bytes) { if (unlikely(err != MDBX_SUCCESS)) return err; - st->ms_psize = txn->mt_env->me_psize; -#if 1 - /* assuming GC is internal and not subject for accounting */ - stat_get(&txn->mt_dbs[MAIN_DBI], st, bytes); -#else - stat_get(&txn->mt_dbs[FREE_DBI], st, bytes); - stat_add(&txn->mt_dbs[MAIN_DBI], st, bytes); -#endif + MDBX_cursor_couple cx; + err = cursor_init(&cx.outer, (MDBX_txn *)txn, MAIN_DBI); + if (unlikely(err != MDBX_SUCCESS)) + return err; - /* account opened named subDBs */ - for (MDBX_dbi dbi = CORE_DBS; dbi < txn->mt_numdbs; dbi++) - if ((txn->mt_dbistate[dbi] & (DBI_VALID | DBI_STALE)) == DBI_VALID) + const MDBX_env *const env = txn->mt_env; + st->ms_psize = env->me_psize; + TXN_FOREACH_DBI_FROM( + txn, dbi, + /* 
assuming GC is internal and not subject for accounting */ MAIN_DBI) { + if ((txn->mt_dbi_state[dbi] & (DBI_VALID | DBI_STALE)) == DBI_VALID) stat_add(txn->mt_dbs + dbi, st, bytes); + } - if (!(txn->mt_dbs[MAIN_DBI].md_flags & (MDBX_DUPSORT | MDBX_INTEGERKEY)) && + if (!(txn->mt_dbs[MAIN_DBI].md_flags & MDBX_DUPSORT) && txn->mt_dbs[MAIN_DBI].md_entries /* TODO: use `md_subs` field */) { - MDBX_cursor_couple cx; - err = cursor_init(&cx.outer, (MDBX_txn *)txn, MAIN_DBI); - if (unlikely(err != MDBX_SUCCESS)) - return err; /* scan and account not opened named subDBs */ err = page_search(&cx.outer, NULL, MDBX_PS_FIRST); @@ -26376,18 +27326,22 @@ __cold static int stat_acc(const MDBX_txn *txn, MDBX_stat *st, size_t bytes) { const MDBX_node *node = page_node(mp, i); if (node_flags(node) != F_SUBDATA) continue; - if (unlikely(node_ds(node) != sizeof(MDBX_db))) + if (unlikely(node_ds(node) != sizeof(MDBX_db))) { + ERROR("%s/%d: %s %zu", "MDBX_CORRUPTED", MDBX_CORRUPTED, + "invalid subDb node size", node_ds(node)); return MDBX_CORRUPTED; + } /* skip opened and already accounted */ - for (MDBX_dbi dbi = CORE_DBS; dbi < txn->mt_numdbs; dbi++) - if ((txn->mt_dbistate[dbi] & (DBI_VALID | DBI_STALE)) == DBI_VALID && - node_ks(node) == txn->mt_dbxs[dbi].md_name.iov_len && - memcmp(node_key(node), txn->mt_dbxs[dbi].md_name.iov_base, - node_ks(node)) == 0) { + const MDBX_val name = {node_key(node), node_ks(node)}; + TXN_FOREACH_DBI_USER(txn, dbi) { + if ((txn->mt_dbi_state[dbi] & (DBI_VALID | DBI_STALE)) == DBI_VALID && + env->me_dbxs[MAIN_DBI].md_cmp(&name, + &env->me_dbxs[dbi].md_name) == 0) { node = NULL; break; } + } if (node) { MDBX_db db; @@ -26422,7 +27376,7 @@ __cold int mdbx_env_stat_ex(const MDBX_env *env, const MDBX_txn *txn, if (unlikely(err != MDBX_SUCCESS)) return err; - if (env->me_txn0 && env->me_txn0->mt_owner == osal_thread_self()) + if (env->me_txn && env_txn0_owned(env)) /* inside write-txn */ return stat_acc(env->me_txn, dest, bytes); @@ -26447,9 +27401,6 @@ 
__cold int mdbx_dbi_dupsort_depthmask(const MDBX_txn *txn, MDBX_dbi dbi, if (unlikely(!mask)) return MDBX_EINVAL; - if (unlikely(!check_dbi(txn, dbi, DBI_VALID))) - return MDBX_BAD_DBI; - MDBX_cursor_couple cx; rc = cursor_init(&cx.outer, txn, dbi); if (unlikely(rc != MDBX_SUCCESS)) @@ -26480,7 +27431,8 @@ __cold int mdbx_dbi_dupsort_depthmask(const MDBX_txn *txn, MDBX_dbi dbi, *mask |= 1 << UNALIGNED_PEEK_16(db, MDBX_db, md_depth); break; default: - ERROR("wrong node-flags %u", flags); + ERROR("%s/%d: %s %u", "MDBX_CORRUPTED", MDBX_CORRUPTED, + "invalid node-size", flags); return MDBX_CORRUPTED; } rc = cursor_next(&cx.outer, &key, &data, MDBX_NEXT_NODUP); @@ -26489,11 +27441,13 @@ __cold int mdbx_dbi_dupsort_depthmask(const MDBX_txn *txn, MDBX_dbi dbi, return (rc == MDBX_NOTFOUND) ? MDBX_SUCCESS : rc; } -__cold static int fetch_envinfo_ex(const MDBX_env *env, const MDBX_txn *txn, - MDBX_envinfo *arg, const size_t bytes) { - +__cold static int env_info_snap(const MDBX_env *env, const MDBX_txn *txn, + MDBX_envinfo *out, const size_t bytes, + meta_troika_t *const troika) { const size_t size_before_bootid = offsetof(MDBX_envinfo, mi_bootid); const size_t size_before_pgop_stat = offsetof(MDBX_envinfo, mi_pgop_stat); + if (unlikely(env->me_flags & MDBX_FATAL_ERROR)) + return MDBX_PANIC; /* is the environment open? 
* (https://libmdbx.dqdkfa.ru/dead-github/issues/171) */ @@ -26501,18 +27455,18 @@ __cold static int fetch_envinfo_ex(const MDBX_env *env, const MDBX_txn *txn, /* environment not yet opened */ #if 1 /* default behavior: returns the available info but zeroed the rest */ - memset(arg, 0, bytes); - arg->mi_geo.lower = env->me_dbgeo.lower; - arg->mi_geo.upper = env->me_dbgeo.upper; - arg->mi_geo.shrink = env->me_dbgeo.shrink; - arg->mi_geo.grow = env->me_dbgeo.grow; - arg->mi_geo.current = env->me_dbgeo.now; - arg->mi_maxreaders = env->me_maxreaders; - arg->mi_dxb_pagesize = env->me_psize; - arg->mi_sys_pagesize = env->me_os_psize; + memset(out, 0, bytes); + out->mi_geo.lower = env->me_dbgeo.lower; + out->mi_geo.upper = env->me_dbgeo.upper; + out->mi_geo.shrink = env->me_dbgeo.shrink; + out->mi_geo.grow = env->me_dbgeo.grow; + out->mi_geo.current = env->me_dbgeo.now; + out->mi_maxreaders = env->me_maxreaders; + out->mi_dxb_pagesize = env->me_psize; + out->mi_sys_pagesize = env->me_os_psize; if (likely(bytes > size_before_bootid)) { - arg->mi_bootid.current.x = bootid.x; - arg->mi_bootid.current.y = bootid.y; + out->mi_bootid.current.x = bootid.x; + out->mi_bootid.current.y = bootid.y; } return MDBX_SUCCESS; #else @@ -26521,142 +27475,243 @@ __cold static int fetch_envinfo_ex(const MDBX_env *env, const MDBX_txn *txn, #endif } + *troika = (txn && !(txn->mt_flags & MDBX_TXN_RDONLY)) ? 
txn->tw.troika + : meta_tap(env); + const meta_ptr_t head = meta_recent(env, troika); const MDBX_meta *const meta0 = METAPAGE(env, 0); const MDBX_meta *const meta1 = METAPAGE(env, 1); const MDBX_meta *const meta2 = METAPAGE(env, 2); - if (unlikely(env->me_flags & MDBX_FATAL_ERROR)) - return MDBX_PANIC; - - meta_troika_t holder; - meta_troika_t const *troika; - if (txn && !(txn->mt_flags & MDBX_TXN_RDONLY)) - troika = &txn->tw.troika; - else { - holder = meta_tap(env); - troika = &holder; - } - - const meta_ptr_t head = meta_recent(env, troika); - arg->mi_recent_txnid = head.txnid; - arg->mi_meta0_txnid = troika->txnid[0]; - arg->mi_meta0_sign = unaligned_peek_u64(4, meta0->mm_sign); - arg->mi_meta1_txnid = troika->txnid[1]; - arg->mi_meta1_sign = unaligned_peek_u64(4, meta1->mm_sign); - arg->mi_meta2_txnid = troika->txnid[2]; - arg->mi_meta2_sign = unaligned_peek_u64(4, meta2->mm_sign); + out->mi_recent_txnid = head.txnid; + out->mi_meta_txnid[0] = troika->txnid[0]; + out->mi_meta_sign[0] = unaligned_peek_u64(4, meta0->mm_sign); + out->mi_meta_txnid[1] = troika->txnid[1]; + out->mi_meta_sign[1] = unaligned_peek_u64(4, meta1->mm_sign); + out->mi_meta_txnid[2] = troika->txnid[2]; + out->mi_meta_sign[2] = unaligned_peek_u64(4, meta2->mm_sign); if (likely(bytes > size_before_bootid)) { - memcpy(&arg->mi_bootid.meta0, &meta0->mm_bootid, 16); - memcpy(&arg->mi_bootid.meta1, &meta1->mm_bootid, 16); - memcpy(&arg->mi_bootid.meta2, &meta2->mm_bootid, 16); + memcpy(&out->mi_bootid.meta[0], &meta0->mm_bootid, 16); + memcpy(&out->mi_bootid.meta[1], &meta1->mm_bootid, 16); + memcpy(&out->mi_bootid.meta[2], &meta2->mm_bootid, 16); } const volatile MDBX_meta *txn_meta = head.ptr_v; - arg->mi_last_pgno = txn_meta->mm_geo.next - 1; - arg->mi_geo.current = pgno2bytes(env, txn_meta->mm_geo.now); + out->mi_last_pgno = txn_meta->mm_geo.next - 1; + out->mi_geo.current = pgno2bytes(env, txn_meta->mm_geo.now); if (txn) { - arg->mi_last_pgno = txn->mt_next_pgno - 1; - arg->mi_geo.current = 
pgno2bytes(env, txn->mt_end_pgno); + out->mi_last_pgno = txn->mt_next_pgno - 1; + out->mi_geo.current = pgno2bytes(env, txn->mt_end_pgno); const txnid_t wanna_meta_txnid = (txn->mt_flags & MDBX_TXN_RDONLY) ? txn->mt_txnid : txn->mt_txnid - xMDBX_TXNID_STEP; - txn_meta = (arg->mi_meta0_txnid == wanna_meta_txnid) ? meta0 : txn_meta; - txn_meta = (arg->mi_meta1_txnid == wanna_meta_txnid) ? meta1 : txn_meta; - txn_meta = (arg->mi_meta2_txnid == wanna_meta_txnid) ? meta2 : txn_meta; - } - arg->mi_geo.lower = pgno2bytes(env, txn_meta->mm_geo.lower); - arg->mi_geo.upper = pgno2bytes(env, txn_meta->mm_geo.upper); - arg->mi_geo.shrink = pgno2bytes(env, pv2pages(txn_meta->mm_geo.shrink_pv)); - arg->mi_geo.grow = pgno2bytes(env, pv2pages(txn_meta->mm_geo.grow_pv)); - const uint64_t unsynced_pages = - atomic_load64(&env->me_lck->mti_unsynced_pages, mo_Relaxed) + - (atomic_load32(&env->me_lck->mti_meta_sync_txnid, mo_Relaxed) != - (uint32_t)arg->mi_recent_txnid); - - arg->mi_mapsize = env->me_dxb_mmap.limit; + txn_meta = (out->mi_meta_txnid[0] == wanna_meta_txnid) ? meta0 : txn_meta; + txn_meta = (out->mi_meta_txnid[1] == wanna_meta_txnid) ? meta1 : txn_meta; + txn_meta = (out->mi_meta_txnid[2] == wanna_meta_txnid) ? meta2 : txn_meta; + } + out->mi_geo.lower = pgno2bytes(env, txn_meta->mm_geo.lower); + out->mi_geo.upper = pgno2bytes(env, txn_meta->mm_geo.upper); + out->mi_geo.shrink = pgno2bytes(env, pv2pages(txn_meta->mm_geo.shrink_pv)); + out->mi_geo.grow = pgno2bytes(env, pv2pages(txn_meta->mm_geo.grow_pv)); + out->mi_mapsize = env->me_dxb_mmap.limit; const MDBX_lockinfo *const lck = env->me_lck; - arg->mi_maxreaders = env->me_maxreaders; - arg->mi_numreaders = env->me_lck_mmap.lck + out->mi_maxreaders = env->me_maxreaders; + out->mi_numreaders = env->me_lck_mmap.lck ? 
atomic_load32(&lck->mti_numreaders, mo_Relaxed) : INT32_MAX; - arg->mi_dxb_pagesize = env->me_psize; - arg->mi_sys_pagesize = env->me_os_psize; + out->mi_dxb_pagesize = env->me_psize; + out->mi_sys_pagesize = env->me_os_psize; if (likely(bytes > size_before_bootid)) { - arg->mi_unsync_volume = pgno2bytes(env, (size_t)unsynced_pages); + const uint64_t unsynced_pages = + atomic_load64(&lck->mti_unsynced_pages, mo_Relaxed) + + ((uint32_t)out->mi_recent_txnid != + atomic_load32(&lck->mti_meta_sync_txnid, mo_Relaxed)); + out->mi_unsync_volume = pgno2bytes(env, (size_t)unsynced_pages); const uint64_t monotime_now = osal_monotime(); uint64_t ts = atomic_load64(&lck->mti_eoos_timestamp, mo_Relaxed); - arg->mi_since_sync_seconds16dot16 = + out->mi_since_sync_seconds16dot16 = ts ? osal_monotime_to_16dot16_noUnderflow(monotime_now - ts) : 0; ts = atomic_load64(&lck->mti_reader_check_timestamp, mo_Relaxed); - arg->mi_since_reader_check_seconds16dot16 = + out->mi_since_reader_check_seconds16dot16 = ts ? osal_monotime_to_16dot16_noUnderflow(monotime_now - ts) : 0; - arg->mi_autosync_threshold = pgno2bytes( + out->mi_autosync_threshold = pgno2bytes( env, atomic_load32(&lck->mti_autosync_threshold, mo_Relaxed)); - arg->mi_autosync_period_seconds16dot16 = + out->mi_autosync_period_seconds16dot16 = osal_monotime_to_16dot16_noUnderflow( atomic_load64(&lck->mti_autosync_period, mo_Relaxed)); - arg->mi_bootid.current.x = bootid.x; - arg->mi_bootid.current.y = bootid.y; - arg->mi_mode = env->me_lck_mmap.lck ? lck->mti_envmode.weak : env->me_flags; + out->mi_bootid.current.x = bootid.x; + out->mi_bootid.current.y = bootid.y; + out->mi_mode = env->me_lck_mmap.lck ? 
lck->mti_envmode.weak : env->me_flags; } if (likely(bytes > size_before_pgop_stat)) { #if MDBX_ENABLE_PGOP_STAT - arg->mi_pgop_stat.newly = + out->mi_pgop_stat.newly = atomic_load64(&lck->mti_pgop_stat.newly, mo_Relaxed); - arg->mi_pgop_stat.cow = atomic_load64(&lck->mti_pgop_stat.cow, mo_Relaxed); - arg->mi_pgop_stat.clone = + out->mi_pgop_stat.cow = atomic_load64(&lck->mti_pgop_stat.cow, mo_Relaxed); + out->mi_pgop_stat.clone = atomic_load64(&lck->mti_pgop_stat.clone, mo_Relaxed); - arg->mi_pgop_stat.split = + out->mi_pgop_stat.split = atomic_load64(&lck->mti_pgop_stat.split, mo_Relaxed); - arg->mi_pgop_stat.merge = + out->mi_pgop_stat.merge = atomic_load64(&lck->mti_pgop_stat.merge, mo_Relaxed); - arg->mi_pgop_stat.spill = + out->mi_pgop_stat.spill = atomic_load64(&lck->mti_pgop_stat.spill, mo_Relaxed); - arg->mi_pgop_stat.unspill = + out->mi_pgop_stat.unspill = atomic_load64(&lck->mti_pgop_stat.unspill, mo_Relaxed); - arg->mi_pgop_stat.wops = + out->mi_pgop_stat.wops = atomic_load64(&lck->mti_pgop_stat.wops, mo_Relaxed); - arg->mi_pgop_stat.prefault = + out->mi_pgop_stat.prefault = atomic_load64(&lck->mti_pgop_stat.prefault, mo_Relaxed); - arg->mi_pgop_stat.mincore = + out->mi_pgop_stat.mincore = atomic_load64(&lck->mti_pgop_stat.mincore, mo_Relaxed); - arg->mi_pgop_stat.msync = + out->mi_pgop_stat.msync = atomic_load64(&lck->mti_pgop_stat.msync, mo_Relaxed); - arg->mi_pgop_stat.fsync = + out->mi_pgop_stat.fsync = atomic_load64(&lck->mti_pgop_stat.fsync, mo_Relaxed); #else - memset(&arg->mi_pgop_stat, 0, sizeof(arg->mi_pgop_stat)); + memset(&out->mi_pgop_stat, 0, sizeof(out->mi_pgop_stat)); #endif /* MDBX_ENABLE_PGOP_STAT*/ } - arg->mi_self_latter_reader_txnid = arg->mi_latter_reader_txnid = - arg->mi_recent_txnid; + txnid_t overall_latter_reader_txnid = out->mi_recent_txnid; + txnid_t self_latter_reader_txnid = overall_latter_reader_txnid; if (env->me_lck_mmap.lck) { - for (size_t i = 0; i < arg->mi_numreaders; ++i) { + for (size_t i = 0; i < 
out->mi_numreaders; ++i) { const uint32_t pid = atomic_load32(&lck->mti_readers[i].mr_pid, mo_AcquireRelease); if (pid) { const txnid_t txnid = safe64_read(&lck->mti_readers[i].mr_txnid); - if (arg->mi_latter_reader_txnid > txnid) - arg->mi_latter_reader_txnid = txnid; - if (pid == env->me_pid && arg->mi_self_latter_reader_txnid > txnid) - arg->mi_self_latter_reader_txnid = txnid; + if (overall_latter_reader_txnid > txnid) + overall_latter_reader_txnid = txnid; + if (pid == env->me_pid && self_latter_reader_txnid > txnid) + self_latter_reader_txnid = txnid; } } } + out->mi_self_latter_reader_txnid = self_latter_reader_txnid; + out->mi_latter_reader_txnid = overall_latter_reader_txnid; osal_compiler_barrier(); return MDBX_SUCCESS; } +__cold int env_info(const MDBX_env *env, const MDBX_txn *txn, MDBX_envinfo *out, + size_t bytes, meta_troika_t *troika) { + MDBX_envinfo snap; + int rc = env_info_snap(env, txn, &snap, sizeof(snap), troika); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; + + eASSERT(env, sizeof(snap) >= bytes); + while (1) { + rc = env_info_snap(env, txn, out, bytes, troika); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; + snap.mi_since_sync_seconds16dot16 = out->mi_since_sync_seconds16dot16; + snap.mi_since_reader_check_seconds16dot16 = + out->mi_since_reader_check_seconds16dot16; + if (likely(memcmp(&snap, out, bytes) == 0)) + return MDBX_SUCCESS; + memcpy(&snap, out, bytes); + } +} + +__cold int mdbx_preopen_snapinfo(const char *pathname, MDBX_envinfo *out, + size_t bytes) { +#if defined(_WIN32) || defined(_WIN64) + wchar_t *pathnameW = nullptr; + int rc = osal_mb2w(pathname, &pathnameW); + if (likely(rc == MDBX_SUCCESS)) { + rc = mdbx_preopen_snapinfoW(pathnameW, out, bytes); + osal_free(pathnameW); + } + return rc; +} + +__cold int mdbx_preopen_snapinfoW(const wchar_t *pathname, MDBX_envinfo *out, + size_t bytes) { +#endif /* Windows */ + if (unlikely(!out)) + return MDBX_EINVAL; + + const size_t size_before_bootid = offsetof(MDBX_envinfo, 
mi_bootid); + const size_t size_before_pgop_stat = offsetof(MDBX_envinfo, mi_pgop_stat); + if (unlikely(bytes != sizeof(MDBX_envinfo)) && bytes != size_before_bootid && + bytes != size_before_pgop_stat) + return MDBX_EINVAL; + + memset(out, 0, bytes); + if (likely(bytes > size_before_bootid)) { + out->mi_bootid.current.x = bootid.x; + out->mi_bootid.current.y = bootid.y; + } + + MDBX_env env; + memset(&env, 0, sizeof(env)); + env.me_pid = osal_getpid(); + const size_t os_psize = osal_syspagesize(); + if (unlikely(!is_powerof2(os_psize) || os_psize < MIN_PAGESIZE)) { + ERROR("unsuitable system pagesize %" PRIuPTR, os_psize); + return MDBX_INCOMPATIBLE; + } + out->mi_sys_pagesize = env.me_os_psize = (unsigned)os_psize; + env.me_flags = MDBX_RDONLY | MDBX_NORDAHEAD | MDBX_ACCEDE | MDBX_VALIDATION; + env.me_stuck_meta = -1; + env.me_lfd = INVALID_HANDLE_VALUE; + env.me_lazy_fd = INVALID_HANDLE_VALUE; + env.me_dsync_fd = INVALID_HANDLE_VALUE; + env.me_fd4meta = INVALID_HANDLE_VALUE; +#if defined(_WIN32) || defined(_WIN64) + env.me_data_lock_event = INVALID_HANDLE_VALUE; + env.me_overlapped_fd = INVALID_HANDLE_VALUE; +#endif /* Windows */ + + int rc = env_handle_pathname(&env, pathname, 0); + if (unlikely(rc != MDBX_SUCCESS)) + goto bailout; + rc = osal_openfile(MDBX_OPEN_DXB_READ, &env, env.me_pathname.dxb, + &env.me_lazy_fd, 0); + if (unlikely(rc != MDBX_SUCCESS)) + goto bailout; + + MDBX_meta header; + rc = read_header(&env, &header, 0, 0); + if (unlikely(rc != MDBX_SUCCESS)) + goto bailout; + + setup_pagesize(&env, header.mm_psize); + out->mi_dxb_pagesize = env.me_psize; + out->mi_geo.lower = pgno2bytes(&env, header.mm_geo.lower); + out->mi_geo.upper = pgno2bytes(&env, header.mm_geo.upper); + out->mi_geo.shrink = pgno2bytes(&env, pv2pages(header.mm_geo.shrink_pv)); + out->mi_geo.grow = pgno2bytes(&env, pv2pages(header.mm_geo.grow_pv)); + out->mi_geo.current = pgno2bytes(&env, header.mm_geo.now); + out->mi_last_pgno = header.mm_geo.next - 1; + + const unsigned n = 0; 
+ out->mi_recent_txnid = constmeta_txnid(&header); + out->mi_meta_sign[n] = unaligned_peek_u64(4, &header.mm_sign); + if (likely(bytes > size_before_bootid)) + memcpy(&out->mi_bootid.meta[n], &header.mm_bootid, 16); + +bailout: + env_close(&env, false); + return rc; +} + __cold int mdbx_env_info_ex(const MDBX_env *env, const MDBX_txn *txn, MDBX_envinfo *arg, size_t bytes) { if (unlikely((env == NULL && txn == NULL) || arg == NULL)) return MDBX_EINVAL; + const size_t size_before_bootid = offsetof(MDBX_envinfo, mi_bootid); + const size_t size_before_pgop_stat = offsetof(MDBX_envinfo, mi_pgop_stat); + if (unlikely(bytes != sizeof(MDBX_envinfo)) && bytes != size_before_bootid && + bytes != size_before_pgop_stat) + return MDBX_EINVAL; + if (txn) { int err = check_txn(txn, MDBX_TXN_BLOCKED - MDBX_TXN_ERROR); if (unlikely(err != MDBX_SUCCESS)) @@ -26672,28 +27727,8 @@ __cold int mdbx_env_info_ex(const MDBX_env *env, const MDBX_txn *txn, env = txn->mt_env; } - const size_t size_before_bootid = offsetof(MDBX_envinfo, mi_bootid); - const size_t size_before_pgop_stat = offsetof(MDBX_envinfo, mi_pgop_stat); - if (unlikely(bytes != sizeof(MDBX_envinfo)) && bytes != size_before_bootid && - bytes != size_before_pgop_stat) - return MDBX_EINVAL; - - MDBX_envinfo snap; - int rc = fetch_envinfo_ex(env, txn, &snap, sizeof(snap)); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; - - while (1) { - rc = fetch_envinfo_ex(env, txn, arg, bytes); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; - snap.mi_since_sync_seconds16dot16 = arg->mi_since_sync_seconds16dot16; - snap.mi_since_reader_check_seconds16dot16 = - arg->mi_since_reader_check_seconds16dot16; - if (likely(memcmp(&snap, arg, bytes) == 0)) - return MDBX_SUCCESS; - memcpy(&snap, arg, bytes); - } + meta_troika_t troika; + return env_info(env, txn, arg, bytes, &troika); } static __inline MDBX_cmp_func *get_default_keycmp(MDBX_db_flags_t flags) { @@ -26710,321 +27745,443 @@ static __inline MDBX_cmp_func 
*get_default_datacmp(MDBX_db_flags_t flags) { : ((flags & MDBX_REVERSEDUP) ? cmp_reverse : cmp_lexical)); } -static int dbi_bind(MDBX_txn *txn, const MDBX_dbi dbi, unsigned user_flags, +static int dbi_bind(MDBX_txn *txn, const size_t dbi, unsigned user_flags, MDBX_cmp_func *keycmp, MDBX_cmp_func *datacmp) { - /* Accepting only three cases: - * 1) user_flags and both comparators are zero - * = assume that a by-default mode/flags is requested for reading; - * 2) user_flags exactly the same - * = assume that the target mode/flags are requested properly; - * 3) user_flags differs, but table is empty and MDBX_CREATE is provided - * = assume that a properly create request with custom flags; + const MDBX_env *const env = txn->mt_env; + eASSERT(env, dbi < txn->mt_numdbs && dbi < env->me_numdbs); + eASSERT(env, dbi_state(txn, dbi) & DBI_LINDO); + eASSERT(env, env->me_db_flags[dbi] != DB_POISON); + if ((env->me_db_flags[dbi] & DB_VALID) == 0) { + eASSERT(env, !env->me_dbxs[dbi].md_cmp && !env->me_dbxs[dbi].md_dcmp && + !env->me_dbxs[dbi].md_name.iov_len && + !env->me_dbxs[dbi].md_name.iov_base && + !env->me_dbxs[dbi].md_klen_max && + !env->me_dbxs[dbi].md_klen_min && + !env->me_dbxs[dbi].md_vlen_max && + !env->me_dbxs[dbi].md_vlen_min); + } else { + eASSERT(env, !(txn->mt_dbi_state[dbi] & DBI_VALID) || + (txn->mt_dbs[dbi].md_flags | DB_VALID) == + env->me_db_flags[dbi]); + eASSERT(env, env->me_dbxs[dbi].md_name.iov_base || dbi < CORE_DBS); + } + + /* Если dbi уже использовался, то корректными считаем четыре варианта: + * 1) user_flags равны MDBX_DB_ACCEDE + * = предполагаем что пользователь открывает существующую subDb, + * при этом код проверки не позволит установить другие компараторы. + * 2) user_flags нулевые, а оба компаратора пустые/нулевые или равны текущим + * = предполагаем что пользователь открывает существующую subDb + * старым способом с нулевыми с флагами по-умолчанию. 
+ * 3) user_flags совпадают, а компараторы не заданы или те же + * = предполагаем что пользователь открывает subDb указывая все параметры; + * 4) user_flags отличаются, но subDb пустая и задан флаг MDBX_CREATE + * = предполагаем что пользователь пересоздает subDb; */ - if ((user_flags ^ txn->mt_dbs[dbi].md_flags) & DB_PERSISTENT_FLAGS) { + if ((user_flags & ~MDBX_CREATE) != + (unsigned)(env->me_db_flags[dbi] & DB_PERSISTENT_FLAGS)) { /* flags are differs, check other conditions */ - if ((!user_flags && (!keycmp || keycmp == txn->mt_dbxs[dbi].md_cmp) && - (!datacmp || datacmp == txn->mt_dbxs[dbi].md_dcmp)) || - user_flags == MDBX_ACCEDE) { - /* no comparators were provided and flags are zero, - * seems that is case #1 above */ - user_flags = txn->mt_dbs[dbi].md_flags; - } else if ((user_flags & MDBX_CREATE) && txn->mt_dbs[dbi].md_entries == 0) { - if (txn->mt_flags & MDBX_TXN_RDONLY) - return /* FIXME: return extended info */ MDBX_EACCESS; - /* make sure flags changes get committed */ - txn->mt_dbs[dbi].md_flags = user_flags & DB_PERSISTENT_FLAGS; - txn->mt_flags |= MDBX_TXN_DIRTY; - /* обнуляем компараторы для установки в соответствии с флагами, - * либо заданных пользователем */ - txn->mt_dbxs[dbi].md_cmp = nullptr; - txn->mt_dbxs[dbi].md_dcmp = nullptr; - } else { + if ((!user_flags && (!keycmp || keycmp == env->me_dbxs[dbi].md_cmp) && + (!datacmp || datacmp == env->me_dbxs[dbi].md_dcmp)) || + user_flags == MDBX_DB_ACCEDE) { + user_flags = env->me_db_flags[dbi] & DB_PERSISTENT_FLAGS; + } else if ((user_flags & MDBX_CREATE) == 0) return /* FIXME: return extended info */ MDBX_INCOMPATIBLE; + else { + eASSERT(env, env->me_db_flags[dbi] & DB_VALID); + if (txn->mt_dbi_state[dbi] & DBI_STALE) { + int err = fetch_sdb(txn, dbi); + if (unlikely(err == MDBX_SUCCESS)) + return err; + } + eASSERT(env, + (txn->mt_dbi_state[dbi] & (DBI_LINDO | DBI_VALID | DBI_STALE)) == + (DBI_LINDO | DBI_VALID)); + if (unlikely(txn->mt_dbs[dbi].md_leaf_pages)) + return /* FIXME: return 
extended info */ MDBX_INCOMPATIBLE; + + /* Пересоздаём subDB если там пусто */ + if (unlikely(txn->mt_cursors[dbi])) + return MDBX_DANGLING_DBI; + env->me_db_flags[dbi] = DB_POISON; + atomic_store32(&env->me_dbi_seqs[dbi], dbi_seq_next(env, MAIN_DBI), + mo_AcquireRelease); + + const uint32_t seq = dbi_seq_next(env, dbi); + const uint16_t db_flags = user_flags & DB_PERSISTENT_FLAGS; + eASSERT(env, txn->mt_dbs[dbi].md_depth == 0 && + txn->mt_dbs[dbi].md_entries == 0 && + txn->mt_dbs[dbi].md_root == P_INVALID); + env->me_dbxs[dbi].md_cmp = + keycmp ? keycmp : get_default_keycmp(user_flags); + env->me_dbxs[dbi].md_dcmp = + datacmp ? datacmp : get_default_datacmp(user_flags); + txn->mt_dbs[dbi].md_flags = db_flags; + txn->mt_dbs[dbi].md_xsize = 0; + if (unlikely(setup_sdb(&env->me_dbxs[dbi], &txn->mt_dbs[dbi], + env->me_psize))) { + txn->mt_dbi_state[dbi] = DBI_LINDO; + txn->mt_flags |= MDBX_TXN_ERROR; + return MDBX_PROBLEM; + } + + env->me_db_flags[dbi] = db_flags | DB_VALID; + atomic_store32(&env->me_dbi_seqs[dbi], seq, mo_AcquireRelease); + txn->mt_dbi_seqs[dbi] = seq; + txn->mt_dbi_state[dbi] = DBI_LINDO | DBI_VALID | DBI_CREAT | DBI_DIRTY; + txn->mt_flags |= MDBX_TXN_DIRTY; } } if (!keycmp) - keycmp = txn->mt_dbxs[dbi].md_cmp ? txn->mt_dbxs[dbi].md_cmp - : get_default_keycmp(user_flags); - if (txn->mt_dbxs[dbi].md_cmp != keycmp) { - if (txn->mt_dbxs[dbi].md_cmp) + keycmp = (env->me_db_flags[dbi] & DB_VALID) + ? env->me_dbxs[dbi].md_cmp + : get_default_keycmp(user_flags); + if (env->me_dbxs[dbi].md_cmp != keycmp) { + if (env->me_db_flags[dbi] & DB_VALID) return MDBX_EINVAL; - txn->mt_dbxs[dbi].md_cmp = keycmp; + env->me_dbxs[dbi].md_cmp = keycmp; } if (!datacmp) - datacmp = txn->mt_dbxs[dbi].md_dcmp ? txn->mt_dbxs[dbi].md_dcmp - : get_default_datacmp(user_flags); - if (txn->mt_dbxs[dbi].md_dcmp != datacmp) { - if (txn->mt_dbxs[dbi].md_dcmp) + datacmp = (env->me_db_flags[dbi] & DB_VALID) + ? 
env->me_dbxs[dbi].md_dcmp + : get_default_datacmp(user_flags); + if (env->me_dbxs[dbi].md_dcmp != datacmp) { + if (env->me_db_flags[dbi] & DB_VALID) return MDBX_EINVAL; - txn->mt_dbxs[dbi].md_dcmp = datacmp; + env->me_dbxs[dbi].md_dcmp = datacmp; } return MDBX_SUCCESS; } -static int dbi_open(MDBX_txn *txn, const MDBX_val *const table_name, - unsigned user_flags, MDBX_dbi *dbi, MDBX_cmp_func *keycmp, - MDBX_cmp_func *datacmp) { - int rc = MDBX_EINVAL; - if (unlikely(!dbi)) - return rc; - - void *clone = nullptr; - bool locked = false; - if (unlikely((user_flags & ~DB_USABLE_FLAGS) != 0)) { - bailout: - tASSERT(txn, MDBX_IS_ERROR(rc)); - *dbi = 0; - if (locked) - ENSURE(txn->mt_env, - osal_fastmutex_release(&txn->mt_env->me_dbi_lock) == MDBX_SUCCESS); - osal_free(clone); - return rc; - } - - rc = check_txn(txn, MDBX_TXN_BLOCKED); - if (unlikely(rc != MDBX_SUCCESS)) - goto bailout; - - if ((user_flags & MDBX_CREATE) && unlikely(txn->mt_flags & MDBX_TXN_RDONLY)) { - rc = MDBX_EACCESS; - goto bailout; - } - - switch (user_flags & (MDBX_INTEGERDUP | MDBX_DUPFIXED | MDBX_DUPSORT | - MDBX_REVERSEDUP | MDBX_ACCEDE)) { - case MDBX_ACCEDE: - if ((user_flags & MDBX_CREATE) == 0) - break; - __fallthrough /* fall through */; - default: - rc = MDBX_EINVAL; - goto bailout; - - case MDBX_DUPSORT: - case MDBX_DUPSORT | MDBX_REVERSEDUP: - case MDBX_DUPSORT | MDBX_DUPFIXED: - case MDBX_DUPSORT | MDBX_DUPFIXED | MDBX_REVERSEDUP: - case MDBX_DUPSORT | MDBX_DUPFIXED | MDBX_INTEGERDUP: - case MDBX_DUPSORT | MDBX_DUPFIXED | MDBX_INTEGERDUP | MDBX_REVERSEDUP: - case 0: - break; - } - - /* main table? 
*/ - if (table_name == MDBX_PGWALK_MAIN || - table_name->iov_base == MDBX_PGWALK_MAIN) { - rc = dbi_bind(txn, MAIN_DBI, user_flags, keycmp, datacmp); - if (unlikely(rc != MDBX_SUCCESS)) - goto bailout; - *dbi = MAIN_DBI; - return rc; - } - if (table_name == MDBX_PGWALK_GC || table_name->iov_base == MDBX_PGWALK_GC) { - rc = dbi_bind(txn, FREE_DBI, user_flags, keycmp, datacmp); - if (unlikely(rc != MDBX_SUCCESS)) - goto bailout; - *dbi = FREE_DBI; - return rc; - } - if (table_name == MDBX_PGWALK_META || - table_name->iov_base == MDBX_PGWALK_META) { - rc = MDBX_EINVAL; - goto bailout; - } +static __inline size_t dbi_namelen(const MDBX_val name) { + return (name.iov_len > sizeof(struct mdbx_defer_free_item)) + ? name.iov_len + : sizeof(struct mdbx_defer_free_item); +} - MDBX_val key = *table_name; +static int dbi_open_locked(MDBX_txn *txn, unsigned user_flags, MDBX_dbi *dbi, + MDBX_cmp_func *keycmp, MDBX_cmp_func *datacmp, + MDBX_val name) { MDBX_env *const env = txn->mt_env; - if (key.iov_len > env->me_leaf_nodemax - NODESIZE - sizeof(MDBX_db)) - return MDBX_EINVAL; /* Cannot mix named table(s) with DUPSORT flags */ + tASSERT(txn, + (txn->mt_dbi_state[MAIN_DBI] & (DBI_LINDO | DBI_VALID | DBI_STALE)) == + (DBI_LINDO | DBI_VALID)); if (unlikely(txn->mt_dbs[MAIN_DBI].md_flags & MDBX_DUPSORT)) { - if ((user_flags & MDBX_CREATE) == 0) { - rc = MDBX_NOTFOUND; - goto bailout; - } - if (txn->mt_dbs[MAIN_DBI].md_leaf_pages || txn->mt_dbxs[MAIN_DBI].md_cmp) { - /* В MAIN_DBI есть записи либо она уже использовалась. */ - rc = MDBX_INCOMPATIBLE; - goto bailout; - } - /* Пересоздаём MAIN_DBI если там пусто. */ - atomic_store32(&txn->mt_dbiseqs[MAIN_DBI], dbi_seq(env, MAIN_DBI), - mo_AcquireRelease); + if (unlikely((user_flags & MDBX_CREATE) == 0)) + return MDBX_NOTFOUND; + if (unlikely(txn->mt_dbs[MAIN_DBI].md_leaf_pages)) + /* В MainDB есть записи, либо она уже использовалась. */ + return MDBX_INCOMPATIBLE; + + /* Пересоздаём MainDB когда там пусто. 
*/ tASSERT(txn, txn->mt_dbs[MAIN_DBI].md_depth == 0 && txn->mt_dbs[MAIN_DBI].md_entries == 0 && txn->mt_dbs[MAIN_DBI].md_root == P_INVALID); - txn->mt_dbs[MAIN_DBI].md_flags &= MDBX_REVERSEKEY | MDBX_INTEGERKEY; - txn->mt_dbistate[MAIN_DBI] |= DBI_DIRTY; + if (unlikely(txn->mt_cursors[MAIN_DBI])) + return MDBX_DANGLING_DBI; + env->me_db_flags[MAIN_DBI] = DB_POISON; + atomic_store32(&env->me_dbi_seqs[MAIN_DBI], dbi_seq_next(env, MAIN_DBI), + mo_AcquireRelease); + + const uint32_t seq = dbi_seq_next(env, MAIN_DBI); + const uint16_t main_flags = + txn->mt_dbs[MAIN_DBI].md_flags & (MDBX_REVERSEKEY | MDBX_INTEGERKEY); + env->me_dbxs[MAIN_DBI].md_cmp = get_default_keycmp(main_flags); + env->me_dbxs[MAIN_DBI].md_dcmp = get_default_datacmp(main_flags); + txn->mt_dbs[MAIN_DBI].md_flags = main_flags; + txn->mt_dbs[MAIN_DBI].md_xsize = 0; + if (unlikely(setup_sdb(&env->me_dbxs[MAIN_DBI], &txn->mt_dbs[MAIN_DBI], + env->me_psize) != MDBX_SUCCESS)) { + txn->mt_dbi_state[MAIN_DBI] = DBI_LINDO; + txn->mt_flags |= MDBX_TXN_ERROR; + env->me_flags |= MDBX_FATAL_ERROR; + return MDBX_FATAL_ERROR; + } + env->me_db_flags[MAIN_DBI] = main_flags | DB_VALID; + txn->mt_dbi_seqs[MAIN_DBI] = + atomic_store32(&env->me_dbi_seqs[MAIN_DBI], seq, mo_AcquireRelease); + txn->mt_dbi_state[MAIN_DBI] |= DBI_DIRTY; txn->mt_flags |= MDBX_TXN_DIRTY; - txn->mt_dbxs[MAIN_DBI].md_cmp = - get_default_keycmp(txn->mt_dbs[MAIN_DBI].md_flags); - txn->mt_dbxs[MAIN_DBI].md_dcmp = - get_default_datacmp(txn->mt_dbs[MAIN_DBI].md_flags); } - tASSERT(txn, txn->mt_dbxs[MAIN_DBI].md_cmp); + tASSERT(txn, env->me_dbxs[MAIN_DBI].md_cmp); /* Is the DB already open? */ - MDBX_dbi scan, slot; - for (slot = scan = txn->mt_numdbs; --scan >= CORE_DBS;) { - if (!txn->mt_dbxs[scan].md_name.iov_base) { + size_t slot = env->me_numdbs; + for (size_t scan = CORE_DBS; scan < env->me_numdbs; ++scan) { + if ((env->me_db_flags[scan] & DB_VALID) == 0) { /* Remember this free slot */ - slot = scan; + slot = (slot < scan) ? 
slot : scan; continue; } - if (key.iov_len == txn->mt_dbxs[scan].md_name.iov_len && - !memcmp(key.iov_base, txn->mt_dbxs[scan].md_name.iov_base, - key.iov_len)) { - rc = dbi_bind(txn, scan, user_flags, keycmp, datacmp); - if (unlikely(rc != MDBX_SUCCESS)) - goto bailout; - *dbi = scan; - return rc; + if (!env->me_dbxs[MAIN_DBI].md_cmp(&name, &env->me_dbxs[scan].md_name)) { + slot = scan; + int err = dbi_check(txn, slot); + if (err == MDBX_BAD_DBI && + txn->mt_dbi_state[slot] == (DBI_OLDEN | DBI_LINDO)) { + /* хендл использовался, стал невалидным, + * но теперь явно пере-открывается в этой транзакци */ + eASSERT(env, !txn->mt_cursors[slot]); + txn->mt_dbi_state[slot] = DBI_LINDO; + err = dbi_check(txn, slot); + } + if (err == MDBX_SUCCESS) { + err = dbi_bind(txn, slot, user_flags, keycmp, datacmp); + if (likely(err == MDBX_SUCCESS)) { + goto done; + } + } + return err; } } /* Fail, if no free slot and max hit */ - if (unlikely(slot >= env->me_maxdbs)) { - rc = MDBX_DBS_FULL; - goto bailout; - } + if (unlikely(slot >= env->me_maxdbs)) + return MDBX_DBS_FULL; + + if (env->me_numdbs == slot) + eASSERT(env, !env->me_db_flags[slot] && + !env->me_dbxs[slot].md_name.iov_len && + !env->me_dbxs[slot].md_name.iov_base); + + env->me_db_flags[slot] = DB_POISON; + atomic_store32(&env->me_dbi_seqs[slot], dbi_seq_next(env, slot), + mo_AcquireRelease); + memset(&env->me_dbxs[slot], 0, sizeof(env->me_dbxs[slot])); + if (env->me_numdbs == slot) + env->me_numdbs = (unsigned)slot + 1; + eASSERT(env, slot < env->me_numdbs); + + int err = dbi_check(txn, slot); + eASSERT(env, err == MDBX_BAD_DBI); + if (err != MDBX_BAD_DBI) + return MDBX_PROBLEM; /* Find the DB info */ - MDBX_val data; - MDBX_cursor_couple couple; - rc = cursor_init(&couple.outer, txn, MAIN_DBI); + MDBX_val body; + MDBX_cursor_couple cx; + int rc = cursor_init(&cx.outer, txn, MAIN_DBI); if (unlikely(rc != MDBX_SUCCESS)) - goto bailout; - rc = cursor_set(&couple.outer, &key, &data, MDBX_SET).err; + return rc; + rc = 
cursor_set(&cx.outer, &name, &body, MDBX_SET).err; if (unlikely(rc != MDBX_SUCCESS)) { if (rc != MDBX_NOTFOUND || !(user_flags & MDBX_CREATE)) - goto bailout; + return rc; } else { /* make sure this is actually a table */ - MDBX_node *node = page_node(couple.outer.mc_pg[couple.outer.mc_top], - couple.outer.mc_ki[couple.outer.mc_top]); - if (unlikely((node_flags(node) & (F_DUPDATA | F_SUBDATA)) != F_SUBDATA)) { - rc = MDBX_INCOMPATIBLE; - goto bailout; - } - if (!MDBX_DISABLE_VALIDATION && unlikely(data.iov_len != sizeof(MDBX_db))) { - rc = MDBX_CORRUPTED; - goto bailout; + MDBX_node *node = page_node(cx.outer.mc_pg[cx.outer.mc_top], + cx.outer.mc_ki[cx.outer.mc_top]); + if (unlikely((node_flags(node) & (F_DUPDATA | F_SUBDATA)) != F_SUBDATA)) + return MDBX_INCOMPATIBLE; + if (!MDBX_DISABLE_VALIDATION && unlikely(body.iov_len != sizeof(MDBX_db))) { + ERROR("%s/%d: %s %zu", "MDBX_CORRUPTED", MDBX_CORRUPTED, + "invalid subDb node size", body.iov_len); + return MDBX_CORRUPTED; } - } - - if (rc != MDBX_SUCCESS && unlikely(txn->mt_flags & MDBX_TXN_RDONLY)) { - rc = MDBX_EACCESS; - goto bailout; + memcpy(&txn->mt_dbs[slot], body.iov_base, sizeof(MDBX_db)); } /* Done here so we cannot fail after creating a new DB */ - if (key.iov_len) { - clone = osal_malloc(key.iov_len); - if (unlikely(!clone)) { - rc = MDBX_ENOMEM; - goto bailout; - } - key.iov_base = memcpy(clone, key.iov_base, key.iov_len); + void *clone = nullptr; + if (name.iov_len) { + clone = osal_malloc(dbi_namelen(name)); + if (unlikely(!clone)) + return MDBX_ENOMEM; + name.iov_base = memcpy(clone, name.iov_base, name.iov_len); } else - key.iov_base = ""; - - int err = osal_fastmutex_acquire(&env->me_dbi_lock); - if (unlikely(err != MDBX_SUCCESS)) { - rc = err; - goto bailout; - } - locked = true; - - /* Import handles from env */ - dbi_import_locked(txn); - - /* Rescan after mutex acquisition & import handles */ - for (slot = scan = txn->mt_numdbs; --scan >= CORE_DBS;) { - if 
(!txn->mt_dbxs[scan].md_name.iov_base) { - /* Remember this free slot */ - slot = scan; - continue; - } - if (key.iov_len == txn->mt_dbxs[scan].md_name.iov_len && - !memcmp(key.iov_base, txn->mt_dbxs[scan].md_name.iov_base, - key.iov_len)) { - rc = dbi_bind(txn, scan, user_flags, keycmp, datacmp); - if (unlikely(rc != MDBX_SUCCESS)) - goto bailout; - slot = scan; - goto done; - } - } - - if (unlikely(slot >= env->me_maxdbs)) { - rc = MDBX_DBS_FULL; - goto bailout; - } + name.iov_base = ""; - unsigned dbiflags = DBI_FRESH | DBI_VALID | DBI_USRVALID; - MDBX_db db_dummy; + uint8_t dbi_state = DBI_LINDO | DBI_VALID | DBI_FRESH; if (unlikely(rc)) { /* MDBX_NOTFOUND and MDBX_CREATE: Create new DB */ tASSERT(txn, rc == MDBX_NOTFOUND); - memset(&db_dummy, 0, sizeof(db_dummy)); - db_dummy.md_root = P_INVALID; - db_dummy.md_mod_txnid = txn->mt_txnid; - db_dummy.md_flags = user_flags & DB_PERSISTENT_FLAGS; - data.iov_len = sizeof(db_dummy); - data.iov_base = &db_dummy; + body.iov_base = + memset(&txn->mt_dbs[slot], 0, body.iov_len = sizeof(MDBX_db)); + txn->mt_dbs[slot].md_root = P_INVALID; + txn->mt_dbs[slot].md_mod_txnid = txn->mt_txnid; + txn->mt_dbs[slot].md_flags = user_flags & DB_PERSISTENT_FLAGS; WITH_CURSOR_TRACKING( - couple.outer, rc = cursor_put_checklen(&couple.outer, &key, &data, - F_SUBDATA | MDBX_NOOVERWRITE)); + cx.outer, rc = cursor_put_checklen(&cx.outer, &name, &body, + F_SUBDATA | MDBX_NOOVERWRITE)); if (unlikely(rc != MDBX_SUCCESS)) goto bailout; - dbiflags |= DBI_DIRTY | DBI_CREAT; + dbi_state |= DBI_DIRTY | DBI_CREAT; txn->mt_flags |= MDBX_TXN_DIRTY; - tASSERT(txn, (txn->mt_dbistate[MAIN_DBI] & DBI_DIRTY) != 0); + tASSERT(txn, (txn->mt_dbi_state[MAIN_DBI] & DBI_DIRTY) != 0); } /* Got info, register DBI in this txn */ - memset(txn->mt_dbxs + slot, 0, sizeof(MDBX_dbx)); - memcpy(&txn->mt_dbs[slot], data.iov_base, sizeof(MDBX_db)); - env->me_dbflags[slot] = 0; + const uint32_t seq = dbi_seq_next(env, slot); + eASSERT(env, + env->me_db_flags[slot] == 
DB_POISON && !txn->mt_cursors[slot] && + (txn->mt_dbi_state[slot] & (DBI_LINDO | DBI_VALID)) == DBI_LINDO); + txn->mt_dbi_state[slot] = dbi_state; + memcpy(&txn->mt_dbs[slot], body.iov_base, sizeof(txn->mt_dbs[slot])); + env->me_db_flags[slot] = txn->mt_dbs[slot].md_flags; rc = dbi_bind(txn, slot, user_flags, keycmp, datacmp); - if (unlikely(rc != MDBX_SUCCESS)) { - tASSERT(txn, (dbiflags & DBI_CREAT) == 0); + if (unlikely(rc != MDBX_SUCCESS)) goto bailout; + + env->me_dbxs[slot].md_name = name; + env->me_db_flags[slot] = txn->mt_dbs[slot].md_flags | DB_VALID; + txn->mt_dbi_seqs[slot] = + atomic_store32(&env->me_dbi_seqs[slot], seq, mo_AcquireRelease); + +done: + *dbi = (MDBX_dbi)slot; + tASSERT(txn, + slot < txn->mt_numdbs && (env->me_db_flags[slot] & DB_VALID) != 0); + eASSERT(env, dbi_check(txn, slot) == MDBX_SUCCESS); + return MDBX_SUCCESS; + +bailout: + eASSERT(env, !txn->mt_cursors[slot] && !env->me_dbxs[slot].md_name.iov_len && + !env->me_dbxs[slot].md_name.iov_base); + txn->mt_dbi_state[slot] &= DBI_LINDO | DBI_OLDEN; + env->me_db_flags[slot] = 0; + osal_free(clone); + if (slot + 1 == env->me_numdbs) + txn->mt_numdbs = env->me_numdbs = (unsigned)slot; + return rc; +} + +static int dbi_open(MDBX_txn *txn, const MDBX_val *const name, + unsigned user_flags, MDBX_dbi *dbi, MDBX_cmp_func *keycmp, + MDBX_cmp_func *datacmp) { + if (unlikely(!dbi)) + return MDBX_EINVAL; + *dbi = 0; + if (unlikely((user_flags & ~DB_USABLE_FLAGS) != 0)) + return MDBX_EINVAL; + + int rc = check_txn(txn, MDBX_TXN_BLOCKED); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; + + if ((user_flags & MDBX_CREATE) && unlikely(txn->mt_flags & MDBX_TXN_RDONLY)) + return MDBX_EACCESS; + + switch (user_flags & (MDBX_INTEGERDUP | MDBX_DUPFIXED | MDBX_DUPSORT | + MDBX_REVERSEDUP | MDBX_ACCEDE)) { + case MDBX_ACCEDE: + if ((user_flags & MDBX_CREATE) == 0) + break; + __fallthrough /* fall through */; + default: + return MDBX_EINVAL; + + case MDBX_DUPSORT: + case MDBX_DUPSORT | MDBX_REVERSEDUP: + case 
MDBX_DUPSORT | MDBX_DUPFIXED: + case MDBX_DUPSORT | MDBX_DUPFIXED | MDBX_REVERSEDUP: + case MDBX_DUPSORT | MDBX_DUPFIXED | MDBX_INTEGERDUP: + case MDBX_DUPSORT | MDBX_DUPFIXED | MDBX_INTEGERDUP | MDBX_REVERSEDUP: + case MDBX_DB_DEFAULTS: + break; } + tASSERT(txn, db_check_flags((uint16_t)user_flags)); - txn->mt_dbistate[slot] = (uint8_t)dbiflags; - txn->mt_dbxs[slot].md_name = key; - txn->mt_dbiseqs[slot].weak = env->me_dbiseqs[slot].weak = dbi_seq(env, slot); - if (!(dbiflags & DBI_CREAT)) - env->me_dbflags[slot] = txn->mt_dbs[slot].md_flags | DB_VALID; - if (txn->mt_numdbs == slot) { - txn->mt_cursors[slot] = NULL; - osal_compiler_barrier(); - txn->mt_numdbs = slot + 1; + /* main table? */ + if (unlikely(name == MDBX_CHK_MAIN || name->iov_base == MDBX_CHK_MAIN)) { + rc = dbi_bind(txn, MAIN_DBI, user_flags, keycmp, datacmp); + if (likely(rc == MDBX_SUCCESS)) + *dbi = MAIN_DBI; + return rc; } - if (env->me_numdbs <= slot) { - osal_memory_fence(mo_AcquireRelease, true); - env->me_numdbs = slot + 1; + if (unlikely(name == MDBX_CHK_GC || name->iov_base == MDBX_CHK_GC)) { + rc = dbi_bind(txn, FREE_DBI, user_flags, keycmp, datacmp); + if (likely(rc == MDBX_SUCCESS)) + *dbi = FREE_DBI; + return rc; } + if (unlikely(name == MDBX_CHK_META || name->iov_base == MDBX_CHK_META)) + return MDBX_EINVAL; + if (unlikely(name->iov_len > + txn->mt_env->me_leaf_nodemax - NODESIZE - sizeof(MDBX_db))) + return MDBX_EINVAL; -done: - *dbi = slot; - ENSURE(env, osal_fastmutex_release(&env->me_dbi_lock) == MDBX_SUCCESS); - return MDBX_SUCCESS; +#if MDBX_ENABLE_DBI_LOCKFREE + /* Is the DB already open? 
*/ + const MDBX_env *const env = txn->mt_env; + size_t free_slot = env->me_numdbs; + for (size_t i = CORE_DBS; i < env->me_numdbs; ++i) { + retry: + if ((env->me_db_flags[i] & DB_VALID) == 0) { + free_slot = i; + continue; + } + + const uint32_t snap_seq = + atomic_load32(&env->me_dbi_seqs[i], mo_AcquireRelease); + const uint16_t snap_flags = env->me_db_flags[i]; + const MDBX_val snap_name = env->me_dbxs[i].md_name; + if (user_flags != MDBX_ACCEDE && + (((user_flags ^ snap_flags) & DB_PERSISTENT_FLAGS) || + (keycmp && keycmp != env->me_dbxs[i].md_cmp) || + (datacmp && datacmp != env->me_dbxs[i].md_dcmp))) + continue; + const uint32_t main_seq = + atomic_load32(&env->me_dbi_seqs[MAIN_DBI], mo_AcquireRelease); + MDBX_cmp_func *const snap_cmp = env->me_dbxs[MAIN_DBI].md_cmp; + if (unlikely(!(snap_flags & DB_VALID) || !snap_name.iov_base || + !snap_name.iov_len || !snap_cmp)) + continue; + + const bool name_match = snap_cmp(&snap_name, name) == 0; + osal_flush_incoherent_cpu_writeback(); + if (unlikely(snap_seq != + atomic_load32(&env->me_dbi_seqs[i], mo_AcquireRelease) || + main_seq != atomic_load32(&env->me_dbi_seqs[MAIN_DBI], + mo_AcquireRelease) || + snap_flags != env->me_db_flags[i] || + snap_name.iov_base != env->me_dbxs[i].md_name.iov_base || + snap_name.iov_len != env->me_dbxs[i].md_name.iov_len)) + goto retry; + if (name_match) { + rc = dbi_check(txn, i); + if (rc == MDBX_BAD_DBI && + txn->mt_dbi_state[i] == (DBI_OLDEN | DBI_LINDO)) { + /* хендл использовался, стал невалидным, + * но теперь явно пере-открывается в этой транзакци */ + eASSERT(env, !txn->mt_cursors[i]); + txn->mt_dbi_state[i] = DBI_LINDO; + rc = dbi_check(txn, i); + } + if (likely(rc == MDBX_SUCCESS)) { + rc = dbi_bind(txn, i, user_flags, keycmp, datacmp); + if (likely(rc == MDBX_SUCCESS)) + *dbi = (MDBX_dbi)i; + } + return rc; + } + } + + /* Fail, if no free slot and max hit */ + if (unlikely(free_slot >= env->me_maxdbs)) + return MDBX_DBS_FULL; +#endif /* MDBX_ENABLE_DBI_LOCKFREE */ + + rc = 
osal_fastmutex_acquire(&txn->mt_env->me_dbi_lock); + if (likely(rc == MDBX_SUCCESS)) { + rc = dbi_open_locked(txn, user_flags, dbi, keycmp, datacmp, *name); + ENSURE(txn->mt_env, + osal_fastmutex_release(&txn->mt_env->me_dbi_lock) == MDBX_SUCCESS); + } + return rc; } static int dbi_open_cstr(MDBX_txn *txn, const char *name_cstr, MDBX_db_flags_t flags, MDBX_dbi *dbi, MDBX_cmp_func *keycmp, MDBX_cmp_func *datacmp) { MDBX_val thunk, *name; - if (name_cstr == MDBX_PGWALK_MAIN || name_cstr == MDBX_PGWALK_GC || - name_cstr == MDBX_PGWALK_META) + if (name_cstr == MDBX_CHK_MAIN || name_cstr == MDBX_CHK_GC || + name_cstr == MDBX_CHK_META) name = (void *)name_cstr; else { thunk.iov_len = strlen(name_cstr); @@ -27056,6 +28213,105 @@ int mdbx_dbi_open_ex2(MDBX_txn *txn, const MDBX_val *name, return dbi_open(txn, name, flags, dbi, keycmp, datacmp); } +__cold int mdbx_dbi_rename(MDBX_txn *txn, MDBX_dbi dbi, const char *name_cstr) { + MDBX_val thunk, *name; + if (name_cstr == MDBX_CHK_MAIN || name_cstr == MDBX_CHK_GC || + name_cstr == MDBX_CHK_META) + name = (void *)name_cstr; + else { + thunk.iov_len = strlen(name_cstr); + thunk.iov_base = (void *)name_cstr; + name = &thunk; + } + return mdbx_dbi_rename2(txn, dbi, name); +} + +struct dbi_rename_result { + struct mdbx_defer_free_item *defer; + int err; +}; + +__cold static struct dbi_rename_result +dbi_rename_locked(MDBX_txn *txn, MDBX_dbi dbi, MDBX_val new_name) { + struct dbi_rename_result pair; + pair.defer = nullptr; + pair.err = dbi_check(txn, dbi); + if (unlikely(pair.err != MDBX_SUCCESS)) + return pair; + + MDBX_env *const env = txn->mt_env; + MDBX_val old_name = env->me_dbxs[dbi].md_name; + if (env->me_dbxs[MAIN_DBI].md_cmp(&new_name, &old_name) == 0 && + MDBX_DEBUG == 0) + return pair; + + MDBX_cursor_couple cx; + pair.err = cursor_init(&cx.outer, txn, MAIN_DBI); + if (unlikely(pair.err != MDBX_SUCCESS)) + return pair; + pair.err = cursor_set(&cx.outer, &new_name, nullptr, MDBX_SET).err; + if (unlikely(pair.err != 
MDBX_NOTFOUND)) { + pair.err = (pair.err == MDBX_SUCCESS) ? MDBX_KEYEXIST : pair.err; + return pair; + } + + pair.defer = osal_malloc(dbi_namelen(new_name)); + if (unlikely(!pair.defer)) { + pair.err = MDBX_ENOMEM; + return pair; + } + new_name.iov_base = memcpy(pair.defer, new_name.iov_base, new_name.iov_len); + + cx.outer.mc_next = txn->mt_cursors[MAIN_DBI]; + txn->mt_cursors[MAIN_DBI] = &cx.outer; + + MDBX_val data = {&txn->mt_dbs[dbi], sizeof(MDBX_db)}; + pair.err = cursor_put_checklen(&cx.outer, &new_name, &data, + F_SUBDATA | MDBX_NOOVERWRITE); + if (likely(pair.err == MDBX_SUCCESS)) { + pair.err = cursor_set(&cx.outer, &old_name, nullptr, MDBX_SET).err; + if (likely(pair.err == MDBX_SUCCESS)) + pair.err = cursor_del(&cx.outer, F_SUBDATA); + if (likely(pair.err == MDBX_SUCCESS)) { + pair.defer = env->me_dbxs[dbi].md_name.iov_base; + env->me_dbxs[dbi].md_name = new_name; + } else + txn->mt_flags |= MDBX_TXN_ERROR; + } + + txn->mt_cursors[MAIN_DBI] = cx.outer.mc_next; + return pair; +} + +__cold int mdbx_dbi_rename2(MDBX_txn *txn, MDBX_dbi dbi, + const MDBX_val *new_name) { + int rc = check_txn_rw(txn, MDBX_TXN_BLOCKED); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; + + if (unlikely(new_name == MDBX_CHK_MAIN || + new_name->iov_base == MDBX_CHK_MAIN || new_name == MDBX_CHK_GC || + new_name->iov_base == MDBX_CHK_GC || new_name == MDBX_CHK_META || + new_name->iov_base == MDBX_CHK_META)) + return MDBX_EINVAL; + + if (unlikely(dbi < CORE_DBS)) + return MDBX_EINVAL; + rc = dbi_check(txn, dbi); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; + + rc = osal_fastmutex_acquire(&txn->mt_env->me_dbi_lock); + if (likely(rc == MDBX_SUCCESS)) { + struct dbi_rename_result pair = dbi_rename_locked(txn, dbi, *new_name); + if (pair.defer) + pair.defer->next = nullptr; + env_defer_free_and_release(txn->mt_env, pair.defer); + rc = pair.err; + } + return rc; +} + __cold int mdbx_dbi_stat(const MDBX_txn *txn, MDBX_dbi dbi, MDBX_stat *dest, size_t bytes) { int rc = check_txn(txn, 
MDBX_TXN_BLOCKED); @@ -27065,8 +28321,9 @@ __cold int mdbx_dbi_stat(const MDBX_txn *txn, MDBX_dbi dbi, MDBX_stat *dest, if (unlikely(!dest)) return MDBX_EINVAL; - if (unlikely(!check_dbi(txn, dbi, DBI_VALID))) - return MDBX_BAD_DBI; + rc = dbi_check(txn, dbi); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; const size_t size_before_modtxnid = offsetof(MDBX_stat, ms_mod_txnid); if (unlikely(bytes != sizeof(MDBX_stat)) && bytes != size_before_modtxnid) @@ -27075,7 +28332,7 @@ __cold int mdbx_dbi_stat(const MDBX_txn *txn, MDBX_dbi dbi, MDBX_stat *dest, if (unlikely(txn->mt_flags & MDBX_TXN_BLOCKED)) return MDBX_BAD_TXN; - if (unlikely(txn->mt_dbistate[dbi] & DBI_STALE)) { + if (unlikely(txn->mt_dbi_state[dbi] & DBI_STALE)) { rc = fetch_sdb((MDBX_txn *)txn, dbi); if (unlikely(rc != MDBX_SUCCESS)) return rc; @@ -27086,31 +28343,35 @@ __cold int mdbx_dbi_stat(const MDBX_txn *txn, MDBX_dbi dbi, MDBX_stat *dest, return MDBX_SUCCESS; } -static int dbi_close_locked(MDBX_env *env, MDBX_dbi dbi) { +static struct mdbx_defer_free_item *dbi_close_locked(MDBX_env *env, + MDBX_dbi dbi) { eASSERT(env, dbi >= CORE_DBS); if (unlikely(dbi >= env->me_numdbs)) - return MDBX_BAD_DBI; - - char *const ptr = env->me_dbxs[dbi].md_name.iov_base; - /* If there was no name, this was already closed */ - if (unlikely(!ptr)) - return MDBX_BAD_DBI; + return nullptr; - env->me_dbflags[dbi] = 0; - env->me_dbxs[dbi].md_name.iov_len = 0; - osal_memory_fence(mo_AcquireRelease, true); - env->me_dbxs[dbi].md_name.iov_base = NULL; - osal_free(ptr); + const uint32_t seq = dbi_seq_next(env, dbi); + struct mdbx_defer_free_item *defer_item = env->me_dbxs[dbi].md_name.iov_base; + if (likely(defer_item)) { + env->me_db_flags[dbi] = 0; + env->me_dbxs[dbi].md_name.iov_len = 0; + env->me_dbxs[dbi].md_name.iov_base = nullptr; + atomic_store32(&env->me_dbi_seqs[dbi], seq, mo_AcquireRelease); + osal_flush_incoherent_cpu_writeback(); + defer_item->next = nullptr; - if (env->me_numdbs == dbi + 1) { - size_t i = 
env->me_numdbs; - do - --i; - while (i > CORE_DBS && !env->me_dbxs[i - 1].md_name.iov_base); - env->me_numdbs = (MDBX_dbi)i; + if (env->me_numdbs == dbi + 1) { + size_t i = env->me_numdbs; + do { + --i; + eASSERT(env, i >= CORE_DBS); + eASSERT(env, !env->me_db_flags[i] && !env->me_dbxs[i].md_name.iov_len && + !env->me_dbxs[i].md_name.iov_base); + } while (i > CORE_DBS && !env->me_dbxs[i - 1].md_name.iov_base); + env->me_numdbs = (unsigned)i; + } } - return MDBX_SUCCESS; + return defer_item; } int mdbx_dbi_close(MDBX_env *env, MDBX_dbi dbi) { @@ -27128,12 +28389,8 @@ int mdbx_dbi_close(MDBX_env *env, MDBX_dbi dbi) { return MDBX_BAD_DBI; rc = osal_fastmutex_acquire(&env->me_dbi_lock); - if (likely(rc == MDBX_SUCCESS)) { - rc = (dbi < env->me_maxdbs && (env->me_dbflags[dbi] & DB_VALID)) - ? dbi_close_locked(env, dbi) - : MDBX_BAD_DBI; - ENSURE(env, osal_fastmutex_release(&env->me_dbi_lock) == MDBX_SUCCESS); - } + if (likely(rc == MDBX_SUCCESS)) + rc = env_defer_free_and_release(env, dbi_close_locked(env, dbi)); return rc; } @@ -27146,12 +28403,13 @@ int mdbx_dbi_flags_ex(const MDBX_txn *txn, MDBX_dbi dbi, unsigned *flags, if (unlikely(!flags || !state)) return MDBX_EINVAL; - if (unlikely(!check_dbi(txn, dbi, DBI_VALID))) - return MDBX_BAD_DBI; + rc = dbi_check(txn, dbi); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; *flags = txn->mt_dbs[dbi].md_flags & DB_PERSISTENT_FLAGS; *state = - txn->mt_dbistate[dbi] & (DBI_FRESH | DBI_CREAT | DBI_DIRTY | DBI_STALE); + txn->mt_dbi_state[dbi] & (DBI_FRESH | DBI_CREAT | DBI_DIRTY | DBI_STALE); return MDBX_SUCCESS; } @@ -27251,7 +28509,7 @@ static int drop_tree(MDBX_cursor *mc, const bool may_have_subDBs) { return rc; } -int mdbx_drop(MDBX_txn *txn, MDBX_dbi dbi, bool del) { +__cold int mdbx_drop(MDBX_txn *txn, MDBX_dbi dbi, bool del) { int rc = check_txn_rw(txn, MDBX_TXN_BLOCKED); if (unlikely(rc != MDBX_SUCCESS)) return rc; @@ -27271,25 +28529,22 @@ int mdbx_drop(MDBX_txn *txn, MDBX_dbi dbi, bool del) { /* Can't delete the main 
DB */ if (del && dbi >= CORE_DBS) { - rc = delete (txn, MAIN_DBI, &mc->mc_dbx->md_name, NULL, F_SUBDATA); + rc = delete(txn, MAIN_DBI, &mc->mc_dbx->md_name, NULL, F_SUBDATA); if (likely(rc == MDBX_SUCCESS)) { - tASSERT(txn, txn->mt_dbistate[MAIN_DBI] & DBI_DIRTY); + tASSERT(txn, txn->mt_dbi_state[MAIN_DBI] & DBI_DIRTY); tASSERT(txn, txn->mt_flags & MDBX_TXN_DIRTY); - txn->mt_dbistate[dbi] = DBI_STALE; - MDBX_env *env = txn->mt_env; + txn->mt_dbi_state[dbi] = DBI_LINDO | DBI_OLDEN; + MDBX_env *const env = txn->mt_env; rc = osal_fastmutex_acquire(&env->me_dbi_lock); - if (unlikely(rc != MDBX_SUCCESS)) { - txn->mt_flags |= MDBX_TXN_ERROR; + if (likely(rc == MDBX_SUCCESS)) { + rc = env_defer_free_and_release(env, dbi_close_locked(env, dbi)); goto bailout; } - dbi_close_locked(env, dbi); - ENSURE(env, osal_fastmutex_release(&env->me_dbi_lock) == MDBX_SUCCESS); - } else { - txn->mt_flags |= MDBX_TXN_ERROR; } + txn->mt_flags |= MDBX_TXN_ERROR; } else { /* reset the DB record, mark it dirty */ - txn->mt_dbistate[dbi] |= DBI_DIRTY; + txn->mt_dbi_state[dbi] |= DBI_DIRTY; txn->mt_dbs[dbi].md_depth = 0; txn->mt_dbs[dbi].md_branch_pages = 0; txn->mt_dbs[dbi].md_leaf_pages = 0; @@ -27525,12 +28780,14 @@ __cold MDBX_INTERNAL_FUNC int cleanup_dead_readers(MDBX_env *env, return rc; } -__cold int mdbx_setup_debug(MDBX_log_level_t level, MDBX_debug_flags_t flags, - MDBX_debug_func *logger) { - const int rc = runtime_flags | (loglevel << 16); +__cold static int setup_debug(MDBX_log_level_t level, MDBX_debug_flags_t flags, + union logger_union logger, char *buffer, + size_t buffer_size) { + ENSURE(nullptr, osal_fastmutex_acquire(&debug_lock) == 0); + const int rc = mdbx_static.flags | (mdbx_static.loglevel << 16); if (level != MDBX_LOG_DONTCHANGE) - loglevel = (uint8_t)level; + mdbx_static.loglevel = (uint8_t)level; if (flags != MDBX_DBG_DONTCHANGE) { flags &= @@ -27539,14 +28796,37 @@ __cold int mdbx_setup_debug(MDBX_log_level_t level, MDBX_debug_flags_t flags, #endif MDBX_DBG_DUMP | 
MDBX_DBG_LEGACY_MULTIOPEN | MDBX_DBG_LEGACY_OVERLAP | MDBX_DBG_DONT_UPGRADE; - runtime_flags = (uint8_t)flags; + mdbx_static.flags = (uint8_t)flags; + } + + assert(MDBX_LOGGER_DONTCHANGE == ((MDBX_debug_func *)(intptr_t)-1)); + if (logger.ptr != (void *)((intptr_t)-1)) { + mdbx_static.logger.ptr = logger.ptr; + mdbx_static.logger_buffer = buffer; + mdbx_static.logger_buffer_size = buffer_size; } - if (logger != MDBX_LOGGER_DONTCHANGE) - debug_logger = logger; + ENSURE(nullptr, osal_fastmutex_release(&debug_lock) == 0); return rc; } +__cold int mdbx_setup_debug_nofmt(MDBX_log_level_t level, + MDBX_debug_flags_t flags, + MDBX_debug_func_nofmt *logger, char *buffer, + size_t buffer_size) { + union logger_union thunk; + thunk.nofmt = + (logger && buffer && buffer_size) ? logger : MDBX_LOGGER_NOFMT_DONTCHANGE; + return setup_debug(level, flags, thunk, buffer, buffer_size); +} + +__cold int mdbx_setup_debug(MDBX_log_level_t level, MDBX_debug_flags_t flags, + MDBX_debug_func *logger) { + union logger_union thunk; + thunk.fmt = logger; + return setup_debug(level, flags, thunk, nullptr, 0); +} + __cold static txnid_t kick_longlived_readers(MDBX_env *env, const txnid_t laggard) { DEBUG("DB size maxed out by reading #%" PRIaTXN, laggard); @@ -27699,12 +28979,12 @@ typedef struct mdbx_walk_ctx { bool mw_dont_check_keys_ordering; } mdbx_walk_ctx_t; -__cold static int walk_sdb(mdbx_walk_ctx_t *ctx, MDBX_db *const sdb, - const MDBX_val *name, int deep); +__cold static int walk_sdb(mdbx_walk_ctx_t *ctx, MDBX_walk_sdb_t *sdb, + int deep); static MDBX_page_type_t walk_page_type(const MDBX_page *mp) { if (mp) - switch (mp->mp_flags) { + switch (mp->mp_flags & ~P_SPILLED) { case P_BRANCH: return MDBX_page_branch; case P_LEAF: @@ -27713,15 +28993,13 @@ static MDBX_page_type_t walk_page_type(const MDBX_page *mp) { return MDBX_page_dupfixed_leaf; case P_OVERFLOW: return MDBX_page_large; - case P_META: - return MDBX_page_meta; } return MDBX_page_broken; } /* Depth-first tree traversal. 
*/ __cold static int walk_tree(mdbx_walk_ctx_t *ctx, const pgno_t pgno, - const MDBX_val *name, int deep, + MDBX_walk_sdb_t *sdb, int deep, txnid_t parent_txnid) { assert(pgno != P_INVALID); MDBX_page *mp = nullptr; @@ -27745,9 +29023,10 @@ __cold static int walk_tree(mdbx_walk_ctx_t *ctx, const pgno_t pgno, continue; } - MDBX_node *node = page_node(mp, i); + const MDBX_node *node = page_node(mp, i); + header_size += NODESIZE; const size_t node_key_size = node_ks(node); - payload_size += NODESIZE + node_key_size; + payload_size += node_key_size; if (type == MDBX_page_branch) { assert(i > 0 || node_ks(node) == 0); @@ -27780,7 +29059,7 @@ __cold static int walk_tree(mdbx_walk_ctx_t *ctx, const pgno_t pgno, pagesize = pgno2bytes(ctx->mw_txn->mt_env, npages); const size_t over_unused = pagesize - over_payload - over_header; const int rc = ctx->mw_visitor(large_pgno, npages, ctx->mw_user, deep, - name, pagesize, MDBX_page_large, err, 1, + sdb, pagesize, MDBX_page_large, err, 1, over_payload, over_header, over_unused); if (unlikely(rc != MDBX_SUCCESS)) return (rc == MDBX_RESULT_TRUE) ? 
MDBX_SUCCESS : rc; @@ -27789,8 +29068,9 @@ __cold static int walk_tree(mdbx_walk_ctx_t *ctx, const pgno_t pgno, } break; case F_SUBDATA /* sub-db */: { - const size_t namelen = node_key_size; - if (unlikely(namelen == 0 || node_data_size != sizeof(MDBX_db))) { + if (unlikely(node_data_size != sizeof(MDBX_db))) { + ERROR("%s/%d: %s %u", "MDBX_CORRUPTED", MDBX_CORRUPTED, + "invalid subDb node size", (unsigned)node_data_size); assert(err == MDBX_CORRUPTED); err = MDBX_CORRUPTED; } @@ -27800,6 +29080,8 @@ __cold static int walk_tree(mdbx_walk_ctx_t *ctx, const pgno_t pgno, case F_SUBDATA | F_DUPDATA /* dupsorted sub-tree */: if (unlikely(node_data_size != sizeof(MDBX_db))) { + ERROR("%s/%d: %s %u", "MDBX_CORRUPTED", MDBX_CORRUPTED, + "invalid sub-tree node size", (unsigned)node_data_size); assert(err == MDBX_CORRUPTED); err = MDBX_CORRUPTED; } @@ -27809,6 +29091,8 @@ __cold static int walk_tree(mdbx_walk_ctx_t *ctx, const pgno_t pgno, case F_DUPDATA /* short sub-page */: { if (unlikely(node_data_size <= PAGEHDRSZ || (node_data_size & 1))) { + ERROR("%s/%d: %s %u", "MDBX_CORRUPTED", MDBX_CORRUPTED, + "invalid sub-page node size", (unsigned)node_data_size); assert(err == MDBX_CORRUPTED); err = MDBX_CORRUPTED; break; @@ -27831,6 +29115,8 @@ __cold static int walk_tree(mdbx_walk_ctx_t *ctx, const pgno_t pgno, subtype = MDBX_subpage_dupfixed_leaf; break; default: + ERROR("%s/%d: %s 0x%x", "MDBX_CORRUPTED", MDBX_CORRUPTED, + "invalid sub-page flags", sp->mp_flags); assert(err == MDBX_CORRUPTED); subtype = MDBX_subpage_broken; err = MDBX_CORRUPTED; @@ -27848,6 +29134,8 @@ __cold static int walk_tree(mdbx_walk_ctx_t *ctx, const pgno_t pgno, subpayload_size += subnode_size; subalign_bytes += subnode_size & 1; if (unlikely(node_flags(subnode) != 0)) { + ERROR("%s/%d: %s 0x%x", "MDBX_CORRUPTED", MDBX_CORRUPTED, + "unexpected sub-node flags", node_flags(subnode)); assert(err == MDBX_CORRUPTED); err = MDBX_CORRUPTED; } @@ -27855,7 +29143,7 @@ __cold static int 
walk_tree(mdbx_walk_ctx_t *ctx, const pgno_t pgno, } const int rc = - ctx->mw_visitor(pgno, 0, ctx->mw_user, deep + 1, name, node_data_size, + ctx->mw_visitor(pgno, 0, ctx->mw_user, deep + 1, sdb, node_data_size, subtype, err, nsubkeys, subpayload_size, subheader_size, subunused_size + subalign_bytes); if (unlikely(rc != MDBX_SUCCESS)) @@ -27867,13 +29155,15 @@ __cold static int walk_tree(mdbx_walk_ctx_t *ctx, const pgno_t pgno, } break; default: + ERROR("%s/%d: %s 0x%x", "MDBX_CORRUPTED", MDBX_CORRUPTED, + "invalid node flags", node_flags(node)); assert(err == MDBX_CORRUPTED); err = MDBX_CORRUPTED; } } const int rc = ctx->mw_visitor( - pgno, 1, ctx->mw_user, deep, name, ctx->mw_txn->mt_env->me_psize, type, + pgno, 1, ctx->mw_user, deep, sdb, ctx->mw_txn->mt_env->me_psize, type, err, nentries, payload_size, header_size, unused_size + align_bytes); if (unlikely(rc != MDBX_SUCCESS)) return (rc == MDBX_RESULT_TRUE) ? MDBX_SUCCESS : rc; @@ -27885,7 +29175,7 @@ __cold static int walk_tree(mdbx_walk_ctx_t *ctx, const pgno_t pgno, MDBX_node *node = page_node(mp, i); if (type == MDBX_page_branch) { assert(err == MDBX_SUCCESS); - err = walk_tree(ctx, node_pgno(node), name, deep + 1, mp->mp_txnid); + err = walk_tree(ctx, node_pgno(node), sdb, deep + 1, mp->mp_txnid); if (unlikely(err != MDBX_SUCCESS)) { if (err == MDBX_RESULT_TRUE) break; @@ -27901,32 +29191,44 @@ __cold static int walk_tree(mdbx_walk_ctx_t *ctx, const pgno_t pgno, case F_SUBDATA /* sub-db */: if (unlikely(node_ds(node) != sizeof(MDBX_db))) { + ERROR("%s/%d: %s %u", "MDBX_CORRUPTED", MDBX_CORRUPTED, + "invalid sub-tree node size", (unsigned)node_ds(node)); assert(err == MDBX_CORRUPTED); err = MDBX_CORRUPTED; } else { - MDBX_db db; - memcpy(&db, node_data(node), sizeof(db)); - const MDBX_val subdb_name = {node_key(node), node_ks(node)}; + MDBX_db aligned_db; + memcpy(&aligned_db, node_data(node), sizeof(aligned_db)); + MDBX_walk_sdb_t sdb_info = { + {node_key(node), node_ks(node)}, nullptr, nullptr}; + 
sdb_info.internal = &aligned_db; assert(err == MDBX_SUCCESS); - err = walk_sdb(ctx, &db, &subdb_name, deep + 1); + err = walk_sdb(ctx, &sdb_info, deep + 1); } break; case F_SUBDATA | F_DUPDATA /* dupsorted sub-tree */: - if (unlikely(node_ds(node) != sizeof(MDBX_db) || - ctx->mw_cursor->mc_xcursor == NULL)) { + if (unlikely(node_ds(node) != sizeof(MDBX_db))) { + ERROR("%s/%d: %s %u", "MDBX_CORRUPTED", MDBX_CORRUPTED, + "invalid dupsort sub-tree node size", (unsigned)node_ds(node)); + assert(err == MDBX_CORRUPTED); + err = MDBX_CORRUPTED; + } else if (unlikely(!ctx->mw_cursor->mc_xcursor)) { + ERROR("%s/%d: %s", "MDBX_CORRUPTED", MDBX_CORRUPTED, + "unexpected dupsort sub-tree node for non-dupsort subDB"); assert(err == MDBX_CORRUPTED); err = MDBX_CORRUPTED; } else { - MDBX_db db; - memcpy(&db, node_data(node), sizeof(db)); + MDBX_db aligned_db; + memcpy(&aligned_db, node_data(node), sizeof(aligned_db)); assert(ctx->mw_cursor->mc_xcursor == &container_of(ctx->mw_cursor, MDBX_cursor_couple, outer)->inner); assert(err == MDBX_SUCCESS); err = cursor_xinit1(ctx->mw_cursor, node, mp); if (likely(err == MDBX_SUCCESS)) { ctx->mw_cursor = &ctx->mw_cursor->mc_xcursor->mx_cursor; - err = walk_tree(ctx, db.md_root, name, deep + 1, mp->mp_txnid); + sdb->nested = &aligned_db; + err = walk_tree(ctx, aligned_db.md_root, sdb, deep + 1, mp->mp_txnid); + sdb->nested = nullptr; MDBX_xcursor *inner_xcursor = container_of(ctx->mw_cursor, MDBX_xcursor, mx_cursor); MDBX_cursor_couple *couple = @@ -27941,15 +29243,16 @@ __cold static int walk_tree(mdbx_walk_ctx_t *ctx, const pgno_t pgno, return MDBX_SUCCESS; } -__cold static int walk_sdb(mdbx_walk_ctx_t *ctx, MDBX_db *const sdb, - const MDBX_val *name, int deep) { - if (unlikely(sdb->md_root == P_INVALID)) +__cold static int walk_sdb(mdbx_walk_ctx_t *ctx, MDBX_walk_sdb_t *sdb, + int deep) { + struct MDBX_db *const db = sdb->internal; + if (unlikely(db->md_root == P_INVALID)) return MDBX_SUCCESS; /* empty db */ MDBX_cursor_couple couple; 
MDBX_dbx dbx = {.md_klen_min = INT_MAX}; - uint8_t dbistate = DBI_VALID | DBI_AUDITED; - int rc = couple_init(&couple, ~0u, ctx->mw_txn, sdb, &dbx, &dbistate); + uint8_t dbi_state = DBI_LINDO | DBI_VALID; + int rc = couple_init(&couple, ~0u, ctx->mw_txn, db, &dbx, &dbi_state); if (unlikely(rc != MDBX_SUCCESS)) return rc; @@ -27961,8 +29264,8 @@ __cold static int walk_sdb(mdbx_walk_ctx_t *ctx, MDBX_db *const sdb, : CC_PAGECHECK; couple.outer.mc_next = ctx->mw_cursor; ctx->mw_cursor = &couple.outer; - rc = walk_tree(ctx, sdb->md_root, name, deep, - sdb->md_mod_txnid ? sdb->md_mod_txnid : ctx->mw_txn->mt_txnid); + rc = walk_tree(ctx, db->md_root, sdb, deep, + db->md_mod_txnid ? db->md_mod_txnid : ctx->mw_txn->mt_txnid); ctx->mw_cursor = couple.outer.mc_next; return rc; } @@ -27980,15 +29283,13 @@ __cold int mdbx_env_pgwalk(MDBX_txn *txn, MDBX_pgvisitor_func *visitor, ctx.mw_visitor = visitor; ctx.mw_dont_check_keys_ordering = dont_check_keys_ordering; - rc = visitor(0, NUM_METAS, user, 0, MDBX_PGWALK_META, - pgno2bytes(txn->mt_env, NUM_METAS), MDBX_page_meta, MDBX_SUCCESS, - NUM_METAS, sizeof(MDBX_meta) * NUM_METAS, PAGEHDRSZ * NUM_METAS, - (txn->mt_env->me_psize - sizeof(MDBX_meta) - PAGEHDRSZ) * - NUM_METAS); - if (!MDBX_IS_ERROR(rc)) - rc = walk_sdb(&ctx, &txn->mt_dbs[FREE_DBI], MDBX_PGWALK_GC, 0); - if (!MDBX_IS_ERROR(rc)) - rc = walk_sdb(&ctx, &txn->mt_dbs[MAIN_DBI], MDBX_PGWALK_MAIN, 0); + MDBX_walk_sdb_t sdb = {{MDBX_CHK_GC, 0}, &txn->mt_dbs[FREE_DBI], nullptr}; + rc = walk_sdb(&ctx, &sdb, 0); + if (!MDBX_IS_ERROR(rc)) { + sdb.name.iov_base = MDBX_CHK_MAIN; + sdb.internal = &txn->mt_dbs[MAIN_DBI]; + rc = walk_sdb(&ctx, &sdb, 0); + } return rc; } @@ -28042,6 +29343,29 @@ int mdbx_cursor_on_first(const MDBX_cursor *mc) { return MDBX_RESULT_TRUE; } +int mdbx_cursor_on_first_dup(const MDBX_cursor *mc) { + if (unlikely(mc == NULL)) + return MDBX_EINVAL; + + if (unlikely(mc->mc_signature != MDBX_MC_LIVE)) + return (mc->mc_signature == MDBX_MC_READY4CLOSE) ? 
MDBX_EINVAL + : MDBX_EBADSIGN; + + if (!(mc->mc_flags & C_INITIALIZED)) + return MDBX_RESULT_TRUE; + + if (!mc->mc_xcursor) + return MDBX_RESULT_TRUE; + + mc = &mc->mc_xcursor->mx_cursor; + for (size_t i = 0; i < mc->mc_snum; ++i) { + if (mc->mc_ki[i]) + return MDBX_RESULT_FALSE; + } + + return MDBX_RESULT_TRUE; +} + int mdbx_cursor_on_last(const MDBX_cursor *mc) { if (unlikely(mc == NULL)) return MDBX_EINVAL; @@ -28062,6 +29386,30 @@ int mdbx_cursor_on_last(const MDBX_cursor *mc) { return MDBX_RESULT_TRUE; } +int mdbx_cursor_on_last_dup(const MDBX_cursor *mc) { + if (unlikely(mc == NULL)) + return MDBX_EINVAL; + + if (unlikely(mc->mc_signature != MDBX_MC_LIVE)) + return (mc->mc_signature == MDBX_MC_READY4CLOSE) ? MDBX_EINVAL + : MDBX_EBADSIGN; + + if (!(mc->mc_flags & C_INITIALIZED)) + return MDBX_RESULT_FALSE; + + if (!mc->mc_xcursor) + return MDBX_RESULT_TRUE; + + mc = &mc->mc_xcursor->mx_cursor; + for (size_t i = 0; i < mc->mc_snum; ++i) { + size_t nkeys = page_numkeys(mc->mc_pg[i]); + if (mc->mc_ki[i] < nkeys - 1) + return MDBX_RESULT_FALSE; + } + + return MDBX_RESULT_TRUE; +} + int mdbx_cursor_eof(const MDBX_cursor *mc) { if (unlikely(mc == NULL)) return MDBX_EINVAL; @@ -28335,9 +29683,6 @@ int mdbx_estimate_range(const MDBX_txn *txn, MDBX_dbi dbi, if (unlikely(begin_key == MDBX_EPSILON && end_key == MDBX_EPSILON)) return MDBX_EINVAL; - if (unlikely(!check_dbi(txn, dbi, DBI_USRVALID))) - return MDBX_BAD_DBI; - MDBX_cursor_couple begin; /* LY: first, initialize cursor to refresh a DB in case it have DB_STALE */ rc = cursor_init(&begin.outer, txn, dbi); @@ -28516,7 +29861,7 @@ int mdbx_replace_ex(MDBX_txn *txn, MDBX_dbi dbi, const MDBX_val *key, (flags & (MDBX_CURRENT | MDBX_RESERVE)) != MDBX_CURRENT)) return MDBX_EINVAL; - if (unlikely(!check_dbi(txn, dbi, DBI_USRVALID))) + if (unlikely(dbi <= FREE_DBI)) return MDBX_BAD_DBI; if (unlikely(flags & @@ -28696,10 +30041,11 @@ int mdbx_dbi_sequence(MDBX_txn *txn, MDBX_dbi dbi, uint64_t *result, if (unlikely(rc != 
MDBX_SUCCESS)) return rc; - if (unlikely(!check_dbi(txn, dbi, DBI_USRVALID))) - return MDBX_BAD_DBI; + rc = dbi_check(txn, dbi); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; - if (unlikely(txn->mt_dbistate[dbi] & DBI_STALE)) { + if (unlikely(txn->mt_dbi_state[dbi] & DBI_STALE)) { rc = fetch_sdb(txn, dbi); if (unlikely(rc != MDBX_SUCCESS)) return rc; @@ -28710,7 +30056,7 @@ int mdbx_dbi_sequence(MDBX_txn *txn, MDBX_dbi dbi, uint64_t *result, *result = dbs->md_seq; if (likely(increment > 0)) { - if (unlikely(txn->mt_flags & MDBX_TXN_RDONLY)) + if (unlikely(dbi == FREE_DBI || (txn->mt_flags & MDBX_TXN_RDONLY) != 0)) return MDBX_EACCESS; uint64_t new = dbs->md_seq + increment; @@ -28720,7 +30066,7 @@ int mdbx_dbi_sequence(MDBX_txn *txn, MDBX_dbi dbi, uint64_t *result, tASSERT(txn, new > dbs->md_seq); dbs->md_seq = new; txn->mt_flags |= MDBX_TXN_DIRTY; - txn->mt_dbistate[dbi] |= DBI_DIRTY; + txn->mt_dbi_state[dbi] |= DBI_DIRTY; } return MDBX_SUCCESS; @@ -29005,7 +30351,7 @@ __cold int mdbx_env_set_option(MDBX_env *env, const MDBX_option_t option, return err; const bool lock_needed = ((env->me_flags & MDBX_ENV_ACTIVE) && env->me_txn0 && - env->me_txn0->mt_owner != osal_thread_self()); + !env_txn0_owned(env)); bool should_unlock = false; switch (option) { case MDBX_opt_sync_bytes: @@ -29080,7 +30426,7 @@ __cold int mdbx_env_set_option(MDBX_env *env, const MDBX_option_t option, return MDBX_EINVAL; if (env->me_options.dp_reserve_limit != (unsigned)value) { if (lock_needed) { - err = mdbx_txn_lock(env, false); + err = osal_txn_lock(env, false); if (unlikely(err != MDBX_SUCCESS)) return err; should_unlock = true; @@ -29111,6 +30457,23 @@ __cold int mdbx_env_set_option(MDBX_env *env, const MDBX_option_t option, } break; + case MDBX_opt_gc_time_limit: + if (value == /* default */ UINT64_MAX) + value = 0; + if (unlikely(value > UINT32_MAX)) + return MDBX_EINVAL; + if (unlikely(env->me_flags & MDBX_RDONLY)) + return MDBX_EACCESS; + value = 
osal_16dot16_to_monotime((uint32_t)value); + if (value != env->me_options.gc_time_limit) { + if (env->me_txn && lock_needed) + return MDBX_EPERM; + env->me_options.gc_time_limit = value; + if (!env->me_options.flags.non_auto.rp_augment_limit) + env->me_options.rp_augment_limit = default_rp_augment_limit(env); + } + break; + case MDBX_opt_txn_dp_limit: case MDBX_opt_txn_dp_initial: if (value == /* default */ UINT64_MAX) @@ -29120,7 +30483,7 @@ __cold int mdbx_env_set_option(MDBX_env *env, const MDBX_option_t option, if (unlikely(env->me_flags & MDBX_RDONLY)) return MDBX_EACCESS; if (lock_needed) { - err = mdbx_txn_lock(env, false); + err = osal_txn_lock(env, false); if (unlikely(err != MDBX_SUCCESS)) return err; should_unlock = true; @@ -29220,7 +30583,7 @@ __cold int mdbx_env_set_option(MDBX_env *env, const MDBX_option_t option, } if (should_unlock) - mdbx_txn_unlock(env); + osal_txn_unlock(env); return err; } @@ -29263,6 +30626,10 @@ __cold int mdbx_env_get_option(const MDBX_env *env, const MDBX_option_t option, *pvalue = env->me_options.rp_augment_limit; break; + case MDBX_opt_gc_time_limit: + *pvalue = osal_monotime_to_16dot16(env->me_options.gc_time_limit); + break; + case MDBX_opt_txn_dp_limit: *pvalue = env->me_options.dp_limit; break; @@ -29550,14 +30917,42 @@ __cold int mdbx_env_warmup(const MDBX_env *env, const MDBX_txn *txn, return rc; } +#if !defined(_WIN32) && !defined(_WIN64) +__cold static void rthc_afterfork(void) { + NOTICE("drown %d rthc entries", rthc_count); + for (size_t i = 0; i < rthc_count; ++i) { + MDBX_env *const env = rthc_table[i].env; + NOTICE("drown env %p", __Wpedantic_format_voidptr(env)); + if (env->me_lck_mmap.lck) + osal_munmap(&env->me_lck_mmap); + if (env->me_map) { + osal_munmap(&env->me_dxb_mmap); +#ifdef ENABLE_MEMCHECK + VALGRIND_DISCARD(env->me_valgrind_handle); + env->me_valgrind_handle = -1; +#endif /* ENABLE_MEMCHECK */ + } + env->me_lck = lckless_stub(env); + rthc_drown(env); + } + if (rthc_table != rthc_table_static) + 
osal_free(rthc_table); + rthc_count = 0; + rthc_table = rthc_table_static; + rthc_limit = RTHC_INITIAL_LIMIT; + rthc_pending.weak = 0; +} +#endif /* ! Windows */ + __cold void global_ctor(void) { + ENSURE(nullptr, osal_fastmutex_init(&debug_lock) == 0); osal_ctor(); rthc_limit = RTHC_INITIAL_LIMIT; rthc_table = rthc_table_static; #if defined(_WIN32) || defined(_WIN64) InitializeCriticalSection(&rthc_critical_section); - InitializeCriticalSection(&lcklist_critical_section); #else + ENSURE(nullptr, pthread_atfork(nullptr, nullptr, rthc_afterfork) == 0); ENSURE(nullptr, pthread_key_create(&rthc_key, thread_dtor) == 0); TRACE("pid %d, &mdbx_rthc_key = %p, value 0x%x", osal_getpid(), __Wpedantic_format_voidptr(&rthc_key), (unsigned)rthc_key); @@ -29743,6 +31138,2139 @@ mdbx_key_from_int32(const int32_t i32) { #endif /* LIBMDBX_NO_EXPORTS_LEGACY_API */ +/*------------------------------------------------------------------------------ + * Locking API */ + +int mdbx_txn_lock(MDBX_env *env, bool dont_wait) { + int rc = check_env(env, true); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; + + if (unlikely(env->me_flags & MDBX_RDONLY)) + return MDBX_EACCESS; + if (unlikely(env->me_txn0->mt_owner || + (env->me_txn0->mt_flags & MDBX_TXN_FINISHED) == 0)) + return MDBX_BUSY; + + return osal_txn_lock(env, dont_wait); +} + +int mdbx_txn_unlock(MDBX_env *env) { + int rc = check_env(env, true); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; + + if (unlikely(env->me_flags & MDBX_RDONLY)) + return MDBX_EACCESS; + if (unlikely(env->me_txn0->mt_owner != osal_thread_self())) + return MDBX_THREAD_MISMATCH; + if (unlikely((env->me_txn0->mt_flags & MDBX_TXN_FINISHED) == 0)) + return MDBX_BUSY; + + osal_txn_unlock(env); + return MDBX_SUCCESS; +} + +/******************************************************************************* + * Checking API */ + +typedef struct MDBX_chk_internal { + MDBX_chk_context_t *usr; + const struct MDBX_chk_callbacks *cb; + uint64_t monotime_timeout; + + size_t 
*problem_counter; + uint8_t flags; + bool got_break; + bool write_locked; + uint8_t scope_depth; + + MDBX_chk_subdb_t subdb_gc, subdb_main; + int16_t *pagemap; + MDBX_chk_subdb_t *last_lookup; + const void *last_nested; + MDBX_chk_scope_t scope_stack[12]; + MDBX_chk_subdb_t *subdb[MDBX_MAX_DBI + CORE_DBS]; + + MDBX_envinfo envinfo; + meta_troika_t troika; + MDBX_val v2a_buf; +} MDBX_chk_internal_t; + +__cold static int chk_check_break(MDBX_chk_scope_t *const scope) { + MDBX_chk_internal_t *const chk = scope->internal; + return (chk->got_break || (chk->cb->check_break && + (chk->got_break = chk->cb->check_break(chk->usr)))) + ? MDBX_RESULT_TRUE + : MDBX_RESULT_FALSE; +} + +__cold static void chk_line_end(MDBX_chk_line_t *line) { + if (likely(line)) { + MDBX_chk_internal_t *chk = line->ctx->internal; + assert(line->begin <= line->end && line->begin <= line->out && + line->out <= line->end); + if (likely(chk->cb->print_done)) + chk->cb->print_done(line); + } +} + +__cold __must_check_result static MDBX_chk_line_t * +chk_line_begin(MDBX_chk_scope_t *const scope, enum MDBX_chk_severity severity) { + MDBX_chk_internal_t *const chk = scope->internal; + if (severity < MDBX_chk_warning) + mdbx_env_chk_encount_problem(chk->usr); + MDBX_chk_line_t *line = nullptr; + if (likely(chk->cb->print_begin)) { + line = chk->cb->print_begin(chk->usr, severity); + if (likely(line)) { + assert(line->ctx == nullptr || (line->ctx == chk->usr && line->empty)); + assert(line->begin <= line->end && line->begin <= line->out && + line->out <= line->end); + line->ctx = chk->usr; + } + } + return line; +} + +__cold static MDBX_chk_line_t *chk_line_feed(MDBX_chk_line_t *line) { + if (likely(line)) { + MDBX_chk_internal_t *chk = line->ctx->internal; + enum MDBX_chk_severity severity = line->severity; + chk_line_end(line); + line = chk_line_begin(chk->usr->scope, severity); + } + return line; +} + +__cold static MDBX_chk_line_t *chk_flush(MDBX_chk_line_t *line) { + if (likely(line)) { + 
MDBX_chk_internal_t *chk = line->ctx->internal; + assert(line->begin <= line->end && line->begin <= line->out && + line->out <= line->end); + if (likely(chk->cb->print_flush)) { + chk->cb->print_flush(line); + assert(line->begin <= line->end && line->begin <= line->out && + line->out <= line->end); + line->out = line->begin; + } + } + return line; +} + +__cold static size_t chk_print_wanna(MDBX_chk_line_t *line, size_t need) { + if (likely(line && need)) { + size_t have = line->end - line->out; + assert(line->begin <= line->end && line->begin <= line->out && + line->out <= line->end); + if (need > have) { + line = chk_flush(line); + have = line->end - line->out; + } + return (need < have) ? need : have; + } + return 0; +} + +__cold static MDBX_chk_line_t *chk_puts(MDBX_chk_line_t *line, + const char *str) { + if (likely(line && str && *str)) { + MDBX_chk_internal_t *chk = line->ctx->internal; + size_t left = strlen(str); + assert(line->begin <= line->end && line->begin <= line->out && + line->out <= line->end); + if (chk->cb->print_chars) { + chk->cb->print_chars(line, str, left); + assert(line->begin <= line->end && line->begin <= line->out && + line->out <= line->end); + } else + do { + size_t chunk = chk_print_wanna(line, left); + assert(chunk <= left); + if (unlikely(!chunk)) + break; + memcpy(line->out, str, chunk); + line->out += chunk; + assert(line->begin <= line->end && line->begin <= line->out && + line->out <= line->end); + str += chunk; + left -= chunk; + } while (left); + line->empty = false; + } + return line; +} + +__cold static MDBX_chk_line_t *chk_print_va(MDBX_chk_line_t *line, + const char *fmt, va_list args) { + if (likely(line)) { + MDBX_chk_internal_t *chk = line->ctx->internal; + assert(line->begin <= line->end && line->begin <= line->out && + line->out <= line->end); + if (chk->cb->print_format) { + chk->cb->print_format(line, fmt, args); + assert(line->begin <= line->end && line->begin <= line->out && + line->out <= line->end); + } else { + 
va_list ones; + va_copy(ones, args); + const int needed = vsnprintf(nullptr, 0, fmt, ones); + va_end(ones); + if (likely(needed > 0)) { + const size_t have = chk_print_wanna(line, needed); + if (likely(have > 0)) { + int written = vsnprintf(line->out, have, fmt, args); + if (likely(written > 0)) + line->out += written; + assert(line->begin <= line->end && line->begin <= line->out && + line->out <= line->end); + } + } + } + line->empty = false; + } + return line; +} + +__cold static MDBX_chk_line_t *MDBX_PRINTF_ARGS(2, 3) + chk_print(MDBX_chk_line_t *line, const char *fmt, ...) { + if (likely(line)) { + // MDBX_chk_internal_t *chk = line->ctx->internal; + va_list args; + va_start(args, fmt); + line = chk_print_va(line, fmt, args); + va_end(args); + line->empty = false; + } + return line; +} + +__cold static MDBX_chk_line_t *chk_print_size(MDBX_chk_line_t *line, + const char *prefix, + const uint64_t value, + const char *suffix) { + static const char sf[] = + "KMGTPEZY"; /* LY: Kilo, Mega, Giga, Tera, Peta, Exa, Zetta, Yotta! */ + if (likely(line)) { + MDBX_chk_internal_t *chk = line->ctx->internal; + prefix = prefix ? prefix : ""; + suffix = suffix ? 
suffix : ""; + if (chk->cb->print_size) + chk->cb->print_size(line, prefix, value, suffix); + else + for (unsigned i = 0;; ++i) { + const unsigned scale = 10 + i * 10; + const uint64_t rounded = value + (UINT64_C(5) << (scale - 10)); + const uint64_t integer = rounded >> scale; + const uint64_t fractional = + (rounded - (integer << scale)) * 100u >> scale; + if ((rounded >> scale) <= 1000) + return chk_print(line, "%s%" PRIu64 " (%u.%02u %ciB)%s", prefix, + value, (unsigned)integer, (unsigned)fractional, + sf[i], suffix); + } + line->empty = false; + } + return line; +} + +__cold static int chk_error_rc(MDBX_chk_scope_t *const scope, int err, + const char *subj) { + MDBX_chk_line_t *line = chk_line_begin(scope, MDBX_chk_error); + if (line) + chk_line_end(chk_flush(chk_print(line, "%s() failed, error %s (%d)", subj, + mdbx_strerror(err), err))); + else + debug_log(MDBX_LOG_ERROR, "mdbx_env_chk", 0, "%s() failed, error %s (%d)", + subj, mdbx_strerror(err), err); + return err; +} + +__cold static void MDBX_PRINTF_ARGS(5, 6) + chk_object_issue(MDBX_chk_scope_t *const scope, const char *object, + uint64_t entry_number, const char *caption, + const char *extra_fmt, ...) 
{ + MDBX_chk_internal_t *const chk = scope->internal; + MDBX_chk_issue_t *issue = chk->usr->scope->issues; + while (issue) { + if (issue->caption == caption) { + issue->count += 1; + break; + } else + issue = issue->next; + } + const bool fresh = issue == nullptr; + if (fresh) { + issue = osal_malloc(sizeof(*issue)); + if (likely(issue)) { + issue->caption = caption; + issue->count = 1; + issue->next = chk->usr->scope->issues; + chk->usr->scope->issues = issue; + } else + chk_error_rc(scope, ENOMEM, "adding issue"); + } + + va_list args; + va_start(args, extra_fmt); + if (chk->cb->issue) { + mdbx_env_chk_encount_problem(chk->usr); + chk->cb->issue(chk->usr, object, entry_number, caption, extra_fmt, args); + } else { + MDBX_chk_line_t *line = chk_line_begin(scope, MDBX_chk_error); + if (entry_number != UINT64_MAX) + chk_print(line, "%s #%" PRIu64 ": %s", object, entry_number, caption); + else + chk_print(line, "%s: %s", object, caption); + if (extra_fmt) + chk_puts(chk_print_va(chk_puts(line, " ("), extra_fmt, args), ")"); + chk_line_end(fresh ? chk_flush(line) : line); + } + va_end(args); +} + +__cold static void MDBX_PRINTF_ARGS(2, 3) + chk_scope_issue(MDBX_chk_scope_t *const scope, const char *fmt, ...) { + MDBX_chk_internal_t *const chk = scope->internal; + va_list args; + va_start(args, fmt); + if (likely(chk->cb->issue)) { + mdbx_env_chk_encount_problem(chk->usr); + chk->cb->issue(chk->usr, nullptr, 0, nullptr, fmt, args); + } else + chk_line_end( + chk_print_va(chk_line_begin(scope, MDBX_chk_error), fmt, args)); + va_end(args); +} + +__cold static int chk_scope_end(MDBX_chk_internal_t *chk, int err) { + assert(chk->scope_depth > 0); + MDBX_chk_scope_t *const inner = chk->scope_stack + chk->scope_depth; + MDBX_chk_scope_t *const outer = chk->scope_depth ? 
inner - 1 : nullptr; + if (!outer || outer->stage != inner->stage) { + if (err == MDBX_SUCCESS && *chk->problem_counter) + err = MDBX_PROBLEM; + else if (*chk->problem_counter == 0 && MDBX_IS_ERROR(err)) + *chk->problem_counter = 1; + if (chk->problem_counter != &chk->usr->result.total_problems) { + chk->usr->result.total_problems += *chk->problem_counter; + chk->problem_counter = &chk->usr->result.total_problems; + } + if (chk->cb->stage_end) + err = chk->cb->stage_end(chk->usr, inner->stage, err); + } + if (chk->cb->scope_conclude) + err = chk->cb->scope_conclude(chk->usr, outer, inner, err); + chk->usr->scope = outer; + chk->usr->scope_nesting = chk->scope_depth -= 1; + if (outer) + outer->subtotal_issues += inner->subtotal_issues; + if (chk->cb->scope_pop) + chk->cb->scope_pop(chk->usr, outer, inner); + + while (inner->issues) { + MDBX_chk_issue_t *next = inner->issues->next; + osal_free(inner->issues); + inner->issues = next; + } + memset(inner, -1, sizeof(*inner)); + return err; +} + +__cold static int chk_scope_begin_args(MDBX_chk_internal_t *chk, + int verbosity_adjustment, + enum MDBX_chk_stage stage, + const void *object, size_t *problems, + const char *fmt, va_list args) { + if (unlikely(chk->scope_depth + 1u >= ARRAY_LENGTH(chk->scope_stack))) + return MDBX_BACKLOG_DEPLETED; + + MDBX_chk_scope_t *const outer = chk->scope_stack + chk->scope_depth; + const int verbosity = + outer->verbosity + + (verbosity_adjustment - 1) * (1 << MDBX_chk_severity_prio_shift); + MDBX_chk_scope_t *const inner = outer + 1; + memset(inner, 0, sizeof(*inner)); + inner->internal = outer->internal; + inner->stage = stage ? stage : (stage = outer->stage); + inner->object = object; + inner->verbosity = (verbosity < MDBX_chk_warning) + ? 
MDBX_chk_warning + : (enum MDBX_chk_severity)verbosity; + if (problems) + chk->problem_counter = problems; + else if (!chk->problem_counter || outer->stage != stage) + chk->problem_counter = &chk->usr->result.total_problems; + + if (chk->cb->scope_push) { + const int err = chk->cb->scope_push(chk->usr, outer, inner, fmt, args); + if (unlikely(err != MDBX_SUCCESS)) + return err; + } + chk->usr->scope = inner; + chk->usr->scope_nesting = chk->scope_depth += 1; + + if (stage != outer->stage && chk->cb->stage_begin) { + int err = chk->cb->stage_begin(chk->usr, stage); + if (unlikely(err != MDBX_SUCCESS)) { + err = chk_scope_end(chk, err); + assert(err != MDBX_SUCCESS); + return err ? err : MDBX_RESULT_TRUE; + } + } + return MDBX_SUCCESS; +} + +__cold static int MDBX_PRINTF_ARGS(6, 7) + chk_scope_begin(MDBX_chk_internal_t *chk, int verbosity_adjustment, + enum MDBX_chk_stage stage, const void *object, + size_t *problems, const char *fmt, ...) { + va_list args; + va_start(args, fmt); + int rc = chk_scope_begin_args(chk, verbosity_adjustment, stage, object, + problems, fmt, args); + va_end(args); + return rc; +} + +__cold static int chk_scope_restore(MDBX_chk_scope_t *const target, int err) { + MDBX_chk_internal_t *const chk = target->internal; + assert(target <= chk->usr->scope); + while (chk->usr->scope > target) + err = chk_scope_end(chk, err); + return err; +} + +__cold void chk_scope_pop(MDBX_chk_scope_t *const inner) { + if (inner && inner > inner->internal->scope_stack) + chk_scope_restore(inner - 1, MDBX_SUCCESS); +} + +__cold static MDBX_chk_scope_t *MDBX_PRINTF_ARGS(3, 4) + chk_scope_push(MDBX_chk_scope_t *const scope, int verbosity_adjustment, + const char *fmt, ...) { + chk_scope_restore(scope, MDBX_SUCCESS); + va_list args; + va_start(args, fmt); + int err = chk_scope_begin_args(scope->internal, verbosity_adjustment, + scope->stage, nullptr, nullptr, fmt, args); + va_end(args); + return err ? 
nullptr : scope + 1; +} + +__cold static const char *chk_v2a(MDBX_chk_internal_t *chk, + const MDBX_val *val) { + if (val == MDBX_CHK_MAIN) + return "@MAIN"; + if (val == MDBX_CHK_GC) + return "@GC"; + if (val == MDBX_CHK_META) + return "@META"; + + const unsigned char *const data = val->iov_base; + const size_t len = val->iov_len; + if (data == MDBX_CHK_MAIN) + return "@MAIN"; + if (data == MDBX_CHK_GC) + return "@GC"; + if (data == MDBX_CHK_META) + return "@META"; + + if (!len) + return ""; + if (!data) + return ""; + if (len > 65536) { + const size_t enough = 42; + if (chk->v2a_buf.iov_len < enough) { + void *ptr = osal_realloc(chk->v2a_buf.iov_base, enough); + if (unlikely(!ptr)) + return ""; + chk->v2a_buf.iov_base = ptr; + chk->v2a_buf.iov_len = enough; + } + snprintf(chk->v2a_buf.iov_base, chk->v2a_buf.iov_len, + "", len); + return chk->v2a_buf.iov_base; + } + + bool printable = true; + bool quoting = false; + size_t xchars = 0; + for (size_t i = 0; i < len && printable; ++i) { + quoting = quoting || !(data[i] == '_' || isalnum(data[i])); + printable = + isprint(data[i]) || (data[i] < ' ' && ++xchars < 4 && len > xchars * 4); + } + + size_t need = len + 1; + if (quoting || !printable) + need += len + /* quotes */ 2 + 2 * /* max xchars */ 4; + if (need > chk->v2a_buf.iov_len) { + void *ptr = osal_realloc(chk->v2a_buf.iov_base, need); + if (unlikely(!ptr)) + return ""; + chk->v2a_buf.iov_base = ptr; + chk->v2a_buf.iov_len = need; + } + + static const char hex[] = "0123456789abcdef"; + char *w = chk->v2a_buf.iov_base; + if (!quoting) { + memcpy(w, data, len); + w += len; + } else if (printable) { + *w++ = '\''; + for (size_t i = 0; i < len; ++i) { + if (data[i] < ' ') { + assert((char *)chk->v2a_buf.iov_base + chk->v2a_buf.iov_len > w + 4); + w[0] = '\\'; + w[1] = 'x'; + w[2] = hex[data[i] >> 4]; + w[3] = hex[data[i] & 15]; + w += 4; + } else if (strchr("\"'`\\", data[i])) { + assert((char *)chk->v2a_buf.iov_base + chk->v2a_buf.iov_len > w + 2); + w[0] = '\\'; 
+ w[1] = data[i]; + w += 2; + } else { + assert((char *)chk->v2a_buf.iov_base + chk->v2a_buf.iov_len > w + 1); + *w++ = data[i]; + } + } + *w++ = '\''; + } else { + *w++ = '\\'; + *w++ = 'x'; + for (size_t i = 0; i < len; ++i) { + assert((char *)chk->v2a_buf.iov_base + chk->v2a_buf.iov_len > w + 2); + w[0] = hex[data[i] >> 4]; + w[1] = hex[data[i] & 15]; + w += 2; + } + } + assert((char *)chk->v2a_buf.iov_base + chk->v2a_buf.iov_len > w); + *w = 0; + return chk->v2a_buf.iov_base; +} + +__cold static void chk_dispose(MDBX_chk_internal_t *chk) { + assert(chk->subdb[FREE_DBI] == &chk->subdb_gc); + assert(chk->subdb[MAIN_DBI] == &chk->subdb_main); + for (size_t i = 0; i < ARRAY_LENGTH(chk->subdb); ++i) { + MDBX_chk_subdb_t *const sdb = chk->subdb[i]; + if (sdb) { + chk->subdb[i] = nullptr; + if (chk->cb->subdb_dispose && sdb->cookie) { + chk->cb->subdb_dispose(chk->usr, sdb); + sdb->cookie = nullptr; + } + if (sdb != &chk->subdb_gc && sdb != &chk->subdb_main) { + osal_free(sdb); + } + } + } + osal_free(chk->v2a_buf.iov_base); + osal_free(chk->pagemap); + chk->usr->internal = nullptr; + chk->usr->scope = nullptr; + chk->pagemap = nullptr; + memset(chk, 0xDD, sizeof(*chk)); + osal_free(chk); +} + +static size_t div_8s(size_t numerator, size_t divider) { + assert(numerator <= (SIZE_MAX >> 8)); + return (numerator << 8) / divider; +} + +static size_t mul_8s(size_t quotient, size_t multiplier) { + size_t hi = multiplier * (quotient >> 8); + size_t lo = multiplier * (quotient & 255) + 128; + return hi + (lo >> 8); +} + +static void histogram_reduce(struct MDBX_chk_histogram *p) { + const size_t size = ARRAY_LENGTH(p->ranges), last = size - 1; + // ищем пару для слияния с минимальной ошибкой + size_t min_err = SIZE_MAX, min_i = last - 1; + for (size_t i = 0; i < last; ++i) { + const size_t b1 = p->ranges[i].begin, e1 = p->ranges[i].end, + s1 = p->ranges[i].amount; + const size_t b2 = p->ranges[i + 1].begin, e2 = p->ranges[i + 1].end, + s2 = p->ranges[i + 1].amount; + const 
size_t l1 = e1 - b1, l2 = e2 - b2, lx = e2 - b1, sx = s1 + s2; + assert(s1 > 0 && b1 > 0 && b1 < e1); + assert(s2 > 0 && b2 > 0 && b2 < e2); + assert(e1 <= b2); + // за ошибку принимаем площадь изменений на гистограмме при слиянии + const size_t h1 = div_8s(s1, l1), h2 = div_8s(s2, l2), hx = div_8s(sx, lx); + const size_t d1 = mul_8s((h1 > hx) ? h1 - hx : hx - h1, l1); + const size_t d2 = mul_8s((h2 > hx) ? h2 - hx : hx - h2, l2); + const size_t dx = mul_8s(hx, b2 - e1); + const size_t err = d1 + d2 + dx; + if (min_err >= err) { + min_i = i; + min_err = err; + } + } + // объединяем + p->ranges[min_i].end = p->ranges[min_i + 1].end; + p->ranges[min_i].amount += p->ranges[min_i + 1].amount; + p->ranges[min_i].count += p->ranges[min_i + 1].count; + if (min_i < last) + // перемещаем хвост + memmove(p->ranges + min_i, p->ranges + min_i + 1, + (last - min_i) * sizeof(p->ranges[0])); + // обнуляем последний элемент и продолжаем + p->ranges[last].count = 0; +} + +static void histogram_acc(const size_t n, struct MDBX_chk_histogram *p) { + STATIC_ASSERT(ARRAY_LENGTH(p->ranges) > 2); + p->amount += n; + p->count += 1; + if (likely(n < 2)) { + p->ones += n; + p->pad += 1; + } else + for (;;) { + const size_t size = ARRAY_LENGTH(p->ranges), last = size - 1; + size_t i = 0; + while (i < size && p->ranges[i].count && n >= p->ranges[i].begin) { + if (n < p->ranges[i].end) { + // значение попадает в существующий интервал + p->ranges[i].amount += n; + p->ranges[i].count += 1; + return; + } + ++i; + } + if (p->ranges[last].count == 0) { + // использованы еще не все слоты, добавляем интервал + assert(i < size); + if (p->ranges[i].count) { + assert(i < last); + // раздвигаем +#ifdef __COVERITY__ + if (i < last) /* avoid Coverity false-positive issue */ +#endif /* __COVERITY__ */ + memmove(p->ranges + i + 1, p->ranges + i, + (last - i) * sizeof(p->ranges[0])); + } + p->ranges[i].begin = n; + p->ranges[i].end = n + 1; + p->ranges[i].amount = n; + p->ranges[i].count = 1; + return; + } + 
histogram_reduce(p); + } +} + +__cold static MDBX_chk_line_t * +histogram_dist(MDBX_chk_line_t *line, + const struct MDBX_chk_histogram *histogram, const char *prefix, + const char *first, bool amount) { + line = chk_print(line, "%s:", prefix); + const char *comma = ""; + const size_t first_val = amount ? histogram->ones : histogram->pad; + if (first_val) { + chk_print(line, " %s=%" PRIuSIZE, first, first_val); + comma = ","; + } + for (size_t n = 0; n < ARRAY_LENGTH(histogram->ranges); ++n) + if (histogram->ranges[n].count) { + chk_print(line, "%s %" PRIuSIZE, comma, histogram->ranges[n].begin); + if (histogram->ranges[n].begin != histogram->ranges[n].end - 1) + chk_print(line, "-%" PRIuSIZE, histogram->ranges[n].end - 1); + line = chk_print(line, "=%" PRIuSIZE, + amount ? histogram->ranges[n].amount + : histogram->ranges[n].count); + comma = ","; + } + return line; +} + +__cold static MDBX_chk_line_t * +histogram_print(MDBX_chk_scope_t *scope, MDBX_chk_line_t *line, + const struct MDBX_chk_histogram *histogram, const char *prefix, + const char *first, bool amount) { + if (histogram->count) { + line = chk_print(line, "%s %" PRIuSIZE, prefix, + amount ? 
histogram->amount : histogram->count); + if (scope->verbosity > MDBX_chk_info) + line = chk_puts( + histogram_dist(line, histogram, " (distribution", first, amount), + ")"); + } + return line; +} + +//----------------------------------------------------------------------------- + +__cold static int chk_get_sdb(MDBX_chk_scope_t *const scope, + const MDBX_walk_sdb_t *in, + MDBX_chk_subdb_t **out) { + MDBX_chk_internal_t *const chk = scope->internal; + if (chk->last_lookup && + chk->last_lookup->name.iov_base == in->name.iov_base) { + *out = chk->last_lookup; + return MDBX_SUCCESS; + } + + for (size_t i = 0; i < ARRAY_LENGTH(chk->subdb); ++i) { + MDBX_chk_subdb_t *sdb = chk->subdb[i]; + if (!sdb) { + sdb = osal_calloc(1, sizeof(MDBX_chk_subdb_t)); + if (unlikely(!sdb)) { + *out = nullptr; + return chk_error_rc(scope, MDBX_ENOMEM, "alloc_subDB"); + } + chk->subdb[i] = sdb; + sdb->flags = in->internal->md_flags; + sdb->id = -1; + sdb->name = in->name; + } + if (sdb->name.iov_base == in->name.iov_base) { + if (sdb->id < 0) { + sdb->id = (int)i; + sdb->cookie = + chk->cb->subdb_filter + ? 
chk->cb->subdb_filter(chk->usr, &sdb->name, sdb->flags) + : (void *)(intptr_t)-1; + } + *out = (chk->last_lookup = sdb); + return MDBX_SUCCESS; + } + } + chk_scope_issue(scope, "too many subDBs > %u", + (unsigned)ARRAY_LENGTH(chk->subdb) - CORE_DBS - /* meta */ 1); + *out = nullptr; + return MDBX_PROBLEM; +} + +//------------------------------------------------------------------------------ + +__cold static void chk_verbose_meta(MDBX_chk_scope_t *const scope, + const unsigned num) { + MDBX_chk_line_t *line = chk_line_begin(scope, MDBX_chk_verbose); + MDBX_chk_internal_t *const chk = scope->internal; + if (line) { + MDBX_env *const env = chk->usr->env; + const bool have_bootid = (chk->envinfo.mi_bootid.current.x | + chk->envinfo.mi_bootid.current.y) != 0; + const bool bootid_match = + have_bootid && memcmp(&chk->envinfo.mi_bootid.meta[num], + &chk->envinfo.mi_bootid.current, + sizeof(chk->envinfo.mi_bootid.current)) == 0; + + const char *status = "stay"; + if (num == chk->troika.recent) + status = "head"; + else if (num == TROIKA_TAIL(&chk->troika)) + status = "tail"; + line = chk_print(line, "meta-%u: %s, ", num, status); + + switch (chk->envinfo.mi_meta_sign[num]) { + case MDBX_DATASIGN_NONE: + line = chk_puts(line, "no-sync/legacy"); + break; + case MDBX_DATASIGN_WEAK: + line = chk_print(line, "weak-%s", + have_bootid + ? (bootid_match ? "intact (same boot-id)" : "dead") + : "unknown (no boot-id)"); + break; + default: + line = chk_puts(line, "steady"); + break; + } + const txnid_t meta_txnid = chk->envinfo.mi_meta_txnid[num]; + line = chk_print(line, " txn#%" PRIaTXN ", ", meta_txnid); + if (chk->envinfo.mi_bootid.meta[num].x | chk->envinfo.mi_bootid.meta[num].y) + line = chk_print(line, "boot-id %" PRIx64 "-%" PRIx64 " (%s)", + chk->envinfo.mi_bootid.meta[num].x, + chk->envinfo.mi_bootid.meta[num].y, + bootid_match ? 
"live" : "not match"); + else + line = chk_puts(line, "no boot-id"); + + if (env->me_stuck_meta >= 0) { + if (num == (unsigned)env->me_stuck_meta) + line = chk_print(line, ", %s", "forced for checking"); + } else if (meta_txnid > chk->envinfo.mi_recent_txnid && + (env->me_flags & (MDBX_EXCLUSIVE | MDBX_RDONLY)) == + MDBX_EXCLUSIVE) + line = chk_print(line, + ", rolled-back %" PRIu64 " commit(s) (%" PRIu64 + " >>> %" PRIu64 ")", + meta_txnid - chk->envinfo.mi_recent_txnid, meta_txnid, + chk->envinfo.mi_recent_txnid); + chk_line_end(line); + } +} + +__cold static int +chk_pgvisitor(const size_t pgno, const unsigned npages, void *const ctx, + const int deep, const MDBX_walk_sdb_t *sdb_info, + const size_t page_size, const MDBX_page_type_t pagetype, + const MDBX_error_t page_err, const size_t nentries, + const size_t payload_bytes, const size_t header_bytes, + const size_t unused_bytes) { + MDBX_chk_scope_t *const scope = ctx; + MDBX_chk_internal_t *const chk = scope->internal; + MDBX_chk_context_t *const usr = chk->usr; + MDBX_env *const env = usr->env; + + MDBX_chk_subdb_t *sdb; + int err = chk_get_sdb(scope, sdb_info, &sdb); + if (unlikely(err)) + return err; + + if (deep > 42) { + chk_scope_issue(scope, "too deeply %u", deep); + return MDBX_CORRUPTED /* avoid infinite loop/recursion */; + } + histogram_acc(deep, &sdb->histogram.deep); + usr->result.processed_pages += npages; + const size_t page_bytes = payload_bytes + header_bytes + unused_bytes; + + int height = deep + 1; + if (sdb->id >= CORE_DBS) + height -= usr->txn->mt_dbs[MAIN_DBI].md_depth; + const struct MDBX_db *nested = sdb_info->nested; + if (nested) { + if (sdb->flags & MDBX_DUPSORT) + height -= sdb_info->internal->md_depth; + else { + chk_object_issue(scope, "nested tree", pgno, "unexpected", + "subDb %s flags 0x%x, deep %i", chk_v2a(chk, &sdb->name), + sdb->flags, deep); + nested = nullptr; + } + } else + chk->last_nested = nullptr; + + const char *pagetype_caption; + bool branch = false; + switch 
(pagetype) { + default: + chk_object_issue(scope, "page", pgno, "unknown page-type", + "type %u, deep %i", (unsigned)pagetype, deep); + pagetype_caption = "unknown"; + sdb->pages.other += npages; + break; + case MDBX_page_broken: + assert(page_err != MDBX_SUCCESS); + pagetype_caption = "broken"; + sdb->pages.other += npages; + break; + case MDBX_subpage_broken: + assert(page_err != MDBX_SUCCESS); + pagetype_caption = "broken-subpage"; + sdb->pages.other += npages; + break; + case MDBX_page_large: + pagetype_caption = "large"; + histogram_acc(npages, &sdb->histogram.large_pages); + if (sdb->flags & MDBX_DUPSORT) + chk_object_issue(scope, "page", pgno, "unexpected", + "type %u, subDb %s flags 0x%x, deep %i", + (unsigned)pagetype, chk_v2a(chk, &sdb->name), sdb->flags, + deep); + break; + case MDBX_page_branch: + branch = true; + if (!nested) { + pagetype_caption = "branch"; + sdb->pages.branch += 1; + } else { + pagetype_caption = "nested-branch"; + sdb->pages.nested_branch += 1; + } + break; + case MDBX_page_dupfixed_leaf: + if (!nested) + chk_object_issue(scope, "page", pgno, "unexpected", + "type %u, subDb %s flags 0x%x, deep %i", + (unsigned)pagetype, chk_v2a(chk, &sdb->name), sdb->flags, + deep); + /* fall through */ + __fallthrough; + case MDBX_page_leaf: + if (!nested) { + pagetype_caption = "leaf"; + sdb->pages.leaf += 1; + if (height != sdb_info->internal->md_depth) + chk_object_issue(scope, "page", pgno, "wrong tree height", + "actual %i != %i subDb %s", height, + sdb_info->internal->md_depth, + chk_v2a(chk, &sdb->name)); + } else { + pagetype_caption = + (pagetype == MDBX_page_leaf) ? 
"nested-leaf" : "nested-leaf-dupfixed"; + sdb->pages.nested_leaf += 1; + if (chk->last_nested != nested) { + histogram_acc(height, &sdb->histogram.nested_tree); + chk->last_nested = nested; + } + if (height != nested->md_depth) + chk_object_issue(scope, "page", pgno, "wrong nested-tree height", + "actual %i != %i dupsort-node %s", height, + nested->md_depth, chk_v2a(chk, &sdb->name)); + } + break; + case MDBX_subpage_dupfixed_leaf: + case MDBX_subpage_leaf: + pagetype_caption = (pagetype == MDBX_subpage_leaf) ? "subleaf-dupsort" + : "subleaf-dupfixed"; + sdb->pages.nested_subleaf += 1; + if ((sdb->flags & MDBX_DUPSORT) == 0 || nested) + chk_object_issue(scope, "page", pgno, "unexpected", + "type %u, subDb %s flags 0x%x, deep %i", + (unsigned)pagetype, chk_v2a(chk, &sdb->name), sdb->flags, + deep); + break; + } + + if (npages) { + if (sdb->cookie) { + MDBX_chk_line_t *line = chk_line_begin(scope, MDBX_chk_extra); + if (npages == 1) + chk_print(line, "%s-page %" PRIuSIZE, pagetype_caption, pgno); + else + chk_print(line, "%s-span %" PRIuSIZE "[%u]", pagetype_caption, pgno, + npages); + chk_line_end( + chk_print(line, + " of %s: header %" PRIiPTR ", %s %" PRIiPTR + ", payload %" PRIiPTR ", unused %" PRIiPTR ", deep %i", + chk_v2a(chk, &sdb->name), header_bytes, + (pagetype == MDBX_page_branch) ? "keys" : "entries", + nentries, payload_bytes, unused_bytes, deep)); + } + + bool already_used = false; + for (unsigned n = 0; n < npages; ++n) { + const size_t spanpgno = pgno + n; + if (spanpgno >= usr->result.alloc_pages) { + chk_object_issue(scope, "page", spanpgno, "wrong page-no", + "%s-page: %" PRIuSIZE " > %" PRIuSIZE ", deep %i", + pagetype_caption, spanpgno, usr->result.alloc_pages, + deep); + sdb->pages.all += 1; + } else if (chk->pagemap[spanpgno]) { + const MDBX_chk_subdb_t *const rival = + chk->subdb[chk->pagemap[spanpgno] - 1]; + chk_object_issue(scope, "page", spanpgno, + (branch && rival == sdb) ? 
"loop" : "already used", + "%s-page: by %s, deep %i", pagetype_caption, + chk_v2a(chk, &rival->name), deep); + already_used = true; + } else { + chk->pagemap[spanpgno] = (int16_t)sdb->id + 1; + sdb->pages.all += 1; + } + } + + if (already_used) + return branch ? MDBX_RESULT_TRUE /* avoid infinite loop/recursion */ + : MDBX_SUCCESS; + } + + if (MDBX_IS_ERROR(page_err)) { + chk_object_issue(scope, "page", pgno, "invalid/corrupted", "%s-page", + pagetype_caption); + } else { + if (unused_bytes > page_size) + chk_object_issue(scope, "page", pgno, "illegal unused-bytes", + "%s-page: %u < %" PRIuSIZE " < %u", pagetype_caption, 0, + unused_bytes, env->me_psize); + + if (header_bytes < (int)sizeof(long) || + (size_t)header_bytes >= env->me_psize - sizeof(long)) { + chk_object_issue(scope, "page", pgno, "illegal header-length", + "%s-page: %" PRIuSIZE " < %" PRIuSIZE " < %" PRIuSIZE, + pagetype_caption, sizeof(long), header_bytes, + env->me_psize - sizeof(long)); + } + if (nentries < 1 || (pagetype == MDBX_page_branch && nentries < 2)) { + chk_object_issue(scope, "page", pgno, nentries ? 
"half-empty" : "empty", + "%s-page: payload %" PRIuSIZE " bytes, %" PRIuSIZE + " entries, deep %i", + pagetype_caption, payload_bytes, nentries, deep); + sdb->pages.empty += 1; + } + + if (npages) { + if (page_bytes != page_size) { + chk_object_issue(scope, "page", pgno, "misused", + "%s-page: %" PRIuPTR " != %" PRIuPTR " (%" PRIuPTR + "h + %" PRIuPTR "p + %" PRIuPTR "u), deep %i", + pagetype_caption, page_size, page_bytes, header_bytes, + payload_bytes, unused_bytes, deep); + if (page_size > page_bytes) + sdb->lost_bytes += page_size - page_bytes; + } else { + sdb->payload_bytes += payload_bytes + header_bytes; + usr->result.total_payload_bytes += payload_bytes + header_bytes; + } + } + } + return chk_check_break(scope); +} + +__cold static int chk_tree(MDBX_chk_scope_t *const scope) { + MDBX_chk_internal_t *const chk = scope->internal; + MDBX_chk_context_t *const usr = chk->usr; + MDBX_env *const env = usr->env; + MDBX_txn *const txn = usr->txn; + +#if defined(_WIN32) || defined(_WIN64) + SetLastError(ERROR_SUCCESS); +#else + errno = 0; +#endif /* Windows */ + chk->pagemap = osal_calloc(usr->result.alloc_pages, sizeof(*chk->pagemap)); + if (!chk->pagemap) { + int err = osal_get_errno(); + return chk_error_rc(scope, err ? 
err : MDBX_ENOMEM, "calloc"); + } + + if (scope->verbosity > MDBX_chk_info) + chk_scope_push(scope, 0, "Walking pages..."); + /* always skip key ordering checking + * to avoid MDBX_CORRUPTED in case custom comparators were used */ + usr->result.processed_pages = NUM_METAS; + int err = mdbx_env_pgwalk(txn, chk_pgvisitor, scope, true); + if (MDBX_IS_ERROR(err) && err != MDBX_EINTR) + chk_error_rc(scope, err, "mdbx_env_pgwalk"); + + for (size_t n = NUM_METAS; n < usr->result.alloc_pages; ++n) + if (!chk->pagemap[n]) + usr->result.unused_pages += 1; + + MDBX_chk_subdb_t total; + memset(&total, 0, sizeof(total)); + total.pages.all = NUM_METAS; + for (size_t i = 0; i < ARRAY_LENGTH(chk->subdb) && chk->subdb[i]; ++i) { + MDBX_chk_subdb_t *const sdb = chk->subdb[i]; + total.payload_bytes += sdb->payload_bytes; + total.lost_bytes += sdb->lost_bytes; + total.pages.all += sdb->pages.all; + total.pages.empty += sdb->pages.empty; + total.pages.other += sdb->pages.other; + total.pages.branch += sdb->pages.branch; + total.pages.leaf += sdb->pages.leaf; + total.pages.nested_branch += sdb->pages.nested_branch; + total.pages.nested_leaf += sdb->pages.nested_leaf; + total.pages.nested_subleaf += sdb->pages.nested_subleaf; + } + assert(total.pages.all == usr->result.processed_pages); + + const size_t total_page_bytes = pgno2bytes(env, total.pages.all); + if (usr->scope->subtotal_issues || usr->scope->verbosity >= MDBX_chk_verbose) + chk_line_end(chk_print(chk_line_begin(usr->scope, MDBX_chk_resolution), + "walked %zu pages, left/unused %zu" + ", %" PRIuSIZE " problem(s)", + usr->result.processed_pages, + usr->result.unused_pages, + usr->scope->subtotal_issues)); + + err = chk_scope_restore(scope, err); + if (scope->verbosity > MDBX_chk_info) { + for (size_t i = 0; i < ARRAY_LENGTH(chk->subdb) && chk->subdb[i]; ++i) { + MDBX_chk_subdb_t *const sdb = chk->subdb[i]; + MDBX_chk_scope_t *inner = + chk_scope_push(scope, 0, "tree %s:", chk_v2a(chk, &sdb->name)); + if (sdb->pages.all == 0) + 
chk_line_end( + chk_print(chk_line_begin(inner, MDBX_chk_resolution), "empty")); + else { + MDBX_chk_line_t *line = chk_line_begin(inner, MDBX_chk_info); + if (line) { + line = chk_print(line, "page usage: subtotal %" PRIuSIZE, + sdb->pages.all); + const size_t branch_pages = + sdb->pages.branch + sdb->pages.nested_branch; + const size_t leaf_pages = sdb->pages.leaf + sdb->pages.nested_leaf + + sdb->pages.nested_subleaf; + if (sdb->pages.other) + line = chk_print(line, ", other %" PRIuSIZE, sdb->pages.other); + if (sdb->pages.other == 0 || + (branch_pages | leaf_pages | sdb->histogram.large_pages.count) != + 0) { + line = chk_print(line, ", branch %" PRIuSIZE ", leaf %" PRIuSIZE, + branch_pages, leaf_pages); + if (sdb->histogram.large_pages.count || + (sdb->flags & MDBX_DUPSORT) == 0) { + line = chk_print(line, ", large %" PRIuSIZE, + sdb->histogram.large_pages.count); + if (sdb->histogram.large_pages.amount | + sdb->histogram.large_pages.count) + line = histogram_print(inner, line, &sdb->histogram.large_pages, + " amount", "single", true); + } + } + line = histogram_dist(chk_line_feed(line), &sdb->histogram.deep, + "tree deep density", "1", false); + if (sdb != &chk->subdb_gc && sdb->histogram.nested_tree.count) { + line = chk_print(chk_line_feed(line), "nested tree(s) %" PRIuSIZE, + sdb->histogram.nested_tree.count); + line = histogram_dist(line, &sdb->histogram.nested_tree, " density", + "1", false); + line = chk_print(chk_line_feed(line), + "nested tree(s) pages %" PRIuSIZE + ": branch %" PRIuSIZE ", leaf %" PRIuSIZE + ", subleaf %" PRIuSIZE, + sdb->pages.nested_branch + sdb->pages.nested_leaf, + sdb->pages.nested_branch, sdb->pages.nested_leaf, + sdb->pages.nested_subleaf); + } + + const size_t bytes = pgno2bytes(env, sdb->pages.all); + line = chk_print( + chk_line_feed(line), + "page filling: subtotal %" PRIuSIZE + " bytes (%.1f%%), payload %" PRIuSIZE + " (%.1f%%), unused %" PRIuSIZE " (%.1f%%)", + bytes, bytes * 100.0 / total_page_bytes, sdb->payload_bytes, 
+ sdb->payload_bytes * 100.0 / bytes, bytes - sdb->payload_bytes, + (bytes - sdb->payload_bytes) * 100.0 / bytes); + if (sdb->pages.empty) + line = chk_print(line, ", %" PRIuSIZE " empty pages", + sdb->pages.empty); + if (sdb->lost_bytes) + line = + chk_print(line, ", %" PRIuSIZE " bytes lost", sdb->lost_bytes); + chk_line_end(line); + } + } + chk_scope_restore(scope, 0); + } + } + + MDBX_chk_line_t *line = chk_line_begin(scope, MDBX_chk_resolution); + line = chk_print(line, + "summary: total %" PRIuSIZE " bytes, payload %" PRIuSIZE + " (%.1f%%), unused %" PRIuSIZE " (%.1f%%)," + " average fill %.1f%%", + total_page_bytes, usr->result.total_payload_bytes, + usr->result.total_payload_bytes * 100.0 / total_page_bytes, + total_page_bytes - usr->result.total_payload_bytes, + (total_page_bytes - usr->result.total_payload_bytes) * + 100.0 / total_page_bytes, + usr->result.total_payload_bytes * 100.0 / total_page_bytes); + if (total.pages.empty) + line = chk_print(line, ", %" PRIuSIZE " empty pages", total.pages.empty); + if (total.lost_bytes) + line = chk_print(line, ", %" PRIuSIZE " bytes lost", total.lost_bytes); + chk_line_end(line); + return err; +} + +typedef int(chk_kv_visitor)(MDBX_chk_scope_t *const scope, + MDBX_chk_subdb_t *sdb, const size_t record_number, + const MDBX_val *key, const MDBX_val *data); + +__cold static int chk_handle_kv(MDBX_chk_scope_t *const scope, + MDBX_chk_subdb_t *sdb, + const size_t record_number, const MDBX_val *key, + const MDBX_val *data) { + MDBX_chk_internal_t *const chk = scope->internal; + int err = MDBX_SUCCESS; + assert(sdb->cookie); + if (chk->cb->subdb_handle_kv) + err = chk->cb->subdb_handle_kv(chk->usr, sdb, record_number, key, data); + return err ? 
err : chk_check_break(scope); +} + +__cold static int chk_db(MDBX_chk_scope_t *const scope, MDBX_dbi dbi, + MDBX_chk_subdb_t *sdb, chk_kv_visitor *handler) { + MDBX_chk_internal_t *const chk = scope->internal; + MDBX_chk_context_t *const usr = chk->usr; + MDBX_env *const env = usr->env; + MDBX_txn *const txn = usr->txn; + MDBX_cursor *cursor = nullptr; + size_t record_count = 0, dups = 0, sub_databases = 0; + int err; + + if ((MDBX_TXN_FINISHED | MDBX_TXN_ERROR) & txn->mt_flags) { + chk_line_end( + chk_flush(chk_print(chk_line_begin(scope, MDBX_chk_error), + "abort processing %s due to a previous error", + chk_v2a(chk, &sdb->name)))); + err = MDBX_BAD_TXN; + goto bailout; + } + + if (0 > (int)dbi) { + err = dbi_open( + txn, &sdb->name, MDBX_DB_ACCEDE, &dbi, + (chk->flags & MDBX_CHK_IGNORE_ORDER) ? cmp_equal_or_greater : nullptr, + (chk->flags & MDBX_CHK_IGNORE_ORDER) ? cmp_equal_or_greater : nullptr); + if (unlikely(err)) { + tASSERT(txn, dbi >= txn->mt_env->me_numdbs || + (txn->mt_env->me_db_flags[dbi] & DB_VALID) == 0); + chk_error_rc(scope, err, "mdbx_dbi_open"); + goto bailout; + } + tASSERT(txn, dbi < txn->mt_env->me_numdbs && + (txn->mt_env->me_db_flags[dbi] & DB_VALID) != 0); + } + + const MDBX_db *const db = txn->mt_dbs + dbi; + if (handler) { + const char *key_mode = nullptr; + switch (sdb->flags & (MDBX_REVERSEKEY | MDBX_INTEGERKEY)) { + case 0: + key_mode = "usual"; + break; + case MDBX_REVERSEKEY: + key_mode = "reserve"; + break; + case MDBX_INTEGERKEY: + key_mode = "ordinal"; + break; + case MDBX_REVERSEKEY | MDBX_INTEGERKEY: + key_mode = "msgpack"; + break; + default: + key_mode = "inconsistent"; + chk_scope_issue(scope, "wrong key-mode (0x%x)", + sdb->flags & (MDBX_REVERSEKEY | MDBX_INTEGERKEY)); + } + + const char *value_mode = nullptr; + switch (sdb->flags & (MDBX_DUPSORT | MDBX_REVERSEDUP | MDBX_DUPFIXED | + MDBX_INTEGERDUP)) { + case 0: + value_mode = "single"; + break; + case MDBX_DUPSORT: + value_mode = "multi"; + break; + case MDBX_DUPSORT | 
MDBX_REVERSEDUP: + value_mode = "multi-reverse"; + break; + case MDBX_DUPSORT | MDBX_DUPFIXED: + value_mode = "multi-samelength"; + break; + case MDBX_DUPSORT | MDBX_DUPFIXED | MDBX_REVERSEDUP: + value_mode = "multi-reverse-samelength"; + break; + case MDBX_DUPSORT | MDBX_DUPFIXED | MDBX_INTEGERDUP: + value_mode = "multi-ordinal"; + break; + case MDBX_DUPSORT | MDBX_INTEGERDUP | MDBX_REVERSEDUP: + value_mode = "multi-msgpack"; + break; + case MDBX_DUPSORT | MDBX_DUPFIXED | MDBX_INTEGERDUP | MDBX_REVERSEDUP: + value_mode = "reserved"; + break; + default: + value_mode = "inconsistent"; + chk_scope_issue(scope, "wrong value-mode (0x%x)", + sdb->flags & (MDBX_DUPSORT | MDBX_REVERSEDUP | + MDBX_DUPFIXED | MDBX_INTEGERDUP)); + } + + MDBX_chk_line_t *line = chk_line_begin(scope, MDBX_chk_info); + line = chk_print(line, "key-value kind: %s-key => %s-value", key_mode, + value_mode); + line = chk_print(line, ", flags:"); + if (!sdb->flags) + line = chk_print(line, " none"); + else { + const uint8_t f[] = {MDBX_DUPSORT, + MDBX_INTEGERKEY, + MDBX_REVERSEKEY, + MDBX_DUPFIXED, + MDBX_REVERSEDUP, + MDBX_INTEGERDUP, + 0}; + const char *const t[] = {"dupsort", "integerkey", "reversekey", + "dupfixed", "reversedup", "integerdup"}; + for (size_t i = 0; f[i]; i++) + if (sdb->flags & f[i]) + line = chk_print(line, " %s", t[i]); + } + chk_line_end(chk_print(line, " (0x%02X)", sdb->flags)); + + line = chk_print(chk_line_begin(scope, MDBX_chk_verbose), + "entries %" PRIu64 ", sequence %" PRIu64, db->md_entries, + db->md_seq); + if (db->md_mod_txnid) + line = chk_print(line, ", last modification txn#%" PRIaTXN, + db->md_mod_txnid); + if (db->md_root != P_INVALID) + line = chk_print(line, ", root #%" PRIaPGNO, db->md_root); + chk_line_end(line); + chk_line_end(chk_print(chk_line_begin(scope, MDBX_chk_verbose), + "b-tree depth %u, pages: branch %" PRIaPGNO + ", leaf %" PRIaPGNO ", large %" PRIaPGNO, + db->md_depth, db->md_branch_pages, db->md_leaf_pages, + db->md_overflow_pages)); + + if 
((chk->flags & MDBX_CHK_SKIP_BTREE_TRAVERSAL) == 0) { + const size_t branch_pages = sdb->pages.branch + sdb->pages.nested_branch; + const size_t leaf_pages = sdb->pages.leaf + sdb->pages.nested_leaf; + const size_t subtotal_pages = + db->md_branch_pages + db->md_leaf_pages + db->md_overflow_pages; + if (subtotal_pages != sdb->pages.all) + chk_scope_issue( + scope, "%s pages mismatch (%" PRIuSIZE " != walked %" PRIuSIZE ")", + "subtotal", subtotal_pages, sdb->pages.all); + if (db->md_branch_pages != branch_pages) + chk_scope_issue( + scope, "%s pages mismatch (%" PRIaPGNO " != walked %" PRIuSIZE ")", + "branch", db->md_branch_pages, branch_pages); + if (db->md_leaf_pages != leaf_pages) + chk_scope_issue( + scope, "%s pages mismatch (%" PRIaPGNO " != walked %" PRIuSIZE ")", + "all-leaf", db->md_leaf_pages, leaf_pages); + if (db->md_overflow_pages != sdb->histogram.large_pages.amount) + chk_scope_issue( + scope, "%s pages mismatch (%" PRIaPGNO " != walked %" PRIuSIZE ")", + "large/overlow", db->md_overflow_pages, + sdb->histogram.large_pages.amount); + } + } + + err = mdbx_cursor_open(txn, dbi, &cursor); + if (unlikely(err)) { + chk_error_rc(scope, err, "mdbx_cursor_open"); + goto bailout; + } + if (chk->flags & MDBX_CHK_IGNORE_ORDER) { + cursor->mc_checking |= CC_SKIPORD | CC_PAGECHECK; + if (cursor->mc_xcursor) + cursor->mc_xcursor->mx_cursor.mc_checking |= CC_SKIPORD | CC_PAGECHECK; + } + + const size_t maxkeysize = mdbx_env_get_maxkeysize_ex(env, sdb->flags); + MDBX_val prev_key = {nullptr, 0}, prev_data = {nullptr, 0}; + MDBX_val key, data; + err = mdbx_cursor_get(cursor, &key, &data, MDBX_FIRST); + while (err == MDBX_SUCCESS) { + err = chk_check_break(scope); + if (unlikely(err)) + goto bailout; + + bool bad_key = false; + if (key.iov_len > maxkeysize) { + chk_object_issue(scope, "entry", record_count, + "key length exceeds max-key-size", + "%" PRIuPTR " > %" PRIuPTR, key.iov_len, maxkeysize); + bad_key = true; + } else if ((sdb->flags & MDBX_INTEGERKEY) && 
key.iov_len != 8 && + key.iov_len != 4) { + chk_object_issue(scope, "entry", record_count, "wrong key length", + "%" PRIuPTR " != 4or8", key.iov_len); + bad_key = true; + } + + bool bad_data = false; + if ((sdb->flags & MDBX_INTEGERDUP) && data.iov_len != 8 && + data.iov_len != 4) { + chk_object_issue(scope, "entry", record_count, "wrong data length", + "%" PRIuPTR " != 4or8", data.iov_len); + bad_data = true; + } + + if (prev_key.iov_base) { + if (prev_data.iov_base && !bad_data && (sdb->flags & MDBX_DUPFIXED) && + prev_data.iov_len != data.iov_len) { + chk_object_issue(scope, "entry", record_count, "different data length", + "%" PRIuPTR " != %" PRIuPTR, prev_data.iov_len, + data.iov_len); + bad_data = true; + } + + if (!bad_key) { + int cmp = mdbx_cmp(txn, dbi, &key, &prev_key); + if (cmp == 0) { + ++dups; + if ((sdb->flags & MDBX_DUPSORT) == 0) { + chk_object_issue(scope, "entry", record_count, "duplicated entries", + nullptr); + if (prev_data.iov_base && data.iov_len == prev_data.iov_len && + memcmp(data.iov_base, prev_data.iov_base, data.iov_len) == 0) + chk_object_issue(scope, "entry", record_count, + "complete duplicate", nullptr); + } else if (!bad_data && prev_data.iov_base) { + cmp = mdbx_dcmp(txn, dbi, &data, &prev_data); + if (cmp == 0) + chk_object_issue(scope, "entry", record_count, + "complete duplicate", nullptr); + else if (cmp < 0 && !(chk->flags & MDBX_CHK_IGNORE_ORDER)) + chk_object_issue(scope, "entry", record_count, + "wrong order of multi-values", nullptr); + } + } else if (cmp < 0 && !(chk->flags & MDBX_CHK_IGNORE_ORDER)) + chk_object_issue(scope, "entry", record_count, + "wrong order of entries", nullptr); + } + } + + if (!bad_key) { + if (!prev_key.iov_base && (sdb->flags & MDBX_INTEGERKEY)) + chk_line_end(chk_print(chk_line_begin(scope, MDBX_chk_info), + "fixed key-size %" PRIuSIZE, key.iov_len)); + prev_key = key; + } + if (!bad_data) { + if (!prev_data.iov_base && + (sdb->flags & (MDBX_INTEGERDUP | MDBX_DUPFIXED))) + 
chk_line_end(chk_print(chk_line_begin(scope, MDBX_chk_info), + "fixed data-size %" PRIuSIZE, data.iov_len)); + prev_data = data; + } + + record_count++; + histogram_acc(key.iov_len, &sdb->histogram.key_len); + histogram_acc(data.iov_len, &sdb->histogram.val_len); + + const MDBX_node *const node = + page_node(cursor->mc_pg[cursor->mc_top], cursor->mc_ki[cursor->mc_top]); + if (node_flags(node) == F_SUBDATA) { + if (dbi != MAIN_DBI || (sdb->flags & (MDBX_DUPSORT | MDBX_DUPFIXED | + MDBX_REVERSEDUP | MDBX_INTEGERDUP))) + chk_object_issue(scope, "entry", record_count, + "unexpected sub-database", "node-flags 0x%x", + node_flags(node)); + else if (data.iov_len != sizeof(MDBX_db)) + chk_object_issue(scope, "entry", record_count, + "wrong sub-database node size", + "node-size %" PRIuSIZE " != %" PRIuSIZE, data.iov_len, + sizeof(MDBX_db)); + else if (scope->stage == MDBX_chk_traversal_maindb) + /* подсчитываем subDB при первом проходе */ + sub_databases += 1; + else { + /* обработка subDB при втором проходе */ + MDBX_db aligned_db; + memcpy(&aligned_db, data.iov_base, sizeof(aligned_db)); + MDBX_walk_sdb_t sdb_info = {key, nullptr, nullptr}; + sdb_info.internal = &aligned_db; + MDBX_chk_subdb_t *subdb; + err = chk_get_sdb(scope, &sdb_info, &subdb); + if (unlikely(err)) + goto bailout; + if (subdb->cookie) { + err = chk_scope_begin(chk, 0, MDBX_chk_traversal_subdbs, subdb, + &usr->result.problems_kv, + "Processing subDB %s...", + chk_v2a(chk, &subdb->name)); + if (likely(!err)) { + err = chk_db(usr->scope, (MDBX_dbi)-1, subdb, chk_handle_kv); + if (err != MDBX_EINTR && err != MDBX_RESULT_TRUE) + usr->result.subdb_processed += 1; + } + err = chk_scope_restore(scope, err); + if (unlikely(err)) + goto bailout; + } else + chk_line_end(chk_flush( + chk_print(chk_line_begin(scope, MDBX_chk_processing), + "Skip processing %s...", chk_v2a(chk, &subdb->name)))); + } + } else if (handler) { + err = handler(scope, sdb, record_count, &key, &data); + if (unlikely(err)) + goto bailout; + 
} + + err = mdbx_cursor_get(cursor, &key, &data, MDBX_NEXT); + } + + err = (err != MDBX_NOTFOUND) ? chk_error_rc(scope, err, "mdbx_cursor_get") + : MDBX_SUCCESS; + if (err == MDBX_SUCCESS && record_count != db->md_entries) + chk_scope_issue(scope, + "different number of entries %" PRIuSIZE " != %" PRIu64, + record_count, db->md_entries); +bailout: + if (cursor) { + if (handler) { + if (sdb->histogram.key_len.count) { + MDBX_chk_line_t *line = chk_line_begin(scope, MDBX_chk_info); + line = histogram_dist(line, &sdb->histogram.key_len, + "key length density", "0/1", false); + chk_line_feed(line); + line = histogram_dist(line, &sdb->histogram.val_len, + "value length density", "0/1", false); + chk_line_end(line); + } + if (scope->stage == MDBX_chk_traversal_maindb) + usr->result.subdb_total = sub_databases; + if (chk->cb->subdb_conclude) + err = chk->cb->subdb_conclude(usr, sdb, cursor, err); + MDBX_chk_line_t *line = chk_line_begin(scope, MDBX_chk_resolution); + line = chk_print(line, "summary: %" PRIuSIZE " records,", record_count); + if (dups || (sdb->flags & (MDBX_DUPSORT | MDBX_DUPFIXED | + MDBX_REVERSEDUP | MDBX_INTEGERDUP))) + line = chk_print(line, " %" PRIuSIZE " dups,", dups); + if (sub_databases || dbi == MAIN_DBI) + line = chk_print(line, " %" PRIuSIZE " sub-databases,", sub_databases); + line = chk_print(line, + " %" PRIuSIZE " key's bytes," + " %" PRIuSIZE " data's bytes," + " %" PRIuSIZE " problem(s)", + sdb->histogram.key_len.amount, + sdb->histogram.val_len.amount, scope->subtotal_issues); + chk_line_end(chk_flush(line)); + } + + mdbx_cursor_close(cursor); + if (!txn->mt_cursors[dbi] && (txn->mt_dbi_state[dbi] & DBI_FRESH)) + mdbx_dbi_close(env, dbi); + } + return err; +} + +__cold static int chk_handle_gc(MDBX_chk_scope_t *const scope, + MDBX_chk_subdb_t *sdb, + const size_t record_number, const MDBX_val *key, + const MDBX_val *data) { + MDBX_chk_internal_t *const chk = scope->internal; + MDBX_chk_context_t *const usr = chk->usr; + assert(sdb == 
&chk->subdb_gc); + (void)sdb; + const char *bad = ""; + pgno_t *iptr = data->iov_base; + + if (key->iov_len != sizeof(txnid_t)) + chk_object_issue(scope, "entry", record_number, "wrong txn-id size", + "key-size %" PRIuSIZE, key->iov_len); + else { + txnid_t txnid; + memcpy(&txnid, key->iov_base, sizeof(txnid)); + if (txnid < 1 || txnid > usr->txn->mt_txnid) + chk_object_issue(scope, "entry", record_number, "wrong txn-id", + "%" PRIaTXN, txnid); + else { + if (data->iov_len < sizeof(pgno_t) || data->iov_len % sizeof(pgno_t)) + chk_object_issue(scope, "entry", txnid, "wrong idl size", "%" PRIuPTR, + data->iov_len); + size_t number = (data->iov_len >= sizeof(pgno_t)) ? *iptr++ : 0; + if (number > MDBX_PGL_LIMIT) + chk_object_issue(scope, "entry", txnid, "wrong idl length", "%" PRIuPTR, + number); + else if ((number + 1) * sizeof(pgno_t) > data->iov_len) { + chk_object_issue(scope, "entry", txnid, "trimmed idl", + "%" PRIuSIZE " > %" PRIuSIZE " (corruption)", + (number + 1) * sizeof(pgno_t), data->iov_len); + number = data->iov_len / sizeof(pgno_t) - 1; + } else if (data->iov_len - (number + 1) * sizeof(pgno_t) >= + /* LY: allow gap up to one page. it is ok + * and better than shink-and-retry inside update_gc() */ + usr->env->me_psize) + chk_object_issue(scope, "entry", txnid, "extra idl space", + "%" PRIuSIZE " < %" PRIuSIZE " (minor, not a trouble)", + (number + 1) * sizeof(pgno_t), data->iov_len); + + usr->result.gc_pages += number; + if (chk->envinfo.mi_latter_reader_txnid > txnid) + usr->result.reclaimable_pages += number; + + size_t prev = MDBX_PNL_ASCENDING ? 
NUM_METAS - 1 : usr->txn->mt_next_pgno; + size_t span = 1; + for (size_t i = 0; i < number; ++i) { + const size_t pgno = iptr[i]; + if (pgno < NUM_METAS) + chk_object_issue(scope, "entry", txnid, "wrong idl entry", + "pgno %" PRIuSIZE " < meta-pages %u", pgno, + NUM_METAS); + else if (pgno >= usr->result.backed_pages) + chk_object_issue(scope, "entry", txnid, "wrong idl entry", + "pgno %" PRIuSIZE " > backed-pages %" PRIuSIZE, pgno, + usr->result.backed_pages); + else if (pgno >= usr->result.alloc_pages) + chk_object_issue(scope, "entry", txnid, "wrong idl entry", + "pgno %" PRIuSIZE " > alloc-pages %" PRIuSIZE, pgno, + usr->result.alloc_pages - 1); + else { + if (MDBX_PNL_DISORDERED(prev, pgno)) { + bad = " [bad sequence]"; + chk_object_issue( + scope, "entry", txnid, "bad sequence", + "%" PRIuSIZE " %c [%" PRIuSIZE "].%" PRIuSIZE, prev, + (prev == pgno) ? '=' : (MDBX_PNL_ASCENDING ? '>' : '<'), i, + pgno); + } + if (chk->pagemap) { + const intptr_t id = chk->pagemap[pgno]; + if (id == 0) + chk->pagemap[pgno] = -1 /* mark the pgno listed in GC */; + else if (id > 0) { + assert(id - 1 <= (intptr_t)ARRAY_LENGTH(chk->subdb)); + chk_object_issue(scope, "page", pgno, "already used", "by %s", + chk_v2a(chk, &chk->subdb[id - 1]->name)); + } else + chk_object_issue(scope, "page", pgno, "already listed in GC", + nullptr); + } + } + prev = pgno; + while (i + span < number && + iptr[i + span] == (MDBX_PNL_ASCENDING ? pgno_add(pgno, span) + : pgno_sub(pgno, span))) + ++span; + } + if (sdb->cookie) { + chk_line_end(chk_print(chk_line_begin(scope, MDBX_chk_details), + "transaction %" PRIaTXN ", %" PRIuSIZE + " pages, maxspan %" PRIuSIZE "%s", + txnid, number, span, bad)); + for (size_t i = 0; i < number; i += span) { + const size_t pgno = iptr[i]; + for (span = 1; + i + span < number && + iptr[i + span] == (MDBX_PNL_ASCENDING ? 
pgno_add(pgno, span) + : pgno_sub(pgno, span)); + ++span) + ; + histogram_acc(span, &sdb->histogram.nested_tree); + MDBX_chk_line_t *line = chk_line_begin(scope, MDBX_chk_extra); + if (line) { + if (span > 1) + line = + chk_print(line, "%9" PRIuSIZE "[%" PRIuSIZE "]", pgno, span); + else + line = chk_print(line, "%9" PRIuSIZE, pgno); + chk_line_end(line); + int err = chk_check_break(scope); + if (err) + return err; + } + } + } + } + } + return chk_check_break(scope); +} + +__cold static int env_chk(MDBX_chk_scope_t *const scope) { + MDBX_chk_internal_t *const chk = scope->internal; + MDBX_chk_context_t *const usr = chk->usr; + MDBX_env *const env = usr->env; + MDBX_txn *const txn = usr->txn; + int err = + env_info(env, txn, &chk->envinfo, sizeof(chk->envinfo), &chk->troika); + if (unlikely(err)) + return chk_error_rc(scope, err, "env_info"); + + MDBX_chk_line_t *line = + chk_puts(chk_line_begin(scope, MDBX_chk_info), "current boot-id "); + if (chk->envinfo.mi_bootid.current.x | chk->envinfo.mi_bootid.current.y) + line = chk_print(line, "%016" PRIx64 "-%016" PRIx64, + chk->envinfo.mi_bootid.current.x, + chk->envinfo.mi_bootid.current.y); + else + line = chk_puts(line, "unavailable"); + chk_line_end(line); + + err = osal_filesize(env->me_lazy_fd, &env->me_dxb_mmap.filesize); + if (unlikely(err)) + return chk_error_rc(scope, err, "osal_filesize"); + + //-------------------------------------------------------------------------- + + err = chk_scope_begin(chk, 1, MDBX_chk_meta, nullptr, + &usr->result.problems_meta, "Peek the meta-pages..."); + if (likely(!err)) { + MDBX_chk_scope_t *const inner = usr->scope; + const uint64_t dxbfile_pages = + env->me_dxb_mmap.filesize >> env->me_psize2log; + usr->result.alloc_pages = txn->mt_next_pgno; + usr->result.backed_pages = bytes2pgno(env, env->me_dxb_mmap.current); + if (unlikely(usr->result.backed_pages > dxbfile_pages)) + chk_scope_issue(inner, "backed-pages %zu > file-pages %" PRIu64, + usr->result.backed_pages, 
dxbfile_pages); + if (unlikely(dxbfile_pages < NUM_METAS)) + chk_scope_issue(inner, "file-pages %" PRIu64 " < %u", dxbfile_pages, + NUM_METAS); + if (unlikely(usr->result.backed_pages < NUM_METAS)) + chk_scope_issue(inner, "backed-pages %zu < %u", usr->result.backed_pages, + NUM_METAS); + if (unlikely(usr->result.backed_pages < NUM_METAS)) { + chk_scope_issue(inner, "backed-pages %zu < num-metas %u", + usr->result.backed_pages, NUM_METAS); + return MDBX_CORRUPTED; + } + if (unlikely(dxbfile_pages < NUM_METAS)) { + chk_scope_issue(inner, "backed-pages %zu < num-metas %u", + usr->result.backed_pages, NUM_METAS); + return MDBX_CORRUPTED; + } + if (unlikely(usr->result.backed_pages > (size_t)MAX_PAGENO + 1)) { + chk_scope_issue(inner, "backed-pages %zu > max-pages %zu", + usr->result.backed_pages, (size_t)MAX_PAGENO + 1); + usr->result.backed_pages = MAX_PAGENO + 1; + } + + if ((env->me_flags & (MDBX_EXCLUSIVE | MDBX_RDONLY)) != MDBX_RDONLY) { + if (unlikely(usr->result.backed_pages > dxbfile_pages)) { + chk_scope_issue(inner, "backed-pages %zu > file-pages %" PRIu64, + usr->result.backed_pages, dxbfile_pages); + usr->result.backed_pages = (size_t)dxbfile_pages; + } + if (unlikely(usr->result.alloc_pages > usr->result.backed_pages)) { + chk_scope_issue(scope, "alloc-pages %zu > backed-pages %zu", + usr->result.alloc_pages, usr->result.backed_pages); + usr->result.alloc_pages = usr->result.backed_pages; + } + } else { + /* DB may be shrunk by writer down to the allocated (but unused) pages. 
*/ + if (unlikely(usr->result.alloc_pages > usr->result.backed_pages)) { + chk_scope_issue(inner, "alloc-pages %zu > backed-pages %zu", + usr->result.alloc_pages, usr->result.backed_pages); + usr->result.alloc_pages = usr->result.backed_pages; + } + if (unlikely(usr->result.alloc_pages > dxbfile_pages)) { + chk_scope_issue(inner, "alloc-pages %zu > file-pages %" PRIu64, + usr->result.alloc_pages, dxbfile_pages); + usr->result.alloc_pages = (size_t)dxbfile_pages; + } + if (unlikely(usr->result.backed_pages > dxbfile_pages)) + usr->result.backed_pages = (size_t)dxbfile_pages; + } + + line = chk_line_feed(chk_print( + chk_line_begin(inner, MDBX_chk_info), + "pagesize %u (%u system), max keysize %u..%u" + ", max readers %u", + env->me_psize, env->me_os_psize, + mdbx_env_get_maxkeysize_ex(env, MDBX_DUPSORT), + mdbx_env_get_maxkeysize_ex(env, MDBX_DB_DEFAULTS), env->me_maxreaders)); + line = chk_line_feed( + chk_print_size(line, "mapsize ", env->me_dxb_mmap.current, nullptr)); + if (txn->mt_geo.lower == txn->mt_geo.upper) + line = chk_print_size( + line, "fixed datafile: ", chk->envinfo.mi_geo.current, nullptr); + else { + line = chk_print_size( + line, "dynamic datafile: ", chk->envinfo.mi_geo.lower, nullptr); + line = chk_print_size(line, " .. 
", chk->envinfo.mi_geo.upper, ", "); + line = chk_print_size(line, "+", chk->envinfo.mi_geo.grow, ", "); + + line = chk_line_feed( + chk_print_size(line, "-", chk->envinfo.mi_geo.shrink, nullptr)); + line = chk_print_size( + line, "current datafile: ", chk->envinfo.mi_geo.current, nullptr); + } + tASSERT(txn, txn->mt_geo.now == chk->envinfo.mi_geo.current / + chk->envinfo.mi_dxb_pagesize); + chk_line_end(chk_print(line, ", %u pages", txn->mt_geo.now)); +#if defined(_WIN32) || defined(_WIN64) || MDBX_DEBUG + if (txn->mt_geo.shrink_pv && txn->mt_geo.now != txn->mt_geo.upper && + scope->verbosity >= MDBX_chk_verbose) { + line = chk_line_begin(inner, MDBX_chk_notice); + chk_line_feed(chk_print( + line, " > WARNING: Due Windows system limitations a file couldn't")); + chk_line_feed(chk_print( + line, " > be truncated while the database is opened. So, the size")); + chk_line_feed(chk_print( + line, " > database file of may by large than the database itself,")); + chk_line_end(chk_print( + line, " > until it will be closed or reopened in read-write mode.")); + } +#endif /* Windows || Debug */ + chk_verbose_meta(inner, 0); + chk_verbose_meta(inner, 1); + chk_verbose_meta(inner, 2); + + if (env->me_stuck_meta >= 0) { + chk_line_end(chk_print(chk_line_begin(inner, MDBX_chk_processing), + "skip checking meta-pages since the %u" + " is selected for verification", + env->me_stuck_meta)); + line = chk_line_feed( + chk_print(chk_line_begin(inner, MDBX_chk_resolution), + "transactions: recent %" PRIu64 ", " + "selected for verification %" PRIu64 ", lag %" PRIi64, + chk->envinfo.mi_recent_txnid, + chk->envinfo.mi_meta_txnid[env->me_stuck_meta], + chk->envinfo.mi_recent_txnid - + chk->envinfo.mi_meta_txnid[env->me_stuck_meta])); + chk_line_end(line); + } else { + chk_line_end(chk_puts(chk_line_begin(inner, MDBX_chk_verbose), + "performs check for meta-pages clashes")); + const unsigned meta_clash_mask = meta_eq_mask(&chk->troika); + if (meta_clash_mask & 1) + chk_scope_issue(inner, 
"meta-%d and meta-%d are clashed", 0, 1); + if (meta_clash_mask & 2) + chk_scope_issue(inner, "meta-%d and meta-%d are clashed", 1, 2); + if (meta_clash_mask & 4) + chk_scope_issue(inner, "meta-%d and meta-%d are clashed", 2, 0); + + const unsigned prefer_steady_metanum = chk->troika.prefer_steady; + const uint64_t prefer_steady_txnid = + chk->troika.txnid[prefer_steady_metanum]; + const unsigned recent_metanum = chk->troika.recent; + const uint64_t recent_txnid = chk->troika.txnid[recent_metanum]; + if (env->me_flags & MDBX_EXCLUSIVE) { + chk_line_end( + chk_puts(chk_line_begin(inner, MDBX_chk_verbose), + "performs full check recent-txn-id with meta-pages")); + eASSERT(env, recent_txnid == chk->envinfo.mi_recent_txnid); + if (prefer_steady_txnid != recent_txnid) { + if ((chk->flags & MDBX_CHK_READWRITE) != 0 && + (env->me_flags & MDBX_RDONLY) == 0 && + recent_txnid > prefer_steady_txnid && + (chk->envinfo.mi_bootid.current.x | + chk->envinfo.mi_bootid.current.y) != 0 && + chk->envinfo.mi_bootid.current.x == + chk->envinfo.mi_bootid.meta[recent_metanum].x && + chk->envinfo.mi_bootid.current.y == + chk->envinfo.mi_bootid.meta[recent_metanum].y) { + chk_line_end( + chk_print(chk_line_begin(inner, MDBX_chk_verbose), + "recent meta-%u is weak, but boot-id match current" + " (will synced upon successful check)", + recent_metanum)); + } else + chk_scope_issue( + inner, + "steady meta-%d txn-id mismatch recent-txn-id (%" PRIi64 + " != %" PRIi64 ")", + prefer_steady_metanum, prefer_steady_txnid, recent_txnid); + } + } else if (chk->write_locked) { + chk_line_end( + chk_puts(chk_line_begin(inner, MDBX_chk_verbose), + "performs lite check recent-txn-id with meta-pages (not a " + "monopolistic mode)")); + if (recent_txnid != chk->envinfo.mi_recent_txnid) { + chk_scope_issue(inner, + "weak meta-%d txn-id mismatch recent-txn-id (%" PRIi64 + " != %" PRIi64 ")", + recent_metanum, recent_txnid, + chk->envinfo.mi_recent_txnid); + } + } else { + chk_line_end(chk_puts( + 
chk_line_begin(inner, MDBX_chk_verbose), + "skip check recent-txn-id with meta-pages (monopolistic or " + "read-write mode only)")); + } + + chk_line_end(chk_print( + chk_line_begin(inner, MDBX_chk_resolution), + "transactions: recent %" PRIu64 ", latter reader %" PRIu64 + ", lag %" PRIi64, + chk->envinfo.mi_recent_txnid, chk->envinfo.mi_latter_reader_txnid, + chk->envinfo.mi_recent_txnid - chk->envinfo.mi_latter_reader_txnid)); + } + } + err = chk_scope_restore(scope, err); + + //-------------------------------------------------------------------------- + + if (chk->flags & MDBX_CHK_SKIP_BTREE_TRAVERSAL) + chk_line_end(chk_print(chk_line_begin(scope, MDBX_chk_processing), + "Skipping %s traversal...", "b-tree")); + else { + err = chk_scope_begin( + chk, -1, MDBX_chk_traversal_tree, nullptr, &usr->result.tree_problems, + "Traversal %s by txn#%" PRIaTXN "...", "b-tree", txn->mt_txnid); + if (likely(!err)) + err = chk_tree(usr->scope); + if (usr->result.tree_problems && usr->result.gc_tree_problems == 0) + usr->result.gc_tree_problems = usr->result.tree_problems; + if (usr->result.tree_problems && usr->result.kv_tree_problems == 0) + usr->result.kv_tree_problems = usr->result.tree_problems; + chk_scope_restore(scope, err); + } + + if (usr->result.gc_tree_problems > 0) + chk_line_end(chk_print( + chk_line_begin(scope, MDBX_chk_processing), + "Skip processing %s since %s is corrupted (%" PRIuSIZE " problem(s))", + chk_v2a(chk, MDBX_CHK_GC), "b-tree", + usr->result.problems_gc = usr->result.gc_tree_problems)); + else { + err = chk_scope_begin(chk, -1, MDBX_chk_traversal_freedb, &chk->subdb_gc, + &usr->result.problems_gc, + "Traversal %s by txn#%" PRIaTXN "...", "GC/freeDB", + txn->mt_txnid); + if (likely(!err)) + err = chk_db(usr->scope, FREE_DBI, &chk->subdb_gc, chk_handle_gc); + line = chk_line_begin(scope, MDBX_chk_info); + if (line) { + histogram_print(scope, line, &chk->subdb_gc.histogram.nested_tree, + "span(s)", "single", false); + chk_line_end(line); + } + if 
(usr->result.problems_gc == 0 && + (chk->flags & MDBX_CHK_SKIP_BTREE_TRAVERSAL) == 0) { + const size_t used_pages = usr->result.alloc_pages - usr->result.gc_pages; + if (usr->result.processed_pages != used_pages) + chk_scope_issue(usr->scope, + "used pages mismatch (%" PRIuSIZE + "(walked) != %" PRIuSIZE "(allocated - GC))", + usr->result.processed_pages, used_pages); + if (usr->result.unused_pages != usr->result.gc_pages) + chk_scope_issue(usr->scope, + "GC pages mismatch (%" PRIuSIZE + "(expected) != %" PRIuSIZE "(GC))", + usr->result.unused_pages, usr->result.gc_pages); + } + } + chk_scope_restore(scope, err); + + //-------------------------------------------------------------------------- + + err = chk_scope_begin(chk, 1, MDBX_chk_space, nullptr, nullptr, + "Page allocation:"); + const double percent_boundary_reciprocal = 100.0 / txn->mt_geo.upper; + const double percent_backed_reciprocal = 100.0 / usr->result.backed_pages; + const size_t detained = usr->result.gc_pages - usr->result.reclaimable_pages; + const size_t available2boundary = txn->mt_geo.upper - + usr->result.alloc_pages + + usr->result.reclaimable_pages; + const size_t available2backed = usr->result.backed_pages - + usr->result.alloc_pages + + usr->result.reclaimable_pages; + const size_t remained2boundary = txn->mt_geo.upper - usr->result.alloc_pages; + const size_t remained2backed = + usr->result.backed_pages - usr->result.alloc_pages; + + const size_t used = (chk->flags & MDBX_CHK_SKIP_BTREE_TRAVERSAL) + ? 
usr->result.alloc_pages - usr->result.gc_pages + : usr->result.processed_pages; + + line = chk_line_begin(usr->scope, MDBX_chk_info); + line = chk_print(line, + "backed by file: %" PRIuSIZE " pages (%.1f%%)" + ", %" PRIuSIZE " left to boundary (%.1f%%)", + usr->result.backed_pages, + usr->result.backed_pages * percent_boundary_reciprocal, + txn->mt_geo.upper - usr->result.backed_pages, + (txn->mt_geo.upper - usr->result.backed_pages) * + percent_boundary_reciprocal); + line = chk_line_feed(line); + + line = chk_print( + line, "%s: %" PRIuSIZE " page(s), %.1f%% of backed, %.1f%% of boundary", + "used", used, used * percent_backed_reciprocal, + used * percent_boundary_reciprocal); + line = chk_line_feed(line); + + line = chk_print( + line, + "%s: %" PRIuSIZE " page(s) (%.1f%%) of backed, %" PRIuSIZE + " to boundary (%.1f%% of boundary)", + "remained", remained2backed, remained2backed * percent_backed_reciprocal, + remained2boundary, remained2boundary * percent_boundary_reciprocal); + line = chk_line_feed(line); + + line = chk_print( + line, + "reclaimable: %" PRIuSIZE " (%.1f%% of backed, %.1f%% of boundary)" + ", GC %" PRIuSIZE " (%.1f%% of backed, %.1f%% of boundary)", + usr->result.reclaimable_pages, + usr->result.reclaimable_pages * percent_backed_reciprocal, + usr->result.reclaimable_pages * percent_boundary_reciprocal, + usr->result.gc_pages, usr->result.gc_pages * percent_backed_reciprocal, + usr->result.gc_pages * percent_boundary_reciprocal); + line = chk_line_feed(line); + + line = chk_print( + line, + "detained by reader(s): %" PRIuSIZE + " (%.1f%% of backed, %.1f%% of boundary)" + ", %u reader(s), lag %" PRIi64, + detained, detained * percent_backed_reciprocal, + detained * percent_boundary_reciprocal, chk->envinfo.mi_numreaders, + chk->envinfo.mi_recent_txnid - chk->envinfo.mi_latter_reader_txnid); + line = chk_line_feed(line); + + line = chk_print( + line, "%s: %" PRIuSIZE " page(s), %.1f%% of backed, %.1f%% of boundary", + "allocated", 
usr->result.alloc_pages, + usr->result.alloc_pages * percent_backed_reciprocal, + usr->result.alloc_pages * percent_boundary_reciprocal); + line = chk_line_feed(line); + + line = chk_print(line, + "%s: %" PRIuSIZE " page(s) (%.1f%%) of backed, %" PRIuSIZE + " to boundary (%.1f%% of boundary)", + "available", available2backed, + available2backed * percent_backed_reciprocal, + available2boundary, + available2boundary * percent_boundary_reciprocal); + chk_line_end(line); + + line = chk_line_begin(usr->scope, MDBX_chk_resolution); + line = chk_print(line, "%s %" PRIaPGNO " pages", + (txn->mt_geo.upper == txn->mt_geo.now) ? "total" : "upto", + txn->mt_geo.upper); + line = chk_print(line, ", backed %" PRIuSIZE " (%.1f%%)", + usr->result.backed_pages, + usr->result.backed_pages * percent_boundary_reciprocal); + line = chk_print(line, ", allocated %" PRIuSIZE " (%.1f%%)", + usr->result.alloc_pages, + usr->result.alloc_pages * percent_boundary_reciprocal); + line = + chk_print(line, ", available %" PRIuSIZE " (%.1f%%)", available2boundary, + available2boundary * percent_boundary_reciprocal); + chk_line_end(line); + chk_scope_restore(scope, err); + + //-------------------------------------------------------------------------- + + if (chk->flags & MDBX_CHK_SKIP_KV_TRAVERSAL) + chk_line_end(chk_print(chk_line_begin(scope, MDBX_chk_processing), + "Skipping %s traversal...", "key-value")); + else if ((usr->result.problems_kv = usr->result.kv_tree_problems) > 0) + chk_line_end(chk_print( + chk_line_begin(scope, MDBX_chk_processing), + "Skip processing %s since %s is corrupted (%" PRIuSIZE " problem(s))", + chk_v2a(chk, MDBX_CHK_MAIN), "key-value", + usr->result.problems_kv = usr->result.kv_tree_problems)); + else { + err = + chk_scope_begin(chk, 0, MDBX_chk_traversal_maindb, &chk->subdb_main, + &usr->result.problems_kv, "Processing %s...", "MainDB"); + if (likely(!err)) + err = chk_db(usr->scope, MAIN_DBI, &chk->subdb_main, chk_handle_kv); + chk_scope_restore(scope, err); + + if 
(usr->result.problems_kv && usr->result.subdb_total) + chk_line_end(chk_print(chk_line_begin(scope, MDBX_chk_processing), + "Skip processing %s", "sub-database(s)")); + else if (usr->result.problems_kv == 0 && usr->result.subdb_total == 0) + chk_line_end(chk_print(chk_line_begin(scope, MDBX_chk_info), "No %s", + "sub-database(s)")); + else if (usr->result.problems_kv == 0 && usr->result.subdb_total) { + err = chk_scope_begin(chk, 1, MDBX_chk_traversal_subdbs, nullptr, + &usr->result.problems_kv, + "Traversal %s by txn#%" PRIaTXN "...", + "sub-database(s)", txn->mt_txnid); + if (!err) + err = chk_db(usr->scope, MAIN_DBI, &chk->subdb_main, nullptr); + if (usr->scope->subtotal_issues) + chk_line_end( + chk_print(chk_line_begin(usr->scope, MDBX_chk_resolution), + "processed %" PRIuSIZE " of %" PRIuSIZE " subDb(s)" + ", %" PRIuSIZE " problems(s)", + usr->result.subdb_processed, usr->result.subdb_total, + usr->scope->subtotal_issues)); + } + chk_scope_restore(scope, err); + } + + return chk_scope_end(chk, chk_scope_begin(chk, 0, MDBX_chk_conclude, nullptr, + nullptr, nullptr)); +} + +__cold int mdbx_env_chk_encount_problem(MDBX_chk_context_t *ctx) { + if (likely(ctx && ctx->internal && ctx->internal->usr == ctx && + ctx->internal->problem_counter && ctx->scope)) { + *ctx->internal->problem_counter += 1; + ctx->scope->subtotal_issues += 1; + return MDBX_SUCCESS; + } + return MDBX_EINVAL; +} + +__cold int mdbx_env_chk(MDBX_env *env, const struct MDBX_chk_callbacks *cb, + MDBX_chk_context_t *ctx, + const enum MDBX_chk_flags_t flags, + enum MDBX_chk_severity verbosity, + unsigned timeout_seconds_16dot16) { + int err, rc = check_env(env, false); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; + if (unlikely(!cb || !ctx || ctx->internal)) + return MDBX_EINVAL; + + MDBX_chk_internal_t *const chk = osal_calloc(1, sizeof(MDBX_chk_internal_t)); + if (unlikely(!chk)) + return MDBX_ENOMEM; + + chk->cb = cb; + chk->usr = ctx; + chk->usr->internal = chk; + chk->usr->env = env; + 
chk->flags = flags; + + chk->subdb_gc.id = -1; + chk->subdb_gc.name.iov_base = MDBX_CHK_GC; + chk->subdb[FREE_DBI] = &chk->subdb_gc; + + chk->subdb_main.id = -1; + chk->subdb_main.name.iov_base = MDBX_CHK_MAIN; + chk->subdb[MAIN_DBI] = &chk->subdb_main; + + chk->monotime_timeout = + timeout_seconds_16dot16 + ? osal_16dot16_to_monotime(timeout_seconds_16dot16) + osal_monotime() + : 0; + chk->usr->scope_nesting = 0; + chk->usr->result.subdbs = (const void *)&chk->subdb; + + MDBX_chk_scope_t *const top = chk->scope_stack; + top->verbosity = verbosity; + top->internal = chk; + + // init + rc = chk_scope_end( + chk, chk_scope_begin(chk, 0, MDBX_chk_init, nullptr, nullptr, nullptr)); + + // lock + if (likely(!rc)) + rc = chk_scope_begin( + chk, 0, MDBX_chk_lock, nullptr, nullptr, "Taking %slock...", + (env->me_flags & (MDBX_RDONLY | MDBX_EXCLUSIVE)) ? "" : "read "); + if (likely(!rc) && (env->me_flags & (MDBX_RDONLY | MDBX_EXCLUSIVE)) == 0 && + (flags & MDBX_CHK_READWRITE)) { + rc = mdbx_txn_lock(env, false); + if (unlikely(rc)) + chk_error_rc(ctx->scope, rc, "mdbx_txn_lock"); + else + chk->write_locked = true; + } + if (likely(!rc)) { + rc = mdbx_txn_begin(env, nullptr, MDBX_TXN_RDONLY, &ctx->txn); + if (unlikely(rc)) + chk_error_rc(ctx->scope, rc, "mdbx_txn_begin"); + } + chk_scope_end(chk, rc); + + // doit + if (likely(!rc)) { + chk->subdb_gc.flags = ctx->txn->mt_dbs[FREE_DBI].md_flags; + chk->subdb_main.flags = ctx->txn->mt_dbs[MAIN_DBI].md_flags; + rc = env_chk(top); + } + + // unlock + if (ctx->txn || chk->write_locked) { + chk_scope_begin(chk, 0, MDBX_chk_unlock, nullptr, nullptr, nullptr); + if (ctx->txn) { + err = mdbx_txn_abort(ctx->txn); + if (err && !rc) + rc = err; + ctx->txn = nullptr; + } + if (chk->write_locked) + mdbx_txn_unlock(env); + rc = chk_scope_end(chk, rc); + } + + // finalize + err = chk_scope_begin(chk, 0, MDBX_chk_finalize, nullptr, nullptr, nullptr); + rc = chk_scope_end(chk, err ? 
err : rc); + chk_dispose(chk); + return rc; +} + /******************************************************************************/ __dll_export @@ -29897,9 +33425,9 @@ __dll_export #ifdef __SANITIZE_ADDRESS__ " SANITIZE_ADDRESS=YES" #endif /* __SANITIZE_ADDRESS__ */ -#ifdef MDBX_USE_VALGRIND - " MDBX_USE_VALGRIND=YES" -#endif /* MDBX_USE_VALGRIND */ +#ifdef ENABLE_MEMCHECK + " ENABLE_MEMCHECK=YES" +#endif /* ENABLE_MEMCHECK */ #if MDBX_FORCE_ASSERTIONS " MDBX_FORCE_ASSERTIONS=YES" #endif /* MDBX_FORCE_ASSERTIONS */ @@ -30011,7 +33539,7 @@ const char *__asan_default_options(void) { /* https://en.wikipedia.org/wiki/Operating_system_abstraction_layer */ /* - * Copyright 2015-2023 Leonid Yuriev + * Copyright 2015-2024 Leonid Yuriev * and other libmdbx authors: please see AUTHORS file. * All rights reserved. * @@ -30253,7 +33781,7 @@ MDBX_NORETURN __cold void assert_fail(const char *msg, const char *func, unsigned line) { #endif /* MDBX_DEBUG */ - if (debug_logger) + if (mdbx_static.logger.ptr) debug_log(MDBX_LOG_FATAL, func, line, "assert: %s\n", msg); else { #if defined(_WIN32) || defined(_WIN64) @@ -30296,7 +33824,7 @@ __cold void mdbx_panic(const char *fmt, ...) { ? 
"" : message; - if (debug_logger) + if (mdbx_static.logger.ptr) debug_log(MDBX_LOG_FATAL, "panic", 0, "%s", const_message); while (1) { @@ -30512,8 +34040,18 @@ MDBX_INTERNAL_FUNC int osal_fastmutex_init(osal_fastmutex_t *fastmutex) { #if defined(_WIN32) || defined(_WIN64) InitializeCriticalSection(fastmutex); return MDBX_SUCCESS; +#elif MDBX_DEBUG + pthread_mutexattr_t ma; + int rc = pthread_mutexattr_init(&ma); + if (likely(!rc)) { + rc = pthread_mutexattr_settype(&ma, PTHREAD_MUTEX_ERRORCHECK); + if (likely(!rc) || rc == ENOTSUP) + rc = pthread_mutex_init(fastmutex, &ma); + pthread_mutexattr_destroy(&ma); + } + return rc; #else - return pthread_mutex_init(fastmutex, NULL); + return pthread_mutex_init(fastmutex, nullptr); #endif } @@ -30535,7 +34073,7 @@ MDBX_INTERNAL_FUNC int osal_fastmutex_acquire(osal_fastmutex_t *fastmutex) { 0xC0000194 /* STATUS_POSSIBLE_DEADLOCK / EXCEPTION_POSSIBLE_DEADLOCK */) ? EXCEPTION_EXECUTE_HANDLER : EXCEPTION_CONTINUE_SEARCH) { - return ERROR_POSSIBLE_DEADLOCK; + return MDBX_EDEADLK; } return MDBX_SUCCESS; #else @@ -31835,8 +35373,8 @@ MDBX_INTERNAL_FUNC int osal_check_fs_rdonly(mdbx_filehandle_t handle, #else struct statvfs info; if (err != MDBX_ENOFILE) { - if (statvfs(pathname, &info) == 0 && (info.f_flag & ST_RDONLY) == 0) - return err; + if (statvfs(pathname, &info) == 0) + return (info.f_flag & ST_RDONLY) ? 
MDBX_SUCCESS : err; if (errno != MDBX_ENOFILE) return errno; } @@ -33487,10 +37025,8 @@ __cold int mdbx_get_sysraminfo(intptr_t *page_size, intptr_t *total_pages, return MDBX_SUCCESS; } -#ifndef xMDBX_ALLOY -unsigned sys_pagesize; -MDBX_MAYBE_UNUSED unsigned sys_pagesize_ln2, sys_allocation_granularity; -#endif /* xMDBX_ALLOY */ +MDBX_INTERNAL_VAR_INSTA unsigned sys_pagesize, sys_pagesize_ln2, + sys_allocation_granularity; void osal_ctor(void) { #if MDBX_HAVE_PWRITEV && defined(_SC_IOV_MAX) @@ -33537,7 +37073,7 @@ void osal_dtor(void) {} #if MDBX_VERSION_MAJOR != 0 || \ - MDBX_VERSION_MINOR != 12 + MDBX_VERSION_MINOR != 13 #error "API version mismatch! Had `git fetch --tags` done?" #endif @@ -33557,11 +37093,11 @@ __dll_export #endif const struct MDBX_version_info mdbx_version = { 0, - 12, - 9, - 16, - {"2024-03-06T22:58:31+03:00", "c5e6e3a4f75727b9e0039ad420ae167d3487d006", "fff3fbd866c50ee3c77b244a9b05f497e06a65e8", - "v0.12.9-16-gfff3fbd8"}, + 13, + 0, + 38, + {"2024-04-04T22:31:03+03:00", "a0fc2d938419aa82764beae00e1472f412d5a4d5", "f19753636d2364c43125f972b8d3f29dc9e244b4", + "v0.13.0-38-gf1975363"}, sourcery}; __dll_export @@ -33578,7 +37114,7 @@ __dll_export #endif const char *const mdbx_sourcery_anchor = sourcery; /* - * Copyright 2015-2023 Leonid Yuriev + * Copyright 2015-2024 Leonid Yuriev * and other libmdbx authors: please see AUTHORS file. * All rights reserved. * @@ -33752,7 +37288,7 @@ static int funlock(mdbx_filehandle_t fd, size_t offset, size_t bytes) { #define DXB_BODY (env->me_psize * (size_t)NUM_METAS), DXB_MAXLEN #define DXB_WHOLE 0, DXB_MAXLEN -int mdbx_txn_lock(MDBX_env *env, bool dontwait) { +int osal_txn_lock(MDBX_env *env, bool dontwait) { if (dontwait) { if (!TryEnterCriticalSection(&env->me_windowsbug_lock)) return MDBX_BUSY; @@ -33764,16 +37300,13 @@ int mdbx_txn_lock(MDBX_env *env, bool dontwait) { 0xC0000194 /* STATUS_POSSIBLE_DEADLOCK / EXCEPTION_POSSIBLE_DEADLOCK */) ? 
EXCEPTION_EXECUTE_HANDLER : EXCEPTION_CONTINUE_SEARCH) { - return ERROR_POSSIBLE_DEADLOCK; + return MDBX_EDEADLK; } } - if (env->me_flags & MDBX_EXCLUSIVE) { - /* Zap: Failing to release lock 'env->me_windowsbug_lock' - * in function 'mdbx_txn_lock' */ - MDBX_SUPPRESS_GOOFY_MSVC_ANALYZER(26115); - return MDBX_SUCCESS; - } + eASSERT(env, !env->me_txn0->mt_owner); + if (env->me_flags & MDBX_EXCLUSIVE) + goto done; const HANDLE fd4data = env->me_overlapped_fd ? env->me_overlapped_fd : env->me_lazy_fd; @@ -33792,17 +37325,20 @@ int mdbx_txn_lock(MDBX_env *env, bool dontwait) { } } if (rc == MDBX_SUCCESS) { + done: /* Zap: Failing to release lock 'env->me_windowsbug_lock' * in function 'mdbx_txn_lock' */ MDBX_SUPPRESS_GOOFY_MSVC_ANALYZER(26115); - return rc; + env->me_txn0->mt_owner = osal_thread_self(); + return MDBX_SUCCESS; } LeaveCriticalSection(&env->me_windowsbug_lock); return (!dontwait || rc != ERROR_LOCK_VIOLATION) ? rc : MDBX_BUSY; } -void mdbx_txn_unlock(MDBX_env *env) { +void osal_txn_unlock(MDBX_env *env) { + eASSERT(env, env->me_txn0->mt_owner == osal_thread_self()); if ((env->me_flags & MDBX_EXCLUSIVE) == 0) { const HANDLE fd4data = env->me_overlapped_fd ? 
env->me_overlapped_fd : env->me_lazy_fd; @@ -33810,6 +37346,7 @@ void mdbx_txn_unlock(MDBX_env *env) { if (err != MDBX_SUCCESS) mdbx_panic("%s failed: err %u", __func__, err); } + env->me_txn0->mt_owner = 0; LeaveCriticalSection(&env->me_windowsbug_lock); } @@ -33899,7 +37436,7 @@ static int suspend_and_append(mdbx_handle_array_t **array, MDBX_INTERNAL_FUNC int osal_suspend_threads_before_remap(MDBX_env *env, mdbx_handle_array_t **array) { - eASSERT(env, (env->me_flags & MDBX_NOTLS) == 0); + eASSERT(env, (env->me_flags & MDBX_NOSTICKYTHREADS) == 0); const uintptr_t CurrentTid = GetCurrentThreadId(); int rc; if (env->me_lck_mmap.lck) { @@ -34016,7 +37553,7 @@ osal_resume_threads_after_remap(mdbx_handle_array_t *array) { * The osal_lck_downgrade() moves the locking-FSM from "exclusive write" * state to the "used" (i.e. shared) state. * - * The mdbx_lck_upgrade() moves the locking-FSM from "used" (i.e. shared) + * The osal_lck_upgrade() moves the locking-FSM from "used" (i.e. shared) * state to the "exclusive write" state. */ @@ -34189,7 +37726,7 @@ MDBX_INTERNAL_FUNC int osal_lck_downgrade(MDBX_env *env) { return MDBX_SUCCESS /* 5) now at S-? (used), done */; } -MDBX_INTERNAL_FUNC int mdbx_lck_upgrade(MDBX_env *env) { +MDBX_INTERNAL_FUNC int osal_lck_upgrade(MDBX_env *env, bool dont_wait) { /* Transite from used state (S-?) to exclusive-write (E-E) */ assert(env->me_lfd != INVALID_HANDLE_VALUE); @@ -34199,7 +37736,9 @@ MDBX_INTERNAL_FUNC int mdbx_lck_upgrade(MDBX_env *env) { /* 1) now on S-? (used), try S-E (locked) */ jitter4testing(false); - int rc = flock(env->me_lfd, LCK_EXCLUSIVE | LCK_DONTWAIT, LCK_UPPER); + int rc = flock(env->me_lfd, + dont_wait ? 
LCK_EXCLUSIVE | LCK_DONTWAIT : LCK_EXCLUSIVE, + LCK_UPPER); if (rc != MDBX_SUCCESS) { /* 2) something went wrong, give up */; VERBOSE("%s, err %u", "S-?(used) >> S-E(locked)", rc); @@ -34214,7 +37753,9 @@ MDBX_INTERNAL_FUNC int mdbx_lck_upgrade(MDBX_env *env) { /* 4) now on ?-E (middle), try E-E (exclusive-write) */ jitter4testing(false); - rc = flock(env->me_lfd, LCK_EXCLUSIVE | LCK_DONTWAIT, LCK_LOWER); + rc = flock(env->me_lfd, + dont_wait ? LCK_EXCLUSIVE | LCK_DONTWAIT : LCK_EXCLUSIVE, + LCK_LOWER); if (rc != MDBX_SUCCESS) { /* 5) something went wrong, give up */; VERBOSE("%s, err %u", "?-E(middle) >> E-E(exclusive-write)", rc); @@ -34251,7 +37792,9 @@ MDBX_INTERNAL_FUNC int osal_lck_init(MDBX_env *env, } MDBX_INTERNAL_FUNC int osal_lck_destroy(MDBX_env *env, - MDBX_env *inprocess_neighbor) { + MDBX_env *inprocess_neighbor, + const uint32_t current_pid) { + (void)current_pid; /* LY: should unmap before releasing the locks to avoid race condition and * STATUS_USER_MAPPED_FILE/ERROR_USER_MAPPED_FILE */ if (env->me_map) @@ -34260,7 +37803,7 @@ MDBX_INTERNAL_FUNC int osal_lck_destroy(MDBX_env *env, const bool synced = env->me_lck_mmap.lck->mti_unsynced_pages.weak == 0; osal_munmap(&env->me_lck_mmap); if (synced && !inprocess_neighbor && env->me_lfd != INVALID_HANDLE_VALUE && - mdbx_lck_upgrade(env) == MDBX_SUCCESS) + osal_lck_upgrade(env, true) == MDBX_SUCCESS) /* this will fail if LCK is used/mmapped by other process(es) */ osal_ftruncate(env->me_lfd, 0); } @@ -34487,7 +38030,7 @@ static void mdbx_winnt_import(void) { #endif /* Windows LCK-implementation */ /* - * Copyright 2015-2023 Leonid Yuriev + * Copyright 2015-2024 Leonid Yuriev * and other libmdbx authors: please see AUTHORS file. * All rights reserved. 
* @@ -34514,10 +38057,9 @@ static void mdbx_winnt_import(void) { #include -#ifndef xMDBX_ALLOY -uint32_t linux_kernel_version; -bool mdbx_RunningOnWSL1; -#endif /* xMDBX_ALLOY */ +MDBX_INTERNAL_VAR_INSTA uint32_t linux_kernel_version; +MDBX_INTERNAL_VAR_INSTA bool + mdbx_RunningOnWSL1 /* Windows Subsystem 1 for Linux */; MDBX_EXCLUDE_FOR_GPROF __cold static uint8_t probe_for_WSL(const char *tag) { @@ -34607,7 +38149,7 @@ mdbx_global_destructor(void) { * - Блокировка таблицы читателей для регистрации, * т.е. функции osal_rdt_lock() и osal_rdt_unlock(). * - Блокировка БД для пишущих транзакций, - * т.е. функции mdbx_txn_lock() и mdbx_txn_unlock(). + * т.е. функции osal_txn_lock() и osal_txn_unlock(). * * Остальной функционал реализуется отдельно посредством файловых блокировок: * - Первоначальный захват БД в режиме exclusive/shared и последующий перевод @@ -34657,7 +38199,7 @@ mdbx_global_destructor(void) { static int op_setlk, op_setlkw, op_getlk; __cold static void choice_fcntl(void) { assert(!op_setlk && !op_setlkw && !op_getlk); - if ((runtime_flags & MDBX_DBG_LEGACY_MULTIOPEN) == 0 + if ((mdbx_static.flags & MDBX_DBG_LEGACY_MULTIOPEN) == 0 #if defined(__linux__) || defined(__gnu_linux__) && linux_kernel_version > 0x030f0000 /* OFD locks are available since 3.15, but engages here @@ -34781,7 +38323,7 @@ MDBX_INTERNAL_FUNC int osal_rpid_check(MDBX_env *env, uint32_t pid) { /*---------------------------------------------------------------------------*/ #if MDBX_LOCKING > MDBX_LOCKING_SYSV -MDBX_INTERNAL_FUNC int osal_ipclock_stub(osal_ipclock_t *ipc) { +MDBX_INTERNAL_FUNC int osal_ipclock_stubinit(osal_ipclock_t *ipc) { #if MDBX_LOCKING == MDBX_LOCKING_POSIX1988 return sem_init(ipc, false, 1) ? 
errno : 0; #elif MDBX_LOCKING == MDBX_LOCKING_POSIX2001 || \ @@ -35014,15 +38556,42 @@ MDBX_INTERNAL_FUNC int osal_lck_downgrade(MDBX_env *env) { return rc; } -__cold MDBX_INTERNAL_FUNC int osal_lck_destroy(MDBX_env *env, - MDBX_env *inprocess_neighbor) { +MDBX_INTERNAL_FUNC int osal_lck_upgrade(MDBX_env *env, bool dont_wait) { + assert(env->me_lfd != INVALID_HANDLE_VALUE); if (unlikely(osal_getpid() != env->me_pid)) return MDBX_PANIC; + const int cmd = dont_wait ? op_setlk : op_setlkw; + int rc = lck_op(env->me_lfd, cmd, F_WRLCK, 0, 1); + if (rc == MDBX_SUCCESS && (env->me_flags & MDBX_EXCLUSIVE) == 0) { + rc = (env->me_pid > 1) + ? lck_op(env->me_lazy_fd, cmd, F_WRLCK, 0, env->me_pid - 1) + : MDBX_SUCCESS; + if (rc == MDBX_SUCCESS) { + rc = lck_op(env->me_lazy_fd, cmd, F_WRLCK, env->me_pid + 1, + OFF_T_MAX - env->me_pid - 1); + if (rc != MDBX_SUCCESS && env->me_pid > 1 && + lck_op(env->me_lazy_fd, op_setlk, F_UNLCK, 0, env->me_pid - 1)) + rc = MDBX_PANIC; + } + if (rc != MDBX_SUCCESS && lck_op(env->me_lfd, op_setlk, F_RDLCK, 0, 1)) + rc = MDBX_PANIC; + } + if (unlikely(rc != 0)) { + ERROR("%s, err %u", "lck", rc); + assert(MDBX_IS_ERROR(rc)); + } + return rc; +} + +__cold MDBX_INTERNAL_FUNC int osal_lck_destroy(MDBX_env *env, + MDBX_env *inprocess_neighbor, + const uint32_t current_pid) { + eASSERT(env, osal_getpid() == current_pid); int rc = MDBX_SUCCESS; struct stat lck_info; - MDBX_lockinfo *lck = env->me_lck_mmap.lck; - if (env->me_lfd != INVALID_HANDLE_VALUE && !inprocess_neighbor && lck && + MDBX_lockinfo *lck = env->me_lck; + if (lck && lck == env->me_lck_mmap.lck && !inprocess_neighbor && /* try get exclusive access */ lck_op(env->me_lfd, op_setlk, F_WRLCK, 0, OFF_T_MAX) == 0 && /* if LCK was not removed */ @@ -35031,7 +38600,8 @@ __cold MDBX_INTERNAL_FUNC int osal_lck_destroy(MDBX_env *env, (env->me_flags & MDBX_RDONLY) ? 
F_RDLCK : F_WRLCK, 0, OFF_T_MAX) == 0) { - VERBOSE("%p got exclusive, drown locks", (void *)env); + VERBOSE("%p got exclusive, drown ipc-locks", (void *)env); + eASSERT(env, current_pid == env->me_pid); #if MDBX_LOCKING == MDBX_LOCKING_SYSV if (env->me_sysv_ipc.semid != -1) rc = semctl(env->me_sysv_ipc.semid, 2, IPC_RMID) ? errno : 0; @@ -35045,13 +38615,20 @@ __cold MDBX_INTERNAL_FUNC int osal_lck_destroy(MDBX_env *env, if (rc == 0) { const bool synced = lck->mti_unsynced_pages.weak == 0; osal_munmap(&env->me_lck_mmap); - if (synced) + if (synced && env->me_lfd != INVALID_HANDLE_VALUE) rc = ftruncate(env->me_lfd, 0) ? errno : 0; } jitter4testing(false); } + if (current_pid != env->me_pid) { + eASSERT(env, !inprocess_neighbor); + NOTICE("drown env %p after-fork pid %d -> %d", + __Wpedantic_format_voidptr(env), env->me_pid, current_pid); + inprocess_neighbor = nullptr; + } + /* 1) POSIX's fcntl() locks (i.e. when op_setlk == F_SETLK) should be restored * after file was closed. * @@ -35248,7 +38825,7 @@ __cold MDBX_INTERNAL_FUNC int osal_lck_init(MDBX_env *env, #endif /* MDBX_LOCKING > 0 */ } -__cold static int mdbx_ipclock_failed(MDBX_env *env, osal_ipclock_t *ipc, +__cold static int osal_ipclock_failed(MDBX_env *env, osal_ipclock_t *ipc, const int err) { int rc = err; #if MDBX_LOCKING == MDBX_LOCKING_POSIX2008 || MDBX_LOCKING == MDBX_LOCKING_SYSV @@ -35309,11 +38886,6 @@ __cold static int mdbx_ipclock_failed(MDBX_env *env, osal_ipclock_t *ipc, #error "FIXME" #endif /* MDBX_LOCKING */ -#if defined(MDBX_USE_VALGRIND) || defined(__SANITIZE_ADDRESS__) - if (rc == EDEADLK && atomic_load32(&env->me_ignore_EDEADLK, mo_Relaxed) > 0) - return rc; -#endif /* MDBX_USE_VALGRIND || __SANITIZE_ADDRESS__ */ - ERROR("mutex (un)lock failed, %s", mdbx_strerror(err)); if (rc != EDEADLK) env->me_flags |= MDBX_FATAL_ERROR; @@ -35339,7 +38911,7 @@ MDBX_INTERNAL_FUNC int osal_check_tid4bionic(void) { } #endif /* __ANDROID_API__ || ANDROID) || BIONIC */ -static int 
mdbx_ipclock_lock(MDBX_env *env, osal_ipclock_t *ipc, +static int osal_ipclock_lock(MDBX_env *env, osal_ipclock_t *ipc, const bool dont_wait) { #if MDBX_LOCKING == MDBX_LOCKING_POSIX2001 || \ MDBX_LOCKING == MDBX_LOCKING_POSIX2008 @@ -35375,63 +38947,87 @@ static int mdbx_ipclock_lock(MDBX_env *env, osal_ipclock_t *ipc, #endif /* MDBX_LOCKING */ if (unlikely(rc != MDBX_SUCCESS && rc != MDBX_BUSY)) - rc = mdbx_ipclock_failed(env, ipc, rc); + rc = osal_ipclock_failed(env, ipc, rc); return rc; } -static int mdbx_ipclock_unlock(MDBX_env *env, osal_ipclock_t *ipc) { +int osal_ipclock_unlock(MDBX_env *env, osal_ipclock_t *ipc) { + int err = MDBX_ENOSYS; #if MDBX_LOCKING == MDBX_LOCKING_POSIX2001 || \ MDBX_LOCKING == MDBX_LOCKING_POSIX2008 - int rc = pthread_mutex_unlock(ipc); - (void)env; + err = pthread_mutex_unlock(ipc); #elif MDBX_LOCKING == MDBX_LOCKING_POSIX1988 - int rc = sem_post(ipc) ? errno : MDBX_SUCCESS; - (void)env; + err = sem_post(ipc) ? errno : MDBX_SUCCESS; #elif MDBX_LOCKING == MDBX_LOCKING_SYSV if (unlikely(*ipc != (pid_t)env->me_pid)) - return EPERM; - *ipc = 0; - struct sembuf op = {.sem_num = (ipc != &env->me_lck->mti_wlock), - .sem_op = 1, - .sem_flg = SEM_UNDO}; - int rc = semop(env->me_sysv_ipc.semid, &op, 1) ? errno : MDBX_SUCCESS; + err = EPERM; + else { + *ipc = 0; + struct sembuf op = {.sem_num = (ipc != &env->me_lck->mti_wlock), + .sem_op = 1, + .sem_flg = SEM_UNDO}; + err = semop(env->me_sysv_ipc.semid, &op, 1) ? errno : MDBX_SUCCESS; + } #else #error "FIXME" #endif /* MDBX_LOCKING */ + int rc = err; + if (unlikely(rc != MDBX_SUCCESS)) { + const uint32_t current_pid = osal_getpid(); + if (current_pid == env->me_pid || LOG_ENABLED(MDBX_LOG_NOTICE)) + debug_log((current_pid == env->me_pid) + ? MDBX_LOG_FATAL + : (rc = MDBX_SUCCESS, MDBX_LOG_NOTICE), + "ipc-unlock()", __LINE__, "failed: env %p, lck-%s %p, err %d\n", + __Wpedantic_format_voidptr(env), + (env->me_lck == env->me_lck_mmap.lck) ? 
"mmap" : "stub", + __Wpedantic_format_voidptr(env->me_lck), err); + } return rc; } MDBX_INTERNAL_FUNC int osal_rdt_lock(MDBX_env *env) { TRACE("%s", ">>"); jitter4testing(true); - int rc = mdbx_ipclock_lock(env, &env->me_lck->mti_rlock, false); + int rc = osal_ipclock_lock(env, &env->me_lck->mti_rlock, false); TRACE("<< rc %d", rc); return rc; } MDBX_INTERNAL_FUNC void osal_rdt_unlock(MDBX_env *env) { TRACE("%s", ">>"); - int rc = mdbx_ipclock_unlock(env, &env->me_lck->mti_rlock); - TRACE("<< rc %d", rc); - if (unlikely(rc != MDBX_SUCCESS)) - mdbx_panic("%s() failed: err %d\n", __func__, rc); + int err = osal_ipclock_unlock(env, &env->me_lck->mti_rlock); + TRACE("<< err %d", err); + if (unlikely(err != MDBX_SUCCESS)) + mdbx_panic("%s() failed: err %d\n", __func__, err); jitter4testing(true); } -int mdbx_txn_lock(MDBX_env *env, bool dont_wait) { +int osal_txn_lock(MDBX_env *env, bool dont_wait) { TRACE("%swait %s", dont_wait ? "dont-" : "", ">>"); jitter4testing(true); - int rc = mdbx_ipclock_lock(env, &env->me_lck->mti_wlock, dont_wait); - TRACE("<< rc %d", rc); - return MDBX_IS_ERROR(rc) ? 
rc : MDBX_SUCCESS; + const int err = osal_ipclock_lock(env, &env->me_lck->mti_wlock, dont_wait); + int rc = err; + if (likely(!MDBX_IS_ERROR(err))) { + eASSERT(env, !env->me_txn0->mt_owner || + err == /* если другой поток в этом-же процессе завершился + не освободив блокировку */ + MDBX_RESULT_TRUE); + env->me_txn0->mt_owner = osal_thread_self(); + rc = MDBX_SUCCESS; + } + TRACE("<< err %d, rc %d", err, rc); + return rc; } -void mdbx_txn_unlock(MDBX_env *env) { +void osal_txn_unlock(MDBX_env *env) { TRACE("%s", ">>"); - int rc = mdbx_ipclock_unlock(env, &env->me_lck->mti_wlock); - TRACE("<< rc %d", rc); - if (unlikely(rc != MDBX_SUCCESS)) - mdbx_panic("%s() failed: err %d\n", __func__, rc); + eASSERT(env, env->me_txn0->mt_owner == osal_thread_self()); + env->me_txn0->mt_owner = 0; + int err = osal_ipclock_unlock(env, &env->me_lck->mti_wlock); + TRACE("<< err %d", err); + if (unlikely(err != MDBX_SUCCESS)) + mdbx_panic("%s() failed: err %d\n", __func__, err); jitter4testing(true); } diff --git a/mdbx/mdbx.h b/mdbx/mdbx.h index 1512271..98f9a86 100644 --- a/mdbx/mdbx.h +++ b/mdbx/mdbx.h @@ -25,7 +25,7 @@ _The Future will (be) [Positive](https://www.ptsecurity.com). Всё будет \section copyright LICENSE & COPYRIGHT -\authors Copyright (c) 2015-2023, Leonid Yuriev +\authors Copyright (c) 2015-2024, Leonid Yuriev and other _libmdbx_ authors: please see [AUTHORS](./AUTHORS) file. 
\copyright Redistribution and use in source and binary forms, with or without @@ -343,13 +343,14 @@ typedef mode_t mdbx_mode_t; #ifdef __deprecated #define MDBX_DEPRECATED __deprecated #elif defined(DOXYGEN) || \ - (defined(__cplusplus) && __cplusplus >= 201603L && \ - __has_cpp_attribute(maybe_unused) && \ - __has_cpp_attribute(maybe_unused) >= 201603L) || \ + (defined(__cplusplus) && __cplusplus >= 201403L && \ + __has_cpp_attribute(deprecated) && \ + __has_cpp_attribute(deprecated) >= 201309L) || \ (!defined(__cplusplus) && defined(__STDC_VERSION__) && \ - __STDC_VERSION__ > 202005L) + __STDC_VERSION__ >= 202304L) #define MDBX_DEPRECATED [[deprecated]] -#elif defined(__GNUC__) || __has_attribute(__deprecated__) +#elif (defined(__GNUC__) && __GNUC__ > 5) || \ + (__has_attribute(__deprecated__) && !defined(__GNUC__)) #define MDBX_DEPRECATED __attribute__((__deprecated__)) #elif defined(_MSC_VER) #define MDBX_DEPRECATED __declspec(deprecated) @@ -634,9 +635,9 @@ typedef mode_t mdbx_mode_t; extern "C" { #endif -/* MDBX version 0.12.x */ +/* MDBX version 0.13.x */ #define MDBX_VERSION_MAJOR 0 -#define MDBX_VERSION_MINOR 12 +#define MDBX_VERSION_MINOR 13 #ifndef LIBMDBX_API #if defined(LIBMDBX_EXPORTS) @@ -816,7 +817,7 @@ typedef struct iovec MDBX_val; #endif /* ! SunOS */ enum MDBX_constants { - /** The hard limit for DBI handles */ + /** The hard limit for DBI handles. */ MDBX_MAX_DBI = UINT32_C(32765), /** The maximum size of a data item. */ @@ -1012,6 +1013,7 @@ typedef void MDBX_debug_func(MDBX_log_level_t loglevel, const char *function, /** \brief The "don't change `logger`" value for mdbx_setup_debug() */ #define MDBX_LOGGER_DONTCHANGE ((MDBX_debug_func *)(intptr_t)-1) +#define MDBX_LOGGER_NOFMT_DONTCHANGE ((MDBX_debug_func_nofmt *)(intptr_t)-1) /** \brief Setup global log-level, debug options and debug logger. 
* \returns The previously `debug_flags` in the 0-15 bits @@ -1020,6 +1022,17 @@ LIBMDBX_API int mdbx_setup_debug(MDBX_log_level_t log_level, MDBX_debug_flags_t debug_flags, MDBX_debug_func *logger); +typedef void MDBX_debug_func_nofmt(MDBX_log_level_t loglevel, + const char *function, int line, + const char *msg, + unsigned length) MDBX_CXX17_NOEXCEPT; + +LIBMDBX_API int mdbx_setup_debug_nofmt(MDBX_log_level_t log_level, + MDBX_debug_flags_t debug_flags, + MDBX_debug_func_nofmt *logger, + char *logger_buffer, + size_t logger_buffer_size); + /** \brief A callback function for most MDBX assert() failures, * called before printing the message and aborting. * \see mdbx_env_set_assert() @@ -1195,28 +1208,80 @@ enum MDBX_env_flags_t { */ MDBX_WRITEMAP = UINT32_C(0x80000), - /** Tie reader locktable slots to read-only transactions - * instead of to threads. + /** Отвязывает транзакции от потоков/threads насколько это возможно. * - * Don't use Thread-Local Storage, instead tie reader locktable slots to - * \ref MDBX_txn objects instead of to threads. So, \ref mdbx_txn_reset() - * keeps the slot reserved for the \ref MDBX_txn object. A thread may use - * parallel read-only transactions. And a read-only transaction may span - * threads if you synchronizes its use. + * Эта опция предназначена для приложений, которые мультиплексируют множество + * пользовательских легковесных потоков выполнения по отдельным потокам + * операционной системы, например как это происходит в средах выполнения + * GoLang и Rust. Таким приложениям также рекомендуется сериализовать + * транзакции записи в одном потоке операционной системы, поскольку блокировка + * записи MDBX использует базовые системные примитивы синхронизации и ничего + * не знает о пользовательских потоках и/или легковесных потоков среды + * выполнения. Как минимум, обязательно требуется обеспечить завершение каждой + * пишущей транзакции строго в том же потоке операционной системы где она была + * запущена. 
* - * Applications that multiplex many user threads over individual OS threads - * need this option. Such an application must also serialize the write - * transactions in an OS thread, since MDBX's write locking is unaware of - * the user threads. + * \note Начиная с версии v0.13 опция `MDBX_NOSTICKYTHREADS` полностью + * заменяет опцию \ref MDBX_NOTLS. * - * \note Regardless to `MDBX_NOTLS` flag a write transaction entirely should - * always be used in one thread from start to finish. MDBX checks this in a - * reasonable manner and return the \ref MDBX_THREAD_MISMATCH error in rules - * violation. + * При использовании `MDBX_NOSTICKYTHREADS` транзакции становятся не + * ассоциированными с создавшими их потоками выполнения. Поэтому в функциях + * API не выполняется проверка соответствия транзакции и текущего потока + * выполнения. Большинство функций работающих с транзакциями и курсорами + * становится возможным вызывать из любых потоков выполнения. Однако, также + * становится невозможно обнаружить ошибки одновременного использования + * транзакций и/или курсоров в разных потоках. * - * This flag affects only at environment opening but can't be changed after. + * Использование `MDBX_NOSTICKYTHREADS` также сужает возможности по изменению + * размера БД, так как теряется возможность отслеживать работающие с БД потоки + * выполнения и приостанавливать их на время снятия отображения БД в ОЗУ. В + * частности, по этой причине на Windows уменьшение файла БД не возможно до + * закрытия БД последним работающим с ней процессом или до последующего + * открытия БД в режиме чтения-записи. + * + * \warning Вне зависимости от \ref MDBX_NOSTICKYTHREADS и \ref MDBX_NOTLS не + * допускается одновременно использование объектов API из разных потоков + * выполнения! Обеспечение всех мер для исключения одновременного + * использования объектов API из разных потоков выполнения целиком ложится на + * вас! 
+ * + * \warning Транзакции записи могут быть завершены только в том же потоке + * выполнения где они были запущены. Это ограничение следует из требований + * большинства операционных систем о том, что захваченный примитив + * синхронизации (мьютекс, семафор, критическая секция) должен освобождаться + * только захватившим его потоком выполнения. + * + * \warning Создание курсора в контексте транзакции, привязка курсора к + * транзакции, отвязка курсора от транзакции и закрытие привязанного к + * транзакции курсора, являются операциями использующими как сам курсор так и + * соответствующую транзакцию. Аналогично, завершение или прерывание + * транзакции является операцией использующей как саму транзакцию, так и все + * привязанные к ней курсоры. Во избежание повреждения внутренних структур + * данных, непредсказуемого поведения, разрушение БД и потери данных следует + * не допускать возможности одновременного использования каких-либо курсора + * или транзакций из разных потоков выполнения. + * + * Читающие транзакции при использовании `MDBX_NOSTICKYTHREADS` перестают + * использовать TLS (Thread Local Storage), а слоты блокировок MVCC-снимков в + * таблице читателей привязываются только к транзакциям. Завершение каких-либо + * потоков не приводит к снятию блокировок MVCC-снимков до явного завершения + * транзакций, либо до завершения соответствующего процесса в целом. + * + * Для пишущих транзакций не выполняется проверка соответствия текущего потока + * выполнения и потока создавшего транзакцию. Однако, фиксация или прерывание + * пишущих транзакций должны выполняться строго в потоке запустившим + * транзакцию, так как эти операции связаны с захватом и освобождением + * примитивов синхронизации (мьютексов, критических секций), для которых + * большинство операционных систем требует освобождение только потоком + * захватившим ресурс. + * + * Этот флаг вступает в силу при открытии среды и не может быть изменен после. 
*/ - MDBX_NOTLS = UINT32_C(0x200000), + MDBX_NOSTICKYTHREADS = UINT32_C(0x200000), +#ifndef _MSC_VER /* avoid madness MSVC */ + /** \deprecated Please use \ref MDBX_NOSTICKYTHREADS instead. */ + MDBX_NOTLS MDBX_DEPRECATED = MDBX_NOSTICKYTHREADS, +#endif /* avoid madness MSVC */ /** Don't do readahead. * @@ -1262,8 +1327,9 @@ enum MDBX_env_flags_t { * This flag may be changed at any time using `mdbx_env_set_flags()`. */ MDBX_NOMEMINIT = UINT32_C(0x1000000), +#ifndef _MSC_VER /* avoid madness MSVC */ /** Aims to coalesce a Garbage Collection items. - * \note Always enabled since v0.12 + * \deprecated Always enabled since v0.12 and deprecated since v0.13. * * With `MDBX_COALESCE` flag MDBX will aims to coalesce items while recycling * a Garbage Collection. Technically, when possible short lists of pages @@ -1273,7 +1339,8 @@ enum MDBX_env_flags_t { * Unallocated space and reducing the database file. * * This flag may be changed at any time using mdbx_env_set_flags(). */ - MDBX_COALESCE = UINT32_C(0x2000000), + MDBX_COALESCE MDBX_DEPRECATED = UINT32_C(0x2000000), +#endif /* avoid madness MSVC */ /** LIFO policy for recycling a Garbage Collection items. * @@ -1778,7 +1845,7 @@ enum MDBX_cursor_op { * return both key and data, and the return code depends on whether a * upper-bound was found. * - * For non DUPSORT-ed collections this work the same to \ref MDBX_SET_RANGE, + * For non DUPSORT-ed collections this work like \ref MDBX_SET_RANGE, * but returns \ref MDBX_SUCCESS if the greater key was found or * \ref MDBX_NOTFOUND otherwise. * @@ -1786,7 +1853,28 @@ enum MDBX_cursor_op { * i.e. for a pairs/tuples of a key and an each data value of duplicates. * Returns \ref MDBX_SUCCESS if the greater pair was returned or * \ref MDBX_NOTFOUND otherwise. */ - MDBX_SET_UPPERBOUND + MDBX_SET_UPPERBOUND, + + /* Doubtless cursor positioning at a specified key. 
*/ + MDBX_TO_KEY_LESSER_THAN, + MDBX_TO_KEY_LESSER_OR_EQUAL, + MDBX_TO_KEY_EQUAL, + MDBX_TO_KEY_GREATER_OR_EQUAL, + MDBX_TO_KEY_GREATER_THAN, + + /* Doubtless cursor positioning at a specified key-value pair + * for dupsort/multi-value hives. */ + MDBX_TO_EXACT_KEY_VALUE_LESSER_THAN, + MDBX_TO_EXACT_KEY_VALUE_LESSER_OR_EQUAL, + MDBX_TO_EXACT_KEY_VALUE_EQUAL, + MDBX_TO_EXACT_KEY_VALUE_GREATER_OR_EQUAL, + MDBX_TO_EXACT_KEY_VALUE_GREATER_THAN, + + MDBX_TO_PAIR_LESSER_THAN, + MDBX_TO_PAIR_LESSER_OR_EQUAL, + MDBX_TO_PAIR_EQUAL, + MDBX_TO_PAIR_GREATER_OR_EQUAL, + MDBX_TO_PAIR_GREATER_THAN }; #ifndef __cplusplus /** \ingroup c_cursors */ @@ -1921,7 +2009,7 @@ enum MDBX_error_t { MDBX_TOO_LARGE = -30417, /** A thread has attempted to use a not owned object, - * e.g. a transaction that started by another thread. */ + * e.g. a transaction that started by another thread */ MDBX_THREAD_MISMATCH = -30416, /** Overlapping read and write transactions for the current thread */ @@ -1936,8 +2024,12 @@ enum MDBX_error_t { /** Alternative/Duplicate LCK-file is exists and should be removed manually */ MDBX_DUPLICATED_CLK = -30413, + /** Some cursors and/or other resources should be closed before subDb or + * corresponding DBI-handle could be (re)used */ + MDBX_DANGLING_DBI = -30412, + /* The last of MDBX-added error codes */ - MDBX_LAST_ADDED_ERRCODE = MDBX_DUPLICATED_CLK, + MDBX_LAST_ADDED_ERRCODE = MDBX_DANGLING_DBI, #if defined(_WIN32) || defined(_WIN64) MDBX_ENODATA = ERROR_HANDLE_EOF, @@ -1950,7 +2042,8 @@ enum MDBX_error_t { MDBX_EPERM = ERROR_INVALID_FUNCTION, MDBX_EINTR = ERROR_CANCELLED, MDBX_ENOFILE = ERROR_FILE_NOT_FOUND, - MDBX_EREMOTE = ERROR_REMOTE_STORAGE_MEDIA_ERROR + MDBX_EREMOTE = ERROR_REMOTE_STORAGE_MEDIA_ERROR, + MDBX_EDEADLK = ERROR_POSSIBLE_DEADLOCK #else /* Windows */ #ifdef ENODATA MDBX_ENODATA = ENODATA, @@ -1966,7 +2059,8 @@ enum MDBX_error_t { MDBX_EPERM = EPERM, MDBX_EINTR = EINTR, MDBX_ENOFILE = ENOENT, - MDBX_EREMOTE = ENOTBLK + MDBX_EREMOTE = ENOTBLK, + 
MDBX_EDEADLK = EDEADLK #endif /* !Windows */ }; #ifndef __cplusplus @@ -2082,11 +2176,12 @@ enum MDBX_option_t { * track readers in the the environment. The default is about 100 for 4K * system page size. Starting a read-only transaction normally ties a lock * table slot to the current thread until the environment closes or the thread - * exits. If \ref MDBX_NOTLS is in use, \ref mdbx_txn_begin() instead ties the - * slot to the \ref MDBX_txn object until it or the \ref MDBX_env object is - * destroyed. This option may only set after \ref mdbx_env_create() and before - * \ref mdbx_env_open(), and has an effect only when the database is opened by - * the first process interacts with the database. + * exits. If \ref MDBX_NOSTICKYTHREADS is in use, \ref mdbx_txn_begin() + * instead ties the slot to the \ref MDBX_txn object until it or the \ref + * MDBX_env object is destroyed. This option may only set after \ref + * mdbx_env_create() and before \ref mdbx_env_open(), and has an effect only + * when the database is opened by the first process interacts with the + * database. * * \see mdbx_env_set_maxreaders() \see mdbx_env_get_maxreaders() */ MDBX_opt_max_readers, @@ -2106,6 +2201,7 @@ enum MDBX_option_t { /** \brief Controls the in-process limit to grow a list of reclaimed/recycled * page's numbers for finding a sequence of contiguous pages for large data * items. + * \see MDBX_opt_gc_time_limit * * \details A long values requires allocation of contiguous database pages. * To find such sequences, it may be necessary to accumulate very large lists, @@ -2266,6 +2362,33 @@ enum MDBX_option_t { * in the \ref MDBX_WRITEMAP mode by clearing ones through file handle before * touching. */ MDBX_opt_prefault_write_enable, + + /** \brief Controls the in-process spending time limit of searching + * consecutive pages inside GC. 
+ * \see MDBX_opt_rp_augment_limit + * + * \details Задаёт ограничение времени в 1/65536 долях секунды, которое может + * быть потрачено в ходе пишущей транзакции на поиск последовательностей + * страниц внутри GC/freelist после достижения ограничения задаваемого опцией + * \ref MDBX_opt_rp_augment_limit. Контроль по времени не выполняется при + * поиске/выделении одиночных страниц и выделении страниц под нужды GC (при + * обновлении GC в ходе фиксации транзакции). + * + * Задаваемый лимит времени исчисляется по "настенным часам" и контролируется + * в рамках транзакции, наследуется для вложенных транзакций и с + * аккумулированием в родительской при их фиксации. Контроль по времени + * производится только при достижении ограничения задаваемого опцией \ref + * MDBX_opt_rp_augment_limit. Это позволяет гибко управлять поведением + * используя обе опции. + * + * По умолчанию ограничение устанавливается в 0, что приводит к + * незамедлительной остановке поиска в GC при достижении \ref + * MDBX_opt_rp_augment_limit во внутреннем состоянии транзакции и + * соответствует поведению до появления опции `MDBX_opt_gc_time_limit`. + * С другой стороны, при минимальном значении (включая 0) + * `MDBX_opt_rp_augment_limit` переработка GC будет ограничиваться + * преимущественно затраченным временем. */ + MDBX_opt_gc_time_limit }; #ifndef __cplusplus /** \ingroup c_settings */ @@ -2322,7 +2445,7 @@ LIBMDBX_API int mdbx_env_get_option(const MDBX_env *env, * * Flags set by mdbx_env_set_flags() are also used: * - \ref MDBX_ENV_DEFAULTS, \ref MDBX_NOSUBDIR, \ref MDBX_RDONLY, - * \ref MDBX_EXCLUSIVE, \ref MDBX_WRITEMAP, \ref MDBX_NOTLS, + * \ref MDBX_EXCLUSIVE, \ref MDBX_WRITEMAP, \ref MDBX_NOSTICKYTHREADS, * \ref MDBX_NORDAHEAD, \ref MDBX_NOMEMINIT, \ref MDBX_COALESCE, * \ref MDBX_LIFORECLAIM. See \ref env_flags section. * @@ -2389,7 +2512,7 @@ enum MDBX_env_delete_mode_t { /** \brief Just delete the environment's files and directory if any. 
* \note On POSIX systems, processes already working with the database will * continue to work without interference until it close the environment. - * \note On Windows, the behavior of `MDB_ENV_JUST_DELETE` is different + * \note On Windows, the behavior of `MDBX_ENV_JUST_DELETE` is different * because the system does not support deleting files that are currently * memory mapped. */ MDBX_ENV_JUST_DELETE = 0, @@ -2571,9 +2694,7 @@ struct MDBX_envinfo { uint64_t mi_latter_reader_txnid; /**< ID of the last reader transaction */ uint64_t mi_self_latter_reader_txnid; /**< ID of the last reader transaction of caller process */ - uint64_t mi_meta0_txnid, mi_meta0_sign; - uint64_t mi_meta1_txnid, mi_meta1_sign; - uint64_t mi_meta2_txnid, mi_meta2_sign; + uint64_t mi_meta_txnid[3], mi_meta_sign[3]; uint32_t mi_maxreaders; /**< Total reader slots in the environment */ uint32_t mi_numreaders; /**< Max reader slots used in the environment */ uint32_t mi_dxb_pagesize; /**< Database pagesize */ @@ -2590,7 +2711,7 @@ struct MDBX_envinfo { struct { struct { uint64_t x, y; - } current, meta0, meta1, meta2; + } current, meta[3]; } mi_bootid; /** Bytes not explicitly synchronized to disk */ @@ -2656,7 +2777,8 @@ typedef struct MDBX_envinfo MDBX_envinfo; * \param [in] txn A transaction handle returned by \ref mdbx_txn_begin() * \param [out] info The address of an \ref MDBX_envinfo structure * where the information will be copied - * \param [in] bytes The size of \ref MDBX_envinfo. + * \param [in] bytes The actual size of \ref MDBX_envinfo, + * this value is used to provide ABI compatibility. * * \returns A non-zero error value on failure and 0 on success. 
*/ LIBMDBX_API int mdbx_env_info_ex(const MDBX_env *env, const MDBX_txn *txn, @@ -2891,6 +3013,86 @@ LIBMDBX_INLINE_API(int, mdbx_env_close, (MDBX_env * env)) { return mdbx_env_close_ex(env, false); } +#if defined(DOXYGEN) || !(defined(_WIN32) || defined(_WIN64)) +/** \brief Восстанавливает экземпляр среды в дочернем процессе после ветвления + * родительского процесса посредством `fork()` и родственных системных вызовов. + * \ingroup c_extra + * + * Без вызова \ref mdbx_env_resurrect_after_fork() использование открытого + * экземпляра среды в дочернем процессе не возможно, включая все выполняющиеся + * на момент ветвления транзакции. + * + * Выполняемые функцией действия можно рассматривать как повторное открытие БД + * в дочернем процессе, с сохранением заданных опций и адресов уже созданных + * экземпляров объектов связанных с API. + * + * \note Функция не доступна в ОС семейства Windows по причине отсутствия + * функционала ветвления процесса в API операционной системы. + * + * Ветвление не оказывает влияния на состояние MDBX-среды в родительском + * процессе. Все транзакции, которые были в родительском процессе на момент + * ветвления, после ветвления в родительском процессе продолжат выполняться без + * помех. Но в дочернем процессе все соответствующие транзакции безальтернативно + * перестают быть валидными, а попытка их использования приведет к возврату + * ошибки или отправке `SIGSEGV`. + * + * Использование экземпляра среды в дочернем процессе не возможно до вызова + * \ref mdbx_env_resurrect_after_fork(), так как в результате ветвления у + * процесса меняется PID, значение которого используется для организации + * совместно работы с БД, в том числе, для отслеживания процессов/потоков + * выполняющих читающие транзакции связанные с соответствующими снимками данных. 
+ * Все активные на момент ветвления транзакции не могут продолжаться в дочернем + * процессе, так как не владеют какими-либо блокировками или каким-либо снимком + * данных и не удерживают его от переработки при сборке мусора. + * + * Функция \ref mdbx_env_resurrect_after_fork() восстанавливает переданный + * экземпляр среды в дочернем процессе после ветвления, а именно: обновляет + * используемые системные идентификаторы, повторно открывает дескрипторы файлов, + * производит захват необходимых блокировок связанных с LCK- и DXB-файлами БД, + * восстанавливает отображения в память страницы БД, таблицы читателей и + * служебных/вспомогательных данных в память. Однако унаследованные от + * родительского процесса транзакции не восстанавливаются, причем пишущие и + * читающие транзакции обрабатываются по-разному: + * + * - Пишущая транзакция, если таковая была на момент ветвления, + * прерывается в дочернем процессе с освобождением связанных с ней ресурсов, + * включая все вложенные транзакции. + * + * - Читающие же транзакции, если таковые были в родительском процессе, + * в дочернем процессе логически прерываются, но без освобождения ресурсов. + * Поэтому необходимо обеспечить вызов \ref mdbx_txn_abort() для каждой + * такой читающей транзакции в дочернем процессе, либо смириться с утечкой + * ресурсов до завершения дочернего процесса. + * + * Причина не-освобождения ресурсов читающих транзакций в том, что исторически + * MDBX не ведет какой-либо общий список экземпляров читающих, так как это не + * требуется для штатных режимов работы, но требует использования атомарных + * операций или дополнительных объектов синхронизации при создании/разрушении + * экземпляров \ref MDBX_txn. + * + * Вызов \ref mdbx_env_resurrect_after_fork() без ветвления, не в дочернем + * процессе, либо повторные вызовы не приводят к каким-либо действиям или + * изменениям. + * + * \param [in,out] env Экземпляр среды созданный функцией + * \ref mdbx_env_create().
+ * + * \returns Ненулевое значение ошибки при сбое и 0 при успешном выполнении, + * некоторые возможные ошибки таковы: + * + * \retval MDBX_BUSY В родительском процессе БД была открыта + * в режиме \ref MDBX_EXCLUSIVE. + * + * \retval MDBX_EBADSIGN При повреждении сигнатуры экземпляра объекта, а также + * в случае одновременного вызова \ref + * mdbx_env_resurrect_after_fork() из разных потоков. + * + * \retval MDBX_PANIC Произошла критическая ошибка при восстановлении + * экземпляра среды, либо такая ошибка уже была + * до вызова функции. */ +LIBMDBX_API int mdbx_env_resurrect_after_fork(MDBX_env *env); +#endif /* Windows */ + /** \brief Warming up options * \ingroup c_settings * \anchor warmup_flags @@ -3239,7 +3441,7 @@ LIBMDBX_API int mdbx_env_get_fd(const MDBX_env *env, mdbx_filehandle_t *fd); * 2) Temporary close memory mapped is required to change * geometry, but there read transaction(s) is running * and no corresponding thread(s) could be suspended - * since the \ref MDBX_NOTLS mode is used. + * since the \ref MDBX_NOSTICKYTHREADS mode is used. * \retval MDBX_EACCESS The environment opened in read-only. * \retval MDBX_MAP_FULL Specified size smaller than the space already * consumed by the environment. @@ -3268,7 +3470,7 @@ MDBX_DEPRECATED LIBMDBX_INLINE_API(int, mdbx_env_set_mapsize, * value. * * \returns A \ref MDBX_RESULT_TRUE or \ref MDBX_RESULT_FALSE value, - * otherwise the error code: + * otherwise the error code. * \retval MDBX_RESULT_TRUE Readahead is reasonable. * \retval MDBX_RESULT_FALSE Readahead is NOT reasonable, * i.e. \ref MDBX_NORDAHEAD is useful to @@ -3310,6 +3512,12 @@ mdbx_limits_dbsize_max(intptr_t pagesize); MDBX_NOTHROW_CONST_FUNCTION LIBMDBX_API intptr_t mdbx_limits_keysize_max(intptr_t pagesize, MDBX_db_flags_t flags); +/** \brief Returns minimal key size in bytes for given database flags. 
+ * \ingroup c_statinfo + * \see db_flags */ +MDBX_NOTHROW_CONST_FUNCTION LIBMDBX_API intptr_t +mdbx_limits_keysize_min(MDBX_db_flags_t flags); + /** \brief Returns maximal data size in bytes for given page size * and database flags, or -1 if pagesize is invalid. * \ingroup c_statinfo @@ -3317,6 +3525,12 @@ mdbx_limits_keysize_max(intptr_t pagesize, MDBX_db_flags_t flags); MDBX_NOTHROW_CONST_FUNCTION LIBMDBX_API intptr_t mdbx_limits_valsize_max(intptr_t pagesize, MDBX_db_flags_t flags); +/** \brief Returns minimal data size in bytes for given database flags. + * \ingroup c_statinfo + * \see db_flags */ +MDBX_NOTHROW_CONST_FUNCTION LIBMDBX_API intptr_t +mdbx_limits_valsize_min(MDBX_db_flags_t flags); + /** \brief Returns maximal size of key-value pair to fit in a single page with * the given size and database flags, or -1 if pagesize is invalid. * \ingroup c_statinfo @@ -3346,11 +3560,11 @@ mdbx_limits_txnsize_max(intptr_t pagesize); * track readers in the the environment. The default is about 100 for 4K system * page size. Starting a read-only transaction normally ties a lock table slot * to the current thread until the environment closes or the thread exits. If - * \ref MDBX_NOTLS is in use, \ref mdbx_txn_begin() instead ties the slot to the - * \ref MDBX_txn object until it or the \ref MDBX_env object is destroyed. - * This function may only be called after \ref mdbx_env_create() and before - * \ref mdbx_env_open(), and has an effect only when the database is opened by - * the first process interacts with the database. + * \ref MDBX_NOSTICKYTHREADS is in use, \ref mdbx_txn_begin() instead ties the + * slot to the \ref MDBX_txn object until it or the \ref MDBX_env object is + * destroyed. This function may only be called after \ref mdbx_env_create() and + * before \ref mdbx_env_open(), and has an effect only when the database is + * opened by the first process interacts with the database. 
* \see mdbx_env_get_maxreaders() * * \param [in] env An environment handle returned @@ -3544,8 +3758,8 @@ mdbx_env_get_userctx(const MDBX_env *env); * \see mdbx_txn_begin() * * \note A transaction and its cursors must only be used by a single thread, - * and a thread may only have a single transaction at a time. If \ref MDBX_NOTLS - * is in use, this does not apply to read-only transactions. + * and a thread may only have a single transaction at a time unless + * the \ref MDBX_NOSTICKYTHREADS is used. * * \note Cursors may not span transactions. * @@ -3606,8 +3820,8 @@ LIBMDBX_API int mdbx_txn_begin_ex(MDBX_env *env, MDBX_txn *parent, * \see mdbx_txn_begin_ex() * * \note A transaction and its cursors must only be used by a single thread, - * and a thread may only have a single transaction at a time. If \ref MDBX_NOTLS - * is in use, this does not apply to read-only transactions. + * and a thread may only have a single transaction at a time unless + * the \ref MDBX_NOSTICKYTHREADS is used. * * \note Cursors may not span transactions. * @@ -3766,7 +3980,7 @@ mdbx_txn_env(const MDBX_txn *txn); * \param [in] txn A transaction handle returned by \ref mdbx_txn_begin(). * * \returns A transaction flags, valid if input is an valid transaction, - * otherwise -1. */ + * otherwise \ref MDBX_TXN_INVALID. */ MDBX_NOTHROW_PURE_FUNCTION LIBMDBX_API int mdbx_txn_flags(const MDBX_txn *txn); /** \brief Return the transaction's ID. @@ -3982,10 +4196,11 @@ LIBMDBX_API int mdbx_txn_break(MDBX_txn *txn); * Abort the read-only transaction like \ref mdbx_txn_abort(), but keep the * transaction handle. Therefore \ref mdbx_txn_renew() may reuse the handle. * This saves allocation overhead if the process will start a new read-only - * transaction soon, and also locking overhead if \ref MDBX_NOTLS is in use. The - * reader table lock is released, but the table slot stays tied to its thread - * or \ref MDBX_txn. 
Use \ref mdbx_txn_abort() to discard a reset handle, and to - * free its lock table slot if \ref MDBX_NOTLS is in use. + * transaction soon, and also locking overhead if \ref MDBX_NOSTICKYTHREADS is + * in use. The reader table lock is released, but the table slot stays tied to + * its thread or \ref MDBX_txn. Use \ref mdbx_txn_abort() to discard a reset + * handle, and to free its lock table slot if \ref MDBX_NOSTICKYTHREADS is in + * use. * * Cursors opened within the transaction must not be used again after this * call, except with \ref mdbx_cursor_renew() and \ref mdbx_cursor_close(). @@ -4196,6 +4411,7 @@ typedef int(MDBX_cmp_func)(const MDBX_val *a, * by current thread. */ LIBMDBX_API int mdbx_dbi_open(MDBX_txn *txn, const char *name, MDBX_db_flags_t flags, MDBX_dbi *dbi); +/** \copydoc mdbx_dbi_open() */ LIBMDBX_API int mdbx_dbi_open2(MDBX_txn *txn, const MDBX_val *name, MDBX_db_flags_t flags, MDBX_dbi *dbi); @@ -4217,10 +4433,30 @@ LIBMDBX_API int mdbx_dbi_open2(MDBX_txn *txn, const MDBX_val *name, MDBX_DEPRECATED LIBMDBX_API int mdbx_dbi_open_ex(MDBX_txn *txn, const char *name, MDBX_db_flags_t flags, MDBX_dbi *dbi, MDBX_cmp_func *keycmp, MDBX_cmp_func *datacmp); +/** \copydoc mdbx_dbi_open_ex() */ MDBX_DEPRECATED LIBMDBX_API int mdbx_dbi_open_ex2(MDBX_txn *txn, const MDBX_val *name, MDBX_db_flags_t flags, MDBX_dbi *dbi, MDBX_cmp_func *keycmp, MDBX_cmp_func *datacmp); +/** \brief Переименовает таблицу по DBI-хендлу. + * \ingroup c_dbi + * + * Переименовывает пользовательскую именованную subDB связанную с передаваемым + * DBI-дескриптором. + * + * \param [in,out] txn Пишущая транзакция запущенная посредством + * \ref mdbx_txn_begin(). + * \param [in] dbi Дескриптор таблицы (именованной пользовательской subDB) + * открытый посредством \ref mdbx_dbi_open(). + * + * \param [in] name Новое имя для переименования. + * + * \returns Ненулевое значение ошибки при сбое и 0 при успешном выполнении. 
*/ +LIBMDBX_API int mdbx_dbi_rename(MDBX_txn *txn, MDBX_dbi dbi, const char *name); +/** \copydoc mdbx_dbi_rename() */ +LIBMDBX_API int mdbx_dbi_rename2(MDBX_txn *txn, MDBX_dbi dbi, + const MDBX_val *name); + /** \defgroup value2key Value-to-Key functions * \brief Value-to-Key functions to * \ref avoid_custom_comparators "avoid using custom comparators" @@ -4734,6 +4970,28 @@ mdbx_cursor_get_userctx(const MDBX_cursor *cursor); LIBMDBX_API int mdbx_cursor_bind(const MDBX_txn *txn, MDBX_cursor *cursor, MDBX_dbi dbi); +/** \brief Unbind cursor from a transaction. + * \ingroup c_cursors + * + * An unbound cursor is disassociated from any transaction but still holds + * the original DBI-handle internally. Thus it could be renewed with any running + * transaction or closed. + * + * \see mdbx_cursor_renew() + * \see mdbx_cursor_bind() + * \see mdbx_cursor_close() + * + * \note In contrast to LMDB, MDBX requires that any opened cursor can be + * reused and must be freed explicitly, regardless of whether it was opened in a + * read-only or write transaction. The REASON for this is to eliminate ambiguity, + * which helps to avoid errors such as: use-after-free, double-free, i.e. + * memory corruption and segfaults. + * + * \param [in] cursor A cursor handle returned by \ref mdbx_cursor_open(). + * + * \returns A non-zero error value on failure and 0 on success. */ +LIBMDBX_API int mdbx_cursor_unbind(MDBX_cursor *cursor); + /** \brief Create a cursor handle for the specified transaction and DBI handle. * \ingroup c_cursors * @@ -4783,6 +5041,27 @@ LIBMDBX_API int mdbx_cursor_open(const MDBX_txn *txn, MDBX_dbi dbi, * or \ref mdbx_cursor_create(). */ LIBMDBX_API void mdbx_cursor_close(MDBX_cursor *cursor); +/** \brief Unbinds or closes all cursors of a given transaction. + * \ingroup c_cursors + * + * Unbinds or closes all cursors associated (opened or renewed) with + * a given transaction in bulk with minimal overhead.
+ * + * \see mdbx_cursor_unbind() + * \see mdbx_cursor_close() + * + * \param [in] txn A transaction handle returned by \ref mdbx_txn_begin(). + * \param [in] unbind If non-zero, unbinds cursors and leaves them reusable. + * Otherwise close and dispose cursors. + * + * \returns A negative error value on failure or the number of closed cursors + * on success, some possible errors are: + * \retval MDBX_THREAD_MISMATCH Given transaction is not owned + * by current thread. + * \retval MDBX_BAD_TXN Given transaction is invalid or has + * a child/nested transaction. */ +LIBMDBX_API int mdbx_txn_release_all_cursors(const MDBX_txn *txn, bool unbind); + /** \brief Renew a cursor handle for use within the given transaction. * \ingroup c_cursors * @@ -4834,6 +5113,31 @@ LIBMDBX_API MDBX_dbi mdbx_cursor_dbi(const MDBX_cursor *cursor); * \returns A non-zero error value on failure and 0 on success. */ LIBMDBX_API int mdbx_cursor_copy(const MDBX_cursor *src, MDBX_cursor *dest); +/** \brief Сравнивает позицию курсоров. + * \ingroup c_cursors + * + * Функция предназначена для сравнения позиций двух + * инициализированных/установленных курсоров, связанных с одной транзакцией и + * одной таблицей (DBI-дескриптором). + * Если же курсоры связаны с разными транзакциями, либо с разными таблицами, + * либо один из них не инициализирован, то результат сравнения не определен + * (поведение может быть изменено в последующих версиях). + * + * \param [in] left Левый курсор для сравнения позиций. + * \param [in] right Правый курсор для сравнения позиций. + * \param [in] ignore_multival Булевой флаг, влияющий на результат только при + * сравнении курсоров для таблиц с мульти-значениями, т.е. с флагом + * \ref MDBX_DUPSORT. В случае `true`, позиции курсоров сравниваются + * только по ключам, без учета позиционирования среди мульти-значений. + * Иначе, в случае `false`, при совпадении позиций по ключам, + * сравниваются также позиции по мульти-значениям.
+ * + * \retval Значение со знаком в семантике оператора `<=>` (меньше нуля, ноль, + * либо больше нуля) как результат сравнения позиций курсоров. */ +LIBMDBX_API int mdbx_cursor_compare(const MDBX_cursor *left, + const MDBX_cursor *right, + bool ignore_multival); + /** \brief Retrieve by cursor. * \ingroup c_crud * @@ -4868,6 +5172,203 @@ LIBMDBX_API int mdbx_cursor_copy(const MDBX_cursor *src, MDBX_cursor *dest); LIBMDBX_API int mdbx_cursor_get(MDBX_cursor *cursor, MDBX_val *key, MDBX_val *data, MDBX_cursor_op op); +/** \brief Тип предикативных функций обратного вызова используемых + * \ref mdbx_cursor_scan() и \ref mdbx_cursor_scan_from() для пробирования + * пар ключ-значения. + * \ingroup c_crud + * + * \param [in,out] context Указатель на контекст с необходимой для оценки + * информацией, который полностью подготавливается + * и контролируется вами. + * \param [in] key Ключ для оценки пользовательской функцией. + * \param [in] value Значение для оценки пользовательской функцией. + * \param [in,out] arg Дополнительный аргумент предикативной функции, + * который полностью подготавливается + * и контролируется вами. + * + * \returns Результат проверки соответствия переданной пары ключ-значения + * искомой цели. Иначе код ошибки, который прерывает сканирование и возвращается + * без изменения в качестве результата из функций \ref mdbx_cursor_scan() + * или \ref mdbx_cursor_scan_from(). + * + * \retval MDBX_RESULT_TRUE если переданная пара ключ-значение соответствует + * искомой и следует завершить сканирование. + * \retval MDBX_RESULT_FALSE если переданная пара ключ-значение НЕ соответствует + * искомой и следует продолжать сканирование. + * \retval ИНАЧЕ любое другое значение, отличное от \ref MDBX_RESULT_TRUE + * и \ref MDBX_RESULT_FALSE, считается индикатором ошибки + * и возвращается без изменений в качестве результата сканирования. 
+ * + * \see mdbx_cursor_scan() + * \see mdbx_cursor_scan_from() */ +typedef int(MDBX_predicate_func)(void *context, MDBX_val *key, MDBX_val *value, + void *arg) MDBX_CXX17_NOEXCEPT; + +/** \brief Сканирует таблицу с использованием передаваемого предиката, + * с уменьшением сопутствующих накладных расходов. + * \ingroup c_crud + * + * Реализует функционал сходный с шаблоном `std::find_if<>()` с использованием + * курсора и пользовательской предикативной функции, экономя при этом + * на сопутствующих накладных расходах, в том числе, не выполняя часть проверок + * внутри цикла итерации записей и потенциально уменьшая количество + * DSO-трансграничных вызовов. + * + * Функция принимает курсор, который должен быть привязан к некоторой транзакции + * и DBI-дескриптору таблицы (именованной пользовательской subDB), выполняет + * первоначальное позиционирование курсора определяемое аргументом `start_op`. + * Далее, производится оценка каждой пары ключ-значения посредством + * предоставляемой вами предикативной функции `predicate` и затем, при + * необходимости, переход к следующему элементу посредством операции `turn_op`, + * до наступления одного из четырех событий: + * - достигается конец данных; + * - возникнет ошибка при позиционировании курсора; + * - оценочная функция вернет \ref MDBX_RESULT_TRUE, сигнализируя + * о необходимости остановить дальнейшее сканирование; + * - оценочная функция возвратит значение отличное от \ref MDBX_RESULT_FALSE + * и \ref MDBX_RESULT_TRUE сигнализируя об ошибке. + * + * \param [in,out] cursor Курсор для выполнения операции сканирования, + * связанный с активной транзакцией и DBI-дескриптором + * таблицы. Например, курсор созданный + * посредством \ref mdbx_cursor_open(). + * \param [in] predicate Предикативная функция для оценки итерируемых + * пар ключ-значения, + * более подробно смотрите \ref MDBX_predicate_func. 
+ * \param [in,out] context Указатель на контекст с необходимой для оценки + * информацией, который полностью подготавливается + * и контролируется вами. + * \param [in] start_op Стартовая операция позиционирования курсора, + * более подробно смотрите \ref MDBX_cursor_op. + * Для сканирования без изменения исходной позиции + * курсора используйте \ref MDBX_GET_CURRENT. + * Допустимые значения \ref MDBX_FIRST, + * \ref MDBX_FIRST_DUP, \ref MDBX_LAST, + * \ref MDBX_LAST_DUP, \ref MDBX_GET_CURRENT, + * а также \ref MDBX_GET_MULTIPLE. + * \param [in] turn_op Операция позиционирования курсора для перехода + * к следующему элементу. Допустимые значения + * \ref MDBX_NEXT, \ref MDBX_NEXT_DUP, + * \ref MDBX_NEXT_NODUP, \ref MDBX_PREV, + * \ref MDBX_PREV_DUP, \ref MDBX_PREV_NODUP, а также + * \ref MDBX_NEXT_MULTIPLE и \ref MDBX_PREV_MULTIPLE. + * \param [in,out] arg Дополнительный аргумент предикативной функции, + * который полностью подготавливается + * и контролируется вами. + * + * \note При использовании \ref MDBX_GET_MULTIPLE, \ref MDBX_NEXT_MULTIPLE + * или \ref MDBX_PREV_MULTIPLE внимательно учитывайте пакетную специфику + * передачи значений через параметры предикативной функции. + * + * \see MDBX_predicate_func + * \see mdbx_cursor_scan_from + * + * \returns Результат операции сканирования, либо код ошибки. + * + * \retval MDBX_RESULT_TRUE если найдена пара ключ-значение, для которой + * предикативная функция вернула \ref MDBX_RESULT_TRUE. + * \retval MDBX_RESULT_FALSE если подходящая пара ключ-значения НЕ найдена, + * в процессе поиска достигнут конец данных, либо нет данных для поиска. + * \retval ИНАЧЕ любое другое значение, отличное от \ref MDBX_RESULT_TRUE + * и \ref MDBX_RESULT_FALSE, является кодом ошибки при позиционировании + * курсора, либо определяемым пользователем кодом остановки поиска + * или ошибочной ситуации. 
*/ +LIBMDBX_API int mdbx_cursor_scan(MDBX_cursor *cursor, + MDBX_predicate_func *predicate, void *context, + MDBX_cursor_op start_op, + MDBX_cursor_op turn_op, void *arg); + +/** Сканирует таблицу с использованием передаваемого предиката, + * начиная с передаваемой пары ключ-значение, + * с уменьшением сопутствующих накладных расходов. + * \ingroup c_crud + * + * Функция принимает курсор, который должен быть привязан к некоторой транзакции + * и DBI-дескриптору таблицы (именованной пользовательской subDB), выполняет + * первоначальное позиционирование курсора определяемое аргументом `from_op`. + * а также аргументами `from_key` и `from_value`. + * Далее, производится оценка каждой пары ключ-значения посредством + * предоставляемой вами предикативной функции `predicate` и затем, при + * необходимости, переход к следующему элементу посредством операции `turn_op`, + * до наступления одного из четырех событий: + * - достигается конец данных; + * - возникнет ошибка при позиционировании курсора; + * - оценочная функция вернет \ref MDBX_RESULT_TRUE, сигнализируя + * о необходимости остановить дальнейшее сканирование; + * - оценочная функция возвратит значение отличное от \ref MDBX_RESULT_FALSE + * и \ref MDBX_RESULT_TRUE сигнализируя об ошибке. + * + * \param [in,out] cursor Курсор для выполнения операции сканирования, + * связанный с активной транзакцией и DBI-дескриптором + * таблицы. Например, курсор созданный + * посредством \ref mdbx_cursor_open(). + * \param [in] predicate Предикативная функция для оценки итерируемых + * пар ключ-значения, + * более подробно смотрите \ref MDBX_predicate_func. + * \param [in,out] context Указатель на контекст с необходимой для оценки + * информацией, который полностью подготавливается + * и контролируется вами. + * \param [in] from_op Операция позиционирования курсора к исходной + * позиции, более подробно смотрите + * \ref MDBX_cursor_op. 
+ * Допустимые значения \ref MDBX_GET_BOTH, + * \ref MDBX_GET_BOTH_RANGE, \ref MDBX_SET_KEY, + * \ref MDBX_SET_LOWERBOUND, \ref MDBX_SET_UPPERBOUND, + * \ref MDBX_TO_KEY_LESSER_THAN, + * \ref MDBX_TO_KEY_LESSER_OR_EQUAL, + * \ref MDBX_TO_KEY_EQUAL, + * \ref MDBX_TO_KEY_GREATER_OR_EQUAL, + * \ref MDBX_TO_KEY_GREATER_THAN, + * \ref MDBX_TO_EXACT_KEY_VALUE_LESSER_THAN, + * \ref MDBX_TO_EXACT_KEY_VALUE_LESSER_OR_EQUAL, + * \ref MDBX_TO_EXACT_KEY_VALUE_EQUAL, + * \ref MDBX_TO_EXACT_KEY_VALUE_GREATER_OR_EQUAL, + * \ref MDBX_TO_EXACT_KEY_VALUE_GREATER_THAN, + * \ref MDBX_TO_PAIR_LESSER_THAN, + * \ref MDBX_TO_PAIR_LESSER_OR_EQUAL, + * \ref MDBX_TO_PAIR_EQUAL, + * \ref MDBX_TO_PAIR_GREATER_OR_EQUAL, + * \ref MDBX_TO_PAIR_GREATER_THAN, + * а также \ref MDBX_GET_MULTIPLE. + * \param [in,out] from_key Указатель на ключ используемый как для исходного + * позиционирования, так и для последующих итераций + * перехода. + * \param [in,out] from_value Указатель на значение используемое как для + * исходного позиционирования, так и для последующих + * итераций перехода. + * \param [in] turn_op Операция позиционирования курсора для перехода + * к следующему элементу. Допустимые значения + * \ref MDBX_NEXT, \ref MDBX_NEXT_DUP, + * \ref MDBX_NEXT_NODUP, \ref MDBX_PREV, + * \ref MDBX_PREV_DUP, \ref MDBX_PREV_NODUP, а также + * \ref MDBX_NEXT_MULTIPLE и \ref MDBX_PREV_MULTIPLE. + * \param [in,out] arg Дополнительный аргумент предикативной функции, + * который полностью подготавливается + * и контролируется вами. + * + * \note При использовании \ref MDBX_GET_MULTIPLE, \ref MDBX_NEXT_MULTIPLE + * или \ref MDBX_PREV_MULTIPLE внимательно учитывайте пакетную специфику + * передачи значений через параметры предикативной функции. + * + * \see MDBX_predicate_func + * \see mdbx_cursor_scan + * + * \returns Результат операции сканирования, либо код ошибки. + * + * \retval MDBX_RESULT_TRUE если найдена пара ключ-значение, для которой + * предикативная функция вернула \ref MDBX_RESULT_TRUE. 
+ * \retval MDBX_RESULT_FALSE если подходящая пара ключ-значения НЕ найдена, + * в процессе поиска достигнут конец данных, либо нет данных для поиска. + * \retval ИНАЧЕ любое другое значение, отличное от \ref MDBX_RESULT_TRUE + * и \ref MDBX_RESULT_FALSE, является кодом ошибки при позиционировании + * курсора, либо определяемым пользователем кодом остановки поиска + * или ошибочной ситуации. */ +LIBMDBX_API int mdbx_cursor_scan_from(MDBX_cursor *cursor, + MDBX_predicate_func *predicate, + void *context, MDBX_cursor_op from_op, + MDBX_val *from_key, MDBX_val *from_value, + MDBX_cursor_op turn_op, void *arg); + /** \brief Retrieve multiple non-dupsort key/value pairs by cursor. * \ingroup c_crud * @@ -5054,7 +5555,7 @@ LIBMDBX_API int mdbx_cursor_count(const MDBX_cursor *cursor, size_t *pcount); * \param [in] cursor A cursor handle returned by \ref mdbx_cursor_open(). * * \returns A \ref MDBX_RESULT_TRUE or \ref MDBX_RESULT_FALSE value, - * otherwise the error code: + * otherwise the error code. * \retval MDBX_RESULT_TRUE No more data available or cursor not * positioned * \retval MDBX_RESULT_FALSE A data is available @@ -5069,13 +5570,27 @@ mdbx_cursor_eof(const MDBX_cursor *cursor); * \param [in] cursor A cursor handle returned by \ref mdbx_cursor_open(). * * \returns A MDBX_RESULT_TRUE or MDBX_RESULT_FALSE value, - * otherwise the error code: + * otherwise the error code. * \retval MDBX_RESULT_TRUE Cursor positioned to the first key-value pair * \retval MDBX_RESULT_FALSE Cursor NOT positioned to the first key-value * pair \retval Otherwise the error code */ MDBX_NOTHROW_PURE_FUNCTION LIBMDBX_API int mdbx_cursor_on_first(const MDBX_cursor *cursor); +/** \brief Определяет стоит ли курсор на первом или единственном + * мульти-значении соответствующем ключу. + * \ingroup c_cursors + * \param [in] cursor Курсор созданный посредством \ref mdbx_cursor_open(). + * \returns Значение \ref MDBX_RESULT_TRUE, либо \ref MDBX_RESULT_FALSE, + * иначе код ошибки. 
+ * \retval MDBX_RESULT_TRUE курсор установлен на первом или единственном + * мульти-значении соответствующем ключу. + * \retval MDBX_RESULT_FALSE курсор НЕ установлен на первом или единственном + * мульти-значении соответствующем ключу. + * \retval ИНАЧЕ код ошибки. */ +MDBX_NOTHROW_PURE_FUNCTION LIBMDBX_API int +mdbx_cursor_on_first_dup(const MDBX_cursor *cursor); + /** \brief Determines whether the cursor is pointed to the last key-value pair * or not. * \ingroup c_cursors @@ -5083,13 +5598,27 @@ mdbx_cursor_on_first(const MDBX_cursor *cursor); * \param [in] cursor A cursor handle returned by \ref mdbx_cursor_open(). * * \returns A \ref MDBX_RESULT_TRUE or \ref MDBX_RESULT_FALSE value, - * otherwise the error code: + * otherwise the error code. * \retval MDBX_RESULT_TRUE Cursor positioned to the last key-value pair * \retval MDBX_RESULT_FALSE Cursor NOT positioned to the last key-value pair * \retval Otherwise the error code */ MDBX_NOTHROW_PURE_FUNCTION LIBMDBX_API int mdbx_cursor_on_last(const MDBX_cursor *cursor); +/** \brief Определяет стоит ли курсор на последнем или единственном + * мульти-значении соответствующем ключу. + * \ingroup c_cursors + * \param [in] cursor Курсор созданный посредством \ref mdbx_cursor_open(). + * \returns Значение \ref MDBX_RESULT_TRUE, либо \ref MDBX_RESULT_FALSE, + * иначе код ошибки. + * \retval MDBX_RESULT_TRUE курсор установлен на последнем или единственном + * мульти-значении соответствующем ключу. + * \retval MDBX_RESULT_FALSE курсор НЕ установлен на последнем или единственном + * мульти-значении соответствующем ключу. + * \retval ИНАЧЕ код ошибки. 
*/ +MDBX_NOTHROW_PURE_FUNCTION LIBMDBX_API int +mdbx_cursor_on_last_dup(const MDBX_cursor *cursor); + /** \addtogroup c_rqest * \details \note The estimation result varies greatly depending on the filling * of specific pages and the overall balance of the b-tree: @@ -5226,7 +5755,7 @@ LIBMDBX_API int mdbx_estimate_range(const MDBX_txn *txn, MDBX_dbi dbi, * \param [in] ptr The address of data to check. * * \returns A MDBX_RESULT_TRUE or MDBX_RESULT_FALSE value, - * otherwise the error code: + * otherwise the error code. * \retval MDBX_RESULT_TRUE Given address is on the dirty page. * \retval MDBX_RESULT_FALSE Given address is NOT on the dirty page. * \retval Otherwise the error code. */ @@ -5521,48 +6050,21 @@ LIBMDBX_API int mdbx_env_set_hsr(MDBX_env *env, MDBX_hsr_func *hsr_callback); MDBX_NOTHROW_PURE_FUNCTION LIBMDBX_API MDBX_hsr_func * mdbx_env_get_hsr(const MDBX_env *env); -/** \defgroup btree_traversal B-tree Traversal - * This is internal API for mdbx_chk tool. You should avoid to use it, except - * some extremal special cases. +/** \defgroup chk Checking and Recovery + * Basically this is internal API for `mdbx_chk` tool, etc. + * You should avoid to use it, except some extremal special cases. * \ingroup c_extra * @{ */ -/** \brief Page types for traverse the b-tree. - * \see mdbx_env_pgwalk() \see MDBX_pgvisitor_func */ -enum MDBX_page_type_t { - MDBX_page_broken, - MDBX_page_meta, - MDBX_page_large, - MDBX_page_branch, - MDBX_page_leaf, - MDBX_page_dupfixed_leaf, - MDBX_subpage_leaf, - MDBX_subpage_dupfixed_leaf, - MDBX_subpage_broken, -}; -#ifndef __cplusplus -typedef enum MDBX_page_type_t MDBX_page_type_t; -#endif - -/** \brief Pseudo-name for MainDB */ -#define MDBX_PGWALK_MAIN ((void *)((ptrdiff_t)0)) -/** \brief Pseudo-name for GarbageCollectorDB */ -#define MDBX_PGWALK_GC ((void *)((ptrdiff_t)-1)) -/** \brief Pseudo-name for MetaPages */ -#define MDBX_PGWALK_META ((void *)((ptrdiff_t)-2)) - -/** \brief Callback function for traverse the b-tree. 
\see mdbx_env_pgwalk() */ -typedef int -MDBX_pgvisitor_func(const uint64_t pgno, const unsigned number, void *const ctx, - const int deep, const MDBX_val *dbi_name, - const size_t page_size, const MDBX_page_type_t type, - const MDBX_error_t err, const size_t nentries, - const size_t payload_bytes, const size_t header_bytes, - const size_t unused_bytes) MDBX_CXX17_NOEXCEPT; +/** \brief Acquires write-transaction lock. + * Provided for custom and/or complex locking scenarios. + * \returns A non-zero error value on failure and 0 on success. */ +LIBMDBX_API int mdbx_txn_lock(MDBX_env *env, bool dont_wait); -/** \brief B-tree traversal function. */ -LIBMDBX_API int mdbx_env_pgwalk(MDBX_txn *txn, MDBX_pgvisitor_func *visitor, - void *ctx, bool dont_check_keys_ordering); +/** \brief Releases write-transaction lock. + * Provided for custom and/or complex locking scenarios. + * \returns A non-zero error value on failure and 0 on success. */ +LIBMDBX_API int mdbx_txn_unlock(MDBX_env *env); /** \brief Open an environment instance using specific meta-page * for checking and recovery. @@ -5594,7 +6096,314 @@ LIBMDBX_API int mdbx_env_open_for_recoveryW(MDBX_env *env, * leg(s). */ LIBMDBX_API int mdbx_env_turn_for_recovery(MDBX_env *env, unsigned target_meta); -/** end of btree_traversal @} */ +/** \brief Получает базовую информацию о БД не открывая её. + * \ingroup c_opening + * + * Назначение функции в получении базовой информации без открытия БД и + * отображения данных в память (что может быть достаточно затратным действием + * для ядра ОС). Полученная таким образом информация может быть полезной для + * подстройки опций работы с БД перед её открытием, а также в сценариях файловых + * менеджерах и прочих вспомогательных утилитах. + * + * \todo Добавить в API возможность установки обратного вызова для ревизии опций + * работы с БД в процессе её открытия (при удержании блокировок). + * + * \param [in] pathname Путь к директории или файлу БД. 
+ * \param [out] info Указатель на структуру \ref MDBX_envinfo + * для получения информации. + * \param [in] bytes Актуальный размер структуры \ref MDBX_envinfo, это + * значение используется для обеспечения совместимости + * ABI. + * + * \note Заполняются только некоторые поля структуры \ref MDBX_envinfo, значения + * которых возможно получить без отображения файлов БД в память и без захвата + * блокировок: размер страницы БД, геометрия БД, размер распределенного места + * (номер последней распределенной страницы), номер последней транзакции и + * boot-id. + * + * \warning Полученная информация является снимком на время выполнения функции и + * может быть в любой момент изменена работающим с БД процессом. В том числе, + * нет препятствий к тому, чтобы другой процесс удалил БД и создал её заново с + * другим размером страницы и/или изменением любых других параметров. + * + * \returns Ненулевое значение ошибки при сбое и 0 при успешном выполнении. */ +LIBMDBX_API int mdbx_preopen_snapinfo(const char *pathname, MDBX_envinfo *info, + size_t bytes); +#if defined(_WIN32) || defined(_WIN64) || defined(DOXYGEN) +/** \copydoc mdbx_preopen_snapinfo() + * \note Available only on Windows. + * \see mdbx_preopen_snapinfo() */ +LIBMDBX_API int mdbx_preopen_snapinfoW(const wchar_t *pathname, + MDBX_envinfo *info, size_t bytes); +#endif /* Windows */ + +/** \brief Флаги/опции для проверки целостности базы данных. + * \note Данный API еще не зафиксирован, в последующих версиях могут быть + * незначительные доработки и изменения. + * \see mdbx_env_chk() */ +enum MDBX_chk_flags_t { + /** Режим проверки по-умолчанию, в том числе в режиме только-чтения. */ + MDBX_CHK_DEFAULTS = 0, + + /** Проверка в режиме чтения-записи, с захватом блокировки и приостановки + * пишущих транзакций. */ + MDBX_CHK_READWRITE = 1, + + /** Пропустить обход дерева страниц. */ + MDBX_CHK_SKIP_BTREE_TRAVERSAL = 2, + + /** Пропустить просмотр записей ключ-значение. 
*/ + MDBX_CHK_SKIP_KV_TRAVERSAL = 4, + + /** Игнорировать порядок ключей и записей. + * \note Требуется при проверке унаследованных БД созданных с использованием + * нестандартных (пользовательских) функций сравнения ключей или значений. */ + MDBX_CHK_IGNORE_ORDER = 8 +}; +#ifndef __cplusplus +/** \ingroup c_opening */ +typedef enum MDBX_chk_flags_t MDBX_chk_flags_t; +#else +DEFINE_ENUM_FLAG_OPERATORS(MDBX_chk_flags_t) +#endif + +/** \brief Уровни логирование/детализации информации, + * поставляемой через обратные вызовы при проверке целостности базы данных. + * \see mdbx_env_chk() */ +enum MDBX_chk_severity { + MDBX_chk_severity_prio_shift = 4, + MDBX_chk_severity_kind_mask = 0xF, + MDBX_chk_fatal = 0x00u, + MDBX_chk_error = 0x11u, + MDBX_chk_warning = 0x22u, + MDBX_chk_notice = 0x33u, + MDBX_chk_result = 0x44u, + MDBX_chk_resolution = 0x55u, + MDBX_chk_processing = 0x56u, + MDBX_chk_info = 0x67u, + MDBX_chk_verbose = 0x78u, + MDBX_chk_details = 0x89u, + MDBX_chk_extra = 0x9Au +}; + +/** \brief Стадии проверки, + * сообщаемые через обратные вызовы при проверке целостности базы данных. + * \see mdbx_env_chk() */ +enum MDBX_chk_stage { + MDBX_chk_none, + MDBX_chk_init, + MDBX_chk_lock, + MDBX_chk_meta, + MDBX_chk_traversal_tree, + MDBX_chk_traversal_freedb, + MDBX_chk_space, + MDBX_chk_traversal_maindb, + MDBX_chk_traversal_subdbs, + MDBX_chk_conclude, + MDBX_chk_unlock, + MDBX_chk_finalize +}; + +/** \brief Виртуальная строка отчета, формируемого при проверке целостности базы + * данных. \see mdbx_env_chk() */ +typedef struct MDBX_chk_line { + struct MDBX_chk_context *ctx; + uint8_t severity, scope_depth, empty; + char *begin, *end, *out; +} MDBX_chk_line_t; + +/** \brief Проблема обнаруженная при проверке целостности базы данных. + * \see mdbx_env_chk() */ +typedef struct MDBX_chk_issue { + struct MDBX_chk_issue *next; + size_t count; + const char *caption; +} MDBX_chk_issue_t; + +/** \brief Иерархический контекст при проверке целостности базы данных. 
+ * \see mdbx_env_chk() */ +typedef struct MDBX_chk_scope { + MDBX_chk_issue_t *issues; + struct MDBX_chk_internal *internal; + const void *object; + enum MDBX_chk_stage stage; + enum MDBX_chk_severity verbosity; + size_t subtotal_issues; + union { + void *ptr; + size_t number; + } usr_z, usr_v, usr_o; +} MDBX_chk_scope_t; + +/** \brief Пользовательский тип для привязки дополнительных данных, + * связанных с некоторой таблицей ключ-значение, при проверке целостности базы + * данных. \see mdbx_env_chk() */ +typedef struct MDBX_chk_user_subdb_cookie MDBX_chk_user_subdb_cookie_t; + +/** \brief Гистограмма с некоторой статистической информацией, + * собираемой при проверке целостности БД. + * \see mdbx_env_chk() */ +struct MDBX_chk_histogram { + size_t amount, count, ones, pad; + struct { + size_t begin, end, amount, count; + } ranges[9]; +}; + +/** \brief Информация о некоторой таблицей ключ-значение, + * при проверке целостности базы данных. + * \see mdbx_env_chk() */ +typedef struct MDBX_chk_subdb { + MDBX_chk_user_subdb_cookie_t *cookie; + +/** \brief Pseudo-name for MainDB */ +#define MDBX_CHK_MAIN ((void *)((ptrdiff_t)0)) +/** \brief Pseudo-name for GarbageCollectorDB */ +#define MDBX_CHK_GC ((void *)((ptrdiff_t)-1)) +/** \brief Pseudo-name for MetaPages */ +#define MDBX_CHK_META ((void *)((ptrdiff_t)-2)) + + MDBX_val name; + MDBX_db_flags_t flags; + int id; + + size_t payload_bytes, lost_bytes; + struct { + size_t all, empty, other; + size_t branch, leaf; + size_t nested_branch, nested_leaf, nested_subleaf; + } pages; + struct { + /// Tree deep histogram + struct MDBX_chk_histogram deep; + /// Histogram of large/overflow pages length + struct MDBX_chk_histogram large_pages; + /// Histogram of nested trees height, span length for GC + struct MDBX_chk_histogram nested_tree; + /// Keys length histogram + struct MDBX_chk_histogram key_len; + /// Values length histogram + struct MDBX_chk_histogram val_len; + } histogram; +} MDBX_chk_subdb_t; + +/** \brief Контекст 
проверки целостности базы данных. + * \see mdbx_env_chk() */ +typedef struct MDBX_chk_context { + struct MDBX_chk_internal *internal; + MDBX_env *env; + MDBX_txn *txn; + MDBX_chk_scope_t *scope; + uint8_t scope_nesting; + struct { + size_t total_payload_bytes; + size_t subdb_total, subdb_processed; + size_t total_unused_bytes, unused_pages; + size_t processed_pages, reclaimable_pages, gc_pages, alloc_pages, + backed_pages; + size_t problems_meta, tree_problems, gc_tree_problems, kv_tree_problems, + problems_gc, problems_kv, total_problems; + uint64_t steady_txnid, recent_txnid; + /** Указатель на массив размером subdb_total с указателями на экземпляры + * структур MDBX_chk_subdb_t с информацией о всех таблицах ключ-значение, + * включая MainDB и GC/FreeDB. */ + const MDBX_chk_subdb_t *const *subdbs; + } result; +} MDBX_chk_context_t; + +/** \brief Набор функций обратного вызова используемых при проверке целостности + * базы данных. + * + * Функции обратного вызова предназначены для организации взаимодействия с кодом + * приложения. В том числе, для интеграции логики приложения проверяющей + * целостность структуры данных выше уровня ключ-значение, подготовки и + * структурированного вывода информации как о ходе, так и результатов проверки. + * + * Все функции обратного вызова опциональны, неиспользуемые указатели должны + * быть установлены в `nullptr`. + * + * \note Данный API еще не зафиксирован, в последующих версиях могут быть + * незначительные доработки и изменения. 
+ * + * \see mdbx_env_chk() */ +typedef struct MDBX_chk_callbacks { + bool (*check_break)(MDBX_chk_context_t *ctx); + int (*scope_push)(MDBX_chk_context_t *ctx, MDBX_chk_scope_t *outer, + MDBX_chk_scope_t *inner, const char *fmt, va_list args); + int (*scope_conclude)(MDBX_chk_context_t *ctx, MDBX_chk_scope_t *outer, + MDBX_chk_scope_t *inner, int err); + void (*scope_pop)(MDBX_chk_context_t *ctx, MDBX_chk_scope_t *outer, + MDBX_chk_scope_t *inner); + void (*issue)(MDBX_chk_context_t *ctx, const char *object, + uint64_t entry_number, const char *issue, const char *extra_fmt, + va_list extra_args); + MDBX_chk_user_subdb_cookie_t *(*subdb_filter)(MDBX_chk_context_t *ctx, + const MDBX_val *name, + MDBX_db_flags_t flags); + int (*subdb_conclude)(MDBX_chk_context_t *ctx, const MDBX_chk_subdb_t *subdb, + MDBX_cursor *cursor, int err); + void (*subdb_dispose)(MDBX_chk_context_t *ctx, const MDBX_chk_subdb_t *subdb); + + int (*subdb_handle_kv)(MDBX_chk_context_t *ctx, const MDBX_chk_subdb_t *subdb, + size_t entry_number, const MDBX_val *key, + const MDBX_val *value); + + int (*stage_begin)(MDBX_chk_context_t *ctx, enum MDBX_chk_stage); + int (*stage_end)(MDBX_chk_context_t *ctx, enum MDBX_chk_stage, int err); + + MDBX_chk_line_t *(*print_begin)(MDBX_chk_context_t *ctx, + enum MDBX_chk_severity severity); + void (*print_flush)(MDBX_chk_line_t *); + void (*print_done)(MDBX_chk_line_t *); + void (*print_chars)(MDBX_chk_line_t *, const char *str, size_t len); + void (*print_format)(MDBX_chk_line_t *, const char *fmt, va_list args); + void (*print_size)(MDBX_chk_line_t *, const char *prefix, + const uint64_t value, const char *suffix); +} MDBX_chk_callbacks_t; + +/** \brief Проверяет целостность базы данных. + * + * Взаимодействие с кодом приложения реализуется через функции обратного вызова, + * предоставляемые приложением посредством параметра `cb`. 
В ходе такого + * взаимодействия приложение может контролировать ход проверки, в том числе, + * пропускать/фильтровать обработку отдельных элементов, а также реализовать + * дополнительную верификацию структуры и/или информации с учетом назначения и + * семантической значимости для приложения. Например, приложение может выполнить + * проверку собственных индексов и корректность записей в БД. Именно с этой + * целью функционал проверки целостности был доработан для интенсивного + * использования обратных вызовов и перенесен из утилиты `mdbx_chk` в основную + * библиотеку. + * + * Проверка выполняется в несколько стадий, начиная с инициализации и до + * завершения, более подробно см \ref MDBX_chk_stage. О начале и завершении + * каждой стадии код приложения уведомляется через соответствующие функции + * обратного вызова, более подробно см \ref MDBX_chk_callbacks_t. + * + * \param [in] env Указатель на экземпляр среды. + * \param [in] cb Набор функций обратного вызова. + * \param [in,out] ctx Контекст проверки целостности базы данных, + * где будут формироваться результаты проверки. + * \param [in] flags Флаги/опции проверки целостности базы данных. + * \param [in] verbosity Необходимый уровень детализации информации о ходе + * и результатах проверки. + * \param [in] timeout_seconds_16dot16 Ограничение длительности в 1/65536 долях + * секунды для выполнения проверки, + * либо 0 при отсутствии ограничения. + * \returns Нулевое значение в случае успеха, иначе код ошибки. */ +LIBMDBX_API int mdbx_env_chk(MDBX_env *env, const MDBX_chk_callbacks_t *cb, + MDBX_chk_context_t *ctx, + const enum MDBX_chk_flags_t flags, + enum MDBX_chk_severity verbosity, + unsigned timeout_seconds_16dot16); + +/** \brief Вспомогательная функция для подсчета проблем детектируемых + * приложением, в том числе, поступающим к приложению через логирование. + * \see mdbx_env_chk() + * \see MDBX_debug_func + * \returns Нулевое значение в случае успеха, иначе код ошибки. 
*/ +LIBMDBX_API int mdbx_env_chk_encount_problem(MDBX_chk_context_t *ctx); + +/** end of chk @} */ /** end of c_api @} */ diff --git a/mdbx/txn_test.go b/mdbx/txn_test.go index 515ac3a..277e29b 100644 --- a/mdbx/txn_test.go +++ b/mdbx/txn_test.go @@ -4,6 +4,7 @@ package mdbx import ( "bytes" "encoding/binary" + "errors" "fmt" "runtime" "syscall" @@ -421,16 +422,19 @@ func TestTxn_OpenDBI_emptyName(t *testing.T) { func TestTxn_OpenDBI_zero(t *testing.T) { env, _ := setup(t) - err := env.View(func(txn *Txn) (err error) { - _, err = txn.OpenRoot(0) - if err != nil { - return err - } - _, err = txn.Get(0, []byte("k")) - return err - }) - if !IsErrno(err, BadDBI) { - t.Errorf("mdb_dbi_open: %v", err) + txn, err := env.BeginTxn(nil, 0) + if err != nil { + panic(err) + } + defer txn.Abort() + + dbi, err := txn.OpenRoot(0) + if err != nil { + panic(err) + } + _, err = txn.Get(dbi, []byte("k")) + if !errors.Is(err, ErrNotFound) { + panic(err) } } diff --git a/mdbxdist/CMakeLists.txt b/mdbxdist/CMakeLists.txt index 33e6233..2fc9e8c 100644 --- a/mdbxdist/CMakeLists.txt +++ b/mdbxdist/CMakeLists.txt @@ -1,5 +1,5 @@ ## -## Copyright 2020-2023 Leonid Yuriev +## Copyright 2020-2024 Leonid Yuriev ## and other libmdbx authors: please see AUTHORS file. ## All rights reserved. 
## @@ -305,7 +305,7 @@ else() "${CMAKE_CURRENT_SOURCE_DIR}/test/valgrind_suppress.txt" CACHE FILEPATH "Suppressions file for Valgrind" FORCE) set(MEMORYCHECK_COMMAND_OPTIONS - "--trace-children=yes --leak-check=full --track-origins=yes --error-exitcode=42 --error-markers=@ --errors-for-leak-kinds=definite --fair-sched=yes --suppressions=${MEMORYCHECK_SUPPRESSIONS_FILE}" + "--trace-children=yes --leak-check=full --track-origins=yes --track-origins=yes --error-exitcode=42 --error-markers=@ --errors-for-leak-kinds=definite --fair-sched=yes --suppressions=${MEMORYCHECK_SUPPRESSIONS_FILE}" CACHE STRING "Valgrind options" FORCE) set(VALGRIND_COMMAND_OPTIONS "${MEMORYCHECK_COMMAND_OPTIONS}" CACHE STRING "Valgrind options" FORCE) endif() @@ -467,7 +467,7 @@ endif() # #### # # # #### # # #### # -set(MDBX_BUILD_OPTIONS ENABLE_UBSAN ENABLE_ASAN MDBX_USE_VALGRIND ENABLE_GPROF ENABLE_GCOV) +set(MDBX_BUILD_OPTIONS ENABLE_UBSAN ENABLE_ASAN ENABLE_MEMCHECK ENABLE_GPROF ENABLE_GCOV) macro(add_mdbx_option NAME DESCRIPTION DEFAULT) list(APPEND MDBX_BUILD_OPTIONS ${NAME}) if(NOT ${DEFAULT} STREQUAL "AUTO") @@ -531,6 +531,8 @@ add_mdbx_option(MDBX_ENABLE_BIGFOOT "Chunking long list of retired pages during add_mdbx_option(MDBX_ENABLE_PGOP_STAT "Gathering statistics for page operations" ON) add_mdbx_option(MDBX_ENABLE_PROFGC "Profiling of GC search and updates" OFF) mark_as_advanced(MDBX_ENABLE_PROFGC) +add_mdbx_option(MDBX_ENABLE_DBI_SPARSE "FIXME" ON) +add_mdbx_option(MDBX_ENABLE_DBI_LOCKFREE "FIXME" ON) if(NOT MDBX_AMALGAMATED_SOURCE) if(CMAKE_CONFIGURATION_TYPES OR CMAKE_BUILD_TYPE_UPPERCASE STREQUAL "DEBUG") diff --git a/mdbxdist/ChangeLog.md b/mdbxdist/ChangeLog.md index 0934d9f..51269f9 100644 --- a/mdbxdist/ChangeLog.md +++ b/mdbxdist/ChangeLog.md @@ -4,19 +4,143 @@ ChangeLog English version [by Google](https://gitflic-ru.translate.goog/project/erthink/libmdbx/blob?file=ChangeLog.md&_x_tr_sl=ru&_x_tr_tl=en) and [by 
Yandex](https://translated.turbopages.org/proxy_u/ru-en.en/https/gitflic.ru/project/erthink/libmdbx/blob?file=ChangeLog.md). -## v0.12.10 (подготовка к выпуску) +## v0.13.1 (в процессе подготовки выпуска) -Поддерживающий выпуск с исправлением обнаруженных ошибок и устранением недочетов. +Новая версия с существенным расширением API и добавлением функционала. +В том числе, с незначительным нарушением обратной совместимости API +библиотеки. + +Новое: + + - Перенос функционала утилиты `mdbx_chk` внутрь библиотеки в виде + функции `mdbx_env_chk()` для проверки целостности структуры БД, в том + числе с вовлечением логики приложения. + + - Опция `MDBX_opt_gc_time_limit` для более гибкого контроля времени + расходуемого на поиск последовательностей соседствующих свободных + страниц в GC. + + - Снижение накладных расходов на запуск транзакций в сценариях с + большим количеством DBI-хендов, за счет отложенной/ленивой инициализации + элементов служебных таблиц. В том числе, механизм поддержки разреженных + наборов DBI-хендов, управляемый опцией сборки `MDBX_ENABLE_DBI_SPARSE`, + которая включена по-умолчанию. + + - Снижение накладных расходов на открытие DBI-хендов. В том числе, + механизм отложенного освобождения и поддержки быстрого пути открытия без + использования блокировок, управляемый опцией сборки + `MDBX_ENABLE_DBI_LOCKFREE`, которая включена по-умолчанию. + + - Расширение API позиционирования курсоров более удобными и очевидными + операциями по аналогии условиям `<`, `<=`, `==`, `>=`, `>` как для + ключей, так и для пар ключ-значение. + + - Функции `mdbx_dbi_rename()` и `mdbx_dbi_rename2()` для переименования таблиц. + + - Функции `mdbx_cursor_unbind()` и `mdbx_txn_release_all_cursors()` для + гибкого управления курсорами в сценариях повторного использования для + уменьшения накладных расходов. + + - Функция `mdbx_env_resurrect_after_fork()` для восстановления открытой + среды работы с БД в дочернем процессе после ветвления/расщепления + процесса. 
+ + - Функция `mdbx_cursor_compare()` для сравнения позиций курсоров + аналогично оператору `<=>`. + + - Функции `mdbx_cursor_scan()` и `mdbx_cursor_scan_from()` для + сканирования таблиц с использованием функционального предиката и + уменьшением сопутствующих накладных расходов. + + - Функции `mdbx_cursor_on_first_dup()` и `mdbx_cursor_on_last_dup()` + для оценки позиции курсора. + + - Функция `mdbx_preopen_snapinfo()` для получения информации о БД без + её открытия. + + - Поддержка функций логирования обратного вызова без функционала + `vprintf()`, что существенно облегчает использование логирования в + привязках к другим языкам программирования. + + - Режим работы `MDBX_NOSTICKYTHREADS` вместо `MDBX_NOTLS` для упрощения + интеграции с легковесными потоками/нитями их мультиплексирования вместе + с транзакциями по потокам операционной системы. + + - TODO: Опция `MDBX_opt_prefer_waf_insteadof_balance`. + + - TODO: Опции `MDBX_opt_subpage_limit`, `MDBX_opt_subpage_room_threshold`, `MDBX_opt_subpage_reserve_prereq`, `MDBX_opt_subpage_reserve_limit`. + + - Управление основной блокировкой lock/unlock/upgrade/downgrade для координации пишущих транзакций. + + - Функции `mdbx_limits_keysize_min()` и `mdbx_limits_valsize_min()` для + получения нижней границы длины ключей и значений. 
+ + - Расширение и доработка C++ API: + + - добавлен тип `mdbx::cursor::estimation_result`, а поведение методов + `cursor::estimate()` унифицировано с `cursor::move()`; + - для предотвращения незаметного неверного использования API, для инициализации + возвращаемых по ссылке срезов, вместо пустых срезов задействован `slice::invalid()`; + - добавлены дополнительные C++ операторы преобразования к типам C API; + - для совместимости со старыми стандартами C++ и старыми версиями STL перенесены + в public классы `buffer::move_assign_alloc` и `buffer::copy_assign_alloc`; + - добавлен тип `mdbx::default_buffer`; + - для срезов и буферов добавлены методы `hex_decode()`, `base64_decode()`, `base58_decode()`; + - добавлен тип `mdbx::comparator` и функций `mdbx::default_comparator()`; + - добавлены статические методы `buffer::hex()`, `base64()`, `base58()`; + - для транзакций и курсоров добавлены методы `get_/set_context`; + - добавлен метод `cursor::clone()`; + - поддержка base58 переработана и приведена в соответствии с черновиком RFC, в текущем понимании теперь это одна из самых высокопроизводительных реализаций; + - переработка `to_hex()` и `from_hex()`. + +Нарушение совместимости: + - Опция `MDBX_COALESCE` объявлена устаревшей, так как соответствующий функционал всегда включен начиная с предыдущей версии 0.12. + - Опция `MDBX_NOTLS` объявлена устаревшей и заменена на `MDBX_NOSTICKYTHREADS`. + - Опция сборки `MDBX_USE_VALGRIND` заменена на общепринятую `ENABLE_MEMCHECK`. + - В структуре `MDBX_envinfo` серии полей вида `meta1`, `meta2` и `meta3` заменены на массивы вида `meta[3]`. + - В шаблонных классах и функциях С++ API по-умолчанию вместо `mdbx::legacy_buffer` использован тип `mdbx::default_buffer` использующий полиморфные аллокаторы С++ 17. + + +## v0.13.0 от 2023-04-23 + +Не выпуск, а начало ветки `0.13` с новым функционалом и изменением API. 
+ +Новое: + + - Расширение API функционалом проверки целостности структуры БД, с + переработкой и переносом функционала утилиты `mdbx_chk` внутрь библиотеки. + + - Расширение API функциями lock/unlock/upgrade/downgrade основной блокировки. + + - Добавление в API функций `mdbx_cursor_unbind()` и `mdbx_txn_release_all_cursors()`. + + - Возвращение `MDBX_TXN_INVALID` (`INT32_MIN`) вместо `-1` + из `mdbx_txn_flags()` при передаче невалидной транзакции. + +Мелочи: + + - Обновление конфигурации Doxygen до 1.9.6. + - Добавление `--read-var-info=yes` для Valgrind. + - Вывод из `mdbx_chk` информации об уровне детализации/verbosity. + + +******************************************************************************** + + +## v0.12.10 "СЭМ" от 2024-03-12 + +Поддерживающий выпуск с исправлением обнаруженных ошибок и устранением недочетов +в память Героя России гвардии майора Дмитрия Семёнова с позывным "СЭМ". ``` -git diff' stat: 16 commits, 10 files changed, 665 insertions(+), 238 deletions(-) +git diff' stat: 19 commits, 57 files changed, 751 insertions(+), 331 deletions(-) Signed-off-by: Леонид Юрьев (Leonid Yuriev) ``` Благодарности: - [Dvir H](https://t.me/Dvirsw) за [сообщение](https://t.me/libmdbx/5368) - об ошибке `MDBX_CORRUPTED` в сценарии работы в режиме `MDBX_DUPFIXED` и нечетной длинной + об ошибке `MDBX_CORRUPTED` в сценарии работы в режиме `MDBX_DUPFIXED` и нечетной длиной мульти-значений, с предоставлением точного минимального сценария воспроизведения. Значимые исправления и доработки: @@ -34,10 +158,10 @@ Signed-off-by: Леонид Юрьев (Leonid Yuriev) вероятность проявления близка к нулю, а сценарий такого проявления найти не удалось. В MDBX ошибка присутствовала с момента отделения проекта от LMDB, - где эта ошибка присутствует более 11 лети, по настоящее время. + где эта ошибка присутствует более 11 лет, по настоящее время. - Исправление ложной ошибки `MDBX_CORRUPTED (-30796)` в сценарии работы - в режиме `MDBX_DUPFIXED` и нечетной длинной мульти-значений. 
+ в режиме `MDBX_DUPFIXED` и нечетной длиной мульти-значений. - Исправление недочета корректировки сопутствующих курсоров при разделении страницы по сценарию добавления пустой страницы слева. @@ -45,7 +169,7 @@ Signed-off-by: Леонид Юрьев (Leonid Yuriev) - Доработка `rebalance()` ради уменьшения WAF. Новый функционал, включая контролируемую пользователем опцию `enum MDBX_option_t`, будет доступен в выпусках ветки `0.13.x`, а в этом выпуске доработка сводится к тактике - не-вовленичения чистой страницы при нехватке запаса страниц в ходе обновления GC, + не-вовлечения чистой страницы при нехватке запаса страниц в ходе обновления GC, за счет ухудшения баланса дерева страниц. - Устранение упущения приводящего к нелогичной ситуации @@ -62,6 +186,13 @@ Signed-off-by: Леонид Юрьев (Leonid Yuriev) - Исправление assert-проверки внутри `check_txn()` для случая завершенных транзакций в режиме `MDBX_NO_TLS`. Последствий ошибки, кроме срабатывания assert-проверки в отладочных сборках, нет. + - Устранение ошибки при открытии БД на файловой системе только-для-чтения. + + - Удалены излишне строгие проверки в утилите `mdbx_chk`, которые + приводили к ложно-позитивным ошибкам при проверке БД после серии + последних доработок. Подробности см в комментариях к коммитам [781b3f64d52b73cbaeb00a55811d1247c25624a8](https://gitflic.ru/project/erthink/libmdbx/commit/781b3f64d52b73cbaeb00a55811d1247c25624a8) + и [0741c81cfd8dc0864fcf55e04192b2207c8f68f7](https://gitflic.ru/project/erthink/libmdbx/commit/0741c81cfd8dc0864fcf55e04192b2207c8f68f7). + Прочее: - Расширение стохастического теста dupfixed-сценариями. @@ -144,6 +275,7 @@ Signed-off-by: Леонид Юрьев (Leonid Yuriev) ## v0.12.8 "Владимир Уткин" от 2023-10-17 + Стабилизирующий выпуск с исправлением обнаруженных ошибок и устранением недочетов, в день 100-летия со дня рождения выдающегося советского и российского ученого и конструктора [Влади́мира Фёдоровича У́ткина](https://ru.wikipedia.org/wiki/Уткин,_Владимир_Фёдорович). 
@@ -161,7 +293,7 @@ Signed-off-by: Леонид Юрьев (Leonid Yuriev) - Устранение регресса/ошибки в пути обработки `put(MDBX_MULTIPLE)` при пакетном/оптовом помещении в БД множественных значений одного ключа (aka multi-value или dupsort). - Проявление проблемы зависит от компилятора и опций оптимизации/кодогенерации, но с большой вероятностью возвращется + Проявление проблемы зависит от компилятора и опций оптимизации/кодогенерации, но с большой вероятностью возвращается ошибка `MDBX_BAD_VALSIZE` (`-30781`), а в отладочных сборках срабатывает проверка `cASSERT(mc, !"Invalid key-size")`. Сценарии приводящие к другим проявлениям на данный момент не известны. diff --git a/mdbxdist/Makefile b/mdbxdist/Makefile index 599e478..78ba348 100644 --- a/mdbxdist/Makefile +++ b/mdbxdist/Makefile @@ -6,9 +6,10 @@ bench bench-clean bench-couple bench-quartet bench-triplet re-bench \ lib libs lib-static lib-shared tools-static \ libmdbx mdbx mdbx_chk mdbx_copy mdbx_drop mdbx_dump mdbx_load mdbx_stat \ check dist memcheck cross-gcc cross-qemu doxygen gcc-analyzer reformat \ -release-assets tags test build-test mdbx_test smoke smoke-fault smoke-singleprocess \ -smoke-assertion test-assertion long-test-assertion \ -test-asan test-leak test-singleprocess test-ubsan test-valgrind: +release-assets tags build-test mdbx_test \ +smoke smoke-fault smoke-singleprocess smoke-assertion smoke-memcheck \ +test test-assertion test-long test-long-assertion test-ci test-ci-extra \ +test-asan test-leak test-singleprocess test-ubsan test-memcheck: @CC=$(CC) \ CXX=`if test -n "$(CXX)" && which "$(CXX)" > /dev/null; then echo "$(CXX)"; elif test -n "$(CCC)" && which "$(CCC)" > /dev/null; then echo "$(CCC)"; else echo "c++"; fi` \ `which gmake || which gnumake || echo 'echo "GNU Make 3.80 or above is required"; exit 2;'` \ diff --git a/mdbxdist/README.md b/mdbxdist/README.md index de335b0..c0c21c1 100644 --- a/mdbxdist/README.md +++ b/mdbxdist/README.md @@ -1,7 +1,19 @@ +### Время учить Русский + +Начиная с 
2021 года наблюдаются устойчивые тенденции к распространению +недостоверной информации о _libmdbx_ в странах ~~НАТО~~, +политизированной критике, а также отказу от использования библиотеки в +пользу LMDB, несмотря на явные проблемы с одной стороны и преимущества с +другой. Поэтому начиная с 17 марта 2024 года прекращается +документирование и сопровождение проекта на английском языке. Новый +функционал будет документироваться только на русском языке, однако, +целенаправленного переписывания/перевода документации пока не +планируется. + ### The origin has been migrated to [GitFlic](https://gitflic.ru/project/erthink/libmdbx) -since on 2022-04-15 the Github administration, without any warning +Since on 2022-04-15 the Github administration, without any warning nor explanation, deleted _libmdbx_ along with a lot of other projects, simultaneously blocking access for many developers. For the same reason ~~Github~~ is blacklisted forever. @@ -10,7 +22,7 @@ GitFlic's developers plan to support other languages, including English 和 中文, in the near future. ### Основной репозиторий перемещен на [GitFlic](https://gitflic.ru/project/erthink/libmdbx) -так как 15 апреля 2022 администрация Github без предупреждения и +Так как 15 апреля 2022 администрация Github без предупреждения и объяснения причин удалила _libmdbx_ вместе с массой других проектов, одновременно заблокировав доступ многим разработчикам. По этой же причине ~~Github~~ навсегда занесен в черный список. diff --git a/mdbxdist/VERSION.txt b/mdbxdist/VERSION.txt index d14e996..a95960b 100644 --- a/mdbxdist/VERSION.txt +++ b/mdbxdist/VERSION.txt @@ -1 +1 @@ -0.12.9.16 +0.13.0.38 diff --git a/mdbxdist/cmake/compiler.cmake b/mdbxdist/cmake/compiler.cmake index 762ea1b..73cd350 100644 --- a/mdbxdist/cmake/compiler.cmake +++ b/mdbxdist/cmake/compiler.cmake @@ -1,4 +1,4 @@ -## Copyright (c) 2012-2023 Leonid Yuriev . +## Copyright (c) 2012-2024 Leonid Yuriev . 
## ## Licensed under the Apache License, Version 2.0 (the "License"); ## you may not use this file except in compliance with the License. diff --git a/mdbxdist/cmake/profile.cmake b/mdbxdist/cmake/profile.cmake index f13b697..a77b7dc 100644 --- a/mdbxdist/cmake/profile.cmake +++ b/mdbxdist/cmake/profile.cmake @@ -1,4 +1,4 @@ -## Copyright (c) 2012-2023 Leonid Yuriev . +## Copyright (c) 2012-2024 Leonid Yuriev . ## ## Licensed under the Apache License, Version 2.0 (the "License"); ## you may not use this file except in compliance with the License. @@ -24,6 +24,25 @@ endif() cmake_policy(PUSH) cmake_policy(VERSION ${CMAKE_MINIMUM_REQUIRED_VERSION}) +unset(MEMCHECK_OPTION_NAME) +if(NOT DEFINED ENABLE_MEMCHECK) + if (DEFINED MDBX_USE_VALGRIND) + set(MEMCHECK_OPTION_NAME "MDBX_USE_VALGRIND") + elseif(DEFINED ENABLE_VALGRIND) + set(MEMCHECK_OPTION_NAME "ENABLE_VALGRIND") + else() + set(MEMCHECK_OPTION_NAME "ENABLE_MEMCHECK") + endif() + if(MEMCHECK_OPTION_NAME STREQUAL "ENABLE_MEMCHECK") + option(ENABLE_MEMCHECK + "Enable integration with valgrind, a memory analyzing tool" OFF) + elseif(${MEMCHECK_OPTION_NAME}) + set(ENABLE_MEMCHECK ON) + else() + set(ENABLE_MEMCHECK OFF) + endif() +endif() + include(CheckLibraryExists) check_library_exists(gcov __gcov_flush "" HAVE_GCOV) @@ -33,23 +52,23 @@ option(ENABLE_GCOV option(ENABLE_GPROF "Enable integration with gprof, a performance analyzing tool" OFF) -if(CMAKE_CXX_COMPILER_LOADED) - include(CheckIncludeFileCXX) - check_include_file_cxx(valgrind/memcheck.h HAVE_VALGRIND_MEMCHECK_H) -else() - include(CheckIncludeFile) - check_include_file(valgrind/memcheck.h HAVE_VALGRIND_MEMCHECK_H) -endif() - -option(MDBX_USE_VALGRIND "Enable integration with valgrind, a memory analyzing tool" OFF) -if(MDBX_USE_VALGRIND AND NOT HAVE_VALGRIND_MEMCHECK_H) - message(FATAL_ERROR "MDBX_USE_VALGRIND option is set but valgrind/memcheck.h is not found") -endif() - option(ENABLE_ASAN "Enable AddressSanitizer, a fast memory error detector based on 
compiler instrumentation" OFF) option(ENABLE_UBSAN "Enable UndefinedBehaviorSanitizer, a fast undefined behavior detector based on compiler instrumentation" OFF) +if(ENABLE_MEMCHECK) + if(CMAKE_CXX_COMPILER_LOADED) + include(CheckIncludeFileCXX) + check_include_file_cxx(valgrind/memcheck.h HAVE_VALGRIND_MEMCHECK_H) + else() + include(CheckIncludeFile) + check_include_file(valgrind/memcheck.h HAVE_VALGRIND_MEMCHECK_H) + endif() + if(NOT HAVE_VALGRIND_MEMCHECK_H) + message(FATAL_ERROR "${MEMCHECK_OPTION_NAME} option is set but valgrind/memcheck.h is not found") + endif() +endif() + cmake_policy(POP) diff --git a/mdbxdist/cmake/utils.cmake b/mdbxdist/cmake/utils.cmake index aa8aef0..0fa5784 100644 --- a/mdbxdist/cmake/utils.cmake +++ b/mdbxdist/cmake/utils.cmake @@ -1,4 +1,4 @@ -## Copyright (c) 2012-2023 Leonid Yuriev . +## Copyright (c) 2012-2024 Leonid Yuriev . ## ## Licensed under the Apache License, Version 2.0 (the "License"); ## you may not use this file except in compliance with the License. diff --git a/mdbxdist/config.h.in b/mdbxdist/config.h.in index 05c561b..88a282c 100644 --- a/mdbxdist/config.h.in +++ b/mdbxdist/config.h.in @@ -5,7 +5,7 @@ /* clang-format off */ #cmakedefine LTO_ENABLED -#cmakedefine MDBX_USE_VALGRIND +#cmakedefine ENABLE_MEMCHECK #cmakedefine ENABLE_GPROF #cmakedefine ENABLE_GCOV #cmakedefine ENABLE_ASAN @@ -33,6 +33,8 @@ #cmakedefine01 MDBX_ENABLE_BIGFOOT #cmakedefine01 MDBX_ENABLE_PGOP_STAT #cmakedefine01 MDBX_ENABLE_PROFGC +#cmakedefine01 MDBX_ENABLE_DBI_SPARSE +#cmakedefine01 MDBX_ENABLE_DBI_LOCKFREE /* Windows */ #cmakedefine01 MDBX_WITHOUT_MSVC_CRT diff --git a/mdbxdist/man1/mdbx_chk.1 b/mdbxdist/man1/mdbx_chk.1 index aa4e986..bc438e8 100644 --- a/mdbxdist/man1/mdbx_chk.1 +++ b/mdbxdist/man1/mdbx_chk.1 @@ -1,6 +1,6 @@ -.\" Copyright 2015-2023 Leonid Yuriev . +.\" Copyright 2015-2024 Leonid Yuriev . .\" Copying restrictions apply. See COPYRIGHT/LICENSE. 
-.TH MDBX_CHK 1 "2023-10-17" "MDBX 0.12.8" +.TH MDBX_CHK 1 "2024-03-21" "MDBX 0.13" .SH NAME mdbx_chk \- MDBX checking tool .SH SYNOPSIS diff --git a/mdbxdist/man1/mdbx_copy.1 b/mdbxdist/man1/mdbx_copy.1 index 4e67a5b..14663b8 100644 --- a/mdbxdist/man1/mdbx_copy.1 +++ b/mdbxdist/man1/mdbx_copy.1 @@ -1,8 +1,8 @@ -.\" Copyright 2015-2023 Leonid Yuriev . +.\" Copyright 2015-2024 Leonid Yuriev . .\" Copyright 2015,2016 Peter-Service R&D LLC . .\" Copyright 2012-2015 Howard Chu, Symas Corp. All Rights Reserved. .\" Copying restrictions apply. See COPYRIGHT/LICENSE. -.TH MDBX_COPY 1 "2023-10-17" "MDBX 0.12.8" +.TH MDBX_COPY 1 "2024-03-21" "MDBX 0.13" .SH NAME mdbx_copy \- MDBX environment copy tool .SH SYNOPSIS diff --git a/mdbxdist/man1/mdbx_drop.1 b/mdbxdist/man1/mdbx_drop.1 index 425eecd..7ae14f9 100644 --- a/mdbxdist/man1/mdbx_drop.1 +++ b/mdbxdist/man1/mdbx_drop.1 @@ -1,7 +1,7 @@ -.\" Copyright 2021-2023 Leonid Yuriev . +.\" Copyright 2021-2024 Leonid Yuriev . .\" Copyright 2014-2021 Howard Chu, Symas Corp. All Rights Reserved. .\" Copying restrictions apply. See COPYRIGHT/LICENSE. -.TH MDBX_DROP 1 "2023-10-17" "MDBX 0.12.8" +.TH MDBX_DROP 1 "2024-03-21" "MDBX 0.13" .SH NAME mdbx_drop \- MDBX database delete tool .SH SYNOPSIS diff --git a/mdbxdist/man1/mdbx_dump.1 b/mdbxdist/man1/mdbx_dump.1 index d236b93..51e6cac 100644 --- a/mdbxdist/man1/mdbx_dump.1 +++ b/mdbxdist/man1/mdbx_dump.1 @@ -1,8 +1,8 @@ -.\" Copyright 2015-2023 Leonid Yuriev . +.\" Copyright 2015-2024 Leonid Yuriev . .\" Copyright 2015,2016 Peter-Service R&D LLC . .\" Copyright 2014-2015 Howard Chu, Symas Corp. All Rights Reserved. .\" Copying restrictions apply. See COPYRIGHT/LICENSE. 
-.TH MDBX_DUMP 1 "2023-10-17" "MDBX 0.12.8" +.TH MDBX_DUMP 1 "2024-03-21" "MDBX 0.13" .SH NAME mdbx_dump \- MDBX environment export tool .SH SYNOPSIS diff --git a/mdbxdist/man1/mdbx_load.1 b/mdbxdist/man1/mdbx_load.1 index ae8e759..b7fa87f 100644 --- a/mdbxdist/man1/mdbx_load.1 +++ b/mdbxdist/man1/mdbx_load.1 @@ -1,8 +1,8 @@ -.\" Copyright 2015-2023 Leonid Yuriev . +.\" Copyright 2015-2024 Leonid Yuriev . .\" Copyright 2015,2016 Peter-Service R&D LLC . .\" Copyright 2014-2015 Howard Chu, Symas Corp. All Rights Reserved. .\" Copying restrictions apply. See COPYRIGHT/LICENSE. -.TH MDBX_LOAD 1 "2023-10-17" "MDBX 0.12.8" +.TH MDBX_LOAD 1 "2024-03-21" "MDBX 0.13" .SH NAME mdbx_load \- MDBX environment import tool .SH SYNOPSIS diff --git a/mdbxdist/man1/mdbx_stat.1 b/mdbxdist/man1/mdbx_stat.1 index c330d2e..997bdae 100644 --- a/mdbxdist/man1/mdbx_stat.1 +++ b/mdbxdist/man1/mdbx_stat.1 @@ -1,8 +1,8 @@ -.\" Copyright 2015-2023 Leonid Yuriev . +.\" Copyright 2015-2024 Leonid Yuriev . .\" Copyright 2015,2016 Peter-Service R&D LLC . .\" Copyright 2012-2015 Howard Chu, Symas Corp. All Rights Reserved. .\" Copying restrictions apply. See COPYRIGHT/LICENSE. -.TH MDBX_STAT 1 "2023-10-17" "MDBX 0.12.8" +.TH MDBX_STAT 1 "2024-03-21" "MDBX 0.13" .SH NAME mdbx_stat \- MDBX environment status tool .SH SYNOPSIS diff --git a/mdbxdist/mdbx.c b/mdbxdist/mdbx.c index 4a6ea35..01303e0 100644 --- a/mdbxdist/mdbx.c +++ b/mdbxdist/mdbx.c @@ -1,5 +1,5 @@ /* - * Copyright 2015-2023 Leonid Yuriev + * Copyright 2015-2024 Leonid Yuriev * and other libmdbx authors: please see AUTHORS file. * All rights reserved. * @@ -12,7 +12,7 @@ * . 
*/ #define xMDBX_ALLOY 1 -#define MDBX_BUILD_SOURCERY 0c86daff919dc09ab5e99c8e5bfb1d51e1367babca9c630c97af92a2aa988ea9_v0_12_9_16_gfff3fbd8 +#define MDBX_BUILD_SOURCERY 91ff5b5423830ee44fca4b70dcb298f233338a17a3185c44df67ec16d3034af3_v0_13_0_38_gf1975363 #ifdef MDBX_CONFIG_H #include MDBX_CONFIG_H #endif @@ -25,11 +25,13 @@ #ifdef xMDBX_ALLOY /* Amalgamated build */ #define MDBX_INTERNAL_FUNC static -#define MDBX_INTERNAL_VAR static +#define MDBX_INTERNAL_VAR_PROTO static +#define MDBX_INTERNAL_VAR_INSTA static #else /* Non-amalgamated build */ #define MDBX_INTERNAL_FUNC -#define MDBX_INTERNAL_VAR extern +#define MDBX_INTERNAL_VAR_PROTO extern +#define MDBX_INTERNAL_VAR_INSTA #endif /* xMDBX_ALLOY */ /*----------------------------------------------------------------------------*/ @@ -94,6 +96,10 @@ disable : 5105) /* winbase.h(9531): warning C5105: macro expansion \ producing 'defined' has undefined behavior */ #endif +#if _MSC_VER < 1920 +/* avoid "error C2219: syntax error: type qualifier must be after '*'" */ +#define __restrict +#endif #if _MSC_VER > 1930 #pragma warning(disable : 6235) /* is always a constant */ #pragma warning(disable : 6237) /* is never evaluated and might \ @@ -139,7 +145,7 @@ #include "mdbx.h" /* - * Copyright 2015-2023 Leonid Yuriev + * Copyright 2015-2024 Leonid Yuriev * and other libmdbx authors: please see AUTHORS file. * All rights reserved. 
* @@ -187,6 +193,7 @@ #include #include +#include #include #include #include @@ -824,7 +831,7 @@ __extern_C key_t ftok(const char *, int); /*----------------------------------------------------------------------------*/ -#if defined(MDBX_USE_VALGRIND) +#if defined(ENABLE_MEMCHECK) #include #ifndef VALGRIND_DISABLE_ADDR_ERROR_REPORTING_IN_RANGE /* LY: available since Valgrind 3.10 */ @@ -846,7 +853,7 @@ __extern_C key_t ftok(const char *, int); #define VALGRIND_CHECK_MEM_IS_ADDRESSABLE(a, s) (0) #define VALGRIND_CHECK_MEM_IS_DEFINED(a, s) (0) #define RUNNING_ON_VALGRIND (0) -#endif /* MDBX_USE_VALGRIND */ +#endif /* ENABLE_MEMCHECK */ #ifdef __SANITIZE_ADDRESS__ #include @@ -993,7 +1000,7 @@ extern "C" { /* https://en.wikipedia.org/wiki/Operating_system_abstraction_layer */ /* - * Copyright 2015-2023 Leonid Yuriev + * Copyright 2015-2024 Leonid Yuriev * and other libmdbx authors: please see AUTHORS file. * All rights reserved. * @@ -1197,8 +1204,8 @@ typedef pthread_mutex_t osal_fastmutex_t; /*----------------------------------------------------------------------------*/ /* OS abstraction layer stuff */ -MDBX_INTERNAL_VAR unsigned sys_pagesize; -MDBX_MAYBE_UNUSED MDBX_INTERNAL_VAR unsigned sys_pagesize_ln2, +MDBX_INTERNAL_VAR_PROTO unsigned sys_pagesize; +MDBX_MAYBE_UNUSED MDBX_INTERNAL_VAR_PROTO unsigned sys_pagesize_ln2, sys_allocation_granularity; /* Get the size of a memory page for the system. 
@@ -1462,8 +1469,9 @@ MDBX_MAYBE_UNUSED static __inline void jitter4testing(bool tiny); #endif #if defined(__linux__) || defined(__gnu_linux__) -MDBX_INTERNAL_VAR uint32_t linux_kernel_version; -MDBX_INTERNAL_VAR bool mdbx_RunningOnWSL1 /* Windows Subsystem 1 for Linux */; +MDBX_INTERNAL_VAR_PROTO uint32_t linux_kernel_version; +MDBX_INTERNAL_VAR_PROTO bool + mdbx_RunningOnWSL1 /* Windows Subsystem 1 for Linux */; #endif /* Linux */ #ifndef osal_strdup @@ -1677,7 +1685,8 @@ MDBX_INTERNAL_FUNC int osal_lck_init(MDBX_env *env, /// restore POSIX-fcntl locks after the closing of file descriptors. /// \return Error code (MDBX_PANIC) or zero on success. MDBX_INTERNAL_FUNC int osal_lck_destroy(MDBX_env *env, - MDBX_env *inprocess_neighbor); + MDBX_env *inprocess_neighbor, + const uint32_t current_pid); /// \brief Connects to shared interprocess locking objects and tries to acquire /// the maximum lock level (shared if exclusive is not available) @@ -1705,6 +1714,8 @@ MDBX_INTERNAL_FUNC int osal_lck_seize(MDBX_env *env); /// operational lock. /// \return Error code or zero on success MDBX_INTERNAL_FUNC int osal_lck_downgrade(MDBX_env *env); +MDBX_MAYBE_UNUSED MDBX_INTERNAL_FUNC int osal_lck_upgrade(MDBX_env *env, + bool dont_wait); /// \brief Locks LCK-file or/and table of readers for (de)registering. /// \return Error code or zero on success @@ -1713,16 +1724,12 @@ MDBX_INTERNAL_FUNC int osal_rdt_lock(MDBX_env *env); /// \brief Unlocks LCK-file or/and table of readers after (de)registering. MDBX_INTERNAL_FUNC void osal_rdt_unlock(MDBX_env *env); -/// \brief Acquires lock for DB change (on writing transaction start) -/// Reading transactions will not be blocked. -/// Declared as LIBMDBX_API because it is used in mdbx_chk. +/// \brief Acquires write-transaction lock. 
/// \return Error code or zero on success -LIBMDBX_API int mdbx_txn_lock(MDBX_env *env, bool dont_wait); +MDBX_INTERNAL_FUNC int osal_txn_lock(MDBX_env *env, bool dont_wait); -/// \brief Releases lock once DB changes is made (after writing transaction -/// has finished). -/// Declared as LIBMDBX_API because it is used in mdbx_chk. -LIBMDBX_API void mdbx_txn_unlock(MDBX_env *env); +/// \brief Releases write-transaction lock.. +MDBX_INTERNAL_FUNC void osal_txn_unlock(MDBX_env *env); /// \brief Sets alive-flag of reader presence (indicative lock) for PID of /// the current process. The function does no more than needed for @@ -1751,7 +1758,7 @@ MDBX_INTERNAL_FUNC int osal_rpid_check(MDBX_env *env, uint32_t pid); MDBX_INTERNAL_FUNC int osal_mb2w(const char *const src, wchar_t **const pdst); typedef void(WINAPI *osal_srwlock_t_function)(osal_srwlock_t *); -MDBX_INTERNAL_VAR osal_srwlock_t_function osal_srwlock_Init, +MDBX_INTERNAL_VAR_PROTO osal_srwlock_t_function osal_srwlock_Init, osal_srwlock_AcquireShared, osal_srwlock_ReleaseShared, osal_srwlock_AcquireExclusive, osal_srwlock_ReleaseExclusive; @@ -1804,7 +1811,7 @@ typedef struct _FILE_REMOTE_PROTOCOL_INFO { typedef BOOL(WINAPI *MDBX_GetFileInformationByHandleEx)( _In_ HANDLE hFile, _In_ FILE_INFO_BY_HANDLE_CLASS FileInformationClass, _Out_ LPVOID lpFileInformation, _In_ DWORD dwBufferSize); -MDBX_INTERNAL_VAR MDBX_GetFileInformationByHandleEx +MDBX_INTERNAL_VAR_PROTO MDBX_GetFileInformationByHandleEx mdbx_GetFileInformationByHandleEx; typedef BOOL(WINAPI *MDBX_GetVolumeInformationByHandleW)( @@ -1813,19 +1820,20 @@ typedef BOOL(WINAPI *MDBX_GetVolumeInformationByHandleW)( _Out_opt_ LPDWORD lpMaximumComponentLength, _Out_opt_ LPDWORD lpFileSystemFlags, _Out_opt_ LPWSTR lpFileSystemNameBuffer, _In_ DWORD nFileSystemNameSize); -MDBX_INTERNAL_VAR MDBX_GetVolumeInformationByHandleW +MDBX_INTERNAL_VAR_PROTO MDBX_GetVolumeInformationByHandleW mdbx_GetVolumeInformationByHandleW; typedef DWORD(WINAPI 
*MDBX_GetFinalPathNameByHandleW)(_In_ HANDLE hFile, _Out_ LPWSTR lpszFilePath, _In_ DWORD cchFilePath, _In_ DWORD dwFlags); -MDBX_INTERNAL_VAR MDBX_GetFinalPathNameByHandleW mdbx_GetFinalPathNameByHandleW; +MDBX_INTERNAL_VAR_PROTO MDBX_GetFinalPathNameByHandleW + mdbx_GetFinalPathNameByHandleW; typedef BOOL(WINAPI *MDBX_SetFileInformationByHandle)( _In_ HANDLE hFile, _In_ FILE_INFO_BY_HANDLE_CLASS FileInformationClass, _Out_ LPVOID lpFileInformation, _In_ DWORD dwBufferSize); -MDBX_INTERNAL_VAR MDBX_SetFileInformationByHandle +MDBX_INTERNAL_VAR_PROTO MDBX_SetFileInformationByHandle mdbx_SetFileInformationByHandle; typedef NTSTATUS(NTAPI *MDBX_NtFsControlFile)( @@ -1834,10 +1842,10 @@ typedef NTSTATUS(NTAPI *MDBX_NtFsControlFile)( OUT PIO_STATUS_BLOCK IoStatusBlock, IN ULONG FsControlCode, IN OUT PVOID InputBuffer, IN ULONG InputBufferLength, OUT OPTIONAL PVOID OutputBuffer, IN ULONG OutputBufferLength); -MDBX_INTERNAL_VAR MDBX_NtFsControlFile mdbx_NtFsControlFile; +MDBX_INTERNAL_VAR_PROTO MDBX_NtFsControlFile mdbx_NtFsControlFile; typedef uint64_t(WINAPI *MDBX_GetTickCount64)(void); -MDBX_INTERNAL_VAR MDBX_GetTickCount64 mdbx_GetTickCount64; +MDBX_INTERNAL_VAR_PROTO MDBX_GetTickCount64 mdbx_GetTickCount64; #if !defined(_WIN32_WINNT_WIN8) || _WIN32_WINNT < _WIN32_WINNT_WIN8 typedef struct _WIN32_MEMORY_RANGE_ENTRY { @@ -1849,13 +1857,13 @@ typedef struct _WIN32_MEMORY_RANGE_ENTRY { typedef BOOL(WINAPI *MDBX_PrefetchVirtualMemory)( HANDLE hProcess, ULONG_PTR NumberOfEntries, PWIN32_MEMORY_RANGE_ENTRY VirtualAddresses, ULONG Flags); -MDBX_INTERNAL_VAR MDBX_PrefetchVirtualMemory mdbx_PrefetchVirtualMemory; +MDBX_INTERNAL_VAR_PROTO MDBX_PrefetchVirtualMemory mdbx_PrefetchVirtualMemory; typedef enum _SECTION_INHERIT { ViewShare = 1, ViewUnmap = 2 } SECTION_INHERIT; typedef NTSTATUS(NTAPI *MDBX_NtExtendSection)(IN HANDLE SectionHandle, IN PLARGE_INTEGER NewSectionSize); -MDBX_INTERNAL_VAR MDBX_NtExtendSection mdbx_NtExtendSection; +MDBX_INTERNAL_VAR_PROTO 
MDBX_NtExtendSection mdbx_NtExtendSection; static __inline bool mdbx_RunningUnderWine(void) { return !mdbx_NtExtendSection; @@ -1865,14 +1873,15 @@ typedef LSTATUS(WINAPI *MDBX_RegGetValueA)(HKEY hkey, LPCSTR lpSubKey, LPCSTR lpValue, DWORD dwFlags, LPDWORD pdwType, PVOID pvData, LPDWORD pcbData); -MDBX_INTERNAL_VAR MDBX_RegGetValueA mdbx_RegGetValueA; +MDBX_INTERNAL_VAR_PROTO MDBX_RegGetValueA mdbx_RegGetValueA; NTSYSAPI ULONG RtlRandomEx(PULONG Seed); typedef BOOL(WINAPI *MDBX_SetFileIoOverlappedRange)(HANDLE FileHandle, PUCHAR OverlappedRangeStart, ULONG Length); -MDBX_INTERNAL_VAR MDBX_SetFileIoOverlappedRange mdbx_SetFileIoOverlappedRange; +MDBX_INTERNAL_VAR_PROTO MDBX_SetFileIoOverlappedRange + mdbx_SetFileIoOverlappedRange; #endif /* Windows */ @@ -2102,7 +2111,7 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor; #error MDBX_DPL_PREALLOC_FOR_RADIXSORT must be defined as 0 or 1 #endif /* MDBX_DPL_PREALLOC_FOR_RADIXSORT */ -/** Controls dirty pages tracking, spilling and persisting in MDBX_WRITEMAP +/** Controls dirty pages tracking, spilling and persisting in `MDBX_WRITEMAP` * mode. 0/OFF = Don't track dirty pages at all, don't spill ones, and use * msync() to persist data. This is by-default on Linux and other systems where * kernel provides properly LRU tracking and effective flushing on-demand. 1/ON @@ -2119,6 +2128,22 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor; #error MDBX_AVOID_MSYNC must be defined as 0 or 1 #endif /* MDBX_AVOID_MSYNC */ +/** Управляет механизмом поддержки разреженных наборов DBI-хендлов для снижения + * накладных расходов при запуске и обработке транзакций. 
*/ +#ifndef MDBX_ENABLE_DBI_SPARSE +#define MDBX_ENABLE_DBI_SPARSE 1 +#elif !(MDBX_ENABLE_DBI_SPARSE == 0 || MDBX_ENABLE_DBI_SPARSE == 1) +#error MDBX_ENABLE_DBI_SPARSE must be defined as 0 or 1 +#endif /* MDBX_ENABLE_DBI_SPARSE */ + +/** Управляет механизмом отложенного освобождения и поддержки пути быстрого + * открытия DBI-хендлов без захвата блокировок. */ +#ifndef MDBX_ENABLE_DBI_LOCKFREE +#define MDBX_ENABLE_DBI_LOCKFREE 1 +#elif !(MDBX_ENABLE_DBI_LOCKFREE == 0 || MDBX_ENABLE_DBI_LOCKFREE == 1) +#error MDBX_ENABLE_DBI_LOCKFREE must be defined as 0 or 1 +#endif /* MDBX_ENABLE_DBI_LOCKFREE */ + /** Controls sort order of internal page number lists. * This mostly experimental/advanced option with not for regular MDBX users. * \warning The database format depend on this option and libmdbx built with @@ -2166,8 +2191,8 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor; /** If defined then enables integration with Valgrind, * a memory analyzing tool. */ -#ifndef MDBX_USE_VALGRIND -#endif /* MDBX_USE_VALGRIND */ +#ifndef ENABLE_MEMCHECK +#endif /* ENABLE_MEMCHECK */ /** If defined then enables use C11 atomics, * otherwise detects ones availability automatically. 
*/ @@ -2490,13 +2515,23 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor; #define MDBX_RUNTIME_FLAGS_INIT \ ((MDBX_DEBUG) > 0) * MDBX_DBG_ASSERT + ((MDBX_DEBUG) > 1) * MDBX_DBG_AUDIT -extern uint8_t runtime_flags; -extern uint8_t loglevel; -extern MDBX_debug_func *debug_logger; +union logger_union { + void *ptr; + MDBX_debug_func *fmt; + MDBX_debug_func_nofmt *nofmt; +}; + +MDBX_INTERNAL_VAR_PROTO struct mdbx_static { + uint8_t flags; + uint8_t loglevel; + union logger_union logger; + size_t logger_buffer_size; + char *logger_buffer; +} mdbx_static; MDBX_MAYBE_UNUSED static __inline void jitter4testing(bool tiny) { #if MDBX_DEBUG - if (MDBX_DBG_JITTER & runtime_flags) + if (MDBX_DBG_JITTER & mdbx_static.flags) osal_jitter(tiny); #else (void)tiny; @@ -2510,17 +2545,17 @@ MDBX_INTERNAL_FUNC void debug_log_va(int level, const char *function, int line, const char *fmt, va_list args); #if MDBX_DEBUG -#define LOG_ENABLED(msg) unlikely(msg <= loglevel) -#define AUDIT_ENABLED() unlikely((runtime_flags & MDBX_DBG_AUDIT)) +#define LOG_ENABLED(msg) unlikely(msg <= mdbx_static.loglevel) +#define AUDIT_ENABLED() unlikely((mdbx_static.flags & MDBX_DBG_AUDIT)) #else /* MDBX_DEBUG */ -#define LOG_ENABLED(msg) (msg < MDBX_LOG_VERBOSE && msg <= loglevel) +#define LOG_ENABLED(msg) (msg < MDBX_LOG_VERBOSE && msg <= mdbx_static.loglevel) #define AUDIT_ENABLED() (0) #endif /* MDBX_DEBUG */ #if MDBX_FORCE_ASSERTIONS #define ASSERT_ENABLED() (1) #elif MDBX_DEBUG -#define ASSERT_ENABLED() likely((runtime_flags & MDBX_DBG_ASSERT)) +#define ASSERT_ENABLED() likely((mdbx_static.flags & MDBX_DBG_ASSERT)) #else #define ASSERT_ENABLED() (0) #endif /* assertions */ @@ -2955,7 +2990,8 @@ typedef struct MDBX_page { #define PAGETYPE_WHOLE(p) ((uint8_t)(p)->mp_flags) -/* Drop legacy P_DIRTY flag for sub-pages for compatilibity */ +/* Drop legacy P_DIRTY flag for sub-pages for compatilibity, + * for assertions only. */ #define PAGETYPE_COMPAT(p) \ (unlikely(PAGETYPE_WHOLE(p) & P_SUBP) \ ? 
PAGETYPE_WHOLE(p) & ~(P_SUBP | P_LEGACY_DIRTY) \ @@ -3064,7 +3100,7 @@ typedef sem_t osal_ipclock_t; #endif /* MDBX_LOCKING */ #if MDBX_LOCKING > MDBX_LOCKING_SYSV && !defined(__cplusplus) -MDBX_INTERNAL_FUNC int osal_ipclock_stub(osal_ipclock_t *ipc); +MDBX_INTERNAL_FUNC int osal_ipclock_stubinit(osal_ipclock_t *ipc); MDBX_INTERNAL_FUNC int osal_ipclock_destroy(osal_ipclock_t *ipc); #endif /* MDBX_LOCKING */ @@ -3077,8 +3113,9 @@ MDBX_INTERNAL_FUNC int osal_ipclock_destroy(osal_ipclock_t *ipc); * read transactions started by the same thread need no further locking to * proceed. * - * If MDBX_NOTLS is set, the slot address is not saved in thread-specific data. - * No reader table is used if the database is on a read-only filesystem. + * If MDBX_NOSTICKYTHREADS is set, the slot address is not saved in + * thread-specific data. No reader table is used if the database is on a + * read-only filesystem. * * Since the database uses multi-version concurrency control, readers don't * actually need any locking. 
This table is used to keep track of which @@ -3388,10 +3425,10 @@ typedef struct troika { #if MDBX_WORDBITS > 32 /* Workaround for false-positives from Valgrind */ uint32_t unused_pad; #endif -#define TROIKA_HAVE_STEADY(troika) ((troika)->fsm & 7) -#define TROIKA_STRICT_VALID(troika) ((troika)->tail_and_flags & 64) -#define TROIKA_VALID(troika) ((troika)->tail_and_flags & 128) -#define TROIKA_TAIL(troika) ((troika)->tail_and_flags & 3) +#define TROIKA_HAVE_STEADY(troika) ((troika)->fsm & 7u) +#define TROIKA_STRICT_VALID(troika) ((troika)->tail_and_flags & 64u) +#define TROIKA_VALID(troika) ((troika)->tail_and_flags & 128u) +#define TROIKA_TAIL(troika) ((troika)->tail_and_flags & 3u) txnid_t txnid[NUM_METAS]; } meta_troika_t; @@ -3421,6 +3458,8 @@ struct MDBX_txn { #error "Oops, some txn flags overlapped or wrong" #endif uint32_t mt_flags; + unsigned mt_numdbs; + size_t mt_owner; /* thread ID that owns this transaction */ MDBX_txn *mt_parent; /* parent of a nested txn */ /* Nested txn under this txn, set together with flag MDBX_TXN_HAS_CHILD */ @@ -3438,31 +3477,30 @@ struct MDBX_txn { txnid_t mt_front; MDBX_env *mt_env; /* the DB environment */ - /* Array of records for each DB known in the environment. 
*/ - MDBX_dbx *mt_dbxs; /* Array of MDBX_db records for each known DB */ MDBX_db *mt_dbs; - /* Array of sequence numbers for each DB handle */ - MDBX_atomic_uint32_t *mt_dbiseqs; - - /* Transaction DBI Flags */ -#define DBI_DIRTY MDBX_DBI_DIRTY /* DB was written in this txn */ -#define DBI_STALE MDBX_DBI_STALE /* Named-DB record is older than txnID */ -#define DBI_FRESH MDBX_DBI_FRESH /* Named-DB handle opened in this txn */ -#define DBI_CREAT MDBX_DBI_CREAT /* Named-DB handle created in this txn */ -#define DBI_VALID 0x10 /* DB handle is valid, see also DB_VALID */ -#define DBI_USRVALID 0x20 /* As DB_VALID, but not set for FREE_DBI */ -#define DBI_AUDITED 0x40 /* Internal flag for accounting during audit */ - /* Array of flags for each DB */ - uint8_t *mt_dbistate; - /* Number of DB records in use, or 0 when the txn is finished. - * This number only ever increments until the txn finishes; we - * don't decrement it when individual DB handles are closed. */ - MDBX_dbi mt_numdbs; - size_t mt_owner; /* thread ID that owns this transaction */ + +#if MDBX_ENABLE_DBI_SPARSE + unsigned *__restrict mt_dbi_sparse; +#endif /* MDBX_ENABLE_DBI_SPARSE */ + + /* Non-shared DBI state flags inside transaction */ +#define DBI_DIRTY 0x01 /* DB was written in this txn */ +#define DBI_STALE 0x02 /* Named-DB record is older than txnID */ +#define DBI_FRESH 0x04 /* Named-DB handle opened in this txn */ +#define DBI_CREAT 0x08 /* Named-DB handle created in this txn */ +#define DBI_VALID 0x10 /* Handle is valid, see also DB_VALID */ +#define DBI_OLDEN 0x40 /* Handle was closed/reopened outside txn */ +#define DBI_LINDO 0x80 /* Lazy initialization done for DBI-slot */ + /* Array of non-shared txn's flags of DBI */ + uint8_t *__restrict mt_dbi_state; + + /* Array of sequence numbers for each DB handle. 
*/ + uint32_t *__restrict mt_dbi_seqs; + MDBX_cursor **mt_cursors; + MDBX_canary mt_canary; void *mt_userctx; /* User-settable context */ - MDBX_cursor **mt_cursors; union { struct { @@ -3472,8 +3510,8 @@ struct MDBX_txn { struct { meta_troika_t troika; /* In write txns, array of cursors for each DB */ - MDBX_PNL relist; /* Reclaimed GC pages */ - txnid_t last_reclaimed; /* ID of last used record */ + MDBX_PNL __restrict relist; /* Reclaimed GC pages */ + txnid_t last_reclaimed; /* ID of last used record */ #if MDBX_ENABLE_REFUND pgno_t loose_refund_wl /* FIXME: describe */; #endif /* MDBX_ENABLE_REFUND */ @@ -3485,14 +3523,14 @@ struct MDBX_txn { * dirtylist into mt_parent after freeing hidden mt_parent pages. */ size_t dirtyroom; /* For write txns: Modified pages. Sorted when not MDBX_WRITEMAP. */ - MDBX_dpl *dirtylist; + MDBX_dpl *__restrict dirtylist; /* The list of reclaimed txns from GC */ - MDBX_TXL lifo_reclaimed; + MDBX_TXL __restrict lifo_reclaimed; /* The list of pages that became unused during this transaction. */ - MDBX_PNL retired_pages; + MDBX_PNL __restrict retired_pages; /* The list of loose pages that became unused and may be reused * in this transaction, linked through `mp_next`. */ - MDBX_page *loose_pages; + MDBX_page *__restrict loose_pages; /* Number of loose pages (tw.loose_pages) */ size_t loose_count; union { @@ -3501,11 +3539,12 @@ struct MDBX_txn { /* The sorted list of dirty pages we temporarily wrote to disk * because the dirty list was full. page numbers in here are * shifted left by 1, deleted slots have the LSB set. 
*/ - MDBX_PNL list; + MDBX_PNL __restrict list; } spilled; size_t writemap_dirty_npages; size_t writemap_spilled_npages; }; + uint64_t gc_time_acc; } tw; }; }; @@ -3544,8 +3583,8 @@ struct MDBX_cursor { MDBX_db *mc_db; /* The database auxiliary record for this cursor */ MDBX_dbx *mc_dbx; - /* The mt_dbistate for this database */ - uint8_t *mc_dbistate; + /* The mt_dbi_state[] for this DBI */ + uint8_t *__restrict mc_dbi_state; uint8_t mc_snum; /* number of pushed pages */ uint8_t mc_top; /* index of top page, normally mc_snum-1 */ @@ -3598,6 +3637,11 @@ typedef struct MDBX_cursor_couple { MDBX_xcursor inner; } MDBX_cursor_couple; +struct mdbx_defer_free_item { + struct mdbx_defer_free_item *next; + uint64_t timestamp; +}; + /* The database environment. */ struct MDBX_env { /* ----------------------------------------------------- mostly static part */ @@ -3615,6 +3659,7 @@ struct MDBX_env { #define MDBX_DEPRECATED_COALESCE UINT32_C(0x2000000) #define ENV_INTERNAL_FLAGS (MDBX_FATAL_ERROR | MDBX_ENV_ACTIVE | MDBX_ENV_TXKEY) uint32_t me_flags; + unsigned me_psize; /* DB page size, initialized from me_os_psize */ osal_mmap_t me_dxb_mmap; /* The main data file */ #define me_map me_dxb_mmap.base #define me_lazy_fd me_dxb_mmap.fd @@ -3627,7 +3672,6 @@ struct MDBX_env { #define me_lfd me_lck_mmap.fd struct MDBX_lockinfo *me_lck; - unsigned me_psize; /* DB page size, initialized from me_os_psize */ uint16_t me_leaf_nodemax; /* max size of a leaf-node */ uint16_t me_branch_nodemax; /* max size of a branch-node */ uint16_t me_subpage_limit; @@ -3645,13 +3689,15 @@ struct MDBX_env { MDBX_dbi me_maxdbs; /* size of the DB table */ uint32_t me_pid; /* process ID of this env */ osal_thread_key_t me_txkey; /* thread-key for readers */ - pathchar_t *me_pathname; /* path to the DB files */ - void *me_pbuf; /* scratch area for DUPSORT put() */ - MDBX_txn *me_txn0; /* preallocated write transaction */ - - MDBX_dbx *me_dbxs; /* array of static DB info */ - uint16_t *me_dbflags; /* array 
of flags from MDBX_db.md_flags */ - MDBX_atomic_uint32_t *me_dbiseqs; /* array of dbi sequence numbers */ + struct { /* path to the DB files */ + pathchar_t *lck, *dxb, *specified; + void *buffer; + } me_pathname; + void *me_pbuf; /* scratch area for DUPSORT put() */ + MDBX_txn *me_txn0; /* preallocated write transaction */ + MDBX_dbx *me_dbxs; /* array of static DB info */ + uint16_t *__restrict me_db_flags; /* array of flags from MDBX_db.md_flags */ + MDBX_atomic_uint32_t *me_dbi_seqs; /* array of dbi sequence numbers */ unsigned me_maxgc_ov1page; /* Number of pgno_t fit in a single overflow page */ unsigned me_maxgc_per_branch; @@ -3665,6 +3711,7 @@ struct MDBX_env { unsigned rp_augment_limit; unsigned dp_limit; unsigned dp_initial; + uint64_t gc_time_limit; uint8_t dp_loose_limit; uint8_t spill_max_denominator; uint8_t spill_min_denominator; @@ -3674,6 +3721,8 @@ struct MDBX_env { unsigned writethrough_threshold; #endif /* Windows */ bool prefault_write; + bool prefer_waf_insteadof_balance; /* Strive to minimize WAF instead of + balancing pages fullment */ union { unsigned all; /* tracks options with non-auto values but tuned by user */ @@ -3703,20 +3752,23 @@ struct MDBX_env { } me_sysv_ipc; #endif /* MDBX_LOCKING == MDBX_LOCKING_SYSV */ bool me_incore; + bool me_prefault_write; - MDBX_env *me_lcklist_next; +#if MDBX_ENABLE_DBI_LOCKFREE + struct mdbx_defer_free_item *me_defer_free; +#endif /* MDBX_ENABLE_DBI_LOCKFREE */ /* --------------------------------------------------- mostly volatile part */ MDBX_txn *me_txn; /* current write transaction */ osal_fastmutex_t me_dbi_lock; - MDBX_dbi me_numdbs; /* number of DBs opened */ - bool me_prefault_write; + unsigned me_numdbs; /* number of DBs opened */ - MDBX_page *me_dp_reserve; /* list of malloc'ed blocks for re-use */ unsigned me_dp_reserve_len; + MDBX_page *__restrict me_dp_reserve; /* list of malloc'ed blocks for re-use */ + /* PNL of pages that became unused in a write txn */ - MDBX_PNL me_retired_pages; + 
MDBX_PNL __restrict me_retired_pages; osal_ioring_t me_ioring; #if defined(_WIN32) || defined(_WIN64) @@ -3734,13 +3786,12 @@ struct MDBX_env { #if MDBX_DEBUG MDBX_assert_func *me_assert_func; /* Callback for assertion failures */ #endif -#ifdef MDBX_USE_VALGRIND +#ifdef ENABLE_MEMCHECK int me_valgrind_handle; #endif -#if defined(MDBX_USE_VALGRIND) || defined(__SANITIZE_ADDRESS__) - MDBX_atomic_uint32_t me_ignore_EDEADLK; +#if defined(ENABLE_MEMCHECK) || defined(__SANITIZE_ADDRESS__) pgno_t me_poison_edge; -#endif /* MDBX_USE_VALGRIND || __SANITIZE_ADDRESS__ */ +#endif /* ENABLE_MEMCHECK || __SANITIZE_ADDRESS__ */ #ifndef xMDBX_DEBUG_SPILLING #define xMDBX_DEBUG_SPILLING 0 @@ -3800,10 +3851,6 @@ osal_flush_incoherent_mmap(const void *addr, size_t nbytes, MDBX_INTERNAL_FUNC int cleanup_dead_readers(MDBX_env *env, int rlocked, int *dead); -MDBX_INTERNAL_FUNC int rthc_alloc(osal_thread_key_t *key, MDBX_reader *begin, - MDBX_reader *end); -MDBX_INTERNAL_FUNC void rthc_remove(const osal_thread_key_t key); - MDBX_INTERNAL_FUNC void global_ctor(void); MDBX_INTERNAL_FUNC void osal_ctor(void); MDBX_INTERNAL_FUNC void global_dtor(void); @@ -3918,7 +3965,8 @@ typedef struct MDBX_node { /* mdbx_dbi_open() flags */ #define DB_USABLE_FLAGS (DB_PERSISTENT_FLAGS | MDBX_CREATE | MDBX_DB_ACCEDE) -#define DB_VALID 0x8000 /* DB handle is valid, for me_dbflags */ +#define DB_VALID 0x8000u /* DB handle is valid, for me_db_flags */ +#define DB_POISON 0x7fffu /* update pending */ #define DB_INTERNAL_FLAGS DB_VALID #if DB_INTERNAL_FLAGS & DB_USABLE_FLAGS @@ -4007,11 +4055,11 @@ log2n_powerof2(size_t value_uintptr) { * environment and re-opening it with the new flags. 
*/ #define ENV_CHANGEABLE_FLAGS \ (MDBX_SAFE_NOSYNC | MDBX_NOMETASYNC | MDBX_DEPRECATED_MAPASYNC | \ - MDBX_NOMEMINIT | MDBX_COALESCE | MDBX_PAGEPERTURB | MDBX_ACCEDE | \ - MDBX_VALIDATION) + MDBX_NOMEMINIT | MDBX_DEPRECATED_COALESCE | MDBX_PAGEPERTURB | \ + MDBX_ACCEDE | MDBX_VALIDATION) #define ENV_CHANGELESS_FLAGS \ - (MDBX_NOSUBDIR | MDBX_RDONLY | MDBX_WRITEMAP | MDBX_NOTLS | MDBX_NORDAHEAD | \ - MDBX_LIFORECLAIM | MDBX_EXCLUSIVE) + (MDBX_NOSUBDIR | MDBX_RDONLY | MDBX_WRITEMAP | MDBX_NOSTICKYTHREADS | \ + MDBX_NORDAHEAD | MDBX_LIFORECLAIM | MDBX_EXCLUSIVE) #define ENV_USABLE_FLAGS (ENV_CHANGEABLE_FLAGS | ENV_CHANGELESS_FLAGS) #if !defined(__cplusplus) || CONSTEXPR_ENUM_FLAGS_OPERATIONS @@ -4044,8 +4092,38 @@ MDBX_MAYBE_UNUSED static void static_checks(void) { (size_t)(size), __LINE__); \ ASAN_UNPOISON_MEMORY_REGION(addr, size); \ } while (0) + +/******************************************************************************/ + +/** \brief Page types for traverse the b-tree. + * \see mdbx_env_pgwalk() \see MDBX_pgvisitor_func */ +enum MDBX_page_type_t { + MDBX_page_broken, + MDBX_page_large, + MDBX_page_branch, + MDBX_page_leaf, + MDBX_page_dupfixed_leaf, + MDBX_subpage_leaf, + MDBX_subpage_dupfixed_leaf, + MDBX_subpage_broken, +}; +typedef enum MDBX_page_type_t MDBX_page_type_t; + +typedef struct MDBX_walk_sdb { + MDBX_val name; + struct MDBX_db *internal, *nested; +} MDBX_walk_sdb_t; + +/** \brief Callback function for traverse the b-tree. \see mdbx_env_pgwalk() */ +typedef int +MDBX_pgvisitor_func(const size_t pgno, const unsigned number, void *const ctx, + const int deep, const MDBX_walk_sdb_t *subdb, + const size_t page_size, const MDBX_page_type_t page_type, + const MDBX_error_t err, const size_t nentries, + const size_t payload_bytes, const size_t header_bytes, + const size_t unused_bytes); /* - * Copyright 2015-2023 Leonid Yuriev . + * Copyright 2015-2024 Leonid Yuriev . * and other libmdbx authors: please see AUTHORS file. * All rights reserved. 
* @@ -4503,6 +4581,19 @@ static __inline size_t keysize_max(size_t pagesize, MDBX_db_flags_t flags) { return max_branch_key; } +static __inline size_t keysize_min(MDBX_db_flags_t flags) { + return (flags & MDBX_INTEGERKEY) ? 4 /* sizeof(uint32_t) */ : 0; +} + +static __inline size_t valsize_min(MDBX_db_flags_t flags) { + if (flags & MDBX_INTEGERDUP) + return 4 /* sizeof(uint32_t) */; + else if (flags & MDBX_DUPFIXED) + return sizeof(indx_t); + else + return 0; +} + static __inline size_t valsize_max(size_t pagesize, MDBX_db_flags_t flags) { assert(pagesize >= MIN_PAGESIZE && pagesize <= MAX_PAGESIZE && is_powerof2(pagesize)); @@ -4555,6 +4646,10 @@ __cold intptr_t mdbx_limits_keysize_max(intptr_t pagesize, return keysize_max(pagesize, flags); } +__cold intptr_t mdbx_limits_keysize_min(MDBX_db_flags_t flags) { + return keysize_min(flags); +} + __cold int mdbx_env_get_maxvalsize_ex(const MDBX_env *env, MDBX_db_flags_t flags) { if (unlikely(!env || env->me_signature.weak != MDBX_ME_SIGNATURE)) @@ -4575,6 +4670,10 @@ __cold intptr_t mdbx_limits_valsize_max(intptr_t pagesize, return valsize_max(pagesize, flags); } +__cold intptr_t mdbx_limits_valsize_min(MDBX_db_flags_t flags) { + return valsize_min(flags); +} + __cold intptr_t mdbx_limits_pairsize4page_max(intptr_t pagesize, MDBX_db_flags_t flags) { if (pagesize < 1) @@ -5173,10 +5272,12 @@ MDBX_MAYBE_UNUSED static /*----------------------------------------------------------------------------*/ /* rthc (tls keys and destructors) */ +static int rthc_register(MDBX_env *const env); +static int rthc_remove(MDBX_env *const env); +static int rthc_uniq_check(const osal_mmap_t *pending, MDBX_env **found); + typedef struct rthc_entry_t { - MDBX_reader *begin; - MDBX_reader *end; - osal_thread_key_t thr_tls_key; + MDBX_env *env; } rthc_entry_t; #if MDBX_DEBUG @@ -5189,10 +5290,8 @@ static bin128_t bootid; #if defined(_WIN32) || defined(_WIN64) static CRITICAL_SECTION rthc_critical_section; -static CRITICAL_SECTION 
lcklist_critical_section; #else -static pthread_mutex_t lcklist_mutex = PTHREAD_MUTEX_INITIALIZER; static pthread_mutex_t rthc_mutex = PTHREAD_MUTEX_INITIALIZER; static pthread_cond_t rthc_cond = PTHREAD_COND_INITIALIZER; static osal_thread_key_t rthc_key; @@ -5391,17 +5490,24 @@ static void thread_rthc_set(osal_thread_key_t key, const void *value) { /* dtor called for thread, i.e. for all mdbx's environment objects */ __cold void thread_dtor(void *rthc) { rthc_lock(); - TRACE(">> pid %d, thread 0x%" PRIxPTR ", rthc %p", osal_getpid(), + const uint32_t current_pid = osal_getpid(); + TRACE(">> pid %d, thread 0x%" PRIxPTR ", rthc %p", current_pid, osal_thread_self(), rthc); - const uint32_t self_pid = osal_getpid(); for (size_t i = 0; i < rthc_count; ++i) { - const osal_thread_key_t key = rthc_table[i].thr_tls_key; - MDBX_reader *const reader = thread_rthc_get(key); - if (reader < rthc_table[i].begin || reader >= rthc_table[i].end) + MDBX_env *const env = rthc_table[i].env; + if (env->me_pid != current_pid) + continue; + if (!(env->me_flags & MDBX_ENV_TXKEY)) + continue; + MDBX_reader *const reader = thread_rthc_get(env->me_txkey); + MDBX_reader *const begin = &env->me_lck_mmap.lck->mti_readers[0]; + MDBX_reader *const end = + &env->me_lck_mmap.lck->mti_readers[env->me_maxreaders]; + if (reader < begin || reader >= end) continue; #if !defined(_WIN32) && !defined(_WIN64) - if (pthread_setspecific(key, nullptr) != 0) { + if (pthread_setspecific(env->me_txkey, nullptr) != 0) { TRACE("== thread 0x%" PRIxPTR ", rthc %p: ignore race with tsd-key deletion", osal_thread_self(), __Wpedantic_format_voidptr(reader)); @@ -5413,13 +5519,13 @@ __cold void thread_dtor(void *rthc) { ", rthc %p, [%zi], %p ... 
%p (%+i), rtch-pid %i, " "current-pid %i", osal_thread_self(), __Wpedantic_format_voidptr(reader), i, - __Wpedantic_format_voidptr(rthc_table[i].begin), - __Wpedantic_format_voidptr(rthc_table[i].end), - (int)(reader - rthc_table[i].begin), reader->mr_pid.weak, self_pid); - if (atomic_load32(&reader->mr_pid, mo_Relaxed) == self_pid) { + __Wpedantic_format_voidptr(begin), __Wpedantic_format_voidptr(end), + (int)(reader - begin), reader->mr_pid.weak, current_pid); + if (atomic_load32(&reader->mr_pid, mo_Relaxed) == current_pid) { TRACE("==== thread 0x%" PRIxPTR ", rthc %p, cleanup", osal_thread_self(), __Wpedantic_format_voidptr(reader)); - (void)atomic_cas32(&reader->mr_pid, self_pid, 0); + (void)atomic_cas32(&reader->mr_pid, current_pid, 0); + atomic_store32(&env->me_lck->mti_readers_refresh_flag, true, mo_Relaxed); } } @@ -5462,16 +5568,21 @@ __cold void thread_dtor(void *rthc) { #endif } +MDBX_INTERNAL_VAR_INSTA struct mdbx_static mdbx_static = { + MDBX_RUNTIME_FLAGS_INIT, MDBX_LOG_FATAL, {nullptr}, 0, nullptr}; +static osal_fastmutex_t debug_lock; + MDBX_EXCLUDE_FOR_GPROF __cold void global_dtor(void) { - TRACE(">> pid %d", osal_getpid()); + const uint32_t current_pid = osal_getpid(); + TRACE(">> pid %d", current_pid); rthc_lock(); #if !defined(_WIN32) && !defined(_WIN64) uint64_t *rthc = pthread_getspecific(rthc_key); TRACE("== thread 0x%" PRIxPTR ", rthc %p, pid %d, self-status 0x%08" PRIx64 ", left %d", - osal_thread_self(), __Wpedantic_format_voidptr(rthc), osal_getpid(), + osal_thread_self(), __Wpedantic_format_voidptr(rthc), current_pid, rthc ? 
rthc_read(rthc) : ~UINT64_C(0), atomic_load32(&rthc_pending, mo_Relaxed)); if (rthc) { @@ -5482,20 +5593,20 @@ __cold void global_dtor(void) { rthc_compare_and_clean(rthc, sign_registered)) { TRACE("== thread 0x%" PRIxPTR ", rthc %p, pid %d, self-status %s (0x%08" PRIx64 ")", - osal_thread_self(), __Wpedantic_format_voidptr(rthc), osal_getpid(), + osal_thread_self(), __Wpedantic_format_voidptr(rthc), current_pid, "registered", state); } else if (state == sign_counted && rthc_compare_and_clean(rthc, sign_counted)) { TRACE("== thread 0x%" PRIxPTR ", rthc %p, pid %d, self-status %s (0x%08" PRIx64 ")", - osal_thread_self(), __Wpedantic_format_voidptr(rthc), osal_getpid(), + osal_thread_self(), __Wpedantic_format_voidptr(rthc), current_pid, "counted", state); ENSURE(nullptr, atomic_sub32(&rthc_pending, 1) > 0); } else { WARNING("thread 0x%" PRIxPTR ", rthc %p, pid %d, self-status %s (0x%08" PRIx64 ")", - osal_thread_self(), __Wpedantic_format_voidptr(rthc), - osal_getpid(), "wrong", state); + osal_thread_self(), __Wpedantic_format_voidptr(rthc), current_pid, + "wrong", state); } } @@ -5512,7 +5623,7 @@ __cold void global_dtor(void) { for (unsigned left; (left = atomic_load32(&rthc_pending, mo_AcquireRelease)) > 0;) { - NOTICE("tls-cleanup: pid %d, pending %u, wait for...", osal_getpid(), left); + NOTICE("tls-cleanup: pid %d, pending %u, wait for...", current_pid, left); const int rc = pthread_cond_timedwait(&rthc_cond, &rthc_mutex, &abstime); if (rc && rc != EINTR) break; @@ -5520,23 +5631,31 @@ __cold void global_dtor(void) { thread_key_delete(rthc_key); #endif - const uint32_t self_pid = osal_getpid(); for (size_t i = 0; i < rthc_count; ++i) { - const osal_thread_key_t key = rthc_table[i].thr_tls_key; - thread_key_delete(key); - for (MDBX_reader *rthc = rthc_table[i].begin; rthc < rthc_table[i].end; - ++rthc) { + MDBX_env *const env = rthc_table[i].env; + if (env->me_pid != current_pid) + continue; + if (!(env->me_flags & MDBX_ENV_TXKEY)) + continue; + MDBX_reader 
*const begin = &env->me_lck_mmap.lck->mti_readers[0]; + MDBX_reader *const end = + &env->me_lck_mmap.lck->mti_readers[env->me_maxreaders]; + thread_key_delete(env->me_txkey); + bool cleaned = false; + for (MDBX_reader *reader = begin; reader < end; ++reader) { TRACE("== [%zi] = key %" PRIuPTR ", %p ... %p, rthc %p (%+i), " "rthc-pid %i, current-pid %i", - i, (uintptr_t)key, __Wpedantic_format_voidptr(rthc_table[i].begin), - __Wpedantic_format_voidptr(rthc_table[i].end), - __Wpedantic_format_voidptr(rthc), (int)(rthc - rthc_table[i].begin), - rthc->mr_pid.weak, self_pid); - if (atomic_load32(&rthc->mr_pid, mo_Relaxed) == self_pid) { - atomic_store32(&rthc->mr_pid, 0, mo_AcquireRelease); - TRACE("== cleanup %p", __Wpedantic_format_voidptr(rthc)); + i, (uintptr_t)env->me_txkey, __Wpedantic_format_voidptr(begin), + __Wpedantic_format_voidptr(end), __Wpedantic_format_voidptr(reader), + (int)(reader - begin), reader->mr_pid.weak, current_pid); + if (atomic_load32(&reader->mr_pid, mo_Relaxed) == current_pid) { + (void)atomic_cas32(&reader->mr_pid, current_pid, 0); + TRACE("== cleanup %p", __Wpedantic_format_voidptr(reader)); + cleaned = true; } } + if (cleaned) + atomic_store32(&env->me_lck->mti_readers_refresh_flag, true, mo_Relaxed); } rthc_limit = rthc_count = 0; @@ -5546,7 +5665,6 @@ __cold void global_dtor(void) { rthc_unlock(); #if defined(_WIN32) || defined(_WIN64) - DeleteCriticalSection(&lcklist_critical_section); DeleteCriticalSection(&rthc_critical_section); #else /* LY: yielding a few timeslices to give a more chance @@ -5555,24 +5673,27 @@ __cold void global_dtor(void) { #endif osal_dtor(); - TRACE("<< pid %d\n", osal_getpid()); + TRACE("<< pid %d\n", current_pid); + ENSURE(nullptr, osal_fastmutex_destroy(&debug_lock) == 0); } -__cold int rthc_alloc(osal_thread_key_t *pkey, MDBX_reader *begin, - MDBX_reader *end) { - assert(pkey != NULL); -#ifndef NDEBUG - *pkey = (osal_thread_key_t)0xBADBADBAD; -#endif /* NDEBUG */ +__cold int rthc_register(MDBX_env *const 
env) { + TRACE(">> env %p, rthc_count %u, rthc_limit %u", + __Wpedantic_format_voidptr(env), rthc_count, rthc_limit); - rthc_lock(); - TRACE(">> rthc_count %u, rthc_limit %u", rthc_count, rthc_limit); - int rc; - if (rthc_count == rthc_limit) { + int rc = MDBX_SUCCESS; + for (size_t i = 0; i < rthc_count; ++i) + if (unlikely(rthc_table[i].env == env)) { + rc = MDBX_PANIC; + goto bailout; + } + + env->me_txkey = 0; + if (unlikely(rthc_count == rthc_limit)) { rthc_entry_t *new_table = osal_realloc((rthc_table == rthc_table_static) ? nullptr : rthc_table, sizeof(rthc_entry_t) * rthc_limit * 2); - if (new_table == nullptr) { + if (unlikely(new_table == nullptr)) { rc = MDBX_ENOMEM; goto bailout; } @@ -5582,84 +5703,92 @@ __cold int rthc_alloc(osal_thread_key_t *pkey, MDBX_reader *begin, rthc_limit *= 2; } - rc = thread_key_create(&rthc_table[rthc_count].thr_tls_key); - if (rc != MDBX_SUCCESS) - goto bailout; - - *pkey = rthc_table[rthc_count].thr_tls_key; - TRACE("== [%i] = key %" PRIuPTR ", %p ... 
%p", rthc_count, (uintptr_t)*pkey, - __Wpedantic_format_voidptr(begin), __Wpedantic_format_voidptr(end)); + if ((env->me_flags & MDBX_NOSTICKYTHREADS) == 0) { + rc = thread_key_create(&env->me_txkey); + if (unlikely(rc != MDBX_SUCCESS)) + goto bailout; + env->me_flags |= MDBX_ENV_TXKEY; + } - rthc_table[rthc_count].begin = begin; - rthc_table[rthc_count].end = end; + rthc_table[rthc_count].env = env; + TRACE("== [%i] = env %p, key %" PRIuPTR, rthc_count, + __Wpedantic_format_voidptr(env), (uintptr_t)env->me_txkey); ++rthc_count; - TRACE("<< key %" PRIuPTR ", rthc_count %u, rthc_limit %u", (uintptr_t)*pkey, - rthc_count, rthc_limit); - rthc_unlock(); - return MDBX_SUCCESS; bailout: - rthc_unlock(); + TRACE("<< env %p, key %" PRIuPTR ", rthc_count %u, rthc_limit %u, rc %d", + __Wpedantic_format_voidptr(env), (uintptr_t)env->me_txkey, rthc_count, + rthc_limit, rc); return rc; } +__cold static int rthc_drown(MDBX_env *const env) { + const uint32_t current_pid = osal_getpid(); + int rc = MDBX_SUCCESS; + MDBX_env *inprocess_neighbor = nullptr; + if (likely(env->me_lck_mmap.lck && current_pid == env->me_pid)) { + MDBX_reader *const begin = &env->me_lck_mmap.lck->mti_readers[0]; + MDBX_reader *const end = + &env->me_lck_mmap.lck->mti_readers[env->me_maxreaders]; + TRACE("== %s env %p pid %d, readers %p ...%p, current-pid %d", + (current_pid == env->me_pid) ? 
"cleanup" : "skip", + __Wpedantic_format_voidptr(env), env->me_pid, + __Wpedantic_format_voidptr(begin), __Wpedantic_format_voidptr(end), + current_pid); + bool cleaned = false; + for (MDBX_reader *r = begin; r < end; ++r) { + if (atomic_load32(&r->mr_pid, mo_Relaxed) == current_pid) { + atomic_store32(&r->mr_pid, 0, mo_AcquireRelease); + TRACE("== cleanup %p", __Wpedantic_format_voidptr(r)); + cleaned = true; + } + } + if (cleaned) + atomic_store32(&env->me_lck_mmap.lck->mti_readers_refresh_flag, true, + mo_Relaxed); + rc = rthc_uniq_check(&env->me_lck_mmap, &inprocess_neighbor); + if (!inprocess_neighbor && env->me_live_reader && + env->me_lfd != INVALID_HANDLE_VALUE) { + int err = osal_rpid_clear(env); + rc = rc ? rc : err; + } + } + int err = osal_lck_destroy(env, inprocess_neighbor, current_pid); + env->me_pid = 0; + return rc ? rc : err; +} -__cold void rthc_remove(const osal_thread_key_t key) { - thread_key_delete(key); - rthc_lock(); - TRACE(">> key %zu, rthc_count %u, rthc_limit %u", (uintptr_t)key, rthc_count, +__cold static int rthc_remove(MDBX_env *const env) { + TRACE(">>> env %p, key %zu, rthc_count %u, rthc_limit %u", + __Wpedantic_format_voidptr(env), (uintptr_t)env->me_txkey, rthc_count, rthc_limit); + int rc = MDBX_SUCCESS; + if (likely(env->me_pid)) + rc = rthc_drown(env); + for (size_t i = 0; i < rthc_count; ++i) { - if (key == rthc_table[i].thr_tls_key) { - const uint32_t self_pid = osal_getpid(); - TRACE("== [%zi], %p ...%p, current-pid %d", i, - __Wpedantic_format_voidptr(rthc_table[i].begin), - __Wpedantic_format_voidptr(rthc_table[i].end), self_pid); - - for (MDBX_reader *rthc = rthc_table[i].begin; rthc < rthc_table[i].end; - ++rthc) { - if (atomic_load32(&rthc->mr_pid, mo_Relaxed) == self_pid) { - atomic_store32(&rthc->mr_pid, 0, mo_AcquireRelease); - TRACE("== cleanup %p", __Wpedantic_format_voidptr(rthc)); - } - } + if (rthc_table[i].env == env) { if (--rthc_count > 0) rthc_table[i] = rthc_table[rthc_count]; else if (rthc_table != 
rthc_table_static) { - osal_free(rthc_table); + void *tmp = rthc_table; rthc_table = rthc_table_static; rthc_limit = RTHC_INITIAL_LIMIT; + osal_memory_barrier(); + osal_free(tmp); } break; } } - TRACE("<< key %zu, rthc_count %u, rthc_limit %u", (size_t)key, rthc_count, + TRACE("<<< %p, key %zu, rthc_count %u, rthc_limit %u", + __Wpedantic_format_voidptr(env), (uintptr_t)env->me_txkey, rthc_count, rthc_limit); - rthc_unlock(); + return rc; } //------------------------------------------------------------------------------ -#define RTHC_ENVLIST_END ((MDBX_env *)((uintptr_t)50459)) -static MDBX_env *inprocess_lcklist_head = RTHC_ENVLIST_END; - -static __inline void lcklist_lock(void) { -#if defined(_WIN32) || defined(_WIN64) - EnterCriticalSection(&lcklist_critical_section); -#else - ENSURE(nullptr, osal_pthread_mutex_lock(&lcklist_mutex) == 0); -#endif -} - -static __inline void lcklist_unlock(void) { -#if defined(_WIN32) || defined(_WIN64) - LeaveCriticalSection(&lcklist_critical_section); -#else - ENSURE(nullptr, pthread_mutex_unlock(&lcklist_mutex) == 0); -#endif -} - MDBX_NOTHROW_CONST_FUNCTION static uint64_t rrxmrrxmsx_0(uint64_t v) { /* Pelle Evensen's mixer, https://bit.ly/2HOfynt */ v ^= (v << 39 | v >> 25) ^ (v << 14 | v >> 50); @@ -5712,13 +5841,16 @@ static int uniq_poke(const osal_mmap_t *pending, osal_mmap_t *scan, return uniq_peek(pending, scan); } -__cold static int uniq_check(const osal_mmap_t *pending, MDBX_env **found) { +__cold static int rthc_uniq_check(const osal_mmap_t *pending, + MDBX_env **found) { *found = nullptr; uint64_t salt = 0; - for (MDBX_env *scan = inprocess_lcklist_head; scan != RTHC_ENVLIST_END; - scan = scan->me_lcklist_next) { - MDBX_lockinfo *const scan_lck = scan->me_lck_mmap.lck; - int err = atomic_load64(&scan_lck->mti_bait_uniqueness, mo_AcquireRelease) + for (size_t i = 0; i < rthc_count; ++i) { + MDBX_env *const scan = rthc_table[i].env; + if (!scan->me_lck_mmap.lck || &scan->me_lck_mmap == pending) + continue; + int err = 
atomic_load64(&scan->me_lck_mmap.lck->mti_bait_uniqueness, + mo_AcquireRelease) ? uniq_peek(pending, &scan->me_lck_mmap) : uniq_poke(pending, &scan->me_lck_mmap, &salt); if (err == MDBX_ENODATA) { @@ -5726,8 +5858,8 @@ __cold static int uniq_check(const osal_mmap_t *pending, MDBX_env **found) { if (likely(osal_filesize(pending->fd, &length) == MDBX_SUCCESS && length == 0)) { /* LY: skip checking since LCK-file is empty, i.e. just created. */ - DEBUG("uniq-probe: %s", "unique (new/empty lck)"); - return MDBX_RESULT_TRUE; + DEBUG("%s", "unique (new/empty lck)"); + return MDBX_SUCCESS; } } if (err == MDBX_RESULT_TRUE) @@ -5740,44 +5872,17 @@ __cold static int uniq_check(const osal_mmap_t *pending, MDBX_env **found) { if (err == MDBX_RESULT_TRUE) { err = uniq_poke(pending, &scan->me_lck_mmap, &salt); *found = scan; - DEBUG("uniq-probe: found %p", __Wpedantic_format_voidptr(*found)); - return MDBX_RESULT_FALSE; + DEBUG("found %p", __Wpedantic_format_voidptr(*found)); + return MDBX_SUCCESS; } if (unlikely(err != MDBX_SUCCESS)) { - DEBUG("uniq-probe: failed rc %d", err); + DEBUG("failed rc %d", err); return err; } } - DEBUG("uniq-probe: %s", "unique"); - return MDBX_RESULT_TRUE; -} - -static int lcklist_detach_locked(MDBX_env *env) { - MDBX_env *inprocess_neighbor = nullptr; - int rc = MDBX_SUCCESS; - if (env->me_lcklist_next != nullptr) { - ENSURE(env, env->me_lcklist_next != nullptr); - ENSURE(env, inprocess_lcklist_head != RTHC_ENVLIST_END); - for (MDBX_env **ptr = &inprocess_lcklist_head; *ptr != RTHC_ENVLIST_END; - ptr = &(*ptr)->me_lcklist_next) { - if (*ptr == env) { - *ptr = env->me_lcklist_next; - env->me_lcklist_next = nullptr; - break; - } - } - ENSURE(env, env->me_lcklist_next == nullptr); - } - - rc = likely(osal_getpid() == env->me_pid) - ? 
uniq_check(&env->me_lck_mmap, &inprocess_neighbor) - : MDBX_PANIC; - if (!inprocess_neighbor && env->me_live_reader) - (void)osal_rpid_clear(env); - if (!MDBX_IS_ERROR(rc)) - rc = osal_lck_destroy(env, inprocess_neighbor); - return rc; + DEBUG("%s", "unique"); + return MDBX_SUCCESS; } /*------------------------------------------------------------------------------ @@ -6306,7 +6411,7 @@ static void pnl_free(MDBX_PNL pl) { } /* Shrink the PNL to the default size if it has grown larger */ -static void pnl_shrink(MDBX_PNL *ppl) { +static void pnl_shrink(MDBX_PNL __restrict *__restrict ppl) { assert(pnl_bytes2size(pnl_size2bytes(MDBX_PNL_INITIAL)) >= MDBX_PNL_INITIAL && pnl_bytes2size(pnl_size2bytes(MDBX_PNL_INITIAL)) < MDBX_PNL_INITIAL * 3 / 2); @@ -6329,7 +6434,8 @@ static void pnl_shrink(MDBX_PNL *ppl) { } /* Grow the PNL to the size growed to at least given size */ -static int pnl_reserve(MDBX_PNL *ppl, const size_t wanna) { +static int pnl_reserve(MDBX_PNL __restrict *__restrict ppl, + const size_t wanna) { const size_t allocated = MDBX_PNL_ALLOCLEN(*ppl); assert(MDBX_PNL_GETSIZE(*ppl) <= MDBX_PGL_LIMIT && MDBX_PNL_ALLOCLEN(*ppl) >= MDBX_PNL_GETSIZE(*ppl)); @@ -6359,8 +6465,8 @@ static int pnl_reserve(MDBX_PNL *ppl, const size_t wanna) { } /* Make room for num additional elements in an PNL */ -static __always_inline int __must_check_result pnl_need(MDBX_PNL *ppl, - size_t num) { +static __always_inline int __must_check_result +pnl_need(MDBX_PNL __restrict *__restrict ppl, size_t num) { assert(MDBX_PNL_GETSIZE(*ppl) <= MDBX_PGL_LIMIT && MDBX_PNL_ALLOCLEN(*ppl) >= MDBX_PNL_GETSIZE(*ppl)); assert(num <= MDBX_PGL_LIMIT); @@ -6369,7 +6475,7 @@ static __always_inline int __must_check_result pnl_need(MDBX_PNL *ppl, : pnl_reserve(ppl, wanna); } -static __always_inline void pnl_xappend(MDBX_PNL pl, pgno_t pgno) { +static __always_inline void pnl_xappend(__restrict MDBX_PNL pl, pgno_t pgno) { assert(MDBX_PNL_GETSIZE(pl) < MDBX_PNL_ALLOCLEN(pl)); if (AUDIT_ENABLED()) { for 
(size_t i = MDBX_PNL_GETSIZE(pl); i > 0; --i) @@ -6380,10 +6486,8 @@ static __always_inline void pnl_xappend(MDBX_PNL pl, pgno_t pgno) { } /* Append an pgno range onto an unsorted PNL */ -__always_inline static int __must_check_result pnl_append_range(bool spilled, - MDBX_PNL *ppl, - pgno_t pgno, - size_t n) { +__always_inline static int __must_check_result pnl_append_range( + bool spilled, __restrict MDBX_PNL *ppl, pgno_t pgno, size_t n) { assert(n > 0); int rc = pnl_need(ppl, n); if (unlikely(rc != MDBX_SUCCESS)) @@ -6410,7 +6514,7 @@ __always_inline static int __must_check_result pnl_append_range(bool spilled, } /* Append an pgno range into the sorted PNL */ -__hot static int __must_check_result pnl_insert_range(MDBX_PNL *ppl, +__hot static int __must_check_result pnl_insert_range(__restrict MDBX_PNL *ppl, pgno_t pgno, size_t n) { assert(n > 0); int rc = pnl_need(ppl, n); @@ -6714,7 +6818,8 @@ static void txl_free(MDBX_TXL tl) { osal_free(tl - 1); } -static int txl_reserve(MDBX_TXL *ptl, const size_t wanna) { +static int txl_reserve(MDBX_TXL __restrict *__restrict ptl, + const size_t wanna) { const size_t allocated = (size_t)MDBX_PNL_ALLOCLEN(*ptl); assert(MDBX_PNL_GETSIZE(*ptl) <= MDBX_TXL_MAX && MDBX_PNL_ALLOCLEN(*ptl) >= MDBX_PNL_GETSIZE(*ptl)); @@ -6743,8 +6848,8 @@ static int txl_reserve(MDBX_TXL *ptl, const size_t wanna) { return MDBX_ENOMEM; } -static __always_inline int __must_check_result txl_need(MDBX_TXL *ptl, - size_t num) { +static __always_inline int __must_check_result +txl_need(MDBX_TXL __restrict *__restrict ptl, size_t num) { assert(MDBX_PNL_GETSIZE(*ptl) <= MDBX_TXL_MAX && MDBX_PNL_ALLOCLEN(*ptl) >= MDBX_PNL_GETSIZE(*ptl)); assert(num <= MDBX_PGL_LIMIT); @@ -6753,7 +6858,7 @@ static __always_inline int __must_check_result txl_need(MDBX_TXL *ptl, : txl_reserve(ptl, wanna); } -static __always_inline void txl_xappend(MDBX_TXL tl, txnid_t id) { +static __always_inline void txl_xappend(MDBX_TXL __restrict tl, txnid_t id) { 
assert(MDBX_PNL_GETSIZE(tl) < MDBX_PNL_ALLOCLEN(tl)); tl[0] += 1; MDBX_PNL_LAST(tl) = id; @@ -6765,7 +6870,8 @@ static void txl_sort(MDBX_TXL tl) { txnid_sort(MDBX_PNL_BEGIN(tl), MDBX_PNL_END(tl)); } -static int __must_check_result txl_append(MDBX_TXL *ptl, txnid_t id) { +static int __must_check_result txl_append(MDBX_TXL __restrict *ptl, + txnid_t id) { if (unlikely(MDBX_PNL_GETSIZE(*ptl) == MDBX_PNL_ALLOCLEN(*ptl))) { int rc = txl_need(ptl, MDBX_TXL_GRANULATE); if (unlikely(rc != MDBX_SUCCESS)) @@ -7251,10 +7357,6 @@ static __always_inline int __must_check_result dpl_append(MDBX_txn *txn, /*----------------------------------------------------------------------------*/ -uint8_t runtime_flags = MDBX_RUNTIME_FLAGS_INIT; -uint8_t loglevel = MDBX_LOG_FATAL; -MDBX_debug_func *debug_logger; - static __must_check_result __inline int page_retire(MDBX_cursor *mc, MDBX_page *mp); @@ -7273,26 +7375,26 @@ static int page_touch(MDBX_cursor *mc); static int cursor_touch(MDBX_cursor *const mc, const MDBX_val *key, const MDBX_val *data); -#define MDBX_END_NAMES \ +#define TXN_END_NAMES \ { \ "committed", "empty-commit", "abort", "reset", "reset-tmp", "fail-begin", \ "fail-beginchild" \ } enum { /* txn_end operation number, for logging */ - MDBX_END_COMMITTED, - MDBX_END_PURE_COMMIT, - MDBX_END_ABORT, - MDBX_END_RESET, - MDBX_END_RESET_TMP, - MDBX_END_FAIL_BEGIN, - MDBX_END_FAIL_BEGINCHILD + TXN_END_COMMITTED, + TXN_END_PURE_COMMIT, + TXN_END_ABORT, + TXN_END_RESET, + TXN_END_RESET_TMP, + TXN_END_FAIL_BEGIN, + TXN_END_FAIL_BEGINCHILD }; -#define MDBX_END_OPMASK 0x0F /* mask for txn_end() operation number */ -#define MDBX_END_UPDATE 0x10 /* update env state (DBIs) */ -#define MDBX_END_FREE 0x20 /* free txn unless it is MDBX_env.me_txn0 */ -#define MDBX_END_EOTDONE 0x40 /* txn's cursors already closed */ -#define MDBX_END_SLOT 0x80 /* release any reader slot if MDBX_NOTLS */ +#define TXN_END_OPMASK 0x0F /* mask for txn_end() operation number */ +#define TXN_END_UPDATE 0x10 /* update 
env state (DBIs) */ +#define TXN_END_FREE 0x20 /* free txn unless it is MDBX_env.me_txn0 */ +#define TXN_END_EOTDONE 0x40 /* txn's cursors already closed */ +#define TXN_END_SLOT 0x80 /* release any reader slot if NOSTICKYTHREADS */ static int txn_end(MDBX_txn *txn, const unsigned mode); static __always_inline pgr_t page_get_inline(const uint16_t ILL, @@ -7356,7 +7458,7 @@ static int __must_check_result read_header(MDBX_env *env, MDBX_meta *meta, static int __must_check_result sync_locked(MDBX_env *env, unsigned flags, MDBX_meta *const pending, meta_troika_t *const troika); -static int env_close(MDBX_env *env); +static int env_close(MDBX_env *env, bool resurrect_after_fork); struct node_result { MDBX_node *node; @@ -7430,7 +7532,8 @@ static int __must_check_result cursor_last(MDBX_cursor *mc, MDBX_val *key, static int __must_check_result cursor_init(MDBX_cursor *mc, const MDBX_txn *txn, size_t dbi); static int __must_check_result cursor_xinit0(MDBX_cursor *mc); -static int __must_check_result cursor_xinit1(MDBX_cursor *mc, MDBX_node *node, +static int __must_check_result cursor_xinit1(MDBX_cursor *mc, + const MDBX_node *node, const MDBX_page *mp); static int __must_check_result cursor_xinit2(MDBX_cursor *mc, MDBX_xcursor *src_mx, @@ -7440,7 +7543,7 @@ static void cursor_copy(const MDBX_cursor *csrc, MDBX_cursor *cdst); static int __must_check_result drop_tree(MDBX_cursor *mc, const bool may_have_subDBs); static int __must_check_result fetch_sdb(MDBX_txn *txn, size_t dbi); -static int __must_check_result setup_dbx(MDBX_dbx *const dbx, +static int __must_check_result setup_sdb(MDBX_dbx *const dbx, const MDBX_db *const db, const unsigned pagesize); @@ -7515,8 +7618,11 @@ __cold const char *mdbx_liberr2str(int errnum) { return "MDBX_TXN_OVERLAPPING: Overlapping read and write transactions for" " the current thread"; case MDBX_DUPLICATED_CLK: - return "MDBX_DUPLICATED_CLK: Alternative/Duplicate LCK-file is exists, " - "please keep one and remove unused other"; + return 
"MDBX_DUPLICATED_CLK: Alternative/Duplicate LCK-file is exists," + " please keep one and remove unused other"; + case MDBX_DANGLING_DBI: + return "MDBX_DANGLING_DBI: Some cursors and/or other resources should be" + " closed before subDb or corresponding DBI-handle could be (re)used"; default: return NULL; } @@ -7602,9 +7708,18 @@ const char *mdbx_strerror_ANSI2OEM(int errnum) { __cold void debug_log_va(int level, const char *function, int line, const char *fmt, va_list args) { - if (debug_logger) - debug_logger(level, function, line, fmt, args); - else { + ENSURE(nullptr, osal_fastmutex_acquire(&debug_lock) == 0); + if (mdbx_static.logger.ptr) { + if (mdbx_static.logger_buffer == nullptr) + mdbx_static.logger.fmt(level, function, line, fmt, args); + else { + const int len = vsnprintf(mdbx_static.logger_buffer, + mdbx_static.logger_buffer_size, fmt, args); + if (len > 0) + mdbx_static.logger.nofmt(level, function, line, + mdbx_static.logger_buffer, len); + } + } else { #if defined(_WIN32) || defined(_WIN64) if (IsDebuggerPresent()) { int prefix_len = 0; @@ -7637,6 +7752,7 @@ __cold void debug_log_va(int level, const char *function, int line, fflush(stderr); #endif } + ENSURE(nullptr, osal_fastmutex_release(&debug_lock) == 0); } __cold void debug_log(int level, const char *function, int line, @@ -7819,16 +7935,479 @@ MDBX_MAYBE_UNUSED static bool cursor_is_tracked(const MDBX_cursor *mc) { *tracking_head = tracked->mc_next; \ } while (0) +static int +env_defer_free_and_release(MDBX_env *const env, + struct mdbx_defer_free_item *const chain) { + size_t length = 0; + struct mdbx_defer_free_item *obsolete_chain = nullptr; +#if MDBX_ENABLE_DBI_LOCKFREE + const uint64_t now = osal_monotime(); + struct mdbx_defer_free_item **scan = &env->me_defer_free; + if (env->me_defer_free) { + const uint64_t threshold_1second = osal_16dot16_to_monotime(1 * 65536); + do { + struct mdbx_defer_free_item *item = *scan; + if (now - item->timestamp < threshold_1second) { + scan = 
&item->next; + length += 1; + } else { + *scan = item->next; + item->next = obsolete_chain; + obsolete_chain = item; + } + } while (*scan); + } + + eASSERT(env, *scan == nullptr); + if (chain) { + struct mdbx_defer_free_item *item = chain; + do { + item->timestamp = now; + item = item->next; + } while (item); + *scan = chain; + } +#else /* MDBX_ENABLE_DBI_LOCKFREE */ + obsolete_chain = chain; +#endif /* MDBX_ENABLE_DBI_LOCKFREE */ + + ENSURE(env, osal_fastmutex_release(&env->me_dbi_lock) == MDBX_SUCCESS); + if (length > 42) { +#if defined(_WIN32) || defined(_WIN64) + SwitchToThread(); +#else + sched_yield(); +#endif /* Windows */ + } + while (obsolete_chain) { + struct mdbx_defer_free_item *item = obsolete_chain; + obsolete_chain = obsolete_chain->next; + osal_free(item); + } + return chain ? MDBX_SUCCESS : MDBX_BAD_DBI; +} + +#if MDBX_ENABLE_DBI_SPARSE + +static __inline size_t dbi_bitmap_ctz(const MDBX_txn *txn, intptr_t bmi) { + tASSERT(txn, bmi > 0); + STATIC_ASSERT(sizeof(bmi) >= sizeof(txn->mt_dbi_sparse[0])); +#if __GNUC_PREREQ(4, 1) || __has_builtin(__builtin_ctzl) + if (sizeof(txn->mt_dbi_sparse[0]) <= sizeof(int)) + return __builtin_ctz((int)bmi); + if (sizeof(txn->mt_dbi_sparse[0]) == sizeof(long)) + return __builtin_ctzl((long)bmi); +#if (defined(__SIZEOF_LONG_LONG__) && __SIZEOF_LONG_LONG__ == 8) || \ + __has_builtin(__builtin_ctzll) + return __builtin_ctzll(bmi); +#endif /* have(long long) && long long == uint64_t */ +#endif /* GNU C */ + +#if defined(_MSC_VER) + unsigned long index; + if (sizeof(txn->mt_dbi_sparse[0]) > 4) { +#if defined(_M_AMD64) || defined(_M_ARM64) || defined(_M_X64) + _BitScanForward64(&index, bmi); + return index; +#else + if (bmi > UINT32_MAX) { + _BitScanForward(&index, (uint32_t)((uint64_t)bmi >> 32)); + return index; + } +#endif + } + _BitScanForward(&index, (uint32_t)bmi); + return index; +#endif /* MSVC */ + + bmi &= -bmi; + if (sizeof(txn->mt_dbi_sparse[0]) > 4) { + static const uint8_t debruijn_ctz64[64] = { + 0, 1, 2, 
53, 3, 7, 54, 27, 4, 38, 41, 8, 34, 55, 48, 28, + 62, 5, 39, 46, 44, 42, 22, 9, 24, 35, 59, 56, 49, 18, 29, 11, + 63, 52, 6, 26, 37, 40, 33, 47, 61, 45, 43, 21, 23, 58, 17, 10, + 51, 25, 36, 32, 60, 20, 57, 16, 50, 31, 19, 15, 30, 14, 13, 12}; + return debruijn_ctz64[(UINT64_C(0x022FDD63CC95386D) * (uint64_t)bmi) >> 58]; + } else { + static const uint8_t debruijn_ctz32[32] = { + 0, 1, 28, 2, 29, 14, 24, 3, 30, 22, 20, 15, 25, 17, 4, 8, + 31, 27, 13, 23, 21, 19, 16, 7, 26, 12, 18, 6, 11, 5, 10, 9}; + return debruijn_ctz32[(UINT32_C(0x077CB531) * (uint32_t)bmi) >> 27]; + } +} + +/* LY: Макрос целенаправленно сделан с одним циклом, чтобы сохранить возможность + * использования оператора break */ +#define TXN_FOREACH_DBI_FROM(TXN, I, FROM) \ + for (size_t bitmap_chunk = CHAR_BIT * sizeof(TXN->mt_dbi_sparse[0]), \ + bitmap_item = TXN->mt_dbi_sparse[0] >> FROM, I = FROM; \ + I < TXN->mt_numdbs; ++I) \ + if (bitmap_item == 0) { \ + I = (I - 1) | (bitmap_chunk - 1); \ + bitmap_item = TXN->mt_dbi_sparse[(1 + I) / bitmap_chunk]; \ + if (!bitmap_item) \ + I += bitmap_chunk; \ + continue; \ + } else if ((bitmap_item & 1) == 0) { \ + size_t bitmap_skip = dbi_bitmap_ctz(txn, bitmap_item); \ + bitmap_item >>= bitmap_skip; \ + I += bitmap_skip - 1; \ + continue; \ + } else if (bitmap_item >>= 1, TXN->mt_dbi_state[I]) +#else +#define TXN_FOREACH_DBI_FROM(TXN, I, SKIP) \ + for (size_t I = SKIP; I < TXN->mt_numdbs; ++I) \ + if (TXN->mt_dbi_state[I]) +#endif /* MDBX_ENABLE_DBI_SPARSE */ + +#define TXN_FOREACH_DBI_ALL(TXN, I) TXN_FOREACH_DBI_FROM(TXN, I, 0) +#define TXN_FOREACH_DBI_USER(TXN, I) TXN_FOREACH_DBI_FROM(TXN, I, CORE_DBS) + +/* Back up parent txn's cursor, then grab the original for tracking */ +static int cursor_shadow(MDBX_cursor *parent_cursor, MDBX_txn *nested_txn, + const size_t dbi) { + + tASSERT(nested_txn, dbi > FREE_DBI && dbi < nested_txn->mt_numdbs); + const size_t size = parent_cursor->mc_xcursor + ? 
sizeof(MDBX_cursor) + sizeof(MDBX_xcursor) + : sizeof(MDBX_cursor); + for (MDBX_cursor *bk; parent_cursor; parent_cursor = bk->mc_next) { + bk = parent_cursor; + if (parent_cursor->mc_signature != MDBX_MC_LIVE) + continue; + bk = osal_malloc(size); + if (unlikely(!bk)) + return MDBX_ENOMEM; +#if MDBX_DEBUG + memset(bk, 0xCD, size); + VALGRIND_MAKE_MEM_UNDEFINED(bk, size); +#endif /* MDBX_DEBUG */ + *bk = *parent_cursor; + parent_cursor->mc_backup = bk; + /* Kill pointers into src to reduce abuse: The + * user may not use mc until dst ends. But we need a valid + * txn pointer here for cursor fixups to keep working. */ + parent_cursor->mc_txn = nested_txn; + parent_cursor->mc_db = &nested_txn->mt_dbs[dbi]; + parent_cursor->mc_dbi_state = &nested_txn->mt_dbi_state[dbi]; + MDBX_xcursor *mx = parent_cursor->mc_xcursor; + if (mx != NULL) { + *(MDBX_xcursor *)(bk + 1) = *mx; + mx->mx_cursor.mc_txn = nested_txn; + } + parent_cursor->mc_next = nested_txn->mt_cursors[dbi]; + nested_txn->mt_cursors[dbi] = parent_cursor; + } + return MDBX_SUCCESS; +} + +/* Close this txn's cursors, give parent txn's cursors back to parent. + * + * [in] txn the transaction handle. + * [in] merge true to keep changes to parent cursors, false to revert. + * + * Returns 0 on success, non-zero on failure. */ +static void cursors_eot(MDBX_txn *txn, const bool merge) { + tASSERT(txn, txn->mt_cursors[FREE_DBI] == nullptr); + TXN_FOREACH_DBI_FROM(txn, i, /* skip FREE_DBI */ 1) { + MDBX_cursor *mc = txn->mt_cursors[i]; + if (!mc) + continue; + txn->mt_cursors[i] = nullptr; + do { + const unsigned stage = mc->mc_signature; + MDBX_cursor *const next = mc->mc_next; + MDBX_cursor *const bk = mc->mc_backup; + ENSURE(txn->mt_env, + stage == MDBX_MC_LIVE || (stage == MDBX_MC_WAIT4EOT && bk)); + cASSERT(mc, mc->mc_dbi == (MDBX_dbi)i); + if (bk) { + MDBX_xcursor *mx = mc->mc_xcursor; + tASSERT(txn, txn->mt_parent != NULL); + /* Zap: Using uninitialized memory '*mc->mc_backup'. 
*/ + MDBX_SUPPRESS_GOOFY_MSVC_ANALYZER(6001); + ENSURE(txn->mt_env, bk->mc_signature == MDBX_MC_LIVE); + tASSERT(txn, mx == bk->mc_xcursor); + if (stage == MDBX_MC_WAIT4EOT /* Cursor was closed by user */) + mc->mc_signature = stage /* Promote closed state to parent txn */; + else if (merge) { + /* Restore pointers to parent txn */ + mc->mc_next = bk->mc_next; + mc->mc_backup = bk->mc_backup; + mc->mc_txn = bk->mc_txn; + mc->mc_db = bk->mc_db; + mc->mc_dbi_state = bk->mc_dbi_state; + if (mx) { + if (mx != bk->mc_xcursor) { + *bk->mc_xcursor = *mx; + mx = bk->mc_xcursor; + } + mx->mx_cursor.mc_txn = bk->mc_txn; + } + } else { + /* Restore from backup, i.e. rollback/abort nested txn */ + *mc = *bk; + if (mx) + *mx = *(MDBX_xcursor *)(bk + 1); + } + bk->mc_signature = 0; + osal_free(bk); + } else { + ENSURE(txn->mt_env, stage == MDBX_MC_LIVE); + mc->mc_signature = MDBX_MC_READY4CLOSE /* Cursor may be reused */; + mc->mc_flags = 0 /* reset C_UNTRACK */; + } + mc = next; + } while (mc); + } +} + +static __noinline int dbi_import(MDBX_txn *txn, const size_t dbi); + +static __inline bool db_check_flags(uint16_t db_flags) { + switch (db_flags & ~(DB_VALID | MDBX_REVERSEKEY | MDBX_INTEGERKEY)) { + default: + NOTICE("invalid db-flags 0x%x", db_flags); + return false; + case MDBX_DUPSORT: + case MDBX_DUPSORT | MDBX_REVERSEDUP: + case MDBX_DUPSORT | MDBX_DUPFIXED: + case MDBX_DUPSORT | MDBX_DUPFIXED | MDBX_REVERSEDUP: + case MDBX_DUPSORT | MDBX_DUPFIXED | MDBX_INTEGERDUP: + case MDBX_DUPSORT | MDBX_DUPFIXED | MDBX_INTEGERDUP | MDBX_REVERSEDUP: + case MDBX_DB_DEFAULTS: + return (db_flags & (MDBX_REVERSEKEY | MDBX_INTEGERKEY)) != + (MDBX_REVERSEKEY | MDBX_INTEGERKEY); + } +} + +static __inline uint8_t dbi_state(const MDBX_txn *txn, const size_t dbi) { + STATIC_ASSERT(DBI_DIRTY == MDBX_DBI_DIRTY && DBI_STALE == MDBX_DBI_STALE && + DBI_FRESH == MDBX_DBI_FRESH && DBI_CREAT == MDBX_DBI_CREAT); + +#if MDBX_ENABLE_DBI_SPARSE + const size_t bitmap_chunk = CHAR_BIT * 
sizeof(txn->mt_dbi_sparse[0]); + const size_t bitmap_indx = dbi / bitmap_chunk; + const size_t bitmap_mask = (size_t)1 << dbi % bitmap_chunk; + return likely(dbi < txn->mt_numdbs && + (txn->mt_dbi_sparse[bitmap_indx] & bitmap_mask) != 0) + ? txn->mt_dbi_state[dbi] + : 0; +#else + return likely(dbi < txn->mt_numdbs) ? txn->mt_dbi_state[dbi] : 0; +#endif /* MDBX_ENABLE_DBI_SPARSE */ +} + +static __inline bool dbi_changed(const MDBX_txn *txn, const size_t dbi) { + const MDBX_env *const env = txn->mt_env; + eASSERT(env, dbi_state(txn, dbi) & DBI_LINDO); + const uint32_t snap_seq = + atomic_load32(&env->me_dbi_seqs[dbi], mo_AcquireRelease); + return snap_seq != txn->mt_dbi_seqs[dbi]; +} + +static __always_inline int dbi_check(const MDBX_txn *txn, const size_t dbi) { + const uint8_t state = dbi_state(txn, dbi); + if (likely((state & DBI_LINDO) != 0 && !dbi_changed(txn, dbi))) + return (state & DBI_VALID) ? MDBX_SUCCESS : MDBX_BAD_DBI; + + /* Медленный путь: ленивая до-инициализацяи и импорт */ + return dbi_import((MDBX_txn *)txn, dbi); +} + +static __inline uint32_t dbi_seq_next(const MDBX_env *const env, size_t dbi) { + uint32_t v = atomic_load32(&env->me_dbi_seqs[dbi], mo_AcquireRelease) + 1; + return v ? 
v : 1; +} + +struct dbi_snap_result { + uint32_t sequence; + unsigned flags; +}; + +static struct dbi_snap_result dbi_snap(const MDBX_env *env, const size_t dbi) { + eASSERT(env, dbi < env->me_numdbs); + struct dbi_snap_result r; + uint32_t snap = atomic_load32(&env->me_dbi_seqs[dbi], mo_AcquireRelease); + do { + r.sequence = snap; + r.flags = env->me_db_flags[dbi]; + snap = atomic_load32(&env->me_dbi_seqs[dbi], mo_AcquireRelease); + } while (unlikely(snap != r.sequence)); + return r; +} + +static __noinline int dbi_import(MDBX_txn *txn, const size_t dbi) { + const MDBX_env *const env = txn->mt_env; + if (dbi >= env->me_numdbs || !env->me_db_flags[dbi]) + return MDBX_BAD_DBI; + +#if MDBX_ENABLE_DBI_SPARSE + const size_t bitmap_chunk = CHAR_BIT * sizeof(txn->mt_dbi_sparse[0]); + const size_t bitmap_indx = dbi / bitmap_chunk; + const size_t bitmap_mask = (size_t)1 << dbi % bitmap_chunk; + if (dbi >= txn->mt_numdbs) { + for (size_t i = (txn->mt_numdbs + bitmap_chunk - 1) / bitmap_chunk; + bitmap_indx >= i; ++i) + txn->mt_dbi_sparse[i] = 0; + eASSERT(env, (txn->mt_dbi_sparse[bitmap_indx] & bitmap_mask) == 0); + MDBX_txn *scan = txn; + do { + eASSERT(env, scan->mt_dbi_sparse == txn->mt_dbi_sparse); + eASSERT(env, scan->mt_numdbs < dbi + 1); + scan->mt_numdbs = (unsigned)dbi + 1; + scan->mt_dbi_state[dbi] = 0; + scan = scan->mt_parent; + } while (scan /* && scan->mt_dbi_sparse == txn->mt_dbi_sparse */); + txn->mt_dbi_sparse[bitmap_indx] |= bitmap_mask; + goto lindo; + } + if ((txn->mt_dbi_sparse[bitmap_indx] & bitmap_mask) == 0) { + MDBX_txn *scan = txn; + do { + eASSERT(env, scan->mt_dbi_sparse == txn->mt_dbi_sparse); + eASSERT(env, scan->mt_numdbs == txn->mt_numdbs); + scan->mt_dbi_state[dbi] = 0; + scan = scan->mt_parent; + } while (scan /* && scan->mt_dbi_sparse == txn->mt_dbi_sparse */); + txn->mt_dbi_sparse[bitmap_indx] |= bitmap_mask; + goto lindo; + } +#else + if (dbi >= txn->mt_numdbs) { + size_t i = txn->mt_numdbs; + do + txn->mt_dbi_state[i] = 0; + while (dbi 
>= ++i); + txn->mt_numdbs = i; + goto lindo; + } +#endif /* MDBX_ENABLE_DBI_SPARSE */ + + if (!txn->mt_dbi_state[dbi]) { + lindo: + /* dbi-слот еще не инициализирован в транзакции, а хендл не использовался */ + txn->mt_cursors[dbi] = nullptr; + MDBX_txn *const parent = txn->mt_parent; + if (parent) { + /* вложенная пишущая транзакция */ + int rc = dbi_check(parent, dbi); + /* копируем состояние subDB очищая new-флаги. */ + eASSERT(env, txn->mt_dbi_seqs == parent->mt_dbi_seqs); + txn->mt_dbi_state[dbi] = + parent->mt_dbi_state[dbi] & ~(DBI_FRESH | DBI_CREAT | DBI_DIRTY); + if (likely(rc == MDBX_SUCCESS)) { + txn->mt_dbs[dbi] = parent->mt_dbs[dbi]; + if (parent->mt_cursors[dbi]) { + rc = cursor_shadow(parent->mt_cursors[dbi], txn, dbi); + if (unlikely(rc != MDBX_SUCCESS)) { + /* не получилось забекапить курсоры */ + txn->mt_dbi_state[dbi] = DBI_OLDEN | DBI_LINDO | DBI_STALE; + txn->mt_flags |= MDBX_TXN_ERROR; + } + } + } + return rc; + } + txn->mt_dbi_seqs[dbi] = 0; + txn->mt_dbi_state[dbi] = DBI_LINDO; + } else { + eASSERT(env, txn->mt_dbi_seqs[dbi] != env->me_dbi_seqs[dbi].weak); + if (unlikely((txn->mt_dbi_state[dbi] & (DBI_VALID | DBI_OLDEN)) || + txn->mt_cursors[dbi])) { + /* хендл уже использовался в транзакции, но был закрыт или переоткрыт, + * либо при явном пере-открытии хендла есть висячие курсоры */ + eASSERT(env, (txn->mt_dbi_state[dbi] & DBI_STALE) == 0); + txn->mt_dbi_seqs[dbi] = env->me_dbi_seqs[dbi].weak; + txn->mt_dbi_state[dbi] = DBI_OLDEN | DBI_LINDO; + return txn->mt_cursors[dbi] ? 
MDBX_DANGLING_DBI : MDBX_BAD_DBI; + } + } + + /* хендл не использовался в транзакции, либо явно пере-отрывается при + * отсутствии висячих курсоров */ + eASSERT(env, (txn->mt_dbi_state[dbi] & DBI_LINDO) && !txn->mt_cursors[dbi]); + + /* читаем актуальные флаги и sequence */ + struct dbi_snap_result snap = dbi_snap(env, dbi); + txn->mt_dbi_seqs[dbi] = snap.sequence; + if (snap.flags & DB_VALID) { + txn->mt_dbs[dbi].md_flags = snap.flags & DB_PERSISTENT_FLAGS; + txn->mt_dbi_state[dbi] = DBI_LINDO | DBI_VALID | DBI_STALE; + return MDBX_SUCCESS; + } + return MDBX_BAD_DBI; +} + +/* Export or close DBI handles opened in this txn. */ +static int dbi_update(MDBX_txn *txn, int keep) { + MDBX_env *const env = txn->mt_env; + tASSERT(txn, !txn->mt_parent && txn == env->me_txn0); + bool locked = false; + struct mdbx_defer_free_item *defer_chain = nullptr; + TXN_FOREACH_DBI_USER(txn, dbi) { + if (likely((txn->mt_dbi_state[dbi] & DBI_CREAT) == 0)) + continue; + if (!locked) { + int err = osal_fastmutex_acquire(&env->me_dbi_lock); + if (unlikely(err != MDBX_SUCCESS)) + return err; + locked = true; + if (dbi >= env->me_numdbs) + /* хендл был закрыт из другого потока пока захватывали блокировку */ + continue; + } + tASSERT(txn, dbi < env->me_numdbs); + if (keep) { + env->me_db_flags[dbi] = txn->mt_dbs[dbi].md_flags | DB_VALID; + } else { + uint32_t seq = dbi_seq_next(env, dbi); + struct mdbx_defer_free_item *item = env->me_dbxs[dbi].md_name.iov_base; + if (item) { + env->me_db_flags[dbi] = 0; + env->me_dbxs[dbi].md_name.iov_len = 0; + env->me_dbxs[dbi].md_name.iov_base = nullptr; + atomic_store32(&env->me_dbi_seqs[dbi], seq, mo_AcquireRelease); + osal_flush_incoherent_cpu_writeback(); + item->next = defer_chain; + defer_chain = item; + } else { + eASSERT(env, env->me_dbxs[dbi].md_name.iov_len == 0); + eASSERT(env, env->me_db_flags[dbi] == 0); + } + } + } + + if (locked) { + size_t i = env->me_numdbs; + while ((env->me_db_flags[i - 1] & DB_VALID) == 0) { + --i; + eASSERT(env, i >= 
CORE_DBS); + eASSERT(env, !env->me_db_flags[i] && !env->me_dbxs[i].md_name.iov_len && + !env->me_dbxs[i].md_name.iov_base); + } + env->me_numdbs = (unsigned)i; + env_defer_free_and_release(env, defer_chain); + } + return MDBX_SUCCESS; +} + int mdbx_cmp(const MDBX_txn *txn, MDBX_dbi dbi, const MDBX_val *a, const MDBX_val *b) { eASSERT(NULL, txn->mt_signature == MDBX_MT_SIGNATURE); - return txn->mt_dbxs[dbi].md_cmp(a, b); + tASSERT(txn, (dbi_state(txn, dbi) & DBI_VALID) && !dbi_changed(txn, dbi)); + tASSERT(txn, dbi < txn->mt_env->me_numdbs && + (txn->mt_env->me_db_flags[dbi] & DB_VALID) != 0); + return txn->mt_env->me_dbxs[dbi].md_cmp(a, b); } int mdbx_dcmp(const MDBX_txn *txn, MDBX_dbi dbi, const MDBX_val *a, const MDBX_val *b) { eASSERT(NULL, txn->mt_signature == MDBX_MT_SIGNATURE); - return txn->mt_dbxs[dbi].md_dcmp(a, b); + tASSERT(txn, (dbi_state(txn, dbi) & DBI_VALID) && !dbi_changed(txn, dbi)); + tASSERT(txn, dbi < txn->mt_env->me_numdbs && + (txn->mt_env->me_db_flags[dbi] & DB_VALID)); + return txn->mt_env->me_dbxs[dbi].md_dcmp(a, b); } /* Allocate memory for a page. @@ -8159,7 +8738,8 @@ static void refund_loose(MDBX_txn *txn) { /* Filter-out loose chain & dispose refunded pages. 
*/ unlink_loose: - for (MDBX_page **link = &txn->tw.loose_pages; *link;) { + for (MDBX_page *__restrict *__restrict link = &txn->tw.loose_pages; + *link;) { MDBX_page *dp = *link; tASSERT(txn, dp->mp_flags == P_LOOSE); MDBX_ASAN_UNPOISON_MEMORY_REGION(&mp_next(dp), sizeof(MDBX_page *)); @@ -8510,7 +9090,7 @@ static int page_retire_ex(MDBX_cursor *mc, const pgno_t pgno, return MDBX_SUCCESS; } -#if !MDBX_DEBUG && !defined(MDBX_USE_VALGRIND) && !defined(__SANITIZE_ADDRESS__) +#if !MDBX_DEBUG && !defined(ENABLE_MEMCHECK) && !defined(__SANITIZE_ADDRESS__) if (unlikely(txn->mt_env->me_flags & MDBX_PAGEPERTURB)) #endif { @@ -8527,7 +9107,7 @@ static int page_retire_ex(MDBX_cursor *mc, const pgno_t pgno, goto skip_invalidate; } -#if defined(MDBX_USE_VALGRIND) || defined(__SANITIZE_ADDRESS__) +#if defined(ENABLE_MEMCHECK) || defined(__SANITIZE_ADDRESS__) if (MDBX_DEBUG != 0 || unlikely(txn->mt_env->me_flags & MDBX_PAGEPERTURB)) #endif kill_page(txn, mp, pgno, npages); @@ -8870,12 +9450,15 @@ static size_t txn_keep(MDBX_txn *txn, MDBX_cursor *m0) { tASSERT(txn, (txn->mt_flags & (MDBX_TXN_RDONLY | MDBX_WRITEMAP)) == 0); txn_lru_turn(txn); size_t keep = m0 ? cursor_keep(txn, m0) : 0; - for (size_t i = FREE_DBI; i < txn->mt_numdbs; ++i) - if (F_ISSET(txn->mt_dbistate[i], DBI_DIRTY | DBI_VALID) && - txn->mt_dbs[i].md_root != P_INVALID) - for (MDBX_cursor *mc = txn->mt_cursors[i]; mc; mc = mc->mc_next) + + TXN_FOREACH_DBI_ALL(txn, dbi) { + if (F_ISSET(txn->mt_dbi_state[dbi], DBI_DIRTY | DBI_VALID) && + txn->mt_dbs[dbi].md_root != P_INVALID) + for (MDBX_cursor *mc = txn->mt_cursors[dbi]; mc; mc = mc->mc_next) if (mc != m0) keep += cursor_keep(txn, mc); + } + return keep; } @@ -8932,33 +9515,6 @@ spill_prio(const MDBX_txn *txn, const size_t i, const uint32_t reciprocal) { return prio = (unsigned)factor; } -/* Spill pages from the dirty list back to disk. 
- * This is intended to prevent running into MDBX_TXN_FULL situations, - * but note that they may still occur in a few cases: - * - * 1) our estimate of the txn size could be too small. Currently this - * seems unlikely, except with a large number of MDBX_MULTIPLE items. - * - * 2) child txns may run out of space if their parents dirtied a - * lot of pages and never spilled them. TODO: we probably should do - * a preemptive spill during mdbx_txn_begin() of a child txn, if - * the parent's dirtyroom is below a given threshold. - * - * Otherwise, if not using nested txns, it is expected that apps will - * not run into MDBX_TXN_FULL any more. The pages are flushed to disk - * the same way as for a txn commit, e.g. their dirty status is cleared. - * If the txn never references them again, they can be left alone. - * If the txn only reads them, they can be used without any fuss. - * If the txn writes them again, they can be dirtied immediately without - * going thru all of the work of page_touch(). Such references are - * handled by page_unspill(). - * - * Also note, we never spill DB root pages, nor pages of active cursors, - * because we'll need these back again soon anyway. And in nested txns, - * we can't spill a page in a child txn if it was already spilled in a - * parent txn. That would alter the parent txns' data even though - * the child hasn't committed yet, and we'd have no way to undo it if - * the child aborted. 
*/ __cold static int txn_spill_slowpath(MDBX_txn *const txn, MDBX_cursor *const m0, const intptr_t wanna_spill_entries, const intptr_t wanna_spill_npages, @@ -9613,7 +10169,7 @@ __cold static void meta_troika_dump(const MDBX_env *env, const meta_ptr_t recent = meta_recent(env, troika); const meta_ptr_t prefer_steady = meta_prefer_steady(env, troika); const meta_ptr_t tail = meta_tail(env, troika); - NOTICE("%" PRIaTXN ".%c:%" PRIaTXN ".%c:%" PRIaTXN ".%c, fsm=0x%02x, " + NOTICE("troika: %" PRIaTXN ".%c:%" PRIaTXN ".%c:%" PRIaTXN ".%c, fsm=0x%02x, " "head=%d-%" PRIaTXN ".%c, " "base=%d-%" PRIaTXN ".%c, " "tail=%d-%" PRIaTXN ".%c, " @@ -9630,6 +10186,14 @@ __cold static void meta_troika_dump(const MDBX_env *env, /*----------------------------------------------------------------------------*/ +static __inline MDBX_CONST_FUNCTION MDBX_lockinfo * +lckless_stub(const MDBX_env *env) { + uintptr_t stub = (uintptr_t)&env->x_lckless_stub; + /* align to avoid false-positive alarm from UndefinedBehaviorSanitizer */ + stub = (stub + MDBX_CACHELINE_SIZE - 1) & ~(MDBX_CACHELINE_SIZE - 1); + return (MDBX_lockinfo *)stub; +} + /* Find oldest txnid still referenced. 
*/ static txnid_t find_oldest_reader(MDBX_env *const env, const txnid_t steady) { const uint32_t nothing_changed = MDBX_STRING_TETRAD("None"); @@ -9637,7 +10201,7 @@ static txnid_t find_oldest_reader(MDBX_env *const env, const txnid_t steady) { MDBX_lockinfo *const lck = env->me_lck_mmap.lck; if (unlikely(lck == NULL /* exclusive without-lck mode */)) { - eASSERT(env, env->me_lck == (void *)&env->x_lckless_stub); + eASSERT(env, env->me_lck == lckless_stub(env)); env->me_lck->mti_readers_refresh_flag.weak = nothing_changed; return env->me_lck->mti_oldest_reader.weak = steady; } @@ -10020,11 +10584,21 @@ __cold static void munlock_all(const MDBX_env *env) { } __cold static unsigned default_rp_augment_limit(const MDBX_env *env) { - /* default rp_augment_limit = npages / 3 */ - const size_t augment = env->me_dbgeo.now / 3 >> env->me_psize2log; - eASSERT(env, augment < MDBX_PGL_LIMIT); - return pnl_bytes2size(pnl_size2bytes( - (augment > MDBX_PNL_INITIAL) ? augment : MDBX_PNL_INITIAL)); + const size_t timeframe = /* 16 секунд */ 16 << 16; + const size_t remain_1sec = + (env->me_options.gc_time_limit < timeframe) + ? timeframe - (size_t)env->me_options.gc_time_limit + : 0; + const size_t minimum = (env->me_maxgc_ov1page * 2 > MDBX_PNL_INITIAL) + ? env->me_maxgc_ov1page * 2 + : MDBX_PNL_INITIAL; + const size_t one_third = env->me_dbgeo.now / 3 >> env->me_psize2log; + const size_t augment_limit = + (one_third > minimum) + ? 
minimum + (one_third - minimum) / timeframe * remain_1sec + : minimum; + eASSERT(env, augment_limit < MDBX_PGL_LIMIT); + return pnl_bytes2size(pnl_size2bytes(augment_limit)); } static bool default_prefault_write(const MDBX_env *env) { @@ -10084,9 +10658,9 @@ __cold static int dxb_resize(MDBX_env *const env, const pgno_t used_pgno, } const size_t limit_bytes = pgno_align2os_bytes(env, limit_pgno); const size_t size_bytes = pgno_align2os_bytes(env, size_pgno); -#if MDBX_ENABLE_MADVISE || defined(MDBX_USE_VALGRIND) +#if MDBX_ENABLE_MADVISE || defined(ENABLE_MEMCHECK) const void *const prev_map = env->me_dxb_mmap.base; -#endif /* MDBX_ENABLE_MADVISE || MDBX_USE_VALGRIND */ +#endif /* MDBX_ENABLE_MADVISE || ENABLE_MEMCHECK */ VERBOSE("resize/%d datafile/mapping: " "present %" PRIuPTR " -> %" PRIuPTR ", " @@ -10107,60 +10681,63 @@ __cold static int dxb_resize(MDBX_env *const env, const pgno_t used_pgno, size_bytes == env->me_dxb_mmap.filesize) goto bailout; + /* При использовании MDBX_NOSTICKYTHREADS с транзакциями могут работать любые + * потоки и у нас нет информации о том, какие именно. Поэтому нет возможности + * выполнить remap-действия требующие приостановки работающих с БД потоков. */ + if ((env->me_flags & MDBX_NOSTICKYTHREADS) == 0) { #if defined(_WIN32) || defined(_WIN64) - if ((env->me_flags & MDBX_NOTLS) == 0 && - ((size_bytes < env->me_dxb_mmap.current && mode > implicit_grow) || - limit_bytes != env->me_dxb_mmap.limit)) { - /* 1) Windows allows only extending a read-write section, but not a - * corresponding mapped view. Therefore in other cases we must suspend - * the local threads for safe remap. - * 2) At least on Windows 10 1803 the entire mapped section is unavailable - * for short time during NtExtendSection() or VirtualAlloc() execution. - * 3) Under Wine runtime environment on Linux a section extending is not - * supported. - * - * THEREFORE LOCAL THREADS SUSPENDING IS ALWAYS REQUIRED! 
*/ - array_onstack.limit = ARRAY_LENGTH(array_onstack.handles); - array_onstack.count = 0; - suspended = &array_onstack; - rc = osal_suspend_threads_before_remap(env, &suspended); - if (rc != MDBX_SUCCESS) { - ERROR("failed suspend-for-remap: errcode %d", rc); - goto bailout; - } - mresize_flags |= (mode < explicit_resize) - ? MDBX_MRESIZE_MAY_UNMAP - : MDBX_MRESIZE_MAY_UNMAP | MDBX_MRESIZE_MAY_MOVE; - } -#else /* Windows */ - MDBX_lockinfo *const lck = env->me_lck_mmap.lck; - if (mode == explicit_resize && limit_bytes != env->me_dxb_mmap.limit && - !(env->me_flags & MDBX_NOTLS)) { - mresize_flags |= MDBX_MRESIZE_MAY_UNMAP | MDBX_MRESIZE_MAY_MOVE; - if (lck) { - int err = osal_rdt_lock(env) /* lock readers table until remap done */; - if (unlikely(MDBX_IS_ERROR(err))) { - rc = err; + if ((size_bytes < env->me_dxb_mmap.current && mode > implicit_grow) || + limit_bytes != env->me_dxb_mmap.limit) { + /* 1) Windows allows only extending a read-write section, but not a + * corresponding mapped view. Therefore in other cases we must suspend + * the local threads for safe remap. + * 2) At least on Windows 10 1803 the entire mapped section is unavailable + * for short time during NtExtendSection() or VirtualAlloc() execution. + * 3) Under Wine runtime environment on Linux a section extending is not + * supported. + * + * THEREFORE LOCAL THREADS SUSPENDING IS ALWAYS REQUIRED! */ + array_onstack.limit = ARRAY_LENGTH(array_onstack.handles); + array_onstack.count = 0; + suspended = &array_onstack; + rc = osal_suspend_threads_before_remap(env, &suspended); + if (rc != MDBX_SUCCESS) { + ERROR("failed suspend-for-remap: errcode %d", rc); goto bailout; } + mresize_flags |= (mode < explicit_resize) + ? 
MDBX_MRESIZE_MAY_UNMAP + : MDBX_MRESIZE_MAY_UNMAP | MDBX_MRESIZE_MAY_MOVE; + } +#else /* Windows */ + MDBX_lockinfo *const lck = env->me_lck_mmap.lck; + if (mode == explicit_resize && limit_bytes != env->me_dxb_mmap.limit) { + mresize_flags |= MDBX_MRESIZE_MAY_UNMAP | MDBX_MRESIZE_MAY_MOVE; + if (lck) { + int err = osal_rdt_lock(env) /* lock readers table until remap done */; + if (unlikely(MDBX_IS_ERROR(err))) { + rc = err; + goto bailout; + } - /* looking for readers from this process */ - const size_t snap_nreaders = - atomic_load32(&lck->mti_numreaders, mo_AcquireRelease); - eASSERT(env, mode == explicit_resize); - for (size_t i = 0; i < snap_nreaders; ++i) { - if (lck->mti_readers[i].mr_pid.weak == env->me_pid && - lck->mti_readers[i].mr_tid.weak != osal_thread_self()) { - /* the base address of the mapping can't be changed since - * the other reader thread from this process exists. */ - osal_rdt_unlock(env); - mresize_flags &= ~(MDBX_MRESIZE_MAY_UNMAP | MDBX_MRESIZE_MAY_MOVE); - break; + /* looking for readers from this process */ + const size_t snap_nreaders = + atomic_load32(&lck->mti_numreaders, mo_AcquireRelease); + eASSERT(env, mode == explicit_resize); + for (size_t i = 0; i < snap_nreaders; ++i) { + if (lck->mti_readers[i].mr_pid.weak == env->me_pid && + lck->mti_readers[i].mr_tid.weak != osal_thread_self()) { + /* the base address of the mapping can't be changed since + * the other reader thread from this process exists. */ + osal_rdt_unlock(env); + mresize_flags &= ~(MDBX_MRESIZE_MAY_UNMAP | MDBX_MRESIZE_MAY_MOVE); + break; + } } } } - } #endif /* ! 
Windows */ + } const pgno_t aligned_munlock_pgno = (mresize_flags & (MDBX_MRESIZE_MAY_UNMAP | MDBX_MRESIZE_MAY_MOVE)) @@ -10273,7 +10850,7 @@ __cold static int dxb_resize(MDBX_env *const env, const pgno_t used_pgno, env->me_dbgeo.now = env->me_dxb_mmap.current; env->me_dbgeo.upper = env->me_dxb_mmap.limit; adjust_defaults(env); -#ifdef MDBX_USE_VALGRIND +#ifdef ENABLE_MEMCHECK if (prev_limit != env->me_dxb_mmap.limit || prev_map != env->me_map) { VALGRIND_DISCARD(env->me_valgrind_handle); env->me_valgrind_handle = 0; @@ -10281,7 +10858,7 @@ __cold static int dxb_resize(MDBX_env *const env, const pgno_t used_pgno, env->me_valgrind_handle = VALGRIND_CREATE_BLOCK(env->me_map, env->me_dxb_mmap.limit, "mdbx"); } -#endif /* MDBX_USE_VALGRIND */ +#endif /* ENABLE_MEMCHECK */ } else { if (rc != MDBX_UNABLE_EXTEND_MAPSIZE && rc != MDBX_EPERM) { ERROR("failed resize datafile/mapping: " @@ -10581,9 +11158,9 @@ scan4seq_sse2(pgno_t *range, const size_t len, const size_t seq) { do { mask = (uint8_t)diffcmp2mask_sse2(range - 3, offset, pattern); if (mask) { -#ifndef __SANITIZE_ADDRESS__ +#if !defined(ENABLE_MEMCHECK) && !defined(__SANITIZE_ADDRESS__) found: -#endif /* __SANITIZE_ADDRESS__ */ +#endif /* !ENABLE_MEMCHECK && !__SANITIZE_ADDRESS__ */ return range + 28 - __builtin_clz(mask); } range -= 4; @@ -10596,7 +11173,7 @@ scan4seq_sse2(pgno_t *range, const size_t len, const size_t seq) { * только за пределами региона выделенного под PNL, но и пересекать границу * страницы памяти. Что может приводить как к ошибкам ASAN, так и к падению. * Поэтому проверяем смещение на странице, а с ASAN всегда страхуемся. 
*/ -#ifndef __SANITIZE_ADDRESS__ +#if !defined(ENABLE_MEMCHECK) && !defined(__SANITIZE_ADDRESS__) const unsigned on_page_safe_mask = 0xff0 /* enough for '-15' bytes offset */; if (likely(on_page_safe_mask & (uintptr_t)(range + offset)) && !RUNNING_ON_VALGRIND) { @@ -10608,7 +11185,7 @@ scan4seq_sse2(pgno_t *range, const size_t len, const size_t seq) { goto found; return nullptr; } -#endif /* __SANITIZE_ADDRESS__ */ +#endif /* !ENABLE_MEMCHECK && !__SANITIZE_ADDRESS__ */ do if (*range - range[offset] == target) return range; @@ -10652,9 +11229,9 @@ scan4seq_avx2(pgno_t *range, const size_t len, const size_t seq) { do { mask = (uint8_t)diffcmp2mask_avx2(range - 7, offset, pattern); if (mask) { -#ifndef __SANITIZE_ADDRESS__ +#if !defined(ENABLE_MEMCHECK) && !defined(__SANITIZE_ADDRESS__) found: -#endif /* __SANITIZE_ADDRESS__ */ +#endif /* !ENABLE_MEMCHECK && !__SANITIZE_ADDRESS__ */ return range + 24 - __builtin_clz(mask); } range -= 8; @@ -10667,7 +11244,7 @@ scan4seq_avx2(pgno_t *range, const size_t len, const size_t seq) { * только за пределами региона выделенного под PNL, но и пересекать границу * страницы памяти. Что может приводить как к ошибкам ASAN, так и к падению. * Поэтому проверяем смещение на странице, а с ASAN всегда страхуемся. 
*/ -#ifndef __SANITIZE_ADDRESS__ +#if !defined(ENABLE_MEMCHECK) && !defined(__SANITIZE_ADDRESS__) const unsigned on_page_safe_mask = 0xfe0 /* enough for '-31' bytes offset */; if (likely(on_page_safe_mask & (uintptr_t)(range + offset)) && !RUNNING_ON_VALGRIND) { @@ -10679,7 +11256,7 @@ scan4seq_avx2(pgno_t *range, const size_t len, const size_t seq) { goto found; return nullptr; } -#endif /* __SANITIZE_ADDRESS__ */ +#endif /* !ENABLE_MEMCHECK && !__SANITIZE_ADDRESS__ */ if (range - 3 > detent) { mask = diffcmp2mask_sse2avx(range - 3, offset, *(const __m128i *)&pattern); if (mask) @@ -10720,9 +11297,9 @@ scan4seq_avx512bw(pgno_t *range, const size_t len, const size_t seq) { do { mask = diffcmp2mask_avx512bw(range - 15, offset, pattern); if (mask) { -#ifndef __SANITIZE_ADDRESS__ +#if !defined(ENABLE_MEMCHECK) && !defined(__SANITIZE_ADDRESS__) found: -#endif /* __SANITIZE_ADDRESS__ */ +#endif /* !ENABLE_MEMCHECK && !__SANITIZE_ADDRESS__ */ return range + 16 - __builtin_clz(mask); } range -= 16; @@ -10735,7 +11312,7 @@ scan4seq_avx512bw(pgno_t *range, const size_t len, const size_t seq) { * только за пределами региона выделенного под PNL, но и пересекать границу * страницы памяти. Что может приводить как к ошибкам ASAN, так и к падению. * Поэтому проверяем смещение на странице, а с ASAN всегда страхуемся. 
*/ -#ifndef __SANITIZE_ADDRESS__ +#if !defined(ENABLE_MEMCHECK) && !defined(__SANITIZE_ADDRESS__) const unsigned on_page_safe_mask = 0xfc0 /* enough for '-63' bytes offset */; if (likely(on_page_safe_mask & (uintptr_t)(range + offset)) && !RUNNING_ON_VALGRIND) { @@ -10747,7 +11324,7 @@ scan4seq_avx512bw(pgno_t *range, const size_t len, const size_t seq) { goto found; return nullptr; } -#endif /* __SANITIZE_ADDRESS__ */ +#endif /* !ENABLE_MEMCHECK && !__SANITIZE_ADDRESS__ */ if (range - 7 > detent) { mask = diffcmp2mask_avx2(range - 7, offset, *(const __m256i *)&pattern); if (mask) @@ -10800,9 +11377,9 @@ __hot static pgno_t *scan4seq_neon(pgno_t *range, const size_t len, do { mask = diffcmp2mask_neon(range - 3, offset, pattern); if (mask) { -#ifndef __SANITIZE_ADDRESS__ +#if !defined(ENABLE_MEMCHECK) && !defined(__SANITIZE_ADDRESS__) found: -#endif /* __SANITIZE_ADDRESS__ */ +#endif /* !ENABLE_MEMCHECK && !__SANITIZE_ADDRESS__ */ return ptr_disp(range, -(__builtin_clzl(mask) >> sizeof(size_t) / 4)); } range -= 4; @@ -10815,7 +11392,7 @@ __hot static pgno_t *scan4seq_neon(pgno_t *range, const size_t len, * только за пределами региона выделенного под PNL, но и пересекать границу * страницы памяти. Что может приводить как к ошибкам ASAN, так и к падению. * Поэтому проверяем смещение на странице, а с ASAN всегда страхуемся. 
*/ -#ifndef __SANITIZE_ADDRESS__ +#if !defined(ENABLE_MEMCHECK) && !defined(__SANITIZE_ADDRESS__) const unsigned on_page_safe_mask = 0xff0 /* enough for '-15' bytes offset */; if (likely(on_page_safe_mask & (uintptr_t)(range + offset)) && !RUNNING_ON_VALGRIND) { @@ -10827,7 +11404,7 @@ __hot static pgno_t *scan4seq_neon(pgno_t *range, const size_t len, goto found; return nullptr; } -#endif /* __SANITIZE_ADDRESS__ */ +#endif /* !ENABLE_MEMCHECK && !__SANITIZE_ADDRESS__ */ do if (*range - range[offset] == target) return range; @@ -10898,22 +11475,6 @@ static pgno_t *scan4seq_resolver(pgno_t *range, const size_t len, //------------------------------------------------------------------------------ -/* Allocate page numbers and memory for writing. Maintain mt_last_reclaimed, - * mt_relist and mt_next_pgno. Set MDBX_TXN_ERROR on failure. - * - * If there are free pages available from older transactions, they - * are re-used first. Otherwise allocate a new page at mt_next_pgno. - * Do not modify the GC, just merge GC records into mt_relist - * and move mt_last_reclaimed to say which records were consumed. Only this - * function can create mt_relist and move - * mt_last_reclaimed/mt_next_pgno. - * - * [in] mc cursor A cursor handle identifying the transaction and - * database for which we are allocating. - * [in] num the number of pages to allocate. 
- * - * Returns 0 on success, non-zero on failure.*/ - #define MDBX_ALLOC_DEFAULT 0 #define MDBX_ALLOC_RESERVE 1 #define MDBX_ALLOC_UNIMPORTANT 2 @@ -11290,12 +11851,24 @@ static __inline pgr_t page_alloc_finalize(MDBX_env *const env, return ret; } +struct monotime_cache { + uint64_t value; + int expire_countdown; +}; + +static __inline uint64_t monotime_since_cached(uint64_t begin_timestamp, + struct monotime_cache *cache) { + if (cache->expire_countdown) + cache->expire_countdown -= 1; + else { + cache->value = osal_monotime(); + cache->expire_countdown = 42 / 3; + } + return cache->value - begin_timestamp; +} + static pgr_t page_alloc_slowpath(const MDBX_cursor *const mc, const size_t num, uint8_t flags) { -#if MDBX_ENABLE_PROFGC - const uint64_t monotime_before = osal_monotime(); -#endif /* MDBX_ENABLE_PROFGC */ - pgr_t ret; MDBX_txn *const txn = mc->mc_txn; MDBX_env *const env = txn->mt_env; @@ -11310,8 +11883,19 @@ static pgr_t page_alloc_slowpath(const MDBX_cursor *const mc, const size_t num, eASSERT(env, pnl_check_allocated(txn->tw.relist, txn->mt_next_pgno - MDBX_ENABLE_REFUND)); - pgno_t pgno = 0; size_t newnext; + const uint64_t monotime_begin = + (MDBX_ENABLE_PROFGC || (num > 1 && env->me_options.gc_time_limit)) + ? 
osal_monotime() + : 0; + struct monotime_cache now_cache; + now_cache.expire_countdown = + 1 /* старт с 1 позволяет избавиться как от лишних системных вызовов когда + лимит времени задан нулевой или уже исчерпан, так и от подсчета + времени при не-достижении rp_augment_limit */ + ; + now_cache.value = monotime_begin; + pgno_t pgno = 0; if (num > 1) { #if MDBX_ENABLE_PROFGC prof->xpages += 1; @@ -11425,6 +12009,8 @@ next_gc:; goto depleted_gc; } if (unlikely(key.iov_len != sizeof(txnid_t))) { + ERROR("%s/%d: %s", "MDBX_CORRUPTED", MDBX_CORRUPTED, + "invalid GC key-length"); ret.err = MDBX_CORRUPTED; goto fail; } @@ -11451,6 +12037,8 @@ next_gc:; if (unlikely(data.iov_len % sizeof(pgno_t) || data.iov_len < MDBX_PNL_SIZEOF(gc_pnl) || !pnl_check(gc_pnl, txn->mt_next_pgno))) { + ERROR("%s/%d: %s", "MDBX_CORRUPTED", MDBX_CORRUPTED, + "invalid GC value-length"); ret.err = MDBX_CORRUPTED; goto fail; } @@ -11487,7 +12075,10 @@ next_gc:; txn->tw.relist) >= env->me_options.rp_augment_limit) && ((/* not a slot-request from gc-update */ num && /* have enough unallocated space */ txn->mt_geo.upper >= - txn->mt_next_pgno + num) || + txn->mt_next_pgno + num && + monotime_since_cached(monotime_begin, &now_cache) + + txn->tw.gc_time_acc >= + env->me_options.gc_time_limit) || gc_len + MDBX_PNL_GETSIZE(txn->tw.relist) >= MDBX_PGL_LIMIT)) { /* Stop reclaiming to avoid large/overflow the page list. 
This is a rare * case while search for a continuously multi-page region in a @@ -11531,6 +12122,8 @@ next_gc:; flags |= MDBX_ALLOC_SHOULD_SCAN; if (AUDIT_ENABLED()) { if (unlikely(!pnl_check(txn->tw.relist, txn->mt_next_pgno))) { + ERROR("%s/%d: %s", "MDBX_CORRUPTED", MDBX_CORRUPTED, + "invalid txn retired-list"); ret.err = MDBX_CORRUPTED; goto fail; } @@ -11789,6 +12382,8 @@ next_gc:; (size_t)txn->mt_dbs[FREE_DBI].md_entries); ret.page = NULL; } + if (num > 1) + txn->tw.gc_time_acc += monotime_since_cached(monotime_begin, &now_cache); } else { early_exit: DEBUG("return NULL for %zu pages for ALLOC_%s, rc %d", num, @@ -11797,7 +12392,7 @@ next_gc:; } #if MDBX_ENABLE_PROFGC - prof->rtime_monotonic += osal_monotime() - monotime_before; + prof->rtime_monotonic += osal_monotime() - monotime_begin; #endif /* MDBX_ENABLE_PROFGC */ return ret; } @@ -11805,7 +12400,8 @@ next_gc:; __hot static pgr_t page_alloc(const MDBX_cursor *const mc) { MDBX_txn *const txn = mc->mc_txn; tASSERT(txn, mc->mc_txn->mt_flags & MDBX_TXN_DIRTY); - tASSERT(txn, F_ISSET(txn->mt_dbistate[mc->mc_dbi], DBI_DIRTY | DBI_VALID)); + tASSERT(txn, F_ISSET(dbi_state(txn, mc->mc_dbi), + DBI_LINDO | DBI_VALID | DBI_DIRTY)); /* If there are any loose pages, just use them */ while (likely(txn->tw.loose_pages)) { @@ -11945,7 +12541,7 @@ __hot static int page_touch(MDBX_cursor *mc) { int rc; tASSERT(txn, mc->mc_txn->mt_flags & MDBX_TXN_DIRTY); - tASSERT(txn, F_ISSET(*mc->mc_dbistate, DBI_DIRTY | DBI_VALID)); + tASSERT(txn, F_ISSET(*mc->mc_dbi_state, DBI_LINDO | DBI_VALID | DBI_DIRTY)); tASSERT(txn, !IS_OVERFLOW(mp)); if (ASSERT_ENABLED()) { if (mc->mc_flags & C_SUB) { @@ -11953,7 +12549,7 @@ __hot static int page_touch(MDBX_cursor *mc) { MDBX_cursor_couple *couple = container_of(mx, MDBX_cursor_couple, inner); tASSERT(txn, mc->mc_db == &couple->outer.mc_xcursor->mx_db); tASSERT(txn, mc->mc_dbx == &couple->outer.mc_xcursor->mx_dbx); - tASSERT(txn, *couple->outer.mc_dbistate & DBI_DIRTY); + tASSERT(txn, 
*couple->outer.mc_dbi_state & DBI_DIRTY); } tASSERT(txn, dirtylist_check(txn)); } @@ -12142,25 +12738,30 @@ static int meta_sync(const MDBX_env *env, const meta_ptr_t head) { return rc; } +static __inline bool env_txn0_owned(const MDBX_env *env) { + return (env->me_flags & MDBX_NOSTICKYTHREADS) + ? (env->me_txn0->mt_owner != 0) + : (env->me_txn0->mt_owner == osal_thread_self()); +} + __cold static int env_sync(MDBX_env *env, bool force, bool nonblock) { - bool locked = false; + if (unlikely(env->me_flags & MDBX_RDONLY)) + return MDBX_EACCESS; + + const bool txn0_owned = env_txn0_owned(env); + bool should_unlock = false; int rc = MDBX_RESULT_TRUE /* means "nothing to sync" */; retry:; unsigned flags = env->me_flags & ~(MDBX_NOMETASYNC | MDBX_SHRINK_ALLOWED); - if (unlikely((flags & (MDBX_RDONLY | MDBX_FATAL_ERROR | MDBX_ENV_ACTIVE)) != + if (unlikely((flags & (MDBX_FATAL_ERROR | MDBX_ENV_ACTIVE)) != MDBX_ENV_ACTIVE)) { - rc = MDBX_EACCESS; - if (!(flags & MDBX_ENV_ACTIVE)) - rc = MDBX_EPERM; - if (flags & MDBX_FATAL_ERROR) - rc = MDBX_PANIC; + rc = (flags & MDBX_FATAL_ERROR) ? MDBX_PANIC : MDBX_EPERM; goto bailout; } - const bool inside_txn = (env->me_txn0->mt_owner == osal_thread_self()); const meta_troika_t troika = - (inside_txn | locked) ? env->me_txn0->tw.troika : meta_tap(env); + (txn0_owned | should_unlock) ? 
env->me_txn0->tw.troika : meta_tap(env); const meta_ptr_t head = meta_recent(env, &troika); const uint64_t unsynced_pages = atomic_load64(&env->me_lck->mti_unsynced_pages, mo_Relaxed); @@ -12171,7 +12772,7 @@ retry:; goto bailout; } - if (!inside_txn && locked && (env->me_flags & MDBX_WRITEMAP) && + if (should_unlock && (env->me_flags & MDBX_WRITEMAP) && unlikely(head.ptr_c->mm_geo.next > bytes2pgno(env, env->me_dxb_mmap.current))) { @@ -12201,8 +12802,8 @@ retry:; osal_monotime() - eoos_timestamp >= autosync_period)) flags &= MDBX_WRITEMAP /* clear flags for full steady sync */; - if (!inside_txn) { - if (!locked) { + if (!txn0_owned) { + if (!should_unlock) { #if MDBX_ENABLE_PGOP_STAT unsigned wops = 0; #endif /* MDBX_ENABLE_PGOP_STAT */ @@ -12244,11 +12845,11 @@ retry:; rc = MDBX_SUCCESS /* means "some data was synced" */; } - err = mdbx_txn_lock(env, nonblock); + err = osal_txn_lock(env, nonblock); if (unlikely(err != MDBX_SUCCESS)) return err; - locked = true; + should_unlock = true; #if MDBX_ENABLE_PGOP_STAT env->me_lck->mti_pgop_stat.wops.weak += wops; #endif /* MDBX_ENABLE_PGOP_STAT */ @@ -12262,8 +12863,8 @@ retry:; flags |= MDBX_SHRINK_ALLOWED; } - eASSERT(env, inside_txn || locked); - eASSERT(env, !inside_txn || (flags & MDBX_SHRINK_ALLOWED) == 0); + eASSERT(env, txn0_owned || should_unlock); + eASSERT(env, !txn0_owned || (flags & MDBX_SHRINK_ALLOWED) == 0); if (!head.is_steady && unlikely(env->me_stuck_meta >= 0) && troika.recent != (uint8_t)env->me_stuck_meta) { @@ -12290,8 +12891,8 @@ retry:; rc = meta_sync(env, head); bailout: - if (locked) - mdbx_txn_unlock(env); + if (should_unlock) + osal_txn_unlock(env); return rc; } @@ -12307,7 +12908,7 @@ static __inline int check_env(const MDBX_env *env, const bool wanna_active) { if (wanna_active) { #if MDBX_ENV_CHECKPID - if (unlikely(env->me_pid != osal_getpid())) { + if (unlikely(env->me_pid != osal_getpid()) && env->me_pid) { ((MDBX_env *)env)->me_flags |= MDBX_FATAL_ERROR; return MDBX_PANIC; } @@ 
-12328,110 +12929,7 @@ __cold int mdbx_env_sync_ex(MDBX_env *env, bool force, bool nonblock) { return env_sync(env, force, nonblock); } -/* Back up parent txn's cursors, then grab the originals for tracking */ -static int cursor_shadow(MDBX_txn *parent, MDBX_txn *nested) { - tASSERT(parent, parent->mt_cursors[FREE_DBI] == nullptr); - nested->mt_cursors[FREE_DBI] = nullptr; - for (int i = parent->mt_numdbs; --i > FREE_DBI;) { - nested->mt_cursors[i] = NULL; - MDBX_cursor *mc = parent->mt_cursors[i]; - if (mc != NULL) { - size_t size = mc->mc_xcursor ? sizeof(MDBX_cursor) + sizeof(MDBX_xcursor) - : sizeof(MDBX_cursor); - for (MDBX_cursor *bk; mc; mc = bk->mc_next) { - bk = mc; - if (mc->mc_signature != MDBX_MC_LIVE) - continue; - bk = osal_malloc(size); - if (unlikely(!bk)) - return MDBX_ENOMEM; -#if MDBX_DEBUG - memset(bk, 0xCD, size); - VALGRIND_MAKE_MEM_UNDEFINED(bk, size); -#endif /* MDBX_DEBUG */ - *bk = *mc; - mc->mc_backup = bk; - /* Kill pointers into src to reduce abuse: The - * user may not use mc until dst ends. But we need a valid - * txn pointer here for cursor fixups to keep working. */ - mc->mc_txn = nested; - mc->mc_db = &nested->mt_dbs[i]; - mc->mc_dbistate = &nested->mt_dbistate[i]; - MDBX_xcursor *mx = mc->mc_xcursor; - if (mx != NULL) { - *(MDBX_xcursor *)(bk + 1) = *mx; - mx->mx_cursor.mc_txn = nested; - } - mc->mc_next = nested->mt_cursors[i]; - nested->mt_cursors[i] = mc; - } - } - } - return MDBX_SUCCESS; -} - -/* Close this txn's cursors, give parent txn's cursors back to parent. - * - * [in] txn the transaction handle. - * [in] merge true to keep changes to parent cursors, false to revert. - * - * Returns 0 on success, non-zero on failure. 
*/ -static void cursors_eot(MDBX_txn *txn, const bool merge) { - tASSERT(txn, txn->mt_cursors[FREE_DBI] == nullptr); - for (intptr_t i = txn->mt_numdbs; --i > FREE_DBI;) { - MDBX_cursor *mc = txn->mt_cursors[i]; - if (!mc) - continue; - txn->mt_cursors[i] = nullptr; - do { - const unsigned stage = mc->mc_signature; - MDBX_cursor *const next = mc->mc_next; - MDBX_cursor *const bk = mc->mc_backup; - ENSURE(txn->mt_env, - stage == MDBX_MC_LIVE || (stage == MDBX_MC_WAIT4EOT && bk)); - cASSERT(mc, mc->mc_dbi == (MDBX_dbi)i); - if (bk) { - MDBX_xcursor *mx = mc->mc_xcursor; - tASSERT(txn, txn->mt_parent != NULL); - /* Zap: Using uninitialized memory '*mc->mc_backup'. */ - MDBX_SUPPRESS_GOOFY_MSVC_ANALYZER(6001); - ENSURE(txn->mt_env, bk->mc_signature == MDBX_MC_LIVE); - tASSERT(txn, mx == bk->mc_xcursor); - if (stage == MDBX_MC_WAIT4EOT /* Cursor was closed by user */) - mc->mc_signature = stage /* Promote closed state to parent txn */; - else if (merge) { - /* Restore pointers to parent txn */ - mc->mc_next = bk->mc_next; - mc->mc_backup = bk->mc_backup; - mc->mc_txn = bk->mc_txn; - mc->mc_db = bk->mc_db; - mc->mc_dbistate = bk->mc_dbistate; - if (mx) { - if (mx != bk->mc_xcursor) { - *bk->mc_xcursor = *mx; - mx = bk->mc_xcursor; - } - mx->mx_cursor.mc_txn = bk->mc_txn; - } - } else { - /* Restore from backup, i.e. rollback/abort nested txn */ - *mc = *bk; - if (mx) - *mx = *(MDBX_xcursor *)(bk + 1); - } - bk->mc_signature = 0; - osal_free(bk); - } else { - ENSURE(txn->mt_env, stage == MDBX_MC_LIVE); - mc->mc_signature = MDBX_MC_READY4CLOSE /* Cursor may be reused */; - mc->mc_flags = 0 /* reset C_UNTRACK */; - } - mc = next; - } while (mc); - } -} - -#if defined(MDBX_USE_VALGRIND) || defined(__SANITIZE_ADDRESS__) +#if defined(ENABLE_MEMCHECK) || defined(__SANITIZE_ADDRESS__) /* Find largest mvcc-snapshot still referenced by this process. 
*/ static pgno_t find_largest_this(MDBX_env *env, pgno_t largest) { MDBX_lockinfo *const lck = env->me_lck_mmap.lck; @@ -12479,13 +12977,16 @@ static void txn_valgrind(MDBX_env *env, MDBX_txn *txn) { } else { /* transaction end */ bool should_unlock = false; pgno_t last = MAX_PAGENO + 1; - if (env->me_txn0 && env->me_txn0->mt_owner == osal_thread_self()) { + if (env->me_pid != osal_getpid()) { + /* resurrect after fork */ + return; + } else if (env->me_txn && env_txn0_owned(env)) { /* inside write-txn */ last = meta_recent(env, &env->me_txn0->tw.troika).ptr_v->mm_geo.next; } else if (env->me_flags & MDBX_RDONLY) { /* read-only mode, no write-txn, no wlock mutex */ last = NUM_METAS; - } else if (mdbx_txn_lock(env, true) == MDBX_SUCCESS) { + } else if (osal_txn_lock(env, true) == MDBX_SUCCESS) { /* no write-txn */ last = NUM_METAS; should_unlock = true; @@ -12506,10 +13007,10 @@ static void txn_valgrind(MDBX_env *env, MDBX_txn *txn) { pgno2bytes(env, edge - last)); } if (should_unlock) - mdbx_txn_unlock(env); + osal_txn_unlock(env); } } -#endif /* MDBX_USE_VALGRIND || __SANITIZE_ADDRESS__ */ +#endif /* ENABLE_MEMCHECK || __SANITIZE_ADDRESS__ */ typedef struct { int err; @@ -12575,7 +13076,7 @@ static bind_rslot_result bind_rslot(MDBX_env *env, const uintptr_t tid) { safe64_reset(&result.rslot->mr_txnid, true); if (slot == nreaders) env->me_lck->mti_numreaders.weak = (uint32_t)++nreaders; - result.rslot->mr_tid.weak = (env->me_flags & MDBX_NOTLS) ? 0 : tid; + result.rslot->mr_tid.weak = (env->me_flags & MDBX_NOSTICKYTHREADS) ? 0 : tid; atomic_store32(&result.rslot->mr_pid, env->me_pid, mo_AcquireRelease); osal_rdt_unlock(env); @@ -12595,12 +13096,12 @@ __cold int mdbx_thread_register(const MDBX_env *env) { return (env->me_flags & MDBX_EXCLUSIVE) ? 
MDBX_EINVAL : MDBX_EPERM; if (unlikely((env->me_flags & MDBX_ENV_TXKEY) == 0)) { - eASSERT(env, !env->me_lck_mmap.lck || (env->me_flags & MDBX_NOTLS)); - return MDBX_EINVAL /* MDBX_NOTLS mode */; + eASSERT(env, env->me_flags & MDBX_NOSTICKYTHREADS); + return MDBX_EINVAL /* MDBX_NOSTICKYTHREADS mode */; } - eASSERT(env, (env->me_flags & (MDBX_NOTLS | MDBX_ENV_TXKEY | - MDBX_EXCLUSIVE)) == MDBX_ENV_TXKEY); + eASSERT(env, (env->me_flags & (MDBX_NOSTICKYTHREADS | MDBX_ENV_TXKEY)) == + MDBX_ENV_TXKEY); MDBX_reader *r = thread_rthc_get(env->me_txkey); if (unlikely(r != NULL)) { eASSERT(env, r->mr_pid.weak == env->me_pid); @@ -12611,7 +13112,7 @@ __cold int mdbx_thread_register(const MDBX_env *env) { } const uintptr_t tid = osal_thread_self(); - if (env->me_txn0 && unlikely(env->me_txn0->mt_owner == tid)) + if (env->me_txn && unlikely(env->me_txn0->mt_owner == tid)) return MDBX_TXN_OVERLAPPING; return bind_rslot((MDBX_env *)env, tid).err; } @@ -12625,12 +13126,12 @@ __cold int mdbx_thread_unregister(const MDBX_env *env) { return MDBX_RESULT_TRUE; if (unlikely((env->me_flags & MDBX_ENV_TXKEY) == 0)) { - eASSERT(env, !env->me_lck_mmap.lck || (env->me_flags & MDBX_NOTLS)); - return MDBX_RESULT_TRUE /* MDBX_NOTLS mode */; + eASSERT(env, env->me_flags & MDBX_NOSTICKYTHREADS); + return MDBX_RESULT_TRUE /* MDBX_NOSTICKYTHREADS mode */; } - eASSERT(env, (env->me_flags & (MDBX_NOTLS | MDBX_ENV_TXKEY | - MDBX_EXCLUSIVE)) == MDBX_ENV_TXKEY); + eASSERT(env, (env->me_flags & (MDBX_NOSTICKYTHREADS | MDBX_ENV_TXKEY)) == + MDBX_ENV_TXKEY); MDBX_reader *r = thread_rthc_get(env->me_txkey); if (unlikely(r == NULL)) return MDBX_RESULT_TRUE /* not registered */; @@ -12798,11 +13299,16 @@ __hot static int coherency_check_head(MDBX_txn *txn, const meta_ptr_t head, /* Copy the DB info and flags */ txn->mt_geo = head.ptr_v->mm_geo; memcpy(txn->mt_dbs, head.ptr_c->mm_dbs, CORE_DBS * sizeof(MDBX_db)); + VALGRIND_MAKE_MEM_UNDEFINED(txn->mt_dbs + CORE_DBS, + txn->mt_env->me_maxdbs - CORE_DBS); 
txn->mt_canary = head.ptr_v->mm_canary; if (unlikely(!coherency_check(txn->mt_env, head.txnid, txn->mt_dbs, head.ptr_v, *timestamp == 0))) return coherency_timeout(timestamp, -1, txn->mt_env); + + tASSERT(txn, txn->mt_dbs[FREE_DBI].md_flags == MDBX_INTEGERKEY); + tASSERT(txn, db_check_flags(txn->mt_dbs[MAIN_DBI].md_flags)); return MDBX_SUCCESS; } @@ -12826,6 +13332,9 @@ static int coherency_check_written(const MDBX_env *env, const txnid_t txnid, } if (unlikely(!coherency_check(env, head_txnid, meta->mm_dbs, meta, report))) return coherency_timeout(timestamp, pgno, env); + + eASSERT(env, meta->mm_dbs[FREE_DBI].md_flags == MDBX_INTEGERKEY); + eASSERT(env, db_check_flags(meta->mm_dbs[MAIN_DBI].md_flags)); return MDBX_SUCCESS; } @@ -12837,7 +13346,7 @@ static bool check_meta_coherency(const MDBX_env *env, } /* Common code for mdbx_txn_begin() and mdbx_txn_renew(). */ -static int txn_renew(MDBX_txn *txn, const unsigned flags) { +static int txn_renew(MDBX_txn *txn, unsigned flags) { MDBX_env *env = txn->mt_env; int rc; @@ -12862,18 +13371,19 @@ static int txn_renew(MDBX_txn *txn, const unsigned flags) { 0); const uintptr_t tid = osal_thread_self(); + flags |= env->me_flags & (MDBX_NOSTICKYTHREADS | MDBX_WRITEMAP); if (flags & MDBX_TXN_RDONLY) { - eASSERT(env, (flags & ~(MDBX_TXN_RO_BEGIN_FLAGS | MDBX_WRITEMAP)) == 0); - txn->mt_flags = - MDBX_TXN_RDONLY | (env->me_flags & (MDBX_NOTLS | MDBX_WRITEMAP)); + eASSERT(env, (flags & ~(MDBX_TXN_RO_BEGIN_FLAGS | MDBX_WRITEMAP | + MDBX_NOSTICKYTHREADS)) == 0); + txn->mt_flags = flags; MDBX_reader *r = txn->to.reader; STATIC_ASSERT(sizeof(uintptr_t) <= sizeof(r->mr_tid)); if (likely(env->me_flags & MDBX_ENV_TXKEY)) { - eASSERT(env, !(env->me_flags & MDBX_NOTLS)); + eASSERT(env, !(env->me_flags & MDBX_NOSTICKYTHREADS)); r = thread_rthc_get(env->me_txkey); if (likely(r)) { if (unlikely(!r->mr_pid.weak) && - (runtime_flags & MDBX_DBG_LEGACY_MULTIOPEN)) { + (mdbx_static.flags & MDBX_DBG_LEGACY_MULTIOPEN)) { 
thread_rthc_set(env->me_txkey, nullptr); r = nullptr; } else { @@ -12882,7 +13392,8 @@ static int txn_renew(MDBX_txn *txn, const unsigned flags) { } } } else { - eASSERT(env, !env->me_lck_mmap.lck || (env->me_flags & MDBX_NOTLS)); + eASSERT(env, + !env->me_lck_mmap.lck || (env->me_flags & MDBX_NOSTICKYTHREADS)); } if (likely(r)) { @@ -12896,6 +13407,7 @@ static int txn_renew(MDBX_txn *txn, const unsigned flags) { r = brs.rslot; } txn->to.reader = r; + STATIC_ASSERT(MDBX_TXN_RDONLY_PREPARE > MDBX_TXN_RDONLY); if (flags & (MDBX_TXN_RDONLY_PREPARE - MDBX_TXN_RDONLY)) { eASSERT(env, txn->mt_txnid == 0); eASSERT(env, txn->mt_owner == 0); @@ -12908,6 +13420,7 @@ static int txn_renew(MDBX_txn *txn, const unsigned flags) { txn->mt_flags = MDBX_TXN_RDONLY | MDBX_TXN_FINISHED; return MDBX_SUCCESS; } + txn->mt_owner = tid; /* Seek & fetch the last meta */ uint64_t timestamp = 0; @@ -12928,9 +13441,9 @@ static int txn_renew(MDBX_txn *txn, const unsigned flags) { mo_Relaxed); safe64_write(&r->mr_txnid, head.txnid); eASSERT(env, r->mr_pid.weak == osal_getpid()); - eASSERT(env, - r->mr_tid.weak == - ((env->me_flags & MDBX_NOTLS) ? 0 : osal_thread_self())); + eASSERT(env, r->mr_tid.weak == ((env->me_flags & MDBX_NOSTICKYTHREADS) + ? 
0 + : osal_thread_self())); eASSERT(env, r->mr_txnid.weak == head.txnid || (r->mr_txnid.weak >= SAFE64_INVALID_THRESHOLD && head.txnid < env->me_lck->mti_oldest_reader.weak)); @@ -12938,8 +13451,7 @@ static int txn_renew(MDBX_txn *txn, const unsigned flags) { mo_AcquireRelease); } else { /* exclusive mode without lck */ - eASSERT(env, !env->me_lck_mmap.lck && - env->me_lck == (void *)&env->x_lckless_stub); + eASSERT(env, !env->me_lck_mmap.lck && env->me_lck == lckless_stub(env)); } jitter4testing(true); @@ -12983,21 +13495,20 @@ static int txn_renew(MDBX_txn *txn, const unsigned flags) { rc = MDBX_CORRUPTED; goto bailout; } - eASSERT(env, txn->mt_txnid >= env->me_lck->mti_oldest_reader.weak); - txn->mt_dbxs = env->me_dbxs; /* mostly static anyway */ ENSURE(env, txn->mt_txnid >= /* paranoia is appropriate here */ env->me_lck ->mti_oldest_reader.weak); - txn->mt_numdbs = env->me_numdbs; + tASSERT(txn, txn->mt_dbs[FREE_DBI].md_flags == MDBX_INTEGERKEY); + tASSERT(txn, db_check_flags(txn->mt_dbs[MAIN_DBI].md_flags)); } else { eASSERT(env, (flags & ~(MDBX_TXN_RW_BEGIN_FLAGS | MDBX_TXN_SPILLS | - MDBX_WRITEMAP)) == 0); + MDBX_WRITEMAP | MDBX_NOSTICKYTHREADS)) == 0); if (unlikely(txn->mt_owner == tid || /* not recovery mode */ env->me_stuck_meta >= 0)) return MDBX_BUSY; MDBX_lockinfo *const lck = env->me_lck_mmap.lck; - if (lck && (env->me_flags & MDBX_NOTLS) == 0 && - (runtime_flags & MDBX_DBG_LEGACY_OVERLAP) == 0) { + if (lck && (env->me_flags & MDBX_NOSTICKYTHREADS) == 0 && + (mdbx_static.flags & MDBX_DBG_LEGACY_OVERLAP) == 0) { const size_t snap_nreaders = atomic_load32(&lck->mti_numreaders, mo_AcquireRelease); for (size_t i = 0; i < snap_nreaders; ++i) { @@ -13014,16 +13525,16 @@ static int txn_renew(MDBX_txn *txn, const unsigned flags) { /* Not yet touching txn == env->me_txn0, it may be active */ jitter4testing(false); - rc = mdbx_txn_lock(env, !!(flags & MDBX_TXN_TRY)); + rc = osal_txn_lock(env, !!(flags & MDBX_TXN_TRY)); if (unlikely(rc)) return rc; if 
(unlikely(env->me_flags & MDBX_FATAL_ERROR)) { - mdbx_txn_unlock(env); + osal_txn_unlock(env); return MDBX_PANIC; } #if defined(_WIN32) || defined(_WIN64) if (unlikely(!env->me_map)) { - mdbx_txn_unlock(env); + osal_txn_unlock(env); return MDBX_EPERM; } #endif /* Windows */ @@ -13046,6 +13557,8 @@ static int txn_renew(MDBX_txn *txn, const unsigned flags) { goto bailout; } + tASSERT(txn, txn->mt_dbs[FREE_DBI].md_flags == MDBX_INTEGERKEY); + tASSERT(txn, db_check_flags(txn->mt_dbs[MAIN_DBI].md_flags)); txn->mt_flags = flags; txn->mt_child = NULL; txn->tw.loose_pages = NULL; @@ -13056,12 +13569,11 @@ static int txn_renew(MDBX_txn *txn, const unsigned flags) { MDBX_PNL_SETSIZE(txn->tw.retired_pages, 0); txn->tw.spilled.list = NULL; txn->tw.spilled.least_removed = 0; + txn->tw.gc_time_acc = 0; txn->tw.last_reclaimed = 0; if (txn->tw.lifo_reclaimed) MDBX_PNL_SETSIZE(txn->tw.lifo_reclaimed, 0); env->me_txn = txn; - txn->mt_numdbs = env->me_numdbs; - memcpy(txn->mt_dbiseqs, env->me_dbiseqs, txn->mt_numdbs * sizeof(unsigned)); if ((txn->mt_flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC) { rc = dpl_alloc(txn); @@ -13079,24 +13591,98 @@ static int txn_renew(MDBX_txn *txn, const unsigned flags) { eASSERT(env, txn->tw.writemap_spilled_npages == 0); } - /* Setup db info */ - osal_compiler_barrier(); - memset(txn->mt_cursors, 0, sizeof(MDBX_cursor *) * txn->mt_numdbs); - for (size_t i = CORE_DBS; i < txn->mt_numdbs; i++) { - const unsigned db_flags = env->me_dbflags[i]; - txn->mt_dbs[i].md_flags = db_flags & DB_PERSISTENT_FLAGS; - txn->mt_dbistate[i] = - (db_flags & DB_VALID) ? 
DBI_VALID | DBI_USRVALID | DBI_STALE : 0; - } - txn->mt_dbistate[MAIN_DBI] = DBI_VALID | DBI_USRVALID; - rc = - setup_dbx(&txn->mt_dbxs[MAIN_DBI], &txn->mt_dbs[MAIN_DBI], env->me_psize); - if (unlikely(rc != MDBX_SUCCESS)) - goto bailout; - txn->mt_dbistate[FREE_DBI] = DBI_VALID; txn->mt_front = txn->mt_txnid + ((flags & (MDBX_WRITEMAP | MDBX_RDONLY)) == 0); + /* Setup db info */ + tASSERT(txn, txn->mt_dbs[FREE_DBI].md_flags == MDBX_INTEGERKEY); + tASSERT(txn, db_check_flags(txn->mt_dbs[MAIN_DBI].md_flags)); + VALGRIND_MAKE_MEM_UNDEFINED(txn->mt_dbi_state, env->me_maxdbs); +#if MDBX_ENABLE_DBI_SPARSE + txn->mt_numdbs = CORE_DBS; + VALGRIND_MAKE_MEM_UNDEFINED( + txn->mt_dbi_sparse, + ceil_powerof2(env->me_maxdbs, CHAR_BIT * sizeof(txn->mt_dbi_sparse[0])) / + CHAR_BIT); + txn->mt_dbi_sparse[0] = (1 << CORE_DBS) - 1; +#else + txn->mt_numdbs = (env->me_numdbs < 8) ? env->me_numdbs : 8; + if (txn->mt_numdbs > CORE_DBS) + memset(txn->mt_dbi_state + CORE_DBS, 0, txn->mt_numdbs - CORE_DBS); +#endif /* MDBX_ENABLE_DBI_SPARSE */ + txn->mt_dbi_state[FREE_DBI] = DBI_LINDO | DBI_VALID; + txn->mt_dbi_state[MAIN_DBI] = DBI_LINDO | DBI_VALID; + txn->mt_cursors[FREE_DBI] = nullptr; + txn->mt_cursors[MAIN_DBI] = nullptr; + txn->mt_dbi_seqs[FREE_DBI] = 0; + txn->mt_dbi_seqs[MAIN_DBI] = + atomic_load32(&env->me_dbi_seqs[MAIN_DBI], mo_AcquireRelease); + + if (unlikely(env->me_db_flags[MAIN_DBI] != + (DB_VALID | txn->mt_dbs[MAIN_DBI].md_flags))) { + const bool need_txn_lock = env->me_txn0 && env->me_txn0->mt_owner != tid; + bool should_unlock = false; + if (need_txn_lock) { + rc = osal_txn_lock(env, true); + if (rc == MDBX_SUCCESS) + should_unlock = true; + else if (rc != MDBX_BUSY && rc != MDBX_EDEADLK) + goto bailout; + } + rc = osal_fastmutex_acquire(&env->me_dbi_lock); + if (likely(rc == MDBX_SUCCESS)) { + uint32_t seq = dbi_seq_next(env, MAIN_DBI); + /* проверяем повторно после захвата блокировки */ + if (env->me_db_flags[MAIN_DBI] != + (DB_VALID | txn->mt_dbs[MAIN_DBI].md_flags)) 
{ + if (!need_txn_lock || should_unlock || + /* если нет активной пишущей транзакции, + * то следующая будет ждать на me_dbi_lock */ + !env->me_txn) { + if (env->me_db_flags[MAIN_DBI] != 0 || MDBX_DEBUG) + NOTICE("renew MainDB for %s-txn %" PRIaTXN + " since db-flags changes 0x%x -> 0x%x", + (txn->mt_flags & MDBX_TXN_RDONLY) ? "ro" : "rw", + txn->mt_txnid, env->me_db_flags[MAIN_DBI] & ~DB_VALID, + txn->mt_dbs[MAIN_DBI].md_flags); + env->me_db_flags[MAIN_DBI] = DB_POISON; + atomic_store32(&env->me_dbi_seqs[MAIN_DBI], seq, mo_AcquireRelease); + rc = setup_sdb(&env->me_dbxs[MAIN_DBI], &txn->mt_dbs[MAIN_DBI], + env->me_psize); + if (likely(rc == MDBX_SUCCESS)) { + seq = dbi_seq_next(env, MAIN_DBI); + env->me_db_flags[MAIN_DBI] = + DB_VALID | txn->mt_dbs[MAIN_DBI].md_flags; + txn->mt_dbi_seqs[MAIN_DBI] = atomic_store32( + &env->me_dbi_seqs[MAIN_DBI], seq, mo_AcquireRelease); + } + } else { + ERROR("MainDB db-flags changes 0x%x -> 0x%x ahead of read-txn " + "%" PRIaTXN, + txn->mt_dbs[MAIN_DBI].md_flags, + env->me_db_flags[MAIN_DBI] & ~DB_VALID, txn->mt_txnid); + rc = MDBX_INCOMPATIBLE; + } + } + ENSURE(env, osal_fastmutex_release(&env->me_dbi_lock) == MDBX_SUCCESS); + } else { + DEBUG("me_dbi_lock failed, err %d", rc); + } + if (should_unlock) + osal_txn_unlock(env); + if (unlikely(rc != MDBX_SUCCESS)) + goto bailout; + } + + if (unlikely(txn->mt_dbs[FREE_DBI].md_flags != MDBX_INTEGERKEY)) { + ERROR("unexpected/invalid db-flags 0x%u for GC/FreeDB", + txn->mt_dbs[FREE_DBI].md_flags); + rc = MDBX_INCOMPATIBLE; + goto bailout; + } + + tASSERT(txn, txn->mt_dbs[FREE_DBI].md_flags == MDBX_INTEGERKEY); + tASSERT(txn, db_check_flags(txn->mt_dbs[MAIN_DBI].md_flags)); if (unlikely(env->me_flags & MDBX_FATAL_ERROR)) { WARNING("%s", "environment had fatal error, must shutdown!"); rc = MDBX_PANIC; @@ -13181,34 +13767,27 @@ static int txn_renew(MDBX_txn *txn, const unsigned flags) { * since Wine don't support section extending, * i.e. in both cases unmap+map are required. 
*/ used_bytes < env->me_dbgeo.upper && env->me_dbgeo.grow)) && - /* avoid recursive use SRW */ (txn->mt_flags & MDBX_NOTLS) == 0) { + /* avoid recursive use SRW */ (txn->mt_flags & + MDBX_NOSTICKYTHREADS) == 0) { txn->mt_flags |= MDBX_SHRINK_ALLOWED; osal_srwlock_AcquireShared(&env->me_remap_guard); } #endif /* Windows */ } else { - if (unlikely(txn->mt_dbs[FREE_DBI].md_flags != MDBX_INTEGERKEY)) { - ERROR("unexpected/invalid db-flags 0x%u for GC/FreeDB", - txn->mt_dbs[FREE_DBI].md_flags); - rc = MDBX_INCOMPATIBLE; - goto bailout; - } - tASSERT(txn, txn == env->me_txn0); MDBX_cursor *const gc = ptr_disp(txn, sizeof(MDBX_txn)); rc = cursor_init(gc, txn, FREE_DBI); if (rc != MDBX_SUCCESS) goto bailout; } -#if defined(MDBX_USE_VALGRIND) || defined(__SANITIZE_ADDRESS__) +#if defined(ENABLE_MEMCHECK) || defined(__SANITIZE_ADDRESS__) txn_valgrind(env, txn); -#endif /* MDBX_USE_VALGRIND || __SANITIZE_ADDRESS__ */ - txn->mt_owner = tid; +#endif /* ENABLE_MEMCHECK || __SANITIZE_ADDRESS__ */ return MDBX_SUCCESS; } bailout: tASSERT(txn, rc != MDBX_SUCCESS); - txn_end(txn, MDBX_END_SLOT | MDBX_END_FAIL_BEGIN); + txn_end(txn, TXN_END_SLOT | TXN_END_FAIL_BEGIN); return rc; } @@ -13223,15 +13802,13 @@ static __always_inline int check_txn(const MDBX_txn *txn, int bad_bits) { return MDBX_BAD_TXN; tASSERT(txn, (txn->mt_flags & MDBX_TXN_FINISHED) || - (txn->mt_flags & MDBX_NOTLS) == - ((txn->mt_flags & MDBX_TXN_RDONLY) - ? 
txn->mt_env->me_flags & MDBX_NOTLS - : 0)); + (txn->mt_flags & MDBX_NOSTICKYTHREADS) == + (txn->mt_env->me_flags & MDBX_NOSTICKYTHREADS)); #if MDBX_TXN_CHECKOWNER - STATIC_ASSERT(MDBX_NOTLS > MDBX_TXN_FINISHED + MDBX_TXN_RDONLY); - if (unlikely(txn->mt_owner != osal_thread_self()) && - (txn->mt_flags & (MDBX_NOTLS | MDBX_TXN_FINISHED | MDBX_TXN_RDONLY)) < - (MDBX_TXN_FINISHED | MDBX_TXN_RDONLY)) + STATIC_ASSERT((long)MDBX_NOSTICKYTHREADS > (long)MDBX_TXN_FINISHED); + if ((txn->mt_flags & (MDBX_NOSTICKYTHREADS | MDBX_TXN_FINISHED)) < + MDBX_TXN_FINISHED && + unlikely(txn->mt_owner != osal_thread_self())) return txn->mt_owner ? MDBX_THREAD_MISMATCH : MDBX_BAD_TXN; #endif /* MDBX_TXN_CHECKOWNER */ @@ -13312,8 +13889,6 @@ int mdbx_txn_begin_ex(MDBX_env *env, MDBX_txn *parent, MDBX_txn_flags_t flags, ~flags)) /* write txn in RDONLY env */ return MDBX_EACCESS; - flags |= env->me_flags & MDBX_WRITEMAP; - MDBX_txn *txn = nullptr; if (parent) { /* Nested transactions: Max 1 child, write txns only, no writemap */ @@ -13332,11 +13907,12 @@ int mdbx_txn_begin_ex(MDBX_env *env, MDBX_txn *parent, MDBX_txn_flags_t flags, } tASSERT(parent, audit_ex(parent, 0, false) == 0); - flags |= parent->mt_flags & (MDBX_TXN_RW_BEGIN_FLAGS | MDBX_TXN_SPILLS); + flags |= parent->mt_flags & (MDBX_TXN_RW_BEGIN_FLAGS | MDBX_TXN_SPILLS | + MDBX_NOSTICKYTHREADS | MDBX_WRITEMAP); } else if (flags & MDBX_TXN_RDONLY) { - if (env->me_txn0 && + if ((env->me_flags & MDBX_NOSTICKYTHREADS) == 0 && env->me_txn && unlikely(env->me_txn0->mt_owner == osal_thread_self()) && - (runtime_flags & MDBX_DBG_LEGACY_OVERLAP) == 0) + (mdbx_static.flags & MDBX_DBG_LEGACY_OVERLAP) == 0) return MDBX_TXN_OVERLAPPING; } else { /* Reuse preallocated write txn. 
However, do not touch it until @@ -13345,11 +13921,24 @@ int mdbx_txn_begin_ex(MDBX_env *env, MDBX_txn *parent, MDBX_txn_flags_t flags, goto renew; } + const intptr_t bitmap_bytes = +#if MDBX_ENABLE_DBI_SPARSE + ceil_powerof2(env->me_maxdbs, CHAR_BIT * sizeof(txn->mt_dbi_sparse[0])) / + CHAR_BIT; +#else + 0; +#endif /* MDBX_ENABLE_DBI_SPARSE */ + STATIC_ASSERT(sizeof(txn->tw) > sizeof(txn->to)); const size_t base = (flags & MDBX_TXN_RDONLY) ? sizeof(MDBX_txn) - sizeof(txn->tw) + sizeof(txn->to) : sizeof(MDBX_txn); const size_t size = - base + env->me_maxdbs * (sizeof(MDBX_db) + sizeof(MDBX_cursor *) + 1); + base + + ((flags & MDBX_TXN_RDONLY) + ? (size_t)bitmap_bytes + env->me_maxdbs * sizeof(txn->mt_dbi_seqs[0]) + : 0) + + env->me_maxdbs * (sizeof(txn->mt_dbs[0]) + sizeof(txn->mt_cursors[0]) + + sizeof(txn->mt_dbi_state[0])); txn = osal_malloc(size); if (unlikely(txn == nullptr)) { DEBUG("calloc: %s", "failed"); @@ -13363,18 +13952,22 @@ int mdbx_txn_begin_ex(MDBX_env *env, MDBX_txn *parent, MDBX_txn_flags_t flags, memset(txn, 0, (MDBX_GOOFY_MSVC_STATIC_ANALYZER && base > size) ? 
size : base); txn->mt_dbs = ptr_disp(txn, base); - txn->mt_cursors = ptr_disp(txn->mt_dbs, sizeof(MDBX_db) * env->me_maxdbs); + txn->mt_cursors = + ptr_disp(txn->mt_dbs, env->me_maxdbs * sizeof(txn->mt_dbs[0])); #if MDBX_DEBUG txn->mt_cursors[FREE_DBI] = nullptr; /* avoid SIGSEGV in an assertion later */ -#endif /* MDBX_DEBUG */ - txn->mt_dbistate = ptr_disp(txn, size - env->me_maxdbs); - txn->mt_dbxs = env->me_dbxs; /* static */ +#endif + txn->mt_dbi_state = + ptr_disp(txn, size - env->me_maxdbs * sizeof(txn->mt_dbi_state[0])); txn->mt_flags = flags; txn->mt_env = env; if (parent) { tASSERT(parent, dirtylist_check(parent)); - txn->mt_dbiseqs = parent->mt_dbiseqs; +#if MDBX_ENABLE_DBI_SPARSE + txn->mt_dbi_sparse = parent->mt_dbi_sparse; +#endif /* MDBX_ENABLE_DBI_SPARSE */ + txn->mt_dbi_seqs = parent->mt_dbi_seqs; txn->mt_geo = parent->mt_geo; rc = dpl_alloc(txn); if (likely(rc == MDBX_SUCCESS)) { @@ -13431,6 +14024,7 @@ int mdbx_txn_begin_ex(MDBX_env *env, MDBX_txn *parent, MDBX_txn_flags_t flags, = parent->mt_next_pgno) - MDBX_ENABLE_REFUND)); + txn->tw.gc_time_acc = parent->tw.gc_time_acc; txn->tw.last_reclaimed = parent->tw.last_reclaimed; if (parent->tw.lifo_reclaimed) { txn->tw.lifo_reclaimed = parent->tw.lifo_reclaimed; @@ -13451,14 +14045,19 @@ int mdbx_txn_begin_ex(MDBX_env *env, MDBX_txn *parent, MDBX_txn_flags_t flags, parent->mt_flags |= MDBX_TXN_HAS_CHILD; parent->mt_child = txn; txn->mt_parent = parent; - txn->mt_numdbs = parent->mt_numdbs; txn->mt_owner = parent->mt_owner; - memcpy(txn->mt_dbs, parent->mt_dbs, txn->mt_numdbs * sizeof(MDBX_db)); txn->tw.troika = parent->tw.troika; - /* Copy parent's mt_dbistate, but clear DB_NEW */ - for (size_t i = 0; i < txn->mt_numdbs; i++) - txn->mt_dbistate[i] = - parent->mt_dbistate[i] & ~(DBI_FRESH | DBI_CREAT | DBI_DIRTY); + + txn->mt_cursors[FREE_DBI] = nullptr; + txn->mt_cursors[MAIN_DBI] = nullptr; + txn->mt_dbi_state[FREE_DBI] = + parent->mt_dbi_state[FREE_DBI] & ~(DBI_FRESH | DBI_CREAT | DBI_DIRTY); + 
txn->mt_dbi_state[MAIN_DBI] = + parent->mt_dbi_state[MAIN_DBI] & ~(DBI_FRESH | DBI_CREAT | DBI_DIRTY); + memset(txn->mt_dbi_state + CORE_DBS, 0, + (txn->mt_numdbs = parent->mt_numdbs) - CORE_DBS); + memcpy(txn->mt_dbs, parent->mt_dbs, sizeof(txn->mt_dbs[0]) * CORE_DBS); + tASSERT(parent, parent->tw.dirtyroom + parent->tw.dirtylist->length == (parent->mt_parent ? parent->mt_parent->tw.dirtyroom @@ -13467,15 +14066,22 @@ int mdbx_txn_begin_ex(MDBX_env *env, MDBX_txn *parent, MDBX_txn_flags_t flags, (txn->mt_parent ? txn->mt_parent->tw.dirtyroom : txn->mt_env->me_options.dp_limit)); env->me_txn = txn; - rc = cursor_shadow(parent, txn); + tASSERT(parent, parent->mt_cursors[FREE_DBI] == nullptr); + rc = parent->mt_cursors[MAIN_DBI] + ? cursor_shadow(parent->mt_cursors[MAIN_DBI], txn, MAIN_DBI) + : MDBX_SUCCESS; if (AUDIT_ENABLED() && ASSERT_ENABLED()) { txn->mt_signature = MDBX_MT_SIGNATURE; tASSERT(txn, audit_ex(txn, 0, false) == 0); } if (unlikely(rc != MDBX_SUCCESS)) - txn_end(txn, MDBX_END_FAIL_BEGINCHILD); + txn_end(txn, TXN_END_FAIL_BEGINCHILD); } else { /* MDBX_TXN_RDONLY */ - txn->mt_dbiseqs = env->me_dbiseqs; + txn->mt_dbi_seqs = + ptr_disp(txn->mt_cursors, env->me_maxdbs * sizeof(txn->mt_cursors[0])); +#if MDBX_ENABLE_DBI_SPARSE + txn->mt_dbi_sparse = ptr_disp(txn->mt_dbi_state, -bitmap_bytes); +#endif /* MDBX_ENABLE_DBI_SPARSE */ renew: rc = txn_renew(txn, flags); } @@ -13488,12 +14094,13 @@ int mdbx_txn_begin_ex(MDBX_env *env, MDBX_txn *parent, MDBX_txn_flags_t flags, eASSERT(env, txn->mt_flags == (MDBX_TXN_RDONLY | MDBX_TXN_FINISHED)); else if (flags & MDBX_TXN_RDONLY) eASSERT(env, (txn->mt_flags & - ~(MDBX_NOTLS | MDBX_TXN_RDONLY | MDBX_WRITEMAP | + ~(MDBX_NOSTICKYTHREADS | MDBX_TXN_RDONLY | MDBX_WRITEMAP | /* Win32: SRWL flag */ MDBX_SHRINK_ALLOWED)) == 0); else { - eASSERT(env, (txn->mt_flags & - ~(MDBX_WRITEMAP | MDBX_SHRINK_ALLOWED | MDBX_NOMETASYNC | - MDBX_SAFE_NOSYNC | MDBX_TXN_SPILLS)) == 0); + eASSERT(env, + (txn->mt_flags & + 
~(MDBX_NOSTICKYTHREADS | MDBX_WRITEMAP | MDBX_SHRINK_ALLOWED | + MDBX_NOMETASYNC | MDBX_SAFE_NOSYNC | MDBX_TXN_SPILLS)) == 0); assert(!txn->tw.spilled.list && !txn->tw.spilled.least_removed); } txn->mt_signature = MDBX_MT_SIGNATURE; @@ -13645,141 +14252,17 @@ uint64_t mdbx_txn_id(const MDBX_txn *txn) { } int mdbx_txn_flags(const MDBX_txn *txn) { - if (unlikely(!txn || txn->mt_signature != MDBX_MT_SIGNATURE)) { - assert((-1 & (int)MDBX_TXN_INVALID) != 0); - return -1; - } + STATIC_ASSERT( + (MDBX_TXN_INVALID & + (MDBX_TXN_FINISHED | MDBX_TXN_ERROR | MDBX_TXN_DIRTY | MDBX_TXN_SPILLS | + MDBX_TXN_HAS_CHILD | MDBX_TXN_DRAINED_GC | MDBX_SHRINK_ALLOWED | + MDBX_TXN_RW_BEGIN_FLAGS | MDBX_TXN_RO_BEGIN_FLAGS)) == 0); + if (unlikely(!txn || txn->mt_signature != MDBX_MT_SIGNATURE)) + return MDBX_TXN_INVALID; assert(0 == (int)(txn->mt_flags & MDBX_TXN_INVALID)); return txn->mt_flags; } -/* Check for misused dbi handles */ -static __inline bool dbi_changed(const MDBX_txn *txn, size_t dbi) { - if (txn->mt_dbiseqs == txn->mt_env->me_dbiseqs) - return false; - if (likely( - txn->mt_dbiseqs[dbi].weak == - atomic_load32((MDBX_atomic_uint32_t *)&txn->mt_env->me_dbiseqs[dbi], - mo_AcquireRelease))) - return false; - return true; -} - -static __inline unsigned dbi_seq(const MDBX_env *const env, size_t slot) { - unsigned v = env->me_dbiseqs[slot].weak + 1; - return v + (v == 0); -} - -static void dbi_import_locked(MDBX_txn *txn) { - const MDBX_env *const env = txn->mt_env; - size_t n = env->me_numdbs; - for (size_t i = CORE_DBS; i < n; ++i) { - if (i >= txn->mt_numdbs) { - txn->mt_cursors[i] = NULL; - if (txn->mt_dbiseqs != env->me_dbiseqs) - txn->mt_dbiseqs[i].weak = 0; - txn->mt_dbistate[i] = 0; - } - if ((dbi_changed(txn, i) && - (txn->mt_dbistate[i] & (DBI_CREAT | DBI_DIRTY | DBI_FRESH)) == 0) || - ((env->me_dbflags[i] & DB_VALID) && - !(txn->mt_dbistate[i] & DBI_VALID))) { - tASSERT(txn, - (txn->mt_dbistate[i] & (DBI_CREAT | DBI_DIRTY | DBI_FRESH)) == 0); - txn->mt_dbiseqs[i] = 
env->me_dbiseqs[i]; - txn->mt_dbs[i].md_flags = env->me_dbflags[i] & DB_PERSISTENT_FLAGS; - txn->mt_dbistate[i] = 0; - if (env->me_dbflags[i] & DB_VALID) { - txn->mt_dbistate[i] = DBI_VALID | DBI_USRVALID | DBI_STALE; - tASSERT(txn, txn->mt_dbxs[i].md_cmp != NULL); - tASSERT(txn, txn->mt_dbxs[i].md_name.iov_base != NULL); - } - } - } - while (unlikely(n < txn->mt_numdbs)) - if (txn->mt_cursors[txn->mt_numdbs - 1] == NULL && - (txn->mt_dbistate[txn->mt_numdbs - 1] & DBI_USRVALID) == 0) - txn->mt_numdbs -= 1; - else { - if ((txn->mt_dbistate[n] & DBI_USRVALID) == 0) { - if (txn->mt_dbiseqs != env->me_dbiseqs) - txn->mt_dbiseqs[n].weak = 0; - txn->mt_dbistate[n] = 0; - } - ++n; - } - txn->mt_numdbs = (MDBX_dbi)n; -} - -/* Import DBI which opened after txn started into context */ -__cold static bool dbi_import(MDBX_txn *txn, MDBX_dbi dbi) { - if (dbi < CORE_DBS || - (dbi >= txn->mt_numdbs && dbi >= txn->mt_env->me_numdbs)) - return false; - - ENSURE(txn->mt_env, - osal_fastmutex_acquire(&txn->mt_env->me_dbi_lock) == MDBX_SUCCESS); - dbi_import_locked(txn); - ENSURE(txn->mt_env, - osal_fastmutex_release(&txn->mt_env->me_dbi_lock) == MDBX_SUCCESS); - return txn->mt_dbistate[dbi] & DBI_USRVALID; -} - -/* Export or close DBI handles opened in this txn. 
*/ -static void dbi_update(MDBX_txn *txn, int keep) { - tASSERT(txn, !txn->mt_parent && txn == txn->mt_env->me_txn0); - MDBX_dbi n = txn->mt_numdbs; - if (n) { - bool locked = false; - MDBX_env *const env = txn->mt_env; - - for (size_t i = n; --i >= CORE_DBS;) { - if (likely((txn->mt_dbistate[i] & DBI_CREAT) == 0)) - continue; - if (!locked) { - ENSURE(env, osal_fastmutex_acquire(&env->me_dbi_lock) == MDBX_SUCCESS); - locked = true; - } - if (env->me_numdbs <= i || - txn->mt_dbiseqs[i].weak != env->me_dbiseqs[i].weak) - continue /* dbi explicitly closed and/or then re-opened by other txn */; - if (keep) { - env->me_dbflags[i] = txn->mt_dbs[i].md_flags | DB_VALID; - } else { - const MDBX_val name = env->me_dbxs[i].md_name; - if (name.iov_base) { - env->me_dbxs[i].md_name.iov_base = nullptr; - eASSERT(env, env->me_dbflags[i] == 0); - atomic_store32(&env->me_dbiseqs[i], dbi_seq(env, i), - mo_AcquireRelease); - env->me_dbxs[i].md_name.iov_len = 0; - if (name.iov_len) - osal_free(name.iov_base); - } else { - eASSERT(env, name.iov_len == 0); - eASSERT(env, env->me_dbflags[i] == 0); - } - } - } - - n = env->me_numdbs; - if (n > CORE_DBS && unlikely(!(env->me_dbflags[n - 1] & DB_VALID))) { - if (!locked) { - ENSURE(env, osal_fastmutex_acquire(&env->me_dbi_lock) == MDBX_SUCCESS); - locked = true; - } - - n = env->me_numdbs; - while (n > CORE_DBS && !(env->me_dbflags[n - 1] & DB_VALID)) - --n; - env->me_numdbs = n; - } - - if (unlikely(locked)) - ENSURE(env, osal_fastmutex_release(&env->me_dbi_lock) == MDBX_SUCCESS); - } -} - /* Filter-out pgno list from transaction's dirty-page list */ static void dpl_sift(MDBX_txn *const txn, MDBX_PNL pl, const bool spilled) { tASSERT(txn, (txn->mt_flags & MDBX_TXN_RDONLY) == 0); @@ -13854,22 +14337,15 @@ static void dpl_sift(MDBX_txn *const txn, MDBX_PNL pl, const bool spilled) { * [in] mode why and how to end the transaction */ static int txn_end(MDBX_txn *txn, const unsigned mode) { MDBX_env *env = txn->mt_env; - static const char 
*const names[] = MDBX_END_NAMES; - -#if MDBX_ENV_CHECKPID - if (unlikely(txn->mt_env->me_pid != osal_getpid())) { - env->me_flags |= MDBX_FATAL_ERROR; - return MDBX_PANIC; - } -#endif /* MDBX_ENV_CHECKPID */ + static const char *const names[] = TXN_END_NAMES; - DEBUG("%s txn %" PRIaTXN "%c %p on mdbenv %p, root page %" PRIaPGNO + DEBUG("%s txn %" PRIaTXN "%c %p on env %p, root page %" PRIaPGNO "/%" PRIaPGNO, - names[mode & MDBX_END_OPMASK], txn->mt_txnid, + names[mode & TXN_END_OPMASK], txn->mt_txnid, (txn->mt_flags & MDBX_TXN_RDONLY) ? 'r' : 'w', (void *)txn, (void *)env, txn->mt_dbs[MAIN_DBI].md_root, txn->mt_dbs[FREE_DBI].md_root); - if (!(mode & MDBX_END_EOTDONE)) /* !(already closed cursors) */ + if (!(mode & TXN_END_EOTDONE)) /* !(already closed cursors) */ cursors_eot(txn, false); int rc = MDBX_SUCCESS; @@ -13884,11 +14360,9 @@ static int txn_end(MDBX_txn *txn, const unsigned mode) { eASSERT(env, txn->mt_txnid == slot->mr_txnid.weak && slot->mr_txnid.weak >= env->me_lck->mti_oldest_reader.weak); -#if defined(MDBX_USE_VALGRIND) || defined(__SANITIZE_ADDRESS__) - atomic_add32(&env->me_ignore_EDEADLK, 1); +#if defined(ENABLE_MEMCHECK) || defined(__SANITIZE_ADDRESS__) txn_valgrind(env, nullptr); - atomic_sub32(&env->me_ignore_EDEADLK, 1); -#endif /* MDBX_USE_VALGRIND || __SANITIZE_ADDRESS__ */ +#endif /* ENABLE_MEMCHECK || __SANITIZE_ADDRESS__ */ atomic_store32(&slot->mr_snapshot_pages_used, 0, mo_Relaxed); safe64_reset(&slot->mr_txnid, false); atomic_store32(&env->me_lck->mti_readers_refresh_flag, true, @@ -13897,7 +14371,7 @@ static int txn_end(MDBX_txn *txn, const unsigned mode) { eASSERT(env, slot->mr_pid.weak == env->me_pid); eASSERT(env, slot->mr_txnid.weak >= SAFE64_INVALID_THRESHOLD); } - if (mode & MDBX_END_SLOT) { + if (mode & TXN_END_SLOT) { if ((env->me_flags & MDBX_ENV_TXKEY) == 0) atomic_store32(&slot->mr_pid, 0, mo_Relaxed); txn->to.reader = NULL; @@ -13914,26 +14388,25 @@ static int txn_end(MDBX_txn *txn, const unsigned mode) { ENSURE(env, 
txn->mt_txnid >= /* paranoia is appropriate here */ env->me_lck ->mti_oldest_reader.weak); -#if defined(MDBX_USE_VALGRIND) || defined(__SANITIZE_ADDRESS__) +#if defined(ENABLE_MEMCHECK) || defined(__SANITIZE_ADDRESS__) if (txn == env->me_txn0) txn_valgrind(env, nullptr); -#endif /* MDBX_USE_VALGRIND || __SANITIZE_ADDRESS__ */ +#endif /* ENABLE_MEMCHECK || __SANITIZE_ADDRESS__ */ txn->mt_flags = MDBX_TXN_FINISHED; - txn->mt_owner = 0; env->me_txn = txn->mt_parent; pnl_free(txn->tw.spilled.list); txn->tw.spilled.list = nullptr; if (txn == env->me_txn0) { eASSERT(env, txn->mt_parent == NULL); /* Export or close DBI handles created in this txn */ - dbi_update(txn, mode & MDBX_END_UPDATE); + rc = dbi_update(txn, mode & TXN_END_UPDATE); pnl_shrink(&txn->tw.retired_pages); pnl_shrink(&txn->tw.relist); if (!(env->me_flags & MDBX_WRITEMAP)) dlist_free(txn); /* The writer mutex was locked in mdbx_txn_begin. */ - mdbx_txn_unlock(env); + osal_txn_unlock(env); } else { eASSERT(env, txn->mt_parent != NULL); MDBX_txn *const parent = txn->mt_parent; @@ -13945,6 +14418,7 @@ static int txn_end(MDBX_txn *txn, const unsigned mode) { eASSERT(env, memcmp(&txn->tw.troika, &parent->tw.troika, sizeof(meta_troika_t)) == 0); + txn->mt_owner = 0; if (txn->tw.lifo_reclaimed) { eASSERT(env, MDBX_PNL_GETSIZE(txn->tw.lifo_reclaimed) >= (uintptr_t)parent->tw.lifo_reclaimed); @@ -13999,7 +14473,7 @@ static int txn_end(MDBX_txn *txn, const unsigned mode) { } eASSERT(env, txn == env->me_txn0 || txn->mt_owner == 0); - if ((mode & MDBX_END_FREE) != 0 && txn != env->me_txn0) { + if ((mode & TXN_END_FREE) != 0 && txn != env->me_txn0) { txn->mt_signature = 0; osal_free(txn); } @@ -14017,7 +14491,7 @@ int mdbx_txn_reset(MDBX_txn *txn) { return MDBX_EINVAL; /* LY: don't close DBI-handles */ - rc = txn_end(txn, MDBX_END_RESET | MDBX_END_UPDATE); + rc = txn_end(txn, TXN_END_RESET | TXN_END_UPDATE); if (rc == MDBX_SUCCESS) { tASSERT(txn, txn->mt_signature == MDBX_MT_SIGNATURE); tASSERT(txn, txn->mt_owner == 
0); @@ -14038,30 +14512,76 @@ int mdbx_txn_break(MDBX_txn *txn) { return MDBX_SUCCESS; } -int mdbx_txn_abort(MDBX_txn *txn) { - int rc = check_txn(txn, 0); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; - +static int txn_abort(MDBX_txn *txn) { if (txn->mt_flags & MDBX_TXN_RDONLY) /* LY: don't close DBI-handles */ - return txn_end(txn, MDBX_END_ABORT | MDBX_END_UPDATE | MDBX_END_SLOT | - MDBX_END_FREE); + return txn_end(txn, TXN_END_ABORT | TXN_END_UPDATE | TXN_END_SLOT | + TXN_END_FREE); if (unlikely(txn->mt_flags & MDBX_TXN_FINISHED)) return MDBX_BAD_TXN; if (txn->mt_child) - mdbx_txn_abort(txn->mt_child); + txn_abort(txn->mt_child); tASSERT(txn, (txn->mt_flags & MDBX_TXN_ERROR) || dirtylist_check(txn)); - return txn_end(txn, MDBX_END_ABORT | MDBX_END_SLOT | MDBX_END_FREE); + return txn_end(txn, TXN_END_ABORT | TXN_END_SLOT | TXN_END_FREE); +} + +int mdbx_txn_abort(MDBX_txn *txn) { + int rc = check_txn(txn, 0); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; + + rc = check_env(txn->mt_env, true); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; + + if ((txn->mt_flags & (MDBX_TXN_RDONLY | MDBX_NOSTICKYTHREADS)) == + MDBX_NOSTICKYTHREADS && + unlikely(txn->mt_owner != osal_thread_self())) { + mdbx_txn_break(txn); + return MDBX_THREAD_MISMATCH; + } + + return txn_abort(txn); +} + +__cold static MDBX_db *audit_db_dig(const MDBX_txn *txn, const size_t dbi, + MDBX_db *fallback) { + const MDBX_txn *dig = txn; + do { + tASSERT(txn, txn->mt_numdbs == dig->mt_numdbs); + const uint8_t state = dbi_state(dig, dbi); + if (state & DBI_LINDO) + switch (state & (DBI_VALID | DBI_STALE | DBI_OLDEN)) { + case DBI_VALID: + case DBI_OLDEN: + return dig->mt_dbs + dbi; + case 0: + return nullptr; + case DBI_VALID | DBI_STALE: + case DBI_OLDEN | DBI_STALE: + break; + default: + tASSERT(txn, !!"unexpected dig->mt_dbi_state[dbi]"); + } + dig = dig->mt_parent; + } while (dig); + return fallback; +} + +static size_t audit_db_used(const MDBX_db *db) { + return db ? 
(size_t)db->md_branch_pages + (size_t)db->md_leaf_pages + + (size_t)db->md_overflow_pages + : 0; } /* Count all the pages in each DB and in the GC and make sure * it matches the actual number of pages being used. */ -__cold static int audit_ex(MDBX_txn *txn, size_t retired_stored, - bool dont_filter_gc) { +__cold static int audit_ex_locked(MDBX_txn *txn, size_t retired_stored, + bool dont_filter_gc) { + const MDBX_env *const env = txn->mt_env; size_t pending = 0; if ((txn->mt_flags & MDBX_TXN_RDONLY) == 0) pending = txn->tw.loose_count + MDBX_PNL_GETSIZE(txn->tw.relist) + @@ -14076,8 +14596,11 @@ __cold static int audit_ex(MDBX_txn *txn, size_t retired_stored, MDBX_val key, data; while ((rc = cursor_get(&cx.outer, &key, &data, MDBX_NEXT)) == 0) { if (!dont_filter_gc) { - if (unlikely(key.iov_len != sizeof(txnid_t))) + if (unlikely(key.iov_len != sizeof(txnid_t))) { + ERROR("%s/%d: %s %u", "MDBX_CORRUPTED", MDBX_CORRUPTED, + "invalid GC-key size", (unsigned)key.iov_len); return MDBX_CORRUPTED; + } txnid_t id = unaligned_peek_u64(4, key.iov_base); if (txn->tw.lifo_reclaimed) { for (size_t i = 1; i <= MDBX_PNL_GETSIZE(txn->tw.lifo_reclaimed); ++i) @@ -14092,79 +14615,69 @@ __cold static int audit_ex(MDBX_txn *txn, size_t retired_stored, } tASSERT(txn, rc == MDBX_NOTFOUND); - for (size_t i = FREE_DBI; i < txn->mt_numdbs; i++) - txn->mt_dbistate[i] &= ~DBI_AUDITED; + const size_t done_bitmap_size = (txn->mt_numdbs + CHAR_BIT - 1) / CHAR_BIT; + uint8_t *const done_bitmap = alloca(done_bitmap_size); + memset(done_bitmap, 0, done_bitmap_size); + if (txn->mt_parent) { + tASSERT(txn, txn->mt_numdbs == txn->mt_parent->mt_numdbs && + txn->mt_numdbs == txn->mt_env->me_txn->mt_numdbs); +#if MDBX_ENABLE_DBI_SPARSE + tASSERT(txn, txn->mt_dbi_sparse == txn->mt_parent->mt_dbi_sparse && + txn->mt_dbi_sparse == txn->mt_env->me_txn->mt_dbi_sparse); +#endif /* MDBX_ENABLE_DBI_SPARSE */ + } + + size_t used = NUM_METAS + + audit_db_used(audit_db_dig(txn, FREE_DBI, nullptr)) + + 
audit_db_used(audit_db_dig(txn, MAIN_DBI, nullptr)); + rc = cursor_init(&cx.outer, txn, MAIN_DBI); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; - size_t used = NUM_METAS; - for (size_t i = FREE_DBI; i <= MAIN_DBI; i++) { - if (!(txn->mt_dbistate[i] & DBI_VALID)) - continue; - rc = cursor_init(&cx.outer, txn, i); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; - txn->mt_dbistate[i] |= DBI_AUDITED; - if (txn->mt_dbs[i].md_root == P_INVALID) - continue; - used += (size_t)txn->mt_dbs[i].md_branch_pages + - (size_t)txn->mt_dbs[i].md_leaf_pages + - (size_t)txn->mt_dbs[i].md_overflow_pages; + for (rc = page_search(&cx.outer, NULL, MDBX_PS_FIRST); rc == MDBX_SUCCESS; + rc = cursor_sibling(&cx.outer, SIBLING_RIGHT)) { + MDBX_page *mp = cx.outer.mc_pg[cx.outer.mc_top]; + for (size_t k = 0; k < page_numkeys(mp); k++) { + MDBX_node *node = page_node(mp, k); + if (node_flags(node) != F_SUBDATA) + continue; + if (unlikely(node_ds(node) != sizeof(MDBX_db))) { + ERROR("%s/%d: %s %u", "MDBX_CORRUPTED", MDBX_CORRUPTED, + "invalid dupsort sub-tree node size", (unsigned)node_ds(node)); + return MDBX_CORRUPTED; + } - if (i != MAIN_DBI) - continue; - rc = page_search(&cx.outer, NULL, MDBX_PS_FIRST); - while (rc == MDBX_SUCCESS) { - MDBX_page *mp = cx.outer.mc_pg[cx.outer.mc_top]; - for (size_t j = 0; j < page_numkeys(mp); j++) { - MDBX_node *node = page_node(mp, j); - if (node_flags(node) == F_SUBDATA) { - if (unlikely(node_ds(node) != sizeof(MDBX_db))) - return MDBX_CORRUPTED; - MDBX_db db_copy, *db; - memcpy(db = &db_copy, node_data(node), sizeof(db_copy)); - if ((txn->mt_flags & MDBX_TXN_RDONLY) == 0) { - for (MDBX_dbi k = txn->mt_numdbs; --k > MAIN_DBI;) { - if ((txn->mt_dbistate[k] & DBI_VALID) && - /* txn->mt_dbxs[k].md_name.iov_base && */ - node_ks(node) == txn->mt_dbxs[k].md_name.iov_len && - memcmp(node_key(node), txn->mt_dbxs[k].md_name.iov_base, - node_ks(node)) == 0) { - txn->mt_dbistate[k] |= DBI_AUDITED; - if (!(txn->mt_dbistate[k] & MDBX_DBI_STALE)) - db = 
txn->mt_dbs + k; - break; - } - } - } - used += (size_t)db->md_branch_pages + (size_t)db->md_leaf_pages + - (size_t)db->md_overflow_pages; - } + MDBX_db reside; + const MDBX_db *db = memcpy(&reside, node_data(node), sizeof(reside)); + const MDBX_val name = {node_key(node), node_ks(node)}; + for (size_t dbi = CORE_DBS; dbi < env->me_numdbs; ++dbi) { + if (dbi >= txn->mt_numdbs || !(env->me_db_flags[dbi] & DB_VALID)) + continue; + if (env->me_dbxs[MAIN_DBI].md_cmp(&name, &env->me_dbxs[dbi].md_name)) + continue; + + done_bitmap[dbi / CHAR_BIT] |= 1 << dbi % CHAR_BIT; + db = audit_db_dig(txn, dbi, &reside); + break; } - rc = cursor_sibling(&cx.outer, SIBLING_RIGHT); + used += audit_db_used(db); } - tASSERT(txn, rc == MDBX_NOTFOUND); } + tASSERT(txn, rc == MDBX_NOTFOUND); - for (size_t i = FREE_DBI; i < txn->mt_numdbs; i++) { - if ((txn->mt_dbistate[i] & (DBI_VALID | DBI_AUDITED | DBI_STALE)) != - DBI_VALID) + for (size_t dbi = CORE_DBS; dbi < txn->mt_numdbs; ++dbi) { + if (done_bitmap[dbi / CHAR_BIT] & (1 << dbi % CHAR_BIT)) continue; - for (MDBX_txn *t = txn; t; t = t->mt_parent) - if (F_ISSET(t->mt_dbistate[i], DBI_DIRTY | DBI_CREAT)) { - used += (size_t)t->mt_dbs[i].md_branch_pages + - (size_t)t->mt_dbs[i].md_leaf_pages + - (size_t)t->mt_dbs[i].md_overflow_pages; - txn->mt_dbistate[i] |= DBI_AUDITED; - break; - } - MDBX_ANALYSIS_ASSUME(txn != nullptr); - if (!(txn->mt_dbistate[i] & DBI_AUDITED)) { + const MDBX_db *db = audit_db_dig(txn, dbi, nullptr); + if (db) + used += audit_db_used(db); + else if (dbi_state(txn, dbi)) WARNING("audit %s@%" PRIaTXN ": unable account dbi %zd / \"%*s\", state 0x%02x", - txn->mt_parent ? "nested-" : "", txn->mt_txnid, i, - (int)txn->mt_dbxs[i].md_name.iov_len, - (const char *)txn->mt_dbxs[i].md_name.iov_base, - txn->mt_dbistate[i]); - } + txn->mt_parent ? 
"nested-" : "", txn->mt_txnid, dbi, + (int)env->me_dbxs[dbi].md_name.iov_len, + (const char *)env->me_dbxs[dbi].md_name.iov_base, + dbi_state(txn, dbi)); } if (pending + gc + used == txn->mt_next_pgno) @@ -14185,9 +14698,22 @@ __cold static int audit_ex(MDBX_txn *txn, size_t retired_stored, return MDBX_PROBLEM; } +__cold static int audit_ex(MDBX_txn *txn, size_t retired_stored, + bool dont_filter_gc) { + MDBX_env *const env = txn->mt_env; + int rc = osal_fastmutex_acquire(&env->me_dbi_lock); + if (likely(rc == MDBX_SUCCESS)) { + rc = audit_ex_locked(txn, retired_stored, dont_filter_gc); + ENSURE(txn->mt_env, + osal_fastmutex_release(&env->me_dbi_lock) == MDBX_SUCCESS); + } + return rc; +} + typedef struct gc_update_context { - size_t retired_stored, loop; - size_t settled, cleaned_slot, reused_slot, filled_slot; + size_t loop, reserve_adj; + size_t retired_stored; + size_t reserved, cleaned_slot, reused_slot, fill_idx; txnid_t cleaned_id, rid; bool lifo, dense; #if MDBX_ENABLE_BIGFOOT @@ -14232,7 +14758,8 @@ static int gcu_clean_stored_retired(MDBX_txn *txn, gcu_context_t *ctx) { err = cursor_del(gc, 0); TRACE("== clear-4linear, backlog %zu, err %d", gcu_backlog_size(txn), err); - } + } else + err = (err == MDBX_NOTFOUND) ? MDBX_SUCCESS : err; } #if MDBX_ENABLE_BIGFOOT while (!err && --ctx->bigfoot >= txn->mt_txnid); @@ -14334,13 +14861,13 @@ static int gcu_prepare_backlog(MDBX_txn *txn, gcu_context_t *ctx) { } static __inline void gcu_clean_reserved(MDBX_env *env, MDBX_val pnl) { -#if MDBX_DEBUG && (defined(MDBX_USE_VALGRIND) || defined(__SANITIZE_ADDRESS__)) +#if MDBX_DEBUG && (defined(ENABLE_MEMCHECK) || defined(__SANITIZE_ADDRESS__)) /* Для предотвращения предупреждения Valgrind из mdbx_dump_val() * вызванное через макрос DVAL_DEBUG() на выходе * из cursor_set(MDBX_SET_KEY), которая вызывается ниже внутри update_gc() в * цикле очистки и цикле заполнения зарезервированных элементов. 
*/ memset(pnl.iov_base, 0xBB, pnl.iov_len); -#endif /* MDBX_DEBUG && (MDBX_USE_VALGRIND || __SANITIZE_ADDRESS__) */ +#endif /* MDBX_DEBUG && (ENABLE_MEMCHECK || __SANITIZE_ADDRESS__) */ /* PNL is initially empty, zero out at least the length */ memset(pnl.iov_base, 0, sizeof(pgno_t)); @@ -14371,7 +14898,8 @@ static int update_gc(MDBX_txn *txn, gcu_context_t *ctx) { /* txn->tw.relist[] can grow and shrink during this call. * txn->tw.last_reclaimed and txn->tw.retired_pages[] can only grow. * But page numbers cannot disappear from txn->tw.retired_pages[]. */ - +retry_clean_adj: + ctx->reserve_adj = 0; retry: if (ctx->loop++) TRACE("%s", " >> restart"); @@ -14391,10 +14919,10 @@ static int update_gc(MDBX_txn *txn, gcu_context_t *ctx) { goto bailout; } - ctx->settled = 0; + ctx->reserved = 0; ctx->cleaned_slot = 0; ctx->reused_slot = 0; - ctx->filled_slot = ~0u; + ctx->fill_idx = ~0u; ctx->cleaned_id = 0; ctx->rid = txn->tw.last_reclaimed; while (true) { @@ -14416,10 +14944,10 @@ static int update_gc(MDBX_txn *txn, gcu_context_t *ctx) { if (ctx->cleaned_slot < (txn->tw.lifo_reclaimed ? MDBX_PNL_GETSIZE(txn->tw.lifo_reclaimed) : 0)) { - ctx->settled = 0; + ctx->reserved = 0; ctx->cleaned_slot = 0; ctx->reused_slot = 0; - ctx->filled_slot = ~0u; + ctx->fill_idx = ~0u; /* LY: cleanup reclaimed records. 
*/ do { ctx->cleaned_id = txn->tw.lifo_reclaimed[++ctx->cleaned_slot]; @@ -14458,11 +14986,13 @@ static int update_gc(MDBX_txn *txn, gcu_context_t *ctx) { goto bailout; if (!MDBX_DISABLE_VALIDATION && unlikely(key.iov_len != sizeof(txnid_t))) { + ERROR("%s/%d: %s %u", "MDBX_CORRUPTED", MDBX_CORRUPTED, + "invalid GC-key size", (unsigned)key.iov_len); rc = MDBX_CORRUPTED; goto bailout; } ctx->rid = ctx->cleaned_id; - ctx->settled = 0; + ctx->reserved = 0; ctx->reused_slot = 0; ctx->cleaned_id = unaligned_peek_u64(4, key.iov_base); if (ctx->cleaned_id > txn->tw.last_reclaimed) @@ -14602,7 +15132,7 @@ static int update_gc(MDBX_txn *txn, gcu_context_t *ctx) { if (unlikely(!ctx->retired_stored)) { /* Make sure last page of GC is touched and on retired-list */ rc = cursor_last(&ctx->cursor, nullptr, nullptr); - if (likely(rc != MDBX_SUCCESS)) + if (likely(rc == MDBX_SUCCESS)) rc = gcu_touch(ctx); if (unlikely(rc != MDBX_SUCCESS) && rc != MDBX_NOTFOUND) goto bailout; @@ -14657,14 +15187,14 @@ static int update_gc(MDBX_txn *txn, gcu_context_t *ctx) { if (unlikely(rc != MDBX_SUCCESS)) goto bailout; -#if MDBX_DEBUG && (defined(MDBX_USE_VALGRIND) || defined(__SANITIZE_ADDRESS__)) +#if MDBX_DEBUG && (defined(ENABLE_MEMCHECK) || defined(__SANITIZE_ADDRESS__)) /* Для предотвращения предупреждения Valgrind из mdbx_dump_val() * вызванное через макрос DVAL_DEBUG() на выходе * из cursor_set(MDBX_SET_KEY), которая вызывается как выше в цикле * очистки, так и ниже в цикле заполнения зарезервированных элементов. 
*/ memset(data.iov_base, 0xBB, data.iov_len); -#endif /* MDBX_DEBUG && (MDBX_USE_VALGRIND || __SANITIZE_ADDRESS__) */ +#endif /* MDBX_DEBUG && (ENABLE_MEMCHECK || __SANITIZE_ADDRESS__) */ if (retired_pages_before == MDBX_PNL_GETSIZE(txn->tw.retired_pages)) { const size_t at = (ctx->lifo == MDBX_PNL_ASCENDING) @@ -14704,14 +15234,14 @@ static int update_gc(MDBX_txn *txn, gcu_context_t *ctx) { if (unlikely(rc != MDBX_SUCCESS)) goto bailout; -#if MDBX_DEBUG && (defined(MDBX_USE_VALGRIND) || defined(__SANITIZE_ADDRESS__)) +#if MDBX_DEBUG && (defined(ENABLE_MEMCHECK) || defined(__SANITIZE_ADDRESS__)) /* Для предотвращения предупреждения Valgrind из mdbx_dump_val() * вызванное через макрос DVAL_DEBUG() на выходе * из cursor_set(MDBX_SET_KEY), которая вызывается как выше в цикле * очистки, так и ниже в цикле заполнения зарезервированных элементов. */ memset(data.iov_base, 0xBB, data.iov_len); -#endif /* MDBX_DEBUG && (MDBX_USE_VALGRIND || __SANITIZE_ADDRESS__) */ +#endif /* MDBX_DEBUG && (ENABLE_MEMCHECK || __SANITIZE_ADDRESS__) */ /* Retry if tw.retired_pages[] grew during the Put() */ } while (data.iov_len < MDBX_PNL_SIZEOF(txn->tw.retired_pages)); @@ -14733,10 +15263,10 @@ static int update_gc(MDBX_txn *txn, gcu_context_t *ctx) { DEBUG_EXTRA_PRINT("%s\n", "."); } if (unlikely(amount != MDBX_PNL_GETSIZE(txn->tw.relist) && - ctx->settled)) { + ctx->reserved)) { TRACE("%s: reclaimed-list changed %zu -> %zu, retry", dbg_prefix_mode, amount, MDBX_PNL_GETSIZE(txn->tw.relist)); - goto retry /* rare case, but avoids GC fragmentation + goto retry_clean_adj /* rare case, but avoids GC fragmentation and one cycle. 
*/ ; } @@ -14754,10 +15284,11 @@ static int update_gc(MDBX_txn *txn, gcu_context_t *ctx) { if (unlikely(rc != MDBX_SUCCESS)) goto bailout; } - const size_t left = amount - ctx->settled; - TRACE("%s: amount %zu, settled %zd, left %zd, lifo-reclaimed-slots %zu, " + const size_t left = amount - ctx->reserved - ctx->reserve_adj; + TRACE("%s: amount %zu, settled %zd, reserve_adj %zu, left %zd, " + "lifo-reclaimed-slots %zu, " "reused-gc-slots %zu", - dbg_prefix_mode, amount, ctx->settled, left, + dbg_prefix_mode, amount, ctx->reserved, ctx->reserve_adj, left, txn->tw.lifo_reclaimed ? MDBX_PNL_GETSIZE(txn->tw.lifo_reclaimed) : 0, ctx->reused_slot); if (0 >= (intptr_t)left) @@ -14851,6 +15382,8 @@ static int update_gc(MDBX_txn *txn, gcu_context_t *ctx) { rc = cursor_first(&ctx->cursor, &key, nullptr); if (unlikely(rc != MDBX_SUCCESS || key.iov_len != sizeof(txnid_t))) { + ERROR("%s/%d: %s %u", "MDBX_CORRUPTED", MDBX_CORRUPTED, + "invalid GC-key size", (unsigned)key.iov_len); rc = MDBX_CORRUPTED; goto bailout; } @@ -14906,6 +15439,8 @@ static int update_gc(MDBX_txn *txn, gcu_context_t *ctx) { rc = cursor_first(&ctx->cursor, &key, nullptr); if (likely(rc == MDBX_SUCCESS)) { if (unlikely(key.iov_len != sizeof(txnid_t))) { + ERROR("%s/%d: %s %u", "MDBX_CORRUPTED", MDBX_CORRUPTED, + "invalid GC-key size", (unsigned)key.iov_len); rc = MDBX_CORRUPTED; goto bailout; } @@ -14915,7 +15450,7 @@ static int update_gc(MDBX_txn *txn, gcu_context_t *ctx) { if (unlikely(ctx->rid == 0)) { ERROR("%s", "** no GC tail-space to store (going dense-mode)"); ctx->dense = true; - goto retry; + goto retry_clean_adj; } } else if (rc != MDBX_NOTFOUND) goto bailout; @@ -15004,7 +15539,7 @@ static int update_gc(MDBX_txn *txn, gcu_context_t *ctx) { key.iov_base = &reservation_gc_id; data.iov_len = (chunk + 1) * sizeof(pgno_t); TRACE("%s: reserve %zu [%zu...%zu) @%" PRIaTXN, dbg_prefix_mode, chunk, - ctx->settled + 1, ctx->settled + chunk + 1, reservation_gc_id); + ctx->reserved + 1, ctx->reserved + 
chunk + 1, reservation_gc_id); gcu_prepare_backlog(txn, ctx); rc = cursor_put_nochecklen(&ctx->cursor, &key, &data, MDBX_RESERVE | MDBX_NOOVERWRITE); @@ -15014,17 +15549,17 @@ static int update_gc(MDBX_txn *txn, gcu_context_t *ctx) { goto bailout; gcu_clean_reserved(env, data); - ctx->settled += chunk; - TRACE("%s: settled %zu (+%zu), continue", dbg_prefix_mode, ctx->settled, + ctx->reserved += chunk; + TRACE("%s: settled %zu (+%zu), continue", dbg_prefix_mode, ctx->reserved, chunk); if (txn->tw.lifo_reclaimed && unlikely(amount < MDBX_PNL_GETSIZE(txn->tw.relist)) && - (ctx->loop < 5 || - MDBX_PNL_GETSIZE(txn->tw.relist) - amount > env->me_maxgc_ov1page)) { + (ctx->loop < 5 || MDBX_PNL_GETSIZE(txn->tw.relist) - amount > + env->me_maxgc_ov1page / 2)) { NOTICE("** restart: reclaimed-list growth %zu -> %zu", amount, MDBX_PNL_GETSIZE(txn->tw.relist)); - goto retry; + goto retry_clean_adj; } continue; @@ -15037,7 +15572,8 @@ static int update_gc(MDBX_txn *txn, gcu_context_t *ctx) { TRACE("%s", " >> filling"); /* Fill in the reserved records */ - ctx->filled_slot = + size_t excess_slots = 0; + ctx->fill_idx = txn->tw.lifo_reclaimed ? 
MDBX_PNL_GETSIZE(txn->tw.lifo_reclaimed) - ctx->reused_slot : ctx->reused_slot; @@ -15045,18 +15581,21 @@ static int update_gc(MDBX_txn *txn, gcu_context_t *ctx) { tASSERT(txn, pnl_check_allocated(txn->tw.relist, txn->mt_next_pgno - MDBX_ENABLE_REFUND)); tASSERT(txn, dirtylist_check(txn)); - if (MDBX_PNL_GETSIZE(txn->tw.relist)) { + if (ctx->reserved || MDBX_PNL_GETSIZE(txn->tw.relist)) { MDBX_val key, data; key.iov_len = data.iov_len = 0; /* avoid MSVC warning */ key.iov_base = data.iov_base = NULL; const size_t amount = MDBX_PNL_GETSIZE(txn->tw.relist); - size_t left = amount; + size_t left = amount, excess = 0; if (txn->tw.lifo_reclaimed == nullptr) { tASSERT(txn, ctx->lifo == 0); rc = cursor_first(&ctx->cursor, &key, &data); - if (unlikely(rc != MDBX_SUCCESS)) + if (unlikely(rc != MDBX_SUCCESS)) { + if (rc == MDBX_NOTFOUND && ctx->reserve_adj) + goto retry_clean_adj; goto bailout; + } } else { tASSERT(txn, ctx->lifo != 0); } @@ -15068,24 +15607,33 @@ static int update_gc(MDBX_txn *txn, gcu_context_t *ctx) { if (txn->tw.lifo_reclaimed == nullptr) { tASSERT(txn, ctx->lifo == 0); fill_gc_id = unaligned_peek_u64(4, key.iov_base); - if (ctx->filled_slot-- == 0 || fill_gc_id > txn->tw.last_reclaimed) { - NOTICE( - "** restart: reserve depleted (filled_slot %zu, fill_id %" PRIaTXN - " > last_reclaimed %" PRIaTXN, - ctx->filled_slot, fill_gc_id, txn->tw.last_reclaimed); + if (ctx->fill_idx == 0 || fill_gc_id > txn->tw.last_reclaimed) { + if (!left) + break; + NOTICE("** restart: reserve depleted (fill_idx %zu, fill_id %" PRIaTXN + " > last_reclaimed %" PRIaTXN ", left %zu", + ctx->fill_idx, fill_gc_id, txn->tw.last_reclaimed, left); + ctx->reserve_adj = + (ctx->reserve_adj > left) ? 
ctx->reserve_adj - left : 0; goto retry; } + ctx->fill_idx -= 1; } else { tASSERT(txn, ctx->lifo != 0); - if (++ctx->filled_slot > MDBX_PNL_GETSIZE(txn->tw.lifo_reclaimed)) { - NOTICE("** restart: reserve depleted (filled_gc_slot %zu > " - "lifo_reclaimed %zu" PRIaTXN, - ctx->filled_slot, MDBX_PNL_GETSIZE(txn->tw.lifo_reclaimed)); + if (ctx->fill_idx >= MDBX_PNL_GETSIZE(txn->tw.lifo_reclaimed)) { + if (!left) + break; + NOTICE("** restart: reserve depleted (fill_idx %zu >= " + "lifo_reclaimed %zu, left %zu", + ctx->fill_idx, MDBX_PNL_GETSIZE(txn->tw.lifo_reclaimed), left); + ctx->reserve_adj = + (ctx->reserve_adj > left) ? ctx->reserve_adj - left : 0; goto retry; } - fill_gc_id = txn->tw.lifo_reclaimed[ctx->filled_slot]; + ctx->fill_idx += 1; + fill_gc_id = txn->tw.lifo_reclaimed[ctx->fill_idx]; TRACE("%s: seek-reservation @%" PRIaTXN " at lifo_reclaimed[%zu]", - dbg_prefix_mode, fill_gc_id, ctx->filled_slot); + dbg_prefix_mode, fill_gc_id, ctx->fill_idx); key.iov_base = &fill_gc_id; key.iov_len = sizeof(fill_gc_id); rc = cursor_set(&ctx->cursor, &key, &data, MDBX_SET_KEY).err; @@ -15104,12 +15652,17 @@ static int update_gc(MDBX_txn *txn, gcu_context_t *ctx) { tASSERT(txn, data.iov_len >= sizeof(pgno_t) * 2); size_t chunk = data.iov_len / sizeof(pgno_t) - 1; if (unlikely(chunk > left)) { + const size_t delta = chunk - left; + excess += delta; + if (!left) { + excess_slots += 1; + goto next; + } TRACE("%s: chunk %zu > left %zu, @%" PRIaTXN, dbg_prefix_mode, chunk, left, fill_gc_id); - if ((ctx->loop < 5 && chunk - left > ctx->loop / 2) || - chunk - left > env->me_maxgc_ov1page) { + if ((ctx->loop < 5 && delta > (ctx->loop / 2)) || + delta > env->me_maxgc_ov1page) data.iov_len = (left + 1) * sizeof(pgno_t); - } chunk = left; } rc = cursor_put_nochecklen(&ctx->cursor, &key, &data, @@ -15122,14 +15675,14 @@ static int update_gc(MDBX_txn *txn, gcu_context_t *ctx) { amount != MDBX_PNL_GETSIZE(txn->tw.relist))) { NOTICE("** restart: reclaimed-list growth (%zu -> %zu, 
loose +%zu)", amount, MDBX_PNL_GETSIZE(txn->tw.relist), txn->tw.loose_count); - goto retry; + goto retry_clean_adj; } if (unlikely(txn->tw.lifo_reclaimed ? ctx->cleaned_slot < MDBX_PNL_GETSIZE(txn->tw.lifo_reclaimed) : ctx->cleaned_id < txn->tw.last_reclaimed)) { NOTICE("%s", "** restart: reclaimed-slots changed"); - goto retry; + goto retry_clean_adj; } if (unlikely(ctx->retired_stored != MDBX_PNL_GETSIZE(txn->tw.retired_pages))) { @@ -15137,7 +15690,7 @@ static int update_gc(MDBX_txn *txn, gcu_context_t *ctx) { ctx->retired_stored < MDBX_PNL_GETSIZE(txn->tw.retired_pages)); NOTICE("** restart: retired-list growth (%zu -> %zu)", ctx->retired_stored, MDBX_PNL_GETSIZE(txn->tw.retired_pages)); - goto retry; + goto retry_clean_adj; } pgno_t *dst = data.iov_base; @@ -15155,35 +15708,44 @@ static int update_gc(MDBX_txn *txn, gcu_context_t *ctx) { if (unlikely(rc != MDBX_SUCCESS)) goto bailout; } - if (left == 0) { - rc = MDBX_SUCCESS; - break; - } + next: if (txn->tw.lifo_reclaimed == nullptr) { tASSERT(txn, ctx->lifo == 0); rc = cursor_next(&ctx->cursor, &key, &data, MDBX_NEXT); - if (unlikely(rc != MDBX_SUCCESS)) - goto bailout; + if (unlikely(rc != MDBX_SUCCESS)) { + if (rc != MDBX_NOTFOUND) + goto bailout; + rc = MDBX_SUCCESS; + break; + } } else { tASSERT(txn, ctx->lifo != 0); } } + + if (excess) { + size_t n = excess, adj = excess; + while (n >= env->me_maxgc_ov1page) + adj -= n /= env->me_maxgc_ov1page; + ctx->reserve_adj += adj; + TRACE("%s: extra %zu reserved space, adj +%zu (%zu)", dbg_prefix_mode, + excess, adj, ctx->reserve_adj); + } } tASSERT(txn, rc == MDBX_SUCCESS); if (unlikely(txn->tw.loose_count != 0)) { NOTICE("** restart: got %zu loose pages", txn->tw.loose_count); - goto retry; + goto retry_clean_adj; } - if (unlikely(ctx->filled_slot != - (txn->tw.lifo_reclaimed - ? MDBX_PNL_GETSIZE(txn->tw.lifo_reclaimed) - : 0))) { - const bool will_retry = ctx->loop < 9; - NOTICE("** %s: reserve excess (filled-slot %zu, loop %zu)", - will_retry ? 
"restart" : "ignore", ctx->filled_slot, ctx->loop); + if (unlikely(excess_slots)) { + const bool will_retry = ctx->loop < 5 || excess_slots > 1; + NOTICE("** %s: reserve excess (excess-slots %zu, filled-slot %zu, adj %zu, " + "loop %zu)", + will_retry ? "restart" : "ignore", excess_slots, ctx->fill_idx, + ctx->reserve_adj, ctx->loop); if (will_retry) goto retry; } @@ -15246,21 +15808,6 @@ static int txn_write(MDBX_txn *txn, iov_ctx_t *ctx) { return rc; } -/* Check txn and dbi arguments to a function */ -static __always_inline bool check_dbi(const MDBX_txn *txn, MDBX_dbi dbi, - unsigned validity) { - if (likely(dbi < txn->mt_numdbs)) { - if (likely(!dbi_changed(txn, dbi))) { - if (likely(txn->mt_dbistate[dbi] & validity)) - return true; - if (likely(dbi < CORE_DBS || - (txn->mt_env->me_dbflags[dbi] & DB_VALID) == 0)) - return false; - } - } - return dbi_import((MDBX_txn *)txn, dbi); -} - /* Merge child txn into parent */ static __inline void txn_merge(MDBX_txn *const parent, MDBX_txn *const txn, const size_t parent_retired_len) { @@ -15677,10 +16224,16 @@ int mdbx_txn_commit_ex(MDBX_txn *txn, MDBX_commit_latency *latency) { /* txn_end() mode for a commit which writes nothing */ unsigned end_mode = - MDBX_END_PURE_COMMIT | MDBX_END_UPDATE | MDBX_END_SLOT | MDBX_END_FREE; + TXN_END_PURE_COMMIT | TXN_END_UPDATE | TXN_END_SLOT | TXN_END_FREE; if (unlikely(txn->mt_flags & MDBX_TXN_RDONLY)) goto done; + if ((txn->mt_flags & MDBX_NOSTICKYTHREADS) && + unlikely(txn->mt_owner != osal_thread_self())) { + rc = MDBX_THREAD_MISMATCH; + goto fail; + } + if (txn->mt_child) { rc = mdbx_txn_commit_ex(txn->mt_child, NULL); tASSERT(txn, txn->mt_child == NULL); @@ -15705,10 +16258,10 @@ int mdbx_txn_commit_ex(MDBX_txn *txn, MDBX_commit_latency *latency) { if (txn->tw.dirtylist->length == 0 && !(txn->mt_flags & MDBX_TXN_DIRTY) && parent->mt_numdbs == txn->mt_numdbs) { - for (int i = txn->mt_numdbs; --i >= 0;) { - tASSERT(txn, (txn->mt_dbistate[i] & DBI_DIRTY) == 0); - if 
((txn->mt_dbistate[i] & DBI_STALE) && - !(parent->mt_dbistate[i] & DBI_STALE)) + TXN_FOREACH_DBI_ALL(txn, i) { + tASSERT(txn, (txn->mt_dbi_state[i] & DBI_DIRTY) == 0); + if ((txn->mt_dbi_state[i] & DBI_STALE) && + !(parent->mt_dbi_state[i] & DBI_STALE)) tASSERT(txn, memcmp(&parent->mt_dbs[i], &txn->mt_dbs[i], sizeof(MDBX_db)) == 0); } @@ -15722,7 +16275,8 @@ int mdbx_txn_commit_ex(MDBX_txn *txn, MDBX_commit_latency *latency) { tASSERT(txn, txn->tw.loose_count == 0); /* fast completion of pure nested transaction */ - end_mode = MDBX_END_PURE_COMMIT | MDBX_END_SLOT | MDBX_END_FREE; + VERBOSE("fast-complete pure nested txn %" PRIaTXN, txn->mt_txnid); + end_mode = TXN_END_PURE_COMMIT | TXN_END_SLOT | TXN_END_FREE; goto done; } @@ -15767,6 +16321,7 @@ int mdbx_txn_commit_ex(MDBX_txn *txn, MDBX_commit_latency *latency) { pnl_free(parent->tw.relist); parent->tw.relist = txn->tw.relist; txn->tw.relist = NULL; + parent->tw.gc_time_acc = txn->tw.gc_time_acc; parent->tw.last_reclaimed = txn->tw.last_reclaimed; parent->mt_geo = txn->mt_geo; @@ -15782,20 +16337,26 @@ int mdbx_txn_commit_ex(MDBX_txn *txn, MDBX_commit_latency *latency) { /* Merge our cursors into parent's and close them */ cursors_eot(txn, true); - end_mode |= MDBX_END_EOTDONE; + end_mode |= TXN_END_EOTDONE; /* Update parent's DBs array */ - memcpy(parent->mt_dbs, txn->mt_dbs, txn->mt_numdbs * sizeof(MDBX_db)); - parent->mt_numdbs = txn->mt_numdbs; - for (size_t i = 0; i < txn->mt_numdbs; i++) { - /* preserve parent's status */ - const uint8_t state = - txn->mt_dbistate[i] | - (parent->mt_dbistate[i] & (DBI_CREAT | DBI_FRESH | DBI_DIRTY)); - DEBUG("dbi %zu dbi-state %s 0x%02x -> 0x%02x", i, - (parent->mt_dbistate[i] != state) ? 
"update" : "still", - parent->mt_dbistate[i], state); - parent->mt_dbistate[i] = state; + eASSERT(env, parent->mt_numdbs == txn->mt_numdbs); + TXN_FOREACH_DBI_ALL(txn, dbi) { + if (txn->mt_dbi_state[dbi] & (DBI_CREAT | DBI_FRESH | DBI_DIRTY)) { + parent->mt_dbs[dbi] = txn->mt_dbs[dbi]; + /* preserve parent's status */ + const uint8_t state = + txn->mt_dbi_state[dbi] | + (parent->mt_dbi_state[dbi] & (DBI_CREAT | DBI_FRESH | DBI_DIRTY)); + DEBUG("dbi %zu dbi-state %s 0x%02x -> 0x%02x", dbi, + (parent->mt_dbi_state[dbi] != state) ? "update" : "still", + parent->mt_dbi_state[dbi], state); + parent->mt_dbi_state[dbi] = state; + } else { + eASSERT(env, txn->mt_dbi_state[dbi] == + (parent->mt_dbi_state[dbi] & + ~(DBI_FRESH | DBI_CREAT | DBI_DIRTY))); + } } if (latency) { @@ -15840,15 +16401,16 @@ int mdbx_txn_commit_ex(MDBX_txn *txn, MDBX_commit_latency *latency) { tASSERT(txn, (txn->mt_flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC); tASSERT(txn, txn->tw.dirtyroom + txn->tw.dirtylist->length == (txn->mt_parent ? 
txn->mt_parent->tw.dirtyroom - : txn->mt_env->me_options.dp_limit)); + : env->me_options.dp_limit)); } cursors_eot(txn, false); - end_mode |= MDBX_END_EOTDONE; + end_mode |= TXN_END_EOTDONE; if ((!txn->tw.dirtylist || txn->tw.dirtylist->length == 0) && (txn->mt_flags & (MDBX_TXN_DIRTY | MDBX_TXN_SPILLS)) == 0) { - for (intptr_t i = txn->mt_numdbs; --i >= 0;) - tASSERT(txn, (txn->mt_dbistate[i] & DBI_DIRTY) == 0); + TXN_FOREACH_DBI_ALL(txn, i) { + tASSERT(txn, !(txn->mt_dbi_state[i] & DBI_DIRTY)); + } #if defined(MDBX_NOSUCCESS_EMPTY_COMMIT) && MDBX_NOSUCCESS_EMPTY_COMMIT rc = txn_end(txn, end_mode); if (unlikely(rc != MDBX_SUCCESS)) @@ -15860,37 +16422,37 @@ int mdbx_txn_commit_ex(MDBX_txn *txn, MDBX_commit_latency *latency) { #endif /* MDBX_NOSUCCESS_EMPTY_COMMIT */ } - DEBUG("committing txn %" PRIaTXN " %p on mdbenv %p, root page %" PRIaPGNO + DEBUG("committing txn %" PRIaTXN " %p on env %p, root page %" PRIaPGNO "/%" PRIaPGNO, txn->mt_txnid, (void *)txn, (void *)env, txn->mt_dbs[MAIN_DBI].md_root, txn->mt_dbs[FREE_DBI].md_root); - /* Update DB root pointers */ if (txn->mt_numdbs > CORE_DBS) { - MDBX_cursor_couple couple; - MDBX_val data; - data.iov_len = sizeof(MDBX_db); - - rc = cursor_init(&couple.outer, txn, MAIN_DBI); + /* Update subDB root pointers */ + MDBX_cursor_couple cx; + rc = cursor_init(&cx.outer, txn, MAIN_DBI); if (unlikely(rc != MDBX_SUCCESS)) goto fail; - for (MDBX_dbi i = CORE_DBS; i < txn->mt_numdbs; i++) { - if (txn->mt_dbistate[i] & DBI_DIRTY) { - MDBX_db *db = &txn->mt_dbs[i]; - DEBUG("update main's entry for sub-db %u, mod_txnid %" PRIaTXN - " -> %" PRIaTXN, - i, db->md_mod_txnid, txn->mt_txnid); - /* Может быть mod_txnid > front после коммита вложенных тразакций */ - db->md_mod_txnid = txn->mt_txnid; - data.iov_base = db; - WITH_CURSOR_TRACKING( - couple.outer, - rc = cursor_put_nochecklen(&couple.outer, &txn->mt_dbxs[i].md_name, - &data, F_SUBDATA)); - if (unlikely(rc != MDBX_SUCCESS)) - goto fail; + cx.outer.mc_next = 
txn->mt_cursors[MAIN_DBI]; + txn->mt_cursors[MAIN_DBI] = &cx.outer; + TXN_FOREACH_DBI_USER(txn, i) { + if ((txn->mt_dbi_state[i] & DBI_DIRTY) == 0) + continue; + MDBX_db *const db = &txn->mt_dbs[i]; + DEBUG("update main's entry for sub-db %zu, mod_txnid %" PRIaTXN + " -> %" PRIaTXN, + i, db->md_mod_txnid, txn->mt_txnid); + /* Может быть mod_txnid > front после коммита вложенных тразакций */ + db->md_mod_txnid = txn->mt_txnid; + MDBX_val data = {db, sizeof(MDBX_db)}; + rc = cursor_put_nochecklen(&cx.outer, &env->me_dbxs[i].md_name, &data, + F_SUBDATA); + if (unlikely(rc != MDBX_SUCCESS)) { + txn->mt_cursors[MAIN_DBI] = cx.outer.mc_next; + goto fail; } } + txn->mt_cursors[MAIN_DBI] = cx.outer.mc_next; } ts_1 = latency ? osal_monotime() : 0; @@ -15906,11 +16468,11 @@ int mdbx_txn_commit_ex(MDBX_txn *txn, MDBX_commit_latency *latency) { goto fail; tASSERT(txn, txn->tw.loose_count == 0); - txn->mt_dbs[FREE_DBI].md_mod_txnid = (txn->mt_dbistate[FREE_DBI] & DBI_DIRTY) + txn->mt_dbs[FREE_DBI].md_mod_txnid = (txn->mt_dbi_state[FREE_DBI] & DBI_DIRTY) ? txn->mt_txnid : txn->mt_dbs[FREE_DBI].md_mod_txnid; - txn->mt_dbs[MAIN_DBI].md_mod_txnid = (txn->mt_dbistate[MAIN_DBI] & DBI_DIRTY) + txn->mt_dbs[MAIN_DBI].md_mod_txnid = (txn->mt_dbi_state[MAIN_DBI] & DBI_DIRTY) ? txn->mt_txnid : txn->mt_dbs[MAIN_DBI].md_mod_txnid; @@ -16045,7 +16607,7 @@ int mdbx_txn_commit_ex(MDBX_txn *txn, MDBX_commit_latency *latency) { goto fail; } - end_mode = MDBX_END_COMMITTED | MDBX_END_UPDATE | MDBX_END_EOTDONE; + end_mode = TXN_END_COMMITTED | TXN_END_UPDATE | TXN_END_EOTDONE; done: if (latency) @@ -16223,6 +16785,10 @@ static __always_inline bool eq_fast(const MDBX_val *a, const MDBX_val *b) { eq_fast_slowpath(a->iov_base, b->iov_base, a->iov_len); } +static int cmp_equal_or_greater(const MDBX_val *a, const MDBX_val *b) { + return eq_fast(a, b) ? 
0 : 1; +} + static int validate_meta(MDBX_env *env, MDBX_meta *const meta, const MDBX_page *const page, const unsigned meta_number, unsigned *guess_pagesize) { @@ -16276,6 +16842,17 @@ static int validate_meta(MDBX_env *env, MDBX_meta *const meta, return MDBX_RESULT_TRUE; } + if (unlikely(meta->mm_dbs[FREE_DBI].md_flags != MDBX_INTEGERKEY)) { + WARNING("meta[%u] has invalid %s flags 0x%u, skip it", meta_number, + "GC/FreeDB", meta->mm_dbs[FREE_DBI].md_flags); + return MDBX_INCOMPATIBLE; + } + if (unlikely(!db_check_flags(meta->mm_dbs[MAIN_DBI].md_flags))) { + WARNING("meta[%u] has invalid %s flags 0x%u, skip it", meta_number, + "MainDB", meta->mm_dbs[MAIN_DBI].md_flags); + return MDBX_INCOMPATIBLE; + } + DEBUG("checking meta%" PRIaPGNO " = root %" PRIaPGNO "/%" PRIaPGNO ", geo %" PRIaPGNO "/%" PRIaPGNO "-%" PRIaPGNO "/%" PRIaPGNO " +%u -%u, txn_id %" PRIaTXN ", %s", @@ -16660,6 +17237,8 @@ __cold static MDBX_meta *init_metas(const MDBX_env *env, void *buffer) { static int sync_locked(MDBX_env *env, unsigned flags, MDBX_meta *const pending, meta_troika_t *const troika) { eASSERT(env, ((env->me_flags ^ flags) & MDBX_WRITEMAP) == 0); + eASSERT(env, pending->mm_dbs[FREE_DBI].md_flags == MDBX_INTEGERKEY); + eASSERT(env, db_check_flags(pending->mm_dbs[MAIN_DBI].md_flags)); const MDBX_meta *const meta0 = METAPAGE(env, 0); const MDBX_meta *const meta1 = METAPAGE(env, 1); const MDBX_meta *const meta2 = METAPAGE(env, 2); @@ -16703,7 +17282,7 @@ static int sync_locked(MDBX_env *env, unsigned flags, MDBX_meta *const pending, : pending->mm_geo.next); eASSERT(env, largest_pgno >= NUM_METAS); -#if defined(MDBX_USE_VALGRIND) || defined(__SANITIZE_ADDRESS__) +#if defined(ENABLE_MEMCHECK) || defined(__SANITIZE_ADDRESS__) const pgno_t edge = env->me_poison_edge; if (edge > largest_pgno) { env->me_poison_edge = largest_pgno; @@ -16714,7 +17293,7 @@ static int sync_locked(MDBX_env *env, unsigned flags, MDBX_meta *const pending, ptr_disp(env->me_map, pgno2bytes(env, largest_pgno)), 
pgno2bytes(env, edge - largest_pgno)); } -#endif /* MDBX_USE_VALGRIND || __SANITIZE_ADDRESS__ */ +#endif /* ENABLE_MEMCHECK || __SANITIZE_ADDRESS__ */ #if MDBX_ENABLE_MADVISE && \ (defined(MADV_DONTNEED) || defined(POSIX_MADV_DONTNEED)) @@ -16805,9 +17384,7 @@ static int sync_locked(MDBX_env *env, unsigned flags, MDBX_meta *const pending, const txnid_t txnid = safe64_txnid_next(pending->unsafe_txnid); NOTICE("force-forward pending-txn %" PRIaTXN " -> %" PRIaTXN, pending->unsafe_txnid, txnid); - ENSURE(env, !env->me_txn0 || - (env->me_txn0->mt_owner != osal_thread_self() && - !env->me_txn)); + ENSURE(env, !env->me_txn0 || !env->me_txn); if (unlikely(txnid > MAX_TXNID)) { rc = MDBX_TXN_FULL; ERROR("txnid overflow, raise %d", rc); @@ -16958,6 +17535,8 @@ static int sync_locked(MDBX_env *env, unsigned flags, MDBX_meta *const pending, target->mm_geo = pending->mm_geo; target->mm_dbs[FREE_DBI] = pending->mm_dbs[FREE_DBI]; target->mm_dbs[MAIN_DBI] = pending->mm_dbs[MAIN_DBI]; + eASSERT(env, target->mm_dbs[FREE_DBI].md_flags == MDBX_INTEGERKEY); + eASSERT(env, db_check_flags(target->mm_dbs[MAIN_DBI].md_flags)); target->mm_canary = pending->mm_canary; memcpy(target->mm_pages_retired, pending->mm_pages_retired, 8); jitter4testing(true); @@ -17012,6 +17591,8 @@ static int sync_locked(MDBX_env *env, unsigned flags, MDBX_meta *const pending, env->me_lck->mti_pgop_stat.wops.weak += 1; #endif /* MDBX_ENABLE_PGOP_STAT */ const MDBX_meta undo_meta = *target; + eASSERT(env, pending->mm_dbs[FREE_DBI].md_flags == MDBX_INTEGERKEY); + eASSERT(env, db_check_flags(pending->mm_dbs[MAIN_DBI].md_flags)); rc = osal_pwrite(env->me_fd4meta, pending, sizeof(MDBX_meta), ptr_dist(target, env->me_map)); if (unlikely(rc != MDBX_SUCCESS)) { @@ -17177,14 +17758,6 @@ __cold static void setup_pagesize(MDBX_env *env, const size_t pagesize) { env->me_options.dp_initial = env->me_options.dp_limit; } -static __inline MDBX_CONST_FUNCTION MDBX_lockinfo * -lckless_stub(const MDBX_env *env) { - uintptr_t stub = 
(uintptr_t)&env->x_lckless_stub; - /* align to avoid false-positive alarm from UndefinedBehaviorSanitizer */ - stub = (stub + MDBX_CACHELINE_SIZE - 1) & ~(MDBX_CACHELINE_SIZE - 1); - return (MDBX_lockinfo *)stub; -} - __cold int mdbx_env_create(MDBX_env **penv) { if (unlikely(!penv)) return MDBX_EINVAL; @@ -17228,7 +17801,6 @@ __cold int mdbx_env_create(MDBX_env **penv) { env->me_maxdbs = env->me_numdbs = CORE_DBS; env->me_lazy_fd = env->me_dsync_fd = env->me_fd4meta = env->me_lfd = INVALID_HANDLE_VALUE; - env->me_pid = osal_getpid(); env->me_stuck_meta = -1; env->me_options.rp_augment_limit = MDBX_PNL_INITIAL; @@ -17268,7 +17840,7 @@ __cold int mdbx_env_create(MDBX_env **penv) { #if MDBX_LOCKING > MDBX_LOCKING_SYSV MDBX_lockinfo *const stub = lckless_stub(env); - rc = osal_ipclock_stub(&stub->mti_wlock); + rc = osal_ipclock_stubinit(&stub->mti_wlock); #endif /* MDBX_LOCKING */ if (unlikely(rc != MDBX_SUCCESS)) { osal_fastmutex_destroy(&env->me_remap_guard); @@ -17326,8 +17898,9 @@ __cold int mdbx_env_set_geometry(MDBX_env *env, intptr_t size_lower, if (unlikely(rc != MDBX_SUCCESS)) return rc; - const bool inside_txn = - (env->me_txn0 && env->me_txn0->mt_owner == osal_thread_self()); + const bool txn0_owned = env->me_txn0 && env_txn0_owned(env); + const bool inside_txn = txn0_owned && env->me_txn; + bool should_unlock = false; #if MDBX_DEBUG if (growth_step < 0) { @@ -17338,17 +17911,16 @@ __cold int mdbx_env_set_geometry(MDBX_env *env, intptr_t size_lower, #endif /* MDBX_DEBUG */ intptr_t reasonable_maxsize = 0; - bool need_unlock = false; if (env->me_map) { /* env already mapped */ if (unlikely(env->me_flags & MDBX_RDONLY)) return MDBX_EACCESS; - if (!inside_txn) { - int err = mdbx_txn_lock(env, false); + if (!txn0_owned) { + int err = osal_txn_lock(env, false); if (unlikely(err != MDBX_SUCCESS)) return err; - need_unlock = true; + should_unlock = true; env->me_txn0->tw.troika = meta_tap(env); eASSERT(env, !env->me_txn && !env->me_txn0->mt_child); 
env->me_txn0->mt_txnid = @@ -17574,7 +18146,7 @@ __cold int mdbx_env_set_geometry(MDBX_env *env, intptr_t size_lower, MDBX_meta meta; memset(&meta, 0, sizeof(meta)); if (!inside_txn) { - eASSERT(env, need_unlock); + eASSERT(env, should_unlock); const meta_ptr_t head = meta_recent(env, &env->me_txn0->tw.troika); uint64_t timestamp = 0; @@ -17662,7 +18234,7 @@ __cold int mdbx_env_set_geometry(MDBX_env *env, intptr_t size_lower, if (unlikely(rc != MDBX_SUCCESS)) goto bailout; } -#endif +#endif /* Windows */ if (new_geo.now != current_geo->now || new_geo.upper != current_geo->upper) { @@ -17695,8 +18267,8 @@ __cold int mdbx_env_set_geometry(MDBX_env *env, intptr_t size_lower, } bailout: - if (need_unlock) - mdbx_txn_unlock(env); + if (should_unlock) + osal_txn_unlock(env); return rc; } @@ -17711,6 +18283,7 @@ __cold static int alloc_page_buf(MDBX_env *env) { __cold static int setup_dxb(MDBX_env *env, const int lck_rc, const mdbx_mode_t mode_bits) { MDBX_meta header; + eASSERT(env, !(env->me_flags & MDBX_ENV_ACTIVE)); int rc = MDBX_RESULT_FALSE; int err = read_header(env, &header, lck_rc, mode_bits); if (unlikely(err != MDBX_SUCCESS)) { @@ -17761,6 +18334,19 @@ __cold static int setup_dxb(MDBX_env *env, const int lck_rc, pv2pages(header.mm_geo.shrink_pv), unaligned_peek_u64(4, header.mm_txnid_a), durable_caption(&header)); + if (unlikely(header.mm_dbs[FREE_DBI].md_flags != MDBX_INTEGERKEY)) { + ERROR("unexpected/invalid db-flags 0x%u for GC/FreeDB", + header.mm_dbs[FREE_DBI].md_flags); + return MDBX_INCOMPATIBLE; + } + env->me_db_flags[FREE_DBI] = DB_VALID | MDBX_INTEGERKEY; + env->me_dbxs[FREE_DBI].md_cmp = cmp_int_align4; /* aligned MDBX_INTEGERKEY */ + env->me_dbxs[FREE_DBI].md_dcmp = cmp_lenfast; + env->me_dbxs[FREE_DBI].md_klen_max = env->me_dbxs[FREE_DBI].md_klen_min = 8; + env->me_dbxs[FREE_DBI].md_vlen_min = 4; + env->me_dbxs[FREE_DBI].md_vlen_max = + mdbx_env_get_maxvalsize_ex(env, MDBX_INTEGERKEY); + if (env->me_psize != header.mm_psize) setup_pagesize(env, 
header.mm_psize); const size_t used_bytes = pgno2bytes(env, header.mm_geo.next); @@ -17906,7 +18492,7 @@ __cold static int setup_dxb(MDBX_env *env, const int lck_rc, return err; #endif /* MADV_DONTDUMP */ #if defined(MADV_DODUMP) - if (runtime_flags & MDBX_DBG_DUMP) { + if (mdbx_static.flags & MDBX_DBG_DUMP) { const size_t meta_length_aligned2os = pgno_align2os_bytes(env, NUM_METAS); err = madvise(env->me_map, meta_length_aligned2os, MADV_DODUMP) ? ignore_enosys(errno) @@ -17917,14 +18503,14 @@ __cold static int setup_dxb(MDBX_env *env, const int lck_rc, #endif /* MADV_DODUMP */ #endif /* MDBX_ENABLE_MADVISE */ -#ifdef MDBX_USE_VALGRIND +#ifdef ENABLE_MEMCHECK env->me_valgrind_handle = VALGRIND_CREATE_BLOCK(env->me_map, env->me_dxb_mmap.limit, "mdbx"); -#endif /* MDBX_USE_VALGRIND */ +#endif /* ENABLE_MEMCHECK */ eASSERT(env, used_bytes >= pgno2bytes(env, NUM_METAS) && used_bytes <= env->me_dxb_mmap.limit); -#if defined(MDBX_USE_VALGRIND) || defined(__SANITIZE_ADDRESS__) +#if defined(ENABLE_MEMCHECK) || defined(__SANITIZE_ADDRESS__) if (env->me_dxb_mmap.filesize > used_bytes && env->me_dxb_mmap.filesize < env->me_dxb_mmap.limit) { VALGRIND_MAKE_MEM_NOACCESS(ptr_disp(env->me_map, used_bytes), @@ -17936,13 +18522,12 @@ __cold static int setup_dxb(MDBX_env *env, const int lck_rc, bytes2pgno(env, (env->me_dxb_mmap.filesize < env->me_dxb_mmap.limit) ? 
env->me_dxb_mmap.filesize : env->me_dxb_mmap.limit); -#endif /* MDBX_USE_VALGRIND || __SANITIZE_ADDRESS__ */ +#endif /* ENABLE_MEMCHECK || __SANITIZE_ADDRESS__ */ meta_troika_t troika = meta_tap(env); #if MDBX_DEBUG meta_troika_dump(env, &troika); #endif - eASSERT(env, !env->me_txn && !env->me_txn0); //-------------------------------- validate/rollback head & steady meta-pages if (unlikely(env->me_stuck_meta >= 0)) { /* recovery mode */ @@ -18157,7 +18742,7 @@ __cold static int setup_dxb(MDBX_env *env, const int lck_rc, bytes2pgno(env, used_aligned2os_bytes), mo_Relaxed); if ((env->me_flags & MDBX_RDONLY) == 0 && env->me_stuck_meta < 0 && - (runtime_flags & MDBX_DBG_DONT_UPGRADE) == 0) { + (mdbx_static.flags & MDBX_DBG_DONT_UPGRADE) == 0) { for (int n = 0; n < NUM_METAS; ++n) { MDBX_meta *const meta = METAPAGE(env, n); if (unlikely(unaligned_peek_u64(4, &meta->mm_magic_and_version) != @@ -18238,101 +18823,48 @@ __cold static int setup_dxb(MDBX_env *env, const int lck_rc, /******************************************************************************/ -/* Open and/or initialize the lock region for the environment. 
*/ -__cold static int setup_lck(MDBX_env *env, pathchar_t *lck_pathname, - mdbx_mode_t mode) { - eASSERT(env, env->me_lazy_fd != INVALID_HANDLE_VALUE); - eASSERT(env, env->me_lfd == INVALID_HANDLE_VALUE); - - int err = osal_openfile(MDBX_OPEN_LCK, env, lck_pathname, &env->me_lfd, mode); - if (err != MDBX_SUCCESS) { - switch (err) { - default: - return err; - case MDBX_ENOFILE: - case MDBX_EACCESS: - case MDBX_EPERM: - if (!F_ISSET(env->me_flags, MDBX_RDONLY | MDBX_EXCLUSIVE)) - return err; - break; - case MDBX_EROFS: - if ((env->me_flags & MDBX_RDONLY) == 0) - return err; - break; - } - - if (err != MDBX_ENOFILE) { - /* ENSURE the file system is read-only */ - err = osal_check_fs_rdonly(env->me_lazy_fd, lck_pathname, err); - if (err != MDBX_SUCCESS && - /* ignore ERROR_NOT_SUPPORTED for exclusive mode */ - !(err == MDBX_ENOSYS && (env->me_flags & MDBX_EXCLUSIVE))) - return err; - } +__cold static int setup_lck_locked(MDBX_env *env) { + int err = rthc_register(env); + if (unlikely(err != MDBX_SUCCESS)) + return err; - /* LY: without-lck mode (e.g. exclusive or on read-only filesystem) */ - /* beginning of a locked section ---------------------------------------- */ - lcklist_lock(); - eASSERT(env, env->me_lcklist_next == nullptr); - env->me_lfd = INVALID_HANDLE_VALUE; - const int rc = osal_lck_seize(env); - if (MDBX_IS_ERROR(rc)) { - /* Calling lcklist_detach_locked() is required to restore POSIX-filelock - * and this job will be done by env_close(). 
*/ - lcklist_unlock(); - return rc; - } - /* insert into inprocess lck-list */ - env->me_lcklist_next = inprocess_lcklist_head; - inprocess_lcklist_head = env; - lcklist_unlock(); - /* end of a locked section ---------------------------------------------- */ + int lck_seize_rc = osal_lck_seize(env); + if (unlikely(MDBX_IS_ERROR(lck_seize_rc))) + return lck_seize_rc; + if (env->me_lfd == INVALID_HANDLE_VALUE) { env->me_lck = lckless_stub(env); env->me_maxreaders = UINT_MAX; DEBUG("lck-setup:%s%s%s", " lck-less", (env->me_flags & MDBX_RDONLY) ? " readonly" : "", - (rc == MDBX_RESULT_TRUE) ? " exclusive" : " cooperative"); - return rc; + (lck_seize_rc == MDBX_RESULT_TRUE) ? " exclusive" : " cooperative"); + return lck_seize_rc; } - /* beginning of a locked section ------------------------------------------ */ - lcklist_lock(); - eASSERT(env, env->me_lcklist_next == nullptr); - - /* Try to get exclusive lock. If we succeed, then - * nobody is using the lock region and we should initialize it. */ - err = osal_lck_seize(env); - if (MDBX_IS_ERROR(err)) { - bailout: - /* Calling lcklist_detach_locked() is required to restore POSIX-filelock - * and this job will be done by env_close(). */ - lcklist_unlock(); - return err; - } + DEBUG("lck-setup:%s%s%s", " with-lck", + (env->me_flags & MDBX_RDONLY) ? " readonly" : "", + (lck_seize_rc == MDBX_RESULT_TRUE) ? 
" exclusive" : " cooperative"); MDBX_env *inprocess_neighbor = nullptr; - if (err == MDBX_RESULT_TRUE) { - err = uniq_check(&env->me_lck_mmap, &inprocess_neighbor); - if (MDBX_IS_ERROR(err)) - goto bailout; - if (inprocess_neighbor && - ((runtime_flags & MDBX_DBG_LEGACY_MULTIOPEN) == 0 || - (inprocess_neighbor->me_flags & MDBX_EXCLUSIVE) != 0)) { - err = MDBX_BUSY; - goto bailout; + err = rthc_uniq_check(&env->me_lck_mmap, &inprocess_neighbor); + if (unlikely(MDBX_IS_ERROR(err))) + return err; + if (inprocess_neighbor) { + if ((mdbx_static.flags & MDBX_DBG_LEGACY_MULTIOPEN) == 0 || + (inprocess_neighbor->me_flags & MDBX_EXCLUSIVE) != 0) + return MDBX_BUSY; + if (lck_seize_rc == MDBX_RESULT_TRUE) { + err = osal_lck_downgrade(env); + if (unlikely(err != MDBX_SUCCESS)) + return err; + lck_seize_rc = MDBX_RESULT_FALSE; } } - const int lck_seize_rc = err; - - DEBUG("lck-setup:%s%s%s", " with-lck", - (env->me_flags & MDBX_RDONLY) ? " readonly" : "", - (lck_seize_rc == MDBX_RESULT_TRUE) ? " exclusive" : " cooperative"); uint64_t size = 0; err = osal_filesize(env->me_lfd, &size); if (unlikely(err != MDBX_SUCCESS)) - goto bailout; + return err; if (lck_seize_rc == MDBX_RESULT_TRUE) { size = ceil_powerof2(env->me_maxreaders * sizeof(MDBX_reader) + @@ -18340,15 +18872,12 @@ __cold static int setup_lck(MDBX_env *env, pathchar_t *lck_pathname, env->me_os_psize); jitter4testing(false); } else { - if (env->me_flags & MDBX_EXCLUSIVE) { - err = MDBX_BUSY; - goto bailout; - } + if (env->me_flags & MDBX_EXCLUSIVE) + return MDBX_BUSY; if (size > INT_MAX || (size & (env->me_os_psize - 1)) != 0 || size < env->me_os_psize) { ERROR("lck-file has invalid size %" PRIu64 " bytes", size); - err = MDBX_PROBLEM; - goto bailout; + return MDBX_PROBLEM; } } @@ -18356,8 +18885,7 @@ __cold static int setup_lck(MDBX_env *env, pathchar_t *lck_pathname, ((size_t)size - sizeof(MDBX_lockinfo)) / sizeof(MDBX_reader); if (maxreaders < 4) { ERROR("lck-size too small (up to %" PRIuPTR " readers)", 
maxreaders); - err = MDBX_PROBLEM; - goto bailout; + return MDBX_PROBLEM; } env->me_maxreaders = (maxreaders <= MDBX_READERS_LIMIT) ? (unsigned)maxreaders @@ -18368,14 +18896,14 @@ __cold static int setup_lck(MDBX_env *env, pathchar_t *lck_pathname, lck_seize_rc ? MMAP_OPTION_TRUNCATE | MMAP_OPTION_SEMAPHORE : MMAP_OPTION_SEMAPHORE); if (unlikely(err != MDBX_SUCCESS)) - goto bailout; + return err; #if MDBX_ENABLE_MADVISE #ifdef MADV_DODUMP err = madvise(env->me_lck_mmap.lck, size, MADV_DODUMP) ? ignore_enosys(errno) : MDBX_SUCCESS; if (unlikely(MDBX_IS_ERROR(err))) - goto bailout; + return err; #endif /* MADV_DODUMP */ #ifdef MADV_WILLNEED @@ -18383,18 +18911,19 @@ __cold static int setup_lck(MDBX_env *env, pathchar_t *lck_pathname, ? ignore_enosys(errno) : MDBX_SUCCESS; if (unlikely(MDBX_IS_ERROR(err))) - goto bailout; + return err; #elif defined(POSIX_MADV_WILLNEED) err = ignore_enosys( posix_madvise(env->me_lck_mmap.lck, size, POSIX_MADV_WILLNEED)); if (unlikely(MDBX_IS_ERROR(err))) - goto bailout; + return err; #endif /* MADV_WILLNEED */ #endif /* MDBX_ENABLE_MADVISE */ - struct MDBX_lockinfo *const lck = env->me_lck_mmap.lck; + struct MDBX_lockinfo *lck = env->me_lck_mmap.lck; if (lck_seize_rc == MDBX_RESULT_TRUE) { - /* LY: exclusive mode, check and reset lck content */ + /* If we succeed got exclusive lock, then nobody is using the lock region + * and we should initialize it. */ memset(lck, 0, (size_t)size); jitter4testing(false); lck->mti_magic_and_version = MDBX_LOCK_MAGIC; @@ -18406,7 +18935,8 @@ __cold static int setup_lck(MDBX_env *env, pathchar_t *lck_pathname, MDBX_SYNC_DATA | MDBX_SYNC_SIZE); if (unlikely(err != MDBX_SUCCESS)) { ERROR("initial-%s for lck-file failed, err %d", "msync/fsync", err); - goto bailout; + eASSERT(env, MDBX_IS_ERROR(err)); + return err; } } else { if (lck->mti_magic_and_version != MDBX_LOCK_MAGIC) { @@ -18416,34 +18946,69 @@ __cold static int setup_lck(MDBX_env *env, pathchar_t *lck_pathname, ? 
"invalid magic" : "incompatible version (only applications with nearly or the " "same versions of libmdbx can share the same database)"); - err = invalid ? MDBX_INVALID : MDBX_VERSION_MISMATCH; - goto bailout; + return invalid ? MDBX_INVALID : MDBX_VERSION_MISMATCH; } if (lck->mti_os_and_format != MDBX_LOCK_FORMAT) { ERROR("lock region has os/format signature 0x%" PRIx32 ", expected 0x%" PRIx32, lck->mti_os_and_format, MDBX_LOCK_FORMAT); - err = MDBX_VERSION_MISMATCH; - goto bailout; + return MDBX_VERSION_MISMATCH; } } err = osal_lck_init(env, inprocess_neighbor, lck_seize_rc); - if (MDBX_IS_ERROR(err)) - goto bailout; - - ENSURE(env, env->me_lcklist_next == nullptr); - /* insert into inprocess lck-list */ - env->me_lcklist_next = inprocess_lcklist_head; - inprocess_lcklist_head = env; - lcklist_unlock(); - /* end of a locked section ------------------------------------------------ */ + if (unlikely(err != MDBX_SUCCESS)) { + eASSERT(env, MDBX_IS_ERROR(err)); + return err; + } - eASSERT(env, !MDBX_IS_ERROR(lck_seize_rc)); env->me_lck = lck; + eASSERT(env, !MDBX_IS_ERROR(lck_seize_rc)); return lck_seize_rc; } +/* Open and/or initialize the lock region for the environment. 
*/ +__cold static int setup_lck(MDBX_env *env, mdbx_mode_t mode) { + eASSERT(env, env->me_lazy_fd != INVALID_HANDLE_VALUE); + eASSERT(env, env->me_lfd == INVALID_HANDLE_VALUE); + + int err = osal_openfile(MDBX_OPEN_LCK, env, env->me_pathname.lck, + &env->me_lfd, mode); + if (err != MDBX_SUCCESS) { + switch (err) { + default: + return err; + case MDBX_ENOFILE: + case MDBX_EACCESS: + case MDBX_EPERM: + if (!F_ISSET(env->me_flags, MDBX_RDONLY | MDBX_EXCLUSIVE)) + return err; + break; + case MDBX_EROFS: + if ((env->me_flags & MDBX_RDONLY) == 0) + return err; + break; + } + + if (err != MDBX_ENOFILE) { + /* ENSURE the file system is read-only */ + err = osal_check_fs_rdonly(env->me_lazy_fd, env->me_pathname.lck, err); + if (err != MDBX_SUCCESS && + /* ignore ERROR_NOT_SUPPORTED for exclusive mode */ + !(err == MDBX_ENOSYS && (env->me_flags & MDBX_EXCLUSIVE))) + return err; + } + + /* LY: without-lck mode (e.g. exclusive or on read-only filesystem) */ + env->me_lfd = INVALID_HANDLE_VALUE; + } + + rthc_lock(); + err = setup_lck_locked(env); + rthc_unlock(); + return err; +} + __cold int mdbx_is_readahead_reasonable(size_t volume, intptr_t redundancy) { if (volume <= 1024 * 1024 * 4ul) return MDBX_RESULT_TRUE; @@ -18513,11 +19078,11 @@ __cold static int __must_check_result override_meta(MDBX_env *env, if (shape) { if (txnid && unlikely(!check_meta_coherency(env, shape, false))) { ERROR("bailout overriding meta-%zu since model failed " - "freedb/maindb %s-check for txnid #%" PRIaTXN, + "FreeDB/MainDB %s-check for txnid #%" PRIaTXN, target, "pre", constmeta_txnid(shape)); return MDBX_PROBLEM; } - if (runtime_flags & MDBX_DBG_DONT_UPGRADE) + if (mdbx_static.flags & MDBX_DBG_DONT_UPGRADE) memcpy(&model->mm_magic_and_version, &shape->mm_magic_and_version, sizeof(model->mm_magic_and_version)); model->mm_extra_flags = shape->mm_extra_flags; @@ -18537,7 +19102,7 @@ __cold static int __must_check_result override_meta(MDBX_env *env, sizeof(model->mm_magic_and_version)); if 
(unlikely(!check_meta_coherency(env, model, false))) { ERROR("bailout overriding meta-%zu since model failed " - "freedb/maindb %s-check for txnid #%" PRIaTXN, + "FreeDB/MainDB %s-check for txnid #%" PRIaTXN, target, "post", txnid); return MDBX_PROBLEM; } @@ -18669,12 +19234,6 @@ __cold int mdbx_env_open_for_recoveryW(MDBX_env *env, const wchar_t *pathname, 0); } -typedef struct { - void *buffer_for_free; - pathchar_t *lck, *dxb; - size_t ent_len; -} MDBX_handle_env_pathname; - __cold static int check_alternative_lck_absent(const pathchar_t *lck_pathname) { int err = osal_fileexists(lck_pathname); if (unlikely(err != MDBX_RESULT_FALSE)) { @@ -18686,11 +19245,9 @@ __cold static int check_alternative_lck_absent(const pathchar_t *lck_pathname) { return err; } -__cold static int handle_env_pathname(MDBX_handle_env_pathname *ctx, - const pathchar_t *pathname, - MDBX_env_flags_t *flags, +__cold static int env_handle_pathname(MDBX_env *env, const pathchar_t *pathname, const mdbx_mode_t mode) { - memset(ctx, 0, sizeof(*ctx)); + memset(&env->me_pathname, 0, sizeof(env->me_pathname)); if (unlikely(!pathname || !*pathname)) return MDBX_EINVAL; @@ -18701,21 +19258,22 @@ __cold static int handle_env_pathname(MDBX_handle_env_pathname *ctx, rc = GetLastError(); if (rc != MDBX_ENOFILE) return rc; - if (mode == 0 || (*flags & MDBX_RDONLY) != 0) + if (mode == 0 || (env->me_flags & MDBX_RDONLY) != 0) /* can't open existing */ return rc; /* auto-create directory if requested */ - if ((*flags & MDBX_NOSUBDIR) == 0 && !CreateDirectoryW(pathname, nullptr)) { + if ((env->me_flags & MDBX_NOSUBDIR) == 0 && + !CreateDirectoryW(pathname, nullptr)) { rc = GetLastError(); if (rc != ERROR_ALREADY_EXISTS) return rc; } } else { /* ignore passed MDBX_NOSUBDIR flag and set it automatically */ - *flags |= MDBX_NOSUBDIR; + env->me_flags |= MDBX_NOSUBDIR; if (dwAttrib & FILE_ATTRIBUTE_DIRECTORY) - *flags -= MDBX_NOSUBDIR; + env->me_flags -= MDBX_NOSUBDIR; } #else struct stat st; @@ -18723,7 +19281,7 @@ 
__cold static int handle_env_pathname(MDBX_handle_env_pathname *ctx, rc = errno; if (rc != MDBX_ENOFILE) return rc; - if (mode == 0 || (*flags & MDBX_RDONLY) != 0) + if (mode == 0 || (env->me_flags & MDBX_RDONLY) != 0) /* can't open non-existing */ return rc /* MDBX_ENOFILE */; @@ -18734,16 +19292,16 @@ __cold static int handle_env_pathname(MDBX_handle_env_pathname *ctx, /* always add read/write/search for owner */ S_IRWXU | ((mode & S_IRGRP) ? /* +search if readable by group */ S_IXGRP : 0) | ((mode & S_IROTH) ? /* +search if readable by others */ S_IXOTH : 0); - if ((*flags & MDBX_NOSUBDIR) == 0 && mkdir(pathname, dir_mode)) { + if ((env->me_flags & MDBX_NOSUBDIR) == 0 && mkdir(pathname, dir_mode)) { rc = errno; if (rc != EEXIST) return rc; } } else { /* ignore passed MDBX_NOSUBDIR flag and set it automatically */ - *flags |= MDBX_NOSUBDIR; + env->me_flags |= MDBX_NOSUBDIR; if (S_ISDIR(st.st_mode)) - *flags -= MDBX_NOSUBDIR; + env->me_flags -= MDBX_NOSUBDIR; } #endif @@ -18759,41 +19317,42 @@ __cold static int handle_env_pathname(MDBX_handle_env_pathname *ctx, const size_t pathname_len = strlen(pathname); #endif assert(!osal_isdirsep(lock_suffix[0])); - ctx->ent_len = pathname_len; + size_t base_len = pathname_len; static const size_t dxb_name_len = ARRAY_LENGTH(dxb_name) - 1; - if (*flags & MDBX_NOSUBDIR) { - if (ctx->ent_len > dxb_name_len && - osal_pathequal(pathname + ctx->ent_len - dxb_name_len, dxb_name, + if (env->me_flags & MDBX_NOSUBDIR) { + if (base_len > dxb_name_len && + osal_pathequal(pathname + base_len - dxb_name_len, dxb_name, dxb_name_len)) { - *flags -= MDBX_NOSUBDIR; - ctx->ent_len -= dxb_name_len; - } else if (ctx->ent_len == dxb_name_len - 1 && osal_isdirsep(dxb_name[0]) && + env->me_flags -= MDBX_NOSUBDIR; + base_len -= dxb_name_len; + } else if (base_len == dxb_name_len - 1 && osal_isdirsep(dxb_name[0]) && osal_isdirsep(lck_name[0]) && - osal_pathequal(pathname + ctx->ent_len - dxb_name_len + 1, + osal_pathequal(pathname + base_len - 
dxb_name_len + 1, dxb_name + 1, dxb_name_len - 1)) { - *flags -= MDBX_NOSUBDIR; - ctx->ent_len -= dxb_name_len - 1; + env->me_flags -= MDBX_NOSUBDIR; + base_len -= dxb_name_len - 1; } } const size_t suflen_with_NOSUBDIR = sizeof(lock_suffix) + sizeof(pathchar_t); const size_t suflen_without_NOSUBDIR = sizeof(lck_name) + sizeof(dxb_name); - const size_t enogh4any = (suflen_with_NOSUBDIR > suflen_without_NOSUBDIR) - ? suflen_with_NOSUBDIR - : suflen_without_NOSUBDIR; - const size_t bytes_needed = sizeof(pathchar_t) * ctx->ent_len * 2 + enogh4any; - ctx->buffer_for_free = osal_malloc(bytes_needed); - if (!ctx->buffer_for_free) + const size_t enough4any = (suflen_with_NOSUBDIR > suflen_without_NOSUBDIR) + ? suflen_with_NOSUBDIR + : suflen_without_NOSUBDIR; + const size_t bytes_needed = + sizeof(pathchar_t) * (base_len * 2 + pathname_len + 1) + enough4any; + env->me_pathname.buffer = osal_malloc(bytes_needed); + if (!env->me_pathname.buffer) return MDBX_ENOMEM; - ctx->dxb = ctx->buffer_for_free; - ctx->lck = ctx->dxb + ctx->ent_len + dxb_name_len + 1; - pathchar_t *const buf = ctx->buffer_for_free; + env->me_pathname.specified = env->me_pathname.buffer; + env->me_pathname.dxb = env->me_pathname.specified + pathname_len + 1; + env->me_pathname.lck = env->me_pathname.dxb + base_len + dxb_name_len + 1; rc = MDBX_SUCCESS; - if (ctx->ent_len) { - memcpy(buf + /* shutting up goofy MSVC static analyzer */ 0, pathname, - sizeof(pathchar_t) * pathname_len); - if (*flags & MDBX_NOSUBDIR) { + pathchar_t *const buf = env->me_pathname.buffer; + if (base_len) { + memcpy(buf, pathname, sizeof(pathchar_t) * pathname_len); + if (env->me_flags & MDBX_NOSUBDIR) { const pathchar_t *const lck_ext = osal_fileext(lck_name, ARRAY_LENGTH(lck_name)); if (lck_ext) { @@ -18803,32 +19362,33 @@ __cold static int handle_env_pathname(MDBX_handle_env_pathname *ctx, rc = check_alternative_lck_absent(buf); } } else { - memcpy(buf + ctx->ent_len, dxb_name, sizeof(dxb_name)); - memcpy(buf + ctx->ent_len + 
dxb_name_len, lock_suffix, - sizeof(lock_suffix)); + memcpy(buf + base_len, dxb_name, sizeof(dxb_name)); + memcpy(buf + base_len + dxb_name_len, lock_suffix, sizeof(lock_suffix)); rc = check_alternative_lck_absent(buf); } - memcpy(ctx->dxb + /* shutting up goofy MSVC static analyzer */ 0, pathname, - sizeof(pathchar_t) * (ctx->ent_len + 1)); - memcpy(ctx->lck, pathname, sizeof(pathchar_t) * ctx->ent_len); - if (*flags & MDBX_NOSUBDIR) { - memcpy(ctx->lck + ctx->ent_len, lock_suffix, sizeof(lock_suffix)); + memcpy(env->me_pathname.dxb, pathname, sizeof(pathchar_t) * (base_len + 1)); + memcpy(env->me_pathname.lck, pathname, sizeof(pathchar_t) * base_len); + if (env->me_flags & MDBX_NOSUBDIR) { + memcpy(env->me_pathname.lck + base_len, lock_suffix, sizeof(lock_suffix)); } else { - memcpy(ctx->dxb + ctx->ent_len, dxb_name, sizeof(dxb_name)); - memcpy(ctx->lck + ctx->ent_len, lck_name, sizeof(lck_name)); + memcpy(env->me_pathname.dxb + base_len, dxb_name, sizeof(dxb_name)); + memcpy(env->me_pathname.lck + base_len, lck_name, sizeof(lck_name)); } } else { - assert(!(*flags & MDBX_NOSUBDIR)); - memcpy(buf + /* shutting up goofy MSVC static analyzer */ 0, dxb_name + 1, - sizeof(dxb_name) - sizeof(pathchar_t)); + assert(!(env->me_flags & MDBX_NOSUBDIR)); + memcpy(buf, dxb_name + 1, sizeof(dxb_name) - sizeof(pathchar_t)); memcpy(buf + dxb_name_len - 1, lock_suffix, sizeof(lock_suffix)); rc = check_alternative_lck_absent(buf); - memcpy(ctx->dxb + /* shutting up goofy MSVC static analyzer */ 0, - dxb_name + 1, sizeof(dxb_name) - sizeof(pathchar_t)); - memcpy(ctx->lck, lck_name + 1, sizeof(lck_name) - sizeof(pathchar_t)); + memcpy(env->me_pathname.dxb, dxb_name + 1, + sizeof(dxb_name) - sizeof(pathchar_t)); + memcpy(env->me_pathname.lck, lck_name + 1, + sizeof(lck_name) - sizeof(pathchar_t)); } + + memcpy(env->me_pathname.specified, pathname, + sizeof(pathchar_t) * (pathname_len + 1)); return rc; } @@ -18866,23 +19426,19 @@ __cold int mdbx_env_deleteW(const wchar_t *pathname, 
(mode == MDBX_ENV_ENSURE_UNUSED) ? MDBX_EXCLUSIVE : MDBX_ENV_DEFAULTS; dummy_env->me_os_psize = (unsigned)osal_syspagesize(); dummy_env->me_psize = (unsigned)mdbx_default_pagesize(); - dummy_env->me_pathname = (pathchar_t *)pathname; - MDBX_handle_env_pathname env_pathname; STATIC_ASSERT(sizeof(dummy_env->me_flags) == sizeof(MDBX_env_flags_t)); - int rc = MDBX_RESULT_TRUE, - err = handle_env_pathname(&env_pathname, pathname, - (MDBX_env_flags_t *)&dummy_env->me_flags, 0); + int rc = MDBX_RESULT_TRUE, err = env_handle_pathname(dummy_env, pathname, 0); if (likely(err == MDBX_SUCCESS)) { mdbx_filehandle_t clk_handle = INVALID_HANDLE_VALUE, dxb_handle = INVALID_HANDLE_VALUE; if (mode > MDBX_ENV_JUST_DELETE) { - err = osal_openfile(MDBX_OPEN_DELETE, dummy_env, env_pathname.dxb, - &dxb_handle, 0); + err = osal_openfile(MDBX_OPEN_DELETE, dummy_env, + dummy_env->me_pathname.dxb, &dxb_handle, 0); err = (err == MDBX_ENOFILE) ? MDBX_SUCCESS : err; if (err == MDBX_SUCCESS) { - err = osal_openfile(MDBX_OPEN_DELETE, dummy_env, env_pathname.lck, - &clk_handle, 0); + err = osal_openfile(MDBX_OPEN_DELETE, dummy_env, + dummy_env->me_pathname.lck, &clk_handle, 0); err = (err == MDBX_ENOFILE) ? 
MDBX_SUCCESS : err; } if (err == MDBX_SUCCESS && clk_handle != INVALID_HANDLE_VALUE) @@ -18892,7 +19448,7 @@ __cold int mdbx_env_deleteW(const wchar_t *pathname, } if (err == MDBX_SUCCESS) { - err = osal_removefile(env_pathname.dxb); + err = osal_removefile(dummy_env->me_pathname.dxb); if (err == MDBX_SUCCESS) rc = MDBX_SUCCESS; else if (err == MDBX_ENOFILE) @@ -18900,14 +19456,17 @@ __cold int mdbx_env_deleteW(const wchar_t *pathname, } if (err == MDBX_SUCCESS) { - err = osal_removefile(env_pathname.lck); + err = osal_removefile(dummy_env->me_pathname.lck); if (err == MDBX_SUCCESS) rc = MDBX_SUCCESS; else if (err == MDBX_ENOFILE) err = MDBX_SUCCESS; } - if (err == MDBX_SUCCESS && !(dummy_env->me_flags & MDBX_NOSUBDIR)) { + if (err == MDBX_SUCCESS && !(dummy_env->me_flags & MDBX_NOSUBDIR) && + (/* pathname != "." */ pathname[0] != '.' || pathname[1] != 0) && + (/* pathname != ".." */ pathname[0] != '.' || pathname[1] != '.' || + pathname[2] != 0)) { err = osal_removedirectory(pathname); if (err == MDBX_SUCCESS) rc = MDBX_SUCCESS; @@ -18922,92 +19481,11 @@ __cold int mdbx_env_deleteW(const wchar_t *pathname, } else if (err == MDBX_ENOFILE) err = MDBX_SUCCESS; - osal_free(env_pathname.buffer_for_free); + osal_free(dummy_env->me_pathname.buffer); return (err == MDBX_SUCCESS) ? 
rc : err; } -__cold int mdbx_env_open(MDBX_env *env, const char *pathname, - MDBX_env_flags_t flags, mdbx_mode_t mode) { -#if defined(_WIN32) || defined(_WIN64) - wchar_t *pathnameW = nullptr; - int rc = osal_mb2w(pathname, &pathnameW); - if (likely(rc == MDBX_SUCCESS)) { - rc = mdbx_env_openW(env, pathnameW, flags, mode); - osal_free(pathnameW); - if (rc == MDBX_SUCCESS) - /* force to make cache of the multi-byte pathname representation */ - mdbx_env_get_path(env, &pathname); - } - return rc; -} - -__cold int mdbx_env_openW(MDBX_env *env, const wchar_t *pathname, - MDBX_env_flags_t flags, mdbx_mode_t mode) { -#endif /* Windows */ - - int rc = check_env(env, false); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; - - if (unlikely(flags & ~ENV_USABLE_FLAGS)) - return MDBX_EINVAL; - - if (unlikely(env->me_lazy_fd != INVALID_HANDLE_VALUE || - (env->me_flags & MDBX_ENV_ACTIVE) != 0 || env->me_map)) - return MDBX_EPERM; - - /* Pickup previously mdbx_env_set_flags(), - * but avoid MDBX_UTTERLY_NOSYNC by disjunction */ - const uint32_t saved_me_flags = env->me_flags; - flags = merge_sync_flags(flags | MDBX_DEPRECATED_COALESCE, env->me_flags); - - if (flags & MDBX_RDONLY) { - /* Silently ignore irrelevant flags when we're only getting read access */ - flags &= ~(MDBX_WRITEMAP | MDBX_DEPRECATED_MAPASYNC | MDBX_SAFE_NOSYNC | - MDBX_NOMETASYNC | MDBX_DEPRECATED_COALESCE | MDBX_LIFORECLAIM | - MDBX_NOMEMINIT | MDBX_ACCEDE); - mode = 0; - } else { -#if MDBX_MMAP_INCOHERENT_FILE_WRITE - /* Temporary `workaround` for OpenBSD kernel's flaw. - * See https://libmdbx.dqdkfa.ru/dead-github/issues/67 */ - if ((flags & MDBX_WRITEMAP) == 0) { - if (flags & MDBX_ACCEDE) - flags |= MDBX_WRITEMAP; - else { - debug_log(MDBX_LOG_ERROR, __func__, __LINE__, - "System (i.e. 
OpenBSD) requires MDBX_WRITEMAP because " - "of an internal flaw(s) in a file/buffer/page cache.\n"); - return 42 /* ENOPROTOOPT */; - } - } -#endif /* MDBX_MMAP_INCOHERENT_FILE_WRITE */ - } - - MDBX_handle_env_pathname env_pathname; - rc = handle_env_pathname(&env_pathname, pathname, &flags, mode); - if (unlikely(rc != MDBX_SUCCESS)) - goto bailout; - - env->me_flags = (flags & ~MDBX_FATAL_ERROR) | MDBX_ENV_ACTIVE; - env->me_pathname = osal_calloc(env_pathname.ent_len + 1, sizeof(pathchar_t)); - env->me_dbxs = osal_calloc(env->me_maxdbs, sizeof(MDBX_dbx)); - env->me_dbflags = osal_calloc(env->me_maxdbs, sizeof(env->me_dbflags[0])); - env->me_dbiseqs = osal_calloc(env->me_maxdbs, sizeof(env->me_dbiseqs[0])); - if (!(env->me_dbxs && env->me_pathname && env->me_dbflags && - env->me_dbiseqs)) { - rc = MDBX_ENOMEM; - goto bailout; - } - memcpy(env->me_pathname, env_pathname.dxb, - env_pathname.ent_len * sizeof(pathchar_t)); - env->me_dbxs[FREE_DBI].md_cmp = cmp_int_align4; /* aligned MDBX_INTEGERKEY */ - env->me_dbxs[FREE_DBI].md_dcmp = cmp_lenfast; - env->me_dbxs[FREE_DBI].md_klen_max = env->me_dbxs[FREE_DBI].md_klen_min = 8; - env->me_dbxs[FREE_DBI].md_vlen_min = 4; - env->me_dbxs[FREE_DBI].md_vlen_max = - mdbx_env_get_maxvalsize_ex(env, MDBX_INTEGERKEY); - +__cold static int env_open(MDBX_env *env, mdbx_mode_t mode) { /* Использование O_DSYNC или FILE_FLAG_WRITE_THROUGH: * * 0) Если размер страниц БД меньше системной страницы ОЗУ, то ядру ОС @@ -19095,18 +19573,17 @@ __cold int mdbx_env_openW(MDBX_env *env, const wchar_t *pathname, * при этом для записи мета требуется отдельный не-overlapped дескриптор. */ - rc = osal_openfile((flags & MDBX_RDONLY) ? MDBX_OPEN_DXB_READ - : MDBX_OPEN_DXB_LAZY, - env, env_pathname.dxb, &env->me_lazy_fd, mode); - if (rc != MDBX_SUCCESS) - goto bailout; + env->me_pid = osal_getpid(); + int rc = osal_openfile((env->me_flags & MDBX_RDONLY) ? 
MDBX_OPEN_DXB_READ + : MDBX_OPEN_DXB_LAZY, + env, env->me_pathname.dxb, &env->me_lazy_fd, mode); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; #if MDBX_LOCKING == MDBX_LOCKING_SYSV - env->me_sysv_ipc.key = ftok(env_pathname.dxb, 42); - if (env->me_sysv_ipc.key == -1) { - rc = errno; - goto bailout; - } + env->me_sysv_ipc.key = ftok(env->me_pathname.dxb, 42); + if (unlikely(env->me_sysv_ipc.key == -1)) + return errno; #endif /* MDBX_LOCKING */ /* Set the position in files outside of the data to avoid corruption @@ -19118,9 +19595,9 @@ __cold int mdbx_env_openW(MDBX_env *env, const wchar_t *pathname, #if defined(_WIN32) || defined(_WIN64) eASSERT(env, env->me_overlapped_fd == 0); bool ior_direct = false; - if (!(flags & + if (!(env->me_flags & (MDBX_RDONLY | MDBX_SAFE_NOSYNC | MDBX_NOMETASYNC | MDBX_EXCLUSIVE))) { - if (MDBX_AVOID_MSYNC && (flags & MDBX_WRITEMAP)) { + if (MDBX_AVOID_MSYNC && (env->me_flags & MDBX_WRITEMAP)) { /* Запрошен режим MDBX_SYNC_DURABLE | MDBX_WRITEMAP при активной опции * MDBX_AVOID_MSYNC. * @@ -19157,24 +19634,20 @@ __cold int mdbx_env_openW(MDBX_env *env, const wchar_t *pathname, rc = osal_openfile(ior_direct ? 
MDBX_OPEN_DXB_OVERLAPPED_DIRECT : MDBX_OPEN_DXB_OVERLAPPED, - env, env_pathname.dxb, &env->me_overlapped_fd, 0); - if (rc != MDBX_SUCCESS) - goto bailout; + env, env->me_pathname.dxb, &env->me_overlapped_fd, 0); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; env->me_data_lock_event = CreateEventW(nullptr, true, false, nullptr); - if (!env->me_data_lock_event) { - rc = (int)GetLastError(); - goto bailout; - } + if (unlikely(!env->me_data_lock_event)) + return (int)GetLastError(); osal_fseek(env->me_overlapped_fd, safe_parking_lot_offset); } #else if (mode == 0) { /* pickup mode for lck-file */ struct stat st; - if (fstat(env->me_lazy_fd, &st)) { - rc = errno; - goto bailout; - } + if (unlikely(fstat(env->me_lazy_fd, &st))) + return errno; mode = st.st_mode; } mode = (/* inherit read permissions for group and others */ mode & @@ -19183,25 +19656,25 @@ __cold int mdbx_env_openW(MDBX_env *env, const wchar_t *pathname, ((mode & S_IRGRP) ? /* +write if readable by group */ S_IWGRP : 0) | ((mode & S_IROTH) ? 
/* +write if readable by others */ S_IWOTH : 0); #endif /* !Windows */ - const int lck_rc = setup_lck(env, env_pathname.lck, mode); - if (MDBX_IS_ERROR(lck_rc)) { - rc = lck_rc; - goto bailout; - } - osal_fseek(env->me_lfd, safe_parking_lot_offset); + const int lck_rc = setup_lck(env, mode); + if (unlikely(MDBX_IS_ERROR(lck_rc))) + return lck_rc; + if (env->me_lfd != INVALID_HANDLE_VALUE) + osal_fseek(env->me_lfd, safe_parking_lot_offset); eASSERT(env, env->me_dsync_fd == INVALID_HANDLE_VALUE); - if (!(flags & (MDBX_RDONLY | MDBX_SAFE_NOSYNC | MDBX_DEPRECATED_MAPASYNC + if (!(env->me_flags & + (MDBX_RDONLY | MDBX_SAFE_NOSYNC | MDBX_DEPRECATED_MAPASYNC #if defined(_WIN32) || defined(_WIN64) - | MDBX_EXCLUSIVE + | MDBX_EXCLUSIVE #endif /* !Windows */ - ))) { - rc = osal_openfile(MDBX_OPEN_DXB_DSYNC, env, env_pathname.dxb, + ))) { + rc = osal_openfile(MDBX_OPEN_DXB_DSYNC, env, env->me_pathname.dxb, &env->me_dsync_fd, 0); - if (MDBX_IS_ERROR(rc)) - goto bailout; + if (unlikely(MDBX_IS_ERROR(rc))) + return rc; if (env->me_dsync_fd != INVALID_HANDLE_VALUE) { - if ((flags & MDBX_NOMETASYNC) == 0) + if ((env->me_flags & MDBX_NOMETASYNC) == 0) env->me_fd4meta = env->me_dsync_fd; osal_fseek(env->me_dsync_fd, safe_parking_lot_offset); } @@ -19276,37 +19749,35 @@ __cold int mdbx_env_openW(MDBX_env *env, const wchar_t *pathname, ERROR("current mode/flags 0x%X incompatible with requested 0x%X, " "rigorous diff 0x%X", env->me_flags, snap_flags, rigorous_diff); - rc = MDBX_INCOMPATIBLE; - goto bailout; + return MDBX_INCOMPATIBLE; } } mincore_clean_cache(env); const int dxb_rc = setup_dxb(env, lck_rc, mode); - if (MDBX_IS_ERROR(dxb_rc)) { - rc = dxb_rc; - goto bailout; - } + if (MDBX_IS_ERROR(dxb_rc)) + return dxb_rc; rc = osal_check_fs_incore(env->me_lazy_fd); env->me_incore = false; if (rc == MDBX_RESULT_TRUE) { env->me_incore = true; NOTICE("%s", "in-core database"); + rc = MDBX_SUCCESS; } else if (unlikely(rc != MDBX_SUCCESS)) { ERROR("check_fs_incore(), err %d", rc); - goto 
bailout; + return rc; } if (unlikely(/* recovery mode */ env->me_stuck_meta >= 0) && (lck_rc != /* exclusive */ MDBX_RESULT_TRUE || - (flags & MDBX_EXCLUSIVE) == 0)) { + (env->me_flags & MDBX_EXCLUSIVE) == 0)) { ERROR("%s", "recovery requires exclusive mode"); - rc = MDBX_BUSY; - goto bailout; + return MDBX_BUSY; } DEBUG("opened dbenv %p", (void *)env); + env->me_flags |= MDBX_ENV_ACTIVE; if (!lck || lck_rc == MDBX_RESULT_TRUE) { env->me_lck->mti_envmode.weak = env->me_flags & mode_flags; env->me_lck->mti_meta_sync_txnid.weak = @@ -19319,130 +19790,222 @@ __cold int mdbx_env_openW(MDBX_env *env, const wchar_t *pathname, DEBUG("lck-downgrade-%s: rc %i", (env->me_flags & MDBX_EXCLUSIVE) ? "partial" : "full", rc); if (rc != MDBX_SUCCESS) - goto bailout; + return rc; } else { rc = cleanup_dead_readers(env, false, NULL); if (MDBX_IS_ERROR(rc)) - goto bailout; + return rc; } + } - if ((env->me_flags & MDBX_NOTLS) == 0) { - rc = rthc_alloc(&env->me_txkey, &lck->mti_readers[0], - &lck->mti_readers[env->me_maxreaders]); - if (unlikely(rc != MDBX_SUCCESS)) - goto bailout; - env->me_flags |= MDBX_ENV_TXKEY; + rc = (env->me_flags & MDBX_RDONLY) + ? 
MDBX_SUCCESS + : osal_ioring_create(&env->me_ioring +#if defined(_WIN32) || defined(_WIN64) + , + ior_direct, env->me_overlapped_fd +#endif /* Windows */ + ); + return rc; +} + +__cold int mdbx_env_open(MDBX_env *env, const char *pathname, + MDBX_env_flags_t flags, mdbx_mode_t mode) { +#if defined(_WIN32) || defined(_WIN64) + wchar_t *pathnameW = nullptr; + int rc = osal_mb2w(pathname, &pathnameW); + if (likely(rc == MDBX_SUCCESS)) { + rc = mdbx_env_openW(env, pathnameW, flags, mode); + osal_free(pathnameW); + if (rc == MDBX_SUCCESS) + /* force to make cache of the multi-byte pathname representation */ + mdbx_env_get_path(env, &pathname); + } + return rc; +} + +__cold int mdbx_env_openW(MDBX_env *env, const wchar_t *pathname, + MDBX_env_flags_t flags, mdbx_mode_t mode) { +#endif /* Windows */ + + int rc = check_env(env, false); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; + + if (unlikely(flags & ~ENV_USABLE_FLAGS)) + return MDBX_EINVAL; + + if (unlikely(env->me_lazy_fd != INVALID_HANDLE_VALUE || + (env->me_flags & MDBX_ENV_ACTIVE) != 0 || env->me_map)) + return MDBX_EPERM; + + /* Pickup previously mdbx_env_set_flags(), + * but avoid MDBX_UTTERLY_NOSYNC by disjunction */ + const uint32_t saved_me_flags = env->me_flags; + flags = merge_sync_flags(flags | MDBX_DEPRECATED_COALESCE, env->me_flags); + + if (flags & MDBX_RDONLY) { + /* Silently ignore irrelevant flags when we're only getting read access */ + flags &= ~(MDBX_WRITEMAP | MDBX_DEPRECATED_MAPASYNC | MDBX_SAFE_NOSYNC | + MDBX_NOMETASYNC | MDBX_DEPRECATED_COALESCE | MDBX_LIFORECLAIM | + MDBX_NOMEMINIT | MDBX_ACCEDE); + mode = 0; + } else { +#if MDBX_MMAP_INCOHERENT_FILE_WRITE + /* Temporary `workaround` for OpenBSD kernel's flaw. + * See https://libmdbx.dqdkfa.ru/dead-github/issues/67 */ + if ((flags & MDBX_WRITEMAP) == 0) { + if (flags & MDBX_ACCEDE) + flags |= MDBX_WRITEMAP; + else { + debug_log(MDBX_LOG_ERROR, __func__, __LINE__, + "System (i.e. 
OpenBSD) requires MDBX_WRITEMAP because " + "of an internal flaw(s) in a file/buffer/page cache.\n"); + return 42 /* ENOPROTOOPT */; + } } +#endif /* MDBX_MMAP_INCOHERENT_FILE_WRITE */ + } + + env->me_flags = (flags & ~MDBX_FATAL_ERROR); + rc = env_handle_pathname(env, pathname, mode); + if (unlikely(rc != MDBX_SUCCESS)) + goto bailout; + + env->me_dbxs = osal_calloc(env->me_maxdbs, sizeof(env->me_dbxs[0])); + env->me_db_flags = osal_calloc(env->me_maxdbs, sizeof(env->me_db_flags[0])); + env->me_dbi_seqs = osal_calloc(env->me_maxdbs, sizeof(env->me_dbi_seqs[0])); + if (unlikely(!(env->me_dbxs && env->me_db_flags && env->me_dbi_seqs))) { + rc = MDBX_ENOMEM; + goto bailout; } if ((flags & MDBX_RDONLY) == 0) { - const size_t tsize = sizeof(MDBX_txn) + sizeof(MDBX_cursor), - size = tsize + env->me_maxdbs * - (sizeof(MDBX_db) + sizeof(MDBX_cursor *) + - sizeof(MDBX_atomic_uint32_t) + 1); + MDBX_txn *txn = nullptr; + const intptr_t bitmap_bytes = +#if MDBX_ENABLE_DBI_SPARSE + ceil_powerof2(env->me_maxdbs, + CHAR_BIT * sizeof(txn->mt_dbi_sparse[0])) / + CHAR_BIT; +#else + 0; +#endif /* MDBX_ENABLE_DBI_SPARSE */ + const size_t base = sizeof(MDBX_txn) + sizeof(MDBX_cursor); + const size_t size = + base + bitmap_bytes + + env->me_maxdbs * + (sizeof(txn->mt_dbs[0]) + sizeof(txn->mt_cursors[0]) + + sizeof(txn->mt_dbi_seqs[0]) + sizeof(txn->mt_dbi_state[0])); rc = alloc_page_buf(env); - if (rc == MDBX_SUCCESS) { - memset(env->me_pbuf, -1, env->me_psize * (size_t)2); - memset(ptr_disp(env->me_pbuf, env->me_psize * (size_t)2), 0, - env->me_psize); - MDBX_txn *txn = osal_calloc(1, size); - if (txn) { - txn->mt_dbs = ptr_disp(txn, tsize); - txn->mt_cursors = - ptr_disp(txn->mt_dbs, sizeof(MDBX_db) * env->me_maxdbs); - txn->mt_dbiseqs = - ptr_disp(txn->mt_cursors, sizeof(MDBX_cursor *) * env->me_maxdbs); - txn->mt_dbistate = ptr_disp( - txn->mt_dbiseqs, sizeof(MDBX_atomic_uint32_t) * env->me_maxdbs); - txn->mt_env = env; - txn->mt_dbxs = env->me_dbxs; - txn->mt_flags = 
MDBX_TXN_FINISHED; - env->me_txn0 = txn; - txn->tw.retired_pages = pnl_alloc(MDBX_PNL_INITIAL); - txn->tw.relist = pnl_alloc(MDBX_PNL_INITIAL); - if (unlikely(!txn->tw.retired_pages || !txn->tw.relist)) - rc = MDBX_ENOMEM; - } else - rc = MDBX_ENOMEM; + if (unlikely(rc != MDBX_SUCCESS)) + goto bailout; + + memset(env->me_pbuf, -1, env->me_psize * (size_t)2); + memset(ptr_disp(env->me_pbuf, env->me_psize * (size_t)2), 0, env->me_psize); + txn = osal_calloc(1, size); + if (unlikely(!txn)) { + rc = MDBX_ENOMEM; + goto bailout; } - if (rc == MDBX_SUCCESS) - rc = osal_ioring_create(&env->me_ioring -#if defined(_WIN32) || defined(_WIN64) - , - ior_direct, env->me_overlapped_fd -#endif /* Windows */ - ); - if (rc == MDBX_SUCCESS) - adjust_defaults(env); + txn->mt_dbs = ptr_disp(txn, base); + txn->mt_cursors = + ptr_disp(txn->mt_dbs, env->me_maxdbs * sizeof(txn->mt_dbs[0])); + txn->mt_dbi_seqs = + ptr_disp(txn->mt_cursors, env->me_maxdbs * sizeof(txn->mt_cursors[0])); + txn->mt_dbi_state = + ptr_disp(txn, size - env->me_maxdbs * sizeof(txn->mt_dbi_state[0])); +#if MDBX_ENABLE_DBI_SPARSE + txn->mt_dbi_sparse = ptr_disp(txn->mt_dbi_state, -bitmap_bytes); +#endif /* MDBX_ENABLE_DBI_SPARSE */ + txn->mt_env = env; + txn->mt_flags = MDBX_TXN_FINISHED; + env->me_txn0 = txn; + txn->tw.retired_pages = pnl_alloc(MDBX_PNL_INITIAL); + txn->tw.relist = pnl_alloc(MDBX_PNL_INITIAL); + if (unlikely(!txn->tw.retired_pages || !txn->tw.relist)) { + rc = MDBX_ENOMEM; + goto bailout; + } + adjust_defaults(env); } -#if MDBX_DEBUG - if (rc == MDBX_SUCCESS) { - const meta_troika_t troika = meta_tap(env); - const meta_ptr_t head = meta_recent(env, &troika); - const MDBX_db *db = &head.ptr_c->mm_dbs[MAIN_DBI]; + rc = env_open(env, mode); + if (unlikely(rc != MDBX_SUCCESS)) + goto bailout; - DEBUG("opened database version %u, pagesize %u", - (uint8_t)unaligned_peek_u64(4, head.ptr_c->mm_magic_and_version), - env->me_psize); - DEBUG("using meta page %" PRIaPGNO ", txn %" PRIaTXN, - 
data_page(head.ptr_c)->mp_pgno, head.txnid); - DEBUG("depth: %u", db->md_depth); - DEBUG("entries: %" PRIu64, db->md_entries); - DEBUG("branch pages: %" PRIaPGNO, db->md_branch_pages); - DEBUG("leaf pages: %" PRIaPGNO, db->md_leaf_pages); - DEBUG("large/overflow pages: %" PRIaPGNO, db->md_overflow_pages); - DEBUG("root: %" PRIaPGNO, db->md_root); - DEBUG("schema_altered: %" PRIaTXN, db->md_mod_txnid); - } -#endif +#if MDBX_DEBUG + const meta_troika_t troika = meta_tap(env); + const meta_ptr_t head = meta_recent(env, &troika); + const MDBX_db *db = &head.ptr_c->mm_dbs[MAIN_DBI]; + + DEBUG("opened database version %u, pagesize %u", + (uint8_t)unaligned_peek_u64(4, head.ptr_c->mm_magic_and_version), + env->me_psize); + DEBUG("using meta page %" PRIaPGNO ", txn %" PRIaTXN, + data_page(head.ptr_c)->mp_pgno, head.txnid); + DEBUG("depth: %u", db->md_depth); + DEBUG("entries: %" PRIu64, db->md_entries); + DEBUG("branch pages: %" PRIaPGNO, db->md_branch_pages); + DEBUG("leaf pages: %" PRIaPGNO, db->md_leaf_pages); + DEBUG("large/overflow pages: %" PRIaPGNO, db->md_overflow_pages); + DEBUG("root: %" PRIaPGNO, db->md_root); + DEBUG("schema_altered: %" PRIaTXN, db->md_mod_txnid); +#endif /* MDBX_DEBUG */ -bailout: - if (rc != MDBX_SUCCESS) { - rc = env_close(env) ? MDBX_PANIC : rc; - env->me_flags = - saved_me_flags | ((rc != MDBX_PANIC) ? 
0 : MDBX_FATAL_ERROR); - } else { -#if defined(MDBX_USE_VALGRIND) || defined(__SANITIZE_ADDRESS__) + if (likely(rc == MDBX_SUCCESS)) { +#if defined(ENABLE_MEMCHECK) || defined(__SANITIZE_ADDRESS__) txn_valgrind(env, nullptr); -#endif /* MDBX_USE_VALGRIND || __SANITIZE_ADDRESS__ */ +#endif /* ENABLE_MEMCHECK || __SANITIZE_ADDRESS__ */ + } else { + bailout: + if (likely(env_close(env, false) == MDBX_SUCCESS)) { + env->me_flags = saved_me_flags; + } else { + rc = MDBX_PANIC; + env->me_flags = saved_me_flags | MDBX_FATAL_ERROR; + } } - osal_free(env_pathname.buffer_for_free); return rc; } /* Destroy resources from mdbx_env_open(), clear our readers & DBIs */ -__cold static int env_close(MDBX_env *env) { +__cold static int env_close(MDBX_env *env, bool resurrect_after_fork) { const unsigned flags = env->me_flags; - if (!(flags & MDBX_ENV_ACTIVE)) { - ENSURE(env, env->me_lcklist_next == nullptr); - return MDBX_SUCCESS; - } - env->me_flags &= ~ENV_INTERNAL_FLAGS; if (flags & MDBX_ENV_TXKEY) { - rthc_remove(env->me_txkey); - env->me_txkey = (osal_thread_key_t)0; + thread_key_delete(env->me_txkey); + env->me_txkey = 0; + } + + if (env->me_lck) + munlock_all(env); + + rthc_lock(); + int rc = rthc_remove(env); + rthc_unlock(); + +#if MDBX_ENABLE_DBI_LOCKFREE + for (struct mdbx_defer_free_item *next, *ptr = env->me_defer_free; ptr; + ptr = next) { + next = ptr->next; + osal_free(ptr); } + env->me_defer_free = nullptr; +#endif /* MDBX_ENABLE_DBI_LOCKFREE */ - munlock_all(env); if (!(env->me_flags & MDBX_RDONLY)) osal_ioring_destroy(&env->me_ioring); - lcklist_lock(); - const int rc = lcklist_detach_locked(env); - lcklist_unlock(); - env->me_lck = nullptr; if (env->me_lck_mmap.lck) osal_munmap(&env->me_lck_mmap); if (env->me_map) { osal_munmap(&env->me_dxb_mmap); -#ifdef MDBX_USE_VALGRIND +#ifdef ENABLE_MEMCHECK VALGRIND_DISCARD(env->me_valgrind_handle); env->me_valgrind_handle = -1; -#endif +#endif /* ENABLE_MEMCHECK */ } #if defined(_WIN32) || defined(_WIN64) @@ -19452,6 
+20015,11 @@ __cold static int env_close(MDBX_env *env) { CloseHandle(env->me_data_lock_event); env->me_data_lock_event = INVALID_HANDLE_VALUE; } + eASSERT(env, !resurrect_after_fork); + if (env->me_pathname_char) { + osal_free(env->me_pathname_char); + env->me_pathname_char = nullptr; + } #endif /* Windows */ if (env->me_dsync_fd != INVALID_HANDLE_VALUE) { @@ -19469,49 +20037,82 @@ __cold static int env_close(MDBX_env *env) { env->me_lfd = INVALID_HANDLE_VALUE; } - if (env->me_dbxs) { - for (size_t i = CORE_DBS; i < env->me_numdbs; ++i) - if (env->me_dbxs[i].md_name.iov_len) - osal_free(env->me_dbxs[i].md_name.iov_base); - osal_free(env->me_dbxs); - env->me_numdbs = CORE_DBS; - env->me_dbxs = nullptr; - } - if (env->me_pbuf) { - osal_memalign_free(env->me_pbuf); - env->me_pbuf = nullptr; - } - if (env->me_dbiseqs) { - osal_free(env->me_dbiseqs); - env->me_dbiseqs = nullptr; - } - if (env->me_dbflags) { - osal_free(env->me_dbflags); - env->me_dbflags = nullptr; - } - if (env->me_pathname) { - osal_free(env->me_pathname); - env->me_pathname = nullptr; - } -#if defined(_WIN32) || defined(_WIN64) - if (env->me_pathname_char) { - osal_free(env->me_pathname_char); - env->me_pathname_char = nullptr; - } -#endif /* Windows */ - if (env->me_txn0) { - dpl_free(env->me_txn0); - txl_free(env->me_txn0->tw.lifo_reclaimed); - pnl_free(env->me_txn0->tw.retired_pages); - pnl_free(env->me_txn0->tw.spilled.list); - pnl_free(env->me_txn0->tw.relist); - osal_free(env->me_txn0); - env->me_txn0 = nullptr; + if (!resurrect_after_fork) { + if (env->me_dbxs) { + for (size_t i = CORE_DBS; i < env->me_numdbs; ++i) + if (env->me_dbxs[i].md_name.iov_len) + osal_free(env->me_dbxs[i].md_name.iov_base); + osal_free(env->me_dbxs); + env->me_numdbs = CORE_DBS; + env->me_dbxs = nullptr; + } + if (env->me_pbuf) { + osal_memalign_free(env->me_pbuf); + env->me_pbuf = nullptr; + } + if (env->me_dbi_seqs) { + osal_free(env->me_dbi_seqs); + env->me_dbi_seqs = nullptr; + } + if (env->me_db_flags) { + 
osal_free(env->me_db_flags); + env->me_db_flags = nullptr; + } + if (env->me_pathname.buffer) { + osal_free(env->me_pathname.buffer); + env->me_pathname.buffer = nullptr; + } + if (env->me_txn0) { + dpl_free(env->me_txn0); + txl_free(env->me_txn0->tw.lifo_reclaimed); + pnl_free(env->me_txn0->tw.retired_pages); + pnl_free(env->me_txn0->tw.spilled.list); + pnl_free(env->me_txn0->tw.relist); + osal_free(env->me_txn0); + env->me_txn0 = nullptr; + } } env->me_stuck_meta = -1; return rc; } +#if !(defined(_WIN32) || defined(_WIN64)) +__cold int mdbx_env_resurrect_after_fork(MDBX_env *env) { + if (unlikely(!env)) + return MDBX_EINVAL; + + if (unlikely(env->me_signature.weak != MDBX_ME_SIGNATURE)) + return MDBX_EBADSIGN; + + if (unlikely(env->me_flags & MDBX_FATAL_ERROR)) + return MDBX_PANIC; + + if (unlikely((env->me_flags & MDBX_ENV_ACTIVE) == 0)) + return MDBX_SUCCESS; + + const uint32_t new_pid = osal_getpid(); + if (unlikely(env->me_pid == new_pid)) + return MDBX_SUCCESS; + + if (!atomic_cas32(&env->me_signature, MDBX_ME_SIGNATURE, ~MDBX_ME_SIGNATURE)) + return MDBX_EBADSIGN; + + if (env->me_txn) + txn_abort(env->me_txn0); + env->me_live_reader = 0; + int rc = env_close(env, true); + env->me_signature.weak = MDBX_ME_SIGNATURE; + if (likely(rc == MDBX_SUCCESS)) { + rc = (env->me_flags & MDBX_EXCLUSIVE) ? MDBX_BUSY : env_open(env, 0); + if (unlikely(rc != MDBX_SUCCESS && env_close(env, false) != MDBX_SUCCESS)) { + rc = MDBX_PANIC; + env->me_flags |= MDBX_FATAL_ERROR; + } + } + return rc; +} +#endif /* Windows */ + __cold int mdbx_env_close_ex(MDBX_env *env, bool dont_sync) { MDBX_page *dp; int rc = MDBX_SUCCESS; @@ -19563,8 +20164,11 @@ __cold int mdbx_env_close_ex(MDBX_env *env, bool dont_sync) { #endif /* Windows */ } + if (env->me_txn0 && env->me_txn0->mt_owner == osal_thread_self()) + osal_txn_unlock(env); + eASSERT(env, env->me_signature.weak == 0); - rc = env_close(env) ? MDBX_PANIC : rc; + rc = env_close(env, false) ? 
MDBX_PANIC : rc; ENSURE(env, osal_fastmutex_destroy(&env->me_dbi_lock) == MDBX_SUCCESS); #if defined(_WIN32) || defined(_WIN64) /* me_remap_guard don't have destructor (Slim Reader/Writer Lock) */ @@ -19575,7 +20179,8 @@ __cold int mdbx_env_close_ex(MDBX_env *env, bool dont_sync) { #if MDBX_LOCKING > MDBX_LOCKING_SYSV MDBX_lockinfo *const stub = lckless_stub(env); - ENSURE(env, osal_ipclock_destroy(&stub->mti_wlock) == 0); + /* может вернуть ошибку в дочернем процессе после fork() */ + osal_ipclock_destroy(&stub->mti_wlock); #endif /* MDBX_LOCKING */ while ((dp = env->me_dp_reserve) != NULL) { @@ -19586,8 +20191,6 @@ __cold int mdbx_env_close_ex(MDBX_env *env, bool dont_sync) { osal_free(ptr); } VALGRIND_DESTROY_MEMPOOL(env); - ENSURE(env, env->me_lcklist_next == nullptr); - env->me_pid = 0; osal_free(env); return rc; @@ -19929,21 +20532,22 @@ __hot __noinline static int page_search_root(MDBX_cursor *mc, return MDBX_SUCCESS; } -static int setup_dbx(MDBX_dbx *const dbx, const MDBX_db *const db, +static int setup_sdb(MDBX_dbx *const dbx, const MDBX_db *const db, const unsigned pagesize) { + if (unlikely(!db_check_flags(db->md_flags))) { + ERROR("incompatible or invalid db.md_flags (%u) ", db->md_flags); + return MDBX_INCOMPATIBLE; + } if (unlikely(!dbx->md_cmp)) { dbx->md_cmp = get_default_keycmp(db->md_flags); dbx->md_dcmp = get_default_datacmp(db->md_flags); } - dbx->md_klen_min = - (db->md_flags & MDBX_INTEGERKEY) ? 4 /* sizeof(uint32_t) */ : 0; + dbx->md_klen_min = keysize_min(db->md_flags); dbx->md_klen_max = keysize_max(pagesize, db->md_flags); assert(dbx->md_klen_max != (unsigned)-1); - dbx->md_vlen_min = (db->md_flags & MDBX_INTEGERDUP) - ? 4 /* sizeof(uint32_t) */ - : ((db->md_flags & MDBX_DUPFIXED) ? 
1 : 0); + dbx->md_vlen_min = valsize_min(db->md_flags); dbx->md_vlen_max = valsize_max(pagesize, db->md_flags); assert(dbx->md_vlen_max != (size_t)-1); @@ -19961,18 +20565,14 @@ static int setup_dbx(MDBX_dbx *const dbx, const MDBX_db *const db, static int fetch_sdb(MDBX_txn *txn, size_t dbi) { MDBX_cursor_couple couple; - if (unlikely(dbi_changed(txn, dbi))) { - NOTICE("dbi %zu was changed for txn %" PRIaTXN, dbi, txn->mt_txnid); - return MDBX_BAD_DBI; - } int rc = cursor_init(&couple.outer, txn, MAIN_DBI); if (unlikely(rc != MDBX_SUCCESS)) return rc; - MDBX_dbx *const dbx = &txn->mt_dbxs[dbi]; + MDBX_dbx *const dbx = &txn->mt_env->me_dbxs[dbi]; rc = page_search(&couple.outer, &dbx->md_name, 0); if (unlikely(rc != MDBX_SUCCESS)) { - notfound: + bailout: NOTICE("dbi %zu refs to inaccessible subDB `%*s` for txn %" PRIaTXN " (err %d)", dbi, (int)dbx->md_name.iov_len, (const char *)dbx->md_name.iov_base, @@ -19984,7 +20584,7 @@ static int fetch_sdb(MDBX_txn *txn, size_t dbi) { struct node_result nsr = node_search(&couple.outer, &dbx->md_name); if (unlikely(!nsr.exact)) { rc = MDBX_NOTFOUND; - goto notfound; + goto bailout; } if (unlikely((node_flags(nsr.node) & (F_DUPDATA | F_SUBDATA)) != F_SUBDATA)) { NOTICE("dbi %zu refs to not a named subDB `%*s` for txn %" PRIaTXN " (%s)", @@ -20027,11 +20627,11 @@ static int fetch_sdb(MDBX_txn *txn, size_t dbi) { return MDBX_CORRUPTED; } #endif /* !MDBX_DISABLE_VALIDATION */ - rc = setup_dbx(dbx, db, txn->mt_env->me_psize); + rc = setup_sdb(dbx, db, txn->mt_env->me_psize); if (unlikely(rc != MDBX_SUCCESS)) return rc; - txn->mt_dbistate[dbi] &= ~DBI_STALE; + txn->mt_dbi_state[dbi] &= ~DBI_STALE; return MDBX_SUCCESS; } @@ -20062,8 +20662,8 @@ __hot static int page_search_lowest(MDBX_cursor *mc) { * [in] key the key to search for, or NULL for first/last page. * [in] flags If MDBX_PS_MODIFY is set, visited pages in the DB * are touched (updated with new page numbers). 
- * If MDBX_PS_FIRST or MDBX_PS_LAST is set, find first or last - * leaf. + * If MDBX_PS_FIRST or MDBX_PS_LAST is set, + * find first or last leaf. * This is used by mdbx_cursor_first() and mdbx_cursor_last(). * If MDBX_PS_ROOTONLY set, just fetch root node, no further * lookups. @@ -20081,7 +20681,7 @@ __hot static int page_search(MDBX_cursor *mc, const MDBX_val *key, int flags) { } /* Make sure we're using an up-to-date root */ - if (unlikely(*mc->mc_dbistate & DBI_STALE)) { + if (unlikely(*mc->mc_dbi_state & DBI_STALE)) { rc = fetch_sdb(mc->mc_txn, mc->mc_dbi); if (unlikely(rc != MDBX_SUCCESS)) return rc; @@ -20105,7 +20705,7 @@ __hot static int page_search(MDBX_cursor *mc, const MDBX_val *key, int flags) { do if ((scan->mt_flags & MDBX_TXN_DIRTY) && (mc->mc_dbi == MAIN_DBI || - (scan->mt_dbistate[mc->mc_dbi] & DBI_DIRTY))) { + (scan->mt_dbi_state[mc->mc_dbi] & DBI_DIRTY))) { /* После коммита вложенных тразакций может быть mod_txnid > front */ pp_txnid = scan->mt_front; break; @@ -20181,9 +20781,6 @@ int mdbx_get(const MDBX_txn *txn, MDBX_dbi dbi, const MDBX_val *key, if (unlikely(!key || !data)) return MDBX_EINVAL; - if (unlikely(!check_dbi(txn, dbi, DBI_USRVALID))) - return MDBX_BAD_DBI; - MDBX_cursor_couple cx; rc = cursor_init(&cx.outer, txn, dbi); if (unlikely(rc != MDBX_SUCCESS)) @@ -20201,9 +20798,6 @@ int mdbx_get_equal_or_great(const MDBX_txn *txn, MDBX_dbi dbi, MDBX_val *key, if (unlikely(!key || !data)) return MDBX_EINVAL; - if (unlikely(!check_dbi(txn, dbi, DBI_USRVALID))) - return MDBX_BAD_DBI; - if (unlikely(txn->mt_flags & MDBX_TXN_BLOCKED)) return MDBX_BAD_TXN; @@ -20227,9 +20821,6 @@ int mdbx_get_ex(const MDBX_txn *txn, MDBX_dbi dbi, MDBX_val *key, if (unlikely(!key || !data)) return MDBX_EINVAL; - if (unlikely(!check_dbi(txn, dbi, DBI_USRVALID))) - return MDBX_BAD_DBI; - MDBX_cursor_couple cx; rc = cursor_init(&cx.outer, txn, dbi); if (unlikely(rc != MDBX_SUCCESS)) @@ -20323,39 +20914,41 @@ static int cursor_sibling(MDBX_cursor *mc, int dir) { 
/* Move the cursor to the next data item. */ static int cursor_next(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, MDBX_cursor_op op) { - MDBX_page *mp; - MDBX_node *node; + assert(op == MDBX_NEXT || op == MDBX_NEXT_DUP || op == MDBX_NEXT_NODUP); int rc; if (unlikely(mc->mc_flags & C_DEL) && op == MDBX_NEXT_DUP) return MDBX_NOTFOUND; - if (unlikely(!(mc->mc_flags & C_INITIALIZED))) + if (unlikely(!(mc->mc_flags & C_INITIALIZED))) { + if (unlikely(mc->mc_flags & C_SUB)) + return MDBX_NOTFOUND; return cursor_first(mc, key, data); + } - mp = mc->mc_pg[mc->mc_top]; + const MDBX_page *mp = mc->mc_pg[mc->mc_top]; if (unlikely(mc->mc_flags & C_EOF)) { if (mc->mc_ki[mc->mc_top] + (size_t)1 >= page_numkeys(mp)) return MDBX_NOTFOUND; mc->mc_flags ^= C_EOF; } - if (mc->mc_db->md_flags & MDBX_DUPSORT) { - node = page_node(mp, mc->mc_ki[mc->mc_top]); - if (node_flags(node) & F_DUPDATA) { - if (op == MDBX_NEXT || op == MDBX_NEXT_DUP) { + if (mc->mc_xcursor) { + if (op != MDBX_NEXT_NODUP) { + const MDBX_node *node = page_node(mp, mc->mc_ki[mc->mc_top]); + if (node_flags(node) & F_DUPDATA) { rc = cursor_next(&mc->mc_xcursor->mx_cursor, data, NULL, MDBX_NEXT); - if (op != MDBX_NEXT || rc != MDBX_NOTFOUND) { - if (likely(rc == MDBX_SUCCESS)) - get_key_optional(node, key); - return rc; + if (likely(rc == MDBX_SUCCESS)) { + get_key_optional(node, key); + return MDBX_SUCCESS; } + if (unlikely(rc != MDBX_NOTFOUND)) + return rc; } - } else { - mc->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED | C_EOF); - if (op == MDBX_NEXT_DUP) + if (op != MDBX_NEXT) return MDBX_NOTFOUND; } + mc->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED | C_EOF); } DEBUG("cursor_next: top page is %" PRIaPGNO " in cursor %p", mp->mp_pgno, @@ -20399,7 +20992,7 @@ static int cursor_next(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, return MDBX_SUCCESS; } - node = page_node(mp, mc->mc_ki[mc->mc_top]); + const MDBX_node *node = page_node(mp, mc->mc_ki[mc->mc_top]); if (node_flags(node) & F_DUPDATA) { rc = 
cursor_xinit1(mc, node, mp); if (unlikely(rc != MDBX_SUCCESS)) @@ -20420,40 +21013,41 @@ static int cursor_next(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, /* Move the cursor to the previous data item. */ static int cursor_prev(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, MDBX_cursor_op op) { - MDBX_page *mp; - MDBX_node *node; + assert(op == MDBX_PREV || op == MDBX_PREV_DUP || op == MDBX_PREV_NODUP); int rc; if (unlikely(mc->mc_flags & C_DEL) && op == MDBX_PREV_DUP) return MDBX_NOTFOUND; if (unlikely(!(mc->mc_flags & C_INITIALIZED))) { + if (unlikely(mc->mc_flags & C_SUB)) + return MDBX_NOTFOUND; rc = cursor_last(mc, key, data); - if (unlikely(rc)) + if (unlikely(rc != MDBX_SUCCESS)) return rc; mc->mc_ki[mc->mc_top]++; } - mp = mc->mc_pg[mc->mc_top]; - if ((mc->mc_db->md_flags & MDBX_DUPSORT) && - mc->mc_ki[mc->mc_top] < page_numkeys(mp)) { - node = page_node(mp, mc->mc_ki[mc->mc_top]); - if (node_flags(node) & F_DUPDATA) { - if (op == MDBX_PREV || op == MDBX_PREV_DUP) { - rc = cursor_prev(&mc->mc_xcursor->mx_cursor, data, NULL, MDBX_PREV); - if (op != MDBX_PREV || rc != MDBX_NOTFOUND) { + const MDBX_page *mp = mc->mc_pg[mc->mc_top]; + if (mc->mc_xcursor) { + if (op != MDBX_PREV_NODUP) { + if (likely(mc->mc_ki[mc->mc_top] < page_numkeys(mp))) { + const MDBX_node *node = page_node(mp, mc->mc_ki[mc->mc_top]); + if (node_flags(node) & F_DUPDATA) { + rc = cursor_prev(&mc->mc_xcursor->mx_cursor, data, NULL, MDBX_PREV); if (likely(rc == MDBX_SUCCESS)) { get_key_optional(node, key); mc->mc_flags &= ~C_EOF; + return MDBX_SUCCESS; } - return rc; + if (unlikely(rc != MDBX_NOTFOUND)) + return rc; } } - } else { - mc->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED | C_EOF); - if (op == MDBX_PREV_DUP) + if (op != MDBX_PREV) return MDBX_NOTFOUND; } + mc->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED | C_EOF); } DEBUG("cursor_prev: top page is %" PRIaPGNO " in cursor %p", mp->mp_pgno, @@ -20489,8 +21083,7 @@ static int cursor_prev(MDBX_cursor *mc, MDBX_val *key, 
MDBX_val *data, return MDBX_SUCCESS; } - node = page_node(mp, mc->mc_ki[mc->mc_top]); - + const MDBX_node *node = page_node(mp, mc->mc_ki[mc->mc_top]); if (node_flags(node) & F_DUPDATA) { rc = cursor_xinit1(mc, node, mp); if (unlikely(rc != MDBX_SUCCESS)) @@ -20525,7 +21118,7 @@ cursor_set(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, MDBX_cursor_op op) { } MDBX_val aligned_key = *key; - uint64_t aligned_keybytes; + uint64_t aligned_key_buf; if (mc->mc_db->md_flags & MDBX_INTEGERKEY) { switch (aligned_key.iov_len) { default: @@ -20536,13 +21129,13 @@ cursor_set(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, MDBX_cursor_op op) { if (unlikely(3 & (uintptr_t)aligned_key.iov_base)) /* copy instead of return error to avoid break compatibility */ aligned_key.iov_base = - memcpy(&aligned_keybytes, aligned_key.iov_base, 4); + memcpy(&aligned_key_buf, aligned_key.iov_base, 4); break; case 8: if (unlikely(7 & (uintptr_t)aligned_key.iov_base)) /* copy instead of return error to avoid break compatibility */ aligned_key.iov_base = - memcpy(&aligned_keybytes, aligned_key.iov_base, 8); + memcpy(&aligned_key_buf, aligned_key.iov_base, 8); break; } } @@ -20581,7 +21174,7 @@ cursor_set(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, MDBX_cursor_op op) { } if (cmp > 0) { const size_t nkeys = page_numkeys(mp); - if (nkeys > 1) { + if (likely(nkeys > 1)) { if (IS_LEAF2(mp)) { nodekey.iov_base = page_leaf2key(mp, nkeys - 1, nodekey.iov_len); } else { @@ -20600,8 +21193,9 @@ cursor_set(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, MDBX_cursor_op op) { goto got_node; } if (cmp < 0) { - if (mc->mc_ki[mc->mc_top] < page_numkeys(mp)) { - /* This is definitely the right page, skip search_page */ + /* This is definitely the right page, skip search_page */ + if (mc->mc_ki[mc->mc_top] != 0 /* уже проверяли выше */ && + mc->mc_ki[mc->mc_top] < page_numkeys(mp)) { if (IS_LEAF2(mp)) { nodekey.iov_base = page_leaf2key(mp, mc->mc_ki[mc->mc_top], nodekey.iov_len); @@ -20625,23 +21219,22 @@ 
cursor_set(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, MDBX_cursor_op op) { } /* If any parents have right-sibs, search. * Otherwise, there's nothing further. */ - size_t i; - for (i = 0; i < mc->mc_top; i++) + for (size_t i = 0; i < mc->mc_top; i++) if (mc->mc_ki[i] < page_numkeys(mc->mc_pg[i]) - 1) - break; - if (i == mc->mc_top) { - /* There are no other pages */ - cASSERT(mc, nkeys <= UINT16_MAX); - mc->mc_ki[mc->mc_top] = (uint16_t)nkeys; - mc->mc_flags |= C_EOF; - ret.err = MDBX_NOTFOUND; - return ret; - } + goto continue_other_pages; + + /* There are no other pages */ + cASSERT(mc, nkeys <= UINT16_MAX); + mc->mc_ki[mc->mc_top] = (uint16_t)nkeys; + mc->mc_flags |= C_EOF; + ret.err = MDBX_NOTFOUND; + return ret; } + continue_other_pages: if (!mc->mc_top) { /* There are no other pages */ mc->mc_ki[mc->mc_top] = 0; - if (op == MDBX_SET_RANGE) + if (op >= MDBX_SET_RANGE) goto got_node; cASSERT(mc, mc->mc_ki[mc->mc_top] < page_numkeys(mc->mc_pg[mc->mc_top]) || @@ -20666,7 +21259,7 @@ search_node:; node = nsr.node; ret.exact = nsr.exact; if (!ret.exact) { - if (op != MDBX_SET_RANGE) { + if (op < MDBX_SET_RANGE) { /* MDBX_SET specified and not an exact match. 
*/ if (unlikely(mc->mc_ki[mc->mc_top] >= page_numkeys(mc->mc_pg[mc->mc_top]))) @@ -20703,7 +21296,7 @@ search_node:; } if (IS_LEAF2(mp)) { - if (op == MDBX_SET_RANGE || op == MDBX_SET_KEY) { + if (op >= MDBX_SET_KEY) { key->iov_len = mc->mc_db->md_xsize; key->iov_base = page_leaf2key(mp, mc->mc_ki[mc->mc_top], key->iov_len); } @@ -20715,7 +21308,7 @@ search_node:; ret.err = cursor_xinit1(mc, node, mp); if (unlikely(ret.err != MDBX_SUCCESS)) return ret; - if (op == MDBX_SET || op == MDBX_SET_KEY || op == MDBX_SET_RANGE) { + if (op >= MDBX_SET) { MDBX_ANALYSIS_ASSUME(mc->mc_xcursor != nullptr); ret.err = cursor_first(&mc->mc_xcursor->mx_cursor, data, NULL); if (unlikely(ret.err != MDBX_SUCCESS)) @@ -20731,7 +21324,7 @@ search_node:; } } } else if (likely(data)) { - if (op == MDBX_GET_BOTH || op == MDBX_GET_BOTH_RANGE) { + if (op <= MDBX_GET_BOTH_RANGE) { if (unlikely(data->iov_len < mc->mc_dbx->md_vlen_min || data->iov_len > mc->mc_dbx->md_vlen_max)) { cASSERT(mc, !"Invalid data-size"); @@ -20783,7 +21376,7 @@ search_node:; } /* The key already matches in all other cases */ - if (op == MDBX_SET_RANGE || op == MDBX_SET_KEY) + if (op >= MDBX_SET_KEY) get_key_optional(node, key); DEBUG("==> cursor placed on key [%s], data [%s]", DKEY_DEBUG(key), @@ -20942,6 +21535,8 @@ static __hot int cursor_get(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, return rc; } } else { + cASSERT(mc, !mc->mc_xcursor || !(mc->mc_xcursor->mx_cursor.mc_flags & + C_INITIALIZED)); rc = node_read(mc, node, data, mp); if (unlikely(rc)) return rc; @@ -21068,6 +21663,7 @@ static __hot int cursor_get(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, case MDBX_LAST_DUP: mfunc = cursor_last; goto move; + case MDBX_SET_UPPERBOUND: /* mostly same as MDBX_SET_LOWERBOUND */ case MDBX_SET_LOWERBOUND: { if (unlikely(key == NULL || data == NULL)) @@ -21111,6 +21707,153 @@ static __hot int cursor_get(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, } break; } + + /* Doubtless API to positioning of the cursor at a 
specified key. */ + case MDBX_TO_KEY_LESSER_THAN: + case MDBX_TO_KEY_LESSER_OR_EQUAL: + case MDBX_TO_KEY_EQUAL: + case MDBX_TO_KEY_GREATER_OR_EQUAL: + case MDBX_TO_KEY_GREATER_THAN: { + if (unlikely(key == NULL)) + return MDBX_EINVAL; + struct cursor_set_result csr = cursor_set(mc, key, data, MDBX_SET_RANGE); + rc = csr.err; + if (csr.exact) { + cASSERT(mc, csr.err == MDBX_SUCCESS); + if (op == MDBX_TO_KEY_LESSER_THAN) + rc = cursor_prev(mc, key, data, MDBX_PREV_NODUP); + else if (op == MDBX_TO_KEY_GREATER_THAN) + rc = cursor_next(mc, key, data, MDBX_NEXT_NODUP); + } else if (op < MDBX_TO_KEY_EQUAL && + (rc == MDBX_NOTFOUND || rc == MDBX_SUCCESS)) + rc = cursor_prev(mc, key, data, MDBX_PREV_NODUP); + else if (op == MDBX_TO_KEY_EQUAL && rc == MDBX_SUCCESS) + rc = MDBX_NOTFOUND; + break; + } + + /* Doubtless API to positioning of the cursor at a specified key-value pair + * for multi-value hives. */ + case MDBX_TO_EXACT_KEY_VALUE_LESSER_THAN: + case MDBX_TO_EXACT_KEY_VALUE_LESSER_OR_EQUAL: + case MDBX_TO_EXACT_KEY_VALUE_EQUAL: + case MDBX_TO_EXACT_KEY_VALUE_GREATER_OR_EQUAL: + case MDBX_TO_EXACT_KEY_VALUE_GREATER_THAN: { + if (unlikely(key == NULL || data == NULL)) + return MDBX_EINVAL; + MDBX_val save_data = *data; + struct cursor_set_result csr = cursor_set(mc, key, data, MDBX_SET_KEY); + rc = csr.err; + if (rc == MDBX_SUCCESS) { + cASSERT(mc, csr.exact); + MDBX_cursor *const mx = + (mc->mc_xcursor && + (mc->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED)) + ? 
&mc->mc_xcursor->mx_cursor + : nullptr; + if (mx) { + csr = cursor_set(mx, &save_data, NULL, MDBX_SET_RANGE); + rc = csr.err; + if (csr.exact) { + cASSERT(mc, csr.err == MDBX_SUCCESS); + if (op == MDBX_TO_EXACT_KEY_VALUE_LESSER_THAN) + rc = cursor_prev(mx, data, NULL, MDBX_PREV); + else if (op == MDBX_TO_EXACT_KEY_VALUE_GREATER_THAN) + rc = cursor_next(mx, data, NULL, MDBX_NEXT); + } else if (op < MDBX_TO_EXACT_KEY_VALUE_EQUAL && + (rc == MDBX_NOTFOUND || rc == MDBX_SUCCESS)) + rc = cursor_prev(mx, data, NULL, MDBX_PREV); + else if (op == MDBX_TO_EXACT_KEY_VALUE_EQUAL && rc == MDBX_SUCCESS) + rc = MDBX_NOTFOUND; + } else { + int cmp = mc->mc_dbx->md_dcmp(data, &save_data); + switch (op) { + default: + __unreachable(); + case MDBX_TO_EXACT_KEY_VALUE_LESSER_THAN: + rc = (cmp < 0) ? MDBX_SUCCESS : MDBX_NOTFOUND; + break; + case MDBX_TO_EXACT_KEY_VALUE_LESSER_OR_EQUAL: + rc = (cmp <= 0) ? MDBX_SUCCESS : MDBX_NOTFOUND; + break; + case MDBX_TO_EXACT_KEY_VALUE_EQUAL: + rc = (cmp == 0) ? MDBX_SUCCESS : MDBX_NOTFOUND; + break; + case MDBX_TO_EXACT_KEY_VALUE_GREATER_OR_EQUAL: + rc = (cmp >= 0) ? MDBX_SUCCESS : MDBX_NOTFOUND; + break; + case MDBX_TO_EXACT_KEY_VALUE_GREATER_THAN: + rc = (cmp > 0) ? MDBX_SUCCESS : MDBX_NOTFOUND; + break; + } + } + } + break; + } + case MDBX_TO_PAIR_LESSER_THAN: + case MDBX_TO_PAIR_LESSER_OR_EQUAL: + case MDBX_TO_PAIR_EQUAL: + case MDBX_TO_PAIR_GREATER_OR_EQUAL: + case MDBX_TO_PAIR_GREATER_THAN: { + if (unlikely(key == NULL || data == NULL)) + return MDBX_EINVAL; + MDBX_val save_data = *data; + struct cursor_set_result csr = cursor_set(mc, key, data, MDBX_SET_RANGE); + rc = csr.err; + if (csr.exact) { + cASSERT(mc, csr.err == MDBX_SUCCESS); + MDBX_cursor *const mx = + (mc->mc_xcursor && + (mc->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED)) + ? 
&mc->mc_xcursor->mx_cursor + : nullptr; + if (mx) { + csr = cursor_set(mx, &save_data, NULL, MDBX_SET_RANGE); + rc = csr.err; + if (csr.exact) { + cASSERT(mc, csr.err == MDBX_SUCCESS); + if (op == MDBX_TO_PAIR_LESSER_THAN) + rc = cursor_prev(mc, key, data, MDBX_PREV); + else if (op == MDBX_TO_PAIR_GREATER_THAN) + rc = cursor_next(mc, key, data, MDBX_NEXT); + } else if (op < MDBX_TO_PAIR_EQUAL && + (rc == MDBX_NOTFOUND || rc == MDBX_SUCCESS)) + rc = cursor_prev(mc, key, data, MDBX_PREV); + else if (op == MDBX_TO_PAIR_EQUAL && rc == MDBX_SUCCESS) + rc = MDBX_NOTFOUND; + else if (op > MDBX_TO_PAIR_EQUAL && rc == MDBX_NOTFOUND) + rc = cursor_next(mc, key, data, MDBX_NEXT); + } else { + int cmp = mc->mc_dbx->md_dcmp(data, &save_data); + switch (op) { + default: + __unreachable(); + case MDBX_TO_PAIR_LESSER_THAN: + rc = (cmp < 0) ? MDBX_SUCCESS : cursor_prev(mc, key, data, MDBX_PREV); + break; + case MDBX_TO_PAIR_LESSER_OR_EQUAL: + rc = + (cmp <= 0) ? MDBX_SUCCESS : cursor_prev(mc, key, data, MDBX_PREV); + break; + case MDBX_TO_PAIR_EQUAL: + rc = (cmp == 0) ? MDBX_SUCCESS : MDBX_NOTFOUND; + break; + case MDBX_TO_PAIR_GREATER_OR_EQUAL: + rc = + (cmp >= 0) ? MDBX_SUCCESS : cursor_next(mc, key, data, MDBX_NEXT); + break; + case MDBX_TO_PAIR_GREATER_THAN: + rc = (cmp > 0) ? 
MDBX_SUCCESS : cursor_next(mc, key, data, MDBX_NEXT); + break; + } + } + } else if (op < MDBX_TO_PAIR_EQUAL && + (rc == MDBX_NOTFOUND || rc == MDBX_SUCCESS)) + rc = cursor_prev(mc, key, data, MDBX_PREV_NODUP); + else if (op == MDBX_TO_PAIR_EQUAL && rc == MDBX_SUCCESS) + rc = MDBX_NOTFOUND; + break; + } default: DEBUG("unhandled/unimplemented cursor operation %u", op); return MDBX_EINVAL; @@ -21136,6 +21879,80 @@ int mdbx_cursor_get(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, return cursor_get(mc, key, data, op); } +int mdbx_cursor_scan(MDBX_cursor *mc, MDBX_predicate_func *predicate, + void *context, MDBX_cursor_op start_op, + MDBX_cursor_op turn_op, void *arg) { + if (unlikely(!predicate)) + return MDBX_EINVAL; + + const unsigned valid_start_mask = + 1 << MDBX_FIRST | 1 << MDBX_FIRST_DUP | 1 << MDBX_LAST | + 1 << MDBX_LAST_DUP | 1 << MDBX_GET_CURRENT | 1 << MDBX_GET_MULTIPLE; + if (unlikely(start_op > 30 || ((1 << start_op) & valid_start_mask) == 0)) + return MDBX_EINVAL; + + const unsigned valid_turn_mask = + 1 << MDBX_NEXT | 1 << MDBX_NEXT_DUP | 1 << MDBX_NEXT_NODUP | + 1 << MDBX_PREV | 1 << MDBX_PREV_DUP | 1 << MDBX_PREV_NODUP | + 1 << MDBX_NEXT_MULTIPLE | 1 << MDBX_PREV_MULTIPLE; + if (unlikely(turn_op > 30 || ((1 << turn_op) & valid_turn_mask) == 0)) + return MDBX_EINVAL; + + MDBX_val key = {nullptr, 0}, data = {nullptr, 0}; + int rc = mdbx_cursor_get(mc, &key, &data, start_op); + while (likely(rc == MDBX_SUCCESS)) { + rc = predicate(context, &key, &data, arg); + if (rc != MDBX_RESULT_FALSE) + return rc; + rc = cursor_get(mc, &key, &data, turn_op); + } + return (rc == MDBX_NOTFOUND) ? 
MDBX_RESULT_FALSE : rc; +} + +int mdbx_cursor_scan_from(MDBX_cursor *mc, MDBX_predicate_func *predicate, + void *context, MDBX_cursor_op from_op, MDBX_val *key, + MDBX_val *value, MDBX_cursor_op turn_op, void *arg) { + if (unlikely(!predicate)) + return MDBX_EINVAL; + + const unsigned valid_start_mask = + 1 << MDBX_GET_BOTH | 1 << MDBX_GET_BOTH_RANGE | 1 << MDBX_SET_KEY | + 1 << MDBX_GET_MULTIPLE | 1 << MDBX_SET_LOWERBOUND | + 1 << MDBX_SET_UPPERBOUND; + ; + if (unlikely(from_op < MDBX_TO_KEY_LESSER_THAN && + ((1 << from_op) & valid_start_mask) == 0)) + return MDBX_EINVAL; + + const unsigned valid_turn_mask = + 1 << MDBX_NEXT | 1 << MDBX_NEXT_DUP | 1 << MDBX_NEXT_NODUP | + 1 << MDBX_PREV | 1 << MDBX_PREV_DUP | 1 << MDBX_PREV_NODUP | + 1 << MDBX_NEXT_MULTIPLE | 1 << MDBX_PREV_MULTIPLE; + if (unlikely(turn_op > 30 || ((1 << turn_op) & valid_turn_mask) == 0)) + return MDBX_EINVAL; + + int rc = mdbx_cursor_get(mc, key, value, from_op); + if (unlikely(MDBX_IS_ERROR(rc))) + return rc; + + cASSERT(mc, key != nullptr); + MDBX_val stub; + if (!value) { + value = &stub; + rc = cursor_get(mc, key, value, MDBX_GET_CURRENT); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; + } + for (;;) { + rc = predicate(context, key, value, arg); + if (rc != MDBX_RESULT_FALSE) + return rc; + rc = cursor_get(mc, key, value, turn_op); + if (rc != MDBX_SUCCESS) + return (rc == MDBX_NOTFOUND) ? 
MDBX_RESULT_FALSE : rc; + } +} + static int cursor_first_batch(MDBX_cursor *mc) { if (!(mc->mc_flags & C_INITIALIZED) || mc->mc_top) { int err = page_search(mc, NULL, MDBX_PS_FIRST); @@ -21262,16 +22079,19 @@ int mdbx_cursor_get_batch(MDBX_cursor *mc, size_t *count, MDBX_val *pairs, } static int touch_dbi(MDBX_cursor *mc) { - cASSERT(mc, (*mc->mc_dbistate & DBI_DIRTY) == 0); - *mc->mc_dbistate |= DBI_DIRTY; + cASSERT(mc, (*mc->mc_dbi_state & DBI_DIRTY) == 0); + *mc->mc_dbi_state |= DBI_DIRTY; mc->mc_txn->mt_flags |= MDBX_TXN_DIRTY; if (mc->mc_dbi >= CORE_DBS) { /* Touch DB record of named DB */ MDBX_cursor_couple cx; - int rc = cursor_init(&cx.outer, mc->mc_txn, MAIN_DBI); + int rc = dbi_check(mc->mc_txn, MAIN_DBI); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; + rc = cursor_init(&cx.outer, mc->mc_txn, MAIN_DBI); if (unlikely(rc != MDBX_SUCCESS)) return rc; - mc->mc_txn->mt_dbistate[MAIN_DBI] |= DBI_DIRTY; + mc->mc_txn->mt_dbi_state[MAIN_DBI] |= DBI_DIRTY; rc = page_search(&cx.outer, &mc->mc_dbx->md_name, MDBX_PS_MODIFY); if (unlikely(rc != MDBX_SUCCESS)) return rc; @@ -21285,11 +22105,13 @@ static __hot int cursor_touch(MDBX_cursor *const mc, const MDBX_val *key, cASSERT(mc, (mc->mc_flags & C_INITIALIZED) || mc->mc_snum == 0); cASSERT(mc, cursor_is_tracked(mc)); + cASSERT(mc, F_ISSET(dbi_state(mc->mc_txn, FREE_DBI), DBI_LINDO | DBI_VALID)); + cASSERT(mc, F_ISSET(dbi_state(mc->mc_txn, MAIN_DBI), DBI_LINDO | DBI_VALID)); if ((mc->mc_flags & C_SUB) == 0) { MDBX_txn *const txn = mc->mc_txn; txn_lru_turn(txn); - if (unlikely((*mc->mc_dbistate & DBI_DIRTY) == 0)) { + if (unlikely((*mc->mc_dbi_state & DBI_DIRTY) == 0)) { int err = touch_dbi(mc); if (unlikely(err != MDBX_SUCCESS)) return err; @@ -21326,7 +22148,8 @@ static __hot int cursor_touch(MDBX_cursor *const mc, const MDBX_val *key, } int rc = MDBX_SUCCESS; - if (likely(mc->mc_snum)) { + if (likely(mc->mc_snum) && + !IS_MODIFIABLE(mc->mc_txn, mc->mc_pg[mc->mc_snum - 1])) { mc->mc_top = 0; do { rc = 
page_touch(mc); @@ -22456,7 +23279,7 @@ static pgr_t page_new(MDBX_cursor *mc, const unsigned flags) { DEBUG("db %u allocated new page %" PRIaPGNO, mc->mc_dbi, ret.page->mp_pgno); ret.page->mp_flags = (uint16_t)flags; - cASSERT(mc, *mc->mc_dbistate & DBI_DIRTY); + cASSERT(mc, *mc->mc_dbi_state & DBI_DIRTY); cASSERT(mc, mc->mc_txn->mt_flags & MDBX_TXN_DIRTY); #if MDBX_ENABLE_PGOP_STAT mc->mc_txn->mt_env->me_lck->mti_pgop_stat.newly.weak += 1; @@ -22487,7 +23310,7 @@ static pgr_t page_new_large(MDBX_cursor *mc, const size_t npages) { DEBUG("db %u allocated new large-page %" PRIaPGNO ", num %zu", mc->mc_dbi, ret.page->mp_pgno, npages); ret.page->mp_flags = P_OVERFLOW; - cASSERT(mc, *mc->mc_dbistate & DBI_DIRTY); + cASSERT(mc, *mc->mc_dbi_state & DBI_DIRTY); cASSERT(mc, mc->mc_txn->mt_flags & MDBX_TXN_DIRTY); #if MDBX_ENABLE_PGOP_STAT mc->mc_txn->mt_env->me_lck->mti_pgop_stat.newly.weak += npages; @@ -22809,7 +23632,7 @@ static int cursor_xinit0(MDBX_cursor *mc) { mx->mx_cursor.mc_db = &mx->mx_db; mx->mx_cursor.mc_dbx = &mx->mx_dbx; mx->mx_cursor.mc_dbi = mc->mc_dbi; - mx->mx_cursor.mc_dbistate = mc->mc_dbistate; + mx->mx_cursor.mc_dbi_state = mc->mc_dbi_state; mx->mx_cursor.mc_snum = 0; mx->mx_cursor.mc_top = 0; mx->mx_cursor.mc_flags = C_SUB; @@ -22831,7 +23654,7 @@ static int cursor_xinit0(MDBX_cursor *mc) { * [in] mc The main cursor whose sorted-dups cursor is to be initialized. * [in] node The data containing the MDBX_db record for the sorted-dup database. 
*/ -static int cursor_xinit1(MDBX_cursor *mc, MDBX_node *node, +static int cursor_xinit1(MDBX_cursor *mc, const MDBX_node *node, const MDBX_page *mp) { MDBX_xcursor *mx = mc->mc_xcursor; if (!MDBX_DISABLE_VALIDATION && unlikely(mx == nullptr)) { @@ -22848,7 +23671,8 @@ static int cursor_xinit1(MDBX_cursor *mc, MDBX_node *node, case F_DUPDATA | F_SUBDATA: if (!MDBX_DISABLE_VALIDATION && unlikely(node_ds(node) != sizeof(MDBX_db))) { - ERROR("invalid nested-db record size %zu", node_ds(node)); + ERROR("invalid nested-db record size (%zu, expect %zu)", node_ds(node), + sizeof(MDBX_db)); return MDBX_CORRUPTED; } memcpy(&mx->mx_db, node_data(node), sizeof(MDBX_db)); @@ -22955,7 +23779,8 @@ static int cursor_xinit2(MDBX_cursor *mc, MDBX_xcursor *src_mx, static __inline int couple_init(MDBX_cursor_couple *couple, const size_t dbi, const MDBX_txn *const txn, MDBX_db *const db, - MDBX_dbx *const dbx, uint8_t *const dbstate) { + MDBX_dbx *const dbx, uint8_t *const dbi_state) { + tASSERT(txn, F_ISSET(*dbi_state, DBI_VALID | DBI_LINDO)); couple->outer.mc_signature = MDBX_MC_LIVE; couple->outer.mc_next = NULL; couple->outer.mc_backup = NULL; @@ -22963,7 +23788,7 @@ static __inline int couple_init(MDBX_cursor_couple *couple, const size_t dbi, couple->outer.mc_txn = (MDBX_txn *)txn; couple->outer.mc_db = db; couple->outer.mc_dbx = dbx; - couple->outer.mc_dbistate = dbstate; + couple->outer.mc_dbi_state = dbi_state; couple->outer.mc_snum = 0; couple->outer.mc_top = 0; couple->outer.mc_pg[0] = 0; @@ -22978,11 +23803,11 @@ static __inline int couple_init(MDBX_cursor_couple *couple, const size_t dbi, couple->outer.mc_xcursor = NULL; int rc = MDBX_SUCCESS; - if (unlikely(*couple->outer.mc_dbistate & DBI_STALE)) { + if (unlikely(*couple->outer.mc_dbi_state & DBI_STALE)) { rc = page_search(&couple->outer, NULL, MDBX_PS_ROOTONLY); rc = (rc != MDBX_NOTFOUND) ? 
rc : MDBX_SUCCESS; } else if (unlikely(dbx->md_klen_max == 0)) { - rc = setup_dbx(dbx, db, txn->mt_env->me_psize); + rc = setup_sdb(dbx, db, txn->mt_env->me_psize); } if (couple->outer.mc_db->md_flags & MDBX_DUPSORT) { @@ -23000,9 +23825,12 @@ static __inline int couple_init(MDBX_cursor_couple *couple, const size_t dbi, /* Initialize a cursor for a given transaction and database. */ static int cursor_init(MDBX_cursor *mc, const MDBX_txn *txn, size_t dbi) { STATIC_ASSERT(offsetof(MDBX_cursor_couple, outer) == 0); - return couple_init(container_of(mc, MDBX_cursor_couple, outer), dbi, txn, - &txn->mt_dbs[dbi], &txn->mt_dbxs[dbi], - &txn->mt_dbistate[dbi]); + int rc = dbi_check(txn, dbi); + if (likely(rc == MDBX_SUCCESS)) + rc = couple_init(container_of(mc, MDBX_cursor_couple, outer), dbi, txn, + &txn->mt_dbs[dbi], &txn->mt_env->me_dbxs[dbi], + &txn->mt_dbi_state[dbi]); + return rc; } MDBX_cursor *mdbx_cursor_create(void *context) { @@ -23041,6 +23869,38 @@ void *mdbx_cursor_get_userctx(const MDBX_cursor *mc) { return couple->mc_userctx; } +int mdbx_cursor_unbind(MDBX_cursor *mc) { + if (unlikely(!mc)) + return MDBX_EINVAL; + + if (unlikely(mc->mc_signature != MDBX_MC_LIVE)) + return (mc->mc_signature == MDBX_MC_READY4CLOSE) ? MDBX_SUCCESS + : MDBX_EBADSIGN; + + if (unlikely(mc->mc_backup)) /* Cursor from parent transaction */ + return MDBX_EINVAL; + + eASSERT(nullptr, mc->mc_txn && mc->mc_txn->mt_signature == MDBX_MT_SIGNATURE); + cASSERT(mc, mc->mc_signature == MDBX_MC_LIVE); + cASSERT(mc, !mc->mc_backup); + if (unlikely(!mc->mc_txn || mc->mc_txn->mt_signature != MDBX_MT_SIGNATURE)) { + ERROR("Wrong cursor's transaction %p 0x%x", + __Wpedantic_format_voidptr(mc->mc_txn), + mc->mc_txn ? 
mc->mc_txn->mt_signature : 0); + return MDBX_PROBLEM; + } + if (mc->mc_flags & C_UNTRACK) { + MDBX_cursor **prev = &mc->mc_txn->mt_cursors[mc->mc_dbi]; + while (*prev && *prev != mc) + prev = &(*prev)->mc_next; + cASSERT(mc, *prev == mc); + *prev = mc->mc_next; + } + mc->mc_signature = MDBX_MC_READY4CLOSE; + mc->mc_flags = 0; + return MDBX_SUCCESS; +} + int mdbx_cursor_bind(const MDBX_txn *txn, MDBX_cursor *mc, MDBX_dbi dbi) { if (unlikely(!mc)) return MDBX_EINVAL; @@ -23053,8 +23913,9 @@ int mdbx_cursor_bind(const MDBX_txn *txn, MDBX_cursor *mc, MDBX_dbi dbi) { if (unlikely(rc != MDBX_SUCCESS)) return rc; - if (unlikely(!check_dbi(txn, dbi, DBI_VALID))) - return MDBX_BAD_DBI; + rc = dbi_check(txn, dbi); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; if (unlikely(dbi == FREE_DBI && !(txn->mt_flags & MDBX_TXN_RDONLY))) return MDBX_EACCESS; @@ -23066,10 +23927,10 @@ int mdbx_cursor_bind(const MDBX_txn *txn, MDBX_cursor *mc, MDBX_dbi dbi) { mc->mc_txn != txn)) return MDBX_EINVAL; - assert(mc->mc_db == &txn->mt_dbs[dbi]); - assert(mc->mc_dbx == &txn->mt_dbxs[dbi]); - assert(mc->mc_dbi == dbi); - assert(mc->mc_dbistate == &txn->mt_dbistate[dbi]); + cASSERT(mc, mc->mc_db == &txn->mt_dbs[dbi]); + cASSERT(mc, mc->mc_dbx == &txn->mt_env->me_dbxs[dbi]); + cASSERT(mc, mc->mc_dbi == dbi); + cASSERT(mc, mc->mc_dbi_state == &txn->mt_dbi_state[dbi]); return likely(mc->mc_dbi == dbi && /* paranoia */ mc->mc_signature == MDBX_MC_LIVE && mc->mc_txn == txn) @@ -23078,27 +23939,9 @@ int mdbx_cursor_bind(const MDBX_txn *txn, MDBX_cursor *mc, MDBX_dbi dbi) { } if (mc->mc_signature == MDBX_MC_LIVE) { - if (unlikely(!mc->mc_txn || - mc->mc_txn->mt_signature != MDBX_MT_SIGNATURE)) { - ERROR("Wrong cursor's transaction %p 0x%x", - __Wpedantic_format_voidptr(mc->mc_txn), - mc->mc_txn ? 
mc->mc_txn->mt_signature : 0); - return MDBX_PROBLEM; - } - if (mc->mc_flags & C_UNTRACK) { - MDBX_cursor **prev = &mc->mc_txn->mt_cursors[mc->mc_dbi]; - while (*prev && *prev != mc) - prev = &(*prev)->mc_next; - cASSERT(mc, *prev == mc); - *prev = mc->mc_next; - } - mc->mc_signature = MDBX_MC_READY4CLOSE; - mc->mc_flags = 0; - mc->mc_dbi = UINT_MAX; - mc->mc_next = NULL; - mc->mc_db = NULL; - mc->mc_dbx = NULL; - mc->mc_dbistate = NULL; + rc = mdbx_cursor_unbind(mc); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; } cASSERT(mc, !(mc->mc_flags & C_UNTRACK)); @@ -23136,6 +23979,83 @@ int mdbx_cursor_renew(const MDBX_txn *txn, MDBX_cursor *mc) { return likely(mc) ? mdbx_cursor_bind(txn, mc, mc->mc_dbi) : MDBX_EINVAL; } +int mdbx_cursor_compare(const MDBX_cursor *l, const MDBX_cursor *r, + bool ignore_multival) { + const int incomparable = INT16_MAX + 1; + if (unlikely(!l)) + return r ? -incomparable * 9 : 0; + else if (unlikely(!r)) + return incomparable * 9; + + if (unlikely(l->mc_signature != MDBX_MC_LIVE)) + return (r->mc_signature == MDBX_MC_LIVE) ? -incomparable * 8 : 0; + if (unlikely(r->mc_signature != MDBX_MC_LIVE)) + return (l->mc_signature == MDBX_MC_LIVE) ? incomparable * 8 : 0; + + if (unlikely(l->mc_dbx != r->mc_dbx)) { + if (l->mc_txn->mt_env != r->mc_txn->mt_env) + return (l->mc_txn->mt_env > r->mc_txn->mt_env) ? incomparable * 7 + : -incomparable * 7; + if (l->mc_txn->mt_txnid != r->mc_txn->mt_txnid) + return (l->mc_txn->mt_txnid > r->mc_txn->mt_txnid) ? incomparable * 6 + : -incomparable * 6; + return (l->mc_dbx > r->mc_dbx) ? incomparable * 5 : -incomparable * 5; + } + assert(l->mc_dbi == r->mc_dbi); + + int diff = (l->mc_flags & C_INITIALIZED) - (r->mc_flags & C_INITIALIZED); /* l's flag vs r's flag: >0 when only l is initialized */ + if (unlikely(diff)) + return (diff > 0) ? incomparable * 4 : -incomparable * 4; + if (unlikely((l->mc_flags & C_INITIALIZED) == 0)) + return 0; + + size_t detent = (l->mc_snum <= r->mc_snum) ? 
l->mc_snum : r->mc_snum; + for (size_t i = 0; i < detent; ++i) { + diff = l->mc_ki[i] - r->mc_ki[i]; + if (diff) + return diff; + } + if (unlikely(l->mc_snum != r->mc_snum)) + return (l->mc_snum > r->mc_snum) ? incomparable * 3 : -incomparable * 3; + + assert((l->mc_xcursor != nullptr) == (r->mc_xcursor != nullptr)); + if (unlikely((l->mc_xcursor != nullptr) != (r->mc_xcursor != nullptr))) + return l->mc_xcursor ? incomparable * 2 : -incomparable * 2; + if (ignore_multival || !l->mc_xcursor) + return 0; + +#if MDBX_DEBUG + if (l->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED) { + const MDBX_page *mp = l->mc_pg[l->mc_top]; + const MDBX_node *node = page_node(mp, l->mc_ki[l->mc_top]); + assert(node_flags(node) & F_DUPDATA); + } + if (r->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED) { /* guard on r's own nested cursor before inspecting r's node */ + const MDBX_page *mp = r->mc_pg[r->mc_top]; + const MDBX_node *node = page_node(mp, r->mc_ki[r->mc_top]); + assert(node_flags(node) & F_DUPDATA); + } +#endif /* MDBX_DEBUG */ + + l = &l->mc_xcursor->mx_cursor; + r = &r->mc_xcursor->mx_cursor; + diff = (l->mc_flags & C_INITIALIZED) - (r->mc_flags & C_INITIALIZED); /* same l-vs-r check for the nested cursors */ + if (unlikely(diff)) + return (diff > 0) ? incomparable * 2 : -incomparable * 2; + if (unlikely((l->mc_flags & C_INITIALIZED) == 0)) + return 0; + + detent = (l->mc_snum <= r->mc_snum) ? l->mc_snum : r->mc_snum; + for (size_t i = 0; i < detent; ++i) { + diff = l->mc_ki[i] - r->mc_ki[i]; + if (diff) + return diff; + } + if (unlikely(l->mc_snum != r->mc_snum)) + return (l->mc_snum > r->mc_snum) ? 
incomparable : -incomparable; + return 0; +} + int mdbx_cursor_copy(const MDBX_cursor *src, MDBX_cursor *dest) { if (unlikely(!src)) return MDBX_EINVAL; @@ -23150,7 +24070,7 @@ int mdbx_cursor_copy(const MDBX_cursor *src, MDBX_cursor *dest) { assert(dest->mc_db == src->mc_db); assert(dest->mc_dbi == src->mc_dbi); assert(dest->mc_dbx == src->mc_dbx); - assert(dest->mc_dbistate == src->mc_dbistate); + assert(dest->mc_dbi_state == src->mc_dbi_state); again: assert(dest->mc_txn == src->mc_txn); dest->mc_flags ^= (dest->mc_flags ^ src->mc_flags) & ~C_UNTRACK; @@ -23200,6 +24120,32 @@ void mdbx_cursor_close(MDBX_cursor *mc) { } } +int mdbx_txn_release_all_cursors(const MDBX_txn *txn, bool unbind) { + int rc = check_txn(txn, MDBX_TXN_FINISHED | MDBX_TXN_HAS_CHILD); + if (likely(rc == MDBX_SUCCESS)) { + TXN_FOREACH_DBI_FROM(txn, i, MAIN_DBI) { + while (txn->mt_cursors[i]) { + MDBX_cursor *mc = txn->mt_cursors[i]; + ENSURE(NULL, mc->mc_signature == MDBX_MC_LIVE && + (mc->mc_flags & C_UNTRACK) && !mc->mc_backup); + rc = likely(rc < INT_MAX) ? 
rc + 1 : rc; + txn->mt_cursors[i] = mc->mc_next; + if (unbind) { + mc->mc_signature = MDBX_MC_READY4CLOSE; + mc->mc_flags = 0; + } else { + mc->mc_signature = 0; + mc->mc_next = mc; + osal_free(mc); + } + } + } + } else { + eASSERT(nullptr, rc < 0); + } + return rc; +} + MDBX_txn *mdbx_cursor_txn(const MDBX_cursor *mc) { if (unlikely(!mc || mc->mc_signature != MDBX_MC_LIVE)) return NULL; @@ -23654,6 +24600,8 @@ static int page_merge(MDBX_cursor *csrc, MDBX_cursor *cdst) { IS_LEAF(cdst->mc_pg[cdst->mc_db->md_depth - 1])); cASSERT(csrc, csrc->mc_snum < csrc->mc_db->md_depth || IS_LEAF(csrc->mc_pg[csrc->mc_db->md_depth - 1])); + cASSERT(cdst, csrc->mc_txn->mt_env->me_options.prefer_waf_insteadof_balance || + page_room(pdst) >= page_used(cdst->mc_txn->mt_env, psrc)); const int pagetype = PAGETYPE_WHOLE(psrc); /* Move all nodes from src to dst */ @@ -23890,7 +24838,7 @@ static void cursor_restore(const MDBX_cursor *csrc, MDBX_cursor *cdst) { cASSERT(cdst, cdst->mc_txn == csrc->mc_txn); cASSERT(cdst, cdst->mc_db == csrc->mc_db); cASSERT(cdst, cdst->mc_dbx == csrc->mc_dbx); - cASSERT(cdst, cdst->mc_dbistate == csrc->mc_dbistate); + cASSERT(cdst, cdst->mc_dbi_state == csrc->mc_dbi_state); cdst->mc_snum = csrc->mc_snum; cdst->mc_top = csrc->mc_top; cdst->mc_flags = csrc->mc_flags; @@ -23915,7 +24863,7 @@ static void cursor_copy(const MDBX_cursor *csrc, MDBX_cursor *cdst) { cdst->mc_txn = csrc->mc_txn; cdst->mc_db = csrc->mc_db; cdst->mc_dbx = csrc->mc_dbx; - cdst->mc_dbistate = csrc->mc_dbistate; + cdst->mc_dbi_state = csrc->mc_dbi_state; cursor_restore(csrc, cdst); } @@ -23973,7 +24921,7 @@ static int rebalance(MDBX_cursor *mc) { if (nkeys == 0) { cASSERT(mc, IS_LEAF(mp)); DEBUG("%s", "tree is completely empty"); - cASSERT(mc, (*mc->mc_dbistate & DBI_DIRTY) != 0); + cASSERT(mc, (*mc->mc_dbi_state & DBI_DIRTY) != 0); mc->mc_db->md_root = P_INVALID; mc->mc_db->md_depth = 0; cASSERT(mc, mc->mc_db->md_branch_pages == 0 && @@ -24154,14 +25102,11 @@ static int 
rebalance(MDBX_cursor *mc) { return MDBX_SUCCESS; } - /* Заглушено в ветке v0.12.x, будет работать в v0.13.1 и далее. - * - * if (mc->mc_txn->mt_env->me_options.prefer_waf_insteadof_balance && - * likely(room_threshold > 0)) { - * room_threshold = 0; - * goto retry; - * } - */ + if (mc->mc_txn->mt_env->me_options.prefer_waf_insteadof_balance && + likely(room_threshold > 0)) { + room_threshold = 0; + goto retry; + } if (likely(!involve) && (likely(mc->mc_dbi != FREE_DBI) || mc->mc_txn->tw.loose_pages || MDBX_PNL_GETSIZE(mc->mc_txn->tw.relist) || (mc->mc_flags & C_GCU) || @@ -24453,7 +25398,8 @@ __cold static int page_check(const MDBX_cursor *const mc, break; case F_SUBDATA | F_DUPDATA /* dupsorted sub-tree */: if (unlikely(dsize != sizeof(MDBX_db))) { - rc = bad_page(mp, "invalid nested-db record size (%zu)\n", dsize); + rc = bad_page(mp, "invalid nested-db record size (%zu, expect %zu)\n", + dsize, sizeof(MDBX_db)); continue; } break; @@ -24658,7 +25604,7 @@ int mdbx_del(MDBX_txn *txn, MDBX_dbi dbi, const MDBX_val *key, if (unlikely(!key)) return MDBX_EINVAL; - if (unlikely(!check_dbi(txn, dbi, DBI_USRVALID))) + if (unlikely(dbi <= FREE_DBI)) return MDBX_BAD_DBI; if (unlikely(txn->mt_flags & (MDBX_TXN_RDONLY | MDBX_TXN_BLOCKED))) @@ -25347,7 +26293,7 @@ int mdbx_put(MDBX_txn *txn, MDBX_dbi dbi, const MDBX_val *key, MDBX_val *data, if (unlikely(!key || !data)) return MDBX_EINVAL; - if (unlikely(!check_dbi(txn, dbi, DBI_USRVALID))) + if (unlikely(dbi <= FREE_DBI)) return MDBX_BAD_DBI; if (unlikely(flags & ~(MDBX_NOOVERWRITE | MDBX_NODUPDATA | MDBX_ALLDUPS | @@ -25602,6 +26548,9 @@ __cold static int compacting_walk_tree(mdbx_compacting_ctx *ctx, } else if (node_flags(node) & F_SUBDATA) { if (!MDBX_DISABLE_VALIDATION && unlikely(node_ds(node) != sizeof(MDBX_db))) { + ERROR("%s/%d: %s %u", "MDBX_CORRUPTED", MDBX_CORRUPTED, + "invalid dupsort sub-tree node size", + (unsigned)node_ds(node)); rc = MDBX_CORRUPTED; goto done; } @@ -25701,8 +26650,8 @@ __cold static int 
compacting_walk_sdb(mdbx_compacting_ctx *ctx, MDBX_db *sdb) { memset(&couple, 0, sizeof(couple)); couple.inner.mx_cursor.mc_signature = ~MDBX_MC_LIVE; MDBX_dbx dbx = {.md_klen_min = INT_MAX}; - uint8_t dbistate = DBI_VALID | DBI_AUDITED; - int rc = couple_init(&couple, ~0u, ctx->mc_txn, sdb, &dbx, &dbistate); + uint8_t dbi_state = DBI_LINDO | DBI_VALID; + int rc = couple_init(&couple, ~0u, ctx->mc_txn, sdb, &dbx, &dbi_state); if (unlikely(rc != MDBX_SUCCESS)) return rc; @@ -25799,9 +26748,16 @@ __cold static int env_compact(MDBX_env *env, MDBX_txn *read_txn, MDBX_SUCCESS) { const MDBX_PNL pnl = data.iov_base; if (unlikely(data.iov_len % sizeof(pgno_t) || - data.iov_len < MDBX_PNL_SIZEOF(pnl) || - !(pnl_check(pnl, read_txn->mt_next_pgno)))) + data.iov_len < MDBX_PNL_SIZEOF(pnl))) { + ERROR("%s/%d: %s %zu", "MDBX_CORRUPTED", MDBX_CORRUPTED, + "invalid GC-record length", data.iov_len); + return MDBX_CORRUPTED; + } + if (unlikely(!pnl_check(pnl, read_txn->mt_next_pgno))) { + ERROR("%s/%d: %s", "MDBX_CORRUPTED", MDBX_CORRUPTED, + "invalid GC-record content"); return MDBX_CORRUPTED; + } gc += MDBX_PNL_GETSIZE(pnl); } if (unlikely(rc != MDBX_NOTFOUND)) @@ -25909,18 +26865,18 @@ __cold static int env_copy_asis(MDBX_env *env, MDBX_txn *read_txn, const bool dest_is_pipe, const MDBX_copy_flags_t flags) { /* We must start the actual read txn after blocking writers */ - int rc = txn_end(read_txn, MDBX_END_RESET_TMP); + int rc = txn_end(read_txn, TXN_END_RESET_TMP); if (unlikely(rc != MDBX_SUCCESS)) return rc; /* Temporarily block writers until we snapshot the meta pages */ - rc = mdbx_txn_lock(env, false); + rc = osal_txn_lock(env, false); if (unlikely(rc != MDBX_SUCCESS)) return rc; rc = txn_renew(read_txn, MDBX_TXN_RDONLY); if (unlikely(rc != MDBX_SUCCESS)) { - mdbx_txn_unlock(env); + osal_txn_unlock(env); return rc; } @@ -25932,7 +26888,7 @@ __cold static int env_copy_asis(MDBX_env *env, MDBX_txn *read_txn, memcpy(buffer, env->me_map, meta_bytes); MDBX_meta *const headcopy = 
/* LY: get pointer to the snapshot copy */ ptr_disp(buffer, ptr_dist(meta_recent(env, &troika).ptr_c, env->me_map)); - mdbx_txn_unlock(env); + osal_txn_unlock(env); if (flags & MDBX_CP_FORCE_DYNAMIC_SIZE) meta_make_sizeable(headcopy); @@ -26184,15 +27140,11 @@ __cold int mdbx_env_set_flags(MDBX_env *env, MDBX_env_flags_t flags, if (unlikely(env->me_flags & MDBX_RDONLY)) return MDBX_EACCESS; - if ((env->me_flags & MDBX_ENV_ACTIVE) && - unlikely(env->me_txn0->mt_owner == osal_thread_self())) - return MDBX_BUSY; - - const bool lock_needed = (env->me_flags & MDBX_ENV_ACTIVE) && - env->me_txn0->mt_owner != osal_thread_self(); + const bool lock_needed = + (env->me_flags & MDBX_ENV_ACTIVE) && !env_txn0_owned(env); bool should_unlock = false; if (lock_needed) { - rc = mdbx_txn_lock(env, false); + rc = osal_txn_lock(env, false); if (unlikely(rc)) return rc; should_unlock = true; @@ -26204,7 +27156,7 @@ __cold int mdbx_env_set_flags(MDBX_env *env, MDBX_env_flags_t flags, env->me_flags &= ~flags; if (should_unlock) - mdbx_txn_unlock(env); + osal_txn_unlock(env); return MDBX_SUCCESS; } @@ -26256,7 +27208,7 @@ __cold int mdbx_env_get_pathW(const MDBX_env *env, const wchar_t **arg) { if (unlikely(!arg)) return MDBX_EINVAL; - *arg = env->me_pathname; + *arg = env->me_pathname.specified; return MDBX_SUCCESS; } #endif /* Windows */ @@ -26273,12 +27225,14 @@ __cold int mdbx_env_get_path(const MDBX_env *env, const char **arg) { if (!env->me_pathname_char) { *arg = nullptr; DWORD flags = /* WC_ERR_INVALID_CHARS */ 0x80; - size_t mb_len = WideCharToMultiByte(CP_THREAD_ACP, flags, env->me_pathname, - -1, nullptr, 0, nullptr, nullptr); + size_t mb_len = + WideCharToMultiByte(CP_THREAD_ACP, flags, env->me_pathname.specified, + -1, nullptr, 0, nullptr, nullptr); rc = mb_len ? 
MDBX_SUCCESS : (int)GetLastError(); if (rc == ERROR_INVALID_FLAGS) { - mb_len = WideCharToMultiByte(CP_THREAD_ACP, flags = 0, env->me_pathname, - -1, nullptr, 0, nullptr, nullptr); + mb_len = WideCharToMultiByte(CP_THREAD_ACP, flags = 0, + env->me_pathname.specified, -1, nullptr, 0, + nullptr, nullptr); rc = mb_len ? MDBX_SUCCESS : (int)GetLastError(); } if (unlikely(rc != MDBX_SUCCESS)) @@ -26287,9 +27241,9 @@ __cold int mdbx_env_get_path(const MDBX_env *env, const char **arg) { char *const mb_pathname = osal_malloc(mb_len); if (!mb_pathname) return MDBX_ENOMEM; - if (mb_len != (size_t)WideCharToMultiByte(CP_THREAD_ACP, flags, - env->me_pathname, -1, mb_pathname, - (int)mb_len, nullptr, nullptr)) { + if (mb_len != (size_t)WideCharToMultiByte( + CP_THREAD_ACP, flags, env->me_pathname.specified, -1, + mb_pathname, (int)mb_len, nullptr, nullptr)) { rc = (int)GetLastError(); osal_free(mb_pathname); return rc; @@ -26301,7 +27255,7 @@ __cold int mdbx_env_get_path(const MDBX_env *env, const char **arg) { } *arg = env->me_pathname_char; #else - *arg = env->me_pathname; + *arg = env->me_pathname.specified; #endif /* Windows */ return MDBX_SUCCESS; } @@ -26347,26 +27301,22 @@ __cold static int stat_acc(const MDBX_txn *txn, MDBX_stat *st, size_t bytes) { if (unlikely(err != MDBX_SUCCESS)) return err; - st->ms_psize = txn->mt_env->me_psize; -#if 1 - /* assuming GC is internal and not subject for accounting */ - stat_get(&txn->mt_dbs[MAIN_DBI], st, bytes); -#else - stat_get(&txn->mt_dbs[FREE_DBI], st, bytes); - stat_add(&txn->mt_dbs[MAIN_DBI], st, bytes); -#endif + MDBX_cursor_couple cx; + err = cursor_init(&cx.outer, (MDBX_txn *)txn, MAIN_DBI); + if (unlikely(err != MDBX_SUCCESS)) + return err; - /* account opened named subDBs */ - for (MDBX_dbi dbi = CORE_DBS; dbi < txn->mt_numdbs; dbi++) - if ((txn->mt_dbistate[dbi] & (DBI_VALID | DBI_STALE)) == DBI_VALID) + const MDBX_env *const env = txn->mt_env; + st->ms_psize = env->me_psize; + TXN_FOREACH_DBI_FROM( + txn, dbi, + /* 
assuming GC is internal and not subject for accounting */ MAIN_DBI) { + if ((txn->mt_dbi_state[dbi] & (DBI_VALID | DBI_STALE)) == DBI_VALID) stat_add(txn->mt_dbs + dbi, st, bytes); + } - if (!(txn->mt_dbs[MAIN_DBI].md_flags & (MDBX_DUPSORT | MDBX_INTEGERKEY)) && + if (!(txn->mt_dbs[MAIN_DBI].md_flags & MDBX_DUPSORT) && txn->mt_dbs[MAIN_DBI].md_entries /* TODO: use `md_subs` field */) { - MDBX_cursor_couple cx; - err = cursor_init(&cx.outer, (MDBX_txn *)txn, MAIN_DBI); - if (unlikely(err != MDBX_SUCCESS)) - return err; /* scan and account not opened named subDBs */ err = page_search(&cx.outer, NULL, MDBX_PS_FIRST); @@ -26376,18 +27326,22 @@ __cold static int stat_acc(const MDBX_txn *txn, MDBX_stat *st, size_t bytes) { const MDBX_node *node = page_node(mp, i); if (node_flags(node) != F_SUBDATA) continue; - if (unlikely(node_ds(node) != sizeof(MDBX_db))) + if (unlikely(node_ds(node) != sizeof(MDBX_db))) { + ERROR("%s/%d: %s %zu", "MDBX_CORRUPTED", MDBX_CORRUPTED, + "invalid subDb node size", node_ds(node)); return MDBX_CORRUPTED; + } /* skip opened and already accounted */ - for (MDBX_dbi dbi = CORE_DBS; dbi < txn->mt_numdbs; dbi++) - if ((txn->mt_dbistate[dbi] & (DBI_VALID | DBI_STALE)) == DBI_VALID && - node_ks(node) == txn->mt_dbxs[dbi].md_name.iov_len && - memcmp(node_key(node), txn->mt_dbxs[dbi].md_name.iov_base, - node_ks(node)) == 0) { + const MDBX_val name = {node_key(node), node_ks(node)}; + TXN_FOREACH_DBI_USER(txn, dbi) { + if ((txn->mt_dbi_state[dbi] & (DBI_VALID | DBI_STALE)) == DBI_VALID && + env->me_dbxs[MAIN_DBI].md_cmp(&name, + &env->me_dbxs[dbi].md_name) == 0) { node = NULL; break; } + } if (node) { MDBX_db db; @@ -26422,7 +27376,7 @@ __cold int mdbx_env_stat_ex(const MDBX_env *env, const MDBX_txn *txn, if (unlikely(err != MDBX_SUCCESS)) return err; - if (env->me_txn0 && env->me_txn0->mt_owner == osal_thread_self()) + if (env->me_txn && env_txn0_owned(env)) /* inside write-txn */ return stat_acc(env->me_txn, dest, bytes); @@ -26447,9 +27401,6 @@ 
__cold int mdbx_dbi_dupsort_depthmask(const MDBX_txn *txn, MDBX_dbi dbi, if (unlikely(!mask)) return MDBX_EINVAL; - if (unlikely(!check_dbi(txn, dbi, DBI_VALID))) - return MDBX_BAD_DBI; - MDBX_cursor_couple cx; rc = cursor_init(&cx.outer, txn, dbi); if (unlikely(rc != MDBX_SUCCESS)) @@ -26480,7 +27431,8 @@ __cold int mdbx_dbi_dupsort_depthmask(const MDBX_txn *txn, MDBX_dbi dbi, *mask |= 1 << UNALIGNED_PEEK_16(db, MDBX_db, md_depth); break; default: - ERROR("wrong node-flags %u", flags); + ERROR("%s/%d: %s %u", "MDBX_CORRUPTED", MDBX_CORRUPTED, + "invalid node-size", flags); return MDBX_CORRUPTED; } rc = cursor_next(&cx.outer, &key, &data, MDBX_NEXT_NODUP); @@ -26489,11 +27441,13 @@ __cold int mdbx_dbi_dupsort_depthmask(const MDBX_txn *txn, MDBX_dbi dbi, return (rc == MDBX_NOTFOUND) ? MDBX_SUCCESS : rc; } -__cold static int fetch_envinfo_ex(const MDBX_env *env, const MDBX_txn *txn, - MDBX_envinfo *arg, const size_t bytes) { - +__cold static int env_info_snap(const MDBX_env *env, const MDBX_txn *txn, + MDBX_envinfo *out, const size_t bytes, + meta_troika_t *const troika) { const size_t size_before_bootid = offsetof(MDBX_envinfo, mi_bootid); const size_t size_before_pgop_stat = offsetof(MDBX_envinfo, mi_pgop_stat); + if (unlikely(env->me_flags & MDBX_FATAL_ERROR)) + return MDBX_PANIC; /* is the environment open? 
* (https://libmdbx.dqdkfa.ru/dead-github/issues/171) */ @@ -26501,18 +27455,18 @@ __cold static int fetch_envinfo_ex(const MDBX_env *env, const MDBX_txn *txn, /* environment not yet opened */ #if 1 /* default behavior: returns the available info but zeroed the rest */ - memset(arg, 0, bytes); - arg->mi_geo.lower = env->me_dbgeo.lower; - arg->mi_geo.upper = env->me_dbgeo.upper; - arg->mi_geo.shrink = env->me_dbgeo.shrink; - arg->mi_geo.grow = env->me_dbgeo.grow; - arg->mi_geo.current = env->me_dbgeo.now; - arg->mi_maxreaders = env->me_maxreaders; - arg->mi_dxb_pagesize = env->me_psize; - arg->mi_sys_pagesize = env->me_os_psize; + memset(out, 0, bytes); + out->mi_geo.lower = env->me_dbgeo.lower; + out->mi_geo.upper = env->me_dbgeo.upper; + out->mi_geo.shrink = env->me_dbgeo.shrink; + out->mi_geo.grow = env->me_dbgeo.grow; + out->mi_geo.current = env->me_dbgeo.now; + out->mi_maxreaders = env->me_maxreaders; + out->mi_dxb_pagesize = env->me_psize; + out->mi_sys_pagesize = env->me_os_psize; if (likely(bytes > size_before_bootid)) { - arg->mi_bootid.current.x = bootid.x; - arg->mi_bootid.current.y = bootid.y; + out->mi_bootid.current.x = bootid.x; + out->mi_bootid.current.y = bootid.y; } return MDBX_SUCCESS; #else @@ -26521,142 +27475,243 @@ __cold static int fetch_envinfo_ex(const MDBX_env *env, const MDBX_txn *txn, #endif } + *troika = (txn && !(txn->mt_flags & MDBX_TXN_RDONLY)) ? 
txn->tw.troika + : meta_tap(env); + const meta_ptr_t head = meta_recent(env, troika); const MDBX_meta *const meta0 = METAPAGE(env, 0); const MDBX_meta *const meta1 = METAPAGE(env, 1); const MDBX_meta *const meta2 = METAPAGE(env, 2); - if (unlikely(env->me_flags & MDBX_FATAL_ERROR)) - return MDBX_PANIC; - - meta_troika_t holder; - meta_troika_t const *troika; - if (txn && !(txn->mt_flags & MDBX_TXN_RDONLY)) - troika = &txn->tw.troika; - else { - holder = meta_tap(env); - troika = &holder; - } - - const meta_ptr_t head = meta_recent(env, troika); - arg->mi_recent_txnid = head.txnid; - arg->mi_meta0_txnid = troika->txnid[0]; - arg->mi_meta0_sign = unaligned_peek_u64(4, meta0->mm_sign); - arg->mi_meta1_txnid = troika->txnid[1]; - arg->mi_meta1_sign = unaligned_peek_u64(4, meta1->mm_sign); - arg->mi_meta2_txnid = troika->txnid[2]; - arg->mi_meta2_sign = unaligned_peek_u64(4, meta2->mm_sign); + out->mi_recent_txnid = head.txnid; + out->mi_meta_txnid[0] = troika->txnid[0]; + out->mi_meta_sign[0] = unaligned_peek_u64(4, meta0->mm_sign); + out->mi_meta_txnid[1] = troika->txnid[1]; + out->mi_meta_sign[1] = unaligned_peek_u64(4, meta1->mm_sign); + out->mi_meta_txnid[2] = troika->txnid[2]; + out->mi_meta_sign[2] = unaligned_peek_u64(4, meta2->mm_sign); if (likely(bytes > size_before_bootid)) { - memcpy(&arg->mi_bootid.meta0, &meta0->mm_bootid, 16); - memcpy(&arg->mi_bootid.meta1, &meta1->mm_bootid, 16); - memcpy(&arg->mi_bootid.meta2, &meta2->mm_bootid, 16); + memcpy(&out->mi_bootid.meta[0], &meta0->mm_bootid, 16); + memcpy(&out->mi_bootid.meta[1], &meta1->mm_bootid, 16); + memcpy(&out->mi_bootid.meta[2], &meta2->mm_bootid, 16); } const volatile MDBX_meta *txn_meta = head.ptr_v; - arg->mi_last_pgno = txn_meta->mm_geo.next - 1; - arg->mi_geo.current = pgno2bytes(env, txn_meta->mm_geo.now); + out->mi_last_pgno = txn_meta->mm_geo.next - 1; + out->mi_geo.current = pgno2bytes(env, txn_meta->mm_geo.now); if (txn) { - arg->mi_last_pgno = txn->mt_next_pgno - 1; - arg->mi_geo.current = 
pgno2bytes(env, txn->mt_end_pgno); + out->mi_last_pgno = txn->mt_next_pgno - 1; + out->mi_geo.current = pgno2bytes(env, txn->mt_end_pgno); const txnid_t wanna_meta_txnid = (txn->mt_flags & MDBX_TXN_RDONLY) ? txn->mt_txnid : txn->mt_txnid - xMDBX_TXNID_STEP; - txn_meta = (arg->mi_meta0_txnid == wanna_meta_txnid) ? meta0 : txn_meta; - txn_meta = (arg->mi_meta1_txnid == wanna_meta_txnid) ? meta1 : txn_meta; - txn_meta = (arg->mi_meta2_txnid == wanna_meta_txnid) ? meta2 : txn_meta; - } - arg->mi_geo.lower = pgno2bytes(env, txn_meta->mm_geo.lower); - arg->mi_geo.upper = pgno2bytes(env, txn_meta->mm_geo.upper); - arg->mi_geo.shrink = pgno2bytes(env, pv2pages(txn_meta->mm_geo.shrink_pv)); - arg->mi_geo.grow = pgno2bytes(env, pv2pages(txn_meta->mm_geo.grow_pv)); - const uint64_t unsynced_pages = - atomic_load64(&env->me_lck->mti_unsynced_pages, mo_Relaxed) + - (atomic_load32(&env->me_lck->mti_meta_sync_txnid, mo_Relaxed) != - (uint32_t)arg->mi_recent_txnid); - - arg->mi_mapsize = env->me_dxb_mmap.limit; + txn_meta = (out->mi_meta_txnid[0] == wanna_meta_txnid) ? meta0 : txn_meta; + txn_meta = (out->mi_meta_txnid[1] == wanna_meta_txnid) ? meta1 : txn_meta; + txn_meta = (out->mi_meta_txnid[2] == wanna_meta_txnid) ? meta2 : txn_meta; + } + out->mi_geo.lower = pgno2bytes(env, txn_meta->mm_geo.lower); + out->mi_geo.upper = pgno2bytes(env, txn_meta->mm_geo.upper); + out->mi_geo.shrink = pgno2bytes(env, pv2pages(txn_meta->mm_geo.shrink_pv)); + out->mi_geo.grow = pgno2bytes(env, pv2pages(txn_meta->mm_geo.grow_pv)); + out->mi_mapsize = env->me_dxb_mmap.limit; const MDBX_lockinfo *const lck = env->me_lck; - arg->mi_maxreaders = env->me_maxreaders; - arg->mi_numreaders = env->me_lck_mmap.lck + out->mi_maxreaders = env->me_maxreaders; + out->mi_numreaders = env->me_lck_mmap.lck ? 
atomic_load32(&lck->mti_numreaders, mo_Relaxed) : INT32_MAX; - arg->mi_dxb_pagesize = env->me_psize; - arg->mi_sys_pagesize = env->me_os_psize; + out->mi_dxb_pagesize = env->me_psize; + out->mi_sys_pagesize = env->me_os_psize; if (likely(bytes > size_before_bootid)) { - arg->mi_unsync_volume = pgno2bytes(env, (size_t)unsynced_pages); + const uint64_t unsynced_pages = + atomic_load64(&lck->mti_unsynced_pages, mo_Relaxed) + + ((uint32_t)out->mi_recent_txnid != + atomic_load32(&lck->mti_meta_sync_txnid, mo_Relaxed)); + out->mi_unsync_volume = pgno2bytes(env, (size_t)unsynced_pages); const uint64_t monotime_now = osal_monotime(); uint64_t ts = atomic_load64(&lck->mti_eoos_timestamp, mo_Relaxed); - arg->mi_since_sync_seconds16dot16 = + out->mi_since_sync_seconds16dot16 = ts ? osal_monotime_to_16dot16_noUnderflow(monotime_now - ts) : 0; ts = atomic_load64(&lck->mti_reader_check_timestamp, mo_Relaxed); - arg->mi_since_reader_check_seconds16dot16 = + out->mi_since_reader_check_seconds16dot16 = ts ? osal_monotime_to_16dot16_noUnderflow(monotime_now - ts) : 0; - arg->mi_autosync_threshold = pgno2bytes( + out->mi_autosync_threshold = pgno2bytes( env, atomic_load32(&lck->mti_autosync_threshold, mo_Relaxed)); - arg->mi_autosync_period_seconds16dot16 = + out->mi_autosync_period_seconds16dot16 = osal_monotime_to_16dot16_noUnderflow( atomic_load64(&lck->mti_autosync_period, mo_Relaxed)); - arg->mi_bootid.current.x = bootid.x; - arg->mi_bootid.current.y = bootid.y; - arg->mi_mode = env->me_lck_mmap.lck ? lck->mti_envmode.weak : env->me_flags; + out->mi_bootid.current.x = bootid.x; + out->mi_bootid.current.y = bootid.y; + out->mi_mode = env->me_lck_mmap.lck ? 
lck->mti_envmode.weak : env->me_flags; } if (likely(bytes > size_before_pgop_stat)) { #if MDBX_ENABLE_PGOP_STAT - arg->mi_pgop_stat.newly = + out->mi_pgop_stat.newly = atomic_load64(&lck->mti_pgop_stat.newly, mo_Relaxed); - arg->mi_pgop_stat.cow = atomic_load64(&lck->mti_pgop_stat.cow, mo_Relaxed); - arg->mi_pgop_stat.clone = + out->mi_pgop_stat.cow = atomic_load64(&lck->mti_pgop_stat.cow, mo_Relaxed); + out->mi_pgop_stat.clone = atomic_load64(&lck->mti_pgop_stat.clone, mo_Relaxed); - arg->mi_pgop_stat.split = + out->mi_pgop_stat.split = atomic_load64(&lck->mti_pgop_stat.split, mo_Relaxed); - arg->mi_pgop_stat.merge = + out->mi_pgop_stat.merge = atomic_load64(&lck->mti_pgop_stat.merge, mo_Relaxed); - arg->mi_pgop_stat.spill = + out->mi_pgop_stat.spill = atomic_load64(&lck->mti_pgop_stat.spill, mo_Relaxed); - arg->mi_pgop_stat.unspill = + out->mi_pgop_stat.unspill = atomic_load64(&lck->mti_pgop_stat.unspill, mo_Relaxed); - arg->mi_pgop_stat.wops = + out->mi_pgop_stat.wops = atomic_load64(&lck->mti_pgop_stat.wops, mo_Relaxed); - arg->mi_pgop_stat.prefault = + out->mi_pgop_stat.prefault = atomic_load64(&lck->mti_pgop_stat.prefault, mo_Relaxed); - arg->mi_pgop_stat.mincore = + out->mi_pgop_stat.mincore = atomic_load64(&lck->mti_pgop_stat.mincore, mo_Relaxed); - arg->mi_pgop_stat.msync = + out->mi_pgop_stat.msync = atomic_load64(&lck->mti_pgop_stat.msync, mo_Relaxed); - arg->mi_pgop_stat.fsync = + out->mi_pgop_stat.fsync = atomic_load64(&lck->mti_pgop_stat.fsync, mo_Relaxed); #else - memset(&arg->mi_pgop_stat, 0, sizeof(arg->mi_pgop_stat)); + memset(&out->mi_pgop_stat, 0, sizeof(out->mi_pgop_stat)); #endif /* MDBX_ENABLE_PGOP_STAT*/ } - arg->mi_self_latter_reader_txnid = arg->mi_latter_reader_txnid = - arg->mi_recent_txnid; + txnid_t overall_latter_reader_txnid = out->mi_recent_txnid; + txnid_t self_latter_reader_txnid = overall_latter_reader_txnid; if (env->me_lck_mmap.lck) { - for (size_t i = 0; i < arg->mi_numreaders; ++i) { + for (size_t i = 0; i < 
out->mi_numreaders; ++i) { const uint32_t pid = atomic_load32(&lck->mti_readers[i].mr_pid, mo_AcquireRelease); if (pid) { const txnid_t txnid = safe64_read(&lck->mti_readers[i].mr_txnid); - if (arg->mi_latter_reader_txnid > txnid) - arg->mi_latter_reader_txnid = txnid; - if (pid == env->me_pid && arg->mi_self_latter_reader_txnid > txnid) - arg->mi_self_latter_reader_txnid = txnid; + if (overall_latter_reader_txnid > txnid) + overall_latter_reader_txnid = txnid; + if (pid == env->me_pid && self_latter_reader_txnid > txnid) + self_latter_reader_txnid = txnid; } } } + out->mi_self_latter_reader_txnid = self_latter_reader_txnid; + out->mi_latter_reader_txnid = overall_latter_reader_txnid; osal_compiler_barrier(); return MDBX_SUCCESS; } +__cold int env_info(const MDBX_env *env, const MDBX_txn *txn, MDBX_envinfo *out, + size_t bytes, meta_troika_t *troika) { + MDBX_envinfo snap; + int rc = env_info_snap(env, txn, &snap, sizeof(snap), troika); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; + + eASSERT(env, sizeof(snap) >= bytes); + while (1) { + rc = env_info_snap(env, txn, out, bytes, troika); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; + snap.mi_since_sync_seconds16dot16 = out->mi_since_sync_seconds16dot16; + snap.mi_since_reader_check_seconds16dot16 = + out->mi_since_reader_check_seconds16dot16; + if (likely(memcmp(&snap, out, bytes) == 0)) + return MDBX_SUCCESS; + memcpy(&snap, out, bytes); + } +} + +__cold int mdbx_preopen_snapinfo(const char *pathname, MDBX_envinfo *out, + size_t bytes) { +#if defined(_WIN32) || defined(_WIN64) + wchar_t *pathnameW = nullptr; + int rc = osal_mb2w(pathname, &pathnameW); + if (likely(rc == MDBX_SUCCESS)) { + rc = mdbx_preopen_snapinfoW(pathnameW, out, bytes); + osal_free(pathnameW); + } + return rc; +} + +__cold int mdbx_preopen_snapinfoW(const wchar_t *pathname, MDBX_envinfo *out, + size_t bytes) { +#endif /* Windows */ + if (unlikely(!out)) + return MDBX_EINVAL; + + const size_t size_before_bootid = offsetof(MDBX_envinfo, 
mi_bootid); + const size_t size_before_pgop_stat = offsetof(MDBX_envinfo, mi_pgop_stat); + if (unlikely(bytes != sizeof(MDBX_envinfo)) && bytes != size_before_bootid && + bytes != size_before_pgop_stat) + return MDBX_EINVAL; + + memset(out, 0, bytes); + if (likely(bytes > size_before_bootid)) { + out->mi_bootid.current.x = bootid.x; + out->mi_bootid.current.y = bootid.y; + } + + MDBX_env env; + memset(&env, 0, sizeof(env)); + env.me_pid = osal_getpid(); + const size_t os_psize = osal_syspagesize(); + if (unlikely(!is_powerof2(os_psize) || os_psize < MIN_PAGESIZE)) { + ERROR("unsuitable system pagesize %" PRIuPTR, os_psize); + return MDBX_INCOMPATIBLE; + } + out->mi_sys_pagesize = env.me_os_psize = (unsigned)os_psize; + env.me_flags = MDBX_RDONLY | MDBX_NORDAHEAD | MDBX_ACCEDE | MDBX_VALIDATION; + env.me_stuck_meta = -1; + env.me_lfd = INVALID_HANDLE_VALUE; + env.me_lazy_fd = INVALID_HANDLE_VALUE; + env.me_dsync_fd = INVALID_HANDLE_VALUE; + env.me_fd4meta = INVALID_HANDLE_VALUE; +#if defined(_WIN32) || defined(_WIN64) + env.me_data_lock_event = INVALID_HANDLE_VALUE; + env.me_overlapped_fd = INVALID_HANDLE_VALUE; +#endif /* Windows */ + + int rc = env_handle_pathname(&env, pathname, 0); + if (unlikely(rc != MDBX_SUCCESS)) + goto bailout; + rc = osal_openfile(MDBX_OPEN_DXB_READ, &env, env.me_pathname.dxb, + &env.me_lazy_fd, 0); + if (unlikely(rc != MDBX_SUCCESS)) + goto bailout; + + MDBX_meta header; + rc = read_header(&env, &header, 0, 0); + if (unlikely(rc != MDBX_SUCCESS)) + goto bailout; + + setup_pagesize(&env, header.mm_psize); + out->mi_dxb_pagesize = env.me_psize; + out->mi_geo.lower = pgno2bytes(&env, header.mm_geo.lower); + out->mi_geo.upper = pgno2bytes(&env, header.mm_geo.upper); + out->mi_geo.shrink = pgno2bytes(&env, pv2pages(header.mm_geo.shrink_pv)); + out->mi_geo.grow = pgno2bytes(&env, pv2pages(header.mm_geo.grow_pv)); + out->mi_geo.current = pgno2bytes(&env, header.mm_geo.now); + out->mi_last_pgno = header.mm_geo.next - 1; + + const unsigned n = 0; 
+ out->mi_recent_txnid = constmeta_txnid(&header); + out->mi_meta_sign[n] = unaligned_peek_u64(4, &header.mm_sign); + if (likely(bytes > size_before_bootid)) + memcpy(&out->mi_bootid.meta[n], &header.mm_bootid, 16); + +bailout: + env_close(&env, false); + return rc; +} + __cold int mdbx_env_info_ex(const MDBX_env *env, const MDBX_txn *txn, MDBX_envinfo *arg, size_t bytes) { if (unlikely((env == NULL && txn == NULL) || arg == NULL)) return MDBX_EINVAL; + const size_t size_before_bootid = offsetof(MDBX_envinfo, mi_bootid); + const size_t size_before_pgop_stat = offsetof(MDBX_envinfo, mi_pgop_stat); + if (unlikely(bytes != sizeof(MDBX_envinfo)) && bytes != size_before_bootid && + bytes != size_before_pgop_stat) + return MDBX_EINVAL; + if (txn) { int err = check_txn(txn, MDBX_TXN_BLOCKED - MDBX_TXN_ERROR); if (unlikely(err != MDBX_SUCCESS)) @@ -26672,28 +27727,8 @@ __cold int mdbx_env_info_ex(const MDBX_env *env, const MDBX_txn *txn, env = txn->mt_env; } - const size_t size_before_bootid = offsetof(MDBX_envinfo, mi_bootid); - const size_t size_before_pgop_stat = offsetof(MDBX_envinfo, mi_pgop_stat); - if (unlikely(bytes != sizeof(MDBX_envinfo)) && bytes != size_before_bootid && - bytes != size_before_pgop_stat) - return MDBX_EINVAL; - - MDBX_envinfo snap; - int rc = fetch_envinfo_ex(env, txn, &snap, sizeof(snap)); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; - - while (1) { - rc = fetch_envinfo_ex(env, txn, arg, bytes); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; - snap.mi_since_sync_seconds16dot16 = arg->mi_since_sync_seconds16dot16; - snap.mi_since_reader_check_seconds16dot16 = - arg->mi_since_reader_check_seconds16dot16; - if (likely(memcmp(&snap, arg, bytes) == 0)) - return MDBX_SUCCESS; - memcpy(&snap, arg, bytes); - } + meta_troika_t troika; + return env_info(env, txn, arg, bytes, &troika); } static __inline MDBX_cmp_func *get_default_keycmp(MDBX_db_flags_t flags) { @@ -26710,321 +27745,443 @@ static __inline MDBX_cmp_func 
*get_default_datacmp(MDBX_db_flags_t flags) { : ((flags & MDBX_REVERSEDUP) ? cmp_reverse : cmp_lexical)); } -static int dbi_bind(MDBX_txn *txn, const MDBX_dbi dbi, unsigned user_flags, +static int dbi_bind(MDBX_txn *txn, const size_t dbi, unsigned user_flags, MDBX_cmp_func *keycmp, MDBX_cmp_func *datacmp) { - /* Accepting only three cases: - * 1) user_flags and both comparators are zero - * = assume that a by-default mode/flags is requested for reading; - * 2) user_flags exactly the same - * = assume that the target mode/flags are requested properly; - * 3) user_flags differs, but table is empty and MDBX_CREATE is provided - * = assume that a properly create request with custom flags; + const MDBX_env *const env = txn->mt_env; + eASSERT(env, dbi < txn->mt_numdbs && dbi < env->me_numdbs); + eASSERT(env, dbi_state(txn, dbi) & DBI_LINDO); + eASSERT(env, env->me_db_flags[dbi] != DB_POISON); + if ((env->me_db_flags[dbi] & DB_VALID) == 0) { + eASSERT(env, !env->me_dbxs[dbi].md_cmp && !env->me_dbxs[dbi].md_dcmp && + !env->me_dbxs[dbi].md_name.iov_len && + !env->me_dbxs[dbi].md_name.iov_base && + !env->me_dbxs[dbi].md_klen_max && + !env->me_dbxs[dbi].md_klen_min && + !env->me_dbxs[dbi].md_vlen_max && + !env->me_dbxs[dbi].md_vlen_min); + } else { + eASSERT(env, !(txn->mt_dbi_state[dbi] & DBI_VALID) || + (txn->mt_dbs[dbi].md_flags | DB_VALID) == + env->me_db_flags[dbi]); + eASSERT(env, env->me_dbxs[dbi].md_name.iov_base || dbi < CORE_DBS); + } + + /* If the dbi has already been used, four cases are treated as valid: + * 1) user_flags equal MDBX_DB_ACCEDE + * = assume the user is opening an existing subDb, + * and the checks below will not let other comparators be installed. + * 2) user_flags are zero and both comparators are empty/null or equal the current ones + * = assume the user is opening an existing subDb + * the legacy way, with zero flags meaning the defaults. 
+ * 3) user_flags match and the comparators are not given or are the same + * = assume the user is opening the subDb and specifying all parameters; + * 4) user_flags differ, but the subDb is empty and MDBX_CREATE is given + * = assume the user is re-creating the subDb; */ - if ((user_flags ^ txn->mt_dbs[dbi].md_flags) & DB_PERSISTENT_FLAGS) { + if ((user_flags & ~MDBX_CREATE) != + (unsigned)(env->me_db_flags[dbi] & DB_PERSISTENT_FLAGS)) { /* flags are differs, check other conditions */ - if ((!user_flags && (!keycmp || keycmp == txn->mt_dbxs[dbi].md_cmp) && - (!datacmp || datacmp == txn->mt_dbxs[dbi].md_dcmp)) || - user_flags == MDBX_ACCEDE) { - /* no comparators were provided and flags are zero, - * seems that is case #1 above */ - user_flags = txn->mt_dbs[dbi].md_flags; - } else if ((user_flags & MDBX_CREATE) && txn->mt_dbs[dbi].md_entries == 0) { - if (txn->mt_flags & MDBX_TXN_RDONLY) - return /* FIXME: return extended info */ MDBX_EACCESS; - /* make sure flags changes get committed */ - txn->mt_dbs[dbi].md_flags = user_flags & DB_PERSISTENT_FLAGS; - txn->mt_flags |= MDBX_TXN_DIRTY; - /* обнуляем компараторы для установки в соответствии с флагами, - * либо заданных пользователем */ - txn->mt_dbxs[dbi].md_cmp = nullptr; - txn->mt_dbxs[dbi].md_dcmp = nullptr; - } else { + if ((!user_flags && (!keycmp || keycmp == env->me_dbxs[dbi].md_cmp) && + (!datacmp || datacmp == env->me_dbxs[dbi].md_dcmp)) || + user_flags == MDBX_DB_ACCEDE) { + user_flags = env->me_db_flags[dbi] & DB_PERSISTENT_FLAGS; + } else if ((user_flags & MDBX_CREATE) == 0) return /* FIXME: return extended info */ MDBX_INCOMPATIBLE; + else { + eASSERT(env, env->me_db_flags[dbi] & DB_VALID); + if (txn->mt_dbi_state[dbi] & DBI_STALE) { + int err = fetch_sdb(txn, dbi); + if (unlikely(err != MDBX_SUCCESS)) + return err; /* fetch_sdb() failed: propagate the error; on success continue to the emptiness check below */ + } + eASSERT(env, + (txn->mt_dbi_state[dbi] & (DBI_LINDO | DBI_VALID | DBI_STALE)) == + (DBI_LINDO | 
DBI_VALID)); + if (unlikely(txn->mt_dbs[dbi].md_leaf_pages)) + return /* FIXME: return
extended info */ MDBX_INCOMPATIBLE; + + /* Re-create the subDB if it is empty */ + if (unlikely(txn->mt_cursors[dbi])) + return MDBX_DANGLING_DBI; + env->me_db_flags[dbi] = DB_POISON; + atomic_store32(&env->me_dbi_seqs[dbi], dbi_seq_next(env, MAIN_DBI), + mo_AcquireRelease); + + const uint32_t seq = dbi_seq_next(env, dbi); + const uint16_t db_flags = user_flags & DB_PERSISTENT_FLAGS; + eASSERT(env, txn->mt_dbs[dbi].md_depth == 0 && + txn->mt_dbs[dbi].md_entries == 0 && + txn->mt_dbs[dbi].md_root == P_INVALID); + env->me_dbxs[dbi].md_cmp = + keycmp ? keycmp : get_default_keycmp(user_flags); + env->me_dbxs[dbi].md_dcmp = + datacmp ? datacmp : get_default_datacmp(user_flags); + txn->mt_dbs[dbi].md_flags = db_flags; + txn->mt_dbs[dbi].md_xsize = 0; + if (unlikely(setup_sdb(&env->me_dbxs[dbi], &txn->mt_dbs[dbi], + env->me_psize))) { + txn->mt_dbi_state[dbi] = DBI_LINDO; + txn->mt_flags |= MDBX_TXN_ERROR; + return MDBX_PROBLEM; + } + + env->me_db_flags[dbi] = db_flags | DB_VALID; + atomic_store32(&env->me_dbi_seqs[dbi], seq, mo_AcquireRelease); + txn->mt_dbi_seqs[dbi] = seq; + txn->mt_dbi_state[dbi] = DBI_LINDO | DBI_VALID | DBI_CREAT | DBI_DIRTY; + txn->mt_flags |= MDBX_TXN_DIRTY; } } if (!keycmp) - keycmp = txn->mt_dbxs[dbi].md_cmp ? txn->mt_dbxs[dbi].md_cmp - : get_default_keycmp(user_flags); - if (txn->mt_dbxs[dbi].md_cmp != keycmp) { - if (txn->mt_dbxs[dbi].md_cmp) + keycmp = (env->me_db_flags[dbi] & DB_VALID) + ? env->me_dbxs[dbi].md_cmp + : get_default_keycmp(user_flags); + if (env->me_dbxs[dbi].md_cmp != keycmp) { + if (env->me_db_flags[dbi] & DB_VALID) return MDBX_EINVAL; - txn->mt_dbxs[dbi].md_cmp = keycmp; + env->me_dbxs[dbi].md_cmp = keycmp; } if (!datacmp) - datacmp = txn->mt_dbxs[dbi].md_dcmp ? txn->mt_dbxs[dbi].md_dcmp - : get_default_datacmp(user_flags); - if (txn->mt_dbxs[dbi].md_dcmp != datacmp) { - if (txn->mt_dbxs[dbi].md_dcmp) + datacmp = (env->me_db_flags[dbi] & DB_VALID) + ? 
env->me_dbxs[dbi].md_dcmp + : get_default_datacmp(user_flags); + if (env->me_dbxs[dbi].md_dcmp != datacmp) { + if (env->me_db_flags[dbi] & DB_VALID) return MDBX_EINVAL; - txn->mt_dbxs[dbi].md_dcmp = datacmp; + env->me_dbxs[dbi].md_dcmp = datacmp; } return MDBX_SUCCESS; } -static int dbi_open(MDBX_txn *txn, const MDBX_val *const table_name, - unsigned user_flags, MDBX_dbi *dbi, MDBX_cmp_func *keycmp, - MDBX_cmp_func *datacmp) { - int rc = MDBX_EINVAL; - if (unlikely(!dbi)) - return rc; - - void *clone = nullptr; - bool locked = false; - if (unlikely((user_flags & ~DB_USABLE_FLAGS) != 0)) { - bailout: - tASSERT(txn, MDBX_IS_ERROR(rc)); - *dbi = 0; - if (locked) - ENSURE(txn->mt_env, - osal_fastmutex_release(&txn->mt_env->me_dbi_lock) == MDBX_SUCCESS); - osal_free(clone); - return rc; - } - - rc = check_txn(txn, MDBX_TXN_BLOCKED); - if (unlikely(rc != MDBX_SUCCESS)) - goto bailout; - - if ((user_flags & MDBX_CREATE) && unlikely(txn->mt_flags & MDBX_TXN_RDONLY)) { - rc = MDBX_EACCESS; - goto bailout; - } - - switch (user_flags & (MDBX_INTEGERDUP | MDBX_DUPFIXED | MDBX_DUPSORT | - MDBX_REVERSEDUP | MDBX_ACCEDE)) { - case MDBX_ACCEDE: - if ((user_flags & MDBX_CREATE) == 0) - break; - __fallthrough /* fall through */; - default: - rc = MDBX_EINVAL; - goto bailout; - - case MDBX_DUPSORT: - case MDBX_DUPSORT | MDBX_REVERSEDUP: - case MDBX_DUPSORT | MDBX_DUPFIXED: - case MDBX_DUPSORT | MDBX_DUPFIXED | MDBX_REVERSEDUP: - case MDBX_DUPSORT | MDBX_DUPFIXED | MDBX_INTEGERDUP: - case MDBX_DUPSORT | MDBX_DUPFIXED | MDBX_INTEGERDUP | MDBX_REVERSEDUP: - case 0: - break; - } - - /* main table? 
*/ - if (table_name == MDBX_PGWALK_MAIN || - table_name->iov_base == MDBX_PGWALK_MAIN) { - rc = dbi_bind(txn, MAIN_DBI, user_flags, keycmp, datacmp); - if (unlikely(rc != MDBX_SUCCESS)) - goto bailout; - *dbi = MAIN_DBI; - return rc; - } - if (table_name == MDBX_PGWALK_GC || table_name->iov_base == MDBX_PGWALK_GC) { - rc = dbi_bind(txn, FREE_DBI, user_flags, keycmp, datacmp); - if (unlikely(rc != MDBX_SUCCESS)) - goto bailout; - *dbi = FREE_DBI; - return rc; - } - if (table_name == MDBX_PGWALK_META || - table_name->iov_base == MDBX_PGWALK_META) { - rc = MDBX_EINVAL; - goto bailout; - } +static __inline size_t dbi_namelen(const MDBX_val name) { + return (name.iov_len > sizeof(struct mdbx_defer_free_item)) + ? name.iov_len + : sizeof(struct mdbx_defer_free_item); +} - MDBX_val key = *table_name; +static int dbi_open_locked(MDBX_txn *txn, unsigned user_flags, MDBX_dbi *dbi, + MDBX_cmp_func *keycmp, MDBX_cmp_func *datacmp, + MDBX_val name) { MDBX_env *const env = txn->mt_env; - if (key.iov_len > env->me_leaf_nodemax - NODESIZE - sizeof(MDBX_db)) - return MDBX_EINVAL; /* Cannot mix named table(s) with DUPSORT flags */ + tASSERT(txn, + (txn->mt_dbi_state[MAIN_DBI] & (DBI_LINDO | DBI_VALID | DBI_STALE)) == + (DBI_LINDO | DBI_VALID)); if (unlikely(txn->mt_dbs[MAIN_DBI].md_flags & MDBX_DUPSORT)) { - if ((user_flags & MDBX_CREATE) == 0) { - rc = MDBX_NOTFOUND; - goto bailout; - } - if (txn->mt_dbs[MAIN_DBI].md_leaf_pages || txn->mt_dbxs[MAIN_DBI].md_cmp) { - /* В MAIN_DBI есть записи либо она уже использовалась. */ - rc = MDBX_INCOMPATIBLE; - goto bailout; - } - /* Пересоздаём MAIN_DBI если там пусто. */ - atomic_store32(&txn->mt_dbiseqs[MAIN_DBI], dbi_seq(env, MAIN_DBI), - mo_AcquireRelease); + if (unlikely((user_flags & MDBX_CREATE) == 0)) + return MDBX_NOTFOUND; + if (unlikely(txn->mt_dbs[MAIN_DBI].md_leaf_pages)) + /* MainDB contains records, or it has already been in use. */ + return MDBX_INCOMPATIBLE; + + /* Re-create MainDB when it is empty. 
*/ tASSERT(txn, txn->mt_dbs[MAIN_DBI].md_depth == 0 && txn->mt_dbs[MAIN_DBI].md_entries == 0 && txn->mt_dbs[MAIN_DBI].md_root == P_INVALID); - txn->mt_dbs[MAIN_DBI].md_flags &= MDBX_REVERSEKEY | MDBX_INTEGERKEY; - txn->mt_dbistate[MAIN_DBI] |= DBI_DIRTY; + if (unlikely(txn->mt_cursors[MAIN_DBI])) + return MDBX_DANGLING_DBI; + env->me_db_flags[MAIN_DBI] = DB_POISON; + atomic_store32(&env->me_dbi_seqs[MAIN_DBI], dbi_seq_next(env, MAIN_DBI), + mo_AcquireRelease); + + const uint32_t seq = dbi_seq_next(env, MAIN_DBI); + const uint16_t main_flags = + txn->mt_dbs[MAIN_DBI].md_flags & (MDBX_REVERSEKEY | MDBX_INTEGERKEY); + env->me_dbxs[MAIN_DBI].md_cmp = get_default_keycmp(main_flags); + env->me_dbxs[MAIN_DBI].md_dcmp = get_default_datacmp(main_flags); + txn->mt_dbs[MAIN_DBI].md_flags = main_flags; + txn->mt_dbs[MAIN_DBI].md_xsize = 0; + if (unlikely(setup_sdb(&env->me_dbxs[MAIN_DBI], &txn->mt_dbs[MAIN_DBI], + env->me_psize) != MDBX_SUCCESS)) { + txn->mt_dbi_state[MAIN_DBI] = DBI_LINDO; + txn->mt_flags |= MDBX_TXN_ERROR; + env->me_flags |= MDBX_FATAL_ERROR; + return MDBX_FATAL_ERROR; + } + env->me_db_flags[MAIN_DBI] = main_flags | DB_VALID; + txn->mt_dbi_seqs[MAIN_DBI] = + atomic_store32(&env->me_dbi_seqs[MAIN_DBI], seq, mo_AcquireRelease); + txn->mt_dbi_state[MAIN_DBI] |= DBI_DIRTY; txn->mt_flags |= MDBX_TXN_DIRTY; - txn->mt_dbxs[MAIN_DBI].md_cmp = - get_default_keycmp(txn->mt_dbs[MAIN_DBI].md_flags); - txn->mt_dbxs[MAIN_DBI].md_dcmp = - get_default_datacmp(txn->mt_dbs[MAIN_DBI].md_flags); } - tASSERT(txn, txn->mt_dbxs[MAIN_DBI].md_cmp); + tASSERT(txn, env->me_dbxs[MAIN_DBI].md_cmp); /* Is the DB already open? */ - MDBX_dbi scan, slot; - for (slot = scan = txn->mt_numdbs; --scan >= CORE_DBS;) { - if (!txn->mt_dbxs[scan].md_name.iov_base) { + size_t slot = env->me_numdbs; + for (size_t scan = CORE_DBS; scan < env->me_numdbs; ++scan) { + if ((env->me_db_flags[scan] & DB_VALID) == 0) { /* Remember this free slot */ - slot = scan; + slot = (slot < scan) ? 
slot : scan; continue; } - if (key.iov_len == txn->mt_dbxs[scan].md_name.iov_len && - !memcmp(key.iov_base, txn->mt_dbxs[scan].md_name.iov_base, - key.iov_len)) { - rc = dbi_bind(txn, scan, user_flags, keycmp, datacmp); - if (unlikely(rc != MDBX_SUCCESS)) - goto bailout; - *dbi = scan; - return rc; + if (!env->me_dbxs[MAIN_DBI].md_cmp(&name, &env->me_dbxs[scan].md_name)) { + slot = scan; + int err = dbi_check(txn, slot); + if (err == MDBX_BAD_DBI && + txn->mt_dbi_state[slot] == (DBI_OLDEN | DBI_LINDO)) { + /* the handle was in use and became invalid, + * but is now being explicitly re-opened within this transaction */ + eASSERT(env, !txn->mt_cursors[slot]); + txn->mt_dbi_state[slot] = DBI_LINDO; + err = dbi_check(txn, slot); + } + if (err == MDBX_SUCCESS) { + err = dbi_bind(txn, slot, user_flags, keycmp, datacmp); + if (likely(err == MDBX_SUCCESS)) { + goto done; + } + } + return err; } } /* Fail, if no free slot and max hit */ - if (unlikely(slot >= env->me_maxdbs)) { - rc = MDBX_DBS_FULL; - goto bailout; - } + if (unlikely(slot >= env->me_maxdbs)) + return MDBX_DBS_FULL; + + if (env->me_numdbs == slot) + eASSERT(env, !env->me_db_flags[slot] && + !env->me_dbxs[slot].md_name.iov_len && + !env->me_dbxs[slot].md_name.iov_base); + + env->me_db_flags[slot] = DB_POISON; + atomic_store32(&env->me_dbi_seqs[slot], dbi_seq_next(env, slot), + mo_AcquireRelease); + memset(&env->me_dbxs[slot], 0, sizeof(env->me_dbxs[slot])); + if (env->me_numdbs == slot) + env->me_numdbs = (unsigned)slot + 1; + eASSERT(env, slot < env->me_numdbs); + + int err = dbi_check(txn, slot); + eASSERT(env, err == MDBX_BAD_DBI); + if (err != MDBX_BAD_DBI) + return MDBX_PROBLEM; /* Find the DB info */ - MDBX_val data; - MDBX_cursor_couple couple; - rc = cursor_init(&couple.outer, txn, MAIN_DBI); + MDBX_val body; + MDBX_cursor_couple cx; + int rc = cursor_init(&cx.outer, txn, MAIN_DBI); if (unlikely(rc != MDBX_SUCCESS)) - goto bailout; - rc = cursor_set(&couple.outer, &key, &data, MDBX_SET).err; + return rc; + rc = 
cursor_set(&cx.outer, &name, &body, MDBX_SET).err; if (unlikely(rc != MDBX_SUCCESS)) { if (rc != MDBX_NOTFOUND || !(user_flags & MDBX_CREATE)) - goto bailout; + return rc; } else { /* make sure this is actually a table */ - MDBX_node *node = page_node(couple.outer.mc_pg[couple.outer.mc_top], - couple.outer.mc_ki[couple.outer.mc_top]); - if (unlikely((node_flags(node) & (F_DUPDATA | F_SUBDATA)) != F_SUBDATA)) { - rc = MDBX_INCOMPATIBLE; - goto bailout; - } - if (!MDBX_DISABLE_VALIDATION && unlikely(data.iov_len != sizeof(MDBX_db))) { - rc = MDBX_CORRUPTED; - goto bailout; + MDBX_node *node = page_node(cx.outer.mc_pg[cx.outer.mc_top], + cx.outer.mc_ki[cx.outer.mc_top]); + if (unlikely((node_flags(node) & (F_DUPDATA | F_SUBDATA)) != F_SUBDATA)) + return MDBX_INCOMPATIBLE; + if (!MDBX_DISABLE_VALIDATION && unlikely(body.iov_len != sizeof(MDBX_db))) { + ERROR("%s/%d: %s %zu", "MDBX_CORRUPTED", MDBX_CORRUPTED, + "invalid subDb node size", body.iov_len); + return MDBX_CORRUPTED; } - } - - if (rc != MDBX_SUCCESS && unlikely(txn->mt_flags & MDBX_TXN_RDONLY)) { - rc = MDBX_EACCESS; - goto bailout; + memcpy(&txn->mt_dbs[slot], body.iov_base, sizeof(MDBX_db)); } /* Done here so we cannot fail after creating a new DB */ - if (key.iov_len) { - clone = osal_malloc(key.iov_len); - if (unlikely(!clone)) { - rc = MDBX_ENOMEM; - goto bailout; - } - key.iov_base = memcpy(clone, key.iov_base, key.iov_len); + void *clone = nullptr; + if (name.iov_len) { + clone = osal_malloc(dbi_namelen(name)); + if (unlikely(!clone)) + return MDBX_ENOMEM; + name.iov_base = memcpy(clone, name.iov_base, name.iov_len); } else - key.iov_base = ""; - - int err = osal_fastmutex_acquire(&env->me_dbi_lock); - if (unlikely(err != MDBX_SUCCESS)) { - rc = err; - goto bailout; - } - locked = true; - - /* Import handles from env */ - dbi_import_locked(txn); - - /* Rescan after mutex acquisition & import handles */ - for (slot = scan = txn->mt_numdbs; --scan >= CORE_DBS;) { - if 
(!txn->mt_dbxs[scan].md_name.iov_base) { - /* Remember this free slot */ - slot = scan; - continue; - } - if (key.iov_len == txn->mt_dbxs[scan].md_name.iov_len && - !memcmp(key.iov_base, txn->mt_dbxs[scan].md_name.iov_base, - key.iov_len)) { - rc = dbi_bind(txn, scan, user_flags, keycmp, datacmp); - if (unlikely(rc != MDBX_SUCCESS)) - goto bailout; - slot = scan; - goto done; - } - } - - if (unlikely(slot >= env->me_maxdbs)) { - rc = MDBX_DBS_FULL; - goto bailout; - } + name.iov_base = ""; - unsigned dbiflags = DBI_FRESH | DBI_VALID | DBI_USRVALID; - MDBX_db db_dummy; + uint8_t dbi_state = DBI_LINDO | DBI_VALID | DBI_FRESH; if (unlikely(rc)) { /* MDBX_NOTFOUND and MDBX_CREATE: Create new DB */ tASSERT(txn, rc == MDBX_NOTFOUND); - memset(&db_dummy, 0, sizeof(db_dummy)); - db_dummy.md_root = P_INVALID; - db_dummy.md_mod_txnid = txn->mt_txnid; - db_dummy.md_flags = user_flags & DB_PERSISTENT_FLAGS; - data.iov_len = sizeof(db_dummy); - data.iov_base = &db_dummy; + body.iov_base = + memset(&txn->mt_dbs[slot], 0, body.iov_len = sizeof(MDBX_db)); + txn->mt_dbs[slot].md_root = P_INVALID; + txn->mt_dbs[slot].md_mod_txnid = txn->mt_txnid; + txn->mt_dbs[slot].md_flags = user_flags & DB_PERSISTENT_FLAGS; WITH_CURSOR_TRACKING( - couple.outer, rc = cursor_put_checklen(&couple.outer, &key, &data, - F_SUBDATA | MDBX_NOOVERWRITE)); + cx.outer, rc = cursor_put_checklen(&cx.outer, &name, &body, + F_SUBDATA | MDBX_NOOVERWRITE)); if (unlikely(rc != MDBX_SUCCESS)) goto bailout; - dbiflags |= DBI_DIRTY | DBI_CREAT; + dbi_state |= DBI_DIRTY | DBI_CREAT; txn->mt_flags |= MDBX_TXN_DIRTY; - tASSERT(txn, (txn->mt_dbistate[MAIN_DBI] & DBI_DIRTY) != 0); + tASSERT(txn, (txn->mt_dbi_state[MAIN_DBI] & DBI_DIRTY) != 0); } /* Got info, register DBI in this txn */ - memset(txn->mt_dbxs + slot, 0, sizeof(MDBX_dbx)); - memcpy(&txn->mt_dbs[slot], data.iov_base, sizeof(MDBX_db)); - env->me_dbflags[slot] = 0; + const uint32_t seq = dbi_seq_next(env, slot); + eASSERT(env, + env->me_db_flags[slot] == 
DB_POISON && !txn->mt_cursors[slot] && + (txn->mt_dbi_state[slot] & (DBI_LINDO | DBI_VALID)) == DBI_LINDO); + txn->mt_dbi_state[slot] = dbi_state; + memcpy(&txn->mt_dbs[slot], body.iov_base, sizeof(txn->mt_dbs[slot])); + env->me_db_flags[slot] = txn->mt_dbs[slot].md_flags; rc = dbi_bind(txn, slot, user_flags, keycmp, datacmp); - if (unlikely(rc != MDBX_SUCCESS)) { - tASSERT(txn, (dbiflags & DBI_CREAT) == 0); + if (unlikely(rc != MDBX_SUCCESS)) goto bailout; + + env->me_dbxs[slot].md_name = name; + env->me_db_flags[slot] = txn->mt_dbs[slot].md_flags | DB_VALID; + txn->mt_dbi_seqs[slot] = + atomic_store32(&env->me_dbi_seqs[slot], seq, mo_AcquireRelease); + +done: + *dbi = (MDBX_dbi)slot; + tASSERT(txn, + slot < txn->mt_numdbs && (env->me_db_flags[slot] & DB_VALID) != 0); + eASSERT(env, dbi_check(txn, slot) == MDBX_SUCCESS); + return MDBX_SUCCESS; + +bailout: + eASSERT(env, !txn->mt_cursors[slot] && !env->me_dbxs[slot].md_name.iov_len && + !env->me_dbxs[slot].md_name.iov_base); + txn->mt_dbi_state[slot] &= DBI_LINDO | DBI_OLDEN; + env->me_db_flags[slot] = 0; + osal_free(clone); + if (slot + 1 == env->me_numdbs) + txn->mt_numdbs = env->me_numdbs = (unsigned)slot; + return rc; +} + +static int dbi_open(MDBX_txn *txn, const MDBX_val *const name, + unsigned user_flags, MDBX_dbi *dbi, MDBX_cmp_func *keycmp, + MDBX_cmp_func *datacmp) { + if (unlikely(!dbi)) + return MDBX_EINVAL; + *dbi = 0; + if (unlikely((user_flags & ~DB_USABLE_FLAGS) != 0)) + return MDBX_EINVAL; + + int rc = check_txn(txn, MDBX_TXN_BLOCKED); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; + + if ((user_flags & MDBX_CREATE) && unlikely(txn->mt_flags & MDBX_TXN_RDONLY)) + return MDBX_EACCESS; + + switch (user_flags & (MDBX_INTEGERDUP | MDBX_DUPFIXED | MDBX_DUPSORT | + MDBX_REVERSEDUP | MDBX_ACCEDE)) { + case MDBX_ACCEDE: + if ((user_flags & MDBX_CREATE) == 0) + break; + __fallthrough /* fall through */; + default: + return MDBX_EINVAL; + + case MDBX_DUPSORT: + case MDBX_DUPSORT | MDBX_REVERSEDUP: + case 
MDBX_DUPSORT | MDBX_DUPFIXED: + case MDBX_DUPSORT | MDBX_DUPFIXED | MDBX_REVERSEDUP: + case MDBX_DUPSORT | MDBX_DUPFIXED | MDBX_INTEGERDUP: + case MDBX_DUPSORT | MDBX_DUPFIXED | MDBX_INTEGERDUP | MDBX_REVERSEDUP: + case MDBX_DB_DEFAULTS: + break; } + tASSERT(txn, db_check_flags((uint16_t)user_flags)); - txn->mt_dbistate[slot] = (uint8_t)dbiflags; - txn->mt_dbxs[slot].md_name = key; - txn->mt_dbiseqs[slot].weak = env->me_dbiseqs[slot].weak = dbi_seq(env, slot); - if (!(dbiflags & DBI_CREAT)) - env->me_dbflags[slot] = txn->mt_dbs[slot].md_flags | DB_VALID; - if (txn->mt_numdbs == slot) { - txn->mt_cursors[slot] = NULL; - osal_compiler_barrier(); - txn->mt_numdbs = slot + 1; + /* main table? */ + if (unlikely(name == MDBX_CHK_MAIN || name->iov_base == MDBX_CHK_MAIN)) { + rc = dbi_bind(txn, MAIN_DBI, user_flags, keycmp, datacmp); + if (likely(rc == MDBX_SUCCESS)) + *dbi = MAIN_DBI; + return rc; } - if (env->me_numdbs <= slot) { - osal_memory_fence(mo_AcquireRelease, true); - env->me_numdbs = slot + 1; + if (unlikely(name == MDBX_CHK_GC || name->iov_base == MDBX_CHK_GC)) { + rc = dbi_bind(txn, FREE_DBI, user_flags, keycmp, datacmp); + if (likely(rc == MDBX_SUCCESS)) + *dbi = FREE_DBI; + return rc; } + if (unlikely(name == MDBX_CHK_META || name->iov_base == MDBX_CHK_META)) + return MDBX_EINVAL; + if (unlikely(name->iov_len > + txn->mt_env->me_leaf_nodemax - NODESIZE - sizeof(MDBX_db))) + return MDBX_EINVAL; -done: - *dbi = slot; - ENSURE(env, osal_fastmutex_release(&env->me_dbi_lock) == MDBX_SUCCESS); - return MDBX_SUCCESS; +#if MDBX_ENABLE_DBI_LOCKFREE + /* Is the DB already open? 
*/ + const MDBX_env *const env = txn->mt_env; + size_t free_slot = env->me_numdbs; + for (size_t i = CORE_DBS; i < env->me_numdbs; ++i) { + retry: + if ((env->me_db_flags[i] & DB_VALID) == 0) { + free_slot = i; + continue; + } + + const uint32_t snap_seq = + atomic_load32(&env->me_dbi_seqs[i], mo_AcquireRelease); + const uint16_t snap_flags = env->me_db_flags[i]; + const MDBX_val snap_name = env->me_dbxs[i].md_name; + if (user_flags != MDBX_ACCEDE && + (((user_flags ^ snap_flags) & DB_PERSISTENT_FLAGS) || + (keycmp && keycmp != env->me_dbxs[i].md_cmp) || + (datacmp && datacmp != env->me_dbxs[i].md_dcmp))) + continue; + const uint32_t main_seq = + atomic_load32(&env->me_dbi_seqs[MAIN_DBI], mo_AcquireRelease); + MDBX_cmp_func *const snap_cmp = env->me_dbxs[MAIN_DBI].md_cmp; + if (unlikely(!(snap_flags & DB_VALID) || !snap_name.iov_base || + !snap_name.iov_len || !snap_cmp)) + continue; + + const bool name_match = snap_cmp(&snap_name, name) == 0; + osal_flush_incoherent_cpu_writeback(); + if (unlikely(snap_seq != + atomic_load32(&env->me_dbi_seqs[i], mo_AcquireRelease) || + main_seq != atomic_load32(&env->me_dbi_seqs[MAIN_DBI], + mo_AcquireRelease) || + snap_flags != env->me_db_flags[i] || + snap_name.iov_base != env->me_dbxs[i].md_name.iov_base || + snap_name.iov_len != env->me_dbxs[i].md_name.iov_len)) + goto retry; + if (name_match) { + rc = dbi_check(txn, i); + if (rc == MDBX_BAD_DBI && + txn->mt_dbi_state[i] == (DBI_OLDEN | DBI_LINDO)) { + /* the handle was in use and became invalid, + * but is now being explicitly re-opened within this transaction */ + eASSERT(env, !txn->mt_cursors[i]); + txn->mt_dbi_state[i] = DBI_LINDO; + rc = dbi_check(txn, i); + } + if (likely(rc == MDBX_SUCCESS)) { + rc = dbi_bind(txn, i, user_flags, keycmp, datacmp); + if (likely(rc == MDBX_SUCCESS)) + *dbi = (MDBX_dbi)i; + } + return rc; + } + } + + /* Fail, if no free slot and max hit */ + if (unlikely(free_slot >= env->me_maxdbs)) + return MDBX_DBS_FULL; +#endif /* 
MDBX_ENABLE_DBI_LOCKFREE */ + + rc = 
osal_fastmutex_acquire(&txn->mt_env->me_dbi_lock); + if (likely(rc == MDBX_SUCCESS)) { + rc = dbi_open_locked(txn, user_flags, dbi, keycmp, datacmp, *name); + ENSURE(txn->mt_env, + osal_fastmutex_release(&txn->mt_env->me_dbi_lock) == MDBX_SUCCESS); + } + return rc; } static int dbi_open_cstr(MDBX_txn *txn, const char *name_cstr, MDBX_db_flags_t flags, MDBX_dbi *dbi, MDBX_cmp_func *keycmp, MDBX_cmp_func *datacmp) { MDBX_val thunk, *name; - if (name_cstr == MDBX_PGWALK_MAIN || name_cstr == MDBX_PGWALK_GC || - name_cstr == MDBX_PGWALK_META) + if (name_cstr == MDBX_CHK_MAIN || name_cstr == MDBX_CHK_GC || + name_cstr == MDBX_CHK_META) name = (void *)name_cstr; else { thunk.iov_len = strlen(name_cstr); @@ -27056,6 +28213,105 @@ int mdbx_dbi_open_ex2(MDBX_txn *txn, const MDBX_val *name, return dbi_open(txn, name, flags, dbi, keycmp, datacmp); } +__cold int mdbx_dbi_rename(MDBX_txn *txn, MDBX_dbi dbi, const char *name_cstr) { + MDBX_val thunk, *name; + if (name_cstr == MDBX_CHK_MAIN || name_cstr == MDBX_CHK_GC || + name_cstr == MDBX_CHK_META) + name = (void *)name_cstr; + else { + thunk.iov_len = strlen(name_cstr); + thunk.iov_base = (void *)name_cstr; + name = &thunk; + } + return mdbx_dbi_rename2(txn, dbi, name); +} + +struct dbi_rename_result { + struct mdbx_defer_free_item *defer; + int err; +}; + +__cold static struct dbi_rename_result +dbi_rename_locked(MDBX_txn *txn, MDBX_dbi dbi, MDBX_val new_name) { + struct dbi_rename_result pair; + pair.defer = nullptr; + pair.err = dbi_check(txn, dbi); + if (unlikely(pair.err != MDBX_SUCCESS)) + return pair; + + MDBX_env *const env = txn->mt_env; + MDBX_val old_name = env->me_dbxs[dbi].md_name; + if (env->me_dbxs[MAIN_DBI].md_cmp(&new_name, &old_name) == 0 && + MDBX_DEBUG == 0) + return pair; + + MDBX_cursor_couple cx; + pair.err = cursor_init(&cx.outer, txn, MAIN_DBI); + if (unlikely(pair.err != MDBX_SUCCESS)) + return pair; + pair.err = cursor_set(&cx.outer, &new_name, nullptr, MDBX_SET).err; + if (unlikely(pair.err != 
MDBX_NOTFOUND)) { + pair.err = (pair.err == MDBX_SUCCESS) ? MDBX_KEYEXIST : pair.err; + return pair; + } + + pair.defer = osal_malloc(dbi_namelen(new_name)); + if (unlikely(!pair.defer)) { + pair.err = MDBX_ENOMEM; + return pair; + } + new_name.iov_base = memcpy(pair.defer, new_name.iov_base, new_name.iov_len); + + cx.outer.mc_next = txn->mt_cursors[MAIN_DBI]; + txn->mt_cursors[MAIN_DBI] = &cx.outer; + + MDBX_val data = {&txn->mt_dbs[dbi], sizeof(MDBX_db)}; + pair.err = cursor_put_checklen(&cx.outer, &new_name, &data, + F_SUBDATA | MDBX_NOOVERWRITE); + if (likely(pair.err == MDBX_SUCCESS)) { + pair.err = cursor_set(&cx.outer, &old_name, nullptr, MDBX_SET).err; + if (likely(pair.err == MDBX_SUCCESS)) + pair.err = cursor_del(&cx.outer, F_SUBDATA); + if (likely(pair.err == MDBX_SUCCESS)) { + pair.defer = env->me_dbxs[dbi].md_name.iov_base; + env->me_dbxs[dbi].md_name = new_name; + } else + txn->mt_flags |= MDBX_TXN_ERROR; + } + + txn->mt_cursors[MAIN_DBI] = cx.outer.mc_next; + return pair; +} + +__cold int mdbx_dbi_rename2(MDBX_txn *txn, MDBX_dbi dbi, + const MDBX_val *new_name) { + int rc = check_txn_rw(txn, MDBX_TXN_BLOCKED); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; + + if (unlikely(new_name == MDBX_CHK_MAIN || + new_name->iov_base == MDBX_CHK_MAIN || new_name == MDBX_CHK_GC || + new_name->iov_base == MDBX_CHK_GC || new_name == MDBX_CHK_META || + new_name->iov_base == MDBX_CHK_META)) + return MDBX_EINVAL; + + if (unlikely(dbi < CORE_DBS)) + return MDBX_EINVAL; + rc = dbi_check(txn, dbi); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; + + rc = osal_fastmutex_acquire(&txn->mt_env->me_dbi_lock); + if (likely(rc == MDBX_SUCCESS)) { + struct dbi_rename_result pair = dbi_rename_locked(txn, dbi, *new_name); + if (pair.defer) + pair.defer->next = nullptr; + env_defer_free_and_release(txn->mt_env, pair.defer); + rc = pair.err; + } + return rc; +} + __cold int mdbx_dbi_stat(const MDBX_txn *txn, MDBX_dbi dbi, MDBX_stat *dest, size_t bytes) { int rc = check_txn(txn, 
MDBX_TXN_BLOCKED); @@ -27065,8 +28321,9 @@ __cold int mdbx_dbi_stat(const MDBX_txn *txn, MDBX_dbi dbi, MDBX_stat *dest, if (unlikely(!dest)) return MDBX_EINVAL; - if (unlikely(!check_dbi(txn, dbi, DBI_VALID))) - return MDBX_BAD_DBI; + rc = dbi_check(txn, dbi); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; const size_t size_before_modtxnid = offsetof(MDBX_stat, ms_mod_txnid); if (unlikely(bytes != sizeof(MDBX_stat)) && bytes != size_before_modtxnid) @@ -27075,7 +28332,7 @@ __cold int mdbx_dbi_stat(const MDBX_txn *txn, MDBX_dbi dbi, MDBX_stat *dest, if (unlikely(txn->mt_flags & MDBX_TXN_BLOCKED)) return MDBX_BAD_TXN; - if (unlikely(txn->mt_dbistate[dbi] & DBI_STALE)) { + if (unlikely(txn->mt_dbi_state[dbi] & DBI_STALE)) { rc = fetch_sdb((MDBX_txn *)txn, dbi); if (unlikely(rc != MDBX_SUCCESS)) return rc; @@ -27086,31 +28343,35 @@ __cold int mdbx_dbi_stat(const MDBX_txn *txn, MDBX_dbi dbi, MDBX_stat *dest, return MDBX_SUCCESS; } -static int dbi_close_locked(MDBX_env *env, MDBX_dbi dbi) { +static struct mdbx_defer_free_item *dbi_close_locked(MDBX_env *env, + MDBX_dbi dbi) { eASSERT(env, dbi >= CORE_DBS); if (unlikely(dbi >= env->me_numdbs)) - return MDBX_BAD_DBI; - - char *const ptr = env->me_dbxs[dbi].md_name.iov_base; - /* If there was no name, this was already closed */ - if (unlikely(!ptr)) - return MDBX_BAD_DBI; + return nullptr; - env->me_dbflags[dbi] = 0; - env->me_dbxs[dbi].md_name.iov_len = 0; - osal_memory_fence(mo_AcquireRelease, true); - env->me_dbxs[dbi].md_name.iov_base = NULL; - osal_free(ptr); + const uint32_t seq = dbi_seq_next(env, dbi); + struct mdbx_defer_free_item *defer_item = env->me_dbxs[dbi].md_name.iov_base; + if (likely(defer_item)) { + env->me_db_flags[dbi] = 0; + env->me_dbxs[dbi].md_name.iov_len = 0; + env->me_dbxs[dbi].md_name.iov_base = nullptr; + atomic_store32(&env->me_dbi_seqs[dbi], seq, mo_AcquireRelease); + osal_flush_incoherent_cpu_writeback(); + defer_item->next = nullptr; - if (env->me_numdbs == dbi + 1) { - size_t i = 
env->me_numdbs; - do - --i; - while (i > CORE_DBS && !env->me_dbxs[i - 1].md_name.iov_base); - env->me_numdbs = (MDBX_dbi)i; + if (env->me_numdbs == dbi + 1) { + size_t i = env->me_numdbs; + do { + --i; + eASSERT(env, i >= CORE_DBS); + eASSERT(env, !env->me_db_flags[i] && !env->me_dbxs[i].md_name.iov_len && + !env->me_dbxs[i].md_name.iov_base); + } while (i > CORE_DBS && !env->me_dbxs[i - 1].md_name.iov_base); + env->me_numdbs = (unsigned)i; + } } - return MDBX_SUCCESS; + return defer_item; } int mdbx_dbi_close(MDBX_env *env, MDBX_dbi dbi) { @@ -27128,12 +28389,8 @@ int mdbx_dbi_close(MDBX_env *env, MDBX_dbi dbi) { return MDBX_BAD_DBI; rc = osal_fastmutex_acquire(&env->me_dbi_lock); - if (likely(rc == MDBX_SUCCESS)) { - rc = (dbi < env->me_maxdbs && (env->me_dbflags[dbi] & DB_VALID)) - ? dbi_close_locked(env, dbi) - : MDBX_BAD_DBI; - ENSURE(env, osal_fastmutex_release(&env->me_dbi_lock) == MDBX_SUCCESS); - } + if (likely(rc == MDBX_SUCCESS)) + rc = env_defer_free_and_release(env, dbi_close_locked(env, dbi)); return rc; } @@ -27146,12 +28403,13 @@ int mdbx_dbi_flags_ex(const MDBX_txn *txn, MDBX_dbi dbi, unsigned *flags, if (unlikely(!flags || !state)) return MDBX_EINVAL; - if (unlikely(!check_dbi(txn, dbi, DBI_VALID))) - return MDBX_BAD_DBI; + rc = dbi_check(txn, dbi); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; *flags = txn->mt_dbs[dbi].md_flags & DB_PERSISTENT_FLAGS; *state = - txn->mt_dbistate[dbi] & (DBI_FRESH | DBI_CREAT | DBI_DIRTY | DBI_STALE); + txn->mt_dbi_state[dbi] & (DBI_FRESH | DBI_CREAT | DBI_DIRTY | DBI_STALE); return MDBX_SUCCESS; } @@ -27251,7 +28509,7 @@ static int drop_tree(MDBX_cursor *mc, const bool may_have_subDBs) { return rc; } -int mdbx_drop(MDBX_txn *txn, MDBX_dbi dbi, bool del) { +__cold int mdbx_drop(MDBX_txn *txn, MDBX_dbi dbi, bool del) { int rc = check_txn_rw(txn, MDBX_TXN_BLOCKED); if (unlikely(rc != MDBX_SUCCESS)) return rc; @@ -27271,25 +28529,22 @@ int mdbx_drop(MDBX_txn *txn, MDBX_dbi dbi, bool del) { /* Can't delete the main 
DB */ if (del && dbi >= CORE_DBS) { - rc = delete (txn, MAIN_DBI, &mc->mc_dbx->md_name, NULL, F_SUBDATA); + rc = delete(txn, MAIN_DBI, &mc->mc_dbx->md_name, NULL, F_SUBDATA); if (likely(rc == MDBX_SUCCESS)) { - tASSERT(txn, txn->mt_dbistate[MAIN_DBI] & DBI_DIRTY); + tASSERT(txn, txn->mt_dbi_state[MAIN_DBI] & DBI_DIRTY); tASSERT(txn, txn->mt_flags & MDBX_TXN_DIRTY); - txn->mt_dbistate[dbi] = DBI_STALE; - MDBX_env *env = txn->mt_env; + txn->mt_dbi_state[dbi] = DBI_LINDO | DBI_OLDEN; + MDBX_env *const env = txn->mt_env; rc = osal_fastmutex_acquire(&env->me_dbi_lock); - if (unlikely(rc != MDBX_SUCCESS)) { - txn->mt_flags |= MDBX_TXN_ERROR; + if (likely(rc == MDBX_SUCCESS)) { + rc = env_defer_free_and_release(env, dbi_close_locked(env, dbi)); goto bailout; } - dbi_close_locked(env, dbi); - ENSURE(env, osal_fastmutex_release(&env->me_dbi_lock) == MDBX_SUCCESS); - } else { - txn->mt_flags |= MDBX_TXN_ERROR; } + txn->mt_flags |= MDBX_TXN_ERROR; } else { /* reset the DB record, mark it dirty */ - txn->mt_dbistate[dbi] |= DBI_DIRTY; + txn->mt_dbi_state[dbi] |= DBI_DIRTY; txn->mt_dbs[dbi].md_depth = 0; txn->mt_dbs[dbi].md_branch_pages = 0; txn->mt_dbs[dbi].md_leaf_pages = 0; @@ -27525,12 +28780,14 @@ __cold MDBX_INTERNAL_FUNC int cleanup_dead_readers(MDBX_env *env, return rc; } -__cold int mdbx_setup_debug(MDBX_log_level_t level, MDBX_debug_flags_t flags, - MDBX_debug_func *logger) { - const int rc = runtime_flags | (loglevel << 16); +__cold static int setup_debug(MDBX_log_level_t level, MDBX_debug_flags_t flags, + union logger_union logger, char *buffer, + size_t buffer_size) { + ENSURE(nullptr, osal_fastmutex_acquire(&debug_lock) == 0); + const int rc = mdbx_static.flags | (mdbx_static.loglevel << 16); if (level != MDBX_LOG_DONTCHANGE) - loglevel = (uint8_t)level; + mdbx_static.loglevel = (uint8_t)level; if (flags != MDBX_DBG_DONTCHANGE) { flags &= @@ -27539,14 +28796,37 @@ __cold int mdbx_setup_debug(MDBX_log_level_t level, MDBX_debug_flags_t flags, #endif MDBX_DBG_DUMP | 
MDBX_DBG_LEGACY_MULTIOPEN | MDBX_DBG_LEGACY_OVERLAP | MDBX_DBG_DONT_UPGRADE; - runtime_flags = (uint8_t)flags; + mdbx_static.flags = (uint8_t)flags; + } + + assert(MDBX_LOGGER_DONTCHANGE == ((MDBX_debug_func *)(intptr_t)-1)); + if (logger.ptr != (void *)((intptr_t)-1)) { + mdbx_static.logger.ptr = logger.ptr; + mdbx_static.logger_buffer = buffer; + mdbx_static.logger_buffer_size = buffer_size; } - if (logger != MDBX_LOGGER_DONTCHANGE) - debug_logger = logger; + ENSURE(nullptr, osal_fastmutex_release(&debug_lock) == 0); return rc; } +__cold int mdbx_setup_debug_nofmt(MDBX_log_level_t level, + MDBX_debug_flags_t flags, + MDBX_debug_func_nofmt *logger, char *buffer, + size_t buffer_size) { + union logger_union thunk; + thunk.nofmt = + (logger && buffer && buffer_size) ? logger : MDBX_LOGGER_NOFMT_DONTCHANGE; + return setup_debug(level, flags, thunk, buffer, buffer_size); +} + +__cold int mdbx_setup_debug(MDBX_log_level_t level, MDBX_debug_flags_t flags, + MDBX_debug_func *logger) { + union logger_union thunk; + thunk.fmt = logger; + return setup_debug(level, flags, thunk, nullptr, 0); +} + __cold static txnid_t kick_longlived_readers(MDBX_env *env, const txnid_t laggard) { DEBUG("DB size maxed out by reading #%" PRIaTXN, laggard); @@ -27699,12 +28979,12 @@ typedef struct mdbx_walk_ctx { bool mw_dont_check_keys_ordering; } mdbx_walk_ctx_t; -__cold static int walk_sdb(mdbx_walk_ctx_t *ctx, MDBX_db *const sdb, - const MDBX_val *name, int deep); +__cold static int walk_sdb(mdbx_walk_ctx_t *ctx, MDBX_walk_sdb_t *sdb, + int deep); static MDBX_page_type_t walk_page_type(const MDBX_page *mp) { if (mp) - switch (mp->mp_flags) { + switch (mp->mp_flags & ~P_SPILLED) { case P_BRANCH: return MDBX_page_branch; case P_LEAF: @@ -27713,15 +28993,13 @@ static MDBX_page_type_t walk_page_type(const MDBX_page *mp) { return MDBX_page_dupfixed_leaf; case P_OVERFLOW: return MDBX_page_large; - case P_META: - return MDBX_page_meta; } return MDBX_page_broken; } /* Depth-first tree traversal. 
*/ __cold static int walk_tree(mdbx_walk_ctx_t *ctx, const pgno_t pgno, - const MDBX_val *name, int deep, + MDBX_walk_sdb_t *sdb, int deep, txnid_t parent_txnid) { assert(pgno != P_INVALID); MDBX_page *mp = nullptr; @@ -27745,9 +29023,10 @@ __cold static int walk_tree(mdbx_walk_ctx_t *ctx, const pgno_t pgno, continue; } - MDBX_node *node = page_node(mp, i); + const MDBX_node *node = page_node(mp, i); + header_size += NODESIZE; const size_t node_key_size = node_ks(node); - payload_size += NODESIZE + node_key_size; + payload_size += node_key_size; if (type == MDBX_page_branch) { assert(i > 0 || node_ks(node) == 0); @@ -27780,7 +29059,7 @@ __cold static int walk_tree(mdbx_walk_ctx_t *ctx, const pgno_t pgno, pagesize = pgno2bytes(ctx->mw_txn->mt_env, npages); const size_t over_unused = pagesize - over_payload - over_header; const int rc = ctx->mw_visitor(large_pgno, npages, ctx->mw_user, deep, - name, pagesize, MDBX_page_large, err, 1, + sdb, pagesize, MDBX_page_large, err, 1, over_payload, over_header, over_unused); if (unlikely(rc != MDBX_SUCCESS)) return (rc == MDBX_RESULT_TRUE) ? 
MDBX_SUCCESS : rc; @@ -27789,8 +29068,9 @@ __cold static int walk_tree(mdbx_walk_ctx_t *ctx, const pgno_t pgno, } break; case F_SUBDATA /* sub-db */: { - const size_t namelen = node_key_size; - if (unlikely(namelen == 0 || node_data_size != sizeof(MDBX_db))) { + if (unlikely(node_data_size != sizeof(MDBX_db))) { + ERROR("%s/%d: %s %u", "MDBX_CORRUPTED", MDBX_CORRUPTED, + "invalid subDb node size", (unsigned)node_data_size); assert(err == MDBX_CORRUPTED); err = MDBX_CORRUPTED; } @@ -27800,6 +29080,8 @@ __cold static int walk_tree(mdbx_walk_ctx_t *ctx, const pgno_t pgno, case F_SUBDATA | F_DUPDATA /* dupsorted sub-tree */: if (unlikely(node_data_size != sizeof(MDBX_db))) { + ERROR("%s/%d: %s %u", "MDBX_CORRUPTED", MDBX_CORRUPTED, + "invalid sub-tree node size", (unsigned)node_data_size); assert(err == MDBX_CORRUPTED); err = MDBX_CORRUPTED; } @@ -27809,6 +29091,8 @@ __cold static int walk_tree(mdbx_walk_ctx_t *ctx, const pgno_t pgno, case F_DUPDATA /* short sub-page */: { if (unlikely(node_data_size <= PAGEHDRSZ || (node_data_size & 1))) { + ERROR("%s/%d: %s %u", "MDBX_CORRUPTED", MDBX_CORRUPTED, + "invalid sub-page node size", (unsigned)node_data_size); assert(err == MDBX_CORRUPTED); err = MDBX_CORRUPTED; break; @@ -27831,6 +29115,8 @@ __cold static int walk_tree(mdbx_walk_ctx_t *ctx, const pgno_t pgno, subtype = MDBX_subpage_dupfixed_leaf; break; default: + ERROR("%s/%d: %s 0x%x", "MDBX_CORRUPTED", MDBX_CORRUPTED, + "invalid sub-page flags", sp->mp_flags); assert(err == MDBX_CORRUPTED); subtype = MDBX_subpage_broken; err = MDBX_CORRUPTED; @@ -27848,6 +29134,8 @@ __cold static int walk_tree(mdbx_walk_ctx_t *ctx, const pgno_t pgno, subpayload_size += subnode_size; subalign_bytes += subnode_size & 1; if (unlikely(node_flags(subnode) != 0)) { + ERROR("%s/%d: %s 0x%x", "MDBX_CORRUPTED", MDBX_CORRUPTED, + "unexpected sub-node flags", node_flags(subnode)); assert(err == MDBX_CORRUPTED); err = MDBX_CORRUPTED; } @@ -27855,7 +29143,7 @@ __cold static int 
walk_tree(mdbx_walk_ctx_t *ctx, const pgno_t pgno, } const int rc = - ctx->mw_visitor(pgno, 0, ctx->mw_user, deep + 1, name, node_data_size, + ctx->mw_visitor(pgno, 0, ctx->mw_user, deep + 1, sdb, node_data_size, subtype, err, nsubkeys, subpayload_size, subheader_size, subunused_size + subalign_bytes); if (unlikely(rc != MDBX_SUCCESS)) @@ -27867,13 +29155,15 @@ __cold static int walk_tree(mdbx_walk_ctx_t *ctx, const pgno_t pgno, } break; default: + ERROR("%s/%d: %s 0x%x", "MDBX_CORRUPTED", MDBX_CORRUPTED, + "invalid node flags", node_flags(node)); assert(err == MDBX_CORRUPTED); err = MDBX_CORRUPTED; } } const int rc = ctx->mw_visitor( - pgno, 1, ctx->mw_user, deep, name, ctx->mw_txn->mt_env->me_psize, type, + pgno, 1, ctx->mw_user, deep, sdb, ctx->mw_txn->mt_env->me_psize, type, err, nentries, payload_size, header_size, unused_size + align_bytes); if (unlikely(rc != MDBX_SUCCESS)) return (rc == MDBX_RESULT_TRUE) ? MDBX_SUCCESS : rc; @@ -27885,7 +29175,7 @@ __cold static int walk_tree(mdbx_walk_ctx_t *ctx, const pgno_t pgno, MDBX_node *node = page_node(mp, i); if (type == MDBX_page_branch) { assert(err == MDBX_SUCCESS); - err = walk_tree(ctx, node_pgno(node), name, deep + 1, mp->mp_txnid); + err = walk_tree(ctx, node_pgno(node), sdb, deep + 1, mp->mp_txnid); if (unlikely(err != MDBX_SUCCESS)) { if (err == MDBX_RESULT_TRUE) break; @@ -27901,32 +29191,44 @@ __cold static int walk_tree(mdbx_walk_ctx_t *ctx, const pgno_t pgno, case F_SUBDATA /* sub-db */: if (unlikely(node_ds(node) != sizeof(MDBX_db))) { + ERROR("%s/%d: %s %u", "MDBX_CORRUPTED", MDBX_CORRUPTED, + "invalid sub-tree node size", (unsigned)node_ds(node)); assert(err == MDBX_CORRUPTED); err = MDBX_CORRUPTED; } else { - MDBX_db db; - memcpy(&db, node_data(node), sizeof(db)); - const MDBX_val subdb_name = {node_key(node), node_ks(node)}; + MDBX_db aligned_db; + memcpy(&aligned_db, node_data(node), sizeof(aligned_db)); + MDBX_walk_sdb_t sdb_info = { + {node_key(node), node_ks(node)}, nullptr, nullptr}; + 
sdb_info.internal = &aligned_db; assert(err == MDBX_SUCCESS); - err = walk_sdb(ctx, &db, &subdb_name, deep + 1); + err = walk_sdb(ctx, &sdb_info, deep + 1); } break; case F_SUBDATA | F_DUPDATA /* dupsorted sub-tree */: - if (unlikely(node_ds(node) != sizeof(MDBX_db) || - ctx->mw_cursor->mc_xcursor == NULL)) { + if (unlikely(node_ds(node) != sizeof(MDBX_db))) { + ERROR("%s/%d: %s %u", "MDBX_CORRUPTED", MDBX_CORRUPTED, + "invalid dupsort sub-tree node size", (unsigned)node_ds(node)); + assert(err == MDBX_CORRUPTED); + err = MDBX_CORRUPTED; + } else if (unlikely(!ctx->mw_cursor->mc_xcursor)) { + ERROR("%s/%d: %s", "MDBX_CORRUPTED", MDBX_CORRUPTED, + "unexpected dupsort sub-tree node for non-dupsort subDB"); assert(err == MDBX_CORRUPTED); err = MDBX_CORRUPTED; } else { - MDBX_db db; - memcpy(&db, node_data(node), sizeof(db)); + MDBX_db aligned_db; + memcpy(&aligned_db, node_data(node), sizeof(aligned_db)); assert(ctx->mw_cursor->mc_xcursor == &container_of(ctx->mw_cursor, MDBX_cursor_couple, outer)->inner); assert(err == MDBX_SUCCESS); err = cursor_xinit1(ctx->mw_cursor, node, mp); if (likely(err == MDBX_SUCCESS)) { ctx->mw_cursor = &ctx->mw_cursor->mc_xcursor->mx_cursor; - err = walk_tree(ctx, db.md_root, name, deep + 1, mp->mp_txnid); + sdb->nested = &aligned_db; + err = walk_tree(ctx, aligned_db.md_root, sdb, deep + 1, mp->mp_txnid); + sdb->nested = nullptr; MDBX_xcursor *inner_xcursor = container_of(ctx->mw_cursor, MDBX_xcursor, mx_cursor); MDBX_cursor_couple *couple = @@ -27941,15 +29243,16 @@ __cold static int walk_tree(mdbx_walk_ctx_t *ctx, const pgno_t pgno, return MDBX_SUCCESS; } -__cold static int walk_sdb(mdbx_walk_ctx_t *ctx, MDBX_db *const sdb, - const MDBX_val *name, int deep) { - if (unlikely(sdb->md_root == P_INVALID)) +__cold static int walk_sdb(mdbx_walk_ctx_t *ctx, MDBX_walk_sdb_t *sdb, + int deep) { + struct MDBX_db *const db = sdb->internal; + if (unlikely(db->md_root == P_INVALID)) return MDBX_SUCCESS; /* empty db */ MDBX_cursor_couple couple; 
MDBX_dbx dbx = {.md_klen_min = INT_MAX}; - uint8_t dbistate = DBI_VALID | DBI_AUDITED; - int rc = couple_init(&couple, ~0u, ctx->mw_txn, sdb, &dbx, &dbistate); + uint8_t dbi_state = DBI_LINDO | DBI_VALID; + int rc = couple_init(&couple, ~0u, ctx->mw_txn, db, &dbx, &dbi_state); if (unlikely(rc != MDBX_SUCCESS)) return rc; @@ -27961,8 +29264,8 @@ __cold static int walk_sdb(mdbx_walk_ctx_t *ctx, MDBX_db *const sdb, : CC_PAGECHECK; couple.outer.mc_next = ctx->mw_cursor; ctx->mw_cursor = &couple.outer; - rc = walk_tree(ctx, sdb->md_root, name, deep, - sdb->md_mod_txnid ? sdb->md_mod_txnid : ctx->mw_txn->mt_txnid); + rc = walk_tree(ctx, db->md_root, sdb, deep, + db->md_mod_txnid ? db->md_mod_txnid : ctx->mw_txn->mt_txnid); ctx->mw_cursor = couple.outer.mc_next; return rc; } @@ -27980,15 +29283,13 @@ __cold int mdbx_env_pgwalk(MDBX_txn *txn, MDBX_pgvisitor_func *visitor, ctx.mw_visitor = visitor; ctx.mw_dont_check_keys_ordering = dont_check_keys_ordering; - rc = visitor(0, NUM_METAS, user, 0, MDBX_PGWALK_META, - pgno2bytes(txn->mt_env, NUM_METAS), MDBX_page_meta, MDBX_SUCCESS, - NUM_METAS, sizeof(MDBX_meta) * NUM_METAS, PAGEHDRSZ * NUM_METAS, - (txn->mt_env->me_psize - sizeof(MDBX_meta) - PAGEHDRSZ) * - NUM_METAS); - if (!MDBX_IS_ERROR(rc)) - rc = walk_sdb(&ctx, &txn->mt_dbs[FREE_DBI], MDBX_PGWALK_GC, 0); - if (!MDBX_IS_ERROR(rc)) - rc = walk_sdb(&ctx, &txn->mt_dbs[MAIN_DBI], MDBX_PGWALK_MAIN, 0); + MDBX_walk_sdb_t sdb = {{MDBX_CHK_GC, 0}, &txn->mt_dbs[FREE_DBI], nullptr}; + rc = walk_sdb(&ctx, &sdb, 0); + if (!MDBX_IS_ERROR(rc)) { + sdb.name.iov_base = MDBX_CHK_MAIN; + sdb.internal = &txn->mt_dbs[MAIN_DBI]; + rc = walk_sdb(&ctx, &sdb, 0); + } return rc; } @@ -28042,6 +29343,29 @@ int mdbx_cursor_on_first(const MDBX_cursor *mc) { return MDBX_RESULT_TRUE; } +int mdbx_cursor_on_first_dup(const MDBX_cursor *mc) { + if (unlikely(mc == NULL)) + return MDBX_EINVAL; + + if (unlikely(mc->mc_signature != MDBX_MC_LIVE)) + return (mc->mc_signature == MDBX_MC_READY4CLOSE) ? 
MDBX_EINVAL + : MDBX_EBADSIGN; + + if (!(mc->mc_flags & C_INITIALIZED)) + return MDBX_RESULT_TRUE; + + if (!mc->mc_xcursor) + return MDBX_RESULT_TRUE; + + mc = &mc->mc_xcursor->mx_cursor; + for (size_t i = 0; i < mc->mc_snum; ++i) { + if (mc->mc_ki[i]) + return MDBX_RESULT_FALSE; + } + + return MDBX_RESULT_TRUE; +} + int mdbx_cursor_on_last(const MDBX_cursor *mc) { if (unlikely(mc == NULL)) return MDBX_EINVAL; @@ -28062,6 +29386,30 @@ int mdbx_cursor_on_last(const MDBX_cursor *mc) { return MDBX_RESULT_TRUE; } +int mdbx_cursor_on_last_dup(const MDBX_cursor *mc) { + if (unlikely(mc == NULL)) + return MDBX_EINVAL; + + if (unlikely(mc->mc_signature != MDBX_MC_LIVE)) + return (mc->mc_signature == MDBX_MC_READY4CLOSE) ? MDBX_EINVAL + : MDBX_EBADSIGN; + + if (!(mc->mc_flags & C_INITIALIZED)) + return MDBX_RESULT_FALSE; + + if (!mc->mc_xcursor) + return MDBX_RESULT_TRUE; + + mc = &mc->mc_xcursor->mx_cursor; + for (size_t i = 0; i < mc->mc_snum; ++i) { + size_t nkeys = page_numkeys(mc->mc_pg[i]); + if (mc->mc_ki[i] < nkeys - 1) + return MDBX_RESULT_FALSE; + } + + return MDBX_RESULT_TRUE; +} + int mdbx_cursor_eof(const MDBX_cursor *mc) { if (unlikely(mc == NULL)) return MDBX_EINVAL; @@ -28335,9 +29683,6 @@ int mdbx_estimate_range(const MDBX_txn *txn, MDBX_dbi dbi, if (unlikely(begin_key == MDBX_EPSILON && end_key == MDBX_EPSILON)) return MDBX_EINVAL; - if (unlikely(!check_dbi(txn, dbi, DBI_USRVALID))) - return MDBX_BAD_DBI; - MDBX_cursor_couple begin; /* LY: first, initialize cursor to refresh a DB in case it have DB_STALE */ rc = cursor_init(&begin.outer, txn, dbi); @@ -28516,7 +29861,7 @@ int mdbx_replace_ex(MDBX_txn *txn, MDBX_dbi dbi, const MDBX_val *key, (flags & (MDBX_CURRENT | MDBX_RESERVE)) != MDBX_CURRENT)) return MDBX_EINVAL; - if (unlikely(!check_dbi(txn, dbi, DBI_USRVALID))) + if (unlikely(dbi <= FREE_DBI)) return MDBX_BAD_DBI; if (unlikely(flags & @@ -28696,10 +30041,11 @@ int mdbx_dbi_sequence(MDBX_txn *txn, MDBX_dbi dbi, uint64_t *result, if (unlikely(rc != 
MDBX_SUCCESS)) return rc; - if (unlikely(!check_dbi(txn, dbi, DBI_USRVALID))) - return MDBX_BAD_DBI; + rc = dbi_check(txn, dbi); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; - if (unlikely(txn->mt_dbistate[dbi] & DBI_STALE)) { + if (unlikely(txn->mt_dbi_state[dbi] & DBI_STALE)) { rc = fetch_sdb(txn, dbi); if (unlikely(rc != MDBX_SUCCESS)) return rc; @@ -28710,7 +30056,7 @@ int mdbx_dbi_sequence(MDBX_txn *txn, MDBX_dbi dbi, uint64_t *result, *result = dbs->md_seq; if (likely(increment > 0)) { - if (unlikely(txn->mt_flags & MDBX_TXN_RDONLY)) + if (unlikely(dbi == FREE_DBI || (txn->mt_flags & MDBX_TXN_RDONLY) != 0)) return MDBX_EACCESS; uint64_t new = dbs->md_seq + increment; @@ -28720,7 +30066,7 @@ int mdbx_dbi_sequence(MDBX_txn *txn, MDBX_dbi dbi, uint64_t *result, tASSERT(txn, new > dbs->md_seq); dbs->md_seq = new; txn->mt_flags |= MDBX_TXN_DIRTY; - txn->mt_dbistate[dbi] |= DBI_DIRTY; + txn->mt_dbi_state[dbi] |= DBI_DIRTY; } return MDBX_SUCCESS; @@ -29005,7 +30351,7 @@ __cold int mdbx_env_set_option(MDBX_env *env, const MDBX_option_t option, return err; const bool lock_needed = ((env->me_flags & MDBX_ENV_ACTIVE) && env->me_txn0 && - env->me_txn0->mt_owner != osal_thread_self()); + !env_txn0_owned(env)); bool should_unlock = false; switch (option) { case MDBX_opt_sync_bytes: @@ -29080,7 +30426,7 @@ __cold int mdbx_env_set_option(MDBX_env *env, const MDBX_option_t option, return MDBX_EINVAL; if (env->me_options.dp_reserve_limit != (unsigned)value) { if (lock_needed) { - err = mdbx_txn_lock(env, false); + err = osal_txn_lock(env, false); if (unlikely(err != MDBX_SUCCESS)) return err; should_unlock = true; @@ -29111,6 +30457,23 @@ __cold int mdbx_env_set_option(MDBX_env *env, const MDBX_option_t option, } break; + case MDBX_opt_gc_time_limit: + if (value == /* default */ UINT64_MAX) + value = 0; + if (unlikely(value > UINT32_MAX)) + return MDBX_EINVAL; + if (unlikely(env->me_flags & MDBX_RDONLY)) + return MDBX_EACCESS; + value = 
osal_16dot16_to_monotime((uint32_t)value); + if (value != env->me_options.gc_time_limit) { + if (env->me_txn && lock_needed) + return MDBX_EPERM; + env->me_options.gc_time_limit = value; + if (!env->me_options.flags.non_auto.rp_augment_limit) + env->me_options.rp_augment_limit = default_rp_augment_limit(env); + } + break; + case MDBX_opt_txn_dp_limit: case MDBX_opt_txn_dp_initial: if (value == /* default */ UINT64_MAX) @@ -29120,7 +30483,7 @@ __cold int mdbx_env_set_option(MDBX_env *env, const MDBX_option_t option, if (unlikely(env->me_flags & MDBX_RDONLY)) return MDBX_EACCESS; if (lock_needed) { - err = mdbx_txn_lock(env, false); + err = osal_txn_lock(env, false); if (unlikely(err != MDBX_SUCCESS)) return err; should_unlock = true; @@ -29220,7 +30583,7 @@ __cold int mdbx_env_set_option(MDBX_env *env, const MDBX_option_t option, } if (should_unlock) - mdbx_txn_unlock(env); + osal_txn_unlock(env); return err; } @@ -29263,6 +30626,10 @@ __cold int mdbx_env_get_option(const MDBX_env *env, const MDBX_option_t option, *pvalue = env->me_options.rp_augment_limit; break; + case MDBX_opt_gc_time_limit: + *pvalue = osal_monotime_to_16dot16(env->me_options.gc_time_limit); + break; + case MDBX_opt_txn_dp_limit: *pvalue = env->me_options.dp_limit; break; @@ -29550,14 +30917,42 @@ __cold int mdbx_env_warmup(const MDBX_env *env, const MDBX_txn *txn, return rc; } +#if !defined(_WIN32) && !defined(_WIN64) +__cold static void rthc_afterfork(void) { + NOTICE("drown %d rthc entries", rthc_count); + for (size_t i = 0; i < rthc_count; ++i) { + MDBX_env *const env = rthc_table[i].env; + NOTICE("drown env %p", __Wpedantic_format_voidptr(env)); + if (env->me_lck_mmap.lck) + osal_munmap(&env->me_lck_mmap); + if (env->me_map) { + osal_munmap(&env->me_dxb_mmap); +#ifdef ENABLE_MEMCHECK + VALGRIND_DISCARD(env->me_valgrind_handle); + env->me_valgrind_handle = -1; +#endif /* ENABLE_MEMCHECK */ + } + env->me_lck = lckless_stub(env); + rthc_drown(env); + } + if (rthc_table != rthc_table_static) + 
osal_free(rthc_table); + rthc_count = 0; + rthc_table = rthc_table_static; + rthc_limit = RTHC_INITIAL_LIMIT; + rthc_pending.weak = 0; +} +#endif /* ! Windows */ + __cold void global_ctor(void) { + ENSURE(nullptr, osal_fastmutex_init(&debug_lock) == 0); osal_ctor(); rthc_limit = RTHC_INITIAL_LIMIT; rthc_table = rthc_table_static; #if defined(_WIN32) || defined(_WIN64) InitializeCriticalSection(&rthc_critical_section); - InitializeCriticalSection(&lcklist_critical_section); #else + ENSURE(nullptr, pthread_atfork(nullptr, nullptr, rthc_afterfork) == 0); ENSURE(nullptr, pthread_key_create(&rthc_key, thread_dtor) == 0); TRACE("pid %d, &mdbx_rthc_key = %p, value 0x%x", osal_getpid(), __Wpedantic_format_voidptr(&rthc_key), (unsigned)rthc_key); @@ -29743,6 +31138,2139 @@ mdbx_key_from_int32(const int32_t i32) { #endif /* LIBMDBX_NO_EXPORTS_LEGACY_API */ +/*------------------------------------------------------------------------------ + * Locking API */ + +int mdbx_txn_lock(MDBX_env *env, bool dont_wait) { + int rc = check_env(env, true); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; + + if (unlikely(env->me_flags & MDBX_RDONLY)) + return MDBX_EACCESS; + if (unlikely(env->me_txn0->mt_owner || + (env->me_txn0->mt_flags & MDBX_TXN_FINISHED) == 0)) + return MDBX_BUSY; + + return osal_txn_lock(env, dont_wait); +} + +int mdbx_txn_unlock(MDBX_env *env) { + int rc = check_env(env, true); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; + + if (unlikely(env->me_flags & MDBX_RDONLY)) + return MDBX_EACCESS; + if (unlikely(env->me_txn0->mt_owner != osal_thread_self())) + return MDBX_THREAD_MISMATCH; + if (unlikely((env->me_txn0->mt_flags & MDBX_TXN_FINISHED) == 0)) + return MDBX_BUSY; + + osal_txn_unlock(env); + return MDBX_SUCCESS; +} + +/******************************************************************************* + * Checking API */ + +typedef struct MDBX_chk_internal { + MDBX_chk_context_t *usr; + const struct MDBX_chk_callbacks *cb; + uint64_t monotime_timeout; + + size_t 
*problem_counter; + uint8_t flags; + bool got_break; + bool write_locked; + uint8_t scope_depth; + + MDBX_chk_subdb_t subdb_gc, subdb_main; + int16_t *pagemap; + MDBX_chk_subdb_t *last_lookup; + const void *last_nested; + MDBX_chk_scope_t scope_stack[12]; + MDBX_chk_subdb_t *subdb[MDBX_MAX_DBI + CORE_DBS]; + + MDBX_envinfo envinfo; + meta_troika_t troika; + MDBX_val v2a_buf; +} MDBX_chk_internal_t; + +__cold static int chk_check_break(MDBX_chk_scope_t *const scope) { + MDBX_chk_internal_t *const chk = scope->internal; + return (chk->got_break || (chk->cb->check_break && + (chk->got_break = chk->cb->check_break(chk->usr)))) + ? MDBX_RESULT_TRUE + : MDBX_RESULT_FALSE; +} + +__cold static void chk_line_end(MDBX_chk_line_t *line) { + if (likely(line)) { + MDBX_chk_internal_t *chk = line->ctx->internal; + assert(line->begin <= line->end && line->begin <= line->out && + line->out <= line->end); + if (likely(chk->cb->print_done)) + chk->cb->print_done(line); + } +} + +__cold __must_check_result static MDBX_chk_line_t * +chk_line_begin(MDBX_chk_scope_t *const scope, enum MDBX_chk_severity severity) { + MDBX_chk_internal_t *const chk = scope->internal; + if (severity < MDBX_chk_warning) + mdbx_env_chk_encount_problem(chk->usr); + MDBX_chk_line_t *line = nullptr; + if (likely(chk->cb->print_begin)) { + line = chk->cb->print_begin(chk->usr, severity); + if (likely(line)) { + assert(line->ctx == nullptr || (line->ctx == chk->usr && line->empty)); + assert(line->begin <= line->end && line->begin <= line->out && + line->out <= line->end); + line->ctx = chk->usr; + } + } + return line; +} + +__cold static MDBX_chk_line_t *chk_line_feed(MDBX_chk_line_t *line) { + if (likely(line)) { + MDBX_chk_internal_t *chk = line->ctx->internal; + enum MDBX_chk_severity severity = line->severity; + chk_line_end(line); + line = chk_line_begin(chk->usr->scope, severity); + } + return line; +} + +__cold static MDBX_chk_line_t *chk_flush(MDBX_chk_line_t *line) { + if (likely(line)) { + 
MDBX_chk_internal_t *chk = line->ctx->internal; + assert(line->begin <= line->end && line->begin <= line->out && + line->out <= line->end); + if (likely(chk->cb->print_flush)) { + chk->cb->print_flush(line); + assert(line->begin <= line->end && line->begin <= line->out && + line->out <= line->end); + line->out = line->begin; + } + } + return line; +} + +__cold static size_t chk_print_wanna(MDBX_chk_line_t *line, size_t need) { + if (likely(line && need)) { + size_t have = line->end - line->out; + assert(line->begin <= line->end && line->begin <= line->out && + line->out <= line->end); + if (need > have) { + line = chk_flush(line); + have = line->end - line->out; + } + return (need < have) ? need : have; + } + return 0; +} + +__cold static MDBX_chk_line_t *chk_puts(MDBX_chk_line_t *line, + const char *str) { + if (likely(line && str && *str)) { + MDBX_chk_internal_t *chk = line->ctx->internal; + size_t left = strlen(str); + assert(line->begin <= line->end && line->begin <= line->out && + line->out <= line->end); + if (chk->cb->print_chars) { + chk->cb->print_chars(line, str, left); + assert(line->begin <= line->end && line->begin <= line->out && + line->out <= line->end); + } else + do { + size_t chunk = chk_print_wanna(line, left); + assert(chunk <= left); + if (unlikely(!chunk)) + break; + memcpy(line->out, str, chunk); + line->out += chunk; + assert(line->begin <= line->end && line->begin <= line->out && + line->out <= line->end); + str += chunk; + left -= chunk; + } while (left); + line->empty = false; + } + return line; +} + +__cold static MDBX_chk_line_t *chk_print_va(MDBX_chk_line_t *line, + const char *fmt, va_list args) { + if (likely(line)) { + MDBX_chk_internal_t *chk = line->ctx->internal; + assert(line->begin <= line->end && line->begin <= line->out && + line->out <= line->end); + if (chk->cb->print_format) { + chk->cb->print_format(line, fmt, args); + assert(line->begin <= line->end && line->begin <= line->out && + line->out <= line->end); + } else { + 
va_list ones; + va_copy(ones, args); + const int needed = vsnprintf(nullptr, 0, fmt, ones); + va_end(ones); + if (likely(needed > 0)) { + const size_t have = chk_print_wanna(line, (size_t)needed + 1); /* one extra byte: vsnprintf() always stores a terminating NUL */ + if (likely(have > 0)) { + int written = vsnprintf(line->out, have, fmt, args); /* returns the untruncated length, which may be >= have */ + if (likely(written > 0)) + line->out += ((size_t)written < have) ? (size_t)written : have - 1; /* advance only over the characters actually stored */ + assert(line->begin <= line->end && line->begin <= line->out && + line->out <= line->end); + } + } + } + line->empty = false; + } + return line; +} +/* Printf-style wrapper over chk_print_va(); a null line is passed through untouched. */ +__cold static MDBX_chk_line_t *MDBX_PRINTF_ARGS(2, 3) + chk_print(MDBX_chk_line_t *line, const char *fmt, ...) { + if (likely(line)) { + // MDBX_chk_internal_t *chk = line->ctx->internal; + va_list args; + va_start(args, fmt); + line = chk_print_va(line, fmt, args); + va_end(args); + line->empty = false; + } + return line; +} + +__cold static MDBX_chk_line_t *chk_print_size(MDBX_chk_line_t *line, + const char *prefix, + const uint64_t value, + const char *suffix) { + static const char sf[] = + "KMGTPEZY"; /* LY: Kilo, Mega, Giga, Tera, Peta, Exa, Zetta, Yotta! */ + if (likely(line)) { + MDBX_chk_internal_t *chk = line->ctx->internal; + prefix = prefix ? prefix : ""; + suffix = suffix ? 
suffix : ""; + if (chk->cb->print_size) + chk->cb->print_size(line, prefix, value, suffix); + else + for (unsigned i = 0;; ++i) { + const unsigned scale = 10 + i * 10; + const uint64_t rounded = value + (UINT64_C(5) << (scale - 10)); + const uint64_t integer = rounded >> scale; + const uint64_t fractional = + (rounded - (integer << scale)) * 100u >> scale; + if ((rounded >> scale) <= 1000) + return chk_print(line, "%s%" PRIu64 " (%u.%02u %ciB)%s", prefix, + value, (unsigned)integer, (unsigned)fractional, + sf[i], suffix); + } + line->empty = false; + } + return line; +} + +__cold static int chk_error_rc(MDBX_chk_scope_t *const scope, int err, + const char *subj) { + MDBX_chk_line_t *line = chk_line_begin(scope, MDBX_chk_error); + if (line) + chk_line_end(chk_flush(chk_print(line, "%s() failed, error %s (%d)", subj, + mdbx_strerror(err), err))); + else + debug_log(MDBX_LOG_ERROR, "mdbx_env_chk", 0, "%s() failed, error %s (%d)", + subj, mdbx_strerror(err), err); + return err; +} + +__cold static void MDBX_PRINTF_ARGS(5, 6) + chk_object_issue(MDBX_chk_scope_t *const scope, const char *object, + uint64_t entry_number, const char *caption, + const char *extra_fmt, ...) 
{ + MDBX_chk_internal_t *const chk = scope->internal; + MDBX_chk_issue_t *issue = chk->usr->scope->issues; + while (issue) { + if (issue->caption == caption) { + issue->count += 1; + break; + } else + issue = issue->next; + } + const bool fresh = issue == nullptr; + if (fresh) { + issue = osal_malloc(sizeof(*issue)); + if (likely(issue)) { + issue->caption = caption; + issue->count = 1; + issue->next = chk->usr->scope->issues; + chk->usr->scope->issues = issue; + } else + chk_error_rc(scope, ENOMEM, "adding issue"); + } + + va_list args; + va_start(args, extra_fmt); + if (chk->cb->issue) { + mdbx_env_chk_encount_problem(chk->usr); + chk->cb->issue(chk->usr, object, entry_number, caption, extra_fmt, args); + } else { + MDBX_chk_line_t *line = chk_line_begin(scope, MDBX_chk_error); + if (entry_number != UINT64_MAX) + chk_print(line, "%s #%" PRIu64 ": %s", object, entry_number, caption); + else + chk_print(line, "%s: %s", object, caption); + if (extra_fmt) + chk_puts(chk_print_va(chk_puts(line, " ("), extra_fmt, args), ")"); + chk_line_end(fresh ? chk_flush(line) : line); + } + va_end(args); +} + +__cold static void MDBX_PRINTF_ARGS(2, 3) + chk_scope_issue(MDBX_chk_scope_t *const scope, const char *fmt, ...) { + MDBX_chk_internal_t *const chk = scope->internal; + va_list args; + va_start(args, fmt); + if (likely(chk->cb->issue)) { + mdbx_env_chk_encount_problem(chk->usr); + chk->cb->issue(chk->usr, nullptr, 0, nullptr, fmt, args); + } else + chk_line_end( + chk_print_va(chk_line_begin(scope, MDBX_chk_error), fmt, args)); + va_end(args); +} + +__cold static int chk_scope_end(MDBX_chk_internal_t *chk, int err) { + assert(chk->scope_depth > 0); + MDBX_chk_scope_t *const inner = chk->scope_stack + chk->scope_depth; + MDBX_chk_scope_t *const outer = chk->scope_depth ? 
inner - 1 : nullptr; + if (!outer || outer->stage != inner->stage) { + if (err == MDBX_SUCCESS && *chk->problem_counter) + err = MDBX_PROBLEM; + else if (*chk->problem_counter == 0 && MDBX_IS_ERROR(err)) + *chk->problem_counter = 1; + if (chk->problem_counter != &chk->usr->result.total_problems) { + chk->usr->result.total_problems += *chk->problem_counter; + chk->problem_counter = &chk->usr->result.total_problems; + } + if (chk->cb->stage_end) + err = chk->cb->stage_end(chk->usr, inner->stage, err); + } + if (chk->cb->scope_conclude) + err = chk->cb->scope_conclude(chk->usr, outer, inner, err); + chk->usr->scope = outer; + chk->usr->scope_nesting = chk->scope_depth -= 1; + if (outer) + outer->subtotal_issues += inner->subtotal_issues; + if (chk->cb->scope_pop) + chk->cb->scope_pop(chk->usr, outer, inner); + + while (inner->issues) { + MDBX_chk_issue_t *next = inner->issues->next; + osal_free(inner->issues); + inner->issues = next; + } + memset(inner, -1, sizeof(*inner)); + return err; +} + +__cold static int chk_scope_begin_args(MDBX_chk_internal_t *chk, + int verbosity_adjustment, + enum MDBX_chk_stage stage, + const void *object, size_t *problems, + const char *fmt, va_list args) { + if (unlikely(chk->scope_depth + 1u >= ARRAY_LENGTH(chk->scope_stack))) + return MDBX_BACKLOG_DEPLETED; + + MDBX_chk_scope_t *const outer = chk->scope_stack + chk->scope_depth; + const int verbosity = + outer->verbosity + + (verbosity_adjustment - 1) * (1 << MDBX_chk_severity_prio_shift); + MDBX_chk_scope_t *const inner = outer + 1; + memset(inner, 0, sizeof(*inner)); + inner->internal = outer->internal; + inner->stage = stage ? stage : (stage = outer->stage); + inner->object = object; + inner->verbosity = (verbosity < MDBX_chk_warning) + ? 
MDBX_chk_warning + : (enum MDBX_chk_severity)verbosity; + if (problems) + chk->problem_counter = problems; + else if (!chk->problem_counter || outer->stage != stage) + chk->problem_counter = &chk->usr->result.total_problems; + + if (chk->cb->scope_push) { + const int err = chk->cb->scope_push(chk->usr, outer, inner, fmt, args); + if (unlikely(err != MDBX_SUCCESS)) + return err; + } + chk->usr->scope = inner; + chk->usr->scope_nesting = chk->scope_depth += 1; + + if (stage != outer->stage && chk->cb->stage_begin) { + int err = chk->cb->stage_begin(chk->usr, stage); + if (unlikely(err != MDBX_SUCCESS)) { + err = chk_scope_end(chk, err); + assert(err != MDBX_SUCCESS); + return err ? err : MDBX_RESULT_TRUE; + } + } + return MDBX_SUCCESS; +} + +__cold static int MDBX_PRINTF_ARGS(6, 7) + chk_scope_begin(MDBX_chk_internal_t *chk, int verbosity_adjustment, + enum MDBX_chk_stage stage, const void *object, + size_t *problems, const char *fmt, ...) { + va_list args; + va_start(args, fmt); + int rc = chk_scope_begin_args(chk, verbosity_adjustment, stage, object, + problems, fmt, args); + va_end(args); + return rc; +} + +__cold static int chk_scope_restore(MDBX_chk_scope_t *const target, int err) { + MDBX_chk_internal_t *const chk = target->internal; + assert(target <= chk->usr->scope); + while (chk->usr->scope > target) + err = chk_scope_end(chk, err); + return err; +} + +__cold void chk_scope_pop(MDBX_chk_scope_t *const inner) { + if (inner && inner > inner->internal->scope_stack) + chk_scope_restore(inner - 1, MDBX_SUCCESS); +} + +__cold static MDBX_chk_scope_t *MDBX_PRINTF_ARGS(3, 4) + chk_scope_push(MDBX_chk_scope_t *const scope, int verbosity_adjustment, + const char *fmt, ...) { + chk_scope_restore(scope, MDBX_SUCCESS); + va_list args; + va_start(args, fmt); + int err = chk_scope_begin_args(scope->internal, verbosity_adjustment, + scope->stage, nullptr, nullptr, fmt, args); + va_end(args); + return err ? 
nullptr : scope + 1; +} +/* Renders a key or name as a C string: the @MAIN/@GC/@META sentinels map to fixed labels, anything else is formatted into chk->v2a_buf (plain, quoted with escapes, or hex), so the returned pointer is only good until the next call. NOTE(review): the angle-bracket placeholder literals below were missing from this copy of the patch and are reconstructed; confirm against upstream. */ +__cold static const char *chk_v2a(MDBX_chk_internal_t *chk, + const MDBX_val *val) { + if (val == MDBX_CHK_MAIN) + return "@MAIN"; + if (val == MDBX_CHK_GC) + return "@GC"; + if (val == MDBX_CHK_META) + return "@META"; + + const unsigned char *const data = val->iov_base; + const size_t len = val->iov_len; + if (data == MDBX_CHK_MAIN) + return "@MAIN"; + if (data == MDBX_CHK_GC) + return "@GC"; + if (data == MDBX_CHK_META) + return "@META"; + + if (!len) + return "<zero-length>"; + if (!data) + return "<nullptr>"; + if (len > 65536) { + const size_t enough = 42; + if (chk->v2a_buf.iov_len < enough) { + void *ptr = osal_realloc(chk->v2a_buf.iov_base, enough); + if (unlikely(!ptr)) + return "<out-of-memory>"; + chk->v2a_buf.iov_base = ptr; + chk->v2a_buf.iov_len = enough; + } + snprintf(chk->v2a_buf.iov_base, chk->v2a_buf.iov_len, + "<too-long.%zu>", len); + return chk->v2a_buf.iov_base; + } + + bool printable = true; + bool quoting = false; + size_t xchars = 0; + for (size_t i = 0; i < len && printable; ++i) { + quoting = quoting || !(data[i] == '_' || isalnum(data[i])); + printable = + isprint(data[i]) || (data[i] < ' ' && ++xchars < 4 && len > xchars * 4); + } + + size_t need = len + 1; + if (quoting || !printable) + need += len + /* quotes */ 2 + 2 * /* max xchars */ 4; + if (need > chk->v2a_buf.iov_len) { + void *ptr = osal_realloc(chk->v2a_buf.iov_base, need); + if (unlikely(!ptr)) + return "<out-of-memory>"; + chk->v2a_buf.iov_base = ptr; + chk->v2a_buf.iov_len = need; + } + + static const char hex[] = "0123456789abcdef"; + char *w = chk->v2a_buf.iov_base; + if (!quoting) { + memcpy(w, data, len); + w += len; + } else if (printable) { + *w++ = '\''; + for (size_t i = 0; i < len; ++i) { + if (data[i] < ' ') { + assert((char *)chk->v2a_buf.iov_base + chk->v2a_buf.iov_len > w + 4); + w[0] = '\\'; + w[1] = 'x'; + w[2] = hex[data[i] >> 4]; + w[3] = hex[data[i] & 15]; + w += 4; + } else if (strchr("\"'`\\", data[i])) { + assert((char *)chk->v2a_buf.iov_base + chk->v2a_buf.iov_len > w + 2); + w[0] = '\\'; 
+ w[1] = data[i]; + w += 2; + } else { + assert((char *)chk->v2a_buf.iov_base + chk->v2a_buf.iov_len > w + 1); + *w++ = data[i]; + } + } + *w++ = '\''; + } else { + *w++ = '\\'; + *w++ = 'x'; + for (size_t i = 0; i < len; ++i) { + assert((char *)chk->v2a_buf.iov_base + chk->v2a_buf.iov_len > w + 2); + w[0] = hex[data[i] >> 4]; + w[1] = hex[data[i] & 15]; + w += 2; + } + } + assert((char *)chk->v2a_buf.iov_base + chk->v2a_buf.iov_len > w); + *w = 0; + return chk->v2a_buf.iov_base; +} + +__cold static void chk_dispose(MDBX_chk_internal_t *chk) { + assert(chk->subdb[FREE_DBI] == &chk->subdb_gc); + assert(chk->subdb[MAIN_DBI] == &chk->subdb_main); + for (size_t i = 0; i < ARRAY_LENGTH(chk->subdb); ++i) { + MDBX_chk_subdb_t *const sdb = chk->subdb[i]; + if (sdb) { + chk->subdb[i] = nullptr; + if (chk->cb->subdb_dispose && sdb->cookie) { + chk->cb->subdb_dispose(chk->usr, sdb); + sdb->cookie = nullptr; + } + if (sdb != &chk->subdb_gc && sdb != &chk->subdb_main) { + osal_free(sdb); + } + } + } + osal_free(chk->v2a_buf.iov_base); + osal_free(chk->pagemap); + chk->usr->internal = nullptr; + chk->usr->scope = nullptr; + chk->pagemap = nullptr; + memset(chk, 0xDD, sizeof(*chk)); + osal_free(chk); +} + +static size_t div_8s(size_t numerator, size_t divider) { + assert(numerator <= (SIZE_MAX >> 8)); + return (numerator << 8) / divider; +} + +static size_t mul_8s(size_t quotient, size_t multiplier) { + size_t hi = multiplier * (quotient >> 8); + size_t lo = multiplier * (quotient & 255) + 128; + return hi + (lo >> 8); +} + +static void histogram_reduce(struct MDBX_chk_histogram *p) { + const size_t size = ARRAY_LENGTH(p->ranges), last = size - 1; + // ищем пару для слияния с минимальной ошибкой + size_t min_err = SIZE_MAX, min_i = last - 1; + for (size_t i = 0; i < last; ++i) { + const size_t b1 = p->ranges[i].begin, e1 = p->ranges[i].end, + s1 = p->ranges[i].amount; + const size_t b2 = p->ranges[i + 1].begin, e2 = p->ranges[i + 1].end, + s2 = p->ranges[i + 1].amount; + const 
size_t l1 = e1 - b1, l2 = e2 - b2, lx = e2 - b1, sx = s1 + s2; + assert(s1 > 0 && b1 > 0 && b1 < e1); + assert(s2 > 0 && b2 > 0 && b2 < e2); + assert(e1 <= b2); + // за ошибку принимаем площадь изменений на гистограмме при слиянии + const size_t h1 = div_8s(s1, l1), h2 = div_8s(s2, l2), hx = div_8s(sx, lx); + const size_t d1 = mul_8s((h1 > hx) ? h1 - hx : hx - h1, l1); + const size_t d2 = mul_8s((h2 > hx) ? h2 - hx : hx - h2, l2); + const size_t dx = mul_8s(hx, b2 - e1); + const size_t err = d1 + d2 + dx; + if (min_err >= err) { + min_i = i; + min_err = err; + } + } + // объединяем + p->ranges[min_i].end = p->ranges[min_i + 1].end; + p->ranges[min_i].amount += p->ranges[min_i + 1].amount; + p->ranges[min_i].count += p->ranges[min_i + 1].count; + if (min_i < last) + // перемещаем хвост + memmove(p->ranges + min_i, p->ranges + min_i + 1, + (last - min_i) * sizeof(p->ranges[0])); + // обнуляем последний элемент и продолжаем + p->ranges[last].count = 0; +} + +static void histogram_acc(const size_t n, struct MDBX_chk_histogram *p) { + STATIC_ASSERT(ARRAY_LENGTH(p->ranges) > 2); + p->amount += n; + p->count += 1; + if (likely(n < 2)) { + p->ones += n; + p->pad += 1; + } else + for (;;) { + const size_t size = ARRAY_LENGTH(p->ranges), last = size - 1; + size_t i = 0; + while (i < size && p->ranges[i].count && n >= p->ranges[i].begin) { + if (n < p->ranges[i].end) { + // значение попадает в существующий интервал + p->ranges[i].amount += n; + p->ranges[i].count += 1; + return; + } + ++i; + } + if (p->ranges[last].count == 0) { + // использованы еще не все слоты, добавляем интервал + assert(i < size); + if (p->ranges[i].count) { + assert(i < last); + // раздвигаем +#ifdef __COVERITY__ + if (i < last) /* avoid Coverity false-positive issue */ +#endif /* __COVERITY__ */ + memmove(p->ranges + i + 1, p->ranges + i, + (last - i) * sizeof(p->ranges[0])); + } + p->ranges[i].begin = n; + p->ranges[i].end = n + 1; + p->ranges[i].amount = n; + p->ranges[i].count = 1; + return; + } + 
histogram_reduce(p); + } +} + +__cold static MDBX_chk_line_t * +histogram_dist(MDBX_chk_line_t *line, + const struct MDBX_chk_histogram *histogram, const char *prefix, + const char *first, bool amount) { + line = chk_print(line, "%s:", prefix); + const char *comma = ""; + const size_t first_val = amount ? histogram->ones : histogram->pad; + if (first_val) { + chk_print(line, " %s=%" PRIuSIZE, first, first_val); + comma = ","; + } + for (size_t n = 0; n < ARRAY_LENGTH(histogram->ranges); ++n) + if (histogram->ranges[n].count) { + chk_print(line, "%s %" PRIuSIZE, comma, histogram->ranges[n].begin); + if (histogram->ranges[n].begin != histogram->ranges[n].end - 1) + chk_print(line, "-%" PRIuSIZE, histogram->ranges[n].end - 1); + line = chk_print(line, "=%" PRIuSIZE, + amount ? histogram->ranges[n].amount + : histogram->ranges[n].count); + comma = ","; + } + return line; +} + +__cold static MDBX_chk_line_t * +histogram_print(MDBX_chk_scope_t *scope, MDBX_chk_line_t *line, + const struct MDBX_chk_histogram *histogram, const char *prefix, + const char *first, bool amount) { + if (histogram->count) { + line = chk_print(line, "%s %" PRIuSIZE, prefix, + amount ? 
histogram->amount : histogram->count); + if (scope->verbosity > MDBX_chk_info) + line = chk_puts( + histogram_dist(line, histogram, " (distribution", first, amount), + ")"); + } + return line; +} + +//----------------------------------------------------------------------------- + +__cold static int chk_get_sdb(MDBX_chk_scope_t *const scope, + const MDBX_walk_sdb_t *in, + MDBX_chk_subdb_t **out) { + MDBX_chk_internal_t *const chk = scope->internal; + if (chk->last_lookup && + chk->last_lookup->name.iov_base == in->name.iov_base) { + *out = chk->last_lookup; + return MDBX_SUCCESS; + } + + for (size_t i = 0; i < ARRAY_LENGTH(chk->subdb); ++i) { + MDBX_chk_subdb_t *sdb = chk->subdb[i]; + if (!sdb) { + sdb = osal_calloc(1, sizeof(MDBX_chk_subdb_t)); + if (unlikely(!sdb)) { + *out = nullptr; + return chk_error_rc(scope, MDBX_ENOMEM, "alloc_subDB"); + } + chk->subdb[i] = sdb; + sdb->flags = in->internal->md_flags; + sdb->id = -1; + sdb->name = in->name; + } + if (sdb->name.iov_base == in->name.iov_base) { + if (sdb->id < 0) { + sdb->id = (int)i; + sdb->cookie = + chk->cb->subdb_filter + ? 
chk->cb->subdb_filter(chk->usr, &sdb->name, sdb->flags) + : (void *)(intptr_t)-1; + } + *out = (chk->last_lookup = sdb); + return MDBX_SUCCESS; + } + } + chk_scope_issue(scope, "too many subDBs > %u", + (unsigned)ARRAY_LENGTH(chk->subdb) - CORE_DBS - /* meta */ 1); + *out = nullptr; + return MDBX_PROBLEM; +} + +//------------------------------------------------------------------------------ + +__cold static void chk_verbose_meta(MDBX_chk_scope_t *const scope, + const unsigned num) { + MDBX_chk_line_t *line = chk_line_begin(scope, MDBX_chk_verbose); + MDBX_chk_internal_t *const chk = scope->internal; + if (line) { + MDBX_env *const env = chk->usr->env; + const bool have_bootid = (chk->envinfo.mi_bootid.current.x | + chk->envinfo.mi_bootid.current.y) != 0; + const bool bootid_match = + have_bootid && memcmp(&chk->envinfo.mi_bootid.meta[num], + &chk->envinfo.mi_bootid.current, + sizeof(chk->envinfo.mi_bootid.current)) == 0; + + const char *status = "stay"; + if (num == chk->troika.recent) + status = "head"; + else if (num == TROIKA_TAIL(&chk->troika)) + status = "tail"; + line = chk_print(line, "meta-%u: %s, ", num, status); + + switch (chk->envinfo.mi_meta_sign[num]) { + case MDBX_DATASIGN_NONE: + line = chk_puts(line, "no-sync/legacy"); + break; + case MDBX_DATASIGN_WEAK: + line = chk_print(line, "weak-%s", + have_bootid + ? (bootid_match ? "intact (same boot-id)" : "dead") + : "unknown (no boot-id)"); + break; + default: + line = chk_puts(line, "steady"); + break; + } + const txnid_t meta_txnid = chk->envinfo.mi_meta_txnid[num]; + line = chk_print(line, " txn#%" PRIaTXN ", ", meta_txnid); + if (chk->envinfo.mi_bootid.meta[num].x | chk->envinfo.mi_bootid.meta[num].y) + line = chk_print(line, "boot-id %" PRIx64 "-%" PRIx64 " (%s)", + chk->envinfo.mi_bootid.meta[num].x, + chk->envinfo.mi_bootid.meta[num].y, + bootid_match ? 
"live" : "not match"); + else + line = chk_puts(line, "no boot-id"); + + if (env->me_stuck_meta >= 0) { + if (num == (unsigned)env->me_stuck_meta) + line = chk_print(line, ", %s", "forced for checking"); + } else if (meta_txnid > chk->envinfo.mi_recent_txnid && + (env->me_flags & (MDBX_EXCLUSIVE | MDBX_RDONLY)) == + MDBX_EXCLUSIVE) + line = chk_print(line, + ", rolled-back %" PRIu64 " commit(s) (%" PRIu64 + " >>> %" PRIu64 ")", + meta_txnid - chk->envinfo.mi_recent_txnid, meta_txnid, + chk->envinfo.mi_recent_txnid); + chk_line_end(line); + } +} + +__cold static int +chk_pgvisitor(const size_t pgno, const unsigned npages, void *const ctx, + const int deep, const MDBX_walk_sdb_t *sdb_info, + const size_t page_size, const MDBX_page_type_t pagetype, + const MDBX_error_t page_err, const size_t nentries, + const size_t payload_bytes, const size_t header_bytes, + const size_t unused_bytes) { + MDBX_chk_scope_t *const scope = ctx; + MDBX_chk_internal_t *const chk = scope->internal; + MDBX_chk_context_t *const usr = chk->usr; + MDBX_env *const env = usr->env; + + MDBX_chk_subdb_t *sdb; + int err = chk_get_sdb(scope, sdb_info, &sdb); + if (unlikely(err)) + return err; + + if (deep > 42) { + chk_scope_issue(scope, "too deeply %u", deep); + return MDBX_CORRUPTED /* avoid infinite loop/recursion */; + } + histogram_acc(deep, &sdb->histogram.deep); + usr->result.processed_pages += npages; + const size_t page_bytes = payload_bytes + header_bytes + unused_bytes; + + int height = deep + 1; + if (sdb->id >= CORE_DBS) + height -= usr->txn->mt_dbs[MAIN_DBI].md_depth; + const struct MDBX_db *nested = sdb_info->nested; + if (nested) { + if (sdb->flags & MDBX_DUPSORT) + height -= sdb_info->internal->md_depth; + else { + chk_object_issue(scope, "nested tree", pgno, "unexpected", + "subDb %s flags 0x%x, deep %i", chk_v2a(chk, &sdb->name), + sdb->flags, deep); + nested = nullptr; + } + } else + chk->last_nested = nullptr; + + const char *pagetype_caption; + bool branch = false; + switch 
(pagetype) { + default: + chk_object_issue(scope, "page", pgno, "unknown page-type", + "type %u, deep %i", (unsigned)pagetype, deep); + pagetype_caption = "unknown"; + sdb->pages.other += npages; + break; + case MDBX_page_broken: + assert(page_err != MDBX_SUCCESS); + pagetype_caption = "broken"; + sdb->pages.other += npages; + break; + case MDBX_subpage_broken: + assert(page_err != MDBX_SUCCESS); + pagetype_caption = "broken-subpage"; + sdb->pages.other += npages; + break; + case MDBX_page_large: + pagetype_caption = "large"; + histogram_acc(npages, &sdb->histogram.large_pages); + if (sdb->flags & MDBX_DUPSORT) + chk_object_issue(scope, "page", pgno, "unexpected", + "type %u, subDb %s flags 0x%x, deep %i", + (unsigned)pagetype, chk_v2a(chk, &sdb->name), sdb->flags, + deep); + break; + case MDBX_page_branch: + branch = true; + if (!nested) { + pagetype_caption = "branch"; + sdb->pages.branch += 1; + } else { + pagetype_caption = "nested-branch"; + sdb->pages.nested_branch += 1; + } + break; + case MDBX_page_dupfixed_leaf: + if (!nested) + chk_object_issue(scope, "page", pgno, "unexpected", + "type %u, subDb %s flags 0x%x, deep %i", + (unsigned)pagetype, chk_v2a(chk, &sdb->name), sdb->flags, + deep); + /* fall through */ + __fallthrough; + case MDBX_page_leaf: + if (!nested) { + pagetype_caption = "leaf"; + sdb->pages.leaf += 1; + if (height != sdb_info->internal->md_depth) + chk_object_issue(scope, "page", pgno, "wrong tree height", + "actual %i != %i subDb %s", height, + sdb_info->internal->md_depth, + chk_v2a(chk, &sdb->name)); + } else { + pagetype_caption = + (pagetype == MDBX_page_leaf) ? 
"nested-leaf" : "nested-leaf-dupfixed"; + sdb->pages.nested_leaf += 1; + if (chk->last_nested != nested) { + histogram_acc(height, &sdb->histogram.nested_tree); + chk->last_nested = nested; + } + if (height != nested->md_depth) + chk_object_issue(scope, "page", pgno, "wrong nested-tree height", + "actual %i != %i dupsort-node %s", height, + nested->md_depth, chk_v2a(chk, &sdb->name)); + } + break; + case MDBX_subpage_dupfixed_leaf: + case MDBX_subpage_leaf: + pagetype_caption = (pagetype == MDBX_subpage_leaf) ? "subleaf-dupsort" + : "subleaf-dupfixed"; + sdb->pages.nested_subleaf += 1; + if ((sdb->flags & MDBX_DUPSORT) == 0 || nested) + chk_object_issue(scope, "page", pgno, "unexpected", + "type %u, subDb %s flags 0x%x, deep %i", + (unsigned)pagetype, chk_v2a(chk, &sdb->name), sdb->flags, + deep); + break; + } + + if (npages) { + if (sdb->cookie) { + MDBX_chk_line_t *line = chk_line_begin(scope, MDBX_chk_extra); + if (npages == 1) + chk_print(line, "%s-page %" PRIuSIZE, pagetype_caption, pgno); + else + chk_print(line, "%s-span %" PRIuSIZE "[%u]", pagetype_caption, pgno, + npages); + chk_line_end( + chk_print(line, + " of %s: header %" PRIiPTR ", %s %" PRIiPTR + ", payload %" PRIiPTR ", unused %" PRIiPTR ", deep %i", + chk_v2a(chk, &sdb->name), header_bytes, + (pagetype == MDBX_page_branch) ? "keys" : "entries", + nentries, payload_bytes, unused_bytes, deep)); + } + + bool already_used = false; + for (unsigned n = 0; n < npages; ++n) { + const size_t spanpgno = pgno + n; + if (spanpgno >= usr->result.alloc_pages) { + chk_object_issue(scope, "page", spanpgno, "wrong page-no", + "%s-page: %" PRIuSIZE " > %" PRIuSIZE ", deep %i", + pagetype_caption, spanpgno, usr->result.alloc_pages, + deep); + sdb->pages.all += 1; + } else if (chk->pagemap[spanpgno]) { + const MDBX_chk_subdb_t *const rival = + chk->subdb[chk->pagemap[spanpgno] - 1]; + chk_object_issue(scope, "page", spanpgno, + (branch && rival == sdb) ? 
"loop" : "already used", + "%s-page: by %s, deep %i", pagetype_caption, + chk_v2a(chk, &rival->name), deep); + already_used = true; + } else { + chk->pagemap[spanpgno] = (int16_t)sdb->id + 1; + sdb->pages.all += 1; + } + } + + if (already_used) + return branch ? MDBX_RESULT_TRUE /* avoid infinite loop/recursion */ + : MDBX_SUCCESS; + } + + if (MDBX_IS_ERROR(page_err)) { + chk_object_issue(scope, "page", pgno, "invalid/corrupted", "%s-page", + pagetype_caption); + } else { + if (unused_bytes > page_size) + chk_object_issue(scope, "page", pgno, "illegal unused-bytes", + "%s-page: %u < %" PRIuSIZE " < %u", pagetype_caption, 0, + unused_bytes, env->me_psize); + + if (header_bytes < (int)sizeof(long) || + (size_t)header_bytes >= env->me_psize - sizeof(long)) { + chk_object_issue(scope, "page", pgno, "illegal header-length", + "%s-page: %" PRIuSIZE " < %" PRIuSIZE " < %" PRIuSIZE, + pagetype_caption, sizeof(long), header_bytes, + env->me_psize - sizeof(long)); + } + if (nentries < 1 || (pagetype == MDBX_page_branch && nentries < 2)) { + chk_object_issue(scope, "page", pgno, nentries ? 
"half-empty" : "empty", + "%s-page: payload %" PRIuSIZE " bytes, %" PRIuSIZE + " entries, deep %i", + pagetype_caption, payload_bytes, nentries, deep); + sdb->pages.empty += 1; + } + + if (npages) { + if (page_bytes != page_size) { + chk_object_issue(scope, "page", pgno, "misused", + "%s-page: %" PRIuPTR " != %" PRIuPTR " (%" PRIuPTR + "h + %" PRIuPTR "p + %" PRIuPTR "u), deep %i", + pagetype_caption, page_size, page_bytes, header_bytes, + payload_bytes, unused_bytes, deep); + if (page_size > page_bytes) + sdb->lost_bytes += page_size - page_bytes; + } else { + sdb->payload_bytes += payload_bytes + header_bytes; + usr->result.total_payload_bytes += payload_bytes + header_bytes; + } + } + } + return chk_check_break(scope); +} + +__cold static int chk_tree(MDBX_chk_scope_t *const scope) { + MDBX_chk_internal_t *const chk = scope->internal; + MDBX_chk_context_t *const usr = chk->usr; + MDBX_env *const env = usr->env; + MDBX_txn *const txn = usr->txn; + +#if defined(_WIN32) || defined(_WIN64) + SetLastError(ERROR_SUCCESS); +#else + errno = 0; +#endif /* Windows */ + chk->pagemap = osal_calloc(usr->result.alloc_pages, sizeof(*chk->pagemap)); + if (!chk->pagemap) { + int err = osal_get_errno(); + return chk_error_rc(scope, err ? 
err : MDBX_ENOMEM, "calloc"); + } + + if (scope->verbosity > MDBX_chk_info) + chk_scope_push(scope, 0, "Walking pages..."); + /* always skip key ordering checking + * to avoid MDBX_CORRUPTED in case custom comparators were used */ + usr->result.processed_pages = NUM_METAS; + int err = mdbx_env_pgwalk(txn, chk_pgvisitor, scope, true); + if (MDBX_IS_ERROR(err) && err != MDBX_EINTR) + chk_error_rc(scope, err, "mdbx_env_pgwalk"); + + for (size_t n = NUM_METAS; n < usr->result.alloc_pages; ++n) + if (!chk->pagemap[n]) + usr->result.unused_pages += 1; + + MDBX_chk_subdb_t total; + memset(&total, 0, sizeof(total)); + total.pages.all = NUM_METAS; + for (size_t i = 0; i < ARRAY_LENGTH(chk->subdb) && chk->subdb[i]; ++i) { + MDBX_chk_subdb_t *const sdb = chk->subdb[i]; + total.payload_bytes += sdb->payload_bytes; + total.lost_bytes += sdb->lost_bytes; + total.pages.all += sdb->pages.all; + total.pages.empty += sdb->pages.empty; + total.pages.other += sdb->pages.other; + total.pages.branch += sdb->pages.branch; + total.pages.leaf += sdb->pages.leaf; + total.pages.nested_branch += sdb->pages.nested_branch; + total.pages.nested_leaf += sdb->pages.nested_leaf; + total.pages.nested_subleaf += sdb->pages.nested_subleaf; + } + assert(total.pages.all == usr->result.processed_pages); + + const size_t total_page_bytes = pgno2bytes(env, total.pages.all); + if (usr->scope->subtotal_issues || usr->scope->verbosity >= MDBX_chk_verbose) + chk_line_end(chk_print(chk_line_begin(usr->scope, MDBX_chk_resolution), + "walked %zu pages, left/unused %zu" + ", %" PRIuSIZE " problem(s)", + usr->result.processed_pages, + usr->result.unused_pages, + usr->scope->subtotal_issues)); + + err = chk_scope_restore(scope, err); + if (scope->verbosity > MDBX_chk_info) { + for (size_t i = 0; i < ARRAY_LENGTH(chk->subdb) && chk->subdb[i]; ++i) { + MDBX_chk_subdb_t *const sdb = chk->subdb[i]; + MDBX_chk_scope_t *inner = + chk_scope_push(scope, 0, "tree %s:", chk_v2a(chk, &sdb->name)); + if (sdb->pages.all == 0) + 
chk_line_end( + chk_print(chk_line_begin(inner, MDBX_chk_resolution), "empty")); + else { + MDBX_chk_line_t *line = chk_line_begin(inner, MDBX_chk_info); + if (line) { + line = chk_print(line, "page usage: subtotal %" PRIuSIZE, + sdb->pages.all); + const size_t branch_pages = + sdb->pages.branch + sdb->pages.nested_branch; + const size_t leaf_pages = sdb->pages.leaf + sdb->pages.nested_leaf + + sdb->pages.nested_subleaf; + if (sdb->pages.other) + line = chk_print(line, ", other %" PRIuSIZE, sdb->pages.other); + if (sdb->pages.other == 0 || + (branch_pages | leaf_pages | sdb->histogram.large_pages.count) != + 0) { + line = chk_print(line, ", branch %" PRIuSIZE ", leaf %" PRIuSIZE, + branch_pages, leaf_pages); + if (sdb->histogram.large_pages.count || + (sdb->flags & MDBX_DUPSORT) == 0) { + line = chk_print(line, ", large %" PRIuSIZE, + sdb->histogram.large_pages.count); + if (sdb->histogram.large_pages.amount | + sdb->histogram.large_pages.count) + line = histogram_print(inner, line, &sdb->histogram.large_pages, + " amount", "single", true); + } + } + line = histogram_dist(chk_line_feed(line), &sdb->histogram.deep, + "tree deep density", "1", false); + if (sdb != &chk->subdb_gc && sdb->histogram.nested_tree.count) { + line = chk_print(chk_line_feed(line), "nested tree(s) %" PRIuSIZE, + sdb->histogram.nested_tree.count); + line = histogram_dist(line, &sdb->histogram.nested_tree, " density", + "1", false); + line = chk_print(chk_line_feed(line), + "nested tree(s) pages %" PRIuSIZE + ": branch %" PRIuSIZE ", leaf %" PRIuSIZE + ", subleaf %" PRIuSIZE, + sdb->pages.nested_branch + sdb->pages.nested_leaf, + sdb->pages.nested_branch, sdb->pages.nested_leaf, + sdb->pages.nested_subleaf); + } + + const size_t bytes = pgno2bytes(env, sdb->pages.all); + line = chk_print( + chk_line_feed(line), + "page filling: subtotal %" PRIuSIZE + " bytes (%.1f%%), payload %" PRIuSIZE + " (%.1f%%), unused %" PRIuSIZE " (%.1f%%)", + bytes, bytes * 100.0 / total_page_bytes, sdb->payload_bytes, 
+ sdb->payload_bytes * 100.0 / bytes, bytes - sdb->payload_bytes, + (bytes - sdb->payload_bytes) * 100.0 / bytes); + if (sdb->pages.empty) + line = chk_print(line, ", %" PRIuSIZE " empty pages", + sdb->pages.empty); + if (sdb->lost_bytes) + line = + chk_print(line, ", %" PRIuSIZE " bytes lost", sdb->lost_bytes); + chk_line_end(line); + } + } + chk_scope_restore(scope, 0); + } + } + + MDBX_chk_line_t *line = chk_line_begin(scope, MDBX_chk_resolution); + line = chk_print(line, + "summary: total %" PRIuSIZE " bytes, payload %" PRIuSIZE + " (%.1f%%), unused %" PRIuSIZE " (%.1f%%)," + " average fill %.1f%%", + total_page_bytes, usr->result.total_payload_bytes, + usr->result.total_payload_bytes * 100.0 / total_page_bytes, + total_page_bytes - usr->result.total_payload_bytes, + (total_page_bytes - usr->result.total_payload_bytes) * + 100.0 / total_page_bytes, + usr->result.total_payload_bytes * 100.0 / total_page_bytes); + if (total.pages.empty) + line = chk_print(line, ", %" PRIuSIZE " empty pages", total.pages.empty); + if (total.lost_bytes) + line = chk_print(line, ", %" PRIuSIZE " bytes lost", total.lost_bytes); + chk_line_end(line); + return err; +} + +typedef int(chk_kv_visitor)(MDBX_chk_scope_t *const scope, + MDBX_chk_subdb_t *sdb, const size_t record_number, + const MDBX_val *key, const MDBX_val *data); + +__cold static int chk_handle_kv(MDBX_chk_scope_t *const scope, + MDBX_chk_subdb_t *sdb, + const size_t record_number, const MDBX_val *key, + const MDBX_val *data) { + MDBX_chk_internal_t *const chk = scope->internal; + int err = MDBX_SUCCESS; + assert(sdb->cookie); + if (chk->cb->subdb_handle_kv) + err = chk->cb->subdb_handle_kv(chk->usr, sdb, record_number, key, data); + return err ? 
err : chk_check_break(scope); +} + +__cold static int chk_db(MDBX_chk_scope_t *const scope, MDBX_dbi dbi, + MDBX_chk_subdb_t *sdb, chk_kv_visitor *handler) { + MDBX_chk_internal_t *const chk = scope->internal; + MDBX_chk_context_t *const usr = chk->usr; + MDBX_env *const env = usr->env; + MDBX_txn *const txn = usr->txn; + MDBX_cursor *cursor = nullptr; + size_t record_count = 0, dups = 0, sub_databases = 0; + int err; + + if ((MDBX_TXN_FINISHED | MDBX_TXN_ERROR) & txn->mt_flags) { + chk_line_end( + chk_flush(chk_print(chk_line_begin(scope, MDBX_chk_error), + "abort processing %s due to a previous error", + chk_v2a(chk, &sdb->name)))); + err = MDBX_BAD_TXN; + goto bailout; + } + + if (0 > (int)dbi) { + err = dbi_open( + txn, &sdb->name, MDBX_DB_ACCEDE, &dbi, + (chk->flags & MDBX_CHK_IGNORE_ORDER) ? cmp_equal_or_greater : nullptr, + (chk->flags & MDBX_CHK_IGNORE_ORDER) ? cmp_equal_or_greater : nullptr); + if (unlikely(err)) { + tASSERT(txn, dbi >= txn->mt_env->me_numdbs || + (txn->mt_env->me_db_flags[dbi] & DB_VALID) == 0); + chk_error_rc(scope, err, "mdbx_dbi_open"); + goto bailout; + } + tASSERT(txn, dbi < txn->mt_env->me_numdbs && + (txn->mt_env->me_db_flags[dbi] & DB_VALID) != 0); + } + + const MDBX_db *const db = txn->mt_dbs + dbi; + if (handler) { + const char *key_mode = nullptr; + switch (sdb->flags & (MDBX_REVERSEKEY | MDBX_INTEGERKEY)) { + case 0: + key_mode = "usual"; + break; + case MDBX_REVERSEKEY: + key_mode = "reserve"; + break; + case MDBX_INTEGERKEY: + key_mode = "ordinal"; + break; + case MDBX_REVERSEKEY | MDBX_INTEGERKEY: + key_mode = "msgpack"; + break; + default: + key_mode = "inconsistent"; + chk_scope_issue(scope, "wrong key-mode (0x%x)", + sdb->flags & (MDBX_REVERSEKEY | MDBX_INTEGERKEY)); + } + + const char *value_mode = nullptr; + switch (sdb->flags & (MDBX_DUPSORT | MDBX_REVERSEDUP | MDBX_DUPFIXED | + MDBX_INTEGERDUP)) { + case 0: + value_mode = "single"; + break; + case MDBX_DUPSORT: + value_mode = "multi"; + break; + case MDBX_DUPSORT | 
MDBX_REVERSEDUP: + value_mode = "multi-reverse"; + break; + case MDBX_DUPSORT | MDBX_DUPFIXED: + value_mode = "multi-samelength"; + break; + case MDBX_DUPSORT | MDBX_DUPFIXED | MDBX_REVERSEDUP: + value_mode = "multi-reverse-samelength"; + break; + case MDBX_DUPSORT | MDBX_DUPFIXED | MDBX_INTEGERDUP: + value_mode = "multi-ordinal"; + break; + case MDBX_DUPSORT | MDBX_INTEGERDUP | MDBX_REVERSEDUP: + value_mode = "multi-msgpack"; + break; + case MDBX_DUPSORT | MDBX_DUPFIXED | MDBX_INTEGERDUP | MDBX_REVERSEDUP: + value_mode = "reserved"; + break; + default: + value_mode = "inconsistent"; + chk_scope_issue(scope, "wrong value-mode (0x%x)", + sdb->flags & (MDBX_DUPSORT | MDBX_REVERSEDUP | + MDBX_DUPFIXED | MDBX_INTEGERDUP)); + } + + MDBX_chk_line_t *line = chk_line_begin(scope, MDBX_chk_info); + line = chk_print(line, "key-value kind: %s-key => %s-value", key_mode, + value_mode); + line = chk_print(line, ", flags:"); + if (!sdb->flags) + line = chk_print(line, " none"); + else { + const uint8_t f[] = {MDBX_DUPSORT, + MDBX_INTEGERKEY, + MDBX_REVERSEKEY, + MDBX_DUPFIXED, + MDBX_REVERSEDUP, + MDBX_INTEGERDUP, + 0}; + const char *const t[] = {"dupsort", "integerkey", "reversekey", + "dupfixed", "reversedup", "integerdup"}; + for (size_t i = 0; f[i]; i++) + if (sdb->flags & f[i]) + line = chk_print(line, " %s", t[i]); + } + chk_line_end(chk_print(line, " (0x%02X)", sdb->flags)); + + line = chk_print(chk_line_begin(scope, MDBX_chk_verbose), + "entries %" PRIu64 ", sequence %" PRIu64, db->md_entries, + db->md_seq); + if (db->md_mod_txnid) + line = chk_print(line, ", last modification txn#%" PRIaTXN, + db->md_mod_txnid); + if (db->md_root != P_INVALID) + line = chk_print(line, ", root #%" PRIaPGNO, db->md_root); + chk_line_end(line); + chk_line_end(chk_print(chk_line_begin(scope, MDBX_chk_verbose), + "b-tree depth %u, pages: branch %" PRIaPGNO + ", leaf %" PRIaPGNO ", large %" PRIaPGNO, + db->md_depth, db->md_branch_pages, db->md_leaf_pages, + db->md_overflow_pages)); + + if 
((chk->flags & MDBX_CHK_SKIP_BTREE_TRAVERSAL) == 0) { + const size_t branch_pages = sdb->pages.branch + sdb->pages.nested_branch; + const size_t leaf_pages = sdb->pages.leaf + sdb->pages.nested_leaf; + const size_t subtotal_pages = + db->md_branch_pages + db->md_leaf_pages + db->md_overflow_pages; + if (subtotal_pages != sdb->pages.all) + chk_scope_issue( + scope, "%s pages mismatch (%" PRIuSIZE " != walked %" PRIuSIZE ")", + "subtotal", subtotal_pages, sdb->pages.all); + if (db->md_branch_pages != branch_pages) + chk_scope_issue( + scope, "%s pages mismatch (%" PRIaPGNO " != walked %" PRIuSIZE ")", + "branch", db->md_branch_pages, branch_pages); + if (db->md_leaf_pages != leaf_pages) + chk_scope_issue( + scope, "%s pages mismatch (%" PRIaPGNO " != walked %" PRIuSIZE ")", + "all-leaf", db->md_leaf_pages, leaf_pages); + if (db->md_overflow_pages != sdb->histogram.large_pages.amount) + chk_scope_issue( + scope, "%s pages mismatch (%" PRIaPGNO " != walked %" PRIuSIZE ")", + "large/overlow", db->md_overflow_pages, + sdb->histogram.large_pages.amount); + } + } + + err = mdbx_cursor_open(txn, dbi, &cursor); + if (unlikely(err)) { + chk_error_rc(scope, err, "mdbx_cursor_open"); + goto bailout; + } + if (chk->flags & MDBX_CHK_IGNORE_ORDER) { + cursor->mc_checking |= CC_SKIPORD | CC_PAGECHECK; + if (cursor->mc_xcursor) + cursor->mc_xcursor->mx_cursor.mc_checking |= CC_SKIPORD | CC_PAGECHECK; + } + + const size_t maxkeysize = mdbx_env_get_maxkeysize_ex(env, sdb->flags); + MDBX_val prev_key = {nullptr, 0}, prev_data = {nullptr, 0}; + MDBX_val key, data; + err = mdbx_cursor_get(cursor, &key, &data, MDBX_FIRST); + while (err == MDBX_SUCCESS) { + err = chk_check_break(scope); + if (unlikely(err)) + goto bailout; + + bool bad_key = false; + if (key.iov_len > maxkeysize) { + chk_object_issue(scope, "entry", record_count, + "key length exceeds max-key-size", + "%" PRIuPTR " > %" PRIuPTR, key.iov_len, maxkeysize); + bad_key = true; + } else if ((sdb->flags & MDBX_INTEGERKEY) && 
key.iov_len != 8 && + key.iov_len != 4) { + chk_object_issue(scope, "entry", record_count, "wrong key length", + "%" PRIuPTR " != 4or8", key.iov_len); + bad_key = true; + } + + bool bad_data = false; + if ((sdb->flags & MDBX_INTEGERDUP) && data.iov_len != 8 && + data.iov_len != 4) { + chk_object_issue(scope, "entry", record_count, "wrong data length", + "%" PRIuPTR " != 4or8", data.iov_len); + bad_data = true; + } + + if (prev_key.iov_base) { + if (prev_data.iov_base && !bad_data && (sdb->flags & MDBX_DUPFIXED) && + prev_data.iov_len != data.iov_len) { + chk_object_issue(scope, "entry", record_count, "different data length", + "%" PRIuPTR " != %" PRIuPTR, prev_data.iov_len, + data.iov_len); + bad_data = true; + } + + if (!bad_key) { + int cmp = mdbx_cmp(txn, dbi, &key, &prev_key); + if (cmp == 0) { + ++dups; + if ((sdb->flags & MDBX_DUPSORT) == 0) { + chk_object_issue(scope, "entry", record_count, "duplicated entries", + nullptr); + if (prev_data.iov_base && data.iov_len == prev_data.iov_len && + memcmp(data.iov_base, prev_data.iov_base, data.iov_len) == 0) + chk_object_issue(scope, "entry", record_count, + "complete duplicate", nullptr); + } else if (!bad_data && prev_data.iov_base) { + cmp = mdbx_dcmp(txn, dbi, &data, &prev_data); + if (cmp == 0) + chk_object_issue(scope, "entry", record_count, + "complete duplicate", nullptr); + else if (cmp < 0 && !(chk->flags & MDBX_CHK_IGNORE_ORDER)) + chk_object_issue(scope, "entry", record_count, + "wrong order of multi-values", nullptr); + } + } else if (cmp < 0 && !(chk->flags & MDBX_CHK_IGNORE_ORDER)) + chk_object_issue(scope, "entry", record_count, + "wrong order of entries", nullptr); + } + } + + if (!bad_key) { + if (!prev_key.iov_base && (sdb->flags & MDBX_INTEGERKEY)) + chk_line_end(chk_print(chk_line_begin(scope, MDBX_chk_info), + "fixed key-size %" PRIuSIZE, key.iov_len)); + prev_key = key; + } + if (!bad_data) { + if (!prev_data.iov_base && + (sdb->flags & (MDBX_INTEGERDUP | MDBX_DUPFIXED))) + 
chk_line_end(chk_print(chk_line_begin(scope, MDBX_chk_info), + "fixed data-size %" PRIuSIZE, data.iov_len)); + prev_data = data; + } + + record_count++; + histogram_acc(key.iov_len, &sdb->histogram.key_len); + histogram_acc(data.iov_len, &sdb->histogram.val_len); + + const MDBX_node *const node = + page_node(cursor->mc_pg[cursor->mc_top], cursor->mc_ki[cursor->mc_top]); + if (node_flags(node) == F_SUBDATA) { + if (dbi != MAIN_DBI || (sdb->flags & (MDBX_DUPSORT | MDBX_DUPFIXED | + MDBX_REVERSEDUP | MDBX_INTEGERDUP))) + chk_object_issue(scope, "entry", record_count, + "unexpected sub-database", "node-flags 0x%x", + node_flags(node)); + else if (data.iov_len != sizeof(MDBX_db)) + chk_object_issue(scope, "entry", record_count, + "wrong sub-database node size", + "node-size %" PRIuSIZE " != %" PRIuSIZE, data.iov_len, + sizeof(MDBX_db)); + else if (scope->stage == MDBX_chk_traversal_maindb) + /* подсчитываем subDB при первом проходе */ + sub_databases += 1; + else { + /* обработка subDB при втором проходе */ + MDBX_db aligned_db; + memcpy(&aligned_db, data.iov_base, sizeof(aligned_db)); + MDBX_walk_sdb_t sdb_info = {key, nullptr, nullptr}; + sdb_info.internal = &aligned_db; + MDBX_chk_subdb_t *subdb; + err = chk_get_sdb(scope, &sdb_info, &subdb); + if (unlikely(err)) + goto bailout; + if (subdb->cookie) { + err = chk_scope_begin(chk, 0, MDBX_chk_traversal_subdbs, subdb, + &usr->result.problems_kv, + "Processing subDB %s...", + chk_v2a(chk, &subdb->name)); + if (likely(!err)) { + err = chk_db(usr->scope, (MDBX_dbi)-1, subdb, chk_handle_kv); + if (err != MDBX_EINTR && err != MDBX_RESULT_TRUE) + usr->result.subdb_processed += 1; + } + err = chk_scope_restore(scope, err); + if (unlikely(err)) + goto bailout; + } else + chk_line_end(chk_flush( + chk_print(chk_line_begin(scope, MDBX_chk_processing), + "Skip processing %s...", chk_v2a(chk, &subdb->name)))); + } + } else if (handler) { + err = handler(scope, sdb, record_count, &key, &data); + if (unlikely(err)) + goto bailout; + 
} + + err = mdbx_cursor_get(cursor, &key, &data, MDBX_NEXT); + } + + err = (err != MDBX_NOTFOUND) ? chk_error_rc(scope, err, "mdbx_cursor_get") + : MDBX_SUCCESS; + if (err == MDBX_SUCCESS && record_count != db->md_entries) + chk_scope_issue(scope, + "different number of entries %" PRIuSIZE " != %" PRIu64, + record_count, db->md_entries); +bailout: + if (cursor) { + if (handler) { + if (sdb->histogram.key_len.count) { + MDBX_chk_line_t *line = chk_line_begin(scope, MDBX_chk_info); + line = histogram_dist(line, &sdb->histogram.key_len, + "key length density", "0/1", false); + chk_line_feed(line); + line = histogram_dist(line, &sdb->histogram.val_len, + "value length density", "0/1", false); + chk_line_end(line); + } + if (scope->stage == MDBX_chk_traversal_maindb) + usr->result.subdb_total = sub_databases; + if (chk->cb->subdb_conclude) + err = chk->cb->subdb_conclude(usr, sdb, cursor, err); + MDBX_chk_line_t *line = chk_line_begin(scope, MDBX_chk_resolution); + line = chk_print(line, "summary: %" PRIuSIZE " records,", record_count); + if (dups || (sdb->flags & (MDBX_DUPSORT | MDBX_DUPFIXED | + MDBX_REVERSEDUP | MDBX_INTEGERDUP))) + line = chk_print(line, " %" PRIuSIZE " dups,", dups); + if (sub_databases || dbi == MAIN_DBI) + line = chk_print(line, " %" PRIuSIZE " sub-databases,", sub_databases); + line = chk_print(line, + " %" PRIuSIZE " key's bytes," + " %" PRIuSIZE " data's bytes," + " %" PRIuSIZE " problem(s)", + sdb->histogram.key_len.amount, + sdb->histogram.val_len.amount, scope->subtotal_issues); + chk_line_end(chk_flush(line)); + } + + mdbx_cursor_close(cursor); + if (!txn->mt_cursors[dbi] && (txn->mt_dbi_state[dbi] & DBI_FRESH)) + mdbx_dbi_close(env, dbi); + } + return err; +} + +__cold static int chk_handle_gc(MDBX_chk_scope_t *const scope, + MDBX_chk_subdb_t *sdb, + const size_t record_number, const MDBX_val *key, + const MDBX_val *data) { + MDBX_chk_internal_t *const chk = scope->internal; + MDBX_chk_context_t *const usr = chk->usr; + assert(sdb == 
&chk->subdb_gc); + (void)sdb; + const char *bad = ""; + pgno_t *iptr = data->iov_base; + + if (key->iov_len != sizeof(txnid_t)) + chk_object_issue(scope, "entry", record_number, "wrong txn-id size", + "key-size %" PRIuSIZE, key->iov_len); + else { + txnid_t txnid; + memcpy(&txnid, key->iov_base, sizeof(txnid)); + if (txnid < 1 || txnid > usr->txn->mt_txnid) + chk_object_issue(scope, "entry", record_number, "wrong txn-id", + "%" PRIaTXN, txnid); + else { + if (data->iov_len < sizeof(pgno_t) || data->iov_len % sizeof(pgno_t)) + chk_object_issue(scope, "entry", txnid, "wrong idl size", "%" PRIuPTR, + data->iov_len); + size_t number = (data->iov_len >= sizeof(pgno_t)) ? *iptr++ : 0; + if (number > MDBX_PGL_LIMIT) + chk_object_issue(scope, "entry", txnid, "wrong idl length", "%" PRIuPTR, + number); + else if ((number + 1) * sizeof(pgno_t) > data->iov_len) { + chk_object_issue(scope, "entry", txnid, "trimmed idl", + "%" PRIuSIZE " > %" PRIuSIZE " (corruption)", + (number + 1) * sizeof(pgno_t), data->iov_len); + number = data->iov_len / sizeof(pgno_t) - 1; + } else if (data->iov_len - (number + 1) * sizeof(pgno_t) >= + /* LY: allow gap up to one page. it is ok + * and better than shink-and-retry inside update_gc() */ + usr->env->me_psize) + chk_object_issue(scope, "entry", txnid, "extra idl space", + "%" PRIuSIZE " < %" PRIuSIZE " (minor, not a trouble)", + (number + 1) * sizeof(pgno_t), data->iov_len); + + usr->result.gc_pages += number; + if (chk->envinfo.mi_latter_reader_txnid > txnid) + usr->result.reclaimable_pages += number; + + size_t prev = MDBX_PNL_ASCENDING ? 
NUM_METAS - 1 : usr->txn->mt_next_pgno; + size_t span = 1; + for (size_t i = 0; i < number; ++i) { + const size_t pgno = iptr[i]; + if (pgno < NUM_METAS) + chk_object_issue(scope, "entry", txnid, "wrong idl entry", + "pgno %" PRIuSIZE " < meta-pages %u", pgno, + NUM_METAS); + else if (pgno >= usr->result.backed_pages) + chk_object_issue(scope, "entry", txnid, "wrong idl entry", + "pgno %" PRIuSIZE " > backed-pages %" PRIuSIZE, pgno, + usr->result.backed_pages); + else if (pgno >= usr->result.alloc_pages) + chk_object_issue(scope, "entry", txnid, "wrong idl entry", + "pgno %" PRIuSIZE " > alloc-pages %" PRIuSIZE, pgno, + usr->result.alloc_pages - 1); + else { + if (MDBX_PNL_DISORDERED(prev, pgno)) { + bad = " [bad sequence]"; + chk_object_issue( + scope, "entry", txnid, "bad sequence", + "%" PRIuSIZE " %c [%" PRIuSIZE "].%" PRIuSIZE, prev, + (prev == pgno) ? '=' : (MDBX_PNL_ASCENDING ? '>' : '<'), i, + pgno); + } + if (chk->pagemap) { + const intptr_t id = chk->pagemap[pgno]; + if (id == 0) + chk->pagemap[pgno] = -1 /* mark the pgno listed in GC */; + else if (id > 0) { + assert(id - 1 <= (intptr_t)ARRAY_LENGTH(chk->subdb)); + chk_object_issue(scope, "page", pgno, "already used", "by %s", + chk_v2a(chk, &chk->subdb[id - 1]->name)); + } else + chk_object_issue(scope, "page", pgno, "already listed in GC", + nullptr); + } + } + prev = pgno; + while (i + span < number && + iptr[i + span] == (MDBX_PNL_ASCENDING ? pgno_add(pgno, span) + : pgno_sub(pgno, span))) + ++span; + } + if (sdb->cookie) { + chk_line_end(chk_print(chk_line_begin(scope, MDBX_chk_details), + "transaction %" PRIaTXN ", %" PRIuSIZE + " pages, maxspan %" PRIuSIZE "%s", + txnid, number, span, bad)); + for (size_t i = 0; i < number; i += span) { + const size_t pgno = iptr[i]; + for (span = 1; + i + span < number && + iptr[i + span] == (MDBX_PNL_ASCENDING ? 
pgno_add(pgno, span) + : pgno_sub(pgno, span)); + ++span) + ; + histogram_acc(span, &sdb->histogram.nested_tree); + MDBX_chk_line_t *line = chk_line_begin(scope, MDBX_chk_extra); + if (line) { + if (span > 1) + line = + chk_print(line, "%9" PRIuSIZE "[%" PRIuSIZE "]", pgno, span); + else + line = chk_print(line, "%9" PRIuSIZE, pgno); + chk_line_end(line); + int err = chk_check_break(scope); + if (err) + return err; + } + } + } + } + } + return chk_check_break(scope); +} + +__cold static int env_chk(MDBX_chk_scope_t *const scope) { + MDBX_chk_internal_t *const chk = scope->internal; + MDBX_chk_context_t *const usr = chk->usr; + MDBX_env *const env = usr->env; + MDBX_txn *const txn = usr->txn; + int err = + env_info(env, txn, &chk->envinfo, sizeof(chk->envinfo), &chk->troika); + if (unlikely(err)) + return chk_error_rc(scope, err, "env_info"); + + MDBX_chk_line_t *line = + chk_puts(chk_line_begin(scope, MDBX_chk_info), "current boot-id "); + if (chk->envinfo.mi_bootid.current.x | chk->envinfo.mi_bootid.current.y) + line = chk_print(line, "%016" PRIx64 "-%016" PRIx64, + chk->envinfo.mi_bootid.current.x, + chk->envinfo.mi_bootid.current.y); + else + line = chk_puts(line, "unavailable"); + chk_line_end(line); + + err = osal_filesize(env->me_lazy_fd, &env->me_dxb_mmap.filesize); + if (unlikely(err)) + return chk_error_rc(scope, err, "osal_filesize"); + + //-------------------------------------------------------------------------- + + err = chk_scope_begin(chk, 1, MDBX_chk_meta, nullptr, + &usr->result.problems_meta, "Peek the meta-pages..."); + if (likely(!err)) { + MDBX_chk_scope_t *const inner = usr->scope; + const uint64_t dxbfile_pages = + env->me_dxb_mmap.filesize >> env->me_psize2log; + usr->result.alloc_pages = txn->mt_next_pgno; + usr->result.backed_pages = bytes2pgno(env, env->me_dxb_mmap.current); + if (unlikely(usr->result.backed_pages > dxbfile_pages)) + chk_scope_issue(inner, "backed-pages %zu > file-pages %" PRIu64, + usr->result.backed_pages, 
dxbfile_pages); + if (unlikely(dxbfile_pages < NUM_METAS)) + chk_scope_issue(inner, "file-pages %" PRIu64 " < %u", dxbfile_pages, + NUM_METAS); + if (unlikely(usr->result.backed_pages < NUM_METAS)) + chk_scope_issue(inner, "backed-pages %zu < %u", usr->result.backed_pages, + NUM_METAS); + if (unlikely(usr->result.backed_pages < NUM_METAS)) { + chk_scope_issue(inner, "backed-pages %zu < num-metas %u", + usr->result.backed_pages, NUM_METAS); + return MDBX_CORRUPTED; + } + if (unlikely(dxbfile_pages < NUM_METAS)) { + chk_scope_issue(inner, "backed-pages %zu < num-metas %u", + usr->result.backed_pages, NUM_METAS); + return MDBX_CORRUPTED; + } + if (unlikely(usr->result.backed_pages > (size_t)MAX_PAGENO + 1)) { + chk_scope_issue(inner, "backed-pages %zu > max-pages %zu", + usr->result.backed_pages, (size_t)MAX_PAGENO + 1); + usr->result.backed_pages = MAX_PAGENO + 1; + } + + if ((env->me_flags & (MDBX_EXCLUSIVE | MDBX_RDONLY)) != MDBX_RDONLY) { + if (unlikely(usr->result.backed_pages > dxbfile_pages)) { + chk_scope_issue(inner, "backed-pages %zu > file-pages %" PRIu64, + usr->result.backed_pages, dxbfile_pages); + usr->result.backed_pages = (size_t)dxbfile_pages; + } + if (unlikely(usr->result.alloc_pages > usr->result.backed_pages)) { + chk_scope_issue(scope, "alloc-pages %zu > backed-pages %zu", + usr->result.alloc_pages, usr->result.backed_pages); + usr->result.alloc_pages = usr->result.backed_pages; + } + } else { + /* DB may be shrunk by writer down to the allocated (but unused) pages. 
*/ + if (unlikely(usr->result.alloc_pages > usr->result.backed_pages)) { + chk_scope_issue(inner, "alloc-pages %zu > backed-pages %zu", + usr->result.alloc_pages, usr->result.backed_pages); + usr->result.alloc_pages = usr->result.backed_pages; + } + if (unlikely(usr->result.alloc_pages > dxbfile_pages)) { + chk_scope_issue(inner, "alloc-pages %zu > file-pages %" PRIu64, + usr->result.alloc_pages, dxbfile_pages); + usr->result.alloc_pages = (size_t)dxbfile_pages; + } + if (unlikely(usr->result.backed_pages > dxbfile_pages)) + usr->result.backed_pages = (size_t)dxbfile_pages; + } + + line = chk_line_feed(chk_print( + chk_line_begin(inner, MDBX_chk_info), + "pagesize %u (%u system), max keysize %u..%u" + ", max readers %u", + env->me_psize, env->me_os_psize, + mdbx_env_get_maxkeysize_ex(env, MDBX_DUPSORT), + mdbx_env_get_maxkeysize_ex(env, MDBX_DB_DEFAULTS), env->me_maxreaders)); + line = chk_line_feed( + chk_print_size(line, "mapsize ", env->me_dxb_mmap.current, nullptr)); + if (txn->mt_geo.lower == txn->mt_geo.upper) + line = chk_print_size( + line, "fixed datafile: ", chk->envinfo.mi_geo.current, nullptr); + else { + line = chk_print_size( + line, "dynamic datafile: ", chk->envinfo.mi_geo.lower, nullptr); + line = chk_print_size(line, " .. 
", chk->envinfo.mi_geo.upper, ", "); + line = chk_print_size(line, "+", chk->envinfo.mi_geo.grow, ", "); + + line = chk_line_feed( + chk_print_size(line, "-", chk->envinfo.mi_geo.shrink, nullptr)); + line = chk_print_size( + line, "current datafile: ", chk->envinfo.mi_geo.current, nullptr); + } + tASSERT(txn, txn->mt_geo.now == chk->envinfo.mi_geo.current / + chk->envinfo.mi_dxb_pagesize); + chk_line_end(chk_print(line, ", %u pages", txn->mt_geo.now)); +#if defined(_WIN32) || defined(_WIN64) || MDBX_DEBUG + if (txn->mt_geo.shrink_pv && txn->mt_geo.now != txn->mt_geo.upper && + scope->verbosity >= MDBX_chk_verbose) { + line = chk_line_begin(inner, MDBX_chk_notice); + chk_line_feed(chk_print( + line, " > WARNING: Due Windows system limitations a file couldn't")); + chk_line_feed(chk_print( + line, " > be truncated while the database is opened. So, the size")); + chk_line_feed(chk_print( + line, " > database file of may by large than the database itself,")); + chk_line_end(chk_print( + line, " > until it will be closed or reopened in read-write mode.")); + } +#endif /* Windows || Debug */ + chk_verbose_meta(inner, 0); + chk_verbose_meta(inner, 1); + chk_verbose_meta(inner, 2); + + if (env->me_stuck_meta >= 0) { + chk_line_end(chk_print(chk_line_begin(inner, MDBX_chk_processing), + "skip checking meta-pages since the %u" + " is selected for verification", + env->me_stuck_meta)); + line = chk_line_feed( + chk_print(chk_line_begin(inner, MDBX_chk_resolution), + "transactions: recent %" PRIu64 ", " + "selected for verification %" PRIu64 ", lag %" PRIi64, + chk->envinfo.mi_recent_txnid, + chk->envinfo.mi_meta_txnid[env->me_stuck_meta], + chk->envinfo.mi_recent_txnid - + chk->envinfo.mi_meta_txnid[env->me_stuck_meta])); + chk_line_end(line); + } else { + chk_line_end(chk_puts(chk_line_begin(inner, MDBX_chk_verbose), + "performs check for meta-pages clashes")); + const unsigned meta_clash_mask = meta_eq_mask(&chk->troika); + if (meta_clash_mask & 1) + chk_scope_issue(inner, 
"meta-%d and meta-%d are clashed", 0, 1); + if (meta_clash_mask & 2) + chk_scope_issue(inner, "meta-%d and meta-%d are clashed", 1, 2); + if (meta_clash_mask & 4) + chk_scope_issue(inner, "meta-%d and meta-%d are clashed", 2, 0); + + const unsigned prefer_steady_metanum = chk->troika.prefer_steady; + const uint64_t prefer_steady_txnid = + chk->troika.txnid[prefer_steady_metanum]; + const unsigned recent_metanum = chk->troika.recent; + const uint64_t recent_txnid = chk->troika.txnid[recent_metanum]; + if (env->me_flags & MDBX_EXCLUSIVE) { + chk_line_end( + chk_puts(chk_line_begin(inner, MDBX_chk_verbose), + "performs full check recent-txn-id with meta-pages")); + eASSERT(env, recent_txnid == chk->envinfo.mi_recent_txnid); + if (prefer_steady_txnid != recent_txnid) { + if ((chk->flags & MDBX_CHK_READWRITE) != 0 && + (env->me_flags & MDBX_RDONLY) == 0 && + recent_txnid > prefer_steady_txnid && + (chk->envinfo.mi_bootid.current.x | + chk->envinfo.mi_bootid.current.y) != 0 && + chk->envinfo.mi_bootid.current.x == + chk->envinfo.mi_bootid.meta[recent_metanum].x && + chk->envinfo.mi_bootid.current.y == + chk->envinfo.mi_bootid.meta[recent_metanum].y) { + chk_line_end( + chk_print(chk_line_begin(inner, MDBX_chk_verbose), + "recent meta-%u is weak, but boot-id match current" + " (will synced upon successful check)", + recent_metanum)); + } else + chk_scope_issue( + inner, + "steady meta-%d txn-id mismatch recent-txn-id (%" PRIi64 + " != %" PRIi64 ")", + prefer_steady_metanum, prefer_steady_txnid, recent_txnid); + } + } else if (chk->write_locked) { + chk_line_end( + chk_puts(chk_line_begin(inner, MDBX_chk_verbose), + "performs lite check recent-txn-id with meta-pages (not a " + "monopolistic mode)")); + if (recent_txnid != chk->envinfo.mi_recent_txnid) { + chk_scope_issue(inner, + "weak meta-%d txn-id mismatch recent-txn-id (%" PRIi64 + " != %" PRIi64 ")", + recent_metanum, recent_txnid, + chk->envinfo.mi_recent_txnid); + } + } else { + chk_line_end(chk_puts( + 
chk_line_begin(inner, MDBX_chk_verbose), + "skip check recent-txn-id with meta-pages (monopolistic or " + "read-write mode only)")); + } + + chk_line_end(chk_print( + chk_line_begin(inner, MDBX_chk_resolution), + "transactions: recent %" PRIu64 ", latter reader %" PRIu64 + ", lag %" PRIi64, + chk->envinfo.mi_recent_txnid, chk->envinfo.mi_latter_reader_txnid, + chk->envinfo.mi_recent_txnid - chk->envinfo.mi_latter_reader_txnid)); + } + } + err = chk_scope_restore(scope, err); + + //-------------------------------------------------------------------------- + + if (chk->flags & MDBX_CHK_SKIP_BTREE_TRAVERSAL) + chk_line_end(chk_print(chk_line_begin(scope, MDBX_chk_processing), + "Skipping %s traversal...", "b-tree")); + else { + err = chk_scope_begin( + chk, -1, MDBX_chk_traversal_tree, nullptr, &usr->result.tree_problems, + "Traversal %s by txn#%" PRIaTXN "...", "b-tree", txn->mt_txnid); + if (likely(!err)) + err = chk_tree(usr->scope); + if (usr->result.tree_problems && usr->result.gc_tree_problems == 0) + usr->result.gc_tree_problems = usr->result.tree_problems; + if (usr->result.tree_problems && usr->result.kv_tree_problems == 0) + usr->result.kv_tree_problems = usr->result.tree_problems; + chk_scope_restore(scope, err); + } + + if (usr->result.gc_tree_problems > 0) + chk_line_end(chk_print( + chk_line_begin(scope, MDBX_chk_processing), + "Skip processing %s since %s is corrupted (%" PRIuSIZE " problem(s))", + chk_v2a(chk, MDBX_CHK_GC), "b-tree", + usr->result.problems_gc = usr->result.gc_tree_problems)); + else { + err = chk_scope_begin(chk, -1, MDBX_chk_traversal_freedb, &chk->subdb_gc, + &usr->result.problems_gc, + "Traversal %s by txn#%" PRIaTXN "...", "GC/freeDB", + txn->mt_txnid); + if (likely(!err)) + err = chk_db(usr->scope, FREE_DBI, &chk->subdb_gc, chk_handle_gc); + line = chk_line_begin(scope, MDBX_chk_info); + if (line) { + histogram_print(scope, line, &chk->subdb_gc.histogram.nested_tree, + "span(s)", "single", false); + chk_line_end(line); + } + if 
(usr->result.problems_gc == 0 && + (chk->flags & MDBX_CHK_SKIP_BTREE_TRAVERSAL) == 0) { + const size_t used_pages = usr->result.alloc_pages - usr->result.gc_pages; + if (usr->result.processed_pages != used_pages) + chk_scope_issue(usr->scope, + "used pages mismatch (%" PRIuSIZE + "(walked) != %" PRIuSIZE "(allocated - GC))", + usr->result.processed_pages, used_pages); + if (usr->result.unused_pages != usr->result.gc_pages) + chk_scope_issue(usr->scope, + "GC pages mismatch (%" PRIuSIZE + "(expected) != %" PRIuSIZE "(GC))", + usr->result.unused_pages, usr->result.gc_pages); + } + } + chk_scope_restore(scope, err); + + //-------------------------------------------------------------------------- + + err = chk_scope_begin(chk, 1, MDBX_chk_space, nullptr, nullptr, + "Page allocation:"); + const double percent_boundary_reciprocal = 100.0 / txn->mt_geo.upper; + const double percent_backed_reciprocal = 100.0 / usr->result.backed_pages; + const size_t detained = usr->result.gc_pages - usr->result.reclaimable_pages; + const size_t available2boundary = txn->mt_geo.upper - + usr->result.alloc_pages + + usr->result.reclaimable_pages; + const size_t available2backed = usr->result.backed_pages - + usr->result.alloc_pages + + usr->result.reclaimable_pages; + const size_t remained2boundary = txn->mt_geo.upper - usr->result.alloc_pages; + const size_t remained2backed = + usr->result.backed_pages - usr->result.alloc_pages; + + const size_t used = (chk->flags & MDBX_CHK_SKIP_BTREE_TRAVERSAL) + ? 
usr->result.alloc_pages - usr->result.gc_pages + : usr->result.processed_pages; + + line = chk_line_begin(usr->scope, MDBX_chk_info); + line = chk_print(line, + "backed by file: %" PRIuSIZE " pages (%.1f%%)" + ", %" PRIuSIZE " left to boundary (%.1f%%)", + usr->result.backed_pages, + usr->result.backed_pages * percent_boundary_reciprocal, + txn->mt_geo.upper - usr->result.backed_pages, + (txn->mt_geo.upper - usr->result.backed_pages) * + percent_boundary_reciprocal); + line = chk_line_feed(line); + + line = chk_print( + line, "%s: %" PRIuSIZE " page(s), %.1f%% of backed, %.1f%% of boundary", + "used", used, used * percent_backed_reciprocal, + used * percent_boundary_reciprocal); + line = chk_line_feed(line); + + line = chk_print( + line, + "%s: %" PRIuSIZE " page(s) (%.1f%%) of backed, %" PRIuSIZE + " to boundary (%.1f%% of boundary)", + "remained", remained2backed, remained2backed * percent_backed_reciprocal, + remained2boundary, remained2boundary * percent_boundary_reciprocal); + line = chk_line_feed(line); + + line = chk_print( + line, + "reclaimable: %" PRIuSIZE " (%.1f%% of backed, %.1f%% of boundary)" + ", GC %" PRIuSIZE " (%.1f%% of backed, %.1f%% of boundary)", + usr->result.reclaimable_pages, + usr->result.reclaimable_pages * percent_backed_reciprocal, + usr->result.reclaimable_pages * percent_boundary_reciprocal, + usr->result.gc_pages, usr->result.gc_pages * percent_backed_reciprocal, + usr->result.gc_pages * percent_boundary_reciprocal); + line = chk_line_feed(line); + + line = chk_print( + line, + "detained by reader(s): %" PRIuSIZE + " (%.1f%% of backed, %.1f%% of boundary)" + ", %u reader(s), lag %" PRIi64, + detained, detained * percent_backed_reciprocal, + detained * percent_boundary_reciprocal, chk->envinfo.mi_numreaders, + chk->envinfo.mi_recent_txnid - chk->envinfo.mi_latter_reader_txnid); + line = chk_line_feed(line); + + line = chk_print( + line, "%s: %" PRIuSIZE " page(s), %.1f%% of backed, %.1f%% of boundary", + "allocated", 
usr->result.alloc_pages, + usr->result.alloc_pages * percent_backed_reciprocal, + usr->result.alloc_pages * percent_boundary_reciprocal); + line = chk_line_feed(line); + + line = chk_print(line, + "%s: %" PRIuSIZE " page(s) (%.1f%%) of backed, %" PRIuSIZE + " to boundary (%.1f%% of boundary)", + "available", available2backed, + available2backed * percent_backed_reciprocal, + available2boundary, + available2boundary * percent_boundary_reciprocal); + chk_line_end(line); + + line = chk_line_begin(usr->scope, MDBX_chk_resolution); + line = chk_print(line, "%s %" PRIaPGNO " pages", + (txn->mt_geo.upper == txn->mt_geo.now) ? "total" : "upto", + txn->mt_geo.upper); + line = chk_print(line, ", backed %" PRIuSIZE " (%.1f%%)", + usr->result.backed_pages, + usr->result.backed_pages * percent_boundary_reciprocal); + line = chk_print(line, ", allocated %" PRIuSIZE " (%.1f%%)", + usr->result.alloc_pages, + usr->result.alloc_pages * percent_boundary_reciprocal); + line = + chk_print(line, ", available %" PRIuSIZE " (%.1f%%)", available2boundary, + available2boundary * percent_boundary_reciprocal); + chk_line_end(line); + chk_scope_restore(scope, err); + + //-------------------------------------------------------------------------- + + if (chk->flags & MDBX_CHK_SKIP_KV_TRAVERSAL) + chk_line_end(chk_print(chk_line_begin(scope, MDBX_chk_processing), + "Skipping %s traversal...", "key-value")); + else if ((usr->result.problems_kv = usr->result.kv_tree_problems) > 0) + chk_line_end(chk_print( + chk_line_begin(scope, MDBX_chk_processing), + "Skip processing %s since %s is corrupted (%" PRIuSIZE " problem(s))", + chk_v2a(chk, MDBX_CHK_MAIN), "key-value", + usr->result.problems_kv = usr->result.kv_tree_problems)); + else { + err = + chk_scope_begin(chk, 0, MDBX_chk_traversal_maindb, &chk->subdb_main, + &usr->result.problems_kv, "Processing %s...", "MainDB"); + if (likely(!err)) + err = chk_db(usr->scope, MAIN_DBI, &chk->subdb_main, chk_handle_kv); + chk_scope_restore(scope, err); + + if 
(usr->result.problems_kv && usr->result.subdb_total) + chk_line_end(chk_print(chk_line_begin(scope, MDBX_chk_processing), + "Skip processing %s", "sub-database(s)")); + else if (usr->result.problems_kv == 0 && usr->result.subdb_total == 0) + chk_line_end(chk_print(chk_line_begin(scope, MDBX_chk_info), "No %s", + "sub-database(s)")); + else if (usr->result.problems_kv == 0 && usr->result.subdb_total) { + err = chk_scope_begin(chk, 1, MDBX_chk_traversal_subdbs, nullptr, + &usr->result.problems_kv, + "Traversal %s by txn#%" PRIaTXN "...", + "sub-database(s)", txn->mt_txnid); + if (!err) + err = chk_db(usr->scope, MAIN_DBI, &chk->subdb_main, nullptr); + if (usr->scope->subtotal_issues) + chk_line_end( + chk_print(chk_line_begin(usr->scope, MDBX_chk_resolution), + "processed %" PRIuSIZE " of %" PRIuSIZE " subDb(s)" + ", %" PRIuSIZE " problems(s)", + usr->result.subdb_processed, usr->result.subdb_total, + usr->scope->subtotal_issues)); + } + chk_scope_restore(scope, err); + } + + return chk_scope_end(chk, chk_scope_begin(chk, 0, MDBX_chk_conclude, nullptr, + nullptr, nullptr)); +} + +__cold int mdbx_env_chk_encount_problem(MDBX_chk_context_t *ctx) { + if (likely(ctx && ctx->internal && ctx->internal->usr == ctx && + ctx->internal->problem_counter && ctx->scope)) { + *ctx->internal->problem_counter += 1; + ctx->scope->subtotal_issues += 1; + return MDBX_SUCCESS; + } + return MDBX_EINVAL; +} + +__cold int mdbx_env_chk(MDBX_env *env, const struct MDBX_chk_callbacks *cb, + MDBX_chk_context_t *ctx, + const enum MDBX_chk_flags_t flags, + enum MDBX_chk_severity verbosity, + unsigned timeout_seconds_16dot16) { + int err, rc = check_env(env, false); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; + if (unlikely(!cb || !ctx || ctx->internal)) + return MDBX_EINVAL; + + MDBX_chk_internal_t *const chk = osal_calloc(1, sizeof(MDBX_chk_internal_t)); + if (unlikely(!chk)) + return MDBX_ENOMEM; + + chk->cb = cb; + chk->usr = ctx; + chk->usr->internal = chk; + chk->usr->env = env; + 
chk->flags = flags; + + chk->subdb_gc.id = -1; + chk->subdb_gc.name.iov_base = MDBX_CHK_GC; + chk->subdb[FREE_DBI] = &chk->subdb_gc; + + chk->subdb_main.id = -1; + chk->subdb_main.name.iov_base = MDBX_CHK_MAIN; + chk->subdb[MAIN_DBI] = &chk->subdb_main; + + chk->monotime_timeout = + timeout_seconds_16dot16 + ? osal_16dot16_to_monotime(timeout_seconds_16dot16) + osal_monotime() + : 0; + chk->usr->scope_nesting = 0; + chk->usr->result.subdbs = (const void *)&chk->subdb; + + MDBX_chk_scope_t *const top = chk->scope_stack; + top->verbosity = verbosity; + top->internal = chk; + + // init + rc = chk_scope_end( + chk, chk_scope_begin(chk, 0, MDBX_chk_init, nullptr, nullptr, nullptr)); + + // lock + if (likely(!rc)) + rc = chk_scope_begin( + chk, 0, MDBX_chk_lock, nullptr, nullptr, "Taking %slock...", + (env->me_flags & (MDBX_RDONLY | MDBX_EXCLUSIVE)) ? "" : "read "); + if (likely(!rc) && (env->me_flags & (MDBX_RDONLY | MDBX_EXCLUSIVE)) == 0 && + (flags & MDBX_CHK_READWRITE)) { + rc = mdbx_txn_lock(env, false); + if (unlikely(rc)) + chk_error_rc(ctx->scope, rc, "mdbx_txn_lock"); + else + chk->write_locked = true; + } + if (likely(!rc)) { + rc = mdbx_txn_begin(env, nullptr, MDBX_TXN_RDONLY, &ctx->txn); + if (unlikely(rc)) + chk_error_rc(ctx->scope, rc, "mdbx_txn_begin"); + } + chk_scope_end(chk, rc); + + // doit + if (likely(!rc)) { + chk->subdb_gc.flags = ctx->txn->mt_dbs[FREE_DBI].md_flags; + chk->subdb_main.flags = ctx->txn->mt_dbs[MAIN_DBI].md_flags; + rc = env_chk(top); + } + + // unlock + if (ctx->txn || chk->write_locked) { + chk_scope_begin(chk, 0, MDBX_chk_unlock, nullptr, nullptr, nullptr); + if (ctx->txn) { + err = mdbx_txn_abort(ctx->txn); + if (err && !rc) + rc = err; + ctx->txn = nullptr; + } + if (chk->write_locked) + mdbx_txn_unlock(env); + rc = chk_scope_end(chk, rc); + } + + // finalize + err = chk_scope_begin(chk, 0, MDBX_chk_finalize, nullptr, nullptr, nullptr); + rc = chk_scope_end(chk, err ? 
err : rc); + chk_dispose(chk); + return rc; +} + /******************************************************************************/ __dll_export @@ -29897,9 +33425,9 @@ __dll_export #ifdef __SANITIZE_ADDRESS__ " SANITIZE_ADDRESS=YES" #endif /* __SANITIZE_ADDRESS__ */ -#ifdef MDBX_USE_VALGRIND - " MDBX_USE_VALGRIND=YES" -#endif /* MDBX_USE_VALGRIND */ +#ifdef ENABLE_MEMCHECK + " ENABLE_MEMCHECK=YES" +#endif /* ENABLE_MEMCHECK */ #if MDBX_FORCE_ASSERTIONS " MDBX_FORCE_ASSERTIONS=YES" #endif /* MDBX_FORCE_ASSERTIONS */ @@ -30011,7 +33539,7 @@ const char *__asan_default_options(void) { /* https://en.wikipedia.org/wiki/Operating_system_abstraction_layer */ /* - * Copyright 2015-2023 Leonid Yuriev + * Copyright 2015-2024 Leonid Yuriev * and other libmdbx authors: please see AUTHORS file. * All rights reserved. * @@ -30253,7 +33781,7 @@ MDBX_NORETURN __cold void assert_fail(const char *msg, const char *func, unsigned line) { #endif /* MDBX_DEBUG */ - if (debug_logger) + if (mdbx_static.logger.ptr) debug_log(MDBX_LOG_FATAL, func, line, "assert: %s\n", msg); else { #if defined(_WIN32) || defined(_WIN64) @@ -30296,7 +33824,7 @@ __cold void mdbx_panic(const char *fmt, ...) { ? 
"" : message; - if (debug_logger) + if (mdbx_static.logger.ptr) debug_log(MDBX_LOG_FATAL, "panic", 0, "%s", const_message); while (1) { @@ -30512,8 +34040,18 @@ MDBX_INTERNAL_FUNC int osal_fastmutex_init(osal_fastmutex_t *fastmutex) { #if defined(_WIN32) || defined(_WIN64) InitializeCriticalSection(fastmutex); return MDBX_SUCCESS; +#elif MDBX_DEBUG + pthread_mutexattr_t ma; + int rc = pthread_mutexattr_init(&ma); + if (likely(!rc)) { + rc = pthread_mutexattr_settype(&ma, PTHREAD_MUTEX_ERRORCHECK); + if (likely(!rc) || rc == ENOTSUP) + rc = pthread_mutex_init(fastmutex, &ma); + pthread_mutexattr_destroy(&ma); + } + return rc; #else - return pthread_mutex_init(fastmutex, NULL); + return pthread_mutex_init(fastmutex, nullptr); #endif } @@ -30535,7 +34073,7 @@ MDBX_INTERNAL_FUNC int osal_fastmutex_acquire(osal_fastmutex_t *fastmutex) { 0xC0000194 /* STATUS_POSSIBLE_DEADLOCK / EXCEPTION_POSSIBLE_DEADLOCK */) ? EXCEPTION_EXECUTE_HANDLER : EXCEPTION_CONTINUE_SEARCH) { - return ERROR_POSSIBLE_DEADLOCK; + return MDBX_EDEADLK; } return MDBX_SUCCESS; #else @@ -31835,8 +35373,8 @@ MDBX_INTERNAL_FUNC int osal_check_fs_rdonly(mdbx_filehandle_t handle, #else struct statvfs info; if (err != MDBX_ENOFILE) { - if (statvfs(pathname, &info) == 0 && (info.f_flag & ST_RDONLY) == 0) - return err; + if (statvfs(pathname, &info) == 0) + return (info.f_flag & ST_RDONLY) ? 
MDBX_SUCCESS : err; if (errno != MDBX_ENOFILE) return errno; } @@ -33487,10 +37025,8 @@ __cold int mdbx_get_sysraminfo(intptr_t *page_size, intptr_t *total_pages, return MDBX_SUCCESS; } -#ifndef xMDBX_ALLOY -unsigned sys_pagesize; -MDBX_MAYBE_UNUSED unsigned sys_pagesize_ln2, sys_allocation_granularity; -#endif /* xMDBX_ALLOY */ +MDBX_INTERNAL_VAR_INSTA unsigned sys_pagesize, sys_pagesize_ln2, + sys_allocation_granularity; void osal_ctor(void) { #if MDBX_HAVE_PWRITEV && defined(_SC_IOV_MAX) @@ -33537,7 +37073,7 @@ void osal_dtor(void) {} #if MDBX_VERSION_MAJOR != 0 || \ - MDBX_VERSION_MINOR != 12 + MDBX_VERSION_MINOR != 13 #error "API version mismatch! Had `git fetch --tags` done?" #endif @@ -33557,11 +37093,11 @@ __dll_export #endif const struct MDBX_version_info mdbx_version = { 0, - 12, - 9, - 16, - {"2024-03-06T22:58:31+03:00", "c5e6e3a4f75727b9e0039ad420ae167d3487d006", "fff3fbd866c50ee3c77b244a9b05f497e06a65e8", - "v0.12.9-16-gfff3fbd8"}, + 13, + 0, + 38, + {"2024-04-04T22:31:03+03:00", "a0fc2d938419aa82764beae00e1472f412d5a4d5", "f19753636d2364c43125f972b8d3f29dc9e244b4", + "v0.13.0-38-gf1975363"}, sourcery}; __dll_export @@ -33578,7 +37114,7 @@ __dll_export #endif const char *const mdbx_sourcery_anchor = sourcery; /* - * Copyright 2015-2023 Leonid Yuriev + * Copyright 2015-2024 Leonid Yuriev * and other libmdbx authors: please see AUTHORS file. * All rights reserved. * @@ -33752,7 +37288,7 @@ static int funlock(mdbx_filehandle_t fd, size_t offset, size_t bytes) { #define DXB_BODY (env->me_psize * (size_t)NUM_METAS), DXB_MAXLEN #define DXB_WHOLE 0, DXB_MAXLEN -int mdbx_txn_lock(MDBX_env *env, bool dontwait) { +int osal_txn_lock(MDBX_env *env, bool dontwait) { if (dontwait) { if (!TryEnterCriticalSection(&env->me_windowsbug_lock)) return MDBX_BUSY; @@ -33764,16 +37300,13 @@ int mdbx_txn_lock(MDBX_env *env, bool dontwait) { 0xC0000194 /* STATUS_POSSIBLE_DEADLOCK / EXCEPTION_POSSIBLE_DEADLOCK */) ? 
EXCEPTION_EXECUTE_HANDLER : EXCEPTION_CONTINUE_SEARCH) { - return ERROR_POSSIBLE_DEADLOCK; + return MDBX_EDEADLK; } } - if (env->me_flags & MDBX_EXCLUSIVE) { - /* Zap: Failing to release lock 'env->me_windowsbug_lock' - * in function 'mdbx_txn_lock' */ - MDBX_SUPPRESS_GOOFY_MSVC_ANALYZER(26115); - return MDBX_SUCCESS; - } + eASSERT(env, !env->me_txn0->mt_owner); + if (env->me_flags & MDBX_EXCLUSIVE) + goto done; const HANDLE fd4data = env->me_overlapped_fd ? env->me_overlapped_fd : env->me_lazy_fd; @@ -33792,17 +37325,20 @@ int mdbx_txn_lock(MDBX_env *env, bool dontwait) { } } if (rc == MDBX_SUCCESS) { + done: /* Zap: Failing to release lock 'env->me_windowsbug_lock' * in function 'mdbx_txn_lock' */ MDBX_SUPPRESS_GOOFY_MSVC_ANALYZER(26115); - return rc; + env->me_txn0->mt_owner = osal_thread_self(); + return MDBX_SUCCESS; } LeaveCriticalSection(&env->me_windowsbug_lock); return (!dontwait || rc != ERROR_LOCK_VIOLATION) ? rc : MDBX_BUSY; } -void mdbx_txn_unlock(MDBX_env *env) { +void osal_txn_unlock(MDBX_env *env) { + eASSERT(env, env->me_txn0->mt_owner == osal_thread_self()); if ((env->me_flags & MDBX_EXCLUSIVE) == 0) { const HANDLE fd4data = env->me_overlapped_fd ? 
env->me_overlapped_fd : env->me_lazy_fd; @@ -33810,6 +37346,7 @@ void mdbx_txn_unlock(MDBX_env *env) { if (err != MDBX_SUCCESS) mdbx_panic("%s failed: err %u", __func__, err); } + env->me_txn0->mt_owner = 0; LeaveCriticalSection(&env->me_windowsbug_lock); } @@ -33899,7 +37436,7 @@ static int suspend_and_append(mdbx_handle_array_t **array, MDBX_INTERNAL_FUNC int osal_suspend_threads_before_remap(MDBX_env *env, mdbx_handle_array_t **array) { - eASSERT(env, (env->me_flags & MDBX_NOTLS) == 0); + eASSERT(env, (env->me_flags & MDBX_NOSTICKYTHREADS) == 0); const uintptr_t CurrentTid = GetCurrentThreadId(); int rc; if (env->me_lck_mmap.lck) { @@ -34016,7 +37553,7 @@ osal_resume_threads_after_remap(mdbx_handle_array_t *array) { * The osal_lck_downgrade() moves the locking-FSM from "exclusive write" * state to the "used" (i.e. shared) state. * - * The mdbx_lck_upgrade() moves the locking-FSM from "used" (i.e. shared) + * The osal_lck_upgrade() moves the locking-FSM from "used" (i.e. shared) * state to the "exclusive write" state. */ @@ -34189,7 +37726,7 @@ MDBX_INTERNAL_FUNC int osal_lck_downgrade(MDBX_env *env) { return MDBX_SUCCESS /* 5) now at S-? (used), done */; } -MDBX_INTERNAL_FUNC int mdbx_lck_upgrade(MDBX_env *env) { +MDBX_INTERNAL_FUNC int osal_lck_upgrade(MDBX_env *env, bool dont_wait) { /* Transite from used state (S-?) to exclusive-write (E-E) */ assert(env->me_lfd != INVALID_HANDLE_VALUE); @@ -34199,7 +37736,9 @@ MDBX_INTERNAL_FUNC int mdbx_lck_upgrade(MDBX_env *env) { /* 1) now on S-? (used), try S-E (locked) */ jitter4testing(false); - int rc = flock(env->me_lfd, LCK_EXCLUSIVE | LCK_DONTWAIT, LCK_UPPER); + int rc = flock(env->me_lfd, + dont_wait ? 
LCK_EXCLUSIVE | LCK_DONTWAIT : LCK_EXCLUSIVE, + LCK_UPPER); if (rc != MDBX_SUCCESS) { /* 2) something went wrong, give up */; VERBOSE("%s, err %u", "S-?(used) >> S-E(locked)", rc); @@ -34214,7 +37753,9 @@ MDBX_INTERNAL_FUNC int mdbx_lck_upgrade(MDBX_env *env) { /* 4) now on ?-E (middle), try E-E (exclusive-write) */ jitter4testing(false); - rc = flock(env->me_lfd, LCK_EXCLUSIVE | LCK_DONTWAIT, LCK_LOWER); + rc = flock(env->me_lfd, + dont_wait ? LCK_EXCLUSIVE | LCK_DONTWAIT : LCK_EXCLUSIVE, + LCK_LOWER); if (rc != MDBX_SUCCESS) { /* 5) something went wrong, give up */; VERBOSE("%s, err %u", "?-E(middle) >> E-E(exclusive-write)", rc); @@ -34251,7 +37792,9 @@ MDBX_INTERNAL_FUNC int osal_lck_init(MDBX_env *env, } MDBX_INTERNAL_FUNC int osal_lck_destroy(MDBX_env *env, - MDBX_env *inprocess_neighbor) { + MDBX_env *inprocess_neighbor, + const uint32_t current_pid) { + (void)current_pid; /* LY: should unmap before releasing the locks to avoid race condition and * STATUS_USER_MAPPED_FILE/ERROR_USER_MAPPED_FILE */ if (env->me_map) @@ -34260,7 +37803,7 @@ MDBX_INTERNAL_FUNC int osal_lck_destroy(MDBX_env *env, const bool synced = env->me_lck_mmap.lck->mti_unsynced_pages.weak == 0; osal_munmap(&env->me_lck_mmap); if (synced && !inprocess_neighbor && env->me_lfd != INVALID_HANDLE_VALUE && - mdbx_lck_upgrade(env) == MDBX_SUCCESS) + osal_lck_upgrade(env, true) == MDBX_SUCCESS) /* this will fail if LCK is used/mmapped by other process(es) */ osal_ftruncate(env->me_lfd, 0); } @@ -34487,7 +38030,7 @@ static void mdbx_winnt_import(void) { #endif /* Windows LCK-implementation */ /* - * Copyright 2015-2023 Leonid Yuriev + * Copyright 2015-2024 Leonid Yuriev * and other libmdbx authors: please see AUTHORS file. * All rights reserved. 
* @@ -34514,10 +38057,9 @@ static void mdbx_winnt_import(void) { #include -#ifndef xMDBX_ALLOY -uint32_t linux_kernel_version; -bool mdbx_RunningOnWSL1; -#endif /* xMDBX_ALLOY */ +MDBX_INTERNAL_VAR_INSTA uint32_t linux_kernel_version; +MDBX_INTERNAL_VAR_INSTA bool + mdbx_RunningOnWSL1 /* Windows Subsystem 1 for Linux */; MDBX_EXCLUDE_FOR_GPROF __cold static uint8_t probe_for_WSL(const char *tag) { @@ -34607,7 +38149,7 @@ mdbx_global_destructor(void) { * - Блокировка таблицы читателей для регистрации, * т.е. функции osal_rdt_lock() и osal_rdt_unlock(). * - Блокировка БД для пишущих транзакций, - * т.е. функции mdbx_txn_lock() и mdbx_txn_unlock(). + * т.е. функции osal_txn_lock() и osal_txn_unlock(). * * Остальной функционал реализуется отдельно посредством файловых блокировок: * - Первоначальный захват БД в режиме exclusive/shared и последующий перевод @@ -34657,7 +38199,7 @@ mdbx_global_destructor(void) { static int op_setlk, op_setlkw, op_getlk; __cold static void choice_fcntl(void) { assert(!op_setlk && !op_setlkw && !op_getlk); - if ((runtime_flags & MDBX_DBG_LEGACY_MULTIOPEN) == 0 + if ((mdbx_static.flags & MDBX_DBG_LEGACY_MULTIOPEN) == 0 #if defined(__linux__) || defined(__gnu_linux__) && linux_kernel_version > 0x030f0000 /* OFD locks are available since 3.15, but engages here @@ -34781,7 +38323,7 @@ MDBX_INTERNAL_FUNC int osal_rpid_check(MDBX_env *env, uint32_t pid) { /*---------------------------------------------------------------------------*/ #if MDBX_LOCKING > MDBX_LOCKING_SYSV -MDBX_INTERNAL_FUNC int osal_ipclock_stub(osal_ipclock_t *ipc) { +MDBX_INTERNAL_FUNC int osal_ipclock_stubinit(osal_ipclock_t *ipc) { #if MDBX_LOCKING == MDBX_LOCKING_POSIX1988 return sem_init(ipc, false, 1) ? 
errno : 0; #elif MDBX_LOCKING == MDBX_LOCKING_POSIX2001 || \ @@ -35014,15 +38556,42 @@ MDBX_INTERNAL_FUNC int osal_lck_downgrade(MDBX_env *env) { return rc; } -__cold MDBX_INTERNAL_FUNC int osal_lck_destroy(MDBX_env *env, - MDBX_env *inprocess_neighbor) { +MDBX_INTERNAL_FUNC int osal_lck_upgrade(MDBX_env *env, bool dont_wait) { + assert(env->me_lfd != INVALID_HANDLE_VALUE); if (unlikely(osal_getpid() != env->me_pid)) return MDBX_PANIC; + const int cmd = dont_wait ? op_setlk : op_setlkw; + int rc = lck_op(env->me_lfd, cmd, F_WRLCK, 0, 1); + if (rc == MDBX_SUCCESS && (env->me_flags & MDBX_EXCLUSIVE) == 0) { + rc = (env->me_pid > 1) + ? lck_op(env->me_lazy_fd, cmd, F_WRLCK, 0, env->me_pid - 1) + : MDBX_SUCCESS; + if (rc == MDBX_SUCCESS) { + rc = lck_op(env->me_lazy_fd, cmd, F_WRLCK, env->me_pid + 1, + OFF_T_MAX - env->me_pid - 1); + if (rc != MDBX_SUCCESS && env->me_pid > 1 && + lck_op(env->me_lazy_fd, op_setlk, F_UNLCK, 0, env->me_pid - 1)) + rc = MDBX_PANIC; + } + if (rc != MDBX_SUCCESS && lck_op(env->me_lfd, op_setlk, F_RDLCK, 0, 1)) + rc = MDBX_PANIC; + } + if (unlikely(rc != 0)) { + ERROR("%s, err %u", "lck", rc); + assert(MDBX_IS_ERROR(rc)); + } + return rc; +} + +__cold MDBX_INTERNAL_FUNC int osal_lck_destroy(MDBX_env *env, + MDBX_env *inprocess_neighbor, + const uint32_t current_pid) { + eASSERT(env, osal_getpid() == current_pid); int rc = MDBX_SUCCESS; struct stat lck_info; - MDBX_lockinfo *lck = env->me_lck_mmap.lck; - if (env->me_lfd != INVALID_HANDLE_VALUE && !inprocess_neighbor && lck && + MDBX_lockinfo *lck = env->me_lck; + if (lck && lck == env->me_lck_mmap.lck && !inprocess_neighbor && /* try get exclusive access */ lck_op(env->me_lfd, op_setlk, F_WRLCK, 0, OFF_T_MAX) == 0 && /* if LCK was not removed */ @@ -35031,7 +38600,8 @@ __cold MDBX_INTERNAL_FUNC int osal_lck_destroy(MDBX_env *env, (env->me_flags & MDBX_RDONLY) ? 
F_RDLCK : F_WRLCK, 0, OFF_T_MAX) == 0) { - VERBOSE("%p got exclusive, drown locks", (void *)env); + VERBOSE("%p got exclusive, drown ipc-locks", (void *)env); + eASSERT(env, current_pid == env->me_pid); #if MDBX_LOCKING == MDBX_LOCKING_SYSV if (env->me_sysv_ipc.semid != -1) rc = semctl(env->me_sysv_ipc.semid, 2, IPC_RMID) ? errno : 0; @@ -35045,13 +38615,20 @@ __cold MDBX_INTERNAL_FUNC int osal_lck_destroy(MDBX_env *env, if (rc == 0) { const bool synced = lck->mti_unsynced_pages.weak == 0; osal_munmap(&env->me_lck_mmap); - if (synced) + if (synced && env->me_lfd != INVALID_HANDLE_VALUE) rc = ftruncate(env->me_lfd, 0) ? errno : 0; } jitter4testing(false); } + if (current_pid != env->me_pid) { + eASSERT(env, !inprocess_neighbor); + NOTICE("drown env %p after-fork pid %d -> %d", + __Wpedantic_format_voidptr(env), env->me_pid, current_pid); + inprocess_neighbor = nullptr; + } + /* 1) POSIX's fcntl() locks (i.e. when op_setlk == F_SETLK) should be restored * after file was closed. * @@ -35248,7 +38825,7 @@ __cold MDBX_INTERNAL_FUNC int osal_lck_init(MDBX_env *env, #endif /* MDBX_LOCKING > 0 */ } -__cold static int mdbx_ipclock_failed(MDBX_env *env, osal_ipclock_t *ipc, +__cold static int osal_ipclock_failed(MDBX_env *env, osal_ipclock_t *ipc, const int err) { int rc = err; #if MDBX_LOCKING == MDBX_LOCKING_POSIX2008 || MDBX_LOCKING == MDBX_LOCKING_SYSV @@ -35309,11 +38886,6 @@ __cold static int mdbx_ipclock_failed(MDBX_env *env, osal_ipclock_t *ipc, #error "FIXME" #endif /* MDBX_LOCKING */ -#if defined(MDBX_USE_VALGRIND) || defined(__SANITIZE_ADDRESS__) - if (rc == EDEADLK && atomic_load32(&env->me_ignore_EDEADLK, mo_Relaxed) > 0) - return rc; -#endif /* MDBX_USE_VALGRIND || __SANITIZE_ADDRESS__ */ - ERROR("mutex (un)lock failed, %s", mdbx_strerror(err)); if (rc != EDEADLK) env->me_flags |= MDBX_FATAL_ERROR; @@ -35339,7 +38911,7 @@ MDBX_INTERNAL_FUNC int osal_check_tid4bionic(void) { } #endif /* __ANDROID_API__ || ANDROID) || BIONIC */ -static int 
mdbx_ipclock_lock(MDBX_env *env, osal_ipclock_t *ipc, +static int osal_ipclock_lock(MDBX_env *env, osal_ipclock_t *ipc, const bool dont_wait) { #if MDBX_LOCKING == MDBX_LOCKING_POSIX2001 || \ MDBX_LOCKING == MDBX_LOCKING_POSIX2008 @@ -35375,63 +38947,87 @@ static int mdbx_ipclock_lock(MDBX_env *env, osal_ipclock_t *ipc, #endif /* MDBX_LOCKING */ if (unlikely(rc != MDBX_SUCCESS && rc != MDBX_BUSY)) - rc = mdbx_ipclock_failed(env, ipc, rc); + rc = osal_ipclock_failed(env, ipc, rc); return rc; } -static int mdbx_ipclock_unlock(MDBX_env *env, osal_ipclock_t *ipc) { +int osal_ipclock_unlock(MDBX_env *env, osal_ipclock_t *ipc) { + int err = MDBX_ENOSYS; #if MDBX_LOCKING == MDBX_LOCKING_POSIX2001 || \ MDBX_LOCKING == MDBX_LOCKING_POSIX2008 - int rc = pthread_mutex_unlock(ipc); - (void)env; + err = pthread_mutex_unlock(ipc); #elif MDBX_LOCKING == MDBX_LOCKING_POSIX1988 - int rc = sem_post(ipc) ? errno : MDBX_SUCCESS; - (void)env; + err = sem_post(ipc) ? errno : MDBX_SUCCESS; #elif MDBX_LOCKING == MDBX_LOCKING_SYSV if (unlikely(*ipc != (pid_t)env->me_pid)) - return EPERM; - *ipc = 0; - struct sembuf op = {.sem_num = (ipc != &env->me_lck->mti_wlock), - .sem_op = 1, - .sem_flg = SEM_UNDO}; - int rc = semop(env->me_sysv_ipc.semid, &op, 1) ? errno : MDBX_SUCCESS; + err = EPERM; + else { + *ipc = 0; + struct sembuf op = {.sem_num = (ipc != &env->me_lck->mti_wlock), + .sem_op = 1, + .sem_flg = SEM_UNDO}; + err = semop(env->me_sysv_ipc.semid, &op, 1) ? errno : MDBX_SUCCESS; + } #else #error "FIXME" #endif /* MDBX_LOCKING */ + int rc = err; + if (unlikely(rc != MDBX_SUCCESS)) { + const uint32_t current_pid = osal_getpid(); + if (current_pid == env->me_pid || LOG_ENABLED(MDBX_LOG_NOTICE)) + debug_log((current_pid == env->me_pid) + ? MDBX_LOG_FATAL + : (rc = MDBX_SUCCESS, MDBX_LOG_NOTICE), + "ipc-unlock()", __LINE__, "failed: env %p, lck-%s %p, err %d\n", + __Wpedantic_format_voidptr(env), + (env->me_lck == env->me_lck_mmap.lck) ? 
"mmap" : "stub", + __Wpedantic_format_voidptr(env->me_lck), err); + } return rc; } MDBX_INTERNAL_FUNC int osal_rdt_lock(MDBX_env *env) { TRACE("%s", ">>"); jitter4testing(true); - int rc = mdbx_ipclock_lock(env, &env->me_lck->mti_rlock, false); + int rc = osal_ipclock_lock(env, &env->me_lck->mti_rlock, false); TRACE("<< rc %d", rc); return rc; } MDBX_INTERNAL_FUNC void osal_rdt_unlock(MDBX_env *env) { TRACE("%s", ">>"); - int rc = mdbx_ipclock_unlock(env, &env->me_lck->mti_rlock); - TRACE("<< rc %d", rc); - if (unlikely(rc != MDBX_SUCCESS)) - mdbx_panic("%s() failed: err %d\n", __func__, rc); + int err = osal_ipclock_unlock(env, &env->me_lck->mti_rlock); + TRACE("<< err %d", err); + if (unlikely(err != MDBX_SUCCESS)) + mdbx_panic("%s() failed: err %d\n", __func__, err); jitter4testing(true); } -int mdbx_txn_lock(MDBX_env *env, bool dont_wait) { +int osal_txn_lock(MDBX_env *env, bool dont_wait) { TRACE("%swait %s", dont_wait ? "dont-" : "", ">>"); jitter4testing(true); - int rc = mdbx_ipclock_lock(env, &env->me_lck->mti_wlock, dont_wait); - TRACE("<< rc %d", rc); - return MDBX_IS_ERROR(rc) ? 
rc : MDBX_SUCCESS; + const int err = osal_ipclock_lock(env, &env->me_lck->mti_wlock, dont_wait); + int rc = err; + if (likely(!MDBX_IS_ERROR(err))) { + eASSERT(env, !env->me_txn0->mt_owner || + err == /* если другой поток в этом-же процессе завершился + не освободив блокировку */ + MDBX_RESULT_TRUE); + env->me_txn0->mt_owner = osal_thread_self(); + rc = MDBX_SUCCESS; + } + TRACE("<< err %d, rc %d", err, rc); + return rc; } -void mdbx_txn_unlock(MDBX_env *env) { +void osal_txn_unlock(MDBX_env *env) { TRACE("%s", ">>"); - int rc = mdbx_ipclock_unlock(env, &env->me_lck->mti_wlock); - TRACE("<< rc %d", rc); - if (unlikely(rc != MDBX_SUCCESS)) - mdbx_panic("%s() failed: err %d\n", __func__, rc); + eASSERT(env, env->me_txn0->mt_owner == osal_thread_self()); + env->me_txn0->mt_owner = 0; + int err = osal_ipclock_unlock(env, &env->me_lck->mti_wlock); + TRACE("<< err %d", err); + if (unlikely(err != MDBX_SUCCESS)) + mdbx_panic("%s() failed: err %d\n", __func__, err); jitter4testing(true); } diff --git a/mdbxdist/mdbx.c++ b/mdbxdist/mdbx.c++ index cd38971..d8bada8 100644 --- a/mdbxdist/mdbx.c++ +++ b/mdbxdist/mdbx.c++ @@ -1,5 +1,5 @@ /* - * Copyright 2015-2023 Leonid Yuriev + * Copyright 2015-2024 Leonid Yuriev * and other libmdbx authors: please see AUTHORS file. * All rights reserved. * @@ -12,7 +12,7 @@ * . 
*/ #define xMDBX_ALLOY 1 -#define MDBX_BUILD_SOURCERY 0c86daff919dc09ab5e99c8e5bfb1d51e1367babca9c630c97af92a2aa988ea9_v0_12_9_16_gfff3fbd8 +#define MDBX_BUILD_SOURCERY 91ff5b5423830ee44fca4b70dcb298f233338a17a3185c44df67ec16d3034af3_v0_13_0_38_gf1975363 #ifdef MDBX_CONFIG_H #include MDBX_CONFIG_H #endif @@ -25,11 +25,13 @@ #ifdef xMDBX_ALLOY /* Amalgamated build */ #define MDBX_INTERNAL_FUNC static -#define MDBX_INTERNAL_VAR static +#define MDBX_INTERNAL_VAR_PROTO static +#define MDBX_INTERNAL_VAR_INSTA static #else /* Non-amalgamated build */ #define MDBX_INTERNAL_FUNC -#define MDBX_INTERNAL_VAR extern +#define MDBX_INTERNAL_VAR_PROTO extern +#define MDBX_INTERNAL_VAR_INSTA #endif /* xMDBX_ALLOY */ /*----------------------------------------------------------------------------*/ @@ -94,6 +96,10 @@ disable : 5105) /* winbase.h(9531): warning C5105: macro expansion \ producing 'defined' has undefined behavior */ #endif +#if _MSC_VER < 1920 +/* avoid "error C2219: syntax error: type qualifier must be after '*'" */ +#define __restrict +#endif #if _MSC_VER > 1930 #pragma warning(disable : 6235) /* is always a constant */ #pragma warning(disable : 6237) /* is never evaluated and might \ @@ -139,7 +145,7 @@ #include "mdbx.h++" /* - * Copyright 2015-2023 Leonid Yuriev + * Copyright 2015-2024 Leonid Yuriev * and other libmdbx authors: please see AUTHORS file. * All rights reserved. 
* @@ -187,6 +193,7 @@ #include #include +#include #include #include #include @@ -824,7 +831,7 @@ __extern_C key_t ftok(const char *, int); /*----------------------------------------------------------------------------*/ -#if defined(MDBX_USE_VALGRIND) +#if defined(ENABLE_MEMCHECK) #include #ifndef VALGRIND_DISABLE_ADDR_ERROR_REPORTING_IN_RANGE /* LY: available since Valgrind 3.10 */ @@ -846,7 +853,7 @@ __extern_C key_t ftok(const char *, int); #define VALGRIND_CHECK_MEM_IS_ADDRESSABLE(a, s) (0) #define VALGRIND_CHECK_MEM_IS_DEFINED(a, s) (0) #define RUNNING_ON_VALGRIND (0) -#endif /* MDBX_USE_VALGRIND */ +#endif /* ENABLE_MEMCHECK */ #ifdef __SANITIZE_ADDRESS__ #include @@ -993,7 +1000,7 @@ extern "C" { /* https://en.wikipedia.org/wiki/Operating_system_abstraction_layer */ /* - * Copyright 2015-2023 Leonid Yuriev + * Copyright 2015-2024 Leonid Yuriev * and other libmdbx authors: please see AUTHORS file. * All rights reserved. * @@ -1197,8 +1204,8 @@ typedef pthread_mutex_t osal_fastmutex_t; /*----------------------------------------------------------------------------*/ /* OS abstraction layer stuff */ -MDBX_INTERNAL_VAR unsigned sys_pagesize; -MDBX_MAYBE_UNUSED MDBX_INTERNAL_VAR unsigned sys_pagesize_ln2, +MDBX_INTERNAL_VAR_PROTO unsigned sys_pagesize; +MDBX_MAYBE_UNUSED MDBX_INTERNAL_VAR_PROTO unsigned sys_pagesize_ln2, sys_allocation_granularity; /* Get the size of a memory page for the system. 
@@ -1462,8 +1469,9 @@ MDBX_MAYBE_UNUSED static __inline void jitter4testing(bool tiny); #endif #if defined(__linux__) || defined(__gnu_linux__) -MDBX_INTERNAL_VAR uint32_t linux_kernel_version; -MDBX_INTERNAL_VAR bool mdbx_RunningOnWSL1 /* Windows Subsystem 1 for Linux */; +MDBX_INTERNAL_VAR_PROTO uint32_t linux_kernel_version; +MDBX_INTERNAL_VAR_PROTO bool + mdbx_RunningOnWSL1 /* Windows Subsystem 1 for Linux */; #endif /* Linux */ #ifndef osal_strdup @@ -1677,7 +1685,8 @@ MDBX_INTERNAL_FUNC int osal_lck_init(MDBX_env *env, /// restore POSIX-fcntl locks after the closing of file descriptors. /// \return Error code (MDBX_PANIC) or zero on success. MDBX_INTERNAL_FUNC int osal_lck_destroy(MDBX_env *env, - MDBX_env *inprocess_neighbor); + MDBX_env *inprocess_neighbor, + const uint32_t current_pid); /// \brief Connects to shared interprocess locking objects and tries to acquire /// the maximum lock level (shared if exclusive is not available) @@ -1705,6 +1714,8 @@ MDBX_INTERNAL_FUNC int osal_lck_seize(MDBX_env *env); /// operational lock. /// \return Error code or zero on success MDBX_INTERNAL_FUNC int osal_lck_downgrade(MDBX_env *env); +MDBX_MAYBE_UNUSED MDBX_INTERNAL_FUNC int osal_lck_upgrade(MDBX_env *env, + bool dont_wait); /// \brief Locks LCK-file or/and table of readers for (de)registering. /// \return Error code or zero on success @@ -1713,16 +1724,12 @@ MDBX_INTERNAL_FUNC int osal_rdt_lock(MDBX_env *env); /// \brief Unlocks LCK-file or/and table of readers after (de)registering. MDBX_INTERNAL_FUNC void osal_rdt_unlock(MDBX_env *env); -/// \brief Acquires lock for DB change (on writing transaction start) -/// Reading transactions will not be blocked. -/// Declared as LIBMDBX_API because it is used in mdbx_chk. +/// \brief Acquires write-transaction lock. 
/// \return Error code or zero on success -LIBMDBX_API int mdbx_txn_lock(MDBX_env *env, bool dont_wait); +MDBX_INTERNAL_FUNC int osal_txn_lock(MDBX_env *env, bool dont_wait); -/// \brief Releases lock once DB changes is made (after writing transaction -/// has finished). -/// Declared as LIBMDBX_API because it is used in mdbx_chk. -LIBMDBX_API void mdbx_txn_unlock(MDBX_env *env); +/// \brief Releases write-transaction lock.. +MDBX_INTERNAL_FUNC void osal_txn_unlock(MDBX_env *env); /// \brief Sets alive-flag of reader presence (indicative lock) for PID of /// the current process. The function does no more than needed for @@ -1751,7 +1758,7 @@ MDBX_INTERNAL_FUNC int osal_rpid_check(MDBX_env *env, uint32_t pid); MDBX_INTERNAL_FUNC int osal_mb2w(const char *const src, wchar_t **const pdst); typedef void(WINAPI *osal_srwlock_t_function)(osal_srwlock_t *); -MDBX_INTERNAL_VAR osal_srwlock_t_function osal_srwlock_Init, +MDBX_INTERNAL_VAR_PROTO osal_srwlock_t_function osal_srwlock_Init, osal_srwlock_AcquireShared, osal_srwlock_ReleaseShared, osal_srwlock_AcquireExclusive, osal_srwlock_ReleaseExclusive; @@ -1804,7 +1811,7 @@ typedef struct _FILE_REMOTE_PROTOCOL_INFO { typedef BOOL(WINAPI *MDBX_GetFileInformationByHandleEx)( _In_ HANDLE hFile, _In_ FILE_INFO_BY_HANDLE_CLASS FileInformationClass, _Out_ LPVOID lpFileInformation, _In_ DWORD dwBufferSize); -MDBX_INTERNAL_VAR MDBX_GetFileInformationByHandleEx +MDBX_INTERNAL_VAR_PROTO MDBX_GetFileInformationByHandleEx mdbx_GetFileInformationByHandleEx; typedef BOOL(WINAPI *MDBX_GetVolumeInformationByHandleW)( @@ -1813,19 +1820,20 @@ typedef BOOL(WINAPI *MDBX_GetVolumeInformationByHandleW)( _Out_opt_ LPDWORD lpMaximumComponentLength, _Out_opt_ LPDWORD lpFileSystemFlags, _Out_opt_ LPWSTR lpFileSystemNameBuffer, _In_ DWORD nFileSystemNameSize); -MDBX_INTERNAL_VAR MDBX_GetVolumeInformationByHandleW +MDBX_INTERNAL_VAR_PROTO MDBX_GetVolumeInformationByHandleW mdbx_GetVolumeInformationByHandleW; typedef DWORD(WINAPI 
*MDBX_GetFinalPathNameByHandleW)(_In_ HANDLE hFile, _Out_ LPWSTR lpszFilePath, _In_ DWORD cchFilePath, _In_ DWORD dwFlags); -MDBX_INTERNAL_VAR MDBX_GetFinalPathNameByHandleW mdbx_GetFinalPathNameByHandleW; +MDBX_INTERNAL_VAR_PROTO MDBX_GetFinalPathNameByHandleW + mdbx_GetFinalPathNameByHandleW; typedef BOOL(WINAPI *MDBX_SetFileInformationByHandle)( _In_ HANDLE hFile, _In_ FILE_INFO_BY_HANDLE_CLASS FileInformationClass, _Out_ LPVOID lpFileInformation, _In_ DWORD dwBufferSize); -MDBX_INTERNAL_VAR MDBX_SetFileInformationByHandle +MDBX_INTERNAL_VAR_PROTO MDBX_SetFileInformationByHandle mdbx_SetFileInformationByHandle; typedef NTSTATUS(NTAPI *MDBX_NtFsControlFile)( @@ -1834,10 +1842,10 @@ typedef NTSTATUS(NTAPI *MDBX_NtFsControlFile)( OUT PIO_STATUS_BLOCK IoStatusBlock, IN ULONG FsControlCode, IN OUT PVOID InputBuffer, IN ULONG InputBufferLength, OUT OPTIONAL PVOID OutputBuffer, IN ULONG OutputBufferLength); -MDBX_INTERNAL_VAR MDBX_NtFsControlFile mdbx_NtFsControlFile; +MDBX_INTERNAL_VAR_PROTO MDBX_NtFsControlFile mdbx_NtFsControlFile; typedef uint64_t(WINAPI *MDBX_GetTickCount64)(void); -MDBX_INTERNAL_VAR MDBX_GetTickCount64 mdbx_GetTickCount64; +MDBX_INTERNAL_VAR_PROTO MDBX_GetTickCount64 mdbx_GetTickCount64; #if !defined(_WIN32_WINNT_WIN8) || _WIN32_WINNT < _WIN32_WINNT_WIN8 typedef struct _WIN32_MEMORY_RANGE_ENTRY { @@ -1849,13 +1857,13 @@ typedef struct _WIN32_MEMORY_RANGE_ENTRY { typedef BOOL(WINAPI *MDBX_PrefetchVirtualMemory)( HANDLE hProcess, ULONG_PTR NumberOfEntries, PWIN32_MEMORY_RANGE_ENTRY VirtualAddresses, ULONG Flags); -MDBX_INTERNAL_VAR MDBX_PrefetchVirtualMemory mdbx_PrefetchVirtualMemory; +MDBX_INTERNAL_VAR_PROTO MDBX_PrefetchVirtualMemory mdbx_PrefetchVirtualMemory; typedef enum _SECTION_INHERIT { ViewShare = 1, ViewUnmap = 2 } SECTION_INHERIT; typedef NTSTATUS(NTAPI *MDBX_NtExtendSection)(IN HANDLE SectionHandle, IN PLARGE_INTEGER NewSectionSize); -MDBX_INTERNAL_VAR MDBX_NtExtendSection mdbx_NtExtendSection; +MDBX_INTERNAL_VAR_PROTO 
MDBX_NtExtendSection mdbx_NtExtendSection; static __inline bool mdbx_RunningUnderWine(void) { return !mdbx_NtExtendSection; @@ -1865,14 +1873,15 @@ typedef LSTATUS(WINAPI *MDBX_RegGetValueA)(HKEY hkey, LPCSTR lpSubKey, LPCSTR lpValue, DWORD dwFlags, LPDWORD pdwType, PVOID pvData, LPDWORD pcbData); -MDBX_INTERNAL_VAR MDBX_RegGetValueA mdbx_RegGetValueA; +MDBX_INTERNAL_VAR_PROTO MDBX_RegGetValueA mdbx_RegGetValueA; NTSYSAPI ULONG RtlRandomEx(PULONG Seed); typedef BOOL(WINAPI *MDBX_SetFileIoOverlappedRange)(HANDLE FileHandle, PUCHAR OverlappedRangeStart, ULONG Length); -MDBX_INTERNAL_VAR MDBX_SetFileIoOverlappedRange mdbx_SetFileIoOverlappedRange; +MDBX_INTERNAL_VAR_PROTO MDBX_SetFileIoOverlappedRange + mdbx_SetFileIoOverlappedRange; #endif /* Windows */ @@ -2102,7 +2111,7 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor; #error MDBX_DPL_PREALLOC_FOR_RADIXSORT must be defined as 0 or 1 #endif /* MDBX_DPL_PREALLOC_FOR_RADIXSORT */ -/** Controls dirty pages tracking, spilling and persisting in MDBX_WRITEMAP +/** Controls dirty pages tracking, spilling and persisting in `MDBX_WRITEMAP` * mode. 0/OFF = Don't track dirty pages at all, don't spill ones, and use * msync() to persist data. This is by-default on Linux and other systems where * kernel provides properly LRU tracking and effective flushing on-demand. 1/ON @@ -2119,6 +2128,22 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor; #error MDBX_AVOID_MSYNC must be defined as 0 or 1 #endif /* MDBX_AVOID_MSYNC */ +/** Управляет механизмом поддержки разреженных наборов DBI-хендлов для снижения + * накладных расходов при запуске и обработке транзакций. 
*/ +#ifndef MDBX_ENABLE_DBI_SPARSE +#define MDBX_ENABLE_DBI_SPARSE 1 +#elif !(MDBX_ENABLE_DBI_SPARSE == 0 || MDBX_ENABLE_DBI_SPARSE == 1) +#error MDBX_ENABLE_DBI_SPARSE must be defined as 0 or 1 +#endif /* MDBX_ENABLE_DBI_SPARSE */ + +/** Управляет механизмом отложенного освобождения и поддержки пути быстрого + * открытия DBI-хендлов без захвата блокировок. */ +#ifndef MDBX_ENABLE_DBI_LOCKFREE +#define MDBX_ENABLE_DBI_LOCKFREE 1 +#elif !(MDBX_ENABLE_DBI_LOCKFREE == 0 || MDBX_ENABLE_DBI_LOCKFREE == 1) +#error MDBX_ENABLE_DBI_LOCKFREE must be defined as 0 or 1 +#endif /* MDBX_ENABLE_DBI_LOCKFREE */ + /** Controls sort order of internal page number lists. * This mostly experimental/advanced option with not for regular MDBX users. * \warning The database format depend on this option and libmdbx built with @@ -2166,8 +2191,8 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor; /** If defined then enables integration with Valgrind, * a memory analyzing tool. */ -#ifndef MDBX_USE_VALGRIND -#endif /* MDBX_USE_VALGRIND */ +#ifndef ENABLE_MEMCHECK +#endif /* ENABLE_MEMCHECK */ /** If defined then enables use C11 atomics, * otherwise detects ones availability automatically. 
*/ @@ -2490,13 +2515,23 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor; #define MDBX_RUNTIME_FLAGS_INIT \ ((MDBX_DEBUG) > 0) * MDBX_DBG_ASSERT + ((MDBX_DEBUG) > 1) * MDBX_DBG_AUDIT -extern uint8_t runtime_flags; -extern uint8_t loglevel; -extern MDBX_debug_func *debug_logger; +union logger_union { + void *ptr; + MDBX_debug_func *fmt; + MDBX_debug_func_nofmt *nofmt; +}; + +MDBX_INTERNAL_VAR_PROTO struct mdbx_static { + uint8_t flags; + uint8_t loglevel; + union logger_union logger; + size_t logger_buffer_size; + char *logger_buffer; +} mdbx_static; MDBX_MAYBE_UNUSED static __inline void jitter4testing(bool tiny) { #if MDBX_DEBUG - if (MDBX_DBG_JITTER & runtime_flags) + if (MDBX_DBG_JITTER & mdbx_static.flags) osal_jitter(tiny); #else (void)tiny; @@ -2510,17 +2545,17 @@ MDBX_INTERNAL_FUNC void debug_log_va(int level, const char *function, int line, const char *fmt, va_list args); #if MDBX_DEBUG -#define LOG_ENABLED(msg) unlikely(msg <= loglevel) -#define AUDIT_ENABLED() unlikely((runtime_flags & MDBX_DBG_AUDIT)) +#define LOG_ENABLED(msg) unlikely(msg <= mdbx_static.loglevel) +#define AUDIT_ENABLED() unlikely((mdbx_static.flags & MDBX_DBG_AUDIT)) #else /* MDBX_DEBUG */ -#define LOG_ENABLED(msg) (msg < MDBX_LOG_VERBOSE && msg <= loglevel) +#define LOG_ENABLED(msg) (msg < MDBX_LOG_VERBOSE && msg <= mdbx_static.loglevel) #define AUDIT_ENABLED() (0) #endif /* MDBX_DEBUG */ #if MDBX_FORCE_ASSERTIONS #define ASSERT_ENABLED() (1) #elif MDBX_DEBUG -#define ASSERT_ENABLED() likely((runtime_flags & MDBX_DBG_ASSERT)) +#define ASSERT_ENABLED() likely((mdbx_static.flags & MDBX_DBG_ASSERT)) #else #define ASSERT_ENABLED() (0) #endif /* assertions */ @@ -2955,7 +2990,8 @@ typedef struct MDBX_page { #define PAGETYPE_WHOLE(p) ((uint8_t)(p)->mp_flags) -/* Drop legacy P_DIRTY flag for sub-pages for compatilibity */ +/* Drop legacy P_DIRTY flag for sub-pages for compatilibity, + * for assertions only. */ #define PAGETYPE_COMPAT(p) \ (unlikely(PAGETYPE_WHOLE(p) & P_SUBP) \ ? 
PAGETYPE_WHOLE(p) & ~(P_SUBP | P_LEGACY_DIRTY) \ @@ -3064,7 +3100,7 @@ typedef sem_t osal_ipclock_t; #endif /* MDBX_LOCKING */ #if MDBX_LOCKING > MDBX_LOCKING_SYSV && !defined(__cplusplus) -MDBX_INTERNAL_FUNC int osal_ipclock_stub(osal_ipclock_t *ipc); +MDBX_INTERNAL_FUNC int osal_ipclock_stubinit(osal_ipclock_t *ipc); MDBX_INTERNAL_FUNC int osal_ipclock_destroy(osal_ipclock_t *ipc); #endif /* MDBX_LOCKING */ @@ -3077,8 +3113,9 @@ MDBX_INTERNAL_FUNC int osal_ipclock_destroy(osal_ipclock_t *ipc); * read transactions started by the same thread need no further locking to * proceed. * - * If MDBX_NOTLS is set, the slot address is not saved in thread-specific data. - * No reader table is used if the database is on a read-only filesystem. + * If MDBX_NOSTICKYTHREADS is set, the slot address is not saved in + * thread-specific data. No reader table is used if the database is on a + * read-only filesystem. * * Since the database uses multi-version concurrency control, readers don't * actually need any locking. 
This table is used to keep track of which @@ -3388,10 +3425,10 @@ typedef struct troika { #if MDBX_WORDBITS > 32 /* Workaround for false-positives from Valgrind */ uint32_t unused_pad; #endif -#define TROIKA_HAVE_STEADY(troika) ((troika)->fsm & 7) -#define TROIKA_STRICT_VALID(troika) ((troika)->tail_and_flags & 64) -#define TROIKA_VALID(troika) ((troika)->tail_and_flags & 128) -#define TROIKA_TAIL(troika) ((troika)->tail_and_flags & 3) +#define TROIKA_HAVE_STEADY(troika) ((troika)->fsm & 7u) +#define TROIKA_STRICT_VALID(troika) ((troika)->tail_and_flags & 64u) +#define TROIKA_VALID(troika) ((troika)->tail_and_flags & 128u) +#define TROIKA_TAIL(troika) ((troika)->tail_and_flags & 3u) txnid_t txnid[NUM_METAS]; } meta_troika_t; @@ -3421,6 +3458,8 @@ struct MDBX_txn { #error "Oops, some txn flags overlapped or wrong" #endif uint32_t mt_flags; + unsigned mt_numdbs; + size_t mt_owner; /* thread ID that owns this transaction */ MDBX_txn *mt_parent; /* parent of a nested txn */ /* Nested txn under this txn, set together with flag MDBX_TXN_HAS_CHILD */ @@ -3438,31 +3477,30 @@ struct MDBX_txn { txnid_t mt_front; MDBX_env *mt_env; /* the DB environment */ - /* Array of records for each DB known in the environment. 
*/ - MDBX_dbx *mt_dbxs; /* Array of MDBX_db records for each known DB */ MDBX_db *mt_dbs; - /* Array of sequence numbers for each DB handle */ - MDBX_atomic_uint32_t *mt_dbiseqs; - - /* Transaction DBI Flags */ -#define DBI_DIRTY MDBX_DBI_DIRTY /* DB was written in this txn */ -#define DBI_STALE MDBX_DBI_STALE /* Named-DB record is older than txnID */ -#define DBI_FRESH MDBX_DBI_FRESH /* Named-DB handle opened in this txn */ -#define DBI_CREAT MDBX_DBI_CREAT /* Named-DB handle created in this txn */ -#define DBI_VALID 0x10 /* DB handle is valid, see also DB_VALID */ -#define DBI_USRVALID 0x20 /* As DB_VALID, but not set for FREE_DBI */ -#define DBI_AUDITED 0x40 /* Internal flag for accounting during audit */ - /* Array of flags for each DB */ - uint8_t *mt_dbistate; - /* Number of DB records in use, or 0 when the txn is finished. - * This number only ever increments until the txn finishes; we - * don't decrement it when individual DB handles are closed. */ - MDBX_dbi mt_numdbs; - size_t mt_owner; /* thread ID that owns this transaction */ + +#if MDBX_ENABLE_DBI_SPARSE + unsigned *__restrict mt_dbi_sparse; +#endif /* MDBX_ENABLE_DBI_SPARSE */ + + /* Non-shared DBI state flags inside transaction */ +#define DBI_DIRTY 0x01 /* DB was written in this txn */ +#define DBI_STALE 0x02 /* Named-DB record is older than txnID */ +#define DBI_FRESH 0x04 /* Named-DB handle opened in this txn */ +#define DBI_CREAT 0x08 /* Named-DB handle created in this txn */ +#define DBI_VALID 0x10 /* Handle is valid, see also DB_VALID */ +#define DBI_OLDEN 0x40 /* Handle was closed/reopened outside txn */ +#define DBI_LINDO 0x80 /* Lazy initialization done for DBI-slot */ + /* Array of non-shared txn's flags of DBI */ + uint8_t *__restrict mt_dbi_state; + + /* Array of sequence numbers for each DB handle. 
*/ + uint32_t *__restrict mt_dbi_seqs; + MDBX_cursor **mt_cursors; + MDBX_canary mt_canary; void *mt_userctx; /* User-settable context */ - MDBX_cursor **mt_cursors; union { struct { @@ -3472,8 +3510,8 @@ struct MDBX_txn { struct { meta_troika_t troika; /* In write txns, array of cursors for each DB */ - MDBX_PNL relist; /* Reclaimed GC pages */ - txnid_t last_reclaimed; /* ID of last used record */ + MDBX_PNL __restrict relist; /* Reclaimed GC pages */ + txnid_t last_reclaimed; /* ID of last used record */ #if MDBX_ENABLE_REFUND pgno_t loose_refund_wl /* FIXME: describe */; #endif /* MDBX_ENABLE_REFUND */ @@ -3485,14 +3523,14 @@ struct MDBX_txn { * dirtylist into mt_parent after freeing hidden mt_parent pages. */ size_t dirtyroom; /* For write txns: Modified pages. Sorted when not MDBX_WRITEMAP. */ - MDBX_dpl *dirtylist; + MDBX_dpl *__restrict dirtylist; /* The list of reclaimed txns from GC */ - MDBX_TXL lifo_reclaimed; + MDBX_TXL __restrict lifo_reclaimed; /* The list of pages that became unused during this transaction. */ - MDBX_PNL retired_pages; + MDBX_PNL __restrict retired_pages; /* The list of loose pages that became unused and may be reused * in this transaction, linked through `mp_next`. */ - MDBX_page *loose_pages; + MDBX_page *__restrict loose_pages; /* Number of loose pages (tw.loose_pages) */ size_t loose_count; union { @@ -3501,11 +3539,12 @@ struct MDBX_txn { /* The sorted list of dirty pages we temporarily wrote to disk * because the dirty list was full. page numbers in here are * shifted left by 1, deleted slots have the LSB set. 
*/ - MDBX_PNL list; + MDBX_PNL __restrict list; } spilled; size_t writemap_dirty_npages; size_t writemap_spilled_npages; }; + uint64_t gc_time_acc; } tw; }; }; @@ -3544,8 +3583,8 @@ struct MDBX_cursor { MDBX_db *mc_db; /* The database auxiliary record for this cursor */ MDBX_dbx *mc_dbx; - /* The mt_dbistate for this database */ - uint8_t *mc_dbistate; + /* The mt_dbi_state[] for this DBI */ + uint8_t *__restrict mc_dbi_state; uint8_t mc_snum; /* number of pushed pages */ uint8_t mc_top; /* index of top page, normally mc_snum-1 */ @@ -3598,6 +3637,11 @@ typedef struct MDBX_cursor_couple { MDBX_xcursor inner; } MDBX_cursor_couple; +struct mdbx_defer_free_item { + struct mdbx_defer_free_item *next; + uint64_t timestamp; +}; + /* The database environment. */ struct MDBX_env { /* ----------------------------------------------------- mostly static part */ @@ -3615,6 +3659,7 @@ struct MDBX_env { #define MDBX_DEPRECATED_COALESCE UINT32_C(0x2000000) #define ENV_INTERNAL_FLAGS (MDBX_FATAL_ERROR | MDBX_ENV_ACTIVE | MDBX_ENV_TXKEY) uint32_t me_flags; + unsigned me_psize; /* DB page size, initialized from me_os_psize */ osal_mmap_t me_dxb_mmap; /* The main data file */ #define me_map me_dxb_mmap.base #define me_lazy_fd me_dxb_mmap.fd @@ -3627,7 +3672,6 @@ struct MDBX_env { #define me_lfd me_lck_mmap.fd struct MDBX_lockinfo *me_lck; - unsigned me_psize; /* DB page size, initialized from me_os_psize */ uint16_t me_leaf_nodemax; /* max size of a leaf-node */ uint16_t me_branch_nodemax; /* max size of a branch-node */ uint16_t me_subpage_limit; @@ -3645,13 +3689,15 @@ struct MDBX_env { MDBX_dbi me_maxdbs; /* size of the DB table */ uint32_t me_pid; /* process ID of this env */ osal_thread_key_t me_txkey; /* thread-key for readers */ - pathchar_t *me_pathname; /* path to the DB files */ - void *me_pbuf; /* scratch area for DUPSORT put() */ - MDBX_txn *me_txn0; /* preallocated write transaction */ - - MDBX_dbx *me_dbxs; /* array of static DB info */ - uint16_t *me_dbflags; /* array 
of flags from MDBX_db.md_flags */ - MDBX_atomic_uint32_t *me_dbiseqs; /* array of dbi sequence numbers */ + struct { /* path to the DB files */ + pathchar_t *lck, *dxb, *specified; + void *buffer; + } me_pathname; + void *me_pbuf; /* scratch area for DUPSORT put() */ + MDBX_txn *me_txn0; /* preallocated write transaction */ + MDBX_dbx *me_dbxs; /* array of static DB info */ + uint16_t *__restrict me_db_flags; /* array of flags from MDBX_db.md_flags */ + MDBX_atomic_uint32_t *me_dbi_seqs; /* array of dbi sequence numbers */ unsigned me_maxgc_ov1page; /* Number of pgno_t fit in a single overflow page */ unsigned me_maxgc_per_branch; @@ -3665,6 +3711,7 @@ struct MDBX_env { unsigned rp_augment_limit; unsigned dp_limit; unsigned dp_initial; + uint64_t gc_time_limit; uint8_t dp_loose_limit; uint8_t spill_max_denominator; uint8_t spill_min_denominator; @@ -3674,6 +3721,8 @@ struct MDBX_env { unsigned writethrough_threshold; #endif /* Windows */ bool prefault_write; + bool prefer_waf_insteadof_balance; /* Strive to minimize WAF instead of + balancing pages fullment */ union { unsigned all; /* tracks options with non-auto values but tuned by user */ @@ -3703,20 +3752,23 @@ struct MDBX_env { } me_sysv_ipc; #endif /* MDBX_LOCKING == MDBX_LOCKING_SYSV */ bool me_incore; + bool me_prefault_write; - MDBX_env *me_lcklist_next; +#if MDBX_ENABLE_DBI_LOCKFREE + struct mdbx_defer_free_item *me_defer_free; +#endif /* MDBX_ENABLE_DBI_LOCKFREE */ /* --------------------------------------------------- mostly volatile part */ MDBX_txn *me_txn; /* current write transaction */ osal_fastmutex_t me_dbi_lock; - MDBX_dbi me_numdbs; /* number of DBs opened */ - bool me_prefault_write; + unsigned me_numdbs; /* number of DBs opened */ - MDBX_page *me_dp_reserve; /* list of malloc'ed blocks for re-use */ unsigned me_dp_reserve_len; + MDBX_page *__restrict me_dp_reserve; /* list of malloc'ed blocks for re-use */ + /* PNL of pages that became unused in a write txn */ - MDBX_PNL me_retired_pages; + 
MDBX_PNL __restrict me_retired_pages; osal_ioring_t me_ioring; #if defined(_WIN32) || defined(_WIN64) @@ -3734,13 +3786,12 @@ struct MDBX_env { #if MDBX_DEBUG MDBX_assert_func *me_assert_func; /* Callback for assertion failures */ #endif -#ifdef MDBX_USE_VALGRIND +#ifdef ENABLE_MEMCHECK int me_valgrind_handle; #endif -#if defined(MDBX_USE_VALGRIND) || defined(__SANITIZE_ADDRESS__) - MDBX_atomic_uint32_t me_ignore_EDEADLK; +#if defined(ENABLE_MEMCHECK) || defined(__SANITIZE_ADDRESS__) pgno_t me_poison_edge; -#endif /* MDBX_USE_VALGRIND || __SANITIZE_ADDRESS__ */ +#endif /* ENABLE_MEMCHECK || __SANITIZE_ADDRESS__ */ #ifndef xMDBX_DEBUG_SPILLING #define xMDBX_DEBUG_SPILLING 0 @@ -3800,10 +3851,6 @@ osal_flush_incoherent_mmap(const void *addr, size_t nbytes, MDBX_INTERNAL_FUNC int cleanup_dead_readers(MDBX_env *env, int rlocked, int *dead); -MDBX_INTERNAL_FUNC int rthc_alloc(osal_thread_key_t *key, MDBX_reader *begin, - MDBX_reader *end); -MDBX_INTERNAL_FUNC void rthc_remove(const osal_thread_key_t key); - MDBX_INTERNAL_FUNC void global_ctor(void); MDBX_INTERNAL_FUNC void osal_ctor(void); MDBX_INTERNAL_FUNC void global_dtor(void); @@ -3918,7 +3965,8 @@ typedef struct MDBX_node { /* mdbx_dbi_open() flags */ #define DB_USABLE_FLAGS (DB_PERSISTENT_FLAGS | MDBX_CREATE | MDBX_DB_ACCEDE) -#define DB_VALID 0x8000 /* DB handle is valid, for me_dbflags */ +#define DB_VALID 0x8000u /* DB handle is valid, for me_db_flags */ +#define DB_POISON 0x7fffu /* update pending */ #define DB_INTERNAL_FLAGS DB_VALID #if DB_INTERNAL_FLAGS & DB_USABLE_FLAGS @@ -4007,11 +4055,11 @@ log2n_powerof2(size_t value_uintptr) { * environment and re-opening it with the new flags. 
*/ #define ENV_CHANGEABLE_FLAGS \ (MDBX_SAFE_NOSYNC | MDBX_NOMETASYNC | MDBX_DEPRECATED_MAPASYNC | \ - MDBX_NOMEMINIT | MDBX_COALESCE | MDBX_PAGEPERTURB | MDBX_ACCEDE | \ - MDBX_VALIDATION) + MDBX_NOMEMINIT | MDBX_DEPRECATED_COALESCE | MDBX_PAGEPERTURB | \ + MDBX_ACCEDE | MDBX_VALIDATION) #define ENV_CHANGELESS_FLAGS \ - (MDBX_NOSUBDIR | MDBX_RDONLY | MDBX_WRITEMAP | MDBX_NOTLS | MDBX_NORDAHEAD | \ - MDBX_LIFORECLAIM | MDBX_EXCLUSIVE) + (MDBX_NOSUBDIR | MDBX_RDONLY | MDBX_WRITEMAP | MDBX_NOSTICKYTHREADS | \ + MDBX_NORDAHEAD | MDBX_LIFORECLAIM | MDBX_EXCLUSIVE) #define ENV_USABLE_FLAGS (ENV_CHANGEABLE_FLAGS | ENV_CHANGELESS_FLAGS) #if !defined(__cplusplus) || CONSTEXPR_ENUM_FLAGS_OPERATIONS @@ -4044,8 +4092,38 @@ MDBX_MAYBE_UNUSED static void static_checks(void) { (size_t)(size), __LINE__); \ ASAN_UNPOISON_MEMORY_REGION(addr, size); \ } while (0) + +/******************************************************************************/ + +/** \brief Page types for traverse the b-tree. + * \see mdbx_env_pgwalk() \see MDBX_pgvisitor_func */ +enum MDBX_page_type_t { + MDBX_page_broken, + MDBX_page_large, + MDBX_page_branch, + MDBX_page_leaf, + MDBX_page_dupfixed_leaf, + MDBX_subpage_leaf, + MDBX_subpage_dupfixed_leaf, + MDBX_subpage_broken, +}; +typedef enum MDBX_page_type_t MDBX_page_type_t; + +typedef struct MDBX_walk_sdb { + MDBX_val name; + struct MDBX_db *internal, *nested; +} MDBX_walk_sdb_t; + +/** \brief Callback function for traverse the b-tree. \see mdbx_env_pgwalk() */ +typedef int +MDBX_pgvisitor_func(const size_t pgno, const unsigned number, void *const ctx, + const int deep, const MDBX_walk_sdb_t *subdb, + const size_t page_size, const MDBX_page_type_t page_type, + const MDBX_error_t err, const size_t nentries, + const size_t payload_bytes, const size_t header_bytes, + const size_t unused_bytes); // -// Copyright (c) 2020-2023, Leonid Yuriev . +// Copyright (c) 2020-2024, Leonid Yuriev . 
// SPDX-License-Identifier: Apache-2.0 // // Non-inline part of the libmdbx C++ API @@ -4315,6 +4393,11 @@ namespace mdbx { "into an incompatible memory allocation scheme."); } +[[noreturn]] __cold void throw_incomparable_cursors() { + throw std::logic_error( + "mdbx:: incomparable and/or invalid cursors to compare positions."); +} + [[noreturn]] __cold void throw_bad_value_size() { throw bad_value_size(MDBX_BAD_VALSIZE); } @@ -4368,6 +4451,8 @@ DEFINE_EXCEPTION(thread_mismatch) DEFINE_EXCEPTION(transaction_full) DEFINE_EXCEPTION(transaction_overlapping) DEFINE_EXCEPTION(duplicated_lck_file) +DEFINE_EXCEPTION(dangling_map_id) + #undef DEFINE_EXCEPTION __cold const char *error::what() const noexcept { @@ -4454,6 +4539,7 @@ __cold void error::throw_exception() const { CASE_EXCEPTION(transaction_full, MDBX_TXN_FULL); CASE_EXCEPTION(transaction_overlapping, MDBX_TXN_OVERLAPPING); CASE_EXCEPTION(duplicated_lck_file, MDBX_DUPLICATED_CLK); + CASE_EXCEPTION(dangling_map_id, MDBX_DANGLING_DBI); #undef CASE_EXCEPTION default: if (is_mdbx_error()) @@ -4571,48 +4657,48 @@ bool slice::is_printable(bool disable_utf8) const noexcept { } #ifdef MDBX_U128_TYPE -MDBX_U128_TYPE slice::as_uint128() const { +MDBX_U128_TYPE slice::as_uint128_adapt() const { static_assert(sizeof(MDBX_U128_TYPE) == 16, "WTF?"); if (size() == 16) { MDBX_U128_TYPE r; memcpy(&r, data(), sizeof(r)); return r; } else - return as_uint64(); + return as_uint64_adapt(); } #endif /* MDBX_U128_TYPE */ -uint64_t slice::as_uint64() const { +uint64_t slice::as_uint64_adapt() const { static_assert(sizeof(uint64_t) == 8, "WTF?"); if (size() == 8) { uint64_t r; memcpy(&r, data(), sizeof(r)); return r; } else - return as_uint32(); + return as_uint32_adapt(); } -uint32_t slice::as_uint32() const { +uint32_t slice::as_uint32_adapt() const { static_assert(sizeof(uint32_t) == 4, "WTF?"); if (size() == 4) { uint32_t r; memcpy(&r, data(), sizeof(r)); return r; } else - return as_uint16(); + return as_uint16_adapt(); } -uint16_t 
slice::as_uint16() const { +uint16_t slice::as_uint16_adapt() const { static_assert(sizeof(uint16_t) == 2, "WTF?"); if (size() == 2) { uint16_t r; memcpy(&r, data(), sizeof(r)); return r; } else - return as_uint8(); + return as_uint8_adapt(); } -uint8_t slice::as_uint8() const { +uint8_t slice::as_uint8_adapt() const { static_assert(sizeof(uint8_t) == 1, "WTF?"); if (size() == 1) return *static_cast(data()); @@ -4623,48 +4709,48 @@ uint8_t slice::as_uint8() const { } #ifdef MDBX_I128_TYPE -MDBX_I128_TYPE slice::as_int128() const { +MDBX_I128_TYPE slice::as_int128_adapt() const { static_assert(sizeof(MDBX_I128_TYPE) == 16, "WTF?"); if (size() == 16) { MDBX_I128_TYPE r; memcpy(&r, data(), sizeof(r)); return r; } else - return as_int64(); + return as_int64_adapt(); } #endif /* MDBX_I128_TYPE */ -int64_t slice::as_int64() const { +int64_t slice::as_int64_adapt() const { static_assert(sizeof(int64_t) == 8, "WTF?"); if (size() == 8) { uint64_t r; memcpy(&r, data(), sizeof(r)); return r; } else - return as_int32(); + return as_int32_adapt(); } -int32_t slice::as_int32() const { +int32_t slice::as_int32_adapt() const { static_assert(sizeof(int32_t) == 4, "WTF?"); if (size() == 4) { int32_t r; memcpy(&r, data(), sizeof(r)); return r; } else - return as_int16(); + return as_int16_adapt(); } -int16_t slice::as_int16() const { +int16_t slice::as_int16_adapt() const { static_assert(sizeof(int16_t) == 2, "WTF?"); if (size() == 2) { int16_t r; memcpy(&r, data(), sizeof(r)); return r; } else - return as_int8(); + return as_int8_adapt(); } -int8_t slice::as_int8() const { +int8_t slice::as_int8_adapt() const { if (size() == 1) return *static_cast(data()); else if (size() == 0) @@ -5252,8 +5338,8 @@ env::operate_parameters::make_flags(bool accede, bool use_subdirectory) const { flags |= MDBX_NOSUBDIR; if (options.exclusive) flags |= MDBX_EXCLUSIVE; - if (options.orphan_read_transactions) - flags |= MDBX_NOTLS; + if (options.no_sticky_threads) + flags |= MDBX_NOSTICKYTHREADS; if 
(options.disable_readahead) flags |= MDBX_NORDAHEAD; if (options.disable_clear_memory) @@ -5263,7 +5349,7 @@ env::operate_parameters::make_flags(bool accede, bool use_subdirectory) const { if (options.nested_write_transactions) flags &= ~MDBX_WRITEMAP; if (reclaiming.coalesce) - flags |= MDBX_COALESCE; + flags |= MDBX_env_flags_t(MDBX_DEPRECATED_COALESCE); if (reclaiming.lifo) flags |= MDBX_LIFORECLAIM; switch (durability) { @@ -5308,12 +5394,13 @@ env::durability env::operate_parameters::durability_from_flags( env::reclaiming_options::reclaiming_options(MDBX_env_flags_t flags) noexcept : lifo((flags & MDBX_LIFORECLAIM) ? true : false), - coalesce((flags & MDBX_COALESCE) ? true : false) {} + coalesce((flags & MDBX_DEPRECATED_COALESCE) ? true : false) {} env::operate_options::operate_options(MDBX_env_flags_t flags) noexcept - : orphan_read_transactions( - ((flags & (MDBX_NOTLS | MDBX_EXCLUSIVE)) == MDBX_NOTLS) ? true - : false), + : no_sticky_threads(((flags & (MDBX_NOSTICKYTHREADS | MDBX_EXCLUSIVE)) == + MDBX_NOSTICKYTHREADS) + ? true + : false), nested_write_transactions((flags & (MDBX_WRITEMAP | MDBX_RDONLY)) ? false : true), exclusive((flags & MDBX_EXCLUSIVE) ? true : false), @@ -5327,7 +5414,7 @@ bool env::is_pristine() const { bool env::is_empty() const { return get_stat().ms_leaf_pages == 0; } -env &env::copy(filehandle fd, bool compactify, bool force_dynamic_size) { +__cold env &env::copy(filehandle fd, bool compactify, bool force_dynamic_size) { error::success_or_throw( ::mdbx_env_copy2fd(handle_, fd, (compactify ? MDBX_CP_COMPACT : MDBX_CP_DEFAULTS) | @@ -5336,8 +5423,8 @@ env &env::copy(filehandle fd, bool compactify, bool force_dynamic_size) { return *this; } -env &env::copy(const char *destination, bool compactify, - bool force_dynamic_size) { +__cold env &env::copy(const char *destination, bool compactify, + bool force_dynamic_size) { error::success_or_throw( ::mdbx_env_copy(handle_, destination, (compactify ? 
MDBX_CP_COMPACT : MDBX_CP_DEFAULTS) | @@ -5346,14 +5433,14 @@ env &env::copy(const char *destination, bool compactify, return *this; } -env &env::copy(const ::std::string &destination, bool compactify, - bool force_dynamic_size) { +__cold env &env::copy(const ::std::string &destination, bool compactify, + bool force_dynamic_size) { return copy(destination.c_str(), compactify, force_dynamic_size); } #if defined(_WIN32) || defined(_WIN64) -env &env::copy(const wchar_t *destination, bool compactify, - bool force_dynamic_size) { +__cold env &env::copy(const wchar_t *destination, bool compactify, + bool force_dynamic_size) { error::success_or_throw( ::mdbx_env_copyW(handle_, destination, (compactify ? MDBX_CP_COMPACT : MDBX_CP_DEFAULTS) | @@ -5369,13 +5456,13 @@ env &env::copy(const ::std::wstring &destination, bool compactify, #endif /* Windows */ #ifdef MDBX_STD_FILESYSTEM_PATH -env &env::copy(const MDBX_STD_FILESYSTEM_PATH &destination, bool compactify, - bool force_dynamic_size) { +__cold env &env::copy(const MDBX_STD_FILESYSTEM_PATH &destination, + bool compactify, bool force_dynamic_size) { return copy(destination.native(), compactify, force_dynamic_size); } #endif /* MDBX_STD_FILESYSTEM_PATH */ -path env::get_path() const { +__cold path env::get_path() const { #if defined(_WIN32) || defined(_WIN64) const wchar_t *c_wstr; error::success_or_throw(::mdbx_env_get_pathW(handle_, &c_wstr)); @@ -5389,29 +5476,30 @@ path env::get_path() const { #endif } -bool env::remove(const char *pathname, const remove_mode mode) { - return error::boolean_or_throw( +__cold bool env::remove(const char *pathname, const remove_mode mode) { + return !error::boolean_or_throw( ::mdbx_env_delete(pathname, MDBX_env_delete_mode_t(mode))); } -bool env::remove(const ::std::string &pathname, const remove_mode mode) { +__cold bool env::remove(const ::std::string &pathname, const remove_mode mode) { return remove(pathname.c_str(), mode); } #if defined(_WIN32) || defined(_WIN64) -bool 
env::remove(const wchar_t *pathname, const remove_mode mode) { - return error::boolean_or_throw( +__cold bool env::remove(const wchar_t *pathname, const remove_mode mode) { + return !error::boolean_or_throw( ::mdbx_env_deleteW(pathname, MDBX_env_delete_mode_t(mode))); } -bool env::remove(const ::std::wstring &pathname, const remove_mode mode) { +__cold bool env::remove(const ::std::wstring &pathname, + const remove_mode mode) { return remove(pathname.c_str(), mode); } #endif /* Windows */ #ifdef MDBX_STD_FILESYSTEM_PATH -bool env::remove(const MDBX_STD_FILESYSTEM_PATH &pathname, - const remove_mode mode) { +__cold bool env::remove(const MDBX_STD_FILESYSTEM_PATH &pathname, + const remove_mode mode) { return remove(pathname.native(), mode); } #endif /* MDBX_STD_FILESYSTEM_PATH */ @@ -5425,13 +5513,13 @@ static inline MDBX_env *create_env() { return ptr; } -env_managed::~env_managed() noexcept { +__cold env_managed::~env_managed() noexcept { if (MDBX_UNLIKELY(handle_)) MDBX_CXX20_UNLIKELY error::success_or_panic( ::mdbx_env_close(handle_), "mdbx::~env()", "mdbx_env_close"); } -void env_managed::close(bool dont_sync) { +__cold void env_managed::close(bool dont_sync) { const error rc = static_cast(::mdbx_env_close_ex(handle_, dont_sync)); switch (rc.code()) { @@ -5579,9 +5667,16 @@ void txn_managed::commit(commit_latency *latency) { MDBX_CXX20_UNLIKELY err.throw_exception(); } +void txn_managed::commit_embark_read() { + auto env = this->env(); + commit(); + error::success_or_throw( + ::mdbx_txn_begin(env, nullptr, MDBX_TXN_RDONLY, &handle_)); +} + //------------------------------------------------------------------------------ -bool txn::drop_map(const char *name, bool throw_if_absent) { +__cold bool txn::drop_map(const char *name, bool throw_if_absent) { map_handle map; const int err = ::mdbx_dbi_open(handle_, name, MDBX_DB_ACCEDE, &map.dbi); switch (err) { @@ -5598,7 +5693,7 @@ bool txn::drop_map(const char *name, bool throw_if_absent) { } } -bool txn::clear_map(const 
char *name, bool throw_if_absent) { +__cold bool txn::clear_map(const char *name, bool throw_if_absent) { map_handle map; const int err = ::mdbx_dbi_open(handle_, name, MDBX_DB_ACCEDE, &map.dbi); switch (err) { @@ -5615,6 +5710,101 @@ bool txn::clear_map(const char *name, bool throw_if_absent) { } } +__cold bool txn::rename_map(const char *old_name, const char *new_name, + bool throw_if_absent) { + map_handle map; + const int err = ::mdbx_dbi_open(handle_, old_name, MDBX_DB_ACCEDE, &map.dbi); + switch (err) { + case MDBX_SUCCESS: + rename_map(map, new_name); + return true; + case MDBX_NOTFOUND: + case MDBX_BAD_DBI: + if (!throw_if_absent) + return false; + MDBX_CXX17_FALLTHROUGH /* fallthrough */; + default: + MDBX_CXX20_UNLIKELY error::throw_exception(err); + } +} + +#if defined(__cpp_lib_string_view) && __cpp_lib_string_view >= 201606L + +__cold bool txn::drop_map(const ::std::string_view &name, + bool throw_if_absent) { + map_handle map; + const int err = + ::mdbx_dbi_open2(handle_, mdbx::slice(name), MDBX_DB_ACCEDE, &map.dbi); + switch (err) { + case MDBX_SUCCESS: + drop_map(map); + return true; + case MDBX_NOTFOUND: + case MDBX_BAD_DBI: + if (!throw_if_absent) + return false; + MDBX_CXX17_FALLTHROUGH /* fallthrough */; + default: + MDBX_CXX20_UNLIKELY error::throw_exception(err); + } +} + +__cold bool txn::clear_map(const ::std::string_view &name, + bool throw_if_absent) { + map_handle map; + const int err = + ::mdbx_dbi_open2(handle_, mdbx::slice(name), MDBX_DB_ACCEDE, &map.dbi); + switch (err) { + case MDBX_SUCCESS: + clear_map(map); + return true; + case MDBX_NOTFOUND: + case MDBX_BAD_DBI: + if (!throw_if_absent) + return false; + MDBX_CXX17_FALLTHROUGH /* fallthrough */; + default: + MDBX_CXX20_UNLIKELY error::throw_exception(err); + } +} + +__cold bool txn::rename_map(const ::std::string_view &old_name, + const ::std::string_view &new_name, + bool throw_if_absent) { + map_handle map; + const int err = ::mdbx_dbi_open2(handle_, mdbx::slice(old_name), + 
MDBX_DB_ACCEDE, &map.dbi); + switch (err) { + case MDBX_SUCCESS: + rename_map(map, new_name); + return true; + case MDBX_NOTFOUND: + case MDBX_BAD_DBI: + if (!throw_if_absent) + return false; + MDBX_CXX17_FALLTHROUGH /* fallthrough */; + default: + MDBX_CXX20_UNLIKELY error::throw_exception(err); + } +} + +__cold bool txn::rename_map(const ::std::string &old_name, + const ::std::string &new_name, + bool throw_if_absent) { + return rename_map(::std::string_view(old_name), ::std::string_view(new_name), + throw_if_absent); +} + +#else + +__cold bool txn::rename_map(const ::std::string &old_name, + const ::std::string &new_name, + bool throw_if_absent) { + return rename_map(old_name.c_str(), new_name.c_str(), throw_if_absent); +} + +#endif /* __cpp_lib_string_view >= 201606L */ + //------------------------------------------------------------------------------ void cursor_managed::close() { @@ -5764,8 +5954,8 @@ __cold ::std::ostream &operator<<(::std::ostream &out, static const char comma[] = ", "; const char *delimiter = ""; out << "{"; - if (it.orphan_read_transactions) { - out << delimiter << "orphan_read_transactions"; + if (it.no_sticky_threads) { + out << delimiter << "no_sticky_threads"; delimiter = comma; } if (it.nested_write_transactions) { diff --git a/mdbxdist/mdbx.h b/mdbxdist/mdbx.h index 1512271..98f9a86 100644 --- a/mdbxdist/mdbx.h +++ b/mdbxdist/mdbx.h @@ -25,7 +25,7 @@ _The Future will (be) [Positive](https://www.ptsecurity.com). Всё будет \section copyright LICENSE & COPYRIGHT -\authors Copyright (c) 2015-2023, Leonid Yuriev +\authors Copyright (c) 2015-2024, Leonid Yuriev and other _libmdbx_ authors: please see [AUTHORS](./AUTHORS) file. 
\copyright Redistribution and use in source and binary forms, with or without @@ -343,13 +343,14 @@ typedef mode_t mdbx_mode_t; #ifdef __deprecated #define MDBX_DEPRECATED __deprecated #elif defined(DOXYGEN) || \ - (defined(__cplusplus) && __cplusplus >= 201603L && \ - __has_cpp_attribute(maybe_unused) && \ - __has_cpp_attribute(maybe_unused) >= 201603L) || \ + (defined(__cplusplus) && __cplusplus >= 201403L && \ + __has_cpp_attribute(deprecated) && \ + __has_cpp_attribute(deprecated) >= 201309L) || \ (!defined(__cplusplus) && defined(__STDC_VERSION__) && \ - __STDC_VERSION__ > 202005L) + __STDC_VERSION__ >= 202304L) #define MDBX_DEPRECATED [[deprecated]] -#elif defined(__GNUC__) || __has_attribute(__deprecated__) +#elif (defined(__GNUC__) && __GNUC__ > 5) || \ + (__has_attribute(__deprecated__) && !defined(__GNUC__)) #define MDBX_DEPRECATED __attribute__((__deprecated__)) #elif defined(_MSC_VER) #define MDBX_DEPRECATED __declspec(deprecated) @@ -634,9 +635,9 @@ typedef mode_t mdbx_mode_t; extern "C" { #endif -/* MDBX version 0.12.x */ +/* MDBX version 0.13.x */ #define MDBX_VERSION_MAJOR 0 -#define MDBX_VERSION_MINOR 12 +#define MDBX_VERSION_MINOR 13 #ifndef LIBMDBX_API #if defined(LIBMDBX_EXPORTS) @@ -816,7 +817,7 @@ typedef struct iovec MDBX_val; #endif /* ! SunOS */ enum MDBX_constants { - /** The hard limit for DBI handles */ + /** The hard limit for DBI handles. */ MDBX_MAX_DBI = UINT32_C(32765), /** The maximum size of a data item. */ @@ -1012,6 +1013,7 @@ typedef void MDBX_debug_func(MDBX_log_level_t loglevel, const char *function, /** \brief The "don't change `logger`" value for mdbx_setup_debug() */ #define MDBX_LOGGER_DONTCHANGE ((MDBX_debug_func *)(intptr_t)-1) +#define MDBX_LOGGER_NOFMT_DONTCHANGE ((MDBX_debug_func_nofmt *)(intptr_t)-1) /** \brief Setup global log-level, debug options and debug logger. 
* \returns The previously `debug_flags` in the 0-15 bits @@ -1020,6 +1022,17 @@ LIBMDBX_API int mdbx_setup_debug(MDBX_log_level_t log_level, MDBX_debug_flags_t debug_flags, MDBX_debug_func *logger); +typedef void MDBX_debug_func_nofmt(MDBX_log_level_t loglevel, + const char *function, int line, + const char *msg, + unsigned length) MDBX_CXX17_NOEXCEPT; + +LIBMDBX_API int mdbx_setup_debug_nofmt(MDBX_log_level_t log_level, + MDBX_debug_flags_t debug_flags, + MDBX_debug_func_nofmt *logger, + char *logger_buffer, + size_t logger_buffer_size); + /** \brief A callback function for most MDBX assert() failures, * called before printing the message and aborting. * \see mdbx_env_set_assert() @@ -1195,28 +1208,80 @@ enum MDBX_env_flags_t { */ MDBX_WRITEMAP = UINT32_C(0x80000), - /** Tie reader locktable slots to read-only transactions - * instead of to threads. + /** Отвязывает транзакции от потоков/threads насколько это возможно. * - * Don't use Thread-Local Storage, instead tie reader locktable slots to - * \ref MDBX_txn objects instead of to threads. So, \ref mdbx_txn_reset() - * keeps the slot reserved for the \ref MDBX_txn object. A thread may use - * parallel read-only transactions. And a read-only transaction may span - * threads if you synchronizes its use. + * Эта опция предназначена для приложений, которые мультиплексируют множество + * пользовательских легковесных потоков выполнения по отдельным потокам + * операционной системы, например как это происходит в средах выполнения + * GoLang и Rust. Таким приложениям также рекомендуется сериализовать + * транзакции записи в одном потоке операционной системы, поскольку блокировка + * записи MDBX использует базовые системные примитивы синхронизации и ничего + * не знает о пользовательских потоках и/или легковесных потоков среды + * выполнения. Как минимум, обязательно требуется обеспечить завершение каждой + * пишущей транзакции строго в том же потоке операционной системы где она была + * запущена. 
* - * Applications that multiplex many user threads over individual OS threads - * need this option. Such an application must also serialize the write - * transactions in an OS thread, since MDBX's write locking is unaware of - * the user threads. + * \note Начиная с версии v0.13 опция `MDBX_NOSTICKYTHREADS` полностью + * заменяет опцию \ref MDBX_NOTLS. * - * \note Regardless to `MDBX_NOTLS` flag a write transaction entirely should - * always be used in one thread from start to finish. MDBX checks this in a - * reasonable manner and return the \ref MDBX_THREAD_MISMATCH error in rules - * violation. + * При использовании `MDBX_NOSTICKYTHREADS` транзакции становятся не + * ассоциированными с создавшими их потоками выполнения. Поэтому в функциях + * API не выполняется проверка соответствия транзакции и текущего потока + * выполнения. Большинство функций работающих с транзакциями и курсорами + * становится возможным вызывать из любых потоков выполнения. Однако, также + * становится невозможно обнаружить ошибки одновременного использования + * транзакций и/или курсоров в разных потоках. * - * This flag affects only at environment opening but can't be changed after. + * Использование `MDBX_NOSTICKYTHREADS` также сужает возможности по изменению + * размера БД, так как теряется возможность отслеживать работающие с БД потоки + * выполнения и приостанавливать их на время снятия отображения БД в ОЗУ. В + * частности, по этой причине на Windows уменьшение файла БД не возможно до + * закрытия БД последним работающим с ней процессом или до последующего + * открытия БД в режиме чтения-записи. + * + * \warning Вне зависимости от \ref MDBX_NOSTICKYTHREADS и \ref MDBX_NOTLS не + * допускается одновременно использование объектов API из разных потоков + * выполнения! Обеспечение всех мер для исключения одновременного + * использования объектов API из разных потоков выполнения целиком ложится на + * вас! 
+ * + * \warning Транзакции записи могут быть завершены только в том же потоке + * выполнения где они были запущены. Это ограничение следует из требований + * большинства операционных систем о том, что захваченный примитив + * синхронизации (мьютекс, семафор, критическая секция) должен освобождаться + * только захватившим его потоком выполнения. + * + * \warning Создание курсора в контексте транзакции, привязка курсора к + * транзакции, отвязка курсора от транзакции и закрытие привязанного к + * транзакции курсора, являются операциями использующими как сам курсор так и + * соответствующую транзакцию. Аналогично, завершение или прерывание + * транзакции является операцией использующей как саму транзакцию, так и все + * привязанные к ней курсоры. Во избежание повреждения внутренних структур + * данных, непредсказуемого поведения, разрушение БД и потери данных следует + * не допускать возможности одновременного использования каких-либо курсора + * или транзакций из разных потоков выполнения. + * + * Читающие транзакции при использовании `MDBX_NOSTICKYTHREADS` перестают + * использовать TLS (Thread Local Storage), а слоты блокировок MVCC-снимков в + * таблице читателей привязываются только к транзакциям. Завершение каких-либо + * потоков не приводит к снятию блокировок MVCC-снимков до явного завершения + * транзакций, либо до завершения соответствующего процесса в целом. + * + * Для пишущих транзакций не выполняется проверка соответствия текущего потока + * выполнения и потока создавшего транзакцию. Однако, фиксация или прерывание + * пишущих транзакций должны выполняться строго в потоке запустившим + * транзакцию, так как эти операции связаны с захватом и освобождением + * примитивов синхронизации (мьютексов, критических секций), для которых + * большинство операционных систем требует освобождение только потоком + * захватившим ресурс. + * + * Этот флаг вступает в силу при открытии среды и не может быть изменен после. 
*/ - MDBX_NOTLS = UINT32_C(0x200000), + MDBX_NOSTICKYTHREADS = UINT32_C(0x200000), +#ifndef _MSC_VER /* avoid madness MSVC */ + /** \deprecated Please use \ref MDBX_NOSTICKYTHREADS instead. */ + MDBX_NOTLS MDBX_DEPRECATED = MDBX_NOSTICKYTHREADS, +#endif /* avoid madness MSVC */ /** Don't do readahead. * @@ -1262,8 +1327,9 @@ enum MDBX_env_flags_t { * This flag may be changed at any time using `mdbx_env_set_flags()`. */ MDBX_NOMEMINIT = UINT32_C(0x1000000), +#ifndef _MSC_VER /* avoid madness MSVC */ /** Aims to coalesce a Garbage Collection items. - * \note Always enabled since v0.12 + * \deprecated Always enabled since v0.12 and deprecated since v0.13. * * With `MDBX_COALESCE` flag MDBX will aims to coalesce items while recycling * a Garbage Collection. Technically, when possible short lists of pages @@ -1273,7 +1339,8 @@ enum MDBX_env_flags_t { * Unallocated space and reducing the database file. * * This flag may be changed at any time using mdbx_env_set_flags(). */ - MDBX_COALESCE = UINT32_C(0x2000000), + MDBX_COALESCE MDBX_DEPRECATED = UINT32_C(0x2000000), +#endif /* avoid madness MSVC */ /** LIFO policy for recycling a Garbage Collection items. * @@ -1778,7 +1845,7 @@ enum MDBX_cursor_op { * return both key and data, and the return code depends on whether a * upper-bound was found. * - * For non DUPSORT-ed collections this work the same to \ref MDBX_SET_RANGE, + * For non DUPSORT-ed collections this work like \ref MDBX_SET_RANGE, * but returns \ref MDBX_SUCCESS if the greater key was found or * \ref MDBX_NOTFOUND otherwise. * @@ -1786,7 +1853,28 @@ enum MDBX_cursor_op { * i.e. for a pairs/tuples of a key and an each data value of duplicates. * Returns \ref MDBX_SUCCESS if the greater pair was returned or * \ref MDBX_NOTFOUND otherwise. */ - MDBX_SET_UPPERBOUND + MDBX_SET_UPPERBOUND, + + /* Doubtless cursor positioning at a specified key. 
*/ + MDBX_TO_KEY_LESSER_THAN, + MDBX_TO_KEY_LESSER_OR_EQUAL, + MDBX_TO_KEY_EQUAL, + MDBX_TO_KEY_GREATER_OR_EQUAL, + MDBX_TO_KEY_GREATER_THAN, + + /* Doubtless cursor positioning at a specified key-value pair + * for dupsort/multi-value hives. */ + MDBX_TO_EXACT_KEY_VALUE_LESSER_THAN, + MDBX_TO_EXACT_KEY_VALUE_LESSER_OR_EQUAL, + MDBX_TO_EXACT_KEY_VALUE_EQUAL, + MDBX_TO_EXACT_KEY_VALUE_GREATER_OR_EQUAL, + MDBX_TO_EXACT_KEY_VALUE_GREATER_THAN, + + MDBX_TO_PAIR_LESSER_THAN, + MDBX_TO_PAIR_LESSER_OR_EQUAL, + MDBX_TO_PAIR_EQUAL, + MDBX_TO_PAIR_GREATER_OR_EQUAL, + MDBX_TO_PAIR_GREATER_THAN }; #ifndef __cplusplus /** \ingroup c_cursors */ @@ -1921,7 +2009,7 @@ enum MDBX_error_t { MDBX_TOO_LARGE = -30417, /** A thread has attempted to use a not owned object, - * e.g. a transaction that started by another thread. */ + * e.g. a transaction that started by another thread */ MDBX_THREAD_MISMATCH = -30416, /** Overlapping read and write transactions for the current thread */ @@ -1936,8 +2024,12 @@ enum MDBX_error_t { /** Alternative/Duplicate LCK-file is exists and should be removed manually */ MDBX_DUPLICATED_CLK = -30413, + /** Some cursors and/or other resources should be closed before subDb or + * corresponding DBI-handle could be (re)used */ + MDBX_DANGLING_DBI = -30412, + /* The last of MDBX-added error codes */ - MDBX_LAST_ADDED_ERRCODE = MDBX_DUPLICATED_CLK, + MDBX_LAST_ADDED_ERRCODE = MDBX_DANGLING_DBI, #if defined(_WIN32) || defined(_WIN64) MDBX_ENODATA = ERROR_HANDLE_EOF, @@ -1950,7 +2042,8 @@ enum MDBX_error_t { MDBX_EPERM = ERROR_INVALID_FUNCTION, MDBX_EINTR = ERROR_CANCELLED, MDBX_ENOFILE = ERROR_FILE_NOT_FOUND, - MDBX_EREMOTE = ERROR_REMOTE_STORAGE_MEDIA_ERROR + MDBX_EREMOTE = ERROR_REMOTE_STORAGE_MEDIA_ERROR, + MDBX_EDEADLK = ERROR_POSSIBLE_DEADLOCK #else /* Windows */ #ifdef ENODATA MDBX_ENODATA = ENODATA, @@ -1966,7 +2059,8 @@ enum MDBX_error_t { MDBX_EPERM = EPERM, MDBX_EINTR = EINTR, MDBX_ENOFILE = ENOENT, - MDBX_EREMOTE = ENOTBLK + MDBX_EREMOTE = ENOTBLK, + 
MDBX_EDEADLK = EDEADLK #endif /* !Windows */ }; #ifndef __cplusplus @@ -2082,11 +2176,12 @@ enum MDBX_option_t { * track readers in the the environment. The default is about 100 for 4K * system page size. Starting a read-only transaction normally ties a lock * table slot to the current thread until the environment closes or the thread - * exits. If \ref MDBX_NOTLS is in use, \ref mdbx_txn_begin() instead ties the - * slot to the \ref MDBX_txn object until it or the \ref MDBX_env object is - * destroyed. This option may only set after \ref mdbx_env_create() and before - * \ref mdbx_env_open(), and has an effect only when the database is opened by - * the first process interacts with the database. + * exits. If \ref MDBX_NOSTICKYTHREADS is in use, \ref mdbx_txn_begin() + * instead ties the slot to the \ref MDBX_txn object until it or the \ref + * MDBX_env object is destroyed. This option may only set after \ref + * mdbx_env_create() and before \ref mdbx_env_open(), and has an effect only + * when the database is opened by the first process interacts with the + * database. * * \see mdbx_env_set_maxreaders() \see mdbx_env_get_maxreaders() */ MDBX_opt_max_readers, @@ -2106,6 +2201,7 @@ enum MDBX_option_t { /** \brief Controls the in-process limit to grow a list of reclaimed/recycled * page's numbers for finding a sequence of contiguous pages for large data * items. + * \see MDBX_opt_gc_time_limit * * \details A long values requires allocation of contiguous database pages. * To find such sequences, it may be necessary to accumulate very large lists, @@ -2266,6 +2362,33 @@ enum MDBX_option_t { * in the \ref MDBX_WRITEMAP mode by clearing ones through file handle before * touching. */ MDBX_opt_prefault_write_enable, + + /** \brief Controls the in-process spending time limit of searching + * consecutive pages inside GC. 
+ * \see MDBX_opt_rp_augment_limit + * + * \details Задаёт ограничение времени в 1/65536 долях секунды, которое может + * быть потрачено в ходе пишущей транзакции на поиск последовательностей + * страниц внутри GC/freelist после достижения ограничения задаваемого опцией + * \ref MDBX_opt_rp_augment_limit. Контроль по времени не выполняется при + * поиске/выделении одиночных страниц и выделении страниц под нужды GC (при + * обновлении GC в ходе фиксации транзакции). + * + * Задаваемый лимит времени исчисляется по "настенным часам" и контролируется + * в рамках транзакции, наследуется для вложенных транзакций и с + * аккумулированием в родительской при их фиксации. Контроль по времени + * производится только при достижении ограничения задаваемого опцией \ref + * MDBX_opt_rp_augment_limit. Это позволяет гибко управлять поведением + * используя обе опции. + * + * По умолчанию ограничение устанавливается в 0, что приводит к + * незамедлительной остановке поиска в GC при достижении \ref + * MDBX_opt_rp_augment_limit во внутреннем состоянии транзакции и + * соответствует поведению до появления опции `MDBX_opt_gc_time_limit`. + * С другой стороны, при минимальном значении (включая 0) + * `MDBX_opt_rp_augment_limit` переработка GC будет ограничиваться + * преимущественно затраченным временем. */ + MDBX_opt_gc_time_limit }; #ifndef __cplusplus /** \ingroup c_settings */ @@ -2322,7 +2445,7 @@ LIBMDBX_API int mdbx_env_get_option(const MDBX_env *env, * * Flags set by mdbx_env_set_flags() are also used: * - \ref MDBX_ENV_DEFAULTS, \ref MDBX_NOSUBDIR, \ref MDBX_RDONLY, - * \ref MDBX_EXCLUSIVE, \ref MDBX_WRITEMAP, \ref MDBX_NOTLS, + * \ref MDBX_EXCLUSIVE, \ref MDBX_WRITEMAP, \ref MDBX_NOSTICKYTHREADS, * \ref MDBX_NORDAHEAD, \ref MDBX_NOMEMINIT, \ref MDBX_COALESCE, * \ref MDBX_LIFORECLAIM. See \ref env_flags section. * @@ -2389,7 +2512,7 @@ enum MDBX_env_delete_mode_t { /** \brief Just delete the environment's files and directory if any. 
* \note On POSIX systems, processes already working with the database will * continue to work without interference until it close the environment. - * \note On Windows, the behavior of `MDB_ENV_JUST_DELETE` is different + * \note On Windows, the behavior of `MDBX_ENV_JUST_DELETE` is different * because the system does not support deleting files that are currently * memory mapped. */ MDBX_ENV_JUST_DELETE = 0, @@ -2571,9 +2694,7 @@ struct MDBX_envinfo { uint64_t mi_latter_reader_txnid; /**< ID of the last reader transaction */ uint64_t mi_self_latter_reader_txnid; /**< ID of the last reader transaction of caller process */ - uint64_t mi_meta0_txnid, mi_meta0_sign; - uint64_t mi_meta1_txnid, mi_meta1_sign; - uint64_t mi_meta2_txnid, mi_meta2_sign; + uint64_t mi_meta_txnid[3], mi_meta_sign[3]; uint32_t mi_maxreaders; /**< Total reader slots in the environment */ uint32_t mi_numreaders; /**< Max reader slots used in the environment */ uint32_t mi_dxb_pagesize; /**< Database pagesize */ @@ -2590,7 +2711,7 @@ struct MDBX_envinfo { struct { struct { uint64_t x, y; - } current, meta0, meta1, meta2; + } current, meta[3]; } mi_bootid; /** Bytes not explicitly synchronized to disk */ @@ -2656,7 +2777,8 @@ typedef struct MDBX_envinfo MDBX_envinfo; * \param [in] txn A transaction handle returned by \ref mdbx_txn_begin() * \param [out] info The address of an \ref MDBX_envinfo structure * where the information will be copied - * \param [in] bytes The size of \ref MDBX_envinfo. + * \param [in] bytes The actual size of \ref MDBX_envinfo, + * this value is used to provide ABI compatibility. * * \returns A non-zero error value on failure and 0 on success. 
*/ LIBMDBX_API int mdbx_env_info_ex(const MDBX_env *env, const MDBX_txn *txn, @@ -2891,6 +3013,86 @@ LIBMDBX_INLINE_API(int, mdbx_env_close, (MDBX_env * env)) { return mdbx_env_close_ex(env, false); } +#if defined(DOXYGEN) || !(defined(_WIN32) || defined(_WIN64)) +/** \brief Восстанавливает экземпляр среды в дочернем процессе после ветвления + * родительского процесса посредством `fork()` и родственных системных вызовов. + * \ingroup c_extra + * + * Без вызова \ref mdbx_env_resurrect_after_fork() использование открытого + * экземпляра среды в дочернем процессе не возможно, включая все выполняющиеся + * на момент ветвления транзакции. + * + * Выполняемые функцией действия можно рассматривать как повторное открытие БД + * в дочернем процессе, с сохранением заданных опций и адресов уже созданных + * экземпляров объектов связанных с API. + * + * \note Функция не доступна в ОС семейства Windows по причине отсутствия + * функционала ветвления процесса в API операционной системы. + * + * Ветвление не оказывает влияния на состояние MDBX-среды в родительском + * процессе. Все транзакции, которые были в родительском процессе на момент + * ветвления, после ветвления в родительском процессе продолжат выполняться без + * помех. Но в дочернем процессе все соответствующие транзакции безальтернативно + * перестают быть валидными, а попытка их использования приведет к возврату + * ошибки или отправке `SIGSEGV`. + * + * Использование экземпляра среды в дочернем процессе не возможно до вызова + * \ref mdbx_env_resurrect_after_fork(), так как в результате ветвления у + * процесса меняется PID, значение которого используется для организации + * совместно работы с БД, в том числе, для отслеживания процессов/потоков + * выполняющих читающие транзакции связанные с соответствующими снимками данных. 
+ * Все активные на момент ветвления транзакции не могут продолжаться в дочернем + * процессе, так как не владеют какими-либо блокировками или каким-либо снимком + * данных и не удерживают его от переработки при сборке мусора. + * + * Функция \ref mdbx_env_resurrect_after_fork() восстанавливает переданный + * экземпляр среды в дочернем процессе после ветвления, а именно: обновляет + * используемые системные идентификаторы, повторно открывает дескрипторы файлов, + * производит захват необходимых блокировок связанных с LCK- и DXB-файлами БД, + * восстанавливает отображения в память страницы БД, таблицы читателей и + * служебных/вспомогательных данных в память. Однако унаследованные от + * родительского процесса транзакции не восстанавливаются, причем пишущие и + * читающие транзакции обрабатываются по-разному: + * + * - Пишущая транзакция, если таковая была на момент ветвления, + * прерывается в дочернем процессе с освобождением связанных с ней ресурсов, + * включая все вложенные транзакции. + * + * - Читающие же транзакции, если таковые были в родительском процессе, + * в дочернем процессе логически прерываются, но без освобождения ресурсов. + * Поэтому необходимо обеспечить вызов \ref mdbx_txn_abort() для каждой + * такой читающей транзакции в дочернем процессе, либо смириться с утечкой + * ресурсов до завершения дочернего процесса. + * + * Причина не-освобождения ресурсов читающих транзакций в том, что исторически + * MDBX не ведет какой-либо общий список экземпляров читающих, так как это не + * требуется для штатных режимов работы, но требует использования атомарных + * операций или дополнительных объектов синхронизации при создании/разрушении + * экземпляров \ref MDBX_txn. + * + * Вызов \ref mdbx_env_resurrect_after_fork() без ветвления, не в дочернем + * процессе, либо повторные вызовы не приводят к каким-либо действиям или + * изменениям. + * + * \param [in,out] env Экземпляр среды созданный функцией + * \ref mdbx_env_create().
+ * + * \returns Ненулевое значение ошибки при сбое и 0 при успешном выполнении, + * некоторые возможные ошибки таковы: + * + * \retval MDBX_BUSY В родительском процессе БД была открыта + * в режиме \ref MDBX_EXCLUSIVE. + * + * \retval MDBX_EBADSIGN При повреждении сигнатуры экземпляра объекта, а также + * в случае одновременного вызова \ref + * mdbx_env_resurrect_after_fork() из разных потоков. + * + * \retval MDBX_PANIC Произошла критическая ошибка при восстановлении + * экземпляра среды, либо такая ошибка уже была + * до вызова функции. */ +LIBMDBX_API int mdbx_env_resurrect_after_fork(MDBX_env *env); +#endif /* Windows */ + /** \brief Warming up options * \ingroup c_settings * \anchor warmup_flags @@ -3239,7 +3441,7 @@ LIBMDBX_API int mdbx_env_get_fd(const MDBX_env *env, mdbx_filehandle_t *fd); * 2) Temporary close memory mapped is required to change * geometry, but there read transaction(s) is running * and no corresponding thread(s) could be suspended - * since the \ref MDBX_NOTLS mode is used. + * since the \ref MDBX_NOSTICKYTHREADS mode is used. * \retval MDBX_EACCESS The environment opened in read-only. * \retval MDBX_MAP_FULL Specified size smaller than the space already * consumed by the environment. @@ -3268,7 +3470,7 @@ MDBX_DEPRECATED LIBMDBX_INLINE_API(int, mdbx_env_set_mapsize, * value. * * \returns A \ref MDBX_RESULT_TRUE or \ref MDBX_RESULT_FALSE value, - * otherwise the error code: + * otherwise the error code. * \retval MDBX_RESULT_TRUE Readahead is reasonable. * \retval MDBX_RESULT_FALSE Readahead is NOT reasonable, * i.e. \ref MDBX_NORDAHEAD is useful to @@ -3310,6 +3512,12 @@ mdbx_limits_dbsize_max(intptr_t pagesize); MDBX_NOTHROW_CONST_FUNCTION LIBMDBX_API intptr_t mdbx_limits_keysize_max(intptr_t pagesize, MDBX_db_flags_t flags); +/** \brief Returns minimal key size in bytes for given database flags. 
+ * \ingroup c_statinfo + * \see db_flags */ +MDBX_NOTHROW_CONST_FUNCTION LIBMDBX_API intptr_t +mdbx_limits_keysize_min(MDBX_db_flags_t flags); + /** \brief Returns maximal data size in bytes for given page size * and database flags, or -1 if pagesize is invalid. * \ingroup c_statinfo @@ -3317,6 +3525,12 @@ mdbx_limits_keysize_max(intptr_t pagesize, MDBX_db_flags_t flags); MDBX_NOTHROW_CONST_FUNCTION LIBMDBX_API intptr_t mdbx_limits_valsize_max(intptr_t pagesize, MDBX_db_flags_t flags); +/** \brief Returns minimal data size in bytes for given database flags. + * \ingroup c_statinfo + * \see db_flags */ +MDBX_NOTHROW_CONST_FUNCTION LIBMDBX_API intptr_t +mdbx_limits_valsize_min(MDBX_db_flags_t flags); + /** \brief Returns maximal size of key-value pair to fit in a single page with * the given size and database flags, or -1 if pagesize is invalid. * \ingroup c_statinfo @@ -3346,11 +3560,11 @@ mdbx_limits_txnsize_max(intptr_t pagesize); * track readers in the the environment. The default is about 100 for 4K system * page size. Starting a read-only transaction normally ties a lock table slot * to the current thread until the environment closes or the thread exits. If - * \ref MDBX_NOTLS is in use, \ref mdbx_txn_begin() instead ties the slot to the - * \ref MDBX_txn object until it or the \ref MDBX_env object is destroyed. - * This function may only be called after \ref mdbx_env_create() and before - * \ref mdbx_env_open(), and has an effect only when the database is opened by - * the first process interacts with the database. + * \ref MDBX_NOSTICKYTHREADS is in use, \ref mdbx_txn_begin() instead ties the + * slot to the \ref MDBX_txn object until it or the \ref MDBX_env object is + * destroyed. This function may only be called after \ref mdbx_env_create() and + * before \ref mdbx_env_open(), and has an effect only when the database is + * opened by the first process interacts with the database. 
* \see mdbx_env_get_maxreaders() * * \param [in] env An environment handle returned @@ -3544,8 +3758,8 @@ mdbx_env_get_userctx(const MDBX_env *env); * \see mdbx_txn_begin() * * \note A transaction and its cursors must only be used by a single thread, - * and a thread may only have a single transaction at a time. If \ref MDBX_NOTLS - * is in use, this does not apply to read-only transactions. + * and a thread may only have a single transaction at a time unless + * the \ref MDBX_NOSTICKYTHREADS is used. * * \note Cursors may not span transactions. * @@ -3606,8 +3820,8 @@ LIBMDBX_API int mdbx_txn_begin_ex(MDBX_env *env, MDBX_txn *parent, * \see mdbx_txn_begin_ex() * * \note A transaction and its cursors must only be used by a single thread, - * and a thread may only have a single transaction at a time. If \ref MDBX_NOTLS - * is in use, this does not apply to read-only transactions. + * and a thread may only have a single transaction at a time unless + * the \ref MDBX_NOSTICKYTHREADS is used. * * \note Cursors may not span transactions. * @@ -3766,7 +3980,7 @@ mdbx_txn_env(const MDBX_txn *txn); * \param [in] txn A transaction handle returned by \ref mdbx_txn_begin(). * * \returns A transaction flags, valid if input is an valid transaction, - * otherwise -1. */ + * otherwise \ref MDBX_TXN_INVALID. */ MDBX_NOTHROW_PURE_FUNCTION LIBMDBX_API int mdbx_txn_flags(const MDBX_txn *txn); /** \brief Return the transaction's ID. @@ -3982,10 +4196,11 @@ LIBMDBX_API int mdbx_txn_break(MDBX_txn *txn); * Abort the read-only transaction like \ref mdbx_txn_abort(), but keep the * transaction handle. Therefore \ref mdbx_txn_renew() may reuse the handle. * This saves allocation overhead if the process will start a new read-only - * transaction soon, and also locking overhead if \ref MDBX_NOTLS is in use. The - * reader table lock is released, but the table slot stays tied to its thread - * or \ref MDBX_txn. 
Use \ref mdbx_txn_abort() to discard a reset handle, and to - * free its lock table slot if \ref MDBX_NOTLS is in use. + * transaction soon, and also locking overhead if \ref MDBX_NOSTICKYTHREADS is + * in use. The reader table lock is released, but the table slot stays tied to + * its thread or \ref MDBX_txn. Use \ref mdbx_txn_abort() to discard a reset + * handle, and to free its lock table slot if \ref MDBX_NOSTICKYTHREADS is in + * use. * * Cursors opened within the transaction must not be used again after this * call, except with \ref mdbx_cursor_renew() and \ref mdbx_cursor_close(). @@ -4196,6 +4411,7 @@ typedef int(MDBX_cmp_func)(const MDBX_val *a, * by current thread. */ LIBMDBX_API int mdbx_dbi_open(MDBX_txn *txn, const char *name, MDBX_db_flags_t flags, MDBX_dbi *dbi); +/** \copydoc mdbx_dbi_open() */ LIBMDBX_API int mdbx_dbi_open2(MDBX_txn *txn, const MDBX_val *name, MDBX_db_flags_t flags, MDBX_dbi *dbi); @@ -4217,10 +4433,30 @@ LIBMDBX_API int mdbx_dbi_open2(MDBX_txn *txn, const MDBX_val *name, MDBX_DEPRECATED LIBMDBX_API int mdbx_dbi_open_ex(MDBX_txn *txn, const char *name, MDBX_db_flags_t flags, MDBX_dbi *dbi, MDBX_cmp_func *keycmp, MDBX_cmp_func *datacmp); +/** \copydoc mdbx_dbi_open_ex() */ MDBX_DEPRECATED LIBMDBX_API int mdbx_dbi_open_ex2(MDBX_txn *txn, const MDBX_val *name, MDBX_db_flags_t flags, MDBX_dbi *dbi, MDBX_cmp_func *keycmp, MDBX_cmp_func *datacmp); +/** \brief Переименовает таблицу по DBI-хендлу. + * \ingroup c_dbi + * + * Переименовывает пользовательскую именованную subDB связанную с передаваемым + * DBI-дескриптором. + * + * \param [in,out] txn Пишущая транзакция запущенная посредством + * \ref mdbx_txn_begin(). + * \param [in] dbi Дескриптор таблицы (именованной пользовательской subDB) + * открытый посредством \ref mdbx_dbi_open(). + * + * \param [in] name Новое имя для переименования. + * + * \returns Ненулевое значение ошибки при сбое и 0 при успешном выполнении. 
*/ +LIBMDBX_API int mdbx_dbi_rename(MDBX_txn *txn, MDBX_dbi dbi, const char *name); +/** \copydoc mdbx_dbi_rename() */ +LIBMDBX_API int mdbx_dbi_rename2(MDBX_txn *txn, MDBX_dbi dbi, + const MDBX_val *name); + /** \defgroup value2key Value-to-Key functions * \brief Value-to-Key functions to * \ref avoid_custom_comparators "avoid using custom comparators" @@ -4734,6 +4970,28 @@ mdbx_cursor_get_userctx(const MDBX_cursor *cursor); LIBMDBX_API int mdbx_cursor_bind(const MDBX_txn *txn, MDBX_cursor *cursor, MDBX_dbi dbi); +/** \brief Unbind cursor from a transaction. + * \ingroup c_cursors + * + * Unbinded cursor is disassociated with any transactions but still holds + * the original DBI-handle internally. Thus it could be renewed with any running + * transaction or closed. + * + * \see mdbx_cursor_renew() + * \see mdbx_cursor_bind() + * \see mdbx_cursor_close() + * + * \note In contrast to LMDB, the MDBX required that any opened cursors can be + * reused and must be freed explicitly, regardless ones was opened in a + * read-only or write transaction. The REASON for this is eliminates ambiguity + * which helps to avoid errors such as: use-after-free, double-free, i.e. + * memory corruption and segfaults. + * + * \param [in] cursor A cursor handle returned by \ref mdbx_cursor_open(). + * + * \returns A non-zero error value on failure and 0 on success. */ +LIBMDBX_API int mdbx_cursor_unbind(MDBX_cursor *cursor); + /** \brief Create a cursor handle for the specified transaction and DBI handle. * \ingroup c_cursors * @@ -4783,6 +5041,27 @@ LIBMDBX_API int mdbx_cursor_open(const MDBX_txn *txn, MDBX_dbi dbi, * or \ref mdbx_cursor_create(). */ LIBMDBX_API void mdbx_cursor_close(MDBX_cursor *cursor); +/** \brief Unbind or closes all cursors of a given transaction. + * \ingroup c_cursors + * + * Unbinds either closes all cursors associated (opened or renewed) with + * a given transaction in a bulk with minimal overhead. 
+ * + * \see mdbx_cursor_unbind() + * \see mdbx_cursor_close() + * + * \param [in] txn A transaction handle returned by \ref mdbx_txn_begin(). + * \param [in] unbind If non-zero, unbinds cursors and leaves ones reusable. + * Otherwise close and dispose cursors. + * + * \returns A negative error value on failure or the number of closed cursors + * on success, some possible errors are: + * \retval MDBX_THREAD_MISMATCH Given transaction is not owned + * by current thread. + * \retval MDBX_BAD_TXN Given transaction is invalid or has + * a child/nested transaction. */ +LIBMDBX_API int mdbx_txn_release_all_cursors(const MDBX_txn *txn, bool unbind); + /** \brief Renew a cursor handle for use within the given transaction. * \ingroup c_cursors * @@ -4834,6 +5113,31 @@ LIBMDBX_API MDBX_dbi mdbx_cursor_dbi(const MDBX_cursor *cursor); * \returns A non-zero error value on failure and 0 on success. */ LIBMDBX_API int mdbx_cursor_copy(const MDBX_cursor *src, MDBX_cursor *dest); +/** \brief Сравнивает позицию курсоров. + * \ingroup c_cursors + * + * Функция предназначена для сравнения позиций двух + * инициализированных/установленных курсоров, связанных с одной транзакцией и + * одной таблицей (DBI-дескриптором). + * Если же курсоры связаны с разными транзакциями, либо с разными таблицами, + * либо один из них не инициализирован, то результат сравнения не определен + * (поведение может быть изменено в последующих версиях). + * + * \param [in] left Левый курсор для сравнения позиций. + * \param [in] right Правый курсор для сравнения позиций. + * \param [in] ignore_multival Булевой флаг, влияющий на результат только при + * сравнении курсоров для таблиц с мульти-значениями, т.е. с флагом + * \ref MDBX_DUPSORT. В случае `true`, позиции курсоров сравниваются + * только по ключам, без учета позиционирования среди мульти-значений. + * Иначе, в случае `false`, при совпадении позиций по ключам, + * сравниваются также позиции по мульти-значениям.
+ * + * \retval Значение со знаком в семантике оператора `<=>` (меньше нуля, ноль, + * либо больше нуля) как результат сравнения позиций курсоров. */ +LIBMDBX_API int mdbx_cursor_compare(const MDBX_cursor *left, + const MDBX_cursor *right, + bool ignore_multival); + /** \brief Retrieve by cursor. * \ingroup c_crud * @@ -4868,6 +5172,203 @@ LIBMDBX_API int mdbx_cursor_copy(const MDBX_cursor *src, MDBX_cursor *dest); LIBMDBX_API int mdbx_cursor_get(MDBX_cursor *cursor, MDBX_val *key, MDBX_val *data, MDBX_cursor_op op); +/** \brief Тип предикативных функций обратного вызова используемых + * \ref mdbx_cursor_scan() и \ref mdbx_cursor_scan_from() для пробирования + * пар ключ-значения. + * \ingroup c_crud + * + * \param [in,out] context Указатель на контекст с необходимой для оценки + * информацией, который полностью подготавливается + * и контролируется вами. + * \param [in] key Ключ для оценки пользовательской функцией. + * \param [in] value Значение для оценки пользовательской функцией. + * \param [in,out] arg Дополнительный аргумент предикативной функции, + * который полностью подготавливается + * и контролируется вами. + * + * \returns Результат проверки соответствия переданной пары ключ-значения + * искомой цели. Иначе код ошибки, который прерывает сканирование и возвращается + * без изменения в качестве результата из функций \ref mdbx_cursor_scan() + * или \ref mdbx_cursor_scan_from(). + * + * \retval MDBX_RESULT_TRUE если переданная пара ключ-значение соответствует + * искомой и следует завершить сканирование. + * \retval MDBX_RESULT_FALSE если переданная пара ключ-значение НЕ соответствует + * искомой и следует продолжать сканирование. + * \retval ИНАЧЕ любое другое значение, отличное от \ref MDBX_RESULT_TRUE + * и \ref MDBX_RESULT_FALSE, считается индикатором ошибки + * и возвращается без изменений в качестве результата сканирования. 
+ * + * \see mdbx_cursor_scan() + * \see mdbx_cursor_scan_from() */ +typedef int(MDBX_predicate_func)(void *context, MDBX_val *key, MDBX_val *value, + void *arg) MDBX_CXX17_NOEXCEPT; + +/** \brief Сканирует таблицу с использованием передаваемого предиката, + * с уменьшением сопутствующих накладных расходов. + * \ingroup c_crud + * + * Реализует функционал сходный с шаблоном `std::find_if<>()` с использованием + * курсора и пользовательской предикативной функции, экономя при этом + * на сопутствующих накладных расходах, в том числе, не выполняя часть проверок + * внутри цикла итерации записей и потенциально уменьшая количество + * DSO-трансграничных вызовов. + * + * Функция принимает курсор, который должен быть привязан к некоторой транзакции + * и DBI-дескриптору таблицы (именованной пользовательской subDB), выполняет + * первоначальное позиционирование курсора определяемое аргументом `start_op`. + * Далее, производится оценка каждой пары ключ-значения посредством + * предоставляемой вами предикативной функции `predicate` и затем, при + * необходимости, переход к следующему элементу посредством операции `turn_op`, + * до наступления одного из четырех событий: + * - достигается конец данных; + * - возникнет ошибка при позиционировании курсора; + * - оценочная функция вернет \ref MDBX_RESULT_TRUE, сигнализируя + * о необходимости остановить дальнейшее сканирование; + * - оценочная функция возвратит значение отличное от \ref MDBX_RESULT_FALSE + * и \ref MDBX_RESULT_TRUE сигнализируя об ошибке. + * + * \param [in,out] cursor Курсор для выполнения операции сканирования, + * связанный с активной транзакцией и DBI-дескриптором + * таблицы. Например, курсор созданный + * посредством \ref mdbx_cursor_open(). + * \param [in] predicate Предикативная функция для оценки итерируемых + * пар ключ-значения, + * более подробно смотрите \ref MDBX_predicate_func. 
+ * \param [in,out] context Указатель на контекст с необходимой для оценки + * информацией, который полностью подготавливается + * и контролируется вами. + * \param [in] start_op Стартовая операция позиционирования курсора, + * более подробно смотрите \ref MDBX_cursor_op. + * Для сканирования без изменения исходной позиции + * курсора используйте \ref MDBX_GET_CURRENT. + * Допустимые значения \ref MDBX_FIRST, + * \ref MDBX_FIRST_DUP, \ref MDBX_LAST, + * \ref MDBX_LAST_DUP, \ref MDBX_GET_CURRENT, + * а также \ref MDBX_GET_MULTIPLE. + * \param [in] turn_op Операция позиционирования курсора для перехода + * к следующему элементу. Допустимые значения + * \ref MDBX_NEXT, \ref MDBX_NEXT_DUP, + * \ref MDBX_NEXT_NODUP, \ref MDBX_PREV, + * \ref MDBX_PREV_DUP, \ref MDBX_PREV_NODUP, а также + * \ref MDBX_NEXT_MULTIPLE и \ref MDBX_PREV_MULTIPLE. + * \param [in,out] arg Дополнительный аргумент предикативной функции, + * который полностью подготавливается + * и контролируется вами. + * + * \note При использовании \ref MDBX_GET_MULTIPLE, \ref MDBX_NEXT_MULTIPLE + * или \ref MDBX_PREV_MULTIPLE внимательно учитывайте пакетную специфику + * передачи значений через параметры предикативной функции. + * + * \see MDBX_predicate_func + * \see mdbx_cursor_scan_from + * + * \returns Результат операции сканирования, либо код ошибки. + * + * \retval MDBX_RESULT_TRUE если найдена пара ключ-значение, для которой + * предикативная функция вернула \ref MDBX_RESULT_TRUE. + * \retval MDBX_RESULT_FALSE если подходящая пара ключ-значения НЕ найдена, + * в процессе поиска достигнут конец данных, либо нет данных для поиска. + * \retval ИНАЧЕ любое другое значение, отличное от \ref MDBX_RESULT_TRUE + * и \ref MDBX_RESULT_FALSE, является кодом ошибки при позиционировании + * курсора, либо определяемым пользователем кодом остановки поиска + * или ошибочной ситуации.
*/ +LIBMDBX_API int mdbx_cursor_scan(MDBX_cursor *cursor, + MDBX_predicate_func *predicate, void *context, + MDBX_cursor_op start_op, + MDBX_cursor_op turn_op, void *arg); + +/** Сканирует таблицу с использованием передаваемого предиката, + * начиная с передаваемой пары ключ-значение, + * с уменьшением сопутствующих накладных расходов. + * \ingroup c_crud + * + * Функция принимает курсор, который должен быть привязан к некоторой транзакции + * и DBI-дескриптору таблицы (именованной пользовательской subDB), выполняет + * первоначальное позиционирование курсора определяемое аргументом `from_op`. + * а также аргументами `from_key` и `from_value`. + * Далее, производится оценка каждой пары ключ-значения посредством + * предоставляемой вами предикативной функции `predicate` и затем, при + * необходимости, переход к следующему элементу посредством операции `turn_op`, + * до наступления одного из четырех событий: + * - достигается конец данных; + * - возникнет ошибка при позиционировании курсора; + * - оценочная функция вернет \ref MDBX_RESULT_TRUE, сигнализируя + * о необходимости остановить дальнейшее сканирование; + * - оценочная функция возвратит значение отличное от \ref MDBX_RESULT_FALSE + * и \ref MDBX_RESULT_TRUE сигнализируя об ошибке. + * + * \param [in,out] cursor Курсор для выполнения операции сканирования, + * связанный с активной транзакцией и DBI-дескриптором + * таблицы. Например, курсор созданный + * посредством \ref mdbx_cursor_open(). + * \param [in] predicate Предикативная функция для оценки итерируемых + * пар ключ-значения, + * более подробно смотрите \ref MDBX_predicate_func. + * \param [in,out] context Указатель на контекст с необходимой для оценки + * информацией, который полностью подготавливается + * и контролируется вами. + * \param [in] from_op Операция позиционирования курсора к исходной + * позиции, более подробно смотрите + * \ref MDBX_cursor_op. 
+ * Допустимые значения \ref MDBX_GET_BOTH, + * \ref MDBX_GET_BOTH_RANGE, \ref MDBX_SET_KEY, + * \ref MDBX_SET_LOWERBOUND, \ref MDBX_SET_UPPERBOUND, + * \ref MDBX_TO_KEY_LESSER_THAN, + * \ref MDBX_TO_KEY_LESSER_OR_EQUAL, + * \ref MDBX_TO_KEY_EQUAL, + * \ref MDBX_TO_KEY_GREATER_OR_EQUAL, + * \ref MDBX_TO_KEY_GREATER_THAN, + * \ref MDBX_TO_EXACT_KEY_VALUE_LESSER_THAN, + * \ref MDBX_TO_EXACT_KEY_VALUE_LESSER_OR_EQUAL, + * \ref MDBX_TO_EXACT_KEY_VALUE_EQUAL, + * \ref MDBX_TO_EXACT_KEY_VALUE_GREATER_OR_EQUAL, + * \ref MDBX_TO_EXACT_KEY_VALUE_GREATER_THAN, + * \ref MDBX_TO_PAIR_LESSER_THAN, + * \ref MDBX_TO_PAIR_LESSER_OR_EQUAL, + * \ref MDBX_TO_PAIR_EQUAL, + * \ref MDBX_TO_PAIR_GREATER_OR_EQUAL, + * \ref MDBX_TO_PAIR_GREATER_THAN, + * а также \ref MDBX_GET_MULTIPLE. + * \param [in,out] from_key Указатель на ключ используемый как для исходного + * позиционирования, так и для последующих итераций + * перехода. + * \param [in,out] from_value Указатель на значение используемое как для + * исходного позиционирования, так и для последующих + * итераций перехода. + * \param [in] turn_op Операция позиционирования курсора для перехода + * к следующему элементу. Допустимые значения + * \ref MDBX_NEXT, \ref MDBX_NEXT_DUP, + * \ref MDBX_NEXT_NODUP, \ref MDBX_PREV, + * \ref MDBX_PREV_DUP, \ref MDBX_PREV_NODUP, а также + * \ref MDBX_NEXT_MULTIPLE и \ref MDBX_PREV_MULTIPLE. + * \param [in,out] arg Дополнительный аргумент предикативной функции, + * который полностью подготавливается + * и контролируется вами. + * + * \note При использовании \ref MDBX_GET_MULTIPLE, \ref MDBX_NEXT_MULTIPLE + * или \ref MDBX_PREV_MULTIPLE внимательно учитывайте пакетную специфику + * передачи значений через параметры предикативной функции. + * + * \see MDBX_predicate_func + * \see mdbx_cursor_scan + * + * \returns Результат операции сканирования, либо код ошибки. + * + * \retval MDBX_RESULT_TRUE если найдена пара ключ-значение, для которой + * предикативная функция вернула \ref MDBX_RESULT_TRUE.
+ * \retval MDBX_RESULT_FALSE если подходящая пара ключ-значения НЕ найдена, + * в процессе поиска достигнут конец данных, либо нет данных для поиска. + * \retval ИНАЧЕ любое другое значение, отличное от \ref MDBX_RESULT_TRUE + * и \ref MDBX_RESULT_FALSE, является кодом ошибки при позиционировании + * курсора, либо определяемым пользователем кодом остановки поиска + * или ошибочной ситуации. */ +LIBMDBX_API int mdbx_cursor_scan_from(MDBX_cursor *cursor, + MDBX_predicate_func *predicate, + void *context, MDBX_cursor_op from_op, + MDBX_val *from_key, MDBX_val *from_value, + MDBX_cursor_op turn_op, void *arg); + /** \brief Retrieve multiple non-dupsort key/value pairs by cursor. * \ingroup c_crud * @@ -5054,7 +5555,7 @@ LIBMDBX_API int mdbx_cursor_count(const MDBX_cursor *cursor, size_t *pcount); * \param [in] cursor A cursor handle returned by \ref mdbx_cursor_open(). * * \returns A \ref MDBX_RESULT_TRUE or \ref MDBX_RESULT_FALSE value, - * otherwise the error code: + * otherwise the error code. * \retval MDBX_RESULT_TRUE No more data available or cursor not * positioned * \retval MDBX_RESULT_FALSE A data is available @@ -5069,13 +5570,27 @@ mdbx_cursor_eof(const MDBX_cursor *cursor); * \param [in] cursor A cursor handle returned by \ref mdbx_cursor_open(). * * \returns A MDBX_RESULT_TRUE or MDBX_RESULT_FALSE value, - * otherwise the error code: + * otherwise the error code. * \retval MDBX_RESULT_TRUE Cursor positioned to the first key-value pair * \retval MDBX_RESULT_FALSE Cursor NOT positioned to the first key-value * pair \retval Otherwise the error code */ MDBX_NOTHROW_PURE_FUNCTION LIBMDBX_API int mdbx_cursor_on_first(const MDBX_cursor *cursor); +/** \brief Определяет стоит ли курсор на первом или единственном + * мульти-значении соответствующем ключу. + * \ingroup c_cursors + * \param [in] cursor Курсор созданный посредством \ref mdbx_cursor_open(). + * \returns Значение \ref MDBX_RESULT_TRUE, либо \ref MDBX_RESULT_FALSE, + * иначе код ошибки.
+ * \retval MDBX_RESULT_TRUE курсор установлен на первом или единственном + * мульти-значении соответствующем ключу. + * \retval MDBX_RESULT_FALSE курсор НЕ установлен на первом или единственном + * мульти-значении соответствующем ключу. + * \retval ИНАЧЕ код ошибки. */ +MDBX_NOTHROW_PURE_FUNCTION LIBMDBX_API int +mdbx_cursor_on_first_dup(const MDBX_cursor *cursor); + /** \brief Determines whether the cursor is pointed to the last key-value pair * or not. * \ingroup c_cursors @@ -5083,13 +5598,27 @@ mdbx_cursor_on_first(const MDBX_cursor *cursor); * \param [in] cursor A cursor handle returned by \ref mdbx_cursor_open(). * * \returns A \ref MDBX_RESULT_TRUE or \ref MDBX_RESULT_FALSE value, - * otherwise the error code: + * otherwise the error code. * \retval MDBX_RESULT_TRUE Cursor positioned to the last key-value pair * \retval MDBX_RESULT_FALSE Cursor NOT positioned to the last key-value pair * \retval Otherwise the error code */ MDBX_NOTHROW_PURE_FUNCTION LIBMDBX_API int mdbx_cursor_on_last(const MDBX_cursor *cursor); +/** \brief Определяет стоит ли курсор на последнем или единственном + * мульти-значении соответствующем ключу. + * \ingroup c_cursors + * \param [in] cursor Курсор созданный посредством \ref mdbx_cursor_open(). + * \returns Значение \ref MDBX_RESULT_TRUE, либо \ref MDBX_RESULT_FALSE, + * иначе код ошибки. + * \retval MDBX_RESULT_TRUE курсор установлен на последнем или единственном + * мульти-значении соответствующем ключу. + * \retval MDBX_RESULT_FALSE курсор НЕ установлен на последнем или единственном + * мульти-значении соответствующем ключу. + * \retval ИНАЧЕ код ошибки.
*/ +MDBX_NOTHROW_PURE_FUNCTION LIBMDBX_API int +mdbx_cursor_on_last_dup(const MDBX_cursor *cursor); + /** \addtogroup c_rqest * \details \note The estimation result varies greatly depending on the filling * of specific pages and the overall balance of the b-tree: @@ -5226,7 +5755,7 @@ LIBMDBX_API int mdbx_estimate_range(const MDBX_txn *txn, MDBX_dbi dbi, * \param [in] ptr The address of data to check. * * \returns A MDBX_RESULT_TRUE or MDBX_RESULT_FALSE value, - * otherwise the error code: + * otherwise the error code. * \retval MDBX_RESULT_TRUE Given address is on the dirty page. * \retval MDBX_RESULT_FALSE Given address is NOT on the dirty page. * \retval Otherwise the error code. */ @@ -5521,48 +6050,21 @@ LIBMDBX_API int mdbx_env_set_hsr(MDBX_env *env, MDBX_hsr_func *hsr_callback); MDBX_NOTHROW_PURE_FUNCTION LIBMDBX_API MDBX_hsr_func * mdbx_env_get_hsr(const MDBX_env *env); -/** \defgroup btree_traversal B-tree Traversal - * This is internal API for mdbx_chk tool. You should avoid to use it, except - * some extremal special cases. +/** \defgroup chk Checking and Recovery + * Basically this is internal API for `mdbx_chk` tool, etc. + * You should avoid to use it, except some extremal special cases. * \ingroup c_extra * @{ */ -/** \brief Page types for traverse the b-tree. - * \see mdbx_env_pgwalk() \see MDBX_pgvisitor_func */ -enum MDBX_page_type_t { - MDBX_page_broken, - MDBX_page_meta, - MDBX_page_large, - MDBX_page_branch, - MDBX_page_leaf, - MDBX_page_dupfixed_leaf, - MDBX_subpage_leaf, - MDBX_subpage_dupfixed_leaf, - MDBX_subpage_broken, -}; -#ifndef __cplusplus -typedef enum MDBX_page_type_t MDBX_page_type_t; -#endif - -/** \brief Pseudo-name for MainDB */ -#define MDBX_PGWALK_MAIN ((void *)((ptrdiff_t)0)) -/** \brief Pseudo-name for GarbageCollectorDB */ -#define MDBX_PGWALK_GC ((void *)((ptrdiff_t)-1)) -/** \brief Pseudo-name for MetaPages */ -#define MDBX_PGWALK_META ((void *)((ptrdiff_t)-2)) - -/** \brief Callback function for traverse the b-tree. 
\see mdbx_env_pgwalk() */ -typedef int -MDBX_pgvisitor_func(const uint64_t pgno, const unsigned number, void *const ctx, - const int deep, const MDBX_val *dbi_name, - const size_t page_size, const MDBX_page_type_t type, - const MDBX_error_t err, const size_t nentries, - const size_t payload_bytes, const size_t header_bytes, - const size_t unused_bytes) MDBX_CXX17_NOEXCEPT; +/** \brief Acquires write-transaction lock. + * Provided for custom and/or complex locking scenarios. + * \returns A non-zero error value on failure and 0 on success. */ +LIBMDBX_API int mdbx_txn_lock(MDBX_env *env, bool dont_wait); -/** \brief B-tree traversal function. */ -LIBMDBX_API int mdbx_env_pgwalk(MDBX_txn *txn, MDBX_pgvisitor_func *visitor, - void *ctx, bool dont_check_keys_ordering); +/** \brief Releases write-transaction lock. + * Provided for custom and/or complex locking scenarios. + * \returns A non-zero error value on failure and 0 on success. */ +LIBMDBX_API int mdbx_txn_unlock(MDBX_env *env); /** \brief Open an environment instance using specific meta-page * for checking and recovery. @@ -5594,7 +6096,314 @@ LIBMDBX_API int mdbx_env_open_for_recoveryW(MDBX_env *env, * leg(s). */ LIBMDBX_API int mdbx_env_turn_for_recovery(MDBX_env *env, unsigned target_meta); -/** end of btree_traversal @} */ +/** \brief Получает базовую информацию о БД не открывая её. + * \ingroup c_opening + * + * Назначение функции в получении базовой информации без открытия БД и + * отображения данных в память (что может быть достаточно затратным действием + * для ядра ОС). Полученная таким образом информация может быть полезной для + * подстройки опций работы с БД перед её открытием, а также в сценариях файловых + * менеджерах и прочих вспомогательных утилитах. + * + * \todo Добавить в API возможность установки обратного вызова для ревизии опций + * работы с БД в процессе её открытия (при удержании блокировок). + * + * \param [in] pathname Путь к директории или файлу БД. 
+ * \param [out] info Указатель на структуру \ref MDBX_envinfo + * для получения информации. + * \param [in] bytes Актуальный размер структуры \ref MDBX_envinfo, это + * значение используется для обеспечения совместимости + * ABI. + * + * \note Заполняются только некоторые поля структуры \ref MDBX_envinfo, значения + * которых возможно получить без отображения файлов БД в память и без захвата + * блокировок: размер страницы БД, геометрия БД, размер распределенного места + * (номер последней распределенной страницы), номер последней транзакции и + * boot-id. + * + * \warning Полученная информация является снимком на время выполнения функции и + * может быть в любой момент изменена работающим с БД процессом. В том числе, + * нет препятствий к тому, чтобы другой процесс удалил БД и создал её заново с + * другим размером страницы и/или изменением любых других параметров. + * + * \returns Ненулевое значение ошибки при сбое и 0 при успешном выполнении. */ +LIBMDBX_API int mdbx_preopen_snapinfo(const char *pathname, MDBX_envinfo *info, + size_t bytes); +#if defined(_WIN32) || defined(_WIN64) || defined(DOXYGEN) +/** \copydoc mdbx_preopen_snapinfo() + * \note Available only on Windows. + * \see mdbx_preopen_snapinfo() */ +LIBMDBX_API int mdbx_preopen_snapinfoW(const wchar_t *pathname, + MDBX_envinfo *info, size_t bytes); +#endif /* Windows */ + +/** \brief Флаги/опции для проверки целостности базы данных. + * \note Данный API еще не зафиксирован, в последующих версиях могут быть + * незначительные доработки и изменения. + * \see mdbx_env_chk() */ +enum MDBX_chk_flags_t { + /** Режим проверки по умолчанию, в том числе в режиме только-чтения. */ + MDBX_CHK_DEFAULTS = 0, + + /** Проверка в режиме чтения-записи, с захватом блокировки и приостановкой + * пишущих транзакций. */ + MDBX_CHK_READWRITE = 1, + + /** Пропустить обход дерева страниц. */ + MDBX_CHK_SKIP_BTREE_TRAVERSAL = 2, + + /** Пропустить просмотр записей ключ-значение. 
*/ + MDBX_CHK_SKIP_KV_TRAVERSAL = 4, + + /** Игнорировать порядок ключей и записей. + * \note Требуется при проверке унаследованных БД созданных с использованием + * нестандартных (пользовательских) функций сравнения ключей или значений. */ + MDBX_CHK_IGNORE_ORDER = 8 +}; +#ifndef __cplusplus +/** \ingroup c_opening */ +typedef enum MDBX_chk_flags_t MDBX_chk_flags_t; +#else +DEFINE_ENUM_FLAG_OPERATORS(MDBX_chk_flags_t) +#endif + +/** \brief Уровни логирования/детализации информации, + * поставляемой через обратные вызовы при проверке целостности базы данных. + * \see mdbx_env_chk() */ +enum MDBX_chk_severity { + MDBX_chk_severity_prio_shift = 4, + MDBX_chk_severity_kind_mask = 0xF, + MDBX_chk_fatal = 0x00u, + MDBX_chk_error = 0x11u, + MDBX_chk_warning = 0x22u, + MDBX_chk_notice = 0x33u, + MDBX_chk_result = 0x44u, + MDBX_chk_resolution = 0x55u, + MDBX_chk_processing = 0x56u, + MDBX_chk_info = 0x67u, + MDBX_chk_verbose = 0x78u, + MDBX_chk_details = 0x89u, + MDBX_chk_extra = 0x9Au +}; + +/** \brief Стадии проверки, + * сообщаемые через обратные вызовы при проверке целостности базы данных. + * \see mdbx_env_chk() */ +enum MDBX_chk_stage { + MDBX_chk_none, + MDBX_chk_init, + MDBX_chk_lock, + MDBX_chk_meta, + MDBX_chk_traversal_tree, + MDBX_chk_traversal_freedb, + MDBX_chk_space, + MDBX_chk_traversal_maindb, + MDBX_chk_traversal_subdbs, + MDBX_chk_conclude, + MDBX_chk_unlock, + MDBX_chk_finalize +}; + +/** \brief Виртуальная строка отчета, формируемого при проверке целостности базы + * данных. \see mdbx_env_chk() */ +typedef struct MDBX_chk_line { + struct MDBX_chk_context *ctx; + uint8_t severity, scope_depth, empty; + char *begin, *end, *out; +} MDBX_chk_line_t; + +/** \brief Проблема обнаруженная при проверке целостности базы данных. + * \see mdbx_env_chk() */ +typedef struct MDBX_chk_issue { + struct MDBX_chk_issue *next; + size_t count; + const char *caption; +} MDBX_chk_issue_t; + +/** \brief Иерархический контекст при проверке целостности базы данных. 
+ * \see mdbx_env_chk() */ +typedef struct MDBX_chk_scope { + MDBX_chk_issue_t *issues; + struct MDBX_chk_internal *internal; + const void *object; + enum MDBX_chk_stage stage; + enum MDBX_chk_severity verbosity; + size_t subtotal_issues; + union { + void *ptr; + size_t number; + } usr_z, usr_v, usr_o; +} MDBX_chk_scope_t; + +/** \brief Пользовательский тип для привязки дополнительных данных, + * связанных с некоторой таблицей ключ-значение, при проверке целостности базы + * данных. \see mdbx_env_chk() */ +typedef struct MDBX_chk_user_subdb_cookie MDBX_chk_user_subdb_cookie_t; + +/** \brief Гистограмма с некоторой статистической информацией, + * собираемой при проверке целостности БД. + * \see mdbx_env_chk() */ +struct MDBX_chk_histogram { + size_t amount, count, ones, pad; + struct { + size_t begin, end, amount, count; + } ranges[9]; +}; + +/** \brief Информация о некоторой таблице ключ-значение, + * при проверке целостности базы данных. + * \see mdbx_env_chk() */ +typedef struct MDBX_chk_subdb { + MDBX_chk_user_subdb_cookie_t *cookie; + +/** \brief Pseudo-name for MainDB */ +#define MDBX_CHK_MAIN ((void *)((ptrdiff_t)0)) +/** \brief Pseudo-name for GarbageCollectorDB */ +#define MDBX_CHK_GC ((void *)((ptrdiff_t)-1)) +/** \brief Pseudo-name for MetaPages */ +#define MDBX_CHK_META ((void *)((ptrdiff_t)-2)) + + MDBX_val name; + MDBX_db_flags_t flags; + int id; + + size_t payload_bytes, lost_bytes; + struct { + size_t all, empty, other; + size_t branch, leaf; + size_t nested_branch, nested_leaf, nested_subleaf; + } pages; + struct { + /// Tree deep histogram + struct MDBX_chk_histogram deep; + /// Histogram of large/overflow pages length + struct MDBX_chk_histogram large_pages; + /// Histogram of nested trees height, span length for GC + struct MDBX_chk_histogram nested_tree; + /// Keys length histogram + struct MDBX_chk_histogram key_len; + /// Values length histogram + struct MDBX_chk_histogram val_len; + } histogram; +} MDBX_chk_subdb_t; + +/** \brief Контекст 
проверки целостности базы данных. + * \see mdbx_env_chk() */ +typedef struct MDBX_chk_context { + struct MDBX_chk_internal *internal; + MDBX_env *env; + MDBX_txn *txn; + MDBX_chk_scope_t *scope; + uint8_t scope_nesting; + struct { + size_t total_payload_bytes; + size_t subdb_total, subdb_processed; + size_t total_unused_bytes, unused_pages; + size_t processed_pages, reclaimable_pages, gc_pages, alloc_pages, + backed_pages; + size_t problems_meta, tree_problems, gc_tree_problems, kv_tree_problems, + problems_gc, problems_kv, total_problems; + uint64_t steady_txnid, recent_txnid; + /** Указатель на массив размером subdb_total с указателями на экземпляры + * структур MDBX_chk_subdb_t с информацией о всех таблицах ключ-значение, + * включая MainDB и GC/FreeDB. */ + const MDBX_chk_subdb_t *const *subdbs; + } result; +} MDBX_chk_context_t; + +/** \brief Набор функций обратного вызова используемых при проверке целостности + * базы данных. + * + * Функции обратного вызова предназначены для организации взаимодействия с кодом + * приложения. В том числе, для интеграции логики приложения проверяющей + * целостность структуры данных выше уровня ключ-значение, подготовки и + * структурированного вывода информации как о ходе, так и результатов проверки. + * + * Все функции обратного вызова опциональны, неиспользуемые указатели должны + * быть установлены в `nullptr`. + * + * \note Данный API еще не зафиксирован, в последующих версиях могут быть + * незначительные доработки и изменения. 
+ * + * \see mdbx_env_chk() */ +typedef struct MDBX_chk_callbacks { + bool (*check_break)(MDBX_chk_context_t *ctx); + int (*scope_push)(MDBX_chk_context_t *ctx, MDBX_chk_scope_t *outer, + MDBX_chk_scope_t *inner, const char *fmt, va_list args); + int (*scope_conclude)(MDBX_chk_context_t *ctx, MDBX_chk_scope_t *outer, + MDBX_chk_scope_t *inner, int err); + void (*scope_pop)(MDBX_chk_context_t *ctx, MDBX_chk_scope_t *outer, + MDBX_chk_scope_t *inner); + void (*issue)(MDBX_chk_context_t *ctx, const char *object, + uint64_t entry_number, const char *issue, const char *extra_fmt, + va_list extra_args); + MDBX_chk_user_subdb_cookie_t *(*subdb_filter)(MDBX_chk_context_t *ctx, + const MDBX_val *name, + MDBX_db_flags_t flags); + int (*subdb_conclude)(MDBX_chk_context_t *ctx, const MDBX_chk_subdb_t *subdb, + MDBX_cursor *cursor, int err); + void (*subdb_dispose)(MDBX_chk_context_t *ctx, const MDBX_chk_subdb_t *subdb); + + int (*subdb_handle_kv)(MDBX_chk_context_t *ctx, const MDBX_chk_subdb_t *subdb, + size_t entry_number, const MDBX_val *key, + const MDBX_val *value); + + int (*stage_begin)(MDBX_chk_context_t *ctx, enum MDBX_chk_stage); + int (*stage_end)(MDBX_chk_context_t *ctx, enum MDBX_chk_stage, int err); + + MDBX_chk_line_t *(*print_begin)(MDBX_chk_context_t *ctx, + enum MDBX_chk_severity severity); + void (*print_flush)(MDBX_chk_line_t *); + void (*print_done)(MDBX_chk_line_t *); + void (*print_chars)(MDBX_chk_line_t *, const char *str, size_t len); + void (*print_format)(MDBX_chk_line_t *, const char *fmt, va_list args); + void (*print_size)(MDBX_chk_line_t *, const char *prefix, + const uint64_t value, const char *suffix); +} MDBX_chk_callbacks_t; + +/** \brief Проверяет целостность базы данных. + * + * Взаимодействие с кодом приложения реализуется через функции обратного вызова, + * предоставляемые приложением посредством параметра `cb`. 
В ходе такого + * взаимодействия приложение может контролировать ход проверки, в том числе, + * пропускать/фильтровать обработку отдельных элементов, а также реализовать + * дополнительную верификацию структуры и/или информации с учетом назначения и + * семантической значимости для приложения. Например, приложение может выполнить + * проверку собственных индексов и корректность записей в БД. Именно с этой + * целью функционал проверки целостности был доработан для интенсивного + * использования обратных вызовов и перенесен из утилиты `mdbx_chk` в основную + * библиотеку. + * + * Проверка выполняется в несколько стадий, начиная с инициализации и до + * завершения, более подробно см \ref MDBX_chk_stage. О начале и завершении + * каждой стадии код приложения уведомляется через соответствующие функции + * обратного вызова, более подробно см \ref MDBX_chk_callbacks_t. + * + * \param [in] env Указатель на экземпляр среды. + * \param [in] cb Набор функций обратного вызова. + * \param [in,out] ctx Контекст проверки целостности базы данных, + * где будут формироваться результаты проверки. + * \param [in] flags Флаги/опции проверки целостности базы данных. + * \param [in] verbosity Необходимый уровень детализации информации о ходе + * и результатах проверки. + * \param [in] timeout_seconds_16dot16 Ограничение длительности в 1/65536 долях + * секунды для выполнения проверки, + * либо 0 при отсутствии ограничения. + * \returns Нулевое значение в случае успеха, иначе код ошибки. */ +LIBMDBX_API int mdbx_env_chk(MDBX_env *env, const MDBX_chk_callbacks_t *cb, + MDBX_chk_context_t *ctx, + const enum MDBX_chk_flags_t flags, + enum MDBX_chk_severity verbosity, + unsigned timeout_seconds_16dot16); + +/** \brief Вспомогательная функция для подсчета проблем детектируемых + * приложением, в том числе, поступающим к приложению через логирование. + * \see mdbx_env_chk() + * \see MDBX_debug_func + * \returns Нулевое значение в случае успеха, иначе код ошибки. 
*/ +LIBMDBX_API int mdbx_env_chk_encount_problem(MDBX_chk_context_t *ctx); + +/** end of chk @} */ /** end of c_api @} */ diff --git a/mdbxdist/mdbx.h++ b/mdbxdist/mdbx.h++ index b77d851..f8f6df4 100644 --- a/mdbxdist/mdbx.h++ +++ b/mdbxdist/mdbx.h++ @@ -1,7 +1,7 @@ /// \file mdbx.h++ /// \brief The libmdbx C++ API header file. /// -/// \author Copyright (c) 2020-2023, Leonid Yuriev . +/// \author Copyright (c) 2020-2024, Leonid Yuriev . /// \copyright SPDX-License-Identifier: Apache-2.0 /// /// Tested with: @@ -85,6 +85,10 @@ #include #endif +#if defined(__cpp_lib_span) && __cpp_lib_span >= 202002L +#include +#endif + #if __cplusplus >= 201103L #include #include @@ -162,6 +166,20 @@ #define MDBX_CXX20_CONSTEXPR inline #endif /* MDBX_CXX20_CONSTEXPR */ +#if CONSTEXPR_ENUM_FLAGS_OPERATIONS || defined(DOXYGEN) +#define MDBX_CXX01_CONSTEXPR_ENUM MDBX_CXX01_CONSTEXPR +#define MDBX_CXX11_CONSTEXPR_ENUM MDBX_CXX11_CONSTEXPR +#define MDBX_CXX14_CONSTEXPR_ENUM MDBX_CXX14_CONSTEXPR +#define MDBX_CXX17_CONSTEXPR_ENUM MDBX_CXX17_CONSTEXPR +#define MDBX_CXX20_CONSTEXPR_ENUM MDBX_CXX20_CONSTEXPR +#else +#define MDBX_CXX01_CONSTEXPR_ENUM inline +#define MDBX_CXX11_CONSTEXPR_ENUM inline +#define MDBX_CXX14_CONSTEXPR_ENUM inline +#define MDBX_CXX17_CONSTEXPR_ENUM inline +#define MDBX_CXX20_CONSTEXPR_ENUM inline +#endif /* CONSTEXPR_ENUM_FLAGS_OPERATIONS */ + /** Workaround for old compilers without support assertion inside `constexpr` * functions. */ #if defined(CONSTEXPR_ASSERT) @@ -336,13 +354,23 @@ static MDBX_CXX20_CONSTEXPR void *memcpy(void *dest, const void *src, static MDBX_CXX20_CONSTEXPR int memcmp(const void *a, const void *b, size_t bytes) noexcept; -/// \brief Legacy default allocator +/// \brief Legacy allocator /// but it is recommended to use \ref polymorphic_allocator. 
using legacy_allocator = ::std::string::allocator_type; +#if defined(DOXYGEN) || \ + (defined(__cpp_lib_memory_resource) && \ + __cpp_lib_memory_resource >= 201603L && _GLIBCXX_USE_CXX11_ABI) +/// \brief Default polymorphic allocator for modern code. +using polymorphic_allocator = ::std::pmr::string::allocator_type; +using default_allocator = polymorphic_allocator; +#else +using default_allocator = legacy_allocator; +#endif /* __cpp_lib_memory_resource >= 201603L */ + struct slice; struct default_capacity_policy; -template class buffer; class env; @@ -352,16 +380,6 @@ class txn_managed; class cursor; class cursor_managed; -#if defined(DOXYGEN) || \ - (defined(__cpp_lib_memory_resource) && \ - __cpp_lib_memory_resource >= 201603L && _GLIBCXX_USE_CXX11_ABI) -/// \brief Default polymorphic allocator for modern code. -using polymorphic_allocator = ::std::pmr::string::allocator_type; -using default_allocator = polymorphic_allocator; -#else -using default_allocator = legacy_allocator; -#endif /* __cpp_lib_memory_resource >= 201603L */ - /// \brief Default buffer. 
using default_buffer = buffer; @@ -488,6 +506,7 @@ public: static inline void throw_on_failure(int error_code); static inline bool boolean_or_throw(int error_code); static inline void success_or_throw(int error_code, const exception_thunk &); + static inline bool boolean_or_throw(int error_code, const exception_thunk &); static inline void panic_on_failure(int error_code, const char *context_where, const char *func_who) noexcept; static inline void success_or_panic(int error_code, const char *context_where, @@ -562,6 +581,7 @@ MDBX_DECLARE_EXCEPTION(thread_mismatch); MDBX_DECLARE_EXCEPTION(transaction_full); MDBX_DECLARE_EXCEPTION(transaction_overlapping); MDBX_DECLARE_EXCEPTION(duplicated_lck_file); +MDBX_DECLARE_EXCEPTION(dangling_map_id); #undef MDBX_DECLARE_EXCEPTION [[noreturn]] LIBMDBX_API void throw_too_small_target_buffer(); @@ -569,6 +589,7 @@ MDBX_DECLARE_EXCEPTION(duplicated_lck_file); [[noreturn]] LIBMDBX_API void throw_out_range(); [[noreturn]] LIBMDBX_API void throw_allocators_mismatch(); [[noreturn]] LIBMDBX_API void throw_bad_value_size(); +[[noreturn]] LIBMDBX_API void throw_incomparable_cursors(); static MDBX_CXX14_CONSTEXPR size_t check_length(size_t bytes); static MDBX_CXX14_CONSTEXPR size_t check_length(size_t headroom, size_t payload); @@ -616,24 +637,24 @@ concept SliceTranscoder = #endif /* MDBX_HAVE_CXX20_CONCEPTS */ -template inline buffer make_buffer(PRODUCER &producer, const ALLOCATOR &allocator = ALLOCATOR()); -template inline buffer make_buffer(const PRODUCER &producer, const ALLOCATOR &allocator = ALLOCATOR()); -template inline string make_string(PRODUCER &producer, const ALLOCATOR &allocator = ALLOCATOR()); -template inline string make_string(const PRODUCER &producer, const ALLOCATOR &allocator = ALLOCATOR()); @@ -682,6 +703,47 @@ struct LIBMDBX_API_TYPE slice : public ::MDBX_val { MDBX_CXX14_CONSTEXPR slice(MDBX_val &&src); MDBX_CXX14_CONSTEXPR slice(slice &&src) noexcept; +#if defined(DOXYGEN) || (defined(__cpp_lib_span) && 
__cpp_lib_span >= 202002L) + template + MDBX_CXX14_CONSTEXPR slice(const ::std::span &span) + : slice(span.begin(), span.end()) { + static_assert(::std::is_standard_layout::value && + !::std::is_pointer::value, + "Must be a standard layout type!"); + } + + template + MDBX_CXX14_CONSTEXPR ::std::span as_span() const { + static_assert(::std::is_standard_layout::value && + !::std::is_pointer::value, + "Must be a standard layout type!"); + if (MDBX_LIKELY(size() % sizeof(POD) == 0)) + MDBX_CXX20_LIKELY + return ::std::span(static_cast(data()), + size() / sizeof(POD)); + throw_bad_value_size(); + } + + template MDBX_CXX14_CONSTEXPR ::std::span as_span() { + static_assert(::std::is_standard_layout::value && + !::std::is_pointer::value, + "Must be a standard layout type!"); + if (MDBX_LIKELY(size() % sizeof(POD) == 0)) + MDBX_CXX20_LIKELY + return ::std::span(static_cast(data()), size() / sizeof(POD)); + throw_bad_value_size(); + } + + MDBX_CXX14_CONSTEXPR ::std::span bytes() const { + return as_span(); + } + MDBX_CXX14_CONSTEXPR ::std::span bytes() { return as_span(); } + MDBX_CXX14_CONSTEXPR ::std::span chars() const { + return as_span(); + } + MDBX_CXX14_CONSTEXPR ::std::span chars() { return as_span(); } +#endif /* __cpp_lib_span >= 202002L */ + #if defined(DOXYGEN) || \ (defined(__cpp_lib_string_view) && __cpp_lib_string_view >= 201606L) /// \brief Create a slice that refers to the same contents as "string_view" @@ -768,7 +830,7 @@ struct LIBMDBX_API_TYPE slice : public ::MDBX_val { #endif /* __cpp_lib_string_view >= 201606L */ template , - class ALLOCATOR = legacy_allocator> + class ALLOCATOR = default_allocator> MDBX_CXX20_CONSTEXPR ::std::basic_string as_string(const ALLOCATOR &allocator = ALLOCATOR()) const { static_assert(sizeof(CHAR) == 1, "Must be single byte characters"); @@ -783,27 +845,27 @@ struct LIBMDBX_API_TYPE slice : public ::MDBX_val { } /// \brief Returns a string with a hexadecimal dump of the slice content. 
- template + template inline string as_hex_string(bool uppercase = false, unsigned wrap_width = 0, const ALLOCATOR &allocator = ALLOCATOR()) const; /// \brief Returns a string with a /// [Base58](https://en.wikipedia.org/wiki/Base58) dump of the slice content. - template + template inline string as_base58_string(unsigned wrap_width = 0, const ALLOCATOR &allocator = ALLOCATOR()) const; /// \brief Returns a string with a /// [Base58](https://en.wikipedia.org/wiki/Base64) dump of the slice content. - template + template inline string as_base64_string(unsigned wrap_width = 0, const ALLOCATOR &allocator = ALLOCATOR()) const; /// \brief Returns a buffer with a hexadecimal dump of the slice content. - template inline buffer encode_hex(bool uppercase = false, unsigned wrap_width = 0, @@ -811,7 +873,7 @@ struct LIBMDBX_API_TYPE slice : public ::MDBX_val { /// \brief Returns a buffer with a /// [Base58](https://en.wikipedia.org/wiki/Base58) dump of the slice content. - template inline buffer encode_base58(unsigned wrap_width = 0, @@ -819,14 +881,14 @@ struct LIBMDBX_API_TYPE slice : public ::MDBX_val { /// \brief Returns a buffer with a /// [Base64](https://en.wikipedia.org/wiki/Base64) dump of the slice content. - template inline buffer encode_base64(unsigned wrap_width = 0, const ALLOCATOR &allocator = ALLOCATOR()) const; /// \brief Decodes hexadecimal dump from the slice content to returned buffer. - template inline buffer hex_decode(bool ignore_spaces = false, @@ -834,7 +896,7 @@ struct LIBMDBX_API_TYPE slice : public ::MDBX_val { /// \brief Decodes [Base58](https://en.wikipedia.org/wiki/Base58) dump /// from the slice content to returned buffer. - template inline buffer base58_decode(bool ignore_spaces = false, @@ -842,7 +904,7 @@ struct LIBMDBX_API_TYPE slice : public ::MDBX_val { /// \brief Decodes [Base64](https://en.wikipedia.org/wiki/Base64) dump /// from the slice content to returned buffer. 
- template inline buffer base64_decode(bool ignore_spaces = false, @@ -1062,20 +1124,40 @@ struct LIBMDBX_API_TYPE slice : public ::MDBX_val { } #ifdef MDBX_U128_TYPE - MDBX_U128_TYPE as_uint128() const; + MDBX_CXX14_CONSTEXPR MDBX_U128_TYPE as_uint128() const { + return as_pod(); + } +#endif /* MDBX_U128_TYPE */ + MDBX_CXX14_CONSTEXPR uint64_t as_uint64() const { return as_pod(); } + MDBX_CXX14_CONSTEXPR uint32_t as_uint32() const { return as_pod(); } + MDBX_CXX14_CONSTEXPR uint16_t as_uint16() const { return as_pod(); } + MDBX_CXX14_CONSTEXPR uint8_t as_uint8() const { return as_pod(); } + +#ifdef MDBX_I128_TYPE + MDBX_CXX14_CONSTEXPR MDBX_I128_TYPE as_int128() const { + return as_pod(); + } +#endif /* MDBX_I128_TYPE */ + MDBX_CXX14_CONSTEXPR int64_t as_int64() const { return as_pod(); } + MDBX_CXX14_CONSTEXPR int32_t as_int32() const { return as_pod(); } + MDBX_CXX14_CONSTEXPR int16_t as_int16() const { return as_pod(); } + MDBX_CXX14_CONSTEXPR int8_t as_int8() const { return as_pod(); } + +#ifdef MDBX_U128_TYPE + MDBX_U128_TYPE as_uint128_adapt() const; #endif /* MDBX_U128_TYPE */ - uint64_t as_uint64() const; - uint32_t as_uint32() const; - uint16_t as_uint16() const; - uint8_t as_uint8() const; + uint64_t as_uint64_adapt() const; + uint32_t as_uint32_adapt() const; + uint16_t as_uint16_adapt() const; + uint8_t as_uint8_adapt() const; #ifdef MDBX_I128_TYPE - MDBX_I128_TYPE as_int128() const; + MDBX_I128_TYPE as_int128_adapt() const; #endif /* MDBX_I128_TYPE */ - int64_t as_int64() const; - int32_t as_int32() const; - int16_t as_int16() const; - int8_t as_int8() const; + int64_t as_int64_adapt() const; + int32_t as_int32_adapt() const; + int16_t as_int16_adapt() const; + int8_t as_int8_adapt() const; protected: MDBX_CXX11_CONSTEXPR slice(size_t invalid_length) noexcept @@ -1259,13 +1341,13 @@ struct LIBMDBX_API to_hex { } /// \brief Returns a string with a hexadecimal dump of a passed slice. 
- template + template string as_string(const ALLOCATOR &allocator = ALLOCATOR()) const { return make_string(*this, allocator); } /// \brief Returns a buffer with a hexadecimal dump of a passed slice. - template buffer as_buffer(const ALLOCATOR &allocator = ALLOCATOR()) const { @@ -1310,14 +1392,14 @@ struct LIBMDBX_API to_base58 { /// \brief Returns a string with a /// [Base58](https://en.wikipedia.org/wiki/Base58) dump of a passed slice. - template + template string as_string(const ALLOCATOR &allocator = ALLOCATOR()) const { return make_string(*this, allocator); } /// \brief Returns a buffer with a /// [Base58](https://en.wikipedia.org/wiki/Base58) dump of a passed slice. - template buffer as_buffer(const ALLOCATOR &allocator = ALLOCATOR()) const { @@ -1364,14 +1446,14 @@ struct LIBMDBX_API to_base64 { /// \brief Returns a string with a /// [Base64](https://en.wikipedia.org/wiki/Base64) dump of a passed slice. - template + template string as_string(const ALLOCATOR &allocator = ALLOCATOR()) const { return make_string(*this, allocator); } /// \brief Returns a buffer with a /// [Base64](https://en.wikipedia.org/wiki/Base64) dump of a passed slice. - template buffer as_buffer(const ALLOCATOR &allocator = ALLOCATOR()) const { @@ -1428,13 +1510,13 @@ struct LIBMDBX_API from_hex { } /// \brief Decodes hexadecimal dump from a passed slice to returned string. - template + template string as_string(const ALLOCATOR &allocator = ALLOCATOR()) const { return make_string(*this, allocator); } /// \brief Decodes hexadecimal dump from a passed slice to returned buffer. - template buffer as_buffer(const ALLOCATOR &allocator = ALLOCATOR()) const { @@ -1474,14 +1556,14 @@ struct LIBMDBX_API from_base58 { /// \brief Decodes [Base58](https://en.wikipedia.org/wiki/Base58) dump from a /// passed slice to returned string. 
- template + template string as_string(const ALLOCATOR &allocator = ALLOCATOR()) const { return make_string(*this, allocator); } /// \brief Decodes [Base58](https://en.wikipedia.org/wiki/Base58) dump from a /// passed slice to returned buffer. - template buffer as_buffer(const ALLOCATOR &allocator = ALLOCATOR()) const { @@ -1523,14 +1605,14 @@ struct LIBMDBX_API from_base64 { /// \brief Decodes [Base64](https://en.wikipedia.org/wiki/Base64) dump from a /// passed slice to returned string. - template + template string as_string(const ALLOCATOR &allocator = ALLOCATOR()) const { return make_string(*this, allocator); } /// \brief Decodes [Base64](https://en.wikipedia.org/wiki/Base64) dump from a /// passed slice to returned buffer. - template buffer as_buffer(const ALLOCATOR &allocator = ALLOCATOR()) const { @@ -1825,7 +1907,6 @@ private: const size_t old_capacity = bin_.capacity(); const size_t new_capacity = bin::advise_capacity(old_capacity, wanna_capacity); - assert(new_capacity >= wanna_capacity); if (MDBX_LIKELY(new_capacity == old_capacity)) MDBX_CXX20_LIKELY { assert(bin_.is_inplace() == @@ -1991,7 +2072,13 @@ private: return *this; } - MDBX_CXX20_CONSTEXPR void clear() { reshape(0, 0, nullptr, 0); } + MDBX_CXX20_CONSTEXPR void *clear() { + return reshape(0, 0, nullptr, 0); + } + MDBX_CXX20_CONSTEXPR void *clear_and_reserve(size_t whole_capacity, + size_t headroom) { + return reshape(whole_capacity, headroom, nullptr, 0); + } MDBX_CXX20_CONSTEXPR void resize(size_t capacity, size_t headroom, slice &content) { content.iov_base = @@ -2334,6 +2421,33 @@ public: return slice_; } +#if defined(DOXYGEN) || (defined(__cpp_lib_span) && __cpp_lib_span >= 202002L) + template + MDBX_CXX14_CONSTEXPR buffer(const ::std::span &span) + : buffer(span.begin(), span.end()) { + static_assert(::std::is_standard_layout::value && + !::std::is_pointer::value, + "Must be a standard layout type!"); + } + + template + MDBX_CXX14_CONSTEXPR ::std::span as_span() const { + return 
slice_.template as_span(); + } + template MDBX_CXX14_CONSTEXPR ::std::span as_span() { + return slice_.template as_span(); + } + + MDBX_CXX14_CONSTEXPR ::std::span bytes() const { + return as_span(); + } + MDBX_CXX14_CONSTEXPR ::std::span bytes() { return as_span(); } + MDBX_CXX14_CONSTEXPR ::std::span chars() const { + return as_span(); + } + MDBX_CXX14_CONSTEXPR ::std::span chars() { return as_span(); } +#endif /* __cpp_lib_span >= 202002L */ + template static buffer wrap(const POD &pod, bool make_reference = false, const allocator_type &allocator = allocator_type()) { @@ -2344,6 +2458,48 @@ public: return slice_.as_pod(); } +#ifdef MDBX_U128_TYPE + MDBX_CXX14_CONSTEXPR MDBX_U128_TYPE as_uint128() const { + return slice().as_uint128(); + } +#endif /* MDBX_U128_TYPE */ + MDBX_CXX14_CONSTEXPR uint64_t as_uint64() const { + return slice().as_uint64(); + } + MDBX_CXX14_CONSTEXPR uint32_t as_uint32() const { + return slice().as_uint32(); + } + MDBX_CXX14_CONSTEXPR uint16_t as_uint16() const { + return slice().as_uint16(); + } + MDBX_CXX14_CONSTEXPR uint8_t as_uint8() const { return slice().as_uint8(); } + +#ifdef MDBX_I128_TYPE + MDBX_CXX14_CONSTEXPR MDBX_I128_TYPE as_int128() const { + return slice().as_int128(); + } +#endif /* MDBX_I128_TYPE */ + MDBX_CXX14_CONSTEXPR int64_t as_int64() const { return slice().as_int64(); } + MDBX_CXX14_CONSTEXPR int32_t as_int32() const { return slice().as_int32(); } + MDBX_CXX14_CONSTEXPR int16_t as_int16() const { return slice().as_int16(); } + MDBX_CXX14_CONSTEXPR int8_t as_int8() const { return slice().as_int8(); } + +#ifdef MDBX_U128_TYPE + MDBX_U128_TYPE as_uint128_adapt() const { return slice().as_uint128_adapt(); } +#endif /* MDBX_U128_TYPE */ + uint64_t as_uint64_adapt() const { return slice().as_uint64_adapt(); } + uint32_t as_uint32_adapt() const { return slice().as_uint32_adapt(); } + uint16_t as_uint16_adapt() const { return slice().as_uint16_adapt(); } + uint8_t as_uint8_adapt() const { return slice().as_uint8_adapt(); 
} + +#ifdef MDBX_I128_TYPE + MDBX_I128_TYPE as_int128_adapt() const { return slice().as_int128_adapt(); } +#endif /* MDBX_I128_TYPE */ + int64_t as_int64_adapt() const { return slice().as_int64_adapt(); } + int32_t as_int32_adapt() const { return slice().as_int32_adapt(); } + int16_t as_int16_adapt() const { return slice().as_int16_adapt(); } + int8_t as_int8_adapt() const { return slice().as_int8_adapt(); } + /// \brief Returns a new buffer with a hexadecimal dump of the slice content. static buffer hex(const ::mdbx::slice &source, bool uppercase = false, unsigned wrap_width = 0, @@ -2652,9 +2808,11 @@ public: } /// \brief Clears the contents and storage. - void clear() noexcept { - slice_.clear(); - silo_.clear(); + void clear() noexcept { slice_.assign(silo_.clear(), size_t(0)); } + + /// \brief Clears the contents and reserve storage. + void clear_and_reserve(size_t whole_capacity, size_t headroom = 0) noexcept { + slice_.assign(silo_.clear_and_reserve(whole_capacity, headroom), size_t(0)); } /// \brief Reduces memory usage by freeing unused storage space. 
@@ -2808,6 +2966,79 @@ public: return append_producer(from_base64(data, ignore_spaces)); } + buffer &append_u8(uint_fast8_t u8) { + if (MDBX_UNLIKELY(tailroom() < 1)) + MDBX_CXX20_UNLIKELY reserve_tailroom(1); + *slice_.byte_ptr() = u8; + slice_.iov_len += 1; + return *this; + } + + buffer &append_byte(uint_fast8_t byte) { return append_u8(byte); } + + buffer &append_u16(uint_fast16_t u16) { + if (MDBX_UNLIKELY(tailroom() < 2)) + MDBX_CXX20_UNLIKELY reserve_tailroom(2); + const auto ptr = slice_.byte_ptr(); + ptr[0] = uint8_t(u16); + ptr[1] = uint8_t(u16 >> 8); + slice_.iov_len += 2; + return *this; + } + + buffer &append_u24(uint_fast32_t u24) { + if (MDBX_UNLIKELY(tailroom() < 3)) + MDBX_CXX20_UNLIKELY reserve_tailroom(3); + const auto ptr = slice_.byte_ptr(); + ptr[0] = uint8_t(u24); + ptr[1] = uint8_t(u24 >> 8); + ptr[2] = uint8_t(u24 >> 16); + slice_.iov_len += 3; + return *this; + } + + buffer &append_u32(uint_fast32_t u32) { + if (MDBX_UNLIKELY(tailroom() < 4)) + MDBX_CXX20_UNLIKELY reserve_tailroom(4); + const auto ptr = slice_.byte_ptr(); + ptr[0] = uint8_t(u32); + ptr[1] = uint8_t(u32 >> 8); + ptr[2] = uint8_t(u32 >> 16); + ptr[3] = uint8_t(u32 >> 24); + slice_.iov_len += 4; + return *this; + } + + buffer &append_u48(uint_fast64_t u48) { + if (MDBX_UNLIKELY(tailroom() < 6)) + MDBX_CXX20_UNLIKELY reserve_tailroom(6); + const auto ptr = slice_.byte_ptr(); + ptr[0] = uint8_t(u48); + ptr[1] = uint8_t(u48 >> 8); + ptr[2] = uint8_t(u48 >> 16); + ptr[3] = uint8_t(u48 >> 24); + ptr[4] = uint8_t(u48 >> 32); + ptr[5] = uint8_t(u48 >> 40); + slice_.iov_len += 6; + return *this; + } + + buffer &append_u64(uint_fast64_t u64) { + if (MDBX_UNLIKELY(tailroom() < 8)) + MDBX_CXX20_UNLIKELY reserve_tailroom(8); + const auto ptr = slice_.byte_ptr(); + ptr[0] = uint8_t(u64); + ptr[1] = uint8_t(u64 >> 8); + ptr[2] = uint8_t(u64 >> 16); + ptr[3] = uint8_t(u64 >> 24); + ptr[4] = uint8_t(u64 >> 32); + ptr[5] = uint8_t(u64 >> 40); + ptr[6] = uint8_t(u64 >> 48); + ptr[7] = 
uint8_t(u64 >> 56); + slice_.iov_len += 8; + return *this; + } + //---------------------------------------------------------------------------- template @@ -2975,22 +3206,54 @@ struct value_result { /// \brief Combines pair of slices for key and value to represent result of /// certain operations. struct pair { + using stl_pair = std::pair; slice key, value; - pair(const slice &key, const slice &value) noexcept + MDBX_CXX11_CONSTEXPR pair(const slice &key, const slice &value) noexcept : key(key), value(value) {} + MDBX_CXX11_CONSTEXPR pair(const stl_pair &couple) noexcept + : key(couple.first), value(couple.second) {} + MDBX_CXX11_CONSTEXPR operator stl_pair() const noexcept { + return stl_pair(key, value); + } pair(const pair &) noexcept = default; pair &operator=(const pair &) noexcept = default; MDBX_CXX14_CONSTEXPR operator bool() const noexcept { assert(bool(key) == bool(value)); return key; } + MDBX_CXX14_CONSTEXPR static pair invalid() noexcept { + return pair(slice::invalid(), slice::invalid()); + } + + /// \brief Three-way fast non-lexicographically length-based comparison. + MDBX_NOTHROW_PURE_FUNCTION static MDBX_CXX14_CONSTEXPR intptr_t + compare_fast(const pair &a, const pair &b) noexcept; + + /// \brief Three-way lexicographically comparison. 
+ MDBX_NOTHROW_PURE_FUNCTION static MDBX_CXX14_CONSTEXPR intptr_t + compare_lexicographically(const pair &a, const pair &b) noexcept; + friend MDBX_CXX14_CONSTEXPR bool operator==(const pair &a, + const pair &b) noexcept; + friend MDBX_CXX14_CONSTEXPR bool operator<(const pair &a, + const pair &b) noexcept; + friend MDBX_CXX14_CONSTEXPR bool operator>(const pair &a, + const pair &b) noexcept; + friend MDBX_CXX14_CONSTEXPR bool operator<=(const pair &a, + const pair &b) noexcept; + friend MDBX_CXX14_CONSTEXPR bool operator>=(const pair &a, + const pair &b) noexcept; + friend MDBX_CXX14_CONSTEXPR bool operator!=(const pair &a, + const pair &b) noexcept; }; /// \brief Combines pair of slices for key and value with boolean flag to /// represent result of certain operations. struct pair_result : public pair { bool done; - pair_result(const slice &key, const slice &value, bool done) noexcept + MDBX_CXX11_CONSTEXPR pair_result() noexcept + : pair(pair::invalid()), done(false) {} + MDBX_CXX11_CONSTEXPR pair_result(const slice &key, const slice &value, + bool done) noexcept : pair(key, value), done(done) {} pair_result(const pair_result &) noexcept = default; pair_result &operator=(const pair_result &) noexcept = default; @@ -3000,6 +3263,92 @@ struct pair_result : public pair { } }; +template +struct buffer_pair_spec { + using buffer_type = buffer; + using allocator_type = typename buffer_type::allocator_type; + using allocator_traits = typename buffer_type::allocator_traits; + using reservation_policy = CAPACITY_POLICY; + using stl_pair = ::std::pair; + buffer_type key, value; + + MDBX_CXX20_CONSTEXPR buffer_pair_spec() noexcept = default; + MDBX_CXX20_CONSTEXPR + buffer_pair_spec(const allocator_type &allocator) noexcept + : key(allocator), value(allocator) {} + + buffer_pair_spec(const buffer_type &key, const buffer_type &value, + const allocator_type &allocator = allocator_type()) + : key(key, allocator), value(value, allocator) {} + buffer_pair_spec(const buffer_type 
&key, const buffer_type &value, + bool make_reference, + const allocator_type &allocator = allocator_type()) + : key(key, make_reference, allocator), + value(value, make_reference, allocator) {} + + buffer_pair_spec(const stl_pair &pair, + const allocator_type &allocator = allocator_type()) + : buffer_pair_spec(pair.first, pair.second, allocator) {} + buffer_pair_spec(const stl_pair &pair, bool make_reference, + const allocator_type &allocator = allocator_type()) + : buffer_pair_spec(pair.first, pair.second, make_reference, allocator) {} + + buffer_pair_spec(const slice &key, const slice &value, + const allocator_type &allocator = allocator_type()) + : key(key, allocator), value(value, allocator) {} + buffer_pair_spec(const slice &key, const slice &value, bool make_reference, + const allocator_type &allocator = allocator_type()) + : key(key, make_reference, allocator), + value(value, make_reference, allocator) {} + + buffer_pair_spec(const pair &pair, + const allocator_type &allocator = allocator_type()) + : buffer_pair_spec(pair.key, pair.value, allocator) {} + buffer_pair_spec(const pair &pair, bool make_reference, + const allocator_type &allocator = allocator_type()) + : buffer_pair_spec(pair.key, pair.value, make_reference, allocator) {} + + buffer_pair_spec(const txn &txn, const slice &key, const slice &value, + const allocator_type &allocator = allocator_type()) + : key(txn, key, allocator), value(txn, value, allocator) {} + buffer_pair_spec(const txn &txn, const pair &pair, + const allocator_type &allocator = allocator_type()) + : buffer_pair_spec(txn, pair.key, pair.value, allocator) {} + + buffer_pair_spec(buffer_type &&key, buffer_type &&value) noexcept( + buffer_type::move_assign_alloc::is_nothrow()) + : key(::std::move(key)), value(::std::move(value)) {} + buffer_pair_spec(buffer_pair_spec &&pair) noexcept( + buffer_type::move_assign_alloc::is_nothrow()) + : buffer_pair_spec(::std::move(pair.key), ::std::move(pair.value)) {} + + /// \brief Checks 
whether data chunk stored inside the buffers both, otherwise + /// at least one of buffers just refers to data located outside. + MDBX_NOTHROW_PURE_FUNCTION MDBX_CXX20_CONSTEXPR bool + is_freestanding() const noexcept { + return key.is_freestanding() && value.is_freestanding(); + } + /// \brief Checks whether one of the buffers just refers to data located + /// outside the buffer, rather than stores it. + MDBX_NOTHROW_PURE_FUNCTION MDBX_CXX20_CONSTEXPR bool + is_reference() const noexcept { + return key.is_reference() || value.is_reference(); + } + /// \brief Makes buffers owning the data. + /// \details If buffer refers to an external data, then makes it the owner + /// of clone by allocating storage and copying the data. + void make_freestanding() { + key.make_freestanding(); + value.make_freestanding(); + } + + operator pair() const noexcept { return pair(key, value); } +}; + +template +using buffer_pair = buffer_pair_spec; + /// end of cxx_data @} //------------------------------------------------------------------------------ @@ -3025,6 +3374,26 @@ enum class key_mode { ///< \note Not yet implemented and PRs are welcome. }; +MDBX_CXX01_CONSTEXPR_ENUM bool is_usual(key_mode mode) noexcept { + return (MDBX_db_flags_t(mode) & (MDBX_REVERSEKEY | MDBX_INTEGERKEY)) == 0; +} + +MDBX_CXX01_CONSTEXPR_ENUM bool is_ordinal(key_mode mode) noexcept { + return (MDBX_db_flags_t(mode) & MDBX_INTEGERKEY) != 0; +} + +MDBX_CXX01_CONSTEXPR_ENUM bool is_samelength(key_mode mode) noexcept { + return (MDBX_db_flags_t(mode) & MDBX_INTEGERKEY) != 0; +} + +MDBX_CXX01_CONSTEXPR_ENUM bool is_reverse(key_mode mode) noexcept { + return (MDBX_db_flags_t(mode) & MDBX_REVERSEKEY) != 0; +} + +MDBX_CXX01_CONSTEXPR_ENUM bool is_msgpack(key_mode mode) noexcept { + return mode == key_mode::msgpack; +} + /// \brief Kind of the values and sorted multi-values with corresponding /// comparison. 
enum class value_mode { @@ -3077,23 +3446,49 @@ enum class value_mode { ///< end of the keys to the beginning. In terms of keys, ///< they are not unique, i.e. has duplicates which are ///< sorted by associated data values. - msgpack = -1 ///< A more than one data value could be associated with each - ///< key. Values are in [MessagePack](https://msgpack.org/) - ///< format with appropriate comparison. Internally each key is - ///< stored once, and the corresponding data values are sorted. - ///< In terms of keys, they are not unique, i.e. has duplicates - ///< which are sorted by associated data values. - ///< \note Not yet implemented and PRs are welcome. #else multi_reverse = uint32_t(MDBX_DUPSORT) | uint32_t(MDBX_REVERSEDUP), multi_samelength = uint32_t(MDBX_DUPSORT) | uint32_t(MDBX_DUPFIXED), multi_ordinal = uint32_t(MDBX_DUPSORT) | uint32_t(MDBX_DUPFIXED) | uint32_t(MDBX_INTEGERDUP), multi_reverse_samelength = uint32_t(MDBX_DUPSORT) | - uint32_t(MDBX_REVERSEDUP) | uint32_t(MDBX_DUPFIXED) + uint32_t(MDBX_REVERSEDUP) | + uint32_t(MDBX_DUPFIXED), #endif + msgpack = -1 ///< A more than one data value could be associated with each + ///< key. Values are in [MessagePack](https://msgpack.org/) + ///< format with appropriate comparison. Internally each key is + ///< stored once, and the corresponding data values are sorted. + ///< In terms of keys, they are not unique, i.e. has duplicates + ///< which are sorted by associated data values. + ///< \note Not yet implemented and PRs are welcome. 
}; +MDBX_CXX01_CONSTEXPR_ENUM bool is_usual(value_mode mode) noexcept { + return (MDBX_db_flags_t(mode) & (MDBX_DUPSORT | MDBX_INTEGERDUP | + MDBX_DUPFIXED | MDBX_REVERSEDUP)) == 0; +} + +MDBX_CXX01_CONSTEXPR_ENUM bool is_multi(value_mode mode) noexcept { + return (MDBX_db_flags_t(mode) & MDBX_DUPSORT) != 0; +} + +MDBX_CXX01_CONSTEXPR_ENUM bool is_ordinal(value_mode mode) noexcept { + return (MDBX_db_flags_t(mode) & MDBX_INTEGERDUP) != 0; +} + +MDBX_CXX01_CONSTEXPR_ENUM bool is_samelength(value_mode mode) noexcept { + return (MDBX_db_flags_t(mode) & MDBX_DUPFIXED) != 0; +} + +MDBX_CXX01_CONSTEXPR_ENUM bool is_reverse(value_mode mode) noexcept { + return (MDBX_db_flags_t(mode) & MDBX_REVERSEDUP) != 0; +} + +MDBX_CXX01_CONSTEXPR_ENUM bool is_msgpack(value_mode mode) noexcept { + return mode == value_mode::msgpack; +} + /// \brief A handle for an individual database (key-value spaces) in the /// environment. /// \see txn::open_map() \see txn::create_map() @@ -3119,18 +3514,8 @@ struct LIBMDBX_API_TYPE map_handle { map_handle::state state) noexcept; info(const info &) noexcept = default; info &operator=(const info &) noexcept = default; -#if CONSTEXPR_ENUM_FLAGS_OPERATIONS - MDBX_CXX11_CONSTEXPR -#else - inline -#endif - ::mdbx::key_mode key_mode() const noexcept; -#if CONSTEXPR_ENUM_FLAGS_OPERATIONS - MDBX_CXX11_CONSTEXPR -#else - inline -#endif - ::mdbx::value_mode value_mode() const noexcept; + MDBX_CXX11_CONSTEXPR_ENUM mdbx::key_mode key_mode() const noexcept; + MDBX_CXX11_CONSTEXPR_ENUM mdbx::value_mode value_mode() const noexcept; }; }; @@ -3294,8 +3679,10 @@ public: /// \brief Operate options. struct LIBMDBX_API_TYPE operate_options { - /// \copydoc MDBX_NOTLS - bool orphan_read_transactions{false}; + /// \copydoc MDBX_NOSTICKYTHREADS + bool no_sticky_threads{false}; + /// \brief Разрешает вложенные транзакции ценой отключения + /// \ref MDBX_WRITEMAP и увеличением накладных расходов. 
bool nested_write_transactions{false}; /// \copydoc MDBX_EXCLUSIVE bool exclusive{false}; @@ -3981,6 +4368,15 @@ public: /// \brief Opens cursor for specified key-value map handle. inline cursor_managed open_cursor(map_handle map) const; + /// \brief Unbind or close all cursors. + inline size_t release_all_cursors(bool unbind) const; + + /// \brief Close all cursors. + inline size_t close_all_cursors() const { return release_all_cursors(false); } + + /// \brief Unbind all cursors. + inline size_t unbind_all_cursors() const { return release_all_cursors(true); } + /// \brief Open existing key-value map. inline map_handle open_map( const char *name, @@ -3992,6 +4388,11 @@ public: const ::mdbx::key_mode key_mode = ::mdbx::key_mode::usual, const ::mdbx::value_mode value_mode = ::mdbx::value_mode::single) const; + /// \brief Open existing key-value map. + inline map_handle open_map_accede(const char *name) const; + /// \brief Open existing key-value map. + inline map_handle open_map_accede(const ::std::string &name) const; + /// \brief Create new or open existing key-value map. inline map_handle create_map(const char *name, @@ -4024,6 +4425,54 @@ public: inline bool clear_map(const ::std::string &name, bool throw_if_absent = false); + /// \brief Переименовывает таблицу ключ-значение. + inline void rename_map(map_handle map, const char *new_name); + /// \brief Переименовывает таблицу ключ-значение. + inline void rename_map(map_handle map, const ::std::string &new_name); + /// \brief Переименовывает таблицу ключ-значение. + /// \return `True` если таблица существует и была переименована, либо + /// `false` в случае отсутствия исходной таблицы. + bool rename_map(const char *old_name, const char *new_name, + bool throw_if_absent = false); + /// \brief Переименовывает таблицу ключ-значение. + /// \return `True` если таблица существует и была переименована, либо + /// `false` в случае отсутствия исходной таблицы. 
+ bool rename_map(const ::std::string &old_name, const ::std::string &new_name, + bool throw_if_absent = false); + +#if defined(DOXYGEN) || \ + (defined(__cpp_lib_string_view) && __cpp_lib_string_view >= 201606L) + + /// \brief Open existing key-value map. + inline map_handle open_map( + const ::std::string_view &name, + const ::mdbx::key_mode key_mode = ::mdbx::key_mode::usual, + const ::mdbx::value_mode value_mode = ::mdbx::value_mode::single) const; + /// \brief Open existing key-value map. + inline map_handle open_map_accede(const ::std::string_view &name) const; + /// \brief Create new or open existing key-value map. + inline map_handle + create_map(const ::std::string_view &name, + const ::mdbx::key_mode key_mode = ::mdbx::key_mode::usual, + const ::mdbx::value_mode value_mode = ::mdbx::value_mode::single); + /// \brief Drop key-value map. + /// \return `True` if the key-value map existed and was deleted, either + /// `false` if the key-value map did not exist and there is nothing to delete. + bool drop_map(const ::std::string_view &name, bool throw_if_absent = false); + /// \return `True` if the key-value map existed and was cleared, either + /// `false` if the key-value map did not exist and there is nothing to clear. + bool clear_map(const ::std::string_view &name, bool throw_if_absent = false); + /// \brief Переименовывает таблицу ключ-значение. + inline void rename_map(map_handle map, const ::std::string_view &new_name); + /// \brief Переименовывает таблицу ключ-значение. + /// \return `True` если таблица существует и была переименована, либо + /// `false` в случае отсутствия исходной таблицы. + bool rename_map(const ::std::string_view &old_name, + const ::std::string_view &new_name, + bool throw_if_absent = false); + +#endif /* __cpp_lib_string_view >= 201606L */ + using map_stat = ::MDBX_stat; /// \brief Returns statistics for a sub-database. 
inline map_stat get_map_stat(map_handle map) const; @@ -4226,6 +4675,10 @@ public: /// \brief Commit all the operations of a transaction into the database. void commit(); + /// \brief Commit all the operations of a transaction into the database + /// and then start read transaction. + void commit_embark_read(); + using commit_latency = MDBX_commit_latency; /// \brief Commit all the operations of a transaction into the database @@ -4273,6 +4726,34 @@ public: friend MDBX_CXX11_CONSTEXPR bool operator!=(const cursor &a, const cursor &b) noexcept; + friend inline int compare_position_nothrow(const cursor &left, + const cursor &right, + bool ignore_nested) noexcept; + friend inline int compare_position(const cursor &left, const cursor &right, + bool ignore_nested); + + bool is_before_than(const cursor &other, bool ignore_nested = false) const { + return compare_position(*this, other, ignore_nested) < 0; + } + + bool is_same_or_before_than(const cursor &other, + bool ignore_nested = false) const { + return compare_position(*this, other, ignore_nested) <= 0; + } + + bool is_same_position(const cursor &other, bool ignore_nested = false) const { + return compare_position(*this, other, ignore_nested) == 0; + } + + bool is_after_than(const cursor &other, bool ignore_nested = false) const { + return compare_position(*this, other, ignore_nested) > 0; + } + + bool is_same_or_after_than(const cursor &other, + bool ignore_nested = false) const { + return compare_position(*this, other, ignore_nested) >= 0; + } + /// \brief Returns the application context associated with the cursor. inline void *get_context() const noexcept; @@ -4296,9 +4777,33 @@ public: multi_find_pair = MDBX_GET_BOTH, multi_exactkey_lowerboundvalue = MDBX_GET_BOTH_RANGE, - find_key = MDBX_SET, + seek_key = MDBX_SET, key_exact = MDBX_SET_KEY, - key_lowerbound = MDBX_SET_RANGE + key_lowerbound = MDBX_SET_RANGE, + + /* Doubtless cursor positioning at a specified key. 
*/ + key_lesser_than = MDBX_TO_KEY_LESSER_THAN, + key_lesser_or_equal = MDBX_TO_KEY_LESSER_OR_EQUAL, + key_equal = MDBX_TO_KEY_EQUAL, + key_greater_or_equal = MDBX_TO_KEY_GREATER_OR_EQUAL, + key_greater_than = MDBX_TO_KEY_GREATER_THAN, + + /* Doubtless cursor positioning at a specified key-value pair + * for dupsort/multi-value hives. */ + multi_exactkey_value_lesser_than = MDBX_TO_EXACT_KEY_VALUE_LESSER_THAN, + multi_exactkey_value_lesser_or_equal = + MDBX_TO_EXACT_KEY_VALUE_LESSER_OR_EQUAL, + multi_exactkey_value_equal = MDBX_TO_EXACT_KEY_VALUE_EQUAL, + multi_exactkey_value_greater_or_equal = + MDBX_TO_EXACT_KEY_VALUE_GREATER_OR_EQUAL, + multi_exactkey_value_greater = MDBX_TO_EXACT_KEY_VALUE_GREATER_THAN, + + pair_lesser_than = MDBX_TO_PAIR_LESSER_THAN, + pair_lesser_or_equal = MDBX_TO_PAIR_LESSER_OR_EQUAL, + pair_equal = MDBX_TO_PAIR_EQUAL, + pair_exact = pair_equal, + pair_greater_or_equal = MDBX_TO_PAIR_GREATER_OR_EQUAL, + pair_greater_than = MDBX_TO_PAIR_GREATER_THAN, }; struct move_result : public pair_result { @@ -4332,49 +4837,243 @@ public: }; protected: + /* fake const, i.e. for some move/get operations */ inline bool move(move_operation operation, MDBX_val *key, MDBX_val *value, - bool throw_notfound) const - /* fake const, i.e. 
for some operations */; + bool throw_notfound) const; + inline ptrdiff_t estimate(move_operation operation, MDBX_val *key, MDBX_val *value) const; public: - inline move_result move(move_operation operation, bool throw_notfound); - inline move_result to_first(bool throw_notfound = true); - inline move_result to_previous(bool throw_notfound = true); - inline move_result to_previous_last_multi(bool throw_notfound = true); - inline move_result to_current_first_multi(bool throw_notfound = true); - inline move_result to_current_prev_multi(bool throw_notfound = true); - inline move_result current(bool throw_notfound = true) const; - inline move_result to_current_next_multi(bool throw_notfound = true); - inline move_result to_current_last_multi(bool throw_notfound = true); - inline move_result to_next_first_multi(bool throw_notfound = true); - inline move_result to_next(bool throw_notfound = true); - inline move_result to_last(bool throw_notfound = true); - - inline move_result move(move_operation operation, const slice &key, - bool throw_notfound); + template + bool scan(CALLABLE_PREDICATE predicate, move_operation start = first, + move_operation turn = next) { + struct wrapper : public exception_thunk { + static int probe(void *context, MDBX_val *key, MDBX_val *value, + void *arg) noexcept { + auto thunk = static_cast(context); + assert(thunk->is_clean()); + auto &predicate = *static_cast(arg); + try { + return predicate(pair(*key, *value)) ? MDBX_RESULT_TRUE + : MDBX_RESULT_FALSE; + } catch (... /* capture any exception to rethrow it over C code */) { + thunk->capture(); + return MDBX_RESULT_TRUE; + } + } + } thunk; + return error::boolean_or_throw( + ::mdbx_cursor_scan(handle_, wrapper::probe, &thunk, + MDBX_cursor_op(start), MDBX_cursor_op(turn), + &predicate), + thunk); + } + + template + bool fullscan(CALLABLE_PREDICATE predicate, bool backward = false) { + return scan(std::move(predicate), backward ? last : first, + backward ? 
previous : next); + } + + template + bool scan_from(CALLABLE_PREDICATE predicate, slice &from, + move_operation start = key_greater_or_equal, + move_operation turn = next) { + struct wrapper : public exception_thunk { + static int probe(void *context, MDBX_val *key, MDBX_val *value, + void *arg) noexcept { + auto thunk = static_cast(context); + assert(thunk->is_clean()); + auto &predicate = *static_cast(arg); + try { + return predicate(pair(*key, *value)) ? MDBX_RESULT_TRUE + : MDBX_RESULT_FALSE; + } catch (... /* capture any exception to rethrow it over C code */) { + thunk->capture(); + return MDBX_RESULT_TRUE; + } + } + } thunk; + return error::boolean_or_throw( + ::mdbx_cursor_scan_from(handle_, wrapper::probe, &thunk, + MDBX_cursor_op(start), &from, nullptr, + MDBX_cursor_op(turn), &predicate), + thunk); + } + + template + bool scan_from(CALLABLE_PREDICATE predicate, pair &from, + move_operation start = pair_greater_or_equal, + move_operation turn = next) { + struct wrapper : public exception_thunk { + static int probe(void *context, MDBX_val *key, MDBX_val *value, + void *arg) noexcept { + auto thunk = static_cast(context); + assert(thunk->is_clean()); + auto &predicate = *static_cast(arg); + try { + return predicate(pair(*key, *value)) ? MDBX_RESULT_TRUE + : MDBX_RESULT_FALSE; + } catch (... 
/* capture any exception to rethrow it over C code */) { + thunk->capture(); + return MDBX_RESULT_TRUE; + } + } + } thunk; + return error::boolean_or_throw( + ::mdbx_cursor_scan_from(handle_, wrapper::probe, &thunk, + MDBX_cursor_op(start), &from.key, &from.value, + MDBX_cursor_op(turn), &predicate), + thunk); + } + + move_result move(move_operation operation, bool throw_notfound) { + return move_result(*this, operation, throw_notfound); + } + move_result move(move_operation operation, const slice &key, + bool throw_notfound) { + return move_result(*this, operation, key, slice::invalid(), throw_notfound); + } + move_result move(move_operation operation, const slice &key, + const slice &value, bool throw_notfound) { + return move_result(*this, operation, key, value, throw_notfound); + } + bool move(move_operation operation, slice &key, slice &value, + bool throw_notfound) { + return move(operation, &key, &value, throw_notfound); + } + + move_result to_first(bool throw_notfound = true) { + return move(first, throw_notfound); + } + move_result to_previous(bool throw_notfound = true) { + return move(previous, throw_notfound); + } + move_result to_previous_last_multi(bool throw_notfound = true) { + return move(multi_prevkey_lastvalue, throw_notfound); + } + move_result to_current_first_multi(bool throw_notfound = true) { + return move(multi_currentkey_firstvalue, throw_notfound); + } + move_result to_current_prev_multi(bool throw_notfound = true) { + return move(multi_currentkey_prevvalue, throw_notfound); + } + move_result current(bool throw_notfound = true) const { + return move_result(*this, throw_notfound); + } + move_result to_current_next_multi(bool throw_notfound = true) { + return move(multi_currentkey_nextvalue, throw_notfound); + } + move_result to_current_last_multi(bool throw_notfound = true) { + return move(multi_currentkey_lastvalue, throw_notfound); + } + move_result to_next_first_multi(bool throw_notfound = true) { + return move(multi_nextkey_firstvalue, 
throw_notfound); + } + move_result to_next(bool throw_notfound = true) { + return move(next, throw_notfound); + } + move_result to_last(bool throw_notfound = true) { + return move(last, throw_notfound); + } + + move_result to_key_lesser_than(const slice &key, bool throw_notfound = true) { + return move(key_lesser_than, key, throw_notfound); + } + move_result to_key_lesser_or_equal(const slice &key, + bool throw_notfound = true) { + return move(key_lesser_or_equal, key, throw_notfound); + } + move_result to_key_equal(const slice &key, bool throw_notfound = true) { + return move(key_equal, key, throw_notfound); + } + move_result to_key_exact(const slice &key, bool throw_notfound = true) { + return move(key_exact, key, throw_notfound); + } + move_result to_key_greater_or_equal(const slice &key, + bool throw_notfound = true) { + return move(key_greater_or_equal, key, throw_notfound); + } + move_result to_key_greater_than(const slice &key, + bool throw_notfound = true) { + return move(key_greater_than, key, throw_notfound); + } + + move_result to_exact_key_value_lesser_than(const slice &key, + const slice &value, + bool throw_notfound = true) { + return move(multi_exactkey_value_lesser_than, key, value, throw_notfound); + } + move_result to_exact_key_value_lesser_or_equal(const slice &key, + const slice &value, + bool throw_notfound = true) { + return move(multi_exactkey_value_lesser_or_equal, key, value, + throw_notfound); + } + move_result to_exact_key_value_equal(const slice &key, const slice &value, + bool throw_notfound = true) { + return move(multi_exactkey_value_equal, key, value, throw_notfound); + } + move_result to_exact_key_value_greater_or_equal(const slice &key, + const slice &value, + bool throw_notfound = true) { + return move(multi_exactkey_value_greater_or_equal, key, value, + throw_notfound); + } + move_result to_exact_key_value_greater_than(const slice &key, + const slice &value, + bool throw_notfound = true) { + return 
move(multi_exactkey_value_greater, key, value, throw_notfound); + } + + move_result to_pair_lesser_than(const slice &key, const slice &value, + bool throw_notfound = true) { + return move(pair_lesser_than, key, value, throw_notfound); + } + move_result to_pair_lesser_or_equal(const slice &key, const slice &value, + bool throw_notfound = true) { + return move(pair_lesser_or_equal, key, value, throw_notfound); + } + move_result to_pair_equal(const slice &key, const slice &value, + bool throw_notfound = true) { + return move(pair_equal, key, value, throw_notfound); + } + move_result to_pair_exact(const slice &key, const slice &value, + bool throw_notfound = true) { + return move(pair_exact, key, value, throw_notfound); + } + move_result to_pair_greater_or_equal(const slice &key, const slice &value, + bool throw_notfound = true) { + return move(pair_greater_or_equal, key, value, throw_notfound); + } + move_result to_pair_greater_than(const slice &key, const slice &value, + bool throw_notfound = true) { + return move(pair_greater_than, key, value, throw_notfound); + } + + inline bool seek(const slice &key); inline move_result find(const slice &key, bool throw_notfound = true); - inline move_result lower_bound(const slice &key, bool throw_notfound = true); + inline move_result lower_bound(const slice &key, bool throw_notfound = false); + inline move_result upper_bound(const slice &key, bool throw_notfound = false); + + /// \brief Return count of duplicates for current key. 
+ inline size_t count_multivalue() const; - inline move_result move(move_operation operation, const slice &key, - const slice &value, bool throw_notfound); inline move_result find_multivalue(const slice &key, const slice &value, bool throw_notfound = true); inline move_result lower_bound_multivalue(const slice &key, const slice &value, bool throw_notfound = false); - - inline bool seek(const slice &key); - inline bool move(move_operation operation, slice &key, slice &value, - bool throw_notfound); - - /// \brief Return count of duplicates for current key. - inline size_t count_multivalue() const; + inline move_result upper_bound_multivalue(const slice &key, + const slice &value, + bool throw_notfound = false); inline bool eof() const; inline bool on_first() const; inline bool on_last() const; + inline bool on_first_multival() const; + inline bool on_last_multival() const; inline estimate_result estimate(const slice &key, const slice &value) const; inline estimate_result estimate(const slice &key) const; inline estimate_result estimate(move_operation operation) const; @@ -4390,6 +5089,9 @@ public: /// map handle. inline void bind(const ::mdbx::txn &txn, ::mdbx::map_handle map_handle); + /// \brief Unbind cursor from a transaction. + inline void unbind(); + /// \brief Returns the cursor's transaction. 
inline ::mdbx::txn txn() const; inline map_handle map() const; @@ -4553,8 +5255,8 @@ static MDBX_CXX20_CONSTEXPR int memcmp(const void *a, const void *b, __cpp_lib_is_constant_evaluated >= 201811L if (::std::is_constant_evaluated()) { for (size_t i = 0; i < bytes; ++i) { - const int diff = - static_cast(a)[i] - static_cast(b)[i]; + const int diff = int(static_cast(a)[i]) - + int(static_cast(b)[i]); if (diff) return diff; } @@ -4653,7 +5355,8 @@ inline void error::success_or_throw() const { inline void error::success_or_throw(const exception_thunk &thunk) const { assert(thunk.is_clean() || code() != MDBX_SUCCESS); if (MDBX_UNLIKELY(!is_success())) { - MDBX_CXX20_UNLIKELY if (!thunk.is_clean()) thunk.rethrow_captured(); + MDBX_CXX20_UNLIKELY if (MDBX_UNLIKELY(!thunk.is_clean())) + thunk.rethrow_captured(); else throw_exception(); } } @@ -4714,6 +5417,13 @@ inline void error::success_or_panic(int error_code, const char *context_where, rc.success_or_panic(context_where, func_who); } +inline bool error::boolean_or_throw(int error_code, + const exception_thunk &thunk) { + if (MDBX_UNLIKELY(!thunk.is_clean())) + MDBX_CXX20_UNLIKELY thunk.rethrow_captured(); + return boolean_or_throw(error_code); +} + //------------------------------------------------------------------------------ MDBX_CXX11_CONSTEXPR slice::slice() noexcept : ::MDBX_val({nullptr, 0}) {} @@ -5088,6 +5798,56 @@ slice::is_base64(bool ignore_spaces) const noexcept { //------------------------------------------------------------------------------ +MDBX_CXX14_CONSTEXPR intptr_t pair::compare_fast(const pair &a, + const pair &b) noexcept { + const auto diff = slice::compare_fast(a.key, b.key); + return diff ? diff : slice::compare_fast(a.value, b.value); +} + +MDBX_CXX14_CONSTEXPR intptr_t +pair::compare_lexicographically(const pair &a, const pair &b) noexcept { + const auto diff = slice::compare_lexicographically(a.key, b.key); + return diff ? 
diff : slice::compare_lexicographically(a.value, b.value); +} + +MDBX_NOTHROW_PURE_FUNCTION MDBX_CXX14_CONSTEXPR bool +operator==(const pair &a, const pair &b) noexcept { + return a.key.length() == b.key.length() && + a.value.length() == b.value.length() && + memcmp(a.key.data(), b.key.data(), a.key.length()) == 0 && + memcmp(a.value.data(), b.value.data(), a.value.length()) == 0; +} + +MDBX_NOTHROW_PURE_FUNCTION MDBX_CXX14_CONSTEXPR bool +operator<(const pair &a, const pair &b) noexcept { + return pair::compare_lexicographically(a, b) < 0; +} + +MDBX_NOTHROW_PURE_FUNCTION MDBX_CXX14_CONSTEXPR bool +operator>(const pair &a, const pair &b) noexcept { + return pair::compare_lexicographically(a, b) > 0; +} + +MDBX_NOTHROW_PURE_FUNCTION MDBX_CXX14_CONSTEXPR bool +operator<=(const pair &a, const pair &b) noexcept { + return pair::compare_lexicographically(a, b) <= 0; +} + +MDBX_NOTHROW_PURE_FUNCTION MDBX_CXX14_CONSTEXPR bool +operator>=(const pair &a, const pair &b) noexcept { + return pair::compare_lexicographically(a, b) >= 0; +} + +MDBX_NOTHROW_PURE_FUNCTION MDBX_CXX14_CONSTEXPR bool +operator!=(const pair &a, const pair &b) noexcept { + return a.key.length() != b.key.length() || + a.value.length() != b.value.length() || + memcmp(a.key.data(), b.key.data(), a.key.length()) != 0 || + memcmp(a.value.data(), b.value.data(), a.value.length()) != 0; +} + +//------------------------------------------------------------------------------ + template inline buffer::buffer( const txn &txn, const struct slice &src, const allocator_type &allocator) @@ -5099,17 +5859,13 @@ MDBX_CXX11_CONSTEXPR map_handle::info::info(map_handle::flags flags, map_handle::state state) noexcept : flags(flags), state(state) {} -#if CONSTEXPR_ENUM_FLAGS_OPERATIONS -MDBX_CXX11_CONSTEXPR -#endif -::mdbx::key_mode map_handle::info::key_mode() const noexcept { +MDBX_CXX11_CONSTEXPR_ENUM mdbx::key_mode +map_handle::info::key_mode() const noexcept { return ::mdbx::key_mode(flags & (MDBX_REVERSEKEY | 
MDBX_INTEGERKEY)); } -#if CONSTEXPR_ENUM_FLAGS_OPERATIONS -MDBX_CXX11_CONSTEXPR -#endif -::mdbx::value_mode map_handle::info::value_mode() const noexcept { +MDBX_CXX11_CONSTEXPR_ENUM mdbx::value_mode +map_handle::info::value_mode() const noexcept { return ::mdbx::value_mode(flags & (MDBX_DUPSORT | MDBX_REVERSEDUP | MDBX_DUPFIXED | MDBX_INTEGERDUP)); } @@ -5640,6 +6396,13 @@ inline cursor_managed txn::open_cursor(map_handle map) const { return cursor_managed(ptr); } +inline size_t txn::release_all_cursors(bool unbind) const { + int err = ::mdbx_txn_release_all_cursors(handle_, unbind); + if (MDBX_UNLIKELY(err < 0)) + MDBX_CXX20_UNLIKELY error::throw_exception(err); + return size_t(err); +} + inline ::mdbx::map_handle txn::open_map(const char *name, const ::mdbx::key_mode key_mode, const ::mdbx::value_mode value_mode) const { @@ -5651,10 +6414,12 @@ txn::open_map(const char *name, const ::mdbx::key_mode key_mode, return map; } -inline ::mdbx::map_handle -txn::open_map(const ::std::string &name, const ::mdbx::key_mode key_mode, - const ::mdbx::value_mode value_mode) const { - return open_map(name.c_str(), key_mode, value_mode); +inline ::mdbx::map_handle txn::open_map_accede(const char *name) const { + ::mdbx::map_handle map; + error::success_or_throw( + ::mdbx_dbi_open(handle_, name, MDBX_DB_ACCEDE, &map.dbi)); + assert(map.dbi != 0); + return map; } inline ::mdbx::map_handle txn::create_map(const char *name, @@ -5669,28 +6434,121 @@ inline ::mdbx::map_handle txn::create_map(const char *name, return map; } -inline ::mdbx::map_handle txn::create_map(const ::std::string &name, +inline void txn::drop_map(map_handle map) { + error::success_or_throw(::mdbx_drop(handle_, map.dbi, true)); +} + +inline void txn::clear_map(map_handle map) { + error::success_or_throw(::mdbx_drop(handle_, map.dbi, false)); +} + +inline void txn::rename_map(map_handle map, const char *new_name) { + error::success_or_throw(::mdbx_dbi_rename(handle_, map, new_name)); +} + +#if defined(DOXYGEN) || 
\ + (defined(__cpp_lib_string_view) && __cpp_lib_string_view >= 201606L) + +inline ::mdbx::map_handle +txn::open_map(const ::std::string_view &name, const ::mdbx::key_mode key_mode, + const ::mdbx::value_mode value_mode) const { + ::mdbx::map_handle map; + error::success_or_throw(::mdbx_dbi_open2( + handle_, ::mdbx::slice(name), + MDBX_db_flags_t(key_mode) | MDBX_db_flags_t(value_mode), &map.dbi)); + assert(map.dbi != 0); + return map; +} + +inline ::mdbx::map_handle +txn::open_map_accede(const ::std::string_view &name) const { + ::mdbx::map_handle map; + error::success_or_throw( + ::mdbx_dbi_open2(handle_, ::mdbx::slice(name), MDBX_DB_ACCEDE, &map.dbi)); + assert(map.dbi != 0); + return map; +} + +inline ::mdbx::map_handle txn::create_map(const ::std::string_view &name, const ::mdbx::key_mode key_mode, const ::mdbx::value_mode value_mode) { - return create_map(name.c_str(), key_mode, value_mode); + ::mdbx::map_handle map; + error::success_or_throw(::mdbx_dbi_open2( + handle_, ::mdbx::slice(name), + MDBX_CREATE | MDBX_db_flags_t(key_mode) | MDBX_db_flags_t(value_mode), + &map.dbi)); + assert(map.dbi != 0); + return map; } -inline void txn::drop_map(map_handle map) { - error::success_or_throw(::mdbx_drop(handle_, map.dbi, true)); +inline void txn::rename_map(map_handle map, + const ::std::string_view &new_name) { + error::success_or_throw( + ::mdbx_dbi_rename2(handle_, map, ::mdbx::slice(new_name))); +} + +inline ::mdbx::map_handle +txn::open_map(const ::std::string &name, const ::mdbx::key_mode key_mode, + const ::mdbx::value_mode value_mode) const { + return open_map(::std::string_view(name), key_mode, value_mode); +} + +inline ::mdbx::map_handle +txn::open_map_accede(const ::std::string &name) const { + return open_map_accede(::std::string_view(name)); +} + +inline ::mdbx::map_handle txn::create_map(const ::std::string &name, + const ::mdbx::key_mode key_mode, + const ::mdbx::value_mode value_mode) { + return create_map(::std::string_view(name), key_mode, 
value_mode); } inline bool txn::drop_map(const ::std::string &name, bool throw_if_absent) { - return drop_map(name.c_str(), throw_if_absent); + return drop_map(::std::string_view(name), throw_if_absent); } -inline void txn::clear_map(map_handle map) { - error::success_or_throw(::mdbx_drop(handle_, map.dbi, false)); +inline bool txn::clear_map(const ::std::string &name, bool throw_if_absent) { + return clear_map(::std::string_view(name), throw_if_absent); +} + +inline void txn::rename_map(map_handle map, const ::std::string &new_name) { + return rename_map(map, ::std::string_view(new_name)); +} + +#else + +inline ::mdbx::map_handle +txn::open_map(const ::std::string &name, const ::mdbx::key_mode key_mode, + const ::mdbx::value_mode value_mode) const { + return open_map(name.c_str(), key_mode, value_mode); +} + +inline ::mdbx::map_handle +txn::open_map_accede(const ::std::string &name) const { + return open_map_accede(name.c_str()); +} + +inline ::mdbx::map_handle txn::create_map(const ::std::string &name, + const ::mdbx::key_mode key_mode, + const ::mdbx::value_mode value_mode) { + return create_map(name.c_str(), key_mode, value_mode); +} + +inline bool txn::drop_map(const ::std::string &name, bool throw_if_absent) { + return drop_map(name.c_str(), throw_if_absent); } inline bool txn::clear_map(const ::std::string &name, bool throw_if_absent) { return clear_map(name.c_str(), throw_if_absent); } +inline void txn::rename_map(map_handle map, const ::std::string &new_name) { + return rename_map(map, new_name.c_str()); +} + +#endif /* __cpp_lib_string_view >= 201606L */ + inline txn::map_stat txn::get_map_stat(map_handle map) const { txn::map_stat r; error::success_or_throw(::mdbx_dbi_stat(handle_, map.dbi, &r, sizeof(r))); @@ -6119,9 +6977,24 @@ MDBX_CXX11_CONSTEXPR bool operator!=(const cursor &a, return a.handle_ != b.handle_; } +inline int compare_position_nothrow(const cursor &left, const cursor &right, + bool ignore_nested = false) noexcept { + return 
mdbx_cursor_compare(left.handle_, right.handle_, ignore_nested); +} + +inline int compare_position(const cursor &left, const cursor &right, + bool ignore_nested = false) { + const auto diff = compare_position_nothrow(left, right, ignore_nested); + assert(compare_position_nothrow(right, left, ignore_nested) == -diff); + if (MDBX_LIKELY(int16_t(diff) == diff)) + MDBX_CXX20_LIKELY return int(diff); + else + throw_incomparable_cursors(); +} + inline cursor::move_result::move_result(const cursor &cursor, bool throw_notfound) - : pair_result(slice(), slice(), false) { + : pair_result() { done = cursor.move(get_current, &this->key, &this->value, throw_notfound); } @@ -6140,6 +7013,8 @@ inline bool cursor::move(move_operation operation, MDBX_val *key, switch (err) { case MDBX_SUCCESS: MDBX_CXX20_LIKELY return true; + case MDBX_RESULT_TRUE: + return false; case MDBX_NOTFOUND: if (!throw_notfound) return false; @@ -6171,60 +7046,6 @@ inline ptrdiff_t estimate(const cursor &from, const cursor &to) { return result; } -inline cursor::move_result cursor::move(move_operation operation, - bool throw_notfound) { - return move_result(*this, operation, throw_notfound); -} - -inline cursor::move_result cursor::to_first(bool throw_notfound) { - return move(first, throw_notfound); -} - -inline cursor::move_result cursor::to_previous(bool throw_notfound) { - return move(previous, throw_notfound); -} - -inline cursor::move_result cursor::to_previous_last_multi(bool throw_notfound) { - return move(multi_prevkey_lastvalue, throw_notfound); -} - -inline cursor::move_result cursor::to_current_first_multi(bool throw_notfound) { - return move(multi_currentkey_firstvalue, throw_notfound); -} - -inline cursor::move_result cursor::to_current_prev_multi(bool throw_notfound) { - return move(multi_currentkey_prevvalue, throw_notfound); -} - -inline cursor::move_result cursor::current(bool throw_notfound) const { - return move_result(*this, throw_notfound); -} - -inline cursor::move_result 
cursor::to_current_next_multi(bool throw_notfound) { - return move(multi_currentkey_nextvalue, throw_notfound); -} - -inline cursor::move_result cursor::to_current_last_multi(bool throw_notfound) { - return move(multi_currentkey_lastvalue, throw_notfound); -} - -inline cursor::move_result cursor::to_next_first_multi(bool throw_notfound) { - return move(multi_nextkey_firstvalue, throw_notfound); -} - -inline cursor::move_result cursor::to_next(bool throw_notfound) { - return move(next, throw_notfound); -} - -inline cursor::move_result cursor::to_last(bool throw_notfound) { - return move(last, throw_notfound); -} - -inline cursor::move_result cursor::move(move_operation operation, - const slice &key, bool throw_notfound) { - return move_result(*this, operation, key, throw_notfound); -} - inline cursor::move_result cursor::find(const slice &key, bool throw_notfound) { return move(key_exact, key, throw_notfound); } @@ -6234,12 +7055,6 @@ inline cursor::move_result cursor::lower_bound(const slice &key, return move(key_lowerbound, key, throw_notfound); } -inline cursor::move_result cursor::move(move_operation operation, - const slice &key, const slice &value, - bool throw_notfound) { - return move_result(*this, operation, key, value, throw_notfound); -} - inline cursor::move_result cursor::find_multivalue(const slice &key, const slice &value, bool throw_notfound) { @@ -6253,12 +7068,7 @@ inline cursor::move_result cursor::lower_bound_multivalue(const slice &key, } inline bool cursor::seek(const slice &key) { - return move(find_key, const_cast(&key), nullptr, false); -} - -inline bool cursor::move(move_operation operation, slice &key, slice &value, - bool throw_notfound) { - return move(operation, &key, &value, throw_notfound); + return move(seek_key, const_cast(&key), nullptr, false); } inline size_t cursor::count_multivalue() const { @@ -6279,6 +7089,14 @@ inline bool cursor::on_last() const { return error::boolean_or_throw(::mdbx_cursor_on_last(*this)); } +inline bool 
cursor::on_first_multival() const { + return error::boolean_or_throw(::mdbx_cursor_on_first_dup(*this)); +} + +inline bool cursor::on_last_multival() const { + return error::boolean_or_throw(::mdbx_cursor_on_last_dup(*this)); +} + inline cursor::estimate_result cursor::estimate(const slice &key, const slice &value) const { return estimate_result(*this, multi_exactkey_lowerboundvalue, key, value); @@ -6302,6 +7120,10 @@ inline void cursor::bind(const ::mdbx::txn &txn, error::success_or_throw(::mdbx_cursor_bind(txn, handle_, map_handle.dbi)); } +inline void cursor::unbind() { + error::success_or_throw(::mdbx_cursor_unbind(handle_)); +} + inline txn cursor::txn() const { MDBX_txn *txn = ::mdbx_cursor_txn(handle_); error::throw_on_nullptr(txn, MDBX_EINVAL); diff --git a/mdbxdist/mdbx_chk.c b/mdbxdist/mdbx_chk.c index 4c48027..a13d56f 100644 --- a/mdbxdist/mdbx_chk.c +++ b/mdbxdist/mdbx_chk.c @@ -1,7 +1,7 @@ /* mdbx_chk.c - memory-mapped database check tool */ /* - * Copyright 2015-2023 Leonid Yuriev + * Copyright 2015-2024 Leonid Yuriev * and other libmdbx authors: please see AUTHORS file. * All rights reserved. * @@ -22,7 +22,7 @@ #define xMDBX_TOOLS /* Avoid using internal eASSERT() */ /* - * Copyright 2015-2023 Leonid Yuriev + * Copyright 2015-2024 Leonid Yuriev * and other libmdbx authors: please see AUTHORS file. * All rights reserved. * @@ -34,7 +34,7 @@ * top-level directory of the distribution or, alternatively, at * . 
*/ -#define MDBX_BUILD_SOURCERY 0c86daff919dc09ab5e99c8e5bfb1d51e1367babca9c630c97af92a2aa988ea9_v0_12_9_16_gfff3fbd8 +#define MDBX_BUILD_SOURCERY 91ff5b5423830ee44fca4b70dcb298f233338a17a3185c44df67ec16d3034af3_v0_13_0_38_gf1975363 #ifdef MDBX_CONFIG_H #include MDBX_CONFIG_H #endif @@ -47,11 +47,13 @@ #ifdef xMDBX_ALLOY /* Amalgamated build */ #define MDBX_INTERNAL_FUNC static -#define MDBX_INTERNAL_VAR static +#define MDBX_INTERNAL_VAR_PROTO static +#define MDBX_INTERNAL_VAR_INSTA static #else /* Non-amalgamated build */ #define MDBX_INTERNAL_FUNC -#define MDBX_INTERNAL_VAR extern +#define MDBX_INTERNAL_VAR_PROTO extern +#define MDBX_INTERNAL_VAR_INSTA #endif /* xMDBX_ALLOY */ /*----------------------------------------------------------------------------*/ @@ -116,6 +118,10 @@ disable : 5105) /* winbase.h(9531): warning C5105: macro expansion \ producing 'defined' has undefined behavior */ #endif +#if _MSC_VER < 1920 +/* avoid "error C2219: syntax error: type qualifier must be after '*'" */ +#define __restrict +#endif #if _MSC_VER > 1930 #pragma warning(disable : 6235) /* is always a constant */ #pragma warning(disable : 6237) /* is never evaluated and might \ @@ -161,7 +167,7 @@ #include "mdbx.h" /* - * Copyright 2015-2023 Leonid Yuriev + * Copyright 2015-2024 Leonid Yuriev * and other libmdbx authors: please see AUTHORS file. * All rights reserved. 
* @@ -209,6 +215,7 @@ #include #include +#include #include #include #include @@ -846,7 +853,7 @@ __extern_C key_t ftok(const char *, int); /*----------------------------------------------------------------------------*/ -#if defined(MDBX_USE_VALGRIND) +#if defined(ENABLE_MEMCHECK) #include #ifndef VALGRIND_DISABLE_ADDR_ERROR_REPORTING_IN_RANGE /* LY: available since Valgrind 3.10 */ @@ -868,7 +875,7 @@ __extern_C key_t ftok(const char *, int); #define VALGRIND_CHECK_MEM_IS_ADDRESSABLE(a, s) (0) #define VALGRIND_CHECK_MEM_IS_DEFINED(a, s) (0) #define RUNNING_ON_VALGRIND (0) -#endif /* MDBX_USE_VALGRIND */ +#endif /* ENABLE_MEMCHECK */ #ifdef __SANITIZE_ADDRESS__ #include @@ -1015,7 +1022,7 @@ extern "C" { /* https://en.wikipedia.org/wiki/Operating_system_abstraction_layer */ /* - * Copyright 2015-2023 Leonid Yuriev + * Copyright 2015-2024 Leonid Yuriev * and other libmdbx authors: please see AUTHORS file. * All rights reserved. * @@ -1219,8 +1226,8 @@ typedef pthread_mutex_t osal_fastmutex_t; /*----------------------------------------------------------------------------*/ /* OS abstraction layer stuff */ -MDBX_INTERNAL_VAR unsigned sys_pagesize; -MDBX_MAYBE_UNUSED MDBX_INTERNAL_VAR unsigned sys_pagesize_ln2, +MDBX_INTERNAL_VAR_PROTO unsigned sys_pagesize; +MDBX_MAYBE_UNUSED MDBX_INTERNAL_VAR_PROTO unsigned sys_pagesize_ln2, sys_allocation_granularity; /* Get the size of a memory page for the system. 
@@ -1484,8 +1491,9 @@ MDBX_MAYBE_UNUSED static __inline void jitter4testing(bool tiny); #endif #if defined(__linux__) || defined(__gnu_linux__) -MDBX_INTERNAL_VAR uint32_t linux_kernel_version; -MDBX_INTERNAL_VAR bool mdbx_RunningOnWSL1 /* Windows Subsystem 1 for Linux */; +MDBX_INTERNAL_VAR_PROTO uint32_t linux_kernel_version; +MDBX_INTERNAL_VAR_PROTO bool + mdbx_RunningOnWSL1 /* Windows Subsystem 1 for Linux */; #endif /* Linux */ #ifndef osal_strdup @@ -1699,7 +1707,8 @@ MDBX_INTERNAL_FUNC int osal_lck_init(MDBX_env *env, /// restore POSIX-fcntl locks after the closing of file descriptors. /// \return Error code (MDBX_PANIC) or zero on success. MDBX_INTERNAL_FUNC int osal_lck_destroy(MDBX_env *env, - MDBX_env *inprocess_neighbor); + MDBX_env *inprocess_neighbor, + const uint32_t current_pid); /// \brief Connects to shared interprocess locking objects and tries to acquire /// the maximum lock level (shared if exclusive is not available) @@ -1727,6 +1736,8 @@ MDBX_INTERNAL_FUNC int osal_lck_seize(MDBX_env *env); /// operational lock. /// \return Error code or zero on success MDBX_INTERNAL_FUNC int osal_lck_downgrade(MDBX_env *env); +MDBX_MAYBE_UNUSED MDBX_INTERNAL_FUNC int osal_lck_upgrade(MDBX_env *env, + bool dont_wait); /// \brief Locks LCK-file or/and table of readers for (de)registering. /// \return Error code or zero on success @@ -1735,16 +1746,12 @@ MDBX_INTERNAL_FUNC int osal_rdt_lock(MDBX_env *env); /// \brief Unlocks LCK-file or/and table of readers after (de)registering. MDBX_INTERNAL_FUNC void osal_rdt_unlock(MDBX_env *env); -/// \brief Acquires lock for DB change (on writing transaction start) -/// Reading transactions will not be blocked. -/// Declared as LIBMDBX_API because it is used in mdbx_chk. +/// \brief Acquires write-transaction lock. 
/// \return Error code or zero on success -LIBMDBX_API int mdbx_txn_lock(MDBX_env *env, bool dont_wait); +MDBX_INTERNAL_FUNC int osal_txn_lock(MDBX_env *env, bool dont_wait); -/// \brief Releases lock once DB changes is made (after writing transaction -/// has finished). -/// Declared as LIBMDBX_API because it is used in mdbx_chk. -LIBMDBX_API void mdbx_txn_unlock(MDBX_env *env); +/// \brief Releases write-transaction lock.. +MDBX_INTERNAL_FUNC void osal_txn_unlock(MDBX_env *env); /// \brief Sets alive-flag of reader presence (indicative lock) for PID of /// the current process. The function does no more than needed for @@ -1773,7 +1780,7 @@ MDBX_INTERNAL_FUNC int osal_rpid_check(MDBX_env *env, uint32_t pid); MDBX_INTERNAL_FUNC int osal_mb2w(const char *const src, wchar_t **const pdst); typedef void(WINAPI *osal_srwlock_t_function)(osal_srwlock_t *); -MDBX_INTERNAL_VAR osal_srwlock_t_function osal_srwlock_Init, +MDBX_INTERNAL_VAR_PROTO osal_srwlock_t_function osal_srwlock_Init, osal_srwlock_AcquireShared, osal_srwlock_ReleaseShared, osal_srwlock_AcquireExclusive, osal_srwlock_ReleaseExclusive; @@ -1826,7 +1833,7 @@ typedef struct _FILE_REMOTE_PROTOCOL_INFO { typedef BOOL(WINAPI *MDBX_GetFileInformationByHandleEx)( _In_ HANDLE hFile, _In_ FILE_INFO_BY_HANDLE_CLASS FileInformationClass, _Out_ LPVOID lpFileInformation, _In_ DWORD dwBufferSize); -MDBX_INTERNAL_VAR MDBX_GetFileInformationByHandleEx +MDBX_INTERNAL_VAR_PROTO MDBX_GetFileInformationByHandleEx mdbx_GetFileInformationByHandleEx; typedef BOOL(WINAPI *MDBX_GetVolumeInformationByHandleW)( @@ -1835,19 +1842,20 @@ typedef BOOL(WINAPI *MDBX_GetVolumeInformationByHandleW)( _Out_opt_ LPDWORD lpMaximumComponentLength, _Out_opt_ LPDWORD lpFileSystemFlags, _Out_opt_ LPWSTR lpFileSystemNameBuffer, _In_ DWORD nFileSystemNameSize); -MDBX_INTERNAL_VAR MDBX_GetVolumeInformationByHandleW +MDBX_INTERNAL_VAR_PROTO MDBX_GetVolumeInformationByHandleW mdbx_GetVolumeInformationByHandleW; typedef DWORD(WINAPI 
*MDBX_GetFinalPathNameByHandleW)(_In_ HANDLE hFile, _Out_ LPWSTR lpszFilePath, _In_ DWORD cchFilePath, _In_ DWORD dwFlags); -MDBX_INTERNAL_VAR MDBX_GetFinalPathNameByHandleW mdbx_GetFinalPathNameByHandleW; +MDBX_INTERNAL_VAR_PROTO MDBX_GetFinalPathNameByHandleW + mdbx_GetFinalPathNameByHandleW; typedef BOOL(WINAPI *MDBX_SetFileInformationByHandle)( _In_ HANDLE hFile, _In_ FILE_INFO_BY_HANDLE_CLASS FileInformationClass, _Out_ LPVOID lpFileInformation, _In_ DWORD dwBufferSize); -MDBX_INTERNAL_VAR MDBX_SetFileInformationByHandle +MDBX_INTERNAL_VAR_PROTO MDBX_SetFileInformationByHandle mdbx_SetFileInformationByHandle; typedef NTSTATUS(NTAPI *MDBX_NtFsControlFile)( @@ -1856,10 +1864,10 @@ typedef NTSTATUS(NTAPI *MDBX_NtFsControlFile)( OUT PIO_STATUS_BLOCK IoStatusBlock, IN ULONG FsControlCode, IN OUT PVOID InputBuffer, IN ULONG InputBufferLength, OUT OPTIONAL PVOID OutputBuffer, IN ULONG OutputBufferLength); -MDBX_INTERNAL_VAR MDBX_NtFsControlFile mdbx_NtFsControlFile; +MDBX_INTERNAL_VAR_PROTO MDBX_NtFsControlFile mdbx_NtFsControlFile; typedef uint64_t(WINAPI *MDBX_GetTickCount64)(void); -MDBX_INTERNAL_VAR MDBX_GetTickCount64 mdbx_GetTickCount64; +MDBX_INTERNAL_VAR_PROTO MDBX_GetTickCount64 mdbx_GetTickCount64; #if !defined(_WIN32_WINNT_WIN8) || _WIN32_WINNT < _WIN32_WINNT_WIN8 typedef struct _WIN32_MEMORY_RANGE_ENTRY { @@ -1871,13 +1879,13 @@ typedef struct _WIN32_MEMORY_RANGE_ENTRY { typedef BOOL(WINAPI *MDBX_PrefetchVirtualMemory)( HANDLE hProcess, ULONG_PTR NumberOfEntries, PWIN32_MEMORY_RANGE_ENTRY VirtualAddresses, ULONG Flags); -MDBX_INTERNAL_VAR MDBX_PrefetchVirtualMemory mdbx_PrefetchVirtualMemory; +MDBX_INTERNAL_VAR_PROTO MDBX_PrefetchVirtualMemory mdbx_PrefetchVirtualMemory; typedef enum _SECTION_INHERIT { ViewShare = 1, ViewUnmap = 2 } SECTION_INHERIT; typedef NTSTATUS(NTAPI *MDBX_NtExtendSection)(IN HANDLE SectionHandle, IN PLARGE_INTEGER NewSectionSize); -MDBX_INTERNAL_VAR MDBX_NtExtendSection mdbx_NtExtendSection; +MDBX_INTERNAL_VAR_PROTO 
MDBX_NtExtendSection mdbx_NtExtendSection; static __inline bool mdbx_RunningUnderWine(void) { return !mdbx_NtExtendSection; @@ -1887,14 +1895,15 @@ typedef LSTATUS(WINAPI *MDBX_RegGetValueA)(HKEY hkey, LPCSTR lpSubKey, LPCSTR lpValue, DWORD dwFlags, LPDWORD pdwType, PVOID pvData, LPDWORD pcbData); -MDBX_INTERNAL_VAR MDBX_RegGetValueA mdbx_RegGetValueA; +MDBX_INTERNAL_VAR_PROTO MDBX_RegGetValueA mdbx_RegGetValueA; NTSYSAPI ULONG RtlRandomEx(PULONG Seed); typedef BOOL(WINAPI *MDBX_SetFileIoOverlappedRange)(HANDLE FileHandle, PUCHAR OverlappedRangeStart, ULONG Length); -MDBX_INTERNAL_VAR MDBX_SetFileIoOverlappedRange mdbx_SetFileIoOverlappedRange; +MDBX_INTERNAL_VAR_PROTO MDBX_SetFileIoOverlappedRange + mdbx_SetFileIoOverlappedRange; #endif /* Windows */ @@ -2124,7 +2133,7 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor; #error MDBX_DPL_PREALLOC_FOR_RADIXSORT must be defined as 0 or 1 #endif /* MDBX_DPL_PREALLOC_FOR_RADIXSORT */ -/** Controls dirty pages tracking, spilling and persisting in MDBX_WRITEMAP +/** Controls dirty pages tracking, spilling and persisting in `MDBX_WRITEMAP` * mode. 0/OFF = Don't track dirty pages at all, don't spill ones, and use * msync() to persist data. This is by-default on Linux and other systems where * kernel provides properly LRU tracking and effective flushing on-demand. 1/ON @@ -2141,6 +2150,22 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor; #error MDBX_AVOID_MSYNC must be defined as 0 or 1 #endif /* MDBX_AVOID_MSYNC */ +/** Управляет механизмом поддержки разреженных наборов DBI-хендлов для снижения + * накладных расходов при запуске и обработке транзакций. 
*/ +#ifndef MDBX_ENABLE_DBI_SPARSE +#define MDBX_ENABLE_DBI_SPARSE 1 +#elif !(MDBX_ENABLE_DBI_SPARSE == 0 || MDBX_ENABLE_DBI_SPARSE == 1) +#error MDBX_ENABLE_DBI_SPARSE must be defined as 0 or 1 +#endif /* MDBX_ENABLE_DBI_SPARSE */ + +/** Управляет механизмом отложенного освобождения и поддержки пути быстрого + * открытия DBI-хендлов без захвата блокировок. */ +#ifndef MDBX_ENABLE_DBI_LOCKFREE +#define MDBX_ENABLE_DBI_LOCKFREE 1 +#elif !(MDBX_ENABLE_DBI_LOCKFREE == 0 || MDBX_ENABLE_DBI_LOCKFREE == 1) +#error MDBX_ENABLE_DBI_LOCKFREE must be defined as 0 or 1 +#endif /* MDBX_ENABLE_DBI_LOCKFREE */ + /** Controls sort order of internal page number lists. * This mostly experimental/advanced option with not for regular MDBX users. * \warning The database format depend on this option and libmdbx built with @@ -2188,8 +2213,8 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor; /** If defined then enables integration with Valgrind, * a memory analyzing tool. */ -#ifndef MDBX_USE_VALGRIND -#endif /* MDBX_USE_VALGRIND */ +#ifndef ENABLE_MEMCHECK +#endif /* ENABLE_MEMCHECK */ /** If defined then enables use C11 atomics, * otherwise detects ones availability automatically. 
*/ @@ -2512,13 +2537,23 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor; #define MDBX_RUNTIME_FLAGS_INIT \ ((MDBX_DEBUG) > 0) * MDBX_DBG_ASSERT + ((MDBX_DEBUG) > 1) * MDBX_DBG_AUDIT -extern uint8_t runtime_flags; -extern uint8_t loglevel; -extern MDBX_debug_func *debug_logger; +union logger_union { + void *ptr; + MDBX_debug_func *fmt; + MDBX_debug_func_nofmt *nofmt; +}; + +MDBX_INTERNAL_VAR_PROTO struct mdbx_static { + uint8_t flags; + uint8_t loglevel; + union logger_union logger; + size_t logger_buffer_size; + char *logger_buffer; +} mdbx_static; MDBX_MAYBE_UNUSED static __inline void jitter4testing(bool tiny) { #if MDBX_DEBUG - if (MDBX_DBG_JITTER & runtime_flags) + if (MDBX_DBG_JITTER & mdbx_static.flags) osal_jitter(tiny); #else (void)tiny; @@ -2532,17 +2567,17 @@ MDBX_INTERNAL_FUNC void debug_log_va(int level, const char *function, int line, const char *fmt, va_list args); #if MDBX_DEBUG -#define LOG_ENABLED(msg) unlikely(msg <= loglevel) -#define AUDIT_ENABLED() unlikely((runtime_flags & MDBX_DBG_AUDIT)) +#define LOG_ENABLED(msg) unlikely(msg <= mdbx_static.loglevel) +#define AUDIT_ENABLED() unlikely((mdbx_static.flags & MDBX_DBG_AUDIT)) #else /* MDBX_DEBUG */ -#define LOG_ENABLED(msg) (msg < MDBX_LOG_VERBOSE && msg <= loglevel) +#define LOG_ENABLED(msg) (msg < MDBX_LOG_VERBOSE && msg <= mdbx_static.loglevel) #define AUDIT_ENABLED() (0) #endif /* MDBX_DEBUG */ #if MDBX_FORCE_ASSERTIONS #define ASSERT_ENABLED() (1) #elif MDBX_DEBUG -#define ASSERT_ENABLED() likely((runtime_flags & MDBX_DBG_ASSERT)) +#define ASSERT_ENABLED() likely((mdbx_static.flags & MDBX_DBG_ASSERT)) #else #define ASSERT_ENABLED() (0) #endif /* assertions */ @@ -2977,7 +3012,8 @@ typedef struct MDBX_page { #define PAGETYPE_WHOLE(p) ((uint8_t)(p)->mp_flags) -/* Drop legacy P_DIRTY flag for sub-pages for compatilibity */ +/* Drop legacy P_DIRTY flag for sub-pages for compatilibity, + * for assertions only. */ #define PAGETYPE_COMPAT(p) \ (unlikely(PAGETYPE_WHOLE(p) & P_SUBP) \ ? 
PAGETYPE_WHOLE(p) & ~(P_SUBP | P_LEGACY_DIRTY) \ @@ -3086,7 +3122,7 @@ typedef sem_t osal_ipclock_t; #endif /* MDBX_LOCKING */ #if MDBX_LOCKING > MDBX_LOCKING_SYSV && !defined(__cplusplus) -MDBX_INTERNAL_FUNC int osal_ipclock_stub(osal_ipclock_t *ipc); +MDBX_INTERNAL_FUNC int osal_ipclock_stubinit(osal_ipclock_t *ipc); MDBX_INTERNAL_FUNC int osal_ipclock_destroy(osal_ipclock_t *ipc); #endif /* MDBX_LOCKING */ @@ -3099,8 +3135,9 @@ MDBX_INTERNAL_FUNC int osal_ipclock_destroy(osal_ipclock_t *ipc); * read transactions started by the same thread need no further locking to * proceed. * - * If MDBX_NOTLS is set, the slot address is not saved in thread-specific data. - * No reader table is used if the database is on a read-only filesystem. + * If MDBX_NOSTICKYTHREADS is set, the slot address is not saved in + * thread-specific data. No reader table is used if the database is on a + * read-only filesystem. * * Since the database uses multi-version concurrency control, readers don't * actually need any locking. 
This table is used to keep track of which @@ -3410,10 +3447,10 @@ typedef struct troika { #if MDBX_WORDBITS > 32 /* Workaround for false-positives from Valgrind */ uint32_t unused_pad; #endif -#define TROIKA_HAVE_STEADY(troika) ((troika)->fsm & 7) -#define TROIKA_STRICT_VALID(troika) ((troika)->tail_and_flags & 64) -#define TROIKA_VALID(troika) ((troika)->tail_and_flags & 128) -#define TROIKA_TAIL(troika) ((troika)->tail_and_flags & 3) +#define TROIKA_HAVE_STEADY(troika) ((troika)->fsm & 7u) +#define TROIKA_STRICT_VALID(troika) ((troika)->tail_and_flags & 64u) +#define TROIKA_VALID(troika) ((troika)->tail_and_flags & 128u) +#define TROIKA_TAIL(troika) ((troika)->tail_and_flags & 3u) txnid_t txnid[NUM_METAS]; } meta_troika_t; @@ -3443,6 +3480,8 @@ struct MDBX_txn { #error "Oops, some txn flags overlapped or wrong" #endif uint32_t mt_flags; + unsigned mt_numdbs; + size_t mt_owner; /* thread ID that owns this transaction */ MDBX_txn *mt_parent; /* parent of a nested txn */ /* Nested txn under this txn, set together with flag MDBX_TXN_HAS_CHILD */ @@ -3460,31 +3499,30 @@ struct MDBX_txn { txnid_t mt_front; MDBX_env *mt_env; /* the DB environment */ - /* Array of records for each DB known in the environment. 
*/ - MDBX_dbx *mt_dbxs; /* Array of MDBX_db records for each known DB */ MDBX_db *mt_dbs; - /* Array of sequence numbers for each DB handle */ - MDBX_atomic_uint32_t *mt_dbiseqs; - - /* Transaction DBI Flags */ -#define DBI_DIRTY MDBX_DBI_DIRTY /* DB was written in this txn */ -#define DBI_STALE MDBX_DBI_STALE /* Named-DB record is older than txnID */ -#define DBI_FRESH MDBX_DBI_FRESH /* Named-DB handle opened in this txn */ -#define DBI_CREAT MDBX_DBI_CREAT /* Named-DB handle created in this txn */ -#define DBI_VALID 0x10 /* DB handle is valid, see also DB_VALID */ -#define DBI_USRVALID 0x20 /* As DB_VALID, but not set for FREE_DBI */ -#define DBI_AUDITED 0x40 /* Internal flag for accounting during audit */ - /* Array of flags for each DB */ - uint8_t *mt_dbistate; - /* Number of DB records in use, or 0 when the txn is finished. - * This number only ever increments until the txn finishes; we - * don't decrement it when individual DB handles are closed. */ - MDBX_dbi mt_numdbs; - size_t mt_owner; /* thread ID that owns this transaction */ + +#if MDBX_ENABLE_DBI_SPARSE + unsigned *__restrict mt_dbi_sparse; +#endif /* MDBX_ENABLE_DBI_SPARSE */ + + /* Non-shared DBI state flags inside transaction */ +#define DBI_DIRTY 0x01 /* DB was written in this txn */ +#define DBI_STALE 0x02 /* Named-DB record is older than txnID */ +#define DBI_FRESH 0x04 /* Named-DB handle opened in this txn */ +#define DBI_CREAT 0x08 /* Named-DB handle created in this txn */ +#define DBI_VALID 0x10 /* Handle is valid, see also DB_VALID */ +#define DBI_OLDEN 0x40 /* Handle was closed/reopened outside txn */ +#define DBI_LINDO 0x80 /* Lazy initialization done for DBI-slot */ + /* Array of non-shared txn's flags of DBI */ + uint8_t *__restrict mt_dbi_state; + + /* Array of sequence numbers for each DB handle. 
*/ + uint32_t *__restrict mt_dbi_seqs; + MDBX_cursor **mt_cursors; + MDBX_canary mt_canary; void *mt_userctx; /* User-settable context */ - MDBX_cursor **mt_cursors; union { struct { @@ -3494,8 +3532,8 @@ struct MDBX_txn { struct { meta_troika_t troika; /* In write txns, array of cursors for each DB */ - MDBX_PNL relist; /* Reclaimed GC pages */ - txnid_t last_reclaimed; /* ID of last used record */ + MDBX_PNL __restrict relist; /* Reclaimed GC pages */ + txnid_t last_reclaimed; /* ID of last used record */ #if MDBX_ENABLE_REFUND pgno_t loose_refund_wl /* FIXME: describe */; #endif /* MDBX_ENABLE_REFUND */ @@ -3507,14 +3545,14 @@ struct MDBX_txn { * dirtylist into mt_parent after freeing hidden mt_parent pages. */ size_t dirtyroom; /* For write txns: Modified pages. Sorted when not MDBX_WRITEMAP. */ - MDBX_dpl *dirtylist; + MDBX_dpl *__restrict dirtylist; /* The list of reclaimed txns from GC */ - MDBX_TXL lifo_reclaimed; + MDBX_TXL __restrict lifo_reclaimed; /* The list of pages that became unused during this transaction. */ - MDBX_PNL retired_pages; + MDBX_PNL __restrict retired_pages; /* The list of loose pages that became unused and may be reused * in this transaction, linked through `mp_next`. */ - MDBX_page *loose_pages; + MDBX_page *__restrict loose_pages; /* Number of loose pages (tw.loose_pages) */ size_t loose_count; union { @@ -3523,11 +3561,12 @@ struct MDBX_txn { /* The sorted list of dirty pages we temporarily wrote to disk * because the dirty list was full. page numbers in here are * shifted left by 1, deleted slots have the LSB set. 
*/ - MDBX_PNL list; + MDBX_PNL __restrict list; } spilled; size_t writemap_dirty_npages; size_t writemap_spilled_npages; }; + uint64_t gc_time_acc; } tw; }; }; @@ -3566,8 +3605,8 @@ struct MDBX_cursor { MDBX_db *mc_db; /* The database auxiliary record for this cursor */ MDBX_dbx *mc_dbx; - /* The mt_dbistate for this database */ - uint8_t *mc_dbistate; + /* The mt_dbi_state[] for this DBI */ + uint8_t *__restrict mc_dbi_state; uint8_t mc_snum; /* number of pushed pages */ uint8_t mc_top; /* index of top page, normally mc_snum-1 */ @@ -3620,6 +3659,11 @@ typedef struct MDBX_cursor_couple { MDBX_xcursor inner; } MDBX_cursor_couple; +struct mdbx_defer_free_item { + struct mdbx_defer_free_item *next; + uint64_t timestamp; +}; + /* The database environment. */ struct MDBX_env { /* ----------------------------------------------------- mostly static part */ @@ -3637,6 +3681,7 @@ struct MDBX_env { #define MDBX_DEPRECATED_COALESCE UINT32_C(0x2000000) #define ENV_INTERNAL_FLAGS (MDBX_FATAL_ERROR | MDBX_ENV_ACTIVE | MDBX_ENV_TXKEY) uint32_t me_flags; + unsigned me_psize; /* DB page size, initialized from me_os_psize */ osal_mmap_t me_dxb_mmap; /* The main data file */ #define me_map me_dxb_mmap.base #define me_lazy_fd me_dxb_mmap.fd @@ -3649,7 +3694,6 @@ struct MDBX_env { #define me_lfd me_lck_mmap.fd struct MDBX_lockinfo *me_lck; - unsigned me_psize; /* DB page size, initialized from me_os_psize */ uint16_t me_leaf_nodemax; /* max size of a leaf-node */ uint16_t me_branch_nodemax; /* max size of a branch-node */ uint16_t me_subpage_limit; @@ -3667,13 +3711,15 @@ struct MDBX_env { MDBX_dbi me_maxdbs; /* size of the DB table */ uint32_t me_pid; /* process ID of this env */ osal_thread_key_t me_txkey; /* thread-key for readers */ - pathchar_t *me_pathname; /* path to the DB files */ - void *me_pbuf; /* scratch area for DUPSORT put() */ - MDBX_txn *me_txn0; /* preallocated write transaction */ - - MDBX_dbx *me_dbxs; /* array of static DB info */ - uint16_t *me_dbflags; /* array 
of flags from MDBX_db.md_flags */ - MDBX_atomic_uint32_t *me_dbiseqs; /* array of dbi sequence numbers */ + struct { /* path to the DB files */ + pathchar_t *lck, *dxb, *specified; + void *buffer; + } me_pathname; + void *me_pbuf; /* scratch area for DUPSORT put() */ + MDBX_txn *me_txn0; /* preallocated write transaction */ + MDBX_dbx *me_dbxs; /* array of static DB info */ + uint16_t *__restrict me_db_flags; /* array of flags from MDBX_db.md_flags */ + MDBX_atomic_uint32_t *me_dbi_seqs; /* array of dbi sequence numbers */ unsigned me_maxgc_ov1page; /* Number of pgno_t fit in a single overflow page */ unsigned me_maxgc_per_branch; @@ -3687,6 +3733,7 @@ struct MDBX_env { unsigned rp_augment_limit; unsigned dp_limit; unsigned dp_initial; + uint64_t gc_time_limit; uint8_t dp_loose_limit; uint8_t spill_max_denominator; uint8_t spill_min_denominator; @@ -3696,6 +3743,8 @@ struct MDBX_env { unsigned writethrough_threshold; #endif /* Windows */ bool prefault_write; + bool prefer_waf_insteadof_balance; /* Strive to minimize WAF instead of + balancing pages fullment */ union { unsigned all; /* tracks options with non-auto values but tuned by user */ @@ -3725,20 +3774,23 @@ struct MDBX_env { } me_sysv_ipc; #endif /* MDBX_LOCKING == MDBX_LOCKING_SYSV */ bool me_incore; + bool me_prefault_write; - MDBX_env *me_lcklist_next; +#if MDBX_ENABLE_DBI_LOCKFREE + struct mdbx_defer_free_item *me_defer_free; +#endif /* MDBX_ENABLE_DBI_LOCKFREE */ /* --------------------------------------------------- mostly volatile part */ MDBX_txn *me_txn; /* current write transaction */ osal_fastmutex_t me_dbi_lock; - MDBX_dbi me_numdbs; /* number of DBs opened */ - bool me_prefault_write; + unsigned me_numdbs; /* number of DBs opened */ - MDBX_page *me_dp_reserve; /* list of malloc'ed blocks for re-use */ unsigned me_dp_reserve_len; + MDBX_page *__restrict me_dp_reserve; /* list of malloc'ed blocks for re-use */ + /* PNL of pages that became unused in a write txn */ - MDBX_PNL me_retired_pages; + 
MDBX_PNL __restrict me_retired_pages; osal_ioring_t me_ioring; #if defined(_WIN32) || defined(_WIN64) @@ -3756,13 +3808,12 @@ struct MDBX_env { #if MDBX_DEBUG MDBX_assert_func *me_assert_func; /* Callback for assertion failures */ #endif -#ifdef MDBX_USE_VALGRIND +#ifdef ENABLE_MEMCHECK int me_valgrind_handle; #endif -#if defined(MDBX_USE_VALGRIND) || defined(__SANITIZE_ADDRESS__) - MDBX_atomic_uint32_t me_ignore_EDEADLK; +#if defined(ENABLE_MEMCHECK) || defined(__SANITIZE_ADDRESS__) pgno_t me_poison_edge; -#endif /* MDBX_USE_VALGRIND || __SANITIZE_ADDRESS__ */ +#endif /* ENABLE_MEMCHECK || __SANITIZE_ADDRESS__ */ #ifndef xMDBX_DEBUG_SPILLING #define xMDBX_DEBUG_SPILLING 0 @@ -3822,10 +3873,6 @@ osal_flush_incoherent_mmap(const void *addr, size_t nbytes, MDBX_INTERNAL_FUNC int cleanup_dead_readers(MDBX_env *env, int rlocked, int *dead); -MDBX_INTERNAL_FUNC int rthc_alloc(osal_thread_key_t *key, MDBX_reader *begin, - MDBX_reader *end); -MDBX_INTERNAL_FUNC void rthc_remove(const osal_thread_key_t key); - MDBX_INTERNAL_FUNC void global_ctor(void); MDBX_INTERNAL_FUNC void osal_ctor(void); MDBX_INTERNAL_FUNC void global_dtor(void); @@ -3940,7 +3987,8 @@ typedef struct MDBX_node { /* mdbx_dbi_open() flags */ #define DB_USABLE_FLAGS (DB_PERSISTENT_FLAGS | MDBX_CREATE | MDBX_DB_ACCEDE) -#define DB_VALID 0x8000 /* DB handle is valid, for me_dbflags */ +#define DB_VALID 0x8000u /* DB handle is valid, for me_db_flags */ +#define DB_POISON 0x7fffu /* update pending */ #define DB_INTERNAL_FLAGS DB_VALID #if DB_INTERNAL_FLAGS & DB_USABLE_FLAGS @@ -4029,11 +4077,11 @@ log2n_powerof2(size_t value_uintptr) { * environment and re-opening it with the new flags. 
*/ #define ENV_CHANGEABLE_FLAGS \ (MDBX_SAFE_NOSYNC | MDBX_NOMETASYNC | MDBX_DEPRECATED_MAPASYNC | \ - MDBX_NOMEMINIT | MDBX_COALESCE | MDBX_PAGEPERTURB | MDBX_ACCEDE | \ - MDBX_VALIDATION) + MDBX_NOMEMINIT | MDBX_DEPRECATED_COALESCE | MDBX_PAGEPERTURB | \ + MDBX_ACCEDE | MDBX_VALIDATION) #define ENV_CHANGELESS_FLAGS \ - (MDBX_NOSUBDIR | MDBX_RDONLY | MDBX_WRITEMAP | MDBX_NOTLS | MDBX_NORDAHEAD | \ - MDBX_LIFORECLAIM | MDBX_EXCLUSIVE) + (MDBX_NOSUBDIR | MDBX_RDONLY | MDBX_WRITEMAP | MDBX_NOSTICKYTHREADS | \ + MDBX_NORDAHEAD | MDBX_LIFORECLAIM | MDBX_EXCLUSIVE) #define ENV_USABLE_FLAGS (ENV_CHANGEABLE_FLAGS | ENV_CHANGELESS_FLAGS) #if !defined(__cplusplus) || CONSTEXPR_ENUM_FLAGS_OPERATIONS @@ -4067,20 +4115,37 @@ MDBX_MAYBE_UNUSED static void static_checks(void) { ASAN_UNPOISON_MEMORY_REGION(addr, size); \ } while (0) -#include +/******************************************************************************/ + +/** \brief Page types for traverse the b-tree. + * \see mdbx_env_pgwalk() \see MDBX_pgvisitor_func */ +enum MDBX_page_type_t { + MDBX_page_broken, + MDBX_page_large, + MDBX_page_branch, + MDBX_page_leaf, + MDBX_page_dupfixed_leaf, + MDBX_subpage_leaf, + MDBX_subpage_dupfixed_leaf, + MDBX_subpage_broken, +}; +typedef enum MDBX_page_type_t MDBX_page_type_t; -typedef struct flagbit { - int bit; - const char *name; -} flagbit; +typedef struct MDBX_walk_sdb { + MDBX_val name; + struct MDBX_db *internal, *nested; +} MDBX_walk_sdb_t; + +/** \brief Callback function for traverse the b-tree. 
\see mdbx_env_pgwalk() */ +typedef int +MDBX_pgvisitor_func(const size_t pgno, const unsigned number, void *const ctx, + const int deep, const MDBX_walk_sdb_t *subdb, + const size_t page_size, const MDBX_page_type_t page_type, + const MDBX_error_t err, const size_t nentries, + const size_t payload_bytes, const size_t header_bytes, + const size_t unused_bytes); -const flagbit dbflags[] = {{MDBX_DUPSORT, "dupsort"}, - {MDBX_INTEGERKEY, "integerkey"}, - {MDBX_REVERSEKEY, "reversekey"}, - {MDBX_DUPFIXED, "dupfixed"}, - {MDBX_REVERSEDUP, "reversedup"}, - {MDBX_INTEGERDUP, "integerdup"}, - {0, nullptr}}; +#include #if defined(_WIN32) || defined(_WIN64) /* @@ -4202,181 +4267,171 @@ static void signal_handler(int sig) { #define EXIT_FAILURE_CHECK_MAJOR (EXIT_FAILURE + 1) #define EXIT_FAILURE_CHECK_MINOR EXIT_FAILURE -typedef struct { - MDBX_val name; - struct { - uint64_t branch, large_count, large_volume, leaf; - uint64_t subleaf_dupsort, leaf_dupfixed, subleaf_dupfixed; - uint64_t total, empty, other; - } pages; - uint64_t payload_bytes; - uint64_t lost_bytes; -} walk_dbi_t; - -struct { - short *pagemap; - uint64_t total_payload_bytes; - uint64_t pgcount; - walk_dbi_t - dbi[MDBX_MAX_DBI + CORE_DBS + /* account pseudo-entry for meta */ 1]; -} walk; - -#define dbi_free walk.dbi[FREE_DBI] -#define dbi_main walk.dbi[MAIN_DBI] -#define dbi_meta walk.dbi[CORE_DBS] - -int envflags = MDBX_RDONLY | MDBX_EXCLUSIVE | MDBX_VALIDATION; +enum MDBX_env_flags_t env_flags = + MDBX_RDONLY | MDBX_EXCLUSIVE | MDBX_VALIDATION; MDBX_env *env; MDBX_txn *txn; -MDBX_envinfo envinfo; -size_t userdb_count, skipped_subdb; -uint64_t total_unused_bytes, reclaimable_pages, gc_pages, alloc_pages, - unused_pages, backed_pages; -unsigned verbose; -bool ignore_wrong_order, quiet, dont_traversal; +unsigned verbose = 0; +bool quiet; MDBX_val only_subdb; int stuck_meta = -1; +MDBX_chk_context_t chk; +bool turn_meta = false; +bool force_turn_meta = false; +enum MDBX_chk_flags_t chk_flags = MDBX_CHK_DEFAULTS; 
+enum MDBX_chk_stage chk_stage = MDBX_chk_none; + +static MDBX_chk_line_t line_struct; +static size_t anchor_lineno; +static size_t line_count; +static FILE *line_output; + +#define LINE_SEVERITY_NONE 255 +static bool lf(void) { + if (!line_struct.empty) { + line_count += 1; + line_struct.empty = true; + line_struct.severity = LINE_SEVERITY_NONE; + line_struct.scope_depth = 0; + if (line_output) { + fputc('\n', line_output); + return true; + } + } + return false; +} -struct problem { - struct problem *pr_next; - size_t count; - const char *caption; -}; - -struct problem *problems_list; -unsigned total_problems, data_tree_problems, gc_tree_problems; - -static void MDBX_PRINTF_ARGS(1, 2) print(const char *msg, ...) { - if (!quiet) { - va_list args; +static void flush(void) { fflush(nullptr); } - fflush(stderr); - va_start(args, msg); - vfprintf(stdout, msg, args); - va_end(args); - } +static void lf_flush(void) { + if (lf()) + flush(); } -static MDBX_val printable_buf; -static void free_printable_buf(void) { osal_free(printable_buf.iov_base); } - -static const char *sdb_name(const MDBX_val *val) { - if (val == MDBX_PGWALK_MAIN) - return "@MAIN"; - if (val == MDBX_PGWALK_GC) - return "@GC"; - if (val == MDBX_PGWALK_META) - return "@META"; - - const unsigned char *const data = val->iov_base; - const size_t len = val->iov_len; - if (data == MDBX_PGWALK_MAIN) - return "@MAIN"; - if (data == MDBX_PGWALK_GC) - return "@GC"; - if (data == MDBX_PGWALK_META) - return "@META"; - - if (!len) - return ""; - if (!data) - return ""; - if (len > 65536) { - static char buf[64]; - /* NOTE: There is MSYS2 MinGW bug if you here got - * the "unknown conversion type character ‘z’ in format [-Werror=format=]" - * https://stackoverflow.com/questions/74504432/whats-the-proper-way-to-tell-mingw-based-gcc-to-use-ansi-stdio-output-on-windo - */ - snprintf(buf, sizeof(buf), "", len); - return buf; - } +static bool silently(enum MDBX_chk_severity severity) { + int cutoff = + chk.scope ? 
chk.scope->verbosity >> MDBX_chk_severity_prio_shift + : verbose + (MDBX_chk_result >> MDBX_chk_severity_prio_shift); + int prio = (severity >> MDBX_chk_severity_prio_shift); + if (chk.scope && chk.scope->stage == MDBX_chk_traversal_subdbs && verbose < 2) + prio += 1; + return quiet || cutoff < ((prio > 0) ? prio : 0); +} - bool printable = true; - bool quoting = false; - size_t xchars = 0; - for (size_t i = 0; i < val->iov_len && printable; ++i) { - quoting |= data[i] != '_' && isalnum(data[i]) == 0; - printable = isprint(data[i]) != 0 || - (data[i] < ' ' && ++xchars < 4 && len > xchars * 4); - } +static FILE *prefix(enum MDBX_chk_severity severity) { + if (silently(severity)) + return nullptr; - size_t need = len + 1; - if (quoting || !printable) - need += len + /* quotes */ 2 + 2 * /* max xchars */ 4; - if (need > printable_buf.iov_len) { - void *ptr = osal_realloc(printable_buf.iov_base, need); - if (!ptr) - return ""; - if (!printable_buf.iov_base) - atexit(free_printable_buf); - printable_buf.iov_base = ptr; - printable_buf.iov_len = need; - } + static const char *const prefixes[16] = { + "!!!fatal: ", // 0 fatal + " ! 
", // 1 error + " ~ ", // 2 warning + " ", // 3 notice + "", // 4 result + " = ", // 5 resolution + " - ", // 6 processing + " ", // 7 info + " ", // 8 verbose + " ", // 9 details + " // ", // A lib-verbose + " //// ", // B lib-debug + " ////// ", // C lib-trace + " ////// ", // D lib-extra + " ////// ", // E +1 + " ////// " // F +2 + }; - char *out = printable_buf.iov_base; - if (!quoting) { - memcpy(out, data, len); - out += len; - } else if (printable) { - *out++ = '\''; - for (size_t i = 0; i < len; ++i) { - if (data[i] < ' ') { - assert((char *)printable_buf.iov_base + printable_buf.iov_len > - out + 4); - static const char hex[] = "0123456789abcdef"; - out[0] = '\\'; - out[1] = 'x'; - out[2] = hex[data[i] >> 4]; - out[3] = hex[data[i] & 15]; - out += 4; - } else if (strchr("\"'`\\", data[i])) { - assert((char *)printable_buf.iov_base + printable_buf.iov_len > - out + 2); - out[0] = '\\'; - out[1] = data[i]; - out += 2; - } else { - assert((char *)printable_buf.iov_base + printable_buf.iov_len > - out + 1); - *out++ = data[i]; - } + const bool nl = + line_struct.scope_depth != chk.scope_nesting || + (line_struct.severity != severity && + (line_struct.severity != MDBX_chk_processing || + severity < MDBX_chk_result || severity > MDBX_chk_resolution)); + if (nl) + lf(); + if (severity < MDBX_chk_warning) + flush(); + FILE *out = (severity > MDBX_chk_error) ? 
stdout : stderr; + if (nl || line_struct.empty) { + line_struct.severity = severity; + line_struct.scope_depth = chk.scope_nesting; + unsigned kind = line_struct.severity & MDBX_chk_severity_kind_mask; + if (line_struct.scope_depth || *prefixes[kind]) { + line_struct.empty = false; + for (size_t i = 0; i < line_struct.scope_depth; ++i) + fputs(" ", out); + fputs(prefixes[kind], out); } - *out++ = '\''; } - assert((char *)printable_buf.iov_base + printable_buf.iov_len > out); - *out = 0; - return printable_buf.iov_base; + return line_output = out; } -static void va_log(MDBX_log_level_t level, const char *function, int line, - const char *msg, va_list args) { - static const char *const prefixes[] = { - "!!!fatal: ", " ! " /* error */, " ~ " /* warning */, - " " /* notice */, " // " /* verbose */, " //// " /* debug */, - " ////// " /* trace */ - }; - - FILE *out = stdout; - if (level <= MDBX_LOG_ERROR) { - total_problems++; - out = stderr; +static void suffix(size_t cookie, const char *str) { + if (cookie == line_count && !line_struct.empty) { + fprintf(line_output, " %s", str); + line_struct.empty = false; + lf(); } +} - if (!quiet && verbose + 1 >= (unsigned)level && - (unsigned)level < ARRAY_LENGTH(prefixes)) { - fflush(nullptr); - fputs(prefixes[level], out); +static size_t MDBX_PRINTF_ARGS(2, 3) + print(enum MDBX_chk_severity severity, const char *msg, ...) { + FILE *out = prefix(severity); + if (out) { + va_list args; + va_start(args, msg); vfprintf(out, msg, args); + va_end(args); + line_struct.empty = false; + return line_count; + } + return 0; +} - const bool have_lf = msg[strlen(msg) - 1] == '\n'; - if (level == MDBX_LOG_FATAL && function && line) - fprintf(out, have_lf ? " %s(), %u\n" : " (%s:%u)\n", - function + (strncmp(function, "mdbx_", 5) ? 5 : 0), line); - else if (!have_lf) - fputc('\n', out); - fflush(nullptr); +static FILE *MDBX_PRINTF_ARGS(2, 3) + print_ln(enum MDBX_chk_severity severity, const char *msg, ...) 
{ + FILE *out = prefix(severity); + if (out) { + va_list args; + va_start(args, msg); + vfprintf(out, msg, args); + va_end(args); + line_struct.empty = false; + lf(); } + return out; +} +static void logger(MDBX_log_level_t level, const char *function, int line, + const char *fmt, va_list args) { + if (level <= MDBX_LOG_ERROR) + mdbx_env_chk_encount_problem(&chk); + + const unsigned kind = (level > MDBX_LOG_NOTICE) + ? level - MDBX_LOG_NOTICE + + (MDBX_chk_extra & MDBX_chk_severity_kind_mask) + : level; + const unsigned prio = kind << MDBX_chk_severity_prio_shift; + enum MDBX_chk_severity severity = prio + kind; + FILE *out = prefix(severity); + if (out) { + vfprintf(out, fmt, args); + const bool have_lf = fmt[strlen(fmt) - 1] == '\n'; + if (level == MDBX_LOG_FATAL && function && line) { + if (have_lf) + for (size_t i = 0; i < line_struct.scope_depth; ++i) + fputs(" ", out); + fprintf(out, have_lf ? " %s(), %u" : " (%s:%u)", + function + (strncmp(function, "mdbx_", 5) ? 0 : 5), line); + lf(); + } else if (have_lf) { + line_struct.empty = true; + line_struct.severity = LINE_SEVERITY_NONE; + line_count += 1; + } else + lf(); + } + if (level < MDBX_LOG_VERBOSE) + flush(); if (level == MDBX_LOG_FATAL) { #if !MDBX_DEBUG && !MDBX_FORCE_ASSERTIONS exit(EXIT_FAILURE_MDBX); @@ -4385,759 +4440,136 @@ static void va_log(MDBX_log_level_t level, const char *function, int line, } } -static void MDBX_PRINTF_ARGS(1, 2) error(const char *msg, ...) { +static void MDBX_PRINTF_ARGS(1, 2) error_fmt(const char *msg, ...) 
{ va_list args; va_start(args, msg); - va_log(MDBX_LOG_ERROR, nullptr, 0, msg, args); + logger(MDBX_LOG_ERROR, nullptr, 0, msg, args); va_end(args); } -static void logger(MDBX_log_level_t level, const char *function, int line, - const char *msg, va_list args) { - (void)line; - (void)function; - if (level < MDBX_LOG_EXTRA) - va_log(level, function, line, msg, args); +static int error_fn(const char *fn, int err) { + if (err) + error_fmt("%s() failed, error %d, %s", fn, err, mdbx_strerror(err)); + return err; } -static int check_user_break(void) { - switch (user_break) { - case 0: - return MDBX_SUCCESS; - case 1: - print(" - interrupted by signal\n"); - fflush(nullptr); +static bool check_break(MDBX_chk_context_t *ctx) { + (void)ctx; + if (!user_break) + return false; + if (user_break == 1) { + print(MDBX_chk_resolution, "interrupted by signal"); + lf_flush(); user_break = 2; } - return MDBX_EINTR; -} - -static void pagemap_cleanup(void) { - osal_free(walk.pagemap); - walk.pagemap = nullptr; -} - -static bool eq(const MDBX_val a, const MDBX_val b) { - return a.iov_len == b.iov_len && - (a.iov_base == b.iov_base || a.iov_len == 0 || - !memcmp(a.iov_base, b.iov_base, a.iov_len)); -} - -static walk_dbi_t *pagemap_lookup_dbi(const MDBX_val *dbi_name, bool silent) { - static walk_dbi_t *last; - - if (dbi_name == MDBX_PGWALK_MAIN) - return &dbi_main; - if (dbi_name == MDBX_PGWALK_GC) - return &dbi_free; - if (dbi_name == MDBX_PGWALK_META) - return &dbi_meta; - - if (last && eq(last->name, *dbi_name)) - return last; - - walk_dbi_t *dbi = walk.dbi + CORE_DBS + /* account pseudo-entry for meta */ 1; - for (; dbi < ARRAY_END(walk.dbi) && dbi->name.iov_base; ++dbi) { - if (eq(dbi->name, *dbi_name)) - return last = dbi; - } - - if (verbose > 0 && !silent) { - print(" - found %s area\n", sdb_name(dbi_name)); - fflush(nullptr); - } - - if (dbi == ARRAY_END(walk.dbi)) - return nullptr; - - dbi->name = *dbi_name; - return last = dbi; + return true; } -static void MDBX_PRINTF_ARGS(4, 
5) - problem_add(const char *object, uint64_t entry_number, const char *msg, - const char *extra, ...) { - total_problems++; - - if (!quiet) { - int need_fflush = 0; - struct problem *p; - - for (p = problems_list; p; p = p->pr_next) - if (p->caption == msg) - break; - - if (!p) { - p = osal_calloc(1, sizeof(*p)); - if (unlikely(!p)) - return; - p->caption = msg; - p->pr_next = problems_list; - problems_list = p; - need_fflush = 1; - } - - p->count++; - if (verbose > 1) { - print(" %s #%" PRIu64 ": %s", object, entry_number, msg); - if (extra) { - va_list args; - printf(" ("); - va_start(args, extra); - vfprintf(stdout, extra, args); - va_end(args); - printf(")"); - } - printf("\n"); - if (need_fflush) - fflush(nullptr); +static int scope_push(MDBX_chk_context_t *ctx, MDBX_chk_scope_t *scope, + MDBX_chk_scope_t *inner, const char *fmt, va_list args) { + (void)scope; + if (fmt && *fmt) { + FILE *out = prefix(MDBX_chk_processing); + if (out) { + vfprintf(out, fmt, args); + inner->usr_o.number = line_count; + line_struct.ctx = ctx; + flush(); } } + return MDBX_SUCCESS; } -static struct problem *problems_push(void) { - struct problem *p = problems_list; - problems_list = nullptr; - return p; +static void scope_pop(MDBX_chk_context_t *ctx, MDBX_chk_scope_t *scope, + MDBX_chk_scope_t *inner) { + (void)ctx; + (void)scope; + suffix(inner->usr_o.number, inner->subtotal_issues ? "error(s)" : "done"); + flush(); } -static size_t problems_pop(struct problem *list) { - size_t count = 0; - - if (problems_list) { - int i; - - print(" - problems: "); - for (i = 0; problems_list; ++i) { - struct problem *p = problems_list->pr_next; - count += problems_list->count; - print("%s%s (%" PRIuPTR ")", i ? 
", " : "", problems_list->caption, - problems_list->count); - osal_free(problems_list); - problems_list = p; - } - print("\n"); - fflush(nullptr); - } - - problems_list = list; - return count; +static MDBX_chk_user_subdb_cookie_t *subdb_filter(MDBX_chk_context_t *ctx, + const MDBX_val *name, + MDBX_db_flags_t flags) { + (void)ctx; + (void)flags; + return (!only_subdb.iov_base || + (only_subdb.iov_len == name->iov_len && + memcmp(only_subdb.iov_base, name->iov_base, name->iov_len) == 0)) + ? (void *)(intptr_t)-1 + : nullptr; } -static int pgvisitor(const uint64_t pgno, const unsigned pgnumber, - void *const ctx, const int deep, const MDBX_val *dbi_name, - const size_t page_size, const MDBX_page_type_t pagetype, - const MDBX_error_t err, const size_t nentries, - const size_t payload_bytes, const size_t header_bytes, - const size_t unused_bytes) { +static int stage_begin(MDBX_chk_context_t *ctx, enum MDBX_chk_stage stage) { (void)ctx; - const bool is_gc_tree = dbi_name == MDBX_PGWALK_GC; - if (deep > 42) { - problem_add("deep", deep, "too large", nullptr); - data_tree_problems += !is_gc_tree; - gc_tree_problems += is_gc_tree; - return MDBX_CORRUPTED /* avoid infinite loop/recursion */; - } - - walk_dbi_t *dbi = pagemap_lookup_dbi(dbi_name, false); - if (!dbi) { - data_tree_problems += !is_gc_tree; - gc_tree_problems += is_gc_tree; - return MDBX_ENOMEM; - } - - const size_t page_bytes = payload_bytes + header_bytes + unused_bytes; - walk.pgcount += pgnumber; - - const char *pagetype_caption; - bool branch = false; - switch (pagetype) { - default: - problem_add("page", pgno, "unknown page-type", "type %u, deep %i", - (unsigned)pagetype, deep); - pagetype_caption = "unknown"; - dbi->pages.other += pgnumber; - data_tree_problems += !is_gc_tree; - gc_tree_problems += is_gc_tree; - break; - case MDBX_page_broken: - pagetype_caption = "broken"; - dbi->pages.other += pgnumber; - data_tree_problems += !is_gc_tree; - gc_tree_problems += is_gc_tree; - break; - case 
MDBX_subpage_broken: - pagetype_caption = "broken-subpage"; - data_tree_problems += !is_gc_tree; - gc_tree_problems += is_gc_tree; - break; - case MDBX_page_meta: - pagetype_caption = "meta"; - dbi->pages.other += pgnumber; - break; - case MDBX_page_large: - pagetype_caption = "large"; - dbi->pages.large_volume += pgnumber; - dbi->pages.large_count += 1; - break; - case MDBX_page_branch: - pagetype_caption = "branch"; - dbi->pages.branch += pgnumber; - branch = true; - break; - case MDBX_page_leaf: - pagetype_caption = "leaf"; - dbi->pages.leaf += pgnumber; - break; - case MDBX_page_dupfixed_leaf: - pagetype_caption = "leaf-dupfixed"; - dbi->pages.leaf_dupfixed += pgnumber; - break; - case MDBX_subpage_leaf: - pagetype_caption = "subleaf-dupsort"; - dbi->pages.subleaf_dupsort += 1; - break; - case MDBX_subpage_dupfixed_leaf: - pagetype_caption = "subleaf-dupfixed"; - dbi->pages.subleaf_dupfixed += 1; - break; - } - - if (pgnumber) { - if (verbose > 3 && (!only_subdb.iov_base || eq(only_subdb, dbi->name))) { - if (pgnumber == 1) - print(" %s-page %" PRIu64, pagetype_caption, pgno); - else - print(" %s-span %" PRIu64 "[%u]", pagetype_caption, pgno, pgnumber); - print(" of %s: header %" PRIiPTR ", %s %" PRIiPTR ", payload %" PRIiPTR - ", unused %" PRIiPTR ", deep %i\n", - sdb_name(&dbi->name), header_bytes, - (pagetype == MDBX_page_branch) ? "keys" : "entries", nentries, - payload_bytes, unused_bytes, deep); - } - - bool already_used = false; - for (unsigned n = 0; n < pgnumber; ++n) { - uint64_t spanpgno = pgno + n; - if (spanpgno >= alloc_pages) { - problem_add("page", spanpgno, "wrong page-no", - "%s-page: %" PRIu64 " > %" PRIu64 ", deep %i", - pagetype_caption, spanpgno, alloc_pages, deep); - data_tree_problems += !is_gc_tree; - gc_tree_problems += is_gc_tree; - } else if (walk.pagemap[spanpgno]) { - walk_dbi_t *coll_dbi = &walk.dbi[walk.pagemap[spanpgno] - 1]; - problem_add("page", spanpgno, - (branch && coll_dbi == dbi) ? 
"loop" : "already used", - "%s-page: by %s, deep %i", pagetype_caption, - sdb_name(&coll_dbi->name), deep); - already_used = true; - data_tree_problems += !is_gc_tree; - gc_tree_problems += is_gc_tree; - } else { - walk.pagemap[spanpgno] = (short)(dbi - walk.dbi + 1); - dbi->pages.total += 1; - } - } - - if (already_used) - return branch ? MDBX_RESULT_TRUE /* avoid infinite loop/recursion */ - : MDBX_SUCCESS; - } - - if (MDBX_IS_ERROR(err)) { - problem_add("page", pgno, "invalid/corrupted", "%s-page", pagetype_caption); - data_tree_problems += !is_gc_tree; - gc_tree_problems += is_gc_tree; - } else { - if (unused_bytes > page_size) { - problem_add("page", pgno, "illegal unused-bytes", - "%s-page: %u < %" PRIuPTR " < %u", pagetype_caption, 0, - unused_bytes, envinfo.mi_dxb_pagesize); - data_tree_problems += !is_gc_tree; - gc_tree_problems += is_gc_tree; - } - - if (header_bytes < (int)sizeof(long) || - (size_t)header_bytes >= envinfo.mi_dxb_pagesize - sizeof(long)) { - problem_add("page", pgno, "illegal header-length", - "%s-page: %" PRIuPTR " < %" PRIuPTR " < %" PRIuPTR, - pagetype_caption, sizeof(long), header_bytes, - envinfo.mi_dxb_pagesize - sizeof(long)); - data_tree_problems += !is_gc_tree; - gc_tree_problems += is_gc_tree; - } - if (payload_bytes < 1) { - if (nentries > 1) { - problem_add("page", pgno, "zero size-of-entry", - "%s-page: payload %" PRIuPTR " bytes, %" PRIuPTR " entries", - pagetype_caption, payload_bytes, nentries); - /* if ((size_t)header_bytes + unused_bytes < page_size) { - // LY: hush a misuse error - page_bytes = page_size; - } */ - data_tree_problems += !is_gc_tree; - gc_tree_problems += is_gc_tree; - } else { - problem_add("page", pgno, "empty", - "%s-page: payload %" PRIuPTR " bytes, %" PRIuPTR - " entries, deep %i", - pagetype_caption, payload_bytes, nentries, deep); - dbi->pages.empty += 1; - data_tree_problems += !is_gc_tree; - gc_tree_problems += is_gc_tree; - } - } - - if (pgnumber) { - if (page_bytes != page_size) { - 
problem_add("page", pgno, "misused", - "%s-page: %" PRIuPTR " != %" PRIuPTR " (%" PRIuPTR - "h + %" PRIuPTR "p + %" PRIuPTR "u), deep %i", - pagetype_caption, page_size, page_bytes, header_bytes, - payload_bytes, unused_bytes, deep); - if (page_size > page_bytes) - dbi->lost_bytes += page_size - page_bytes; - data_tree_problems += !is_gc_tree; - gc_tree_problems += is_gc_tree; - } else { - dbi->payload_bytes += (uint64_t)payload_bytes + header_bytes; - walk.total_payload_bytes += (uint64_t)payload_bytes + header_bytes; - } - } - } - - return check_user_break(); + chk_stage = stage; + anchor_lineno = line_count; + flush(); + return MDBX_SUCCESS; } -typedef int(visitor)(const uint64_t record_number, const MDBX_val *key, - const MDBX_val *data); -static int process_db(MDBX_dbi dbi_handle, const MDBX_val *dbi_name, - visitor *handler); - -static int handle_userdb(const uint64_t record_number, const MDBX_val *key, - const MDBX_val *data) { - (void)record_number; - (void)key; - (void)data; - return check_user_break(); +static int conclude(MDBX_chk_context_t *ctx); +static int stage_end(MDBX_chk_context_t *ctx, enum MDBX_chk_stage stage, + int err) { + if (stage == MDBX_chk_conclude && !err) + err = conclude(ctx); + suffix(anchor_lineno, err ? 
"error(s)" : "done"); + flush(); + chk_stage = MDBX_chk_none; + return err; } -static int handle_freedb(const uint64_t record_number, const MDBX_val *key, - const MDBX_val *data) { - char *bad = ""; - pgno_t *iptr = data->iov_base; - - if (key->iov_len != sizeof(txnid_t)) - problem_add("entry", record_number, "wrong txn-id size", - "key-size %" PRIiPTR, key->iov_len); - else { - txnid_t txnid; - memcpy(&txnid, key->iov_base, sizeof(txnid)); - if (txnid < 1 || txnid > envinfo.mi_recent_txnid) - problem_add("entry", record_number, "wrong txn-id", "%" PRIaTXN, txnid); - else { - if (data->iov_len < sizeof(pgno_t) || data->iov_len % sizeof(pgno_t)) - problem_add("entry", txnid, "wrong idl size", "%" PRIuPTR, - data->iov_len); - size_t number = (data->iov_len >= sizeof(pgno_t)) ? *iptr++ : 0; - if (number < 1 || number > MDBX_PGL_LIMIT) - problem_add("entry", txnid, "wrong idl length", "%" PRIuPTR, number); - else if ((number + 1) * sizeof(pgno_t) > data->iov_len) { - problem_add("entry", txnid, "trimmed idl", - "%" PRIuSIZE " > %" PRIuSIZE " (corruption)", - (number + 1) * sizeof(pgno_t), data->iov_len); - number = data->iov_len / sizeof(pgno_t) - 1; - } else if (data->iov_len - (number + 1) * sizeof(pgno_t) >= - /* LY: allow gap up to one page. it is ok - * and better than shink-and-retry inside update_gc() */ - envinfo.mi_dxb_pagesize) - problem_add("entry", txnid, "extra idl space", - "%" PRIuSIZE " < %" PRIuSIZE " (minor, not a trouble)", - (number + 1) * sizeof(pgno_t), data->iov_len); - - gc_pages += number; - if (envinfo.mi_latter_reader_txnid > txnid) - reclaimable_pages += number; - - pgno_t prev = MDBX_PNL_ASCENDING ? 
NUM_METAS - 1 : txn->mt_next_pgno; - pgno_t span = 1; - for (size_t i = 0; i < number; ++i) { - if (check_user_break()) - return MDBX_EINTR; - const pgno_t pgno = iptr[i]; - if (pgno < NUM_METAS) - problem_add("entry", txnid, "wrong idl entry", - "pgno %" PRIaPGNO " < meta-pages %u", pgno, NUM_METAS); - else if (pgno >= backed_pages) - problem_add("entry", txnid, "wrong idl entry", - "pgno %" PRIaPGNO " > backed-pages %" PRIu64, pgno, - backed_pages); - else if (pgno >= alloc_pages) - problem_add("entry", txnid, "wrong idl entry", - "pgno %" PRIaPGNO " > alloc-pages %" PRIu64, pgno, - alloc_pages - 1); - else { - if (MDBX_PNL_DISORDERED(prev, pgno)) { - bad = " [bad sequence]"; - problem_add("entry", txnid, "bad sequence", - "%" PRIaPGNO " %c [%zu].%" PRIaPGNO, prev, - (prev == pgno) ? '=' : (MDBX_PNL_ASCENDING ? '>' : '<'), - i, pgno); - } - if (walk.pagemap) { - int idx = walk.pagemap[pgno]; - if (idx == 0) - walk.pagemap[pgno] = -1; - else if (idx > 0) - problem_add("page", pgno, "already used", "by %s", - sdb_name(&walk.dbi[idx - 1].name)); - else - problem_add("page", pgno, "already listed in GC", nullptr); - } - } - prev = pgno; - while (i + span < number && - iptr[i + span] == (MDBX_PNL_ASCENDING ? pgno_add(pgno, span) - : pgno_sub(pgno, span))) - ++span; - } - if (verbose > 3 && !only_subdb.iov_base) { - print(" transaction %" PRIaTXN ", %" PRIuPTR - " pages, maxspan %" PRIaPGNO "%s\n", - txnid, number, span, bad); - if (verbose > 4) { - for (size_t i = 0; i < number; i += span) { - const pgno_t pgno = iptr[i]; - for (span = 1; - i + span < number && - iptr[i + span] == (MDBX_PNL_ASCENDING ? 
pgno_add(pgno, span) - : pgno_sub(pgno, span)); - ++span) - ; - if (span > 1) { - print(" %9" PRIaPGNO "[%" PRIaPGNO "]\n", pgno, span); - } else - print(" %9" PRIaPGNO "\n", pgno); - } - } - } - } +static MDBX_chk_line_t *print_begin(MDBX_chk_context_t *ctx, + enum MDBX_chk_severity severity) { + (void)ctx; + if (silently(severity)) + return nullptr; + if (line_struct.ctx) { + if (line_struct.severity == MDBX_chk_processing && + severity >= MDBX_chk_result && severity <= MDBX_chk_resolution && + line_output) + fputc(' ', line_output); + else + lf(); + line_struct.ctx = nullptr; } - - return check_user_break(); + line_struct.severity = severity; + return &line_struct; } -static int equal_or_greater(const MDBX_val *a, const MDBX_val *b) { - return eq(*a, *b) ? 0 : 1; +static void print_flush(MDBX_chk_line_t *line) { + (void)line; + flush(); } -static int handle_maindb(const uint64_t record_number, const MDBX_val *key, - const MDBX_val *data) { - if (data->iov_len == sizeof(MDBX_db)) { - int rc = process_db(~0u, key, handle_userdb); - if (rc != MDBX_INCOMPATIBLE) { - userdb_count++; - return rc; - } - } - return handle_userdb(record_number, key, data); +static void print_done(MDBX_chk_line_t *line) { + lf(); + line->ctx = nullptr; } -static const char *db_flags2keymode(unsigned flags) { - flags &= (MDBX_REVERSEKEY | MDBX_INTEGERKEY); - switch (flags) { - case 0: - return "usual"; - case MDBX_REVERSEKEY: - return "reserve"; - case MDBX_INTEGERKEY: - return "ordinal"; - case MDBX_REVERSEKEY | MDBX_INTEGERKEY: - return "msgpack"; - default: - assert(false); - __unreachable(); - } +static void print_chars(MDBX_chk_line_t *line, const char *str, size_t len) { + if (line->empty) + prefix(line->severity); + fwrite(str, 1, len, line_output); } -static const char *db_flags2valuemode(unsigned flags) { - flags &= (MDBX_DUPSORT | MDBX_REVERSEDUP | MDBX_DUPFIXED | MDBX_INTEGERDUP); - switch (flags) { - case 0: - return "single"; - case MDBX_DUPSORT: - return "multi"; - case 
MDBX_REVERSEDUP: - case MDBX_DUPSORT | MDBX_REVERSEDUP: - return "multi-reverse"; - case MDBX_DUPFIXED: - case MDBX_DUPSORT | MDBX_DUPFIXED: - return "multi-samelength"; - case MDBX_DUPFIXED | MDBX_REVERSEDUP: - case MDBX_DUPSORT | MDBX_DUPFIXED | MDBX_REVERSEDUP: - return "multi-reverse-samelength"; - case MDBX_INTEGERDUP: - case MDBX_DUPSORT | MDBX_INTEGERDUP: - case MDBX_DUPSORT | MDBX_DUPFIXED | MDBX_INTEGERDUP: - case MDBX_DUPFIXED | MDBX_INTEGERDUP: - return "multi-ordinal"; - case MDBX_INTEGERDUP | MDBX_REVERSEDUP: - case MDBX_DUPSORT | MDBX_INTEGERDUP | MDBX_REVERSEDUP: - return "multi-msgpack"; - case MDBX_DUPFIXED | MDBX_INTEGERDUP | MDBX_REVERSEDUP: - case MDBX_DUPSORT | MDBX_DUPFIXED | MDBX_INTEGERDUP | MDBX_REVERSEDUP: - return "reserved"; - default: - assert(false); - __unreachable(); - } +static void print_format(MDBX_chk_line_t *line, const char *fmt, va_list args) { + if (line->empty) + prefix(line->severity); + vfprintf(line_output, fmt, args); } -static int process_db(MDBX_dbi dbi_handle, const MDBX_val *dbi_name, - visitor *handler) { - MDBX_cursor *mc; - MDBX_stat ms; - MDBX_val key, data; - MDBX_val prev_key, prev_data; - unsigned flags; - int rc, i; - struct problem *saved_list; - uint64_t problems_count; - const bool second_pass = dbi_handle == MAIN_DBI; - - uint64_t record_count = 0, dups = 0; - uint64_t key_bytes = 0, data_bytes = 0; - - if ((MDBX_TXN_FINISHED | MDBX_TXN_ERROR) & mdbx_txn_flags(txn)) { - print(" ! abort processing %s due to a previous error\n", - sdb_name(dbi_name)); - return MDBX_BAD_TXN; - } - - if (dbi_handle == ~0u) { - rc = mdbx_dbi_open_ex2( - txn, dbi_name, MDBX_DB_ACCEDE, &dbi_handle, - (dbi_name && ignore_wrong_order) ? equal_or_greater : nullptr, - (dbi_name && ignore_wrong_order) ? equal_or_greater : nullptr); - if (rc) { - if (!dbi_name || - rc != - MDBX_INCOMPATIBLE) /* LY: mainDB's record is not a user's DB. 
*/ { - error("mdbx_dbi_open(%s) failed, error %d %s\n", sdb_name(dbi_name), rc, - mdbx_strerror(rc)); - } - return rc; - } - } - - if (dbi_handle >= CORE_DBS && dbi_name && only_subdb.iov_base && - !eq(only_subdb, *dbi_name)) { - if (verbose) { - print("Skip processing %s...\n", sdb_name(dbi_name)); - fflush(nullptr); - } - skipped_subdb++; - return MDBX_SUCCESS; - } - - if (!second_pass && verbose) - print("Processing %s...\n", sdb_name(dbi_name)); - fflush(nullptr); - - rc = mdbx_dbi_flags(txn, dbi_handle, &flags); - if (rc) { - error("mdbx_dbi_flags() failed, error %d %s\n", rc, mdbx_strerror(rc)); - return rc; - } - - rc = mdbx_dbi_stat(txn, dbi_handle, &ms, sizeof(ms)); - if (rc) { - error("mdbx_dbi_stat() failed, error %d %s\n", rc, mdbx_strerror(rc)); - return rc; - } - - if (!second_pass && verbose) { - print(" - key-value kind: %s-key => %s-value", db_flags2keymode(flags), - db_flags2valuemode(flags)); - if (verbose > 1) { - print(", flags:"); - if (!flags) - print(" none"); - else { - for (i = 0; dbflags[i].bit; i++) - if (flags & dbflags[i].bit) - print(" %s", dbflags[i].name); - } - if (verbose > 2) - print(" (0x%02X), dbi-id %d", flags, dbi_handle); - } - print("\n"); - if (ms.ms_mod_txnid) - print(" - last modification txn#%" PRIu64 "\n", ms.ms_mod_txnid); - if (verbose > 1) { - print(" - page size %u, entries %" PRIu64 "\n", ms.ms_psize, - ms.ms_entries); - print(" - b-tree depth %u, pages: branch %" PRIu64 ", leaf %" PRIu64 - ", overflow %" PRIu64 "\n", - ms.ms_depth, ms.ms_branch_pages, ms.ms_leaf_pages, - ms.ms_overflow_pages); - } - } - - walk_dbi_t *dbi = (dbi_handle < CORE_DBS) - ? 
&walk.dbi[dbi_handle] - : pagemap_lookup_dbi(dbi_name, true); - if (!dbi) { - error("too many DBIs or out of memory\n"); - return MDBX_ENOMEM; - } - if (!dont_traversal) { - const uint64_t subtotal_pages = - ms.ms_branch_pages + ms.ms_leaf_pages + ms.ms_overflow_pages; - if (subtotal_pages != dbi->pages.total) - error("%s pages mismatch (%" PRIu64 " != walked %" PRIu64 ")\n", - "subtotal", subtotal_pages, dbi->pages.total); - if (ms.ms_branch_pages != dbi->pages.branch) - error("%s pages mismatch (%" PRIu64 " != walked %" PRIu64 ")\n", "branch", - ms.ms_branch_pages, dbi->pages.branch); - const uint64_t allleaf_pages = dbi->pages.leaf + dbi->pages.leaf_dupfixed; - if (ms.ms_leaf_pages != allleaf_pages) - error("%s pages mismatch (%" PRIu64 " != walked %" PRIu64 ")\n", - "all-leaf", ms.ms_leaf_pages, allleaf_pages); - if (ms.ms_overflow_pages != dbi->pages.large_volume) - error("%s pages mismatch (%" PRIu64 " != walked %" PRIu64 ")\n", - "large/overlow", ms.ms_overflow_pages, dbi->pages.large_volume); - } - rc = mdbx_cursor_open(txn, dbi_handle, &mc); - if (rc) { - error("mdbx_cursor_open() failed, error %d %s\n", rc, mdbx_strerror(rc)); - return rc; - } - - if (ignore_wrong_order) { /* for debugging with enabled assertions */ - mc->mc_checking |= CC_SKIPORD; - if (mc->mc_xcursor) - mc->mc_xcursor->mx_cursor.mc_checking |= CC_SKIPORD; - } - - const size_t maxkeysize = mdbx_env_get_maxkeysize_ex(env, flags); - saved_list = problems_push(); - prev_key.iov_base = nullptr; - prev_key.iov_len = 0; - prev_data.iov_base = nullptr; - prev_data.iov_len = 0; - rc = mdbx_cursor_get(mc, &key, &data, MDBX_FIRST); - while (rc == MDBX_SUCCESS) { - rc = check_user_break(); - if (rc) - goto bailout; - - if (!second_pass) { - bool bad_key = false; - if (key.iov_len > maxkeysize) { - problem_add("entry", record_count, "key length exceeds max-key-size", - "%" PRIuPTR " > %" PRIuPTR, key.iov_len, maxkeysize); - bad_key = true; - } else if ((flags & MDBX_INTEGERKEY) && key.iov_len != 
sizeof(uint64_t) && - key.iov_len != sizeof(uint32_t)) { - problem_add("entry", record_count, "wrong key length", - "%" PRIuPTR " != 4or8", key.iov_len); - bad_key = true; - } - - bool bad_data = false; - if ((flags & MDBX_INTEGERDUP) && data.iov_len != sizeof(uint64_t) && - data.iov_len != sizeof(uint32_t)) { - problem_add("entry", record_count, "wrong data length", - "%" PRIuPTR " != 4or8", data.iov_len); - bad_data = true; - } - - if (prev_key.iov_base) { - if (prev_data.iov_base && !bad_data && (flags & MDBX_DUPFIXED) && - prev_data.iov_len != data.iov_len) { - problem_add("entry", record_count, "different data length", - "%" PRIuPTR " != %" PRIuPTR, prev_data.iov_len, - data.iov_len); - bad_data = true; - } - - if (!bad_key) { - int cmp = mdbx_cmp(txn, dbi_handle, &key, &prev_key); - if (cmp == 0) { - ++dups; - if ((flags & MDBX_DUPSORT) == 0) { - problem_add("entry", record_count, "duplicated entries", nullptr); - if (prev_data.iov_base && data.iov_len == prev_data.iov_len && - memcmp(data.iov_base, prev_data.iov_base, data.iov_len) == - 0) { - problem_add("entry", record_count, "complete duplicate", - nullptr); - } - } else if (!bad_data && prev_data.iov_base) { - cmp = mdbx_dcmp(txn, dbi_handle, &data, &prev_data); - if (cmp == 0) { - problem_add("entry", record_count, "complete duplicate", - nullptr); - } else if (cmp < 0 && !ignore_wrong_order) { - problem_add("entry", record_count, - "wrong order of multi-values", nullptr); - } - } - } else if (cmp < 0 && !ignore_wrong_order) { - problem_add("entry", record_count, "wrong order of entries", - nullptr); - } - } - } - - if (!bad_key) { - if (verbose && (flags & MDBX_INTEGERKEY) && !prev_key.iov_base) - print(" - fixed key-size %" PRIuPTR "\n", key.iov_len); - prev_key = key; - } - if (!bad_data) { - if (verbose && (flags & (MDBX_INTEGERDUP | MDBX_DUPFIXED)) && - !prev_data.iov_base) - print(" - fixed data-size %" PRIuPTR "\n", data.iov_len); - prev_data = data; - } - } - - if (handler) { - rc = 
handler(record_count, &key, &data); - if (MDBX_IS_ERROR(rc)) - goto bailout; - } - - record_count++; - key_bytes += key.iov_len; - data_bytes += data.iov_len; - - rc = mdbx_cursor_get(mc, &key, &data, MDBX_NEXT); - } - if (rc != MDBX_NOTFOUND) - error("mdbx_cursor_get() failed, error %d %s\n", rc, mdbx_strerror(rc)); - else - rc = 0; - - if (record_count != ms.ms_entries) - problem_add("entry", record_count, "different number of entries", - "%" PRIu64 " != %" PRIu64, record_count, ms.ms_entries); -bailout: - problems_count = problems_pop(saved_list); - if (!second_pass && verbose) { - print(" - summary: %" PRIu64 " records, %" PRIu64 " dups, %" PRIu64 - " key's bytes, %" PRIu64 " data's " - "bytes, %" PRIu64 " problems\n", - record_count, dups, key_bytes, data_bytes, problems_count); - fflush(nullptr); - } - - mdbx_cursor_close(mc); - return (rc || problems_count) ? MDBX_RESULT_TRUE : MDBX_SUCCESS; -} +static const MDBX_chk_callbacks_t cb = {.check_break = check_break, + .scope_push = scope_push, + .scope_pop = scope_pop, + .subdb_filter = subdb_filter, + .stage_begin = stage_begin, + .stage_end = stage_end, + .print_begin = print_begin, + .print_flush = print_flush, + .print_done = print_done, + .print_chars = print_chars, + .print_format = print_format}; static void usage(char *prog) { fprintf( @@ -5145,7 +4577,7 @@ static void usage(char *prog) { "usage: %s " "[-V] [-v] [-q] [-c] [-0|1|2] [-w] [-d] [-i] [-s subdb] [-u|U] dbpath\n" " -V\t\tprint version and exit\n" - " -v\t\tmore verbose, could be used multiple times\n" + " -v\t\tmore verbose, could be repeated upto 9 times for extra details\n" " -q\t\tbe quiet\n" " -c\t\tforce cooperative mode (don't try exclusive)\n" " -w\t\twrite-mode checking\n" @@ -5161,144 +4593,68 @@ static void usage(char *prog) { exit(EXIT_INTERRUPTED); } -static bool meta_ot(txnid_t txn_a, uint64_t sign_a, txnid_t txn_b, - uint64_t sign_b, const bool wanna_steady) { - if (txn_a == txn_b) - return SIGN_IS_STEADY(sign_b); - - if 
(wanna_steady && SIGN_IS_STEADY(sign_a) != SIGN_IS_STEADY(sign_b)) - return SIGN_IS_STEADY(sign_b); - - return txn_a < txn_b; -} - -static bool meta_eq(txnid_t txn_a, uint64_t sign_a, txnid_t txn_b, - uint64_t sign_b) { - if (!txn_a || txn_a != txn_b) - return false; - - if (SIGN_IS_STEADY(sign_a) != SIGN_IS_STEADY(sign_b)) - return false; - - return true; -} - -static int meta_recent(const bool wanna_steady) { - if (meta_ot(envinfo.mi_meta0_txnid, envinfo.mi_meta0_sign, - envinfo.mi_meta1_txnid, envinfo.mi_meta1_sign, wanna_steady)) - return meta_ot(envinfo.mi_meta2_txnid, envinfo.mi_meta2_sign, - envinfo.mi_meta1_txnid, envinfo.mi_meta1_sign, wanna_steady) - ? 1 - : 2; - else - return meta_ot(envinfo.mi_meta0_txnid, envinfo.mi_meta0_sign, - envinfo.mi_meta2_txnid, envinfo.mi_meta2_sign, wanna_steady) - ? 2 - : 0; -} - -static int meta_tail(int head) { - switch (head) { - case 0: - return meta_ot(envinfo.mi_meta1_txnid, envinfo.mi_meta1_sign, - envinfo.mi_meta2_txnid, envinfo.mi_meta2_sign, true) - ? 1 - : 2; - case 1: - return meta_ot(envinfo.mi_meta0_txnid, envinfo.mi_meta0_sign, - envinfo.mi_meta2_txnid, envinfo.mi_meta2_sign, true) - ? 0 - : 2; - case 2: - return meta_ot(envinfo.mi_meta0_txnid, envinfo.mi_meta0_sign, - envinfo.mi_meta1_txnid, envinfo.mi_meta1_sign, true) - ? 0 - : 1; - default: - assert(false); - return -1; - } -} - -static int meta_head(void) { return meta_recent(false); } - -void verbose_meta(int num, txnid_t txnid, uint64_t sign, uint64_t bootid_x, - uint64_t bootid_y) { - const bool have_bootid = (bootid_x | bootid_y) != 0; - const bool bootid_match = bootid_x == envinfo.mi_bootid.current.x && - bootid_y == envinfo.mi_bootid.current.y; - - print(" - meta-%d: ", num); - switch (sign) { - case MDBX_DATASIGN_NONE: - print("no-sync/legacy"); - break; - case MDBX_DATASIGN_WEAK: - print("weak-%s", bootid_match ? (have_bootid ? 
"intact (same boot-id)" - : "unknown (no boot-id") - : "dead"); - break; - default: - print("steady"); - break; +static int conclude(MDBX_chk_context_t *ctx) { + int err = MDBX_SUCCESS; + if (ctx->result.total_problems == 1 && ctx->result.problems_meta == 1 && + (chk_flags & + (MDBX_CHK_SKIP_BTREE_TRAVERSAL | MDBX_CHK_SKIP_KV_TRAVERSAL)) == 0 && + (env_flags & MDBX_RDONLY) == 0 && !only_subdb.iov_base && + stuck_meta < 0 && ctx->result.steady_txnid < ctx->result.recent_txnid) { + const size_t step_lineno = + print(MDBX_chk_resolution, + "Perform sync-to-disk for make steady checkpoint" + " at txn-id #%" PRIi64 "...", + ctx->result.recent_txnid); + flush(); + err = error_fn("mdbx_env_pgwalk", mdbx_env_sync_ex(ctx->env, true, false)); + if (err == MDBX_SUCCESS) { + ctx->result.problems_meta -= 1; + ctx->result.total_problems -= 1; + suffix(step_lineno, "done"); + } } - print(" txn#%" PRIu64, txnid); - - const int head = meta_head(); - if (num == head) - print(", head"); - else if (num == meta_tail(head)) - print(", tail"); - else - print(", stay"); - if (stuck_meta >= 0) { - if (num == stuck_meta) - print(", forced for checking"); - } else if (txnid > envinfo.mi_recent_txnid && - (envflags & (MDBX_EXCLUSIVE | MDBX_RDONLY)) == MDBX_EXCLUSIVE) - print(", rolled-back %" PRIu64 " (%" PRIu64 " >>> %" PRIu64 ")", - txnid - envinfo.mi_recent_txnid, txnid, envinfo.mi_recent_txnid); - print("\n"); -} - -static uint64_t get_meta_txnid(const unsigned meta_id) { - switch (meta_id) { - default: - assert(false); - error("unexpected meta_id %u\n", meta_id); - return 0; - case 0: - return envinfo.mi_meta0_txnid; - case 1: - return envinfo.mi_meta1_txnid; - case 2: - return envinfo.mi_meta2_txnid; + if (turn_meta && stuck_meta >= 0 && + (chk_flags & + (MDBX_CHK_SKIP_BTREE_TRAVERSAL | MDBX_CHK_SKIP_KV_TRAVERSAL)) == 0 && + !only_subdb.iov_base && + (env_flags & (MDBX_RDONLY | MDBX_EXCLUSIVE)) == MDBX_EXCLUSIVE) { + const bool successful_check = + (err | ctx->result.total_problems | 
ctx->result.problems_meta) == 0; + if (successful_check || force_turn_meta) { + const size_t step_lineno = print( + MDBX_chk_resolution, + "Performing turn to the specified meta-page (%d) due to %s!", + stuck_meta, + successful_check ? "successful check" : "the -T option was given"); + flush(); + err = mdbx_env_turn_for_recovery(ctx->env, stuck_meta); + if (err != MDBX_SUCCESS) + error_fn("mdbx_env_turn_for_recovery", err); + else + suffix(step_lineno, "done"); + } else { + print(MDBX_chk_resolution, + "Skipping turn to the specified meta-page (%d) due to " + "unsuccessful check!", + stuck_meta); + lf_flush(); + } } -} -static void print_size(const char *prefix, const uint64_t value, - const char *suffix) { - const char sf[] = - "KMGTPEZY"; /* LY: Kilo, Mega, Giga, Tera, Peta, Exa, Zetta, Yotta! */ - double k = 1024.0; - size_t i; - for (i = 0; sf[i + 1] && value / k > 1000.0; ++i) - k *= 1024; - print("%s%" PRIu64 " (%.2f %cb)%s", prefix, value, value / k, sf[i], suffix); + return err; } int main(int argc, char *argv[]) { int rc; char *prog = argv[0]; char *envname; - unsigned problems_maindb = 0, problems_freedb = 0, problems_meta = 0; - bool write_locked = false; - bool turn_meta = false; - bool force_turn_meta = false; bool warmup = false; MDBX_warmup_flags_t warmup_flags = MDBX_warmup_default; + if (argc < 2) + usage(prog); + double elapsed; #if defined(_WIN32) || defined(_WIN64) uint64_t timestamp_start, timestamp_finish; @@ -5306,20 +4662,11 @@ int main(int argc, char *argv[]) { #else struct timespec timestamp_start, timestamp_finish; if (clock_gettime(CLOCK_MONOTONIC, ×tamp_start)) { - rc = errno; - error("clock_gettime() failed, error %d %s\n", rc, mdbx_strerror(rc)); + error_fn("clock_gettime", errno); return EXIT_FAILURE_SYS; } #endif - dbi_meta.name.iov_base = MDBX_PGWALK_META; - dbi_free.name.iov_base = MDBX_PGWALK_GC; - dbi_main.name.iov_base = MDBX_PGWALK_MAIN; - atexit(pagemap_cleanup); - - if (argc < 2) - usage(prog); - for (int i; (i = 
getopt(argc, argv, "uU" "0" @@ -5352,7 +4699,16 @@ int main(int argc, char *argv[]) { mdbx_build.options); return EXIT_SUCCESS; case 'v': - verbose++; + if (verbose >= 9 && 0) + usage(prog); + else { + verbose += 1; + if (verbose == 0 && !MDBX_DEBUG) + printf("Verbosity level %u exposures only to" + " a debug/extra-logging-enabled builds (with NDEBUG undefined" + " or MDBX_DEBUG > 0)\n", + verbose); + } break; case '0': stuck_meta = 0; @@ -5369,8 +4725,6 @@ int main(int argc, char *argv[]) { case 'T': turn_meta = force_turn_meta = true; quiet = false; - if (verbose < 2) - verbose = 2; break; case 'q': quiet = true; @@ -5378,27 +4732,30 @@ int main(int argc, char *argv[]) { case 'n': break; case 'w': - envflags &= ~MDBX_RDONLY; + env_flags &= ~MDBX_RDONLY; + chk_flags |= MDBX_CHK_READWRITE; #if MDBX_MMAP_INCOHERENT_FILE_WRITE /* Temporary `workaround` for OpenBSD kernel's flaw. * See https://libmdbx.dqdkfa.ru/dead-github/issues/67 */ - envflags |= MDBX_WRITEMAP; + env_flags |= MDBX_WRITEMAP; #endif /* MDBX_MMAP_INCOHERENT_FILE_WRITE */ break; case 'c': - envflags = (envflags & ~MDBX_EXCLUSIVE) | MDBX_ACCEDE; + env_flags = (env_flags & ~MDBX_EXCLUSIVE) | MDBX_ACCEDE; break; case 'd': - dont_traversal = true; + chk_flags |= MDBX_CHK_SKIP_BTREE_TRAVERSAL; break; case 's': if (only_subdb.iov_base && strcmp(only_subdb.iov_base, optarg)) usage(prog); - only_subdb.iov_base = optarg; - only_subdb.iov_len = strlen(optarg); + else { + only_subdb.iov_base = optarg; + only_subdb.iov_len = strlen(optarg); + } break; case 'i': - ignore_wrong_order = true; + chk_flags |= MDBX_CHK_IGNORE_ORDER; break; case 'u': warmup = true; @@ -5417,26 +4774,29 @@ int main(int argc, char *argv[]) { usage(prog); rc = MDBX_SUCCESS; - if (stuck_meta >= 0 && (envflags & MDBX_EXCLUSIVE) == 0) { - error("exclusive mode is required to using specific meta-page(%d) for " - "checking.\n", - stuck_meta); + if (stuck_meta >= 0 && (env_flags & MDBX_EXCLUSIVE) == 0) { + error_fmt("exclusive mode is required to 
using specific meta-page(%d) for " + "checking.", + stuck_meta); rc = EXIT_INTERRUPTED; } if (turn_meta) { if (stuck_meta < 0) { - error("meta-page must be specified (by -0, -1 or -2 options) to turn to " - "it.\n"); + error_fmt( + "meta-page must be specified (by -0, -1 or -2 options) to turn to " + "it."); rc = EXIT_INTERRUPTED; } - if (envflags & MDBX_RDONLY) { - error("write-mode must be enabled to turn to the specified meta-page.\n"); + if (env_flags & MDBX_RDONLY) { + error_fmt( + "write-mode must be enabled to turn to the specified meta-page."); rc = EXIT_INTERRUPTED; } - if (only_subdb.iov_base || dont_traversal) { - error( + if (only_subdb.iov_base || (chk_flags & (MDBX_CHK_SKIP_BTREE_TRAVERSAL | + MDBX_CHK_SKIP_KV_TRAVERSAL))) { + error_fmt( "whole database checking with b-tree traversal are required to turn " - "to the specified meta-page.\n"); + "to the specified meta-page."); rc = EXIT_INTERRUPTED; } } @@ -5457,13 +4817,19 @@ int main(int argc, char *argv[]) { #endif /* !WINDOWS */ envname = argv[optind]; - print("mdbx_chk %s (%s, T-%s)\nRunning for %s in 'read-%s' mode...\n", + print(MDBX_chk_result, + "mdbx_chk %s (%s, T-%s)\nRunning for %s in 'read-%s' mode with " + "verbosity level %u (%s)...", mdbx_version.git.describe, mdbx_version.git.datetime, mdbx_version.git.tree, envname, - (envflags & MDBX_RDONLY) ? "only" : "write"); - fflush(nullptr); - mdbx_setup_debug((verbose < MDBX_LOG_TRACE - 1) - ? (MDBX_log_level_t)(verbose + 1) + (env_flags & MDBX_RDONLY) ? "only" : "write", verbose, + (verbose > 8) + ? (MDBX_DEBUG ? "extra details for debugging" + : "same as 8 for non-debug builds with MDBX_DEBUG=0") + : "of 0..9"); + lf_flush(); + mdbx_setup_debug((verbose + MDBX_LOG_WARN < MDBX_LOG_TRACE) + ? 
(MDBX_log_level_t)(verbose + MDBX_LOG_WARN) : MDBX_LOG_TRACE, MDBX_DBG_DUMP | MDBX_DBG_ASSERT | MDBX_DBG_AUDIT | MDBX_DBG_LEGACY_OVERLAP | MDBX_DBG_DONT_UPGRADE, @@ -5471,22 +4837,22 @@ int main(int argc, char *argv[]) { rc = mdbx_env_create(&env); if (rc) { - error("mdbx_env_create() failed, error %d %s\n", rc, mdbx_strerror(rc)); + error_fn("mdbx_env_create", rc); return rc < 0 ? EXIT_FAILURE_MDBX : EXIT_FAILURE_SYS; } - rc = mdbx_env_set_maxdbs(env, MDBX_MAX_DBI); + rc = mdbx_env_set_maxdbs(env, CORE_DBS); if (rc) { - error("mdbx_env_set_maxdbs() failed, error %d %s\n", rc, mdbx_strerror(rc)); + error_fn("mdbx_env_set_maxdbs", rc); goto bailout; } if (stuck_meta >= 0) { rc = mdbx_env_open_for_recovery(env, envname, stuck_meta, - (envflags & MDBX_RDONLY) ? false : true); + (env_flags & MDBX_RDONLY) ? false : true); } else { - rc = mdbx_env_open(env, envname, envflags, 0); - if ((envflags & MDBX_EXCLUSIVE) && + rc = mdbx_env_open(env, envname, env_flags, 0); + if ((env_flags & MDBX_EXCLUSIVE) && (rc == MDBX_BUSY || #if defined(_WIN32) || defined(_WIN64) rc == ERROR_LOCK_VIOLATION || rc == ERROR_SHARING_VIOLATION @@ -5494,489 +4860,52 @@ int main(int argc, char *argv[]) { rc == EBUSY || rc == EAGAIN #endif )) { - envflags &= ~MDBX_EXCLUSIVE; - rc = mdbx_env_open(env, envname, envflags | MDBX_ACCEDE, 0); + env_flags &= ~MDBX_EXCLUSIVE; + rc = mdbx_env_open(env, envname, env_flags | MDBX_ACCEDE, 0); } } if (rc) { - error("mdbx_env_open() failed, error %d %s\n", rc, mdbx_strerror(rc)); - if (rc == MDBX_WANNA_RECOVERY && (envflags & MDBX_RDONLY)) - print("Please run %s in the read-write mode (with '-w' option).\n", prog); + error_fn("mdbx_env_open", rc); + if (rc == MDBX_WANNA_RECOVERY && (env_flags & MDBX_RDONLY)) + print_ln(MDBX_chk_result, + "Please run %s in the read-write mode (with '-w' option).", + prog); goto bailout; } - if (verbose) - print(" - %s mode\n", - (envflags & MDBX_EXCLUSIVE) ? 
"monopolistic" : "cooperative"); - - if ((envflags & (MDBX_RDONLY | MDBX_EXCLUSIVE)) == 0) { - if (verbose) { - print(" - taking write lock..."); - fflush(nullptr); - } - rc = mdbx_txn_lock(env, false); - if (rc != MDBX_SUCCESS) { - error("mdbx_txn_lock() failed, error %d %s\n", rc, mdbx_strerror(rc)); - goto bailout; - } - if (verbose) - print(" done\n"); - write_locked = true; - } + print_ln(MDBX_chk_verbose, "%s mode", + (env_flags & MDBX_EXCLUSIVE) ? "monopolistic" : "cooperative"); if (warmup) { - if (verbose) { - print(" - warming up..."); - fflush(nullptr); - } + anchor_lineno = print(MDBX_chk_verbose, "warming up..."); + flush(); rc = mdbx_env_warmup(env, nullptr, warmup_flags, 3600 * 65536); if (MDBX_IS_ERROR(rc)) { - error("mdbx_env_warmup(flags %u) failed, error %d %s\n", warmup_flags, rc, - mdbx_strerror(rc)); + error_fn("mdbx_env_warmup", rc); goto bailout; } - if (verbose) - print(" %s\n", rc ? "timeout" : "done"); - } - - rc = mdbx_txn_begin(env, nullptr, MDBX_TXN_RDONLY, &txn); - if (rc) { - error("mdbx_txn_begin() failed, error %d %s\n", rc, mdbx_strerror(rc)); - goto bailout; - } - - rc = mdbx_env_info_ex(env, txn, &envinfo, sizeof(envinfo)); - if (rc) { - error("mdbx_env_info_ex() failed, error %d %s\n", rc, mdbx_strerror(rc)); - goto bailout; - } - if (verbose) { - print(" - current boot-id "); - if (envinfo.mi_bootid.current.x | envinfo.mi_bootid.current.y) - print("%016" PRIx64 "-%016" PRIx64 "\n", envinfo.mi_bootid.current.x, - envinfo.mi_bootid.current.y); - else - print("unavailable\n"); + suffix(anchor_lineno, rc ? 
"timeout" : "done"); } - mdbx_filehandle_t dxb_fd; - rc = mdbx_env_get_fd(env, &dxb_fd); + rc = mdbx_env_chk(env, &cb, &chk, chk_flags, + MDBX_chk_result + (verbose << MDBX_chk_severity_prio_shift), + 0); if (rc) { - error("mdbx_env_get_fd() failed, error %d %s\n", rc, mdbx_strerror(rc)); - goto bailout; - } - - uint64_t dxb_filesize = 0; -#if defined(_WIN32) || defined(_WIN64) - { - BY_HANDLE_FILE_INFORMATION info; - if (!GetFileInformationByHandle(dxb_fd, &info)) - rc = GetLastError(); - else - dxb_filesize = info.nFileSizeLow | (uint64_t)info.nFileSizeHigh << 32; - } -#else - { - struct stat st; - STATIC_ASSERT_MSG(sizeof(off_t) <= sizeof(uint64_t), - "libmdbx requires 64-bit file I/O on 64-bit systems"); - if (fstat(dxb_fd, &st)) - rc = errno; - else - dxb_filesize = st.st_size; - } -#endif - if (rc) { - error("osal_filesize() failed, error %d %s\n", rc, mdbx_strerror(rc)); - goto bailout; - } - - errno = 0; - const uint64_t dxbfile_pages = dxb_filesize / envinfo.mi_dxb_pagesize; - alloc_pages = txn->mt_next_pgno; - backed_pages = envinfo.mi_geo.current / envinfo.mi_dxb_pagesize; - if (backed_pages > dxbfile_pages) { - print(" ! backed-pages %" PRIu64 " > file-pages %" PRIu64 "\n", - backed_pages, dxbfile_pages); - ++problems_meta; - } - if (dxbfile_pages < NUM_METAS) - print(" ! file-pages %" PRIu64 " < %u\n", dxbfile_pages, NUM_METAS); - if (backed_pages < NUM_METAS) - print(" ! backed-pages %" PRIu64 " < %u\n", backed_pages, NUM_METAS); - if (backed_pages < NUM_METAS || dxbfile_pages < NUM_METAS) - goto bailout; - if (backed_pages > MAX_PAGENO + 1) { - print(" ! backed-pages %" PRIu64 " > max-pages %" PRIaPGNO "\n", - backed_pages, MAX_PAGENO + 1); - ++problems_meta; - backed_pages = MAX_PAGENO + 1; - } - - if ((envflags & (MDBX_EXCLUSIVE | MDBX_RDONLY)) != MDBX_RDONLY) { - if (backed_pages > dxbfile_pages) { - print(" ! 
backed-pages %" PRIu64 " > file-pages %" PRIu64 "\n", - backed_pages, dxbfile_pages); - ++problems_meta; - backed_pages = dxbfile_pages; - } - if (alloc_pages > backed_pages) { - print(" ! alloc-pages %" PRIu64 " > backed-pages %" PRIu64 "\n", - alloc_pages, backed_pages); - ++problems_meta; - alloc_pages = backed_pages; - } - } else { - /* LY: DB may be shrunk by writer down to the allocated pages. */ - if (alloc_pages > backed_pages) { - print(" ! alloc-pages %" PRIu64 " > backed-pages %" PRIu64 "\n", - alloc_pages, backed_pages); - ++problems_meta; - alloc_pages = backed_pages; - } - if (alloc_pages > dxbfile_pages) { - print(" ! alloc-pages %" PRIu64 " > file-pages %" PRIu64 "\n", - alloc_pages, dxbfile_pages); - ++problems_meta; - alloc_pages = dxbfile_pages; - } - if (backed_pages > dxbfile_pages) - backed_pages = dxbfile_pages; - } - - if (verbose) { - print(" - pagesize %u (%u system), max keysize %d..%d" - ", max readers %u\n", - envinfo.mi_dxb_pagesize, envinfo.mi_sys_pagesize, - mdbx_env_get_maxkeysize_ex(env, MDBX_DUPSORT), - mdbx_env_get_maxkeysize_ex(env, 0), envinfo.mi_maxreaders); - print_size(" - mapsize ", envinfo.mi_mapsize, "\n"); - if (envinfo.mi_geo.lower == envinfo.mi_geo.upper) - print_size(" - fixed datafile: ", envinfo.mi_geo.current, ""); - else { - print_size(" - dynamic datafile: ", envinfo.mi_geo.lower, ""); - print_size(" .. ", envinfo.mi_geo.upper, ", "); - print_size("+", envinfo.mi_geo.grow, ", "); - print_size("-", envinfo.mi_geo.shrink, "\n"); - print_size(" - current datafile: ", envinfo.mi_geo.current, ""); - } - printf(", %" PRIu64 " pages\n", - envinfo.mi_geo.current / envinfo.mi_dxb_pagesize); -#if defined(_WIN32) || defined(_WIN64) - if (envinfo.mi_geo.shrink && envinfo.mi_geo.current != envinfo.mi_geo.upper) - print( - " WARNING: Due Windows system limitations a " - "file couldn't\n be truncated while the database " - "is opened. 
So, the size\n database file " - "of may by large than the database itself,\n " - "until it will be closed or reopened in read-write mode.\n"); -#endif - verbose_meta(0, envinfo.mi_meta0_txnid, envinfo.mi_meta0_sign, - envinfo.mi_bootid.meta0.x, envinfo.mi_bootid.meta0.y); - verbose_meta(1, envinfo.mi_meta1_txnid, envinfo.mi_meta1_sign, - envinfo.mi_bootid.meta1.x, envinfo.mi_bootid.meta1.y); - verbose_meta(2, envinfo.mi_meta2_txnid, envinfo.mi_meta2_sign, - envinfo.mi_bootid.meta2.x, envinfo.mi_bootid.meta2.y); - } - - if (stuck_meta >= 0) { - if (verbose) { - print(" - skip checking meta-pages since the %u" - " is selected for verification\n", - stuck_meta); - print(" - transactions: recent %" PRIu64 - ", selected for verification %" PRIu64 ", lag %" PRIi64 "\n", - envinfo.mi_recent_txnid, get_meta_txnid(stuck_meta), - envinfo.mi_recent_txnid - get_meta_txnid(stuck_meta)); - } - } else { - if (verbose > 1) - print(" - performs check for meta-pages clashes\n"); - if (meta_eq(envinfo.mi_meta0_txnid, envinfo.mi_meta0_sign, - envinfo.mi_meta1_txnid, envinfo.mi_meta1_sign)) { - print(" ! meta-%d and meta-%d are clashed\n", 0, 1); - ++problems_meta; - } - if (meta_eq(envinfo.mi_meta1_txnid, envinfo.mi_meta1_sign, - envinfo.mi_meta2_txnid, envinfo.mi_meta2_sign)) { - print(" ! meta-%d and meta-%d are clashed\n", 1, 2); - ++problems_meta; - } - if (meta_eq(envinfo.mi_meta2_txnid, envinfo.mi_meta2_sign, - envinfo.mi_meta0_txnid, envinfo.mi_meta0_sign)) { - print(" ! meta-%d and meta-%d are clashed\n", 2, 0); - ++problems_meta; - } - - const unsigned steady_meta_id = meta_recent(true); - const uint64_t steady_meta_txnid = get_meta_txnid(steady_meta_id); - const unsigned weak_meta_id = meta_recent(false); - const uint64_t weak_meta_txnid = get_meta_txnid(weak_meta_id); - if (envflags & MDBX_EXCLUSIVE) { - if (verbose > 1) - print(" - performs full check recent-txn-id with meta-pages\n"); - if (steady_meta_txnid != envinfo.mi_recent_txnid) { - print(" ! 
steady meta-%d txn-id mismatch recent-txn-id (%" PRIi64 - " != %" PRIi64 ")\n", - steady_meta_id, steady_meta_txnid, envinfo.mi_recent_txnid); - ++problems_meta; - } - } else if (write_locked) { - if (verbose > 1) - print(" - performs lite check recent-txn-id with meta-pages (not a " - "monopolistic mode)\n"); - if (weak_meta_txnid != envinfo.mi_recent_txnid) { - print(" ! weak meta-%d txn-id mismatch recent-txn-id (%" PRIi64 - " != %" PRIi64 ")\n", - weak_meta_id, weak_meta_txnid, envinfo.mi_recent_txnid); - ++problems_meta; - } - } else if (verbose) { - print(" - skip check recent-txn-id with meta-pages (monopolistic or " - "read-write mode only)\n"); - } - total_problems += problems_meta; - - if (verbose) - print(" - transactions: recent %" PRIu64 ", latter reader %" PRIu64 - ", lag %" PRIi64 "\n", - envinfo.mi_recent_txnid, envinfo.mi_latter_reader_txnid, - envinfo.mi_recent_txnid - envinfo.mi_latter_reader_txnid); - } - - if (!dont_traversal) { - struct problem *saved_list; - size_t traversal_problems; - uint64_t empty_pages, lost_bytes; - - print("Traversal b-tree by txn#%" PRIaTXN "...\n", txn->mt_txnid); - fflush(nullptr); - walk.pagemap = osal_calloc((size_t)backed_pages, sizeof(*walk.pagemap)); - if (!walk.pagemap) { - rc = errno ? 
errno : MDBX_ENOMEM; - error("calloc() failed, error %d %s\n", rc, mdbx_strerror(rc)); - goto bailout; - } - - saved_list = problems_push(); - rc = mdbx_env_pgwalk(txn, pgvisitor, nullptr, - true /* always skip key ordering checking to avoid - MDBX_CORRUPTED when using custom comparators */); - traversal_problems = problems_pop(saved_list); - - if (rc) { - if (rc != MDBX_EINTR || !check_user_break()) - error("mdbx_env_pgwalk() failed, error %d %s\n", rc, mdbx_strerror(rc)); - goto bailout; - } - - for (uint64_t n = 0; n < alloc_pages; ++n) - if (!walk.pagemap[n]) - unused_pages += 1; - - empty_pages = lost_bytes = 0; - for (walk_dbi_t *dbi = &dbi_main; - dbi < ARRAY_END(walk.dbi) && dbi->name.iov_base; ++dbi) { - empty_pages += dbi->pages.empty; - lost_bytes += dbi->lost_bytes; - } - - if (verbose) { - uint64_t total_page_bytes = walk.pgcount * envinfo.mi_dxb_pagesize; - print(" - pages: walked %" PRIu64 ", left/unused %" PRIu64 "\n", - walk.pgcount, unused_pages); - if (verbose > 1) { - for (walk_dbi_t *dbi = walk.dbi; - dbi < ARRAY_END(walk.dbi) && dbi->name.iov_base; ++dbi) { - print(" %s: subtotal %" PRIu64, sdb_name(&dbi->name), - dbi->pages.total); - if (dbi->pages.other && dbi->pages.other != dbi->pages.total) - print(", other %" PRIu64, dbi->pages.other); - if (dbi->pages.branch) - print(", branch %" PRIu64, dbi->pages.branch); - if (dbi->pages.large_count) - print(", large %" PRIu64, dbi->pages.large_count); - uint64_t all_leaf = dbi->pages.leaf + dbi->pages.leaf_dupfixed; - if (all_leaf) { - print(", leaf %" PRIu64, all_leaf); - if (verbose > 2 && - (dbi->pages.subleaf_dupsort | dbi->pages.leaf_dupfixed | - dbi->pages.subleaf_dupfixed)) - print(" (usual %" PRIu64 ", sub-dupsort %" PRIu64 - ", dupfixed %" PRIu64 ", sub-dupfixed %" PRIu64 ")", - dbi->pages.leaf, dbi->pages.subleaf_dupsort, - dbi->pages.leaf_dupfixed, dbi->pages.subleaf_dupfixed); - } - print("\n"); - } - } - - if (verbose > 1) - print(" - usage: total %" PRIu64 " bytes, payload %" PRIu64 - 
" (%.1f%%), unused " - "%" PRIu64 " (%.1f%%)\n", - total_page_bytes, walk.total_payload_bytes, - walk.total_payload_bytes * 100.0 / total_page_bytes, - total_page_bytes - walk.total_payload_bytes, - (total_page_bytes - walk.total_payload_bytes) * 100.0 / - total_page_bytes); - if (verbose > 2) { - for (walk_dbi_t *dbi = walk.dbi; - dbi < ARRAY_END(walk.dbi) && dbi->name.iov_base; ++dbi) - if (dbi->pages.total) { - uint64_t dbi_bytes = dbi->pages.total * envinfo.mi_dxb_pagesize; - print(" %s: subtotal %" PRIu64 " bytes (%.1f%%)," - " payload %" PRIu64 " (%.1f%%), unused %" PRIu64 " (%.1f%%)", - sdb_name(&dbi->name), dbi_bytes, - dbi_bytes * 100.0 / total_page_bytes, dbi->payload_bytes, - dbi->payload_bytes * 100.0 / dbi_bytes, - dbi_bytes - dbi->payload_bytes, - (dbi_bytes - dbi->payload_bytes) * 100.0 / dbi_bytes); - if (dbi->pages.empty) - print(", %" PRIu64 " empty pages", dbi->pages.empty); - if (dbi->lost_bytes) - print(", %" PRIu64 " bytes lost", dbi->lost_bytes); - print("\n"); - } else - print(" %s: empty\n", sdb_name(&dbi->name)); - } - print(" - summary: average fill %.1f%%", - walk.total_payload_bytes * 100.0 / total_page_bytes); - if (empty_pages) - print(", %" PRIu64 " empty pages", empty_pages); - if (lost_bytes) - print(", %" PRIu64 " bytes lost", lost_bytes); - print(", %" PRIuPTR " problems\n", traversal_problems); - } - } else if (verbose) { - print("Skipping b-tree walk...\n"); - fflush(nullptr); - } - - if (gc_tree_problems) { - print("Skip processing %s since %s is corrupted (%u problems)\n", "@GC", - "b-tree", gc_tree_problems); - problems_freedb = gc_tree_problems; - } else - problems_freedb = process_db(FREE_DBI, MDBX_PGWALK_GC, handle_freedb); - - if (verbose) { - uint64_t value = envinfo.mi_mapsize / envinfo.mi_dxb_pagesize; - double percent = value / 100.0; - print(" - space: %" PRIu64 " total pages", value); - print(", backed %" PRIu64 " (%.1f%%)", backed_pages, - backed_pages / percent); - print(", allocated %" PRIu64 " (%.1f%%)", 
alloc_pages, - alloc_pages / percent); - - if (verbose > 1) { - value = envinfo.mi_mapsize / envinfo.mi_dxb_pagesize - alloc_pages; - print(", remained %" PRIu64 " (%.1f%%)", value, value / percent); - - value = dont_traversal ? alloc_pages - gc_pages : walk.pgcount; - print(", used %" PRIu64 " (%.1f%%)", value, value / percent); - - print(", gc %" PRIu64 " (%.1f%%)", gc_pages, gc_pages / percent); - - value = gc_pages - reclaimable_pages; - print(", detained %" PRIu64 " (%.1f%%)", value, value / percent); - - print(", reclaimable %" PRIu64 " (%.1f%%)", reclaimable_pages, - reclaimable_pages / percent); - } - - value = envinfo.mi_mapsize / envinfo.mi_dxb_pagesize - alloc_pages + - reclaimable_pages; - print(", available %" PRIu64 " (%.1f%%)\n", value, value / percent); - } - - if ((problems_maindb = data_tree_problems) == 0 && problems_freedb == 0) { - if (!dont_traversal && - (envflags & (MDBX_EXCLUSIVE | MDBX_RDONLY)) != MDBX_RDONLY) { - if (walk.pgcount != alloc_pages - gc_pages) { - error("used pages mismatch (%" PRIu64 "(walked) != %" PRIu64 - "(allocated - GC))\n", - walk.pgcount, alloc_pages - gc_pages); - } - if (unused_pages != gc_pages) { - error("GC pages mismatch (%" PRIu64 "(expected) != %" PRIu64 "(GC))\n", - unused_pages, gc_pages); - } - } else if (verbose) { - print(" - skip check used and GC pages (btree-traversal with " - "monopolistic or read-write mode only)\n"); - } - - problems_maindb = process_db(~0u, /* MAIN_DBI */ nullptr, nullptr); - if (problems_maindb == 0) { - print("Scanning %s for %s...\n", "@MAIN", "sub-database(s)"); - if (!process_db(MAIN_DBI, nullptr, handle_maindb)) { - if (!userdb_count && verbose) - print(" - does not contain multiple databases\n"); - } - } else { - print("Skip processing %s since %s is corrupted (%u problems)\n", - "sub-database(s)", "@MAIN", problems_maindb); - } - } else { - print("Skip processing %s since %s is corrupted (%u problems)\n", "@MAIN", - "b-tree", data_tree_problems); - } - - if (rc == 0 && 
total_problems == 1 && problems_meta == 1 && !dont_traversal && - (envflags & MDBX_RDONLY) == 0 && !only_subdb.iov_base && stuck_meta < 0 && - get_meta_txnid(meta_recent(true)) < envinfo.mi_recent_txnid) { - print("Perform sync-to-disk for make steady checkpoint at txn-id #%" PRIi64 - "\n", - envinfo.mi_recent_txnid); - fflush(nullptr); - if (write_locked) { - mdbx_txn_unlock(env); - write_locked = false; - } - rc = mdbx_env_sync_ex(env, true, false); - if (rc != MDBX_SUCCESS) - error("mdbx_env_pgwalk() failed, error %d %s\n", rc, mdbx_strerror(rc)); - else { - total_problems -= 1; - problems_meta -= 1; - } - } - - if (turn_meta && stuck_meta >= 0 && !dont_traversal && !only_subdb.iov_base && - (envflags & (MDBX_RDONLY | MDBX_EXCLUSIVE)) == MDBX_EXCLUSIVE) { - const bool successful_check = (rc | total_problems | problems_meta) == 0; - if (successful_check || force_turn_meta) { - fflush(nullptr); - print(" = Performing turn to the specified meta-page (%d) due to %s!\n", - stuck_meta, - successful_check ? "successful check" : "the -T option was given"); - fflush(nullptr); - rc = mdbx_env_turn_for_recovery(env, stuck_meta); - if (rc != MDBX_SUCCESS) - error("mdbx_env_turn_for_recovery() failed, error %d %s\n", rc, - mdbx_strerror(rc)); - } else { - print(" = Skipping turn to the specified meta-page (%d) due to " - "unsuccessful check!\n", - stuck_meta); - } + if (chk.result.total_problems == 0) + error_fn("mdbx_env_chk", rc); + else if (rc != MDBX_EINTR && rc != MDBX_RESULT_TRUE && !user_break) + rc = 0; } bailout: - if (txn) - mdbx_txn_abort(txn); - if (write_locked) { - mdbx_txn_unlock(env); - write_locked = false; - } if (env) { - const bool dont_sync = rc != 0 || total_problems; + const bool dont_sync = rc != 0 || chk.result.total_problems || + (chk_flags & MDBX_CHK_READWRITE) == 0; mdbx_env_close_ex(env, dont_sync); } - fflush(nullptr); + flush(); if (rc) { - if (rc < 0) + if (rc > 0) return user_break ? 
EXIT_INTERRUPTED : EXIT_FAILURE_SYS; return EXIT_FAILURE_MDBX; } @@ -5986,21 +4915,24 @@ int main(int argc, char *argv[]) { elapsed = (timestamp_finish - timestamp_start) * 1e-3; #else if (clock_gettime(CLOCK_MONOTONIC, ×tamp_finish)) { - rc = errno; - error("clock_gettime() failed, error %d %s\n", rc, mdbx_strerror(rc)); + error_fn("clock_gettime", errno); return EXIT_FAILURE_SYS; } elapsed = timestamp_finish.tv_sec - timestamp_start.tv_sec + (timestamp_finish.tv_nsec - timestamp_start.tv_nsec) * 1e-9; #endif /* !WINDOWS */ - if (total_problems) { - print("Total %u error%s detected, elapsed %.3f seconds.\n", total_problems, - (total_problems > 1) ? "s are" : " is", elapsed); - if (problems_meta || problems_maindb || problems_freedb) + if (chk.result.total_problems) { + print_ln(MDBX_chk_result, + "Total %" PRIuSIZE " error%s detected, elapsed %.3f seconds.", + chk.result.total_problems, + (chk.result.total_problems > 1) ? "s are" : " is", elapsed); + if (chk.result.problems_meta || chk.result.problems_kv || + chk.result.problems_gc) return EXIT_FAILURE_CHECK_MAJOR; return EXIT_FAILURE_CHECK_MINOR; } - print("No error is detected, elapsed %.3f seconds\n", elapsed); + print_ln(MDBX_chk_result, "No error is detected, elapsed %.3f seconds.", + elapsed); return EXIT_SUCCESS; } diff --git a/mdbxdist/mdbx_copy.c b/mdbxdist/mdbx_copy.c index 8af7231..3b28417 100644 --- a/mdbxdist/mdbx_copy.c +++ b/mdbxdist/mdbx_copy.c @@ -1,7 +1,7 @@ /* mdbx_copy.c - memory-mapped database backup tool */ /* - * Copyright 2015-2023 Leonid Yuriev + * Copyright 2015-2024 Leonid Yuriev * and other libmdbx authors: please see AUTHORS file. * All rights reserved. * @@ -22,7 +22,7 @@ #define xMDBX_TOOLS /* Avoid using internal eASSERT() */ /* - * Copyright 2015-2023 Leonid Yuriev + * Copyright 2015-2024 Leonid Yuriev * and other libmdbx authors: please see AUTHORS file. * All rights reserved. * @@ -34,7 +34,7 @@ * top-level directory of the distribution or, alternatively, at * . 
*/ -#define MDBX_BUILD_SOURCERY 0c86daff919dc09ab5e99c8e5bfb1d51e1367babca9c630c97af92a2aa988ea9_v0_12_9_16_gfff3fbd8 +#define MDBX_BUILD_SOURCERY 91ff5b5423830ee44fca4b70dcb298f233338a17a3185c44df67ec16d3034af3_v0_13_0_38_gf1975363 #ifdef MDBX_CONFIG_H #include MDBX_CONFIG_H #endif @@ -47,11 +47,13 @@ #ifdef xMDBX_ALLOY /* Amalgamated build */ #define MDBX_INTERNAL_FUNC static -#define MDBX_INTERNAL_VAR static +#define MDBX_INTERNAL_VAR_PROTO static +#define MDBX_INTERNAL_VAR_INSTA static #else /* Non-amalgamated build */ #define MDBX_INTERNAL_FUNC -#define MDBX_INTERNAL_VAR extern +#define MDBX_INTERNAL_VAR_PROTO extern +#define MDBX_INTERNAL_VAR_INSTA #endif /* xMDBX_ALLOY */ /*----------------------------------------------------------------------------*/ @@ -116,6 +118,10 @@ disable : 5105) /* winbase.h(9531): warning C5105: macro expansion \ producing 'defined' has undefined behavior */ #endif +#if _MSC_VER < 1920 +/* avoid "error C2219: syntax error: type qualifier must be after '*'" */ +#define __restrict +#endif #if _MSC_VER > 1930 #pragma warning(disable : 6235) /* is always a constant */ #pragma warning(disable : 6237) /* is never evaluated and might \ @@ -161,7 +167,7 @@ #include "mdbx.h" /* - * Copyright 2015-2023 Leonid Yuriev + * Copyright 2015-2024 Leonid Yuriev * and other libmdbx authors: please see AUTHORS file. * All rights reserved. 
* @@ -209,6 +215,7 @@ #include #include +#include #include #include #include @@ -846,7 +853,7 @@ __extern_C key_t ftok(const char *, int); /*----------------------------------------------------------------------------*/ -#if defined(MDBX_USE_VALGRIND) +#if defined(ENABLE_MEMCHECK) #include #ifndef VALGRIND_DISABLE_ADDR_ERROR_REPORTING_IN_RANGE /* LY: available since Valgrind 3.10 */ @@ -868,7 +875,7 @@ __extern_C key_t ftok(const char *, int); #define VALGRIND_CHECK_MEM_IS_ADDRESSABLE(a, s) (0) #define VALGRIND_CHECK_MEM_IS_DEFINED(a, s) (0) #define RUNNING_ON_VALGRIND (0) -#endif /* MDBX_USE_VALGRIND */ +#endif /* ENABLE_MEMCHECK */ #ifdef __SANITIZE_ADDRESS__ #include @@ -1015,7 +1022,7 @@ extern "C" { /* https://en.wikipedia.org/wiki/Operating_system_abstraction_layer */ /* - * Copyright 2015-2023 Leonid Yuriev + * Copyright 2015-2024 Leonid Yuriev * and other libmdbx authors: please see AUTHORS file. * All rights reserved. * @@ -1219,8 +1226,8 @@ typedef pthread_mutex_t osal_fastmutex_t; /*----------------------------------------------------------------------------*/ /* OS abstraction layer stuff */ -MDBX_INTERNAL_VAR unsigned sys_pagesize; -MDBX_MAYBE_UNUSED MDBX_INTERNAL_VAR unsigned sys_pagesize_ln2, +MDBX_INTERNAL_VAR_PROTO unsigned sys_pagesize; +MDBX_MAYBE_UNUSED MDBX_INTERNAL_VAR_PROTO unsigned sys_pagesize_ln2, sys_allocation_granularity; /* Get the size of a memory page for the system. 
@@ -1484,8 +1491,9 @@ MDBX_MAYBE_UNUSED static __inline void jitter4testing(bool tiny); #endif #if defined(__linux__) || defined(__gnu_linux__) -MDBX_INTERNAL_VAR uint32_t linux_kernel_version; -MDBX_INTERNAL_VAR bool mdbx_RunningOnWSL1 /* Windows Subsystem 1 for Linux */; +MDBX_INTERNAL_VAR_PROTO uint32_t linux_kernel_version; +MDBX_INTERNAL_VAR_PROTO bool + mdbx_RunningOnWSL1 /* Windows Subsystem 1 for Linux */; #endif /* Linux */ #ifndef osal_strdup @@ -1699,7 +1707,8 @@ MDBX_INTERNAL_FUNC int osal_lck_init(MDBX_env *env, /// restore POSIX-fcntl locks after the closing of file descriptors. /// \return Error code (MDBX_PANIC) or zero on success. MDBX_INTERNAL_FUNC int osal_lck_destroy(MDBX_env *env, - MDBX_env *inprocess_neighbor); + MDBX_env *inprocess_neighbor, + const uint32_t current_pid); /// \brief Connects to shared interprocess locking objects and tries to acquire /// the maximum lock level (shared if exclusive is not available) @@ -1727,6 +1736,8 @@ MDBX_INTERNAL_FUNC int osal_lck_seize(MDBX_env *env); /// operational lock. /// \return Error code or zero on success MDBX_INTERNAL_FUNC int osal_lck_downgrade(MDBX_env *env); +MDBX_MAYBE_UNUSED MDBX_INTERNAL_FUNC int osal_lck_upgrade(MDBX_env *env, + bool dont_wait); /// \brief Locks LCK-file or/and table of readers for (de)registering. /// \return Error code or zero on success @@ -1735,16 +1746,12 @@ MDBX_INTERNAL_FUNC int osal_rdt_lock(MDBX_env *env); /// \brief Unlocks LCK-file or/and table of readers after (de)registering. MDBX_INTERNAL_FUNC void osal_rdt_unlock(MDBX_env *env); -/// \brief Acquires lock for DB change (on writing transaction start) -/// Reading transactions will not be blocked. -/// Declared as LIBMDBX_API because it is used in mdbx_chk. +/// \brief Acquires write-transaction lock. 
/// \return Error code or zero on success -LIBMDBX_API int mdbx_txn_lock(MDBX_env *env, bool dont_wait); +MDBX_INTERNAL_FUNC int osal_txn_lock(MDBX_env *env, bool dont_wait); -/// \brief Releases lock once DB changes is made (after writing transaction -/// has finished). -/// Declared as LIBMDBX_API because it is used in mdbx_chk. -LIBMDBX_API void mdbx_txn_unlock(MDBX_env *env); +/// \brief Releases write-transaction lock.. +MDBX_INTERNAL_FUNC void osal_txn_unlock(MDBX_env *env); /// \brief Sets alive-flag of reader presence (indicative lock) for PID of /// the current process. The function does no more than needed for @@ -1773,7 +1780,7 @@ MDBX_INTERNAL_FUNC int osal_rpid_check(MDBX_env *env, uint32_t pid); MDBX_INTERNAL_FUNC int osal_mb2w(const char *const src, wchar_t **const pdst); typedef void(WINAPI *osal_srwlock_t_function)(osal_srwlock_t *); -MDBX_INTERNAL_VAR osal_srwlock_t_function osal_srwlock_Init, +MDBX_INTERNAL_VAR_PROTO osal_srwlock_t_function osal_srwlock_Init, osal_srwlock_AcquireShared, osal_srwlock_ReleaseShared, osal_srwlock_AcquireExclusive, osal_srwlock_ReleaseExclusive; @@ -1826,7 +1833,7 @@ typedef struct _FILE_REMOTE_PROTOCOL_INFO { typedef BOOL(WINAPI *MDBX_GetFileInformationByHandleEx)( _In_ HANDLE hFile, _In_ FILE_INFO_BY_HANDLE_CLASS FileInformationClass, _Out_ LPVOID lpFileInformation, _In_ DWORD dwBufferSize); -MDBX_INTERNAL_VAR MDBX_GetFileInformationByHandleEx +MDBX_INTERNAL_VAR_PROTO MDBX_GetFileInformationByHandleEx mdbx_GetFileInformationByHandleEx; typedef BOOL(WINAPI *MDBX_GetVolumeInformationByHandleW)( @@ -1835,19 +1842,20 @@ typedef BOOL(WINAPI *MDBX_GetVolumeInformationByHandleW)( _Out_opt_ LPDWORD lpMaximumComponentLength, _Out_opt_ LPDWORD lpFileSystemFlags, _Out_opt_ LPWSTR lpFileSystemNameBuffer, _In_ DWORD nFileSystemNameSize); -MDBX_INTERNAL_VAR MDBX_GetVolumeInformationByHandleW +MDBX_INTERNAL_VAR_PROTO MDBX_GetVolumeInformationByHandleW mdbx_GetVolumeInformationByHandleW; typedef DWORD(WINAPI 
*MDBX_GetFinalPathNameByHandleW)(_In_ HANDLE hFile, _Out_ LPWSTR lpszFilePath, _In_ DWORD cchFilePath, _In_ DWORD dwFlags); -MDBX_INTERNAL_VAR MDBX_GetFinalPathNameByHandleW mdbx_GetFinalPathNameByHandleW; +MDBX_INTERNAL_VAR_PROTO MDBX_GetFinalPathNameByHandleW + mdbx_GetFinalPathNameByHandleW; typedef BOOL(WINAPI *MDBX_SetFileInformationByHandle)( _In_ HANDLE hFile, _In_ FILE_INFO_BY_HANDLE_CLASS FileInformationClass, _Out_ LPVOID lpFileInformation, _In_ DWORD dwBufferSize); -MDBX_INTERNAL_VAR MDBX_SetFileInformationByHandle +MDBX_INTERNAL_VAR_PROTO MDBX_SetFileInformationByHandle mdbx_SetFileInformationByHandle; typedef NTSTATUS(NTAPI *MDBX_NtFsControlFile)( @@ -1856,10 +1864,10 @@ typedef NTSTATUS(NTAPI *MDBX_NtFsControlFile)( OUT PIO_STATUS_BLOCK IoStatusBlock, IN ULONG FsControlCode, IN OUT PVOID InputBuffer, IN ULONG InputBufferLength, OUT OPTIONAL PVOID OutputBuffer, IN ULONG OutputBufferLength); -MDBX_INTERNAL_VAR MDBX_NtFsControlFile mdbx_NtFsControlFile; +MDBX_INTERNAL_VAR_PROTO MDBX_NtFsControlFile mdbx_NtFsControlFile; typedef uint64_t(WINAPI *MDBX_GetTickCount64)(void); -MDBX_INTERNAL_VAR MDBX_GetTickCount64 mdbx_GetTickCount64; +MDBX_INTERNAL_VAR_PROTO MDBX_GetTickCount64 mdbx_GetTickCount64; #if !defined(_WIN32_WINNT_WIN8) || _WIN32_WINNT < _WIN32_WINNT_WIN8 typedef struct _WIN32_MEMORY_RANGE_ENTRY { @@ -1871,13 +1879,13 @@ typedef struct _WIN32_MEMORY_RANGE_ENTRY { typedef BOOL(WINAPI *MDBX_PrefetchVirtualMemory)( HANDLE hProcess, ULONG_PTR NumberOfEntries, PWIN32_MEMORY_RANGE_ENTRY VirtualAddresses, ULONG Flags); -MDBX_INTERNAL_VAR MDBX_PrefetchVirtualMemory mdbx_PrefetchVirtualMemory; +MDBX_INTERNAL_VAR_PROTO MDBX_PrefetchVirtualMemory mdbx_PrefetchVirtualMemory; typedef enum _SECTION_INHERIT { ViewShare = 1, ViewUnmap = 2 } SECTION_INHERIT; typedef NTSTATUS(NTAPI *MDBX_NtExtendSection)(IN HANDLE SectionHandle, IN PLARGE_INTEGER NewSectionSize); -MDBX_INTERNAL_VAR MDBX_NtExtendSection mdbx_NtExtendSection; +MDBX_INTERNAL_VAR_PROTO 
MDBX_NtExtendSection mdbx_NtExtendSection; static __inline bool mdbx_RunningUnderWine(void) { return !mdbx_NtExtendSection; @@ -1887,14 +1895,15 @@ typedef LSTATUS(WINAPI *MDBX_RegGetValueA)(HKEY hkey, LPCSTR lpSubKey, LPCSTR lpValue, DWORD dwFlags, LPDWORD pdwType, PVOID pvData, LPDWORD pcbData); -MDBX_INTERNAL_VAR MDBX_RegGetValueA mdbx_RegGetValueA; +MDBX_INTERNAL_VAR_PROTO MDBX_RegGetValueA mdbx_RegGetValueA; NTSYSAPI ULONG RtlRandomEx(PULONG Seed); typedef BOOL(WINAPI *MDBX_SetFileIoOverlappedRange)(HANDLE FileHandle, PUCHAR OverlappedRangeStart, ULONG Length); -MDBX_INTERNAL_VAR MDBX_SetFileIoOverlappedRange mdbx_SetFileIoOverlappedRange; +MDBX_INTERNAL_VAR_PROTO MDBX_SetFileIoOverlappedRange + mdbx_SetFileIoOverlappedRange; #endif /* Windows */ @@ -2124,7 +2133,7 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor; #error MDBX_DPL_PREALLOC_FOR_RADIXSORT must be defined as 0 or 1 #endif /* MDBX_DPL_PREALLOC_FOR_RADIXSORT */ -/** Controls dirty pages tracking, spilling and persisting in MDBX_WRITEMAP +/** Controls dirty pages tracking, spilling and persisting in `MDBX_WRITEMAP` * mode. 0/OFF = Don't track dirty pages at all, don't spill ones, and use * msync() to persist data. This is by-default on Linux and other systems where * kernel provides properly LRU tracking and effective flushing on-demand. 1/ON @@ -2141,6 +2150,22 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor; #error MDBX_AVOID_MSYNC must be defined as 0 or 1 #endif /* MDBX_AVOID_MSYNC */ +/** Управляет механизмом поддержки разреженных наборов DBI-хендлов для снижения + * накладных расходов при запуске и обработке транзакций. 
*/ +#ifndef MDBX_ENABLE_DBI_SPARSE +#define MDBX_ENABLE_DBI_SPARSE 1 +#elif !(MDBX_ENABLE_DBI_SPARSE == 0 || MDBX_ENABLE_DBI_SPARSE == 1) +#error MDBX_ENABLE_DBI_SPARSE must be defined as 0 or 1 +#endif /* MDBX_ENABLE_DBI_SPARSE */ + +/** Управляет механизмом отложенного освобождения и поддержки пути быстрого + * открытия DBI-хендлов без захвата блокировок. */ +#ifndef MDBX_ENABLE_DBI_LOCKFREE +#define MDBX_ENABLE_DBI_LOCKFREE 1 +#elif !(MDBX_ENABLE_DBI_LOCKFREE == 0 || MDBX_ENABLE_DBI_LOCKFREE == 1) +#error MDBX_ENABLE_DBI_LOCKFREE must be defined as 0 or 1 +#endif /* MDBX_ENABLE_DBI_LOCKFREE */ + /** Controls sort order of internal page number lists. * This mostly experimental/advanced option with not for regular MDBX users. * \warning The database format depend on this option and libmdbx built with @@ -2188,8 +2213,8 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor; /** If defined then enables integration with Valgrind, * a memory analyzing tool. */ -#ifndef MDBX_USE_VALGRIND -#endif /* MDBX_USE_VALGRIND */ +#ifndef ENABLE_MEMCHECK +#endif /* ENABLE_MEMCHECK */ /** If defined then enables use C11 atomics, * otherwise detects ones availability automatically. 
*/ @@ -2512,13 +2537,23 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor; #define MDBX_RUNTIME_FLAGS_INIT \ ((MDBX_DEBUG) > 0) * MDBX_DBG_ASSERT + ((MDBX_DEBUG) > 1) * MDBX_DBG_AUDIT -extern uint8_t runtime_flags; -extern uint8_t loglevel; -extern MDBX_debug_func *debug_logger; +union logger_union { + void *ptr; + MDBX_debug_func *fmt; + MDBX_debug_func_nofmt *nofmt; +}; + +MDBX_INTERNAL_VAR_PROTO struct mdbx_static { + uint8_t flags; + uint8_t loglevel; + union logger_union logger; + size_t logger_buffer_size; + char *logger_buffer; +} mdbx_static; MDBX_MAYBE_UNUSED static __inline void jitter4testing(bool tiny) { #if MDBX_DEBUG - if (MDBX_DBG_JITTER & runtime_flags) + if (MDBX_DBG_JITTER & mdbx_static.flags) osal_jitter(tiny); #else (void)tiny; @@ -2532,17 +2567,17 @@ MDBX_INTERNAL_FUNC void debug_log_va(int level, const char *function, int line, const char *fmt, va_list args); #if MDBX_DEBUG -#define LOG_ENABLED(msg) unlikely(msg <= loglevel) -#define AUDIT_ENABLED() unlikely((runtime_flags & MDBX_DBG_AUDIT)) +#define LOG_ENABLED(msg) unlikely(msg <= mdbx_static.loglevel) +#define AUDIT_ENABLED() unlikely((mdbx_static.flags & MDBX_DBG_AUDIT)) #else /* MDBX_DEBUG */ -#define LOG_ENABLED(msg) (msg < MDBX_LOG_VERBOSE && msg <= loglevel) +#define LOG_ENABLED(msg) (msg < MDBX_LOG_VERBOSE && msg <= mdbx_static.loglevel) #define AUDIT_ENABLED() (0) #endif /* MDBX_DEBUG */ #if MDBX_FORCE_ASSERTIONS #define ASSERT_ENABLED() (1) #elif MDBX_DEBUG -#define ASSERT_ENABLED() likely((runtime_flags & MDBX_DBG_ASSERT)) +#define ASSERT_ENABLED() likely((mdbx_static.flags & MDBX_DBG_ASSERT)) #else #define ASSERT_ENABLED() (0) #endif /* assertions */ @@ -2977,7 +3012,8 @@ typedef struct MDBX_page { #define PAGETYPE_WHOLE(p) ((uint8_t)(p)->mp_flags) -/* Drop legacy P_DIRTY flag for sub-pages for compatilibity */ +/* Drop legacy P_DIRTY flag for sub-pages for compatilibity, + * for assertions only. */ #define PAGETYPE_COMPAT(p) \ (unlikely(PAGETYPE_WHOLE(p) & P_SUBP) \ ? 
PAGETYPE_WHOLE(p) & ~(P_SUBP | P_LEGACY_DIRTY) \ @@ -3086,7 +3122,7 @@ typedef sem_t osal_ipclock_t; #endif /* MDBX_LOCKING */ #if MDBX_LOCKING > MDBX_LOCKING_SYSV && !defined(__cplusplus) -MDBX_INTERNAL_FUNC int osal_ipclock_stub(osal_ipclock_t *ipc); +MDBX_INTERNAL_FUNC int osal_ipclock_stubinit(osal_ipclock_t *ipc); MDBX_INTERNAL_FUNC int osal_ipclock_destroy(osal_ipclock_t *ipc); #endif /* MDBX_LOCKING */ @@ -3099,8 +3135,9 @@ MDBX_INTERNAL_FUNC int osal_ipclock_destroy(osal_ipclock_t *ipc); * read transactions started by the same thread need no further locking to * proceed. * - * If MDBX_NOTLS is set, the slot address is not saved in thread-specific data. - * No reader table is used if the database is on a read-only filesystem. + * If MDBX_NOSTICKYTHREADS is set, the slot address is not saved in + * thread-specific data. No reader table is used if the database is on a + * read-only filesystem. * * Since the database uses multi-version concurrency control, readers don't * actually need any locking. 
This table is used to keep track of which @@ -3410,10 +3447,10 @@ typedef struct troika { #if MDBX_WORDBITS > 32 /* Workaround for false-positives from Valgrind */ uint32_t unused_pad; #endif -#define TROIKA_HAVE_STEADY(troika) ((troika)->fsm & 7) -#define TROIKA_STRICT_VALID(troika) ((troika)->tail_and_flags & 64) -#define TROIKA_VALID(troika) ((troika)->tail_and_flags & 128) -#define TROIKA_TAIL(troika) ((troika)->tail_and_flags & 3) +#define TROIKA_HAVE_STEADY(troika) ((troika)->fsm & 7u) +#define TROIKA_STRICT_VALID(troika) ((troika)->tail_and_flags & 64u) +#define TROIKA_VALID(troika) ((troika)->tail_and_flags & 128u) +#define TROIKA_TAIL(troika) ((troika)->tail_and_flags & 3u) txnid_t txnid[NUM_METAS]; } meta_troika_t; @@ -3443,6 +3480,8 @@ struct MDBX_txn { #error "Oops, some txn flags overlapped or wrong" #endif uint32_t mt_flags; + unsigned mt_numdbs; + size_t mt_owner; /* thread ID that owns this transaction */ MDBX_txn *mt_parent; /* parent of a nested txn */ /* Nested txn under this txn, set together with flag MDBX_TXN_HAS_CHILD */ @@ -3460,31 +3499,30 @@ struct MDBX_txn { txnid_t mt_front; MDBX_env *mt_env; /* the DB environment */ - /* Array of records for each DB known in the environment. 
*/ - MDBX_dbx *mt_dbxs; /* Array of MDBX_db records for each known DB */ MDBX_db *mt_dbs; - /* Array of sequence numbers for each DB handle */ - MDBX_atomic_uint32_t *mt_dbiseqs; - - /* Transaction DBI Flags */ -#define DBI_DIRTY MDBX_DBI_DIRTY /* DB was written in this txn */ -#define DBI_STALE MDBX_DBI_STALE /* Named-DB record is older than txnID */ -#define DBI_FRESH MDBX_DBI_FRESH /* Named-DB handle opened in this txn */ -#define DBI_CREAT MDBX_DBI_CREAT /* Named-DB handle created in this txn */ -#define DBI_VALID 0x10 /* DB handle is valid, see also DB_VALID */ -#define DBI_USRVALID 0x20 /* As DB_VALID, but not set for FREE_DBI */ -#define DBI_AUDITED 0x40 /* Internal flag for accounting during audit */ - /* Array of flags for each DB */ - uint8_t *mt_dbistate; - /* Number of DB records in use, or 0 when the txn is finished. - * This number only ever increments until the txn finishes; we - * don't decrement it when individual DB handles are closed. */ - MDBX_dbi mt_numdbs; - size_t mt_owner; /* thread ID that owns this transaction */ + +#if MDBX_ENABLE_DBI_SPARSE + unsigned *__restrict mt_dbi_sparse; +#endif /* MDBX_ENABLE_DBI_SPARSE */ + + /* Non-shared DBI state flags inside transaction */ +#define DBI_DIRTY 0x01 /* DB was written in this txn */ +#define DBI_STALE 0x02 /* Named-DB record is older than txnID */ +#define DBI_FRESH 0x04 /* Named-DB handle opened in this txn */ +#define DBI_CREAT 0x08 /* Named-DB handle created in this txn */ +#define DBI_VALID 0x10 /* Handle is valid, see also DB_VALID */ +#define DBI_OLDEN 0x40 /* Handle was closed/reopened outside txn */ +#define DBI_LINDO 0x80 /* Lazy initialization done for DBI-slot */ + /* Array of non-shared txn's flags of DBI */ + uint8_t *__restrict mt_dbi_state; + + /* Array of sequence numbers for each DB handle. 
*/ + uint32_t *__restrict mt_dbi_seqs; + MDBX_cursor **mt_cursors; + MDBX_canary mt_canary; void *mt_userctx; /* User-settable context */ - MDBX_cursor **mt_cursors; union { struct { @@ -3494,8 +3532,8 @@ struct MDBX_txn { struct { meta_troika_t troika; /* In write txns, array of cursors for each DB */ - MDBX_PNL relist; /* Reclaimed GC pages */ - txnid_t last_reclaimed; /* ID of last used record */ + MDBX_PNL __restrict relist; /* Reclaimed GC pages */ + txnid_t last_reclaimed; /* ID of last used record */ #if MDBX_ENABLE_REFUND pgno_t loose_refund_wl /* FIXME: describe */; #endif /* MDBX_ENABLE_REFUND */ @@ -3507,14 +3545,14 @@ struct MDBX_txn { * dirtylist into mt_parent after freeing hidden mt_parent pages. */ size_t dirtyroom; /* For write txns: Modified pages. Sorted when not MDBX_WRITEMAP. */ - MDBX_dpl *dirtylist; + MDBX_dpl *__restrict dirtylist; /* The list of reclaimed txns from GC */ - MDBX_TXL lifo_reclaimed; + MDBX_TXL __restrict lifo_reclaimed; /* The list of pages that became unused during this transaction. */ - MDBX_PNL retired_pages; + MDBX_PNL __restrict retired_pages; /* The list of loose pages that became unused and may be reused * in this transaction, linked through `mp_next`. */ - MDBX_page *loose_pages; + MDBX_page *__restrict loose_pages; /* Number of loose pages (tw.loose_pages) */ size_t loose_count; union { @@ -3523,11 +3561,12 @@ struct MDBX_txn { /* The sorted list of dirty pages we temporarily wrote to disk * because the dirty list was full. page numbers in here are * shifted left by 1, deleted slots have the LSB set. 
*/ - MDBX_PNL list; + MDBX_PNL __restrict list; } spilled; size_t writemap_dirty_npages; size_t writemap_spilled_npages; }; + uint64_t gc_time_acc; } tw; }; }; @@ -3566,8 +3605,8 @@ struct MDBX_cursor { MDBX_db *mc_db; /* The database auxiliary record for this cursor */ MDBX_dbx *mc_dbx; - /* The mt_dbistate for this database */ - uint8_t *mc_dbistate; + /* The mt_dbi_state[] for this DBI */ + uint8_t *__restrict mc_dbi_state; uint8_t mc_snum; /* number of pushed pages */ uint8_t mc_top; /* index of top page, normally mc_snum-1 */ @@ -3620,6 +3659,11 @@ typedef struct MDBX_cursor_couple { MDBX_xcursor inner; } MDBX_cursor_couple; +struct mdbx_defer_free_item { + struct mdbx_defer_free_item *next; + uint64_t timestamp; +}; + /* The database environment. */ struct MDBX_env { /* ----------------------------------------------------- mostly static part */ @@ -3637,6 +3681,7 @@ struct MDBX_env { #define MDBX_DEPRECATED_COALESCE UINT32_C(0x2000000) #define ENV_INTERNAL_FLAGS (MDBX_FATAL_ERROR | MDBX_ENV_ACTIVE | MDBX_ENV_TXKEY) uint32_t me_flags; + unsigned me_psize; /* DB page size, initialized from me_os_psize */ osal_mmap_t me_dxb_mmap; /* The main data file */ #define me_map me_dxb_mmap.base #define me_lazy_fd me_dxb_mmap.fd @@ -3649,7 +3694,6 @@ struct MDBX_env { #define me_lfd me_lck_mmap.fd struct MDBX_lockinfo *me_lck; - unsigned me_psize; /* DB page size, initialized from me_os_psize */ uint16_t me_leaf_nodemax; /* max size of a leaf-node */ uint16_t me_branch_nodemax; /* max size of a branch-node */ uint16_t me_subpage_limit; @@ -3667,13 +3711,15 @@ struct MDBX_env { MDBX_dbi me_maxdbs; /* size of the DB table */ uint32_t me_pid; /* process ID of this env */ osal_thread_key_t me_txkey; /* thread-key for readers */ - pathchar_t *me_pathname; /* path to the DB files */ - void *me_pbuf; /* scratch area for DUPSORT put() */ - MDBX_txn *me_txn0; /* preallocated write transaction */ - - MDBX_dbx *me_dbxs; /* array of static DB info */ - uint16_t *me_dbflags; /* array 
of flags from MDBX_db.md_flags */ - MDBX_atomic_uint32_t *me_dbiseqs; /* array of dbi sequence numbers */ + struct { /* path to the DB files */ + pathchar_t *lck, *dxb, *specified; + void *buffer; + } me_pathname; + void *me_pbuf; /* scratch area for DUPSORT put() */ + MDBX_txn *me_txn0; /* preallocated write transaction */ + MDBX_dbx *me_dbxs; /* array of static DB info */ + uint16_t *__restrict me_db_flags; /* array of flags from MDBX_db.md_flags */ + MDBX_atomic_uint32_t *me_dbi_seqs; /* array of dbi sequence numbers */ unsigned me_maxgc_ov1page; /* Number of pgno_t fit in a single overflow page */ unsigned me_maxgc_per_branch; @@ -3687,6 +3733,7 @@ struct MDBX_env { unsigned rp_augment_limit; unsigned dp_limit; unsigned dp_initial; + uint64_t gc_time_limit; uint8_t dp_loose_limit; uint8_t spill_max_denominator; uint8_t spill_min_denominator; @@ -3696,6 +3743,8 @@ struct MDBX_env { unsigned writethrough_threshold; #endif /* Windows */ bool prefault_write; + bool prefer_waf_insteadof_balance; /* Strive to minimize WAF instead of + balancing pages fullment */ union { unsigned all; /* tracks options with non-auto values but tuned by user */ @@ -3725,20 +3774,23 @@ struct MDBX_env { } me_sysv_ipc; #endif /* MDBX_LOCKING == MDBX_LOCKING_SYSV */ bool me_incore; + bool me_prefault_write; - MDBX_env *me_lcklist_next; +#if MDBX_ENABLE_DBI_LOCKFREE + struct mdbx_defer_free_item *me_defer_free; +#endif /* MDBX_ENABLE_DBI_LOCKFREE */ /* --------------------------------------------------- mostly volatile part */ MDBX_txn *me_txn; /* current write transaction */ osal_fastmutex_t me_dbi_lock; - MDBX_dbi me_numdbs; /* number of DBs opened */ - bool me_prefault_write; + unsigned me_numdbs; /* number of DBs opened */ - MDBX_page *me_dp_reserve; /* list of malloc'ed blocks for re-use */ unsigned me_dp_reserve_len; + MDBX_page *__restrict me_dp_reserve; /* list of malloc'ed blocks for re-use */ + /* PNL of pages that became unused in a write txn */ - MDBX_PNL me_retired_pages; + 
MDBX_PNL __restrict me_retired_pages; osal_ioring_t me_ioring; #if defined(_WIN32) || defined(_WIN64) @@ -3756,13 +3808,12 @@ struct MDBX_env { #if MDBX_DEBUG MDBX_assert_func *me_assert_func; /* Callback for assertion failures */ #endif -#ifdef MDBX_USE_VALGRIND +#ifdef ENABLE_MEMCHECK int me_valgrind_handle; #endif -#if defined(MDBX_USE_VALGRIND) || defined(__SANITIZE_ADDRESS__) - MDBX_atomic_uint32_t me_ignore_EDEADLK; +#if defined(ENABLE_MEMCHECK) || defined(__SANITIZE_ADDRESS__) pgno_t me_poison_edge; -#endif /* MDBX_USE_VALGRIND || __SANITIZE_ADDRESS__ */ +#endif /* ENABLE_MEMCHECK || __SANITIZE_ADDRESS__ */ #ifndef xMDBX_DEBUG_SPILLING #define xMDBX_DEBUG_SPILLING 0 @@ -3822,10 +3873,6 @@ osal_flush_incoherent_mmap(const void *addr, size_t nbytes, MDBX_INTERNAL_FUNC int cleanup_dead_readers(MDBX_env *env, int rlocked, int *dead); -MDBX_INTERNAL_FUNC int rthc_alloc(osal_thread_key_t *key, MDBX_reader *begin, - MDBX_reader *end); -MDBX_INTERNAL_FUNC void rthc_remove(const osal_thread_key_t key); - MDBX_INTERNAL_FUNC void global_ctor(void); MDBX_INTERNAL_FUNC void osal_ctor(void); MDBX_INTERNAL_FUNC void global_dtor(void); @@ -3940,7 +3987,8 @@ typedef struct MDBX_node { /* mdbx_dbi_open() flags */ #define DB_USABLE_FLAGS (DB_PERSISTENT_FLAGS | MDBX_CREATE | MDBX_DB_ACCEDE) -#define DB_VALID 0x8000 /* DB handle is valid, for me_dbflags */ +#define DB_VALID 0x8000u /* DB handle is valid, for me_db_flags */ +#define DB_POISON 0x7fffu /* update pending */ #define DB_INTERNAL_FLAGS DB_VALID #if DB_INTERNAL_FLAGS & DB_USABLE_FLAGS @@ -4029,11 +4077,11 @@ log2n_powerof2(size_t value_uintptr) { * environment and re-opening it with the new flags. 
*/ #define ENV_CHANGEABLE_FLAGS \ (MDBX_SAFE_NOSYNC | MDBX_NOMETASYNC | MDBX_DEPRECATED_MAPASYNC | \ - MDBX_NOMEMINIT | MDBX_COALESCE | MDBX_PAGEPERTURB | MDBX_ACCEDE | \ - MDBX_VALIDATION) + MDBX_NOMEMINIT | MDBX_DEPRECATED_COALESCE | MDBX_PAGEPERTURB | \ + MDBX_ACCEDE | MDBX_VALIDATION) #define ENV_CHANGELESS_FLAGS \ - (MDBX_NOSUBDIR | MDBX_RDONLY | MDBX_WRITEMAP | MDBX_NOTLS | MDBX_NORDAHEAD | \ - MDBX_LIFORECLAIM | MDBX_EXCLUSIVE) + (MDBX_NOSUBDIR | MDBX_RDONLY | MDBX_WRITEMAP | MDBX_NOSTICKYTHREADS | \ + MDBX_NORDAHEAD | MDBX_LIFORECLAIM | MDBX_EXCLUSIVE) #define ENV_USABLE_FLAGS (ENV_CHANGEABLE_FLAGS | ENV_CHANGELESS_FLAGS) #if !defined(__cplusplus) || CONSTEXPR_ENUM_FLAGS_OPERATIONS @@ -4067,6 +4115,36 @@ MDBX_MAYBE_UNUSED static void static_checks(void) { ASAN_UNPOISON_MEMORY_REGION(addr, size); \ } while (0) +/******************************************************************************/ + +/** \brief Page types for traverse the b-tree. + * \see mdbx_env_pgwalk() \see MDBX_pgvisitor_func */ +enum MDBX_page_type_t { + MDBX_page_broken, + MDBX_page_large, + MDBX_page_branch, + MDBX_page_leaf, + MDBX_page_dupfixed_leaf, + MDBX_subpage_leaf, + MDBX_subpage_dupfixed_leaf, + MDBX_subpage_broken, +}; +typedef enum MDBX_page_type_t MDBX_page_type_t; + +typedef struct MDBX_walk_sdb { + MDBX_val name; + struct MDBX_db *internal, *nested; +} MDBX_walk_sdb_t; + +/** \brief Callback function for traverse the b-tree. 
\see mdbx_env_pgwalk() */ +typedef int +MDBX_pgvisitor_func(const size_t pgno, const unsigned number, void *const ctx, + const int deep, const MDBX_walk_sdb_t *subdb, + const size_t page_size, const MDBX_page_type_t page_type, + const MDBX_error_t err, const size_t nentries, + const size_t payload_bytes, const size_t header_bytes, + const size_t unused_bytes); + #if defined(_WIN32) || defined(_WIN64) /* * POSIX getopt for Windows diff --git a/mdbxdist/mdbx_drop.c b/mdbxdist/mdbx_drop.c index 96cf201..ba5730a 100644 --- a/mdbxdist/mdbx_drop.c +++ b/mdbxdist/mdbx_drop.c @@ -1,7 +1,7 @@ /* mdbx_drop.c - memory-mapped database delete tool */ /* - * Copyright 2021-2023 Leonid Yuriev + * Copyright 2021-2024 Leonid Yuriev * and other libmdbx authors: please see AUTHORS file. * * Copyright 2016-2021 Howard Chu, Symas Corp. @@ -24,7 +24,7 @@ #define xMDBX_TOOLS /* Avoid using internal eASSERT() */ /* - * Copyright 2015-2023 Leonid Yuriev + * Copyright 2015-2024 Leonid Yuriev * and other libmdbx authors: please see AUTHORS file. * All rights reserved. * @@ -36,7 +36,7 @@ * top-level directory of the distribution or, alternatively, at * . 
*/ -#define MDBX_BUILD_SOURCERY 0c86daff919dc09ab5e99c8e5bfb1d51e1367babca9c630c97af92a2aa988ea9_v0_12_9_16_gfff3fbd8 +#define MDBX_BUILD_SOURCERY 91ff5b5423830ee44fca4b70dcb298f233338a17a3185c44df67ec16d3034af3_v0_13_0_38_gf1975363 #ifdef MDBX_CONFIG_H #include MDBX_CONFIG_H #endif @@ -49,11 +49,13 @@ #ifdef xMDBX_ALLOY /* Amalgamated build */ #define MDBX_INTERNAL_FUNC static -#define MDBX_INTERNAL_VAR static +#define MDBX_INTERNAL_VAR_PROTO static +#define MDBX_INTERNAL_VAR_INSTA static #else /* Non-amalgamated build */ #define MDBX_INTERNAL_FUNC -#define MDBX_INTERNAL_VAR extern +#define MDBX_INTERNAL_VAR_PROTO extern +#define MDBX_INTERNAL_VAR_INSTA #endif /* xMDBX_ALLOY */ /*----------------------------------------------------------------------------*/ @@ -118,6 +120,10 @@ disable : 5105) /* winbase.h(9531): warning C5105: macro expansion \ producing 'defined' has undefined behavior */ #endif +#if _MSC_VER < 1920 +/* avoid "error C2219: syntax error: type qualifier must be after '*'" */ +#define __restrict +#endif #if _MSC_VER > 1930 #pragma warning(disable : 6235) /* is always a constant */ #pragma warning(disable : 6237) /* is never evaluated and might \ @@ -163,7 +169,7 @@ #include "mdbx.h" /* - * Copyright 2015-2023 Leonid Yuriev + * Copyright 2015-2024 Leonid Yuriev * and other libmdbx authors: please see AUTHORS file. * All rights reserved. 
* @@ -211,6 +217,7 @@ #include #include +#include #include #include #include @@ -848,7 +855,7 @@ __extern_C key_t ftok(const char *, int); /*----------------------------------------------------------------------------*/ -#if defined(MDBX_USE_VALGRIND) +#if defined(ENABLE_MEMCHECK) #include #ifndef VALGRIND_DISABLE_ADDR_ERROR_REPORTING_IN_RANGE /* LY: available since Valgrind 3.10 */ @@ -870,7 +877,7 @@ __extern_C key_t ftok(const char *, int); #define VALGRIND_CHECK_MEM_IS_ADDRESSABLE(a, s) (0) #define VALGRIND_CHECK_MEM_IS_DEFINED(a, s) (0) #define RUNNING_ON_VALGRIND (0) -#endif /* MDBX_USE_VALGRIND */ +#endif /* ENABLE_MEMCHECK */ #ifdef __SANITIZE_ADDRESS__ #include @@ -1017,7 +1024,7 @@ extern "C" { /* https://en.wikipedia.org/wiki/Operating_system_abstraction_layer */ /* - * Copyright 2015-2023 Leonid Yuriev + * Copyright 2015-2024 Leonid Yuriev * and other libmdbx authors: please see AUTHORS file. * All rights reserved. * @@ -1221,8 +1228,8 @@ typedef pthread_mutex_t osal_fastmutex_t; /*----------------------------------------------------------------------------*/ /* OS abstraction layer stuff */ -MDBX_INTERNAL_VAR unsigned sys_pagesize; -MDBX_MAYBE_UNUSED MDBX_INTERNAL_VAR unsigned sys_pagesize_ln2, +MDBX_INTERNAL_VAR_PROTO unsigned sys_pagesize; +MDBX_MAYBE_UNUSED MDBX_INTERNAL_VAR_PROTO unsigned sys_pagesize_ln2, sys_allocation_granularity; /* Get the size of a memory page for the system. 
@@ -1486,8 +1493,9 @@ MDBX_MAYBE_UNUSED static __inline void jitter4testing(bool tiny); #endif #if defined(__linux__) || defined(__gnu_linux__) -MDBX_INTERNAL_VAR uint32_t linux_kernel_version; -MDBX_INTERNAL_VAR bool mdbx_RunningOnWSL1 /* Windows Subsystem 1 for Linux */; +MDBX_INTERNAL_VAR_PROTO uint32_t linux_kernel_version; +MDBX_INTERNAL_VAR_PROTO bool + mdbx_RunningOnWSL1 /* Windows Subsystem 1 for Linux */; #endif /* Linux */ #ifndef osal_strdup @@ -1701,7 +1709,8 @@ MDBX_INTERNAL_FUNC int osal_lck_init(MDBX_env *env, /// restore POSIX-fcntl locks after the closing of file descriptors. /// \return Error code (MDBX_PANIC) or zero on success. MDBX_INTERNAL_FUNC int osal_lck_destroy(MDBX_env *env, - MDBX_env *inprocess_neighbor); + MDBX_env *inprocess_neighbor, + const uint32_t current_pid); /// \brief Connects to shared interprocess locking objects and tries to acquire /// the maximum lock level (shared if exclusive is not available) @@ -1729,6 +1738,8 @@ MDBX_INTERNAL_FUNC int osal_lck_seize(MDBX_env *env); /// operational lock. /// \return Error code or zero on success MDBX_INTERNAL_FUNC int osal_lck_downgrade(MDBX_env *env); +MDBX_MAYBE_UNUSED MDBX_INTERNAL_FUNC int osal_lck_upgrade(MDBX_env *env, + bool dont_wait); /// \brief Locks LCK-file or/and table of readers for (de)registering. /// \return Error code or zero on success @@ -1737,16 +1748,12 @@ MDBX_INTERNAL_FUNC int osal_rdt_lock(MDBX_env *env); /// \brief Unlocks LCK-file or/and table of readers after (de)registering. MDBX_INTERNAL_FUNC void osal_rdt_unlock(MDBX_env *env); -/// \brief Acquires lock for DB change (on writing transaction start) -/// Reading transactions will not be blocked. -/// Declared as LIBMDBX_API because it is used in mdbx_chk. +/// \brief Acquires write-transaction lock. 
/// \return Error code or zero on success -LIBMDBX_API int mdbx_txn_lock(MDBX_env *env, bool dont_wait); +MDBX_INTERNAL_FUNC int osal_txn_lock(MDBX_env *env, bool dont_wait); -/// \brief Releases lock once DB changes is made (after writing transaction -/// has finished). -/// Declared as LIBMDBX_API because it is used in mdbx_chk. -LIBMDBX_API void mdbx_txn_unlock(MDBX_env *env); +/// \brief Releases write-transaction lock.. +MDBX_INTERNAL_FUNC void osal_txn_unlock(MDBX_env *env); /// \brief Sets alive-flag of reader presence (indicative lock) for PID of /// the current process. The function does no more than needed for @@ -1775,7 +1782,7 @@ MDBX_INTERNAL_FUNC int osal_rpid_check(MDBX_env *env, uint32_t pid); MDBX_INTERNAL_FUNC int osal_mb2w(const char *const src, wchar_t **const pdst); typedef void(WINAPI *osal_srwlock_t_function)(osal_srwlock_t *); -MDBX_INTERNAL_VAR osal_srwlock_t_function osal_srwlock_Init, +MDBX_INTERNAL_VAR_PROTO osal_srwlock_t_function osal_srwlock_Init, osal_srwlock_AcquireShared, osal_srwlock_ReleaseShared, osal_srwlock_AcquireExclusive, osal_srwlock_ReleaseExclusive; @@ -1828,7 +1835,7 @@ typedef struct _FILE_REMOTE_PROTOCOL_INFO { typedef BOOL(WINAPI *MDBX_GetFileInformationByHandleEx)( _In_ HANDLE hFile, _In_ FILE_INFO_BY_HANDLE_CLASS FileInformationClass, _Out_ LPVOID lpFileInformation, _In_ DWORD dwBufferSize); -MDBX_INTERNAL_VAR MDBX_GetFileInformationByHandleEx +MDBX_INTERNAL_VAR_PROTO MDBX_GetFileInformationByHandleEx mdbx_GetFileInformationByHandleEx; typedef BOOL(WINAPI *MDBX_GetVolumeInformationByHandleW)( @@ -1837,19 +1844,20 @@ typedef BOOL(WINAPI *MDBX_GetVolumeInformationByHandleW)( _Out_opt_ LPDWORD lpMaximumComponentLength, _Out_opt_ LPDWORD lpFileSystemFlags, _Out_opt_ LPWSTR lpFileSystemNameBuffer, _In_ DWORD nFileSystemNameSize); -MDBX_INTERNAL_VAR MDBX_GetVolumeInformationByHandleW +MDBX_INTERNAL_VAR_PROTO MDBX_GetVolumeInformationByHandleW mdbx_GetVolumeInformationByHandleW; typedef DWORD(WINAPI 
*MDBX_GetFinalPathNameByHandleW)(_In_ HANDLE hFile, _Out_ LPWSTR lpszFilePath, _In_ DWORD cchFilePath, _In_ DWORD dwFlags); -MDBX_INTERNAL_VAR MDBX_GetFinalPathNameByHandleW mdbx_GetFinalPathNameByHandleW; +MDBX_INTERNAL_VAR_PROTO MDBX_GetFinalPathNameByHandleW + mdbx_GetFinalPathNameByHandleW; typedef BOOL(WINAPI *MDBX_SetFileInformationByHandle)( _In_ HANDLE hFile, _In_ FILE_INFO_BY_HANDLE_CLASS FileInformationClass, _Out_ LPVOID lpFileInformation, _In_ DWORD dwBufferSize); -MDBX_INTERNAL_VAR MDBX_SetFileInformationByHandle +MDBX_INTERNAL_VAR_PROTO MDBX_SetFileInformationByHandle mdbx_SetFileInformationByHandle; typedef NTSTATUS(NTAPI *MDBX_NtFsControlFile)( @@ -1858,10 +1866,10 @@ typedef NTSTATUS(NTAPI *MDBX_NtFsControlFile)( OUT PIO_STATUS_BLOCK IoStatusBlock, IN ULONG FsControlCode, IN OUT PVOID InputBuffer, IN ULONG InputBufferLength, OUT OPTIONAL PVOID OutputBuffer, IN ULONG OutputBufferLength); -MDBX_INTERNAL_VAR MDBX_NtFsControlFile mdbx_NtFsControlFile; +MDBX_INTERNAL_VAR_PROTO MDBX_NtFsControlFile mdbx_NtFsControlFile; typedef uint64_t(WINAPI *MDBX_GetTickCount64)(void); -MDBX_INTERNAL_VAR MDBX_GetTickCount64 mdbx_GetTickCount64; +MDBX_INTERNAL_VAR_PROTO MDBX_GetTickCount64 mdbx_GetTickCount64; #if !defined(_WIN32_WINNT_WIN8) || _WIN32_WINNT < _WIN32_WINNT_WIN8 typedef struct _WIN32_MEMORY_RANGE_ENTRY { @@ -1873,13 +1881,13 @@ typedef struct _WIN32_MEMORY_RANGE_ENTRY { typedef BOOL(WINAPI *MDBX_PrefetchVirtualMemory)( HANDLE hProcess, ULONG_PTR NumberOfEntries, PWIN32_MEMORY_RANGE_ENTRY VirtualAddresses, ULONG Flags); -MDBX_INTERNAL_VAR MDBX_PrefetchVirtualMemory mdbx_PrefetchVirtualMemory; +MDBX_INTERNAL_VAR_PROTO MDBX_PrefetchVirtualMemory mdbx_PrefetchVirtualMemory; typedef enum _SECTION_INHERIT { ViewShare = 1, ViewUnmap = 2 } SECTION_INHERIT; typedef NTSTATUS(NTAPI *MDBX_NtExtendSection)(IN HANDLE SectionHandle, IN PLARGE_INTEGER NewSectionSize); -MDBX_INTERNAL_VAR MDBX_NtExtendSection mdbx_NtExtendSection; +MDBX_INTERNAL_VAR_PROTO 
MDBX_NtExtendSection mdbx_NtExtendSection; static __inline bool mdbx_RunningUnderWine(void) { return !mdbx_NtExtendSection; @@ -1889,14 +1897,15 @@ typedef LSTATUS(WINAPI *MDBX_RegGetValueA)(HKEY hkey, LPCSTR lpSubKey, LPCSTR lpValue, DWORD dwFlags, LPDWORD pdwType, PVOID pvData, LPDWORD pcbData); -MDBX_INTERNAL_VAR MDBX_RegGetValueA mdbx_RegGetValueA; +MDBX_INTERNAL_VAR_PROTO MDBX_RegGetValueA mdbx_RegGetValueA; NTSYSAPI ULONG RtlRandomEx(PULONG Seed); typedef BOOL(WINAPI *MDBX_SetFileIoOverlappedRange)(HANDLE FileHandle, PUCHAR OverlappedRangeStart, ULONG Length); -MDBX_INTERNAL_VAR MDBX_SetFileIoOverlappedRange mdbx_SetFileIoOverlappedRange; +MDBX_INTERNAL_VAR_PROTO MDBX_SetFileIoOverlappedRange + mdbx_SetFileIoOverlappedRange; #endif /* Windows */ @@ -2126,7 +2135,7 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor; #error MDBX_DPL_PREALLOC_FOR_RADIXSORT must be defined as 0 or 1 #endif /* MDBX_DPL_PREALLOC_FOR_RADIXSORT */ -/** Controls dirty pages tracking, spilling and persisting in MDBX_WRITEMAP +/** Controls dirty pages tracking, spilling and persisting in `MDBX_WRITEMAP` * mode. 0/OFF = Don't track dirty pages at all, don't spill ones, and use * msync() to persist data. This is by-default on Linux and other systems where * kernel provides properly LRU tracking and effective flushing on-demand. 1/ON @@ -2143,6 +2152,22 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor; #error MDBX_AVOID_MSYNC must be defined as 0 or 1 #endif /* MDBX_AVOID_MSYNC */ +/** Управляет механизмом поддержки разреженных наборов DBI-хендлов для снижения + * накладных расходов при запуске и обработке транзакций. 
*/ +#ifndef MDBX_ENABLE_DBI_SPARSE +#define MDBX_ENABLE_DBI_SPARSE 1 +#elif !(MDBX_ENABLE_DBI_SPARSE == 0 || MDBX_ENABLE_DBI_SPARSE == 1) +#error MDBX_ENABLE_DBI_SPARSE must be defined as 0 or 1 +#endif /* MDBX_ENABLE_DBI_SPARSE */ + +/** Управляет механизмом отложенного освобождения и поддержки пути быстрого + * открытия DBI-хендлов без захвата блокировок. */ +#ifndef MDBX_ENABLE_DBI_LOCKFREE +#define MDBX_ENABLE_DBI_LOCKFREE 1 +#elif !(MDBX_ENABLE_DBI_LOCKFREE == 0 || MDBX_ENABLE_DBI_LOCKFREE == 1) +#error MDBX_ENABLE_DBI_LOCKFREE must be defined as 0 or 1 +#endif /* MDBX_ENABLE_DBI_LOCKFREE */ + /** Controls sort order of internal page number lists. * This mostly experimental/advanced option with not for regular MDBX users. * \warning The database format depend on this option and libmdbx built with @@ -2190,8 +2215,8 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor; /** If defined then enables integration with Valgrind, * a memory analyzing tool. */ -#ifndef MDBX_USE_VALGRIND -#endif /* MDBX_USE_VALGRIND */ +#ifndef ENABLE_MEMCHECK +#endif /* ENABLE_MEMCHECK */ /** If defined then enables use C11 atomics, * otherwise detects ones availability automatically. 
*/ @@ -2514,13 +2539,23 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor; #define MDBX_RUNTIME_FLAGS_INIT \ ((MDBX_DEBUG) > 0) * MDBX_DBG_ASSERT + ((MDBX_DEBUG) > 1) * MDBX_DBG_AUDIT -extern uint8_t runtime_flags; -extern uint8_t loglevel; -extern MDBX_debug_func *debug_logger; +union logger_union { + void *ptr; + MDBX_debug_func *fmt; + MDBX_debug_func_nofmt *nofmt; +}; + +MDBX_INTERNAL_VAR_PROTO struct mdbx_static { + uint8_t flags; + uint8_t loglevel; + union logger_union logger; + size_t logger_buffer_size; + char *logger_buffer; +} mdbx_static; MDBX_MAYBE_UNUSED static __inline void jitter4testing(bool tiny) { #if MDBX_DEBUG - if (MDBX_DBG_JITTER & runtime_flags) + if (MDBX_DBG_JITTER & mdbx_static.flags) osal_jitter(tiny); #else (void)tiny; @@ -2534,17 +2569,17 @@ MDBX_INTERNAL_FUNC void debug_log_va(int level, const char *function, int line, const char *fmt, va_list args); #if MDBX_DEBUG -#define LOG_ENABLED(msg) unlikely(msg <= loglevel) -#define AUDIT_ENABLED() unlikely((runtime_flags & MDBX_DBG_AUDIT)) +#define LOG_ENABLED(msg) unlikely(msg <= mdbx_static.loglevel) +#define AUDIT_ENABLED() unlikely((mdbx_static.flags & MDBX_DBG_AUDIT)) #else /* MDBX_DEBUG */ -#define LOG_ENABLED(msg) (msg < MDBX_LOG_VERBOSE && msg <= loglevel) +#define LOG_ENABLED(msg) (msg < MDBX_LOG_VERBOSE && msg <= mdbx_static.loglevel) #define AUDIT_ENABLED() (0) #endif /* MDBX_DEBUG */ #if MDBX_FORCE_ASSERTIONS #define ASSERT_ENABLED() (1) #elif MDBX_DEBUG -#define ASSERT_ENABLED() likely((runtime_flags & MDBX_DBG_ASSERT)) +#define ASSERT_ENABLED() likely((mdbx_static.flags & MDBX_DBG_ASSERT)) #else #define ASSERT_ENABLED() (0) #endif /* assertions */ @@ -2979,7 +3014,8 @@ typedef struct MDBX_page { #define PAGETYPE_WHOLE(p) ((uint8_t)(p)->mp_flags) -/* Drop legacy P_DIRTY flag for sub-pages for compatilibity */ +/* Drop legacy P_DIRTY flag for sub-pages for compatilibity, + * for assertions only. */ #define PAGETYPE_COMPAT(p) \ (unlikely(PAGETYPE_WHOLE(p) & P_SUBP) \ ? 
PAGETYPE_WHOLE(p) & ~(P_SUBP | P_LEGACY_DIRTY) \ @@ -3088,7 +3124,7 @@ typedef sem_t osal_ipclock_t; #endif /* MDBX_LOCKING */ #if MDBX_LOCKING > MDBX_LOCKING_SYSV && !defined(__cplusplus) -MDBX_INTERNAL_FUNC int osal_ipclock_stub(osal_ipclock_t *ipc); +MDBX_INTERNAL_FUNC int osal_ipclock_stubinit(osal_ipclock_t *ipc); MDBX_INTERNAL_FUNC int osal_ipclock_destroy(osal_ipclock_t *ipc); #endif /* MDBX_LOCKING */ @@ -3101,8 +3137,9 @@ MDBX_INTERNAL_FUNC int osal_ipclock_destroy(osal_ipclock_t *ipc); * read transactions started by the same thread need no further locking to * proceed. * - * If MDBX_NOTLS is set, the slot address is not saved in thread-specific data. - * No reader table is used if the database is on a read-only filesystem. + * If MDBX_NOSTICKYTHREADS is set, the slot address is not saved in + * thread-specific data. No reader table is used if the database is on a + * read-only filesystem. * * Since the database uses multi-version concurrency control, readers don't * actually need any locking. 
This table is used to keep track of which @@ -3412,10 +3449,10 @@ typedef struct troika { #if MDBX_WORDBITS > 32 /* Workaround for false-positives from Valgrind */ uint32_t unused_pad; #endif -#define TROIKA_HAVE_STEADY(troika) ((troika)->fsm & 7) -#define TROIKA_STRICT_VALID(troika) ((troika)->tail_and_flags & 64) -#define TROIKA_VALID(troika) ((troika)->tail_and_flags & 128) -#define TROIKA_TAIL(troika) ((troika)->tail_and_flags & 3) +#define TROIKA_HAVE_STEADY(troika) ((troika)->fsm & 7u) +#define TROIKA_STRICT_VALID(troika) ((troika)->tail_and_flags & 64u) +#define TROIKA_VALID(troika) ((troika)->tail_and_flags & 128u) +#define TROIKA_TAIL(troika) ((troika)->tail_and_flags & 3u) txnid_t txnid[NUM_METAS]; } meta_troika_t; @@ -3445,6 +3482,8 @@ struct MDBX_txn { #error "Oops, some txn flags overlapped or wrong" #endif uint32_t mt_flags; + unsigned mt_numdbs; + size_t mt_owner; /* thread ID that owns this transaction */ MDBX_txn *mt_parent; /* parent of a nested txn */ /* Nested txn under this txn, set together with flag MDBX_TXN_HAS_CHILD */ @@ -3462,31 +3501,30 @@ struct MDBX_txn { txnid_t mt_front; MDBX_env *mt_env; /* the DB environment */ - /* Array of records for each DB known in the environment. 
*/ - MDBX_dbx *mt_dbxs; /* Array of MDBX_db records for each known DB */ MDBX_db *mt_dbs; - /* Array of sequence numbers for each DB handle */ - MDBX_atomic_uint32_t *mt_dbiseqs; - - /* Transaction DBI Flags */ -#define DBI_DIRTY MDBX_DBI_DIRTY /* DB was written in this txn */ -#define DBI_STALE MDBX_DBI_STALE /* Named-DB record is older than txnID */ -#define DBI_FRESH MDBX_DBI_FRESH /* Named-DB handle opened in this txn */ -#define DBI_CREAT MDBX_DBI_CREAT /* Named-DB handle created in this txn */ -#define DBI_VALID 0x10 /* DB handle is valid, see also DB_VALID */ -#define DBI_USRVALID 0x20 /* As DB_VALID, but not set for FREE_DBI */ -#define DBI_AUDITED 0x40 /* Internal flag for accounting during audit */ - /* Array of flags for each DB */ - uint8_t *mt_dbistate; - /* Number of DB records in use, or 0 when the txn is finished. - * This number only ever increments until the txn finishes; we - * don't decrement it when individual DB handles are closed. */ - MDBX_dbi mt_numdbs; - size_t mt_owner; /* thread ID that owns this transaction */ + +#if MDBX_ENABLE_DBI_SPARSE + unsigned *__restrict mt_dbi_sparse; +#endif /* MDBX_ENABLE_DBI_SPARSE */ + + /* Non-shared DBI state flags inside transaction */ +#define DBI_DIRTY 0x01 /* DB was written in this txn */ +#define DBI_STALE 0x02 /* Named-DB record is older than txnID */ +#define DBI_FRESH 0x04 /* Named-DB handle opened in this txn */ +#define DBI_CREAT 0x08 /* Named-DB handle created in this txn */ +#define DBI_VALID 0x10 /* Handle is valid, see also DB_VALID */ +#define DBI_OLDEN 0x40 /* Handle was closed/reopened outside txn */ +#define DBI_LINDO 0x80 /* Lazy initialization done for DBI-slot */ + /* Array of non-shared txn's flags of DBI */ + uint8_t *__restrict mt_dbi_state; + + /* Array of sequence numbers for each DB handle. 
*/ + uint32_t *__restrict mt_dbi_seqs; + MDBX_cursor **mt_cursors; + MDBX_canary mt_canary; void *mt_userctx; /* User-settable context */ - MDBX_cursor **mt_cursors; union { struct { @@ -3496,8 +3534,8 @@ struct MDBX_txn { struct { meta_troika_t troika; /* In write txns, array of cursors for each DB */ - MDBX_PNL relist; /* Reclaimed GC pages */ - txnid_t last_reclaimed; /* ID of last used record */ + MDBX_PNL __restrict relist; /* Reclaimed GC pages */ + txnid_t last_reclaimed; /* ID of last used record */ #if MDBX_ENABLE_REFUND pgno_t loose_refund_wl /* FIXME: describe */; #endif /* MDBX_ENABLE_REFUND */ @@ -3509,14 +3547,14 @@ struct MDBX_txn { * dirtylist into mt_parent after freeing hidden mt_parent pages. */ size_t dirtyroom; /* For write txns: Modified pages. Sorted when not MDBX_WRITEMAP. */ - MDBX_dpl *dirtylist; + MDBX_dpl *__restrict dirtylist; /* The list of reclaimed txns from GC */ - MDBX_TXL lifo_reclaimed; + MDBX_TXL __restrict lifo_reclaimed; /* The list of pages that became unused during this transaction. */ - MDBX_PNL retired_pages; + MDBX_PNL __restrict retired_pages; /* The list of loose pages that became unused and may be reused * in this transaction, linked through `mp_next`. */ - MDBX_page *loose_pages; + MDBX_page *__restrict loose_pages; /* Number of loose pages (tw.loose_pages) */ size_t loose_count; union { @@ -3525,11 +3563,12 @@ struct MDBX_txn { /* The sorted list of dirty pages we temporarily wrote to disk * because the dirty list was full. page numbers in here are * shifted left by 1, deleted slots have the LSB set. 
*/ - MDBX_PNL list; + MDBX_PNL __restrict list; } spilled; size_t writemap_dirty_npages; size_t writemap_spilled_npages; }; + uint64_t gc_time_acc; } tw; }; }; @@ -3568,8 +3607,8 @@ struct MDBX_cursor { MDBX_db *mc_db; /* The database auxiliary record for this cursor */ MDBX_dbx *mc_dbx; - /* The mt_dbistate for this database */ - uint8_t *mc_dbistate; + /* The mt_dbi_state[] for this DBI */ + uint8_t *__restrict mc_dbi_state; uint8_t mc_snum; /* number of pushed pages */ uint8_t mc_top; /* index of top page, normally mc_snum-1 */ @@ -3622,6 +3661,11 @@ typedef struct MDBX_cursor_couple { MDBX_xcursor inner; } MDBX_cursor_couple; +struct mdbx_defer_free_item { + struct mdbx_defer_free_item *next; + uint64_t timestamp; +}; + /* The database environment. */ struct MDBX_env { /* ----------------------------------------------------- mostly static part */ @@ -3639,6 +3683,7 @@ struct MDBX_env { #define MDBX_DEPRECATED_COALESCE UINT32_C(0x2000000) #define ENV_INTERNAL_FLAGS (MDBX_FATAL_ERROR | MDBX_ENV_ACTIVE | MDBX_ENV_TXKEY) uint32_t me_flags; + unsigned me_psize; /* DB page size, initialized from me_os_psize */ osal_mmap_t me_dxb_mmap; /* The main data file */ #define me_map me_dxb_mmap.base #define me_lazy_fd me_dxb_mmap.fd @@ -3651,7 +3696,6 @@ struct MDBX_env { #define me_lfd me_lck_mmap.fd struct MDBX_lockinfo *me_lck; - unsigned me_psize; /* DB page size, initialized from me_os_psize */ uint16_t me_leaf_nodemax; /* max size of a leaf-node */ uint16_t me_branch_nodemax; /* max size of a branch-node */ uint16_t me_subpage_limit; @@ -3669,13 +3713,15 @@ struct MDBX_env { MDBX_dbi me_maxdbs; /* size of the DB table */ uint32_t me_pid; /* process ID of this env */ osal_thread_key_t me_txkey; /* thread-key for readers */ - pathchar_t *me_pathname; /* path to the DB files */ - void *me_pbuf; /* scratch area for DUPSORT put() */ - MDBX_txn *me_txn0; /* preallocated write transaction */ - - MDBX_dbx *me_dbxs; /* array of static DB info */ - uint16_t *me_dbflags; /* array 
of flags from MDBX_db.md_flags */ - MDBX_atomic_uint32_t *me_dbiseqs; /* array of dbi sequence numbers */ + struct { /* path to the DB files */ + pathchar_t *lck, *dxb, *specified; + void *buffer; + } me_pathname; + void *me_pbuf; /* scratch area for DUPSORT put() */ + MDBX_txn *me_txn0; /* preallocated write transaction */ + MDBX_dbx *me_dbxs; /* array of static DB info */ + uint16_t *__restrict me_db_flags; /* array of flags from MDBX_db.md_flags */ + MDBX_atomic_uint32_t *me_dbi_seqs; /* array of dbi sequence numbers */ unsigned me_maxgc_ov1page; /* Number of pgno_t fit in a single overflow page */ unsigned me_maxgc_per_branch; @@ -3689,6 +3735,7 @@ struct MDBX_env { unsigned rp_augment_limit; unsigned dp_limit; unsigned dp_initial; + uint64_t gc_time_limit; uint8_t dp_loose_limit; uint8_t spill_max_denominator; uint8_t spill_min_denominator; @@ -3698,6 +3745,8 @@ struct MDBX_env { unsigned writethrough_threshold; #endif /* Windows */ bool prefault_write; + bool prefer_waf_insteadof_balance; /* Strive to minimize WAF instead of + balancing pages fullment */ union { unsigned all; /* tracks options with non-auto values but tuned by user */ @@ -3727,20 +3776,23 @@ struct MDBX_env { } me_sysv_ipc; #endif /* MDBX_LOCKING == MDBX_LOCKING_SYSV */ bool me_incore; + bool me_prefault_write; - MDBX_env *me_lcklist_next; +#if MDBX_ENABLE_DBI_LOCKFREE + struct mdbx_defer_free_item *me_defer_free; +#endif /* MDBX_ENABLE_DBI_LOCKFREE */ /* --------------------------------------------------- mostly volatile part */ MDBX_txn *me_txn; /* current write transaction */ osal_fastmutex_t me_dbi_lock; - MDBX_dbi me_numdbs; /* number of DBs opened */ - bool me_prefault_write; + unsigned me_numdbs; /* number of DBs opened */ - MDBX_page *me_dp_reserve; /* list of malloc'ed blocks for re-use */ unsigned me_dp_reserve_len; + MDBX_page *__restrict me_dp_reserve; /* list of malloc'ed blocks for re-use */ + /* PNL of pages that became unused in a write txn */ - MDBX_PNL me_retired_pages; + 
MDBX_PNL __restrict me_retired_pages; osal_ioring_t me_ioring; #if defined(_WIN32) || defined(_WIN64) @@ -3758,13 +3810,12 @@ struct MDBX_env { #if MDBX_DEBUG MDBX_assert_func *me_assert_func; /* Callback for assertion failures */ #endif -#ifdef MDBX_USE_VALGRIND +#ifdef ENABLE_MEMCHECK int me_valgrind_handle; #endif -#if defined(MDBX_USE_VALGRIND) || defined(__SANITIZE_ADDRESS__) - MDBX_atomic_uint32_t me_ignore_EDEADLK; +#if defined(ENABLE_MEMCHECK) || defined(__SANITIZE_ADDRESS__) pgno_t me_poison_edge; -#endif /* MDBX_USE_VALGRIND || __SANITIZE_ADDRESS__ */ +#endif /* ENABLE_MEMCHECK || __SANITIZE_ADDRESS__ */ #ifndef xMDBX_DEBUG_SPILLING #define xMDBX_DEBUG_SPILLING 0 @@ -3824,10 +3875,6 @@ osal_flush_incoherent_mmap(const void *addr, size_t nbytes, MDBX_INTERNAL_FUNC int cleanup_dead_readers(MDBX_env *env, int rlocked, int *dead); -MDBX_INTERNAL_FUNC int rthc_alloc(osal_thread_key_t *key, MDBX_reader *begin, - MDBX_reader *end); -MDBX_INTERNAL_FUNC void rthc_remove(const osal_thread_key_t key); - MDBX_INTERNAL_FUNC void global_ctor(void); MDBX_INTERNAL_FUNC void osal_ctor(void); MDBX_INTERNAL_FUNC void global_dtor(void); @@ -3942,7 +3989,8 @@ typedef struct MDBX_node { /* mdbx_dbi_open() flags */ #define DB_USABLE_FLAGS (DB_PERSISTENT_FLAGS | MDBX_CREATE | MDBX_DB_ACCEDE) -#define DB_VALID 0x8000 /* DB handle is valid, for me_dbflags */ +#define DB_VALID 0x8000u /* DB handle is valid, for me_db_flags */ +#define DB_POISON 0x7fffu /* update pending */ #define DB_INTERNAL_FLAGS DB_VALID #if DB_INTERNAL_FLAGS & DB_USABLE_FLAGS @@ -4031,11 +4079,11 @@ log2n_powerof2(size_t value_uintptr) { * environment and re-opening it with the new flags. 
*/ #define ENV_CHANGEABLE_FLAGS \ (MDBX_SAFE_NOSYNC | MDBX_NOMETASYNC | MDBX_DEPRECATED_MAPASYNC | \ - MDBX_NOMEMINIT | MDBX_COALESCE | MDBX_PAGEPERTURB | MDBX_ACCEDE | \ - MDBX_VALIDATION) + MDBX_NOMEMINIT | MDBX_DEPRECATED_COALESCE | MDBX_PAGEPERTURB | \ + MDBX_ACCEDE | MDBX_VALIDATION) #define ENV_CHANGELESS_FLAGS \ - (MDBX_NOSUBDIR | MDBX_RDONLY | MDBX_WRITEMAP | MDBX_NOTLS | MDBX_NORDAHEAD | \ - MDBX_LIFORECLAIM | MDBX_EXCLUSIVE) + (MDBX_NOSUBDIR | MDBX_RDONLY | MDBX_WRITEMAP | MDBX_NOSTICKYTHREADS | \ + MDBX_NORDAHEAD | MDBX_LIFORECLAIM | MDBX_EXCLUSIVE) #define ENV_USABLE_FLAGS (ENV_CHANGEABLE_FLAGS | ENV_CHANGELESS_FLAGS) #if !defined(__cplusplus) || CONSTEXPR_ENUM_FLAGS_OPERATIONS @@ -4069,6 +4117,36 @@ MDBX_MAYBE_UNUSED static void static_checks(void) { ASAN_UNPOISON_MEMORY_REGION(addr, size); \ } while (0) +/******************************************************************************/ + +/** \brief Page types for traverse the b-tree. + * \see mdbx_env_pgwalk() \see MDBX_pgvisitor_func */ +enum MDBX_page_type_t { + MDBX_page_broken, + MDBX_page_large, + MDBX_page_branch, + MDBX_page_leaf, + MDBX_page_dupfixed_leaf, + MDBX_subpage_leaf, + MDBX_subpage_dupfixed_leaf, + MDBX_subpage_broken, +}; +typedef enum MDBX_page_type_t MDBX_page_type_t; + +typedef struct MDBX_walk_sdb { + MDBX_val name; + struct MDBX_db *internal, *nested; +} MDBX_walk_sdb_t; + +/** \brief Callback function for traverse the b-tree. 
\see mdbx_env_pgwalk() */ +typedef int +MDBX_pgvisitor_func(const size_t pgno, const unsigned number, void *const ctx, + const int deep, const MDBX_walk_sdb_t *subdb, + const size_t page_size, const MDBX_page_type_t page_type, + const MDBX_error_t err, const size_t nentries, + const size_t payload_bytes, const size_t header_bytes, + const size_t unused_bytes); + #include #if defined(_WIN32) || defined(_WIN64) diff --git a/mdbxdist/mdbx_dump.c b/mdbxdist/mdbx_dump.c index 588f588..9bf7e36 100644 --- a/mdbxdist/mdbx_dump.c +++ b/mdbxdist/mdbx_dump.c @@ -1,7 +1,7 @@ /* mdbx_dump.c - memory-mapped database dump tool */ /* - * Copyright 2015-2023 Leonid Yuriev + * Copyright 2015-2024 Leonid Yuriev * and other libmdbx authors: please see AUTHORS file. * All rights reserved. * @@ -22,7 +22,7 @@ #define xMDBX_TOOLS /* Avoid using internal eASSERT() */ /* - * Copyright 2015-2023 Leonid Yuriev + * Copyright 2015-2024 Leonid Yuriev * and other libmdbx authors: please see AUTHORS file. * All rights reserved. * @@ -34,7 +34,7 @@ * top-level directory of the distribution or, alternatively, at * . 
*/ -#define MDBX_BUILD_SOURCERY 0c86daff919dc09ab5e99c8e5bfb1d51e1367babca9c630c97af92a2aa988ea9_v0_12_9_16_gfff3fbd8 +#define MDBX_BUILD_SOURCERY 91ff5b5423830ee44fca4b70dcb298f233338a17a3185c44df67ec16d3034af3_v0_13_0_38_gf1975363 #ifdef MDBX_CONFIG_H #include MDBX_CONFIG_H #endif @@ -47,11 +47,13 @@ #ifdef xMDBX_ALLOY /* Amalgamated build */ #define MDBX_INTERNAL_FUNC static -#define MDBX_INTERNAL_VAR static +#define MDBX_INTERNAL_VAR_PROTO static +#define MDBX_INTERNAL_VAR_INSTA static #else /* Non-amalgamated build */ #define MDBX_INTERNAL_FUNC -#define MDBX_INTERNAL_VAR extern +#define MDBX_INTERNAL_VAR_PROTO extern +#define MDBX_INTERNAL_VAR_INSTA #endif /* xMDBX_ALLOY */ /*----------------------------------------------------------------------------*/ @@ -116,6 +118,10 @@ disable : 5105) /* winbase.h(9531): warning C5105: macro expansion \ producing 'defined' has undefined behavior */ #endif +#if _MSC_VER < 1920 +/* avoid "error C2219: syntax error: type qualifier must be after '*'" */ +#define __restrict +#endif #if _MSC_VER > 1930 #pragma warning(disable : 6235) /* is always a constant */ #pragma warning(disable : 6237) /* is never evaluated and might \ @@ -161,7 +167,7 @@ #include "mdbx.h" /* - * Copyright 2015-2023 Leonid Yuriev + * Copyright 2015-2024 Leonid Yuriev * and other libmdbx authors: please see AUTHORS file. * All rights reserved. 
* @@ -209,6 +215,7 @@ #include #include +#include #include #include #include @@ -846,7 +853,7 @@ __extern_C key_t ftok(const char *, int); /*----------------------------------------------------------------------------*/ -#if defined(MDBX_USE_VALGRIND) +#if defined(ENABLE_MEMCHECK) #include #ifndef VALGRIND_DISABLE_ADDR_ERROR_REPORTING_IN_RANGE /* LY: available since Valgrind 3.10 */ @@ -868,7 +875,7 @@ __extern_C key_t ftok(const char *, int); #define VALGRIND_CHECK_MEM_IS_ADDRESSABLE(a, s) (0) #define VALGRIND_CHECK_MEM_IS_DEFINED(a, s) (0) #define RUNNING_ON_VALGRIND (0) -#endif /* MDBX_USE_VALGRIND */ +#endif /* ENABLE_MEMCHECK */ #ifdef __SANITIZE_ADDRESS__ #include @@ -1015,7 +1022,7 @@ extern "C" { /* https://en.wikipedia.org/wiki/Operating_system_abstraction_layer */ /* - * Copyright 2015-2023 Leonid Yuriev + * Copyright 2015-2024 Leonid Yuriev * and other libmdbx authors: please see AUTHORS file. * All rights reserved. * @@ -1219,8 +1226,8 @@ typedef pthread_mutex_t osal_fastmutex_t; /*----------------------------------------------------------------------------*/ /* OS abstraction layer stuff */ -MDBX_INTERNAL_VAR unsigned sys_pagesize; -MDBX_MAYBE_UNUSED MDBX_INTERNAL_VAR unsigned sys_pagesize_ln2, +MDBX_INTERNAL_VAR_PROTO unsigned sys_pagesize; +MDBX_MAYBE_UNUSED MDBX_INTERNAL_VAR_PROTO unsigned sys_pagesize_ln2, sys_allocation_granularity; /* Get the size of a memory page for the system. 
@@ -1484,8 +1491,9 @@ MDBX_MAYBE_UNUSED static __inline void jitter4testing(bool tiny); #endif #if defined(__linux__) || defined(__gnu_linux__) -MDBX_INTERNAL_VAR uint32_t linux_kernel_version; -MDBX_INTERNAL_VAR bool mdbx_RunningOnWSL1 /* Windows Subsystem 1 for Linux */; +MDBX_INTERNAL_VAR_PROTO uint32_t linux_kernel_version; +MDBX_INTERNAL_VAR_PROTO bool + mdbx_RunningOnWSL1 /* Windows Subsystem 1 for Linux */; #endif /* Linux */ #ifndef osal_strdup @@ -1699,7 +1707,8 @@ MDBX_INTERNAL_FUNC int osal_lck_init(MDBX_env *env, /// restore POSIX-fcntl locks after the closing of file descriptors. /// \return Error code (MDBX_PANIC) or zero on success. MDBX_INTERNAL_FUNC int osal_lck_destroy(MDBX_env *env, - MDBX_env *inprocess_neighbor); + MDBX_env *inprocess_neighbor, + const uint32_t current_pid); /// \brief Connects to shared interprocess locking objects and tries to acquire /// the maximum lock level (shared if exclusive is not available) @@ -1727,6 +1736,8 @@ MDBX_INTERNAL_FUNC int osal_lck_seize(MDBX_env *env); /// operational lock. /// \return Error code or zero on success MDBX_INTERNAL_FUNC int osal_lck_downgrade(MDBX_env *env); +MDBX_MAYBE_UNUSED MDBX_INTERNAL_FUNC int osal_lck_upgrade(MDBX_env *env, + bool dont_wait); /// \brief Locks LCK-file or/and table of readers for (de)registering. /// \return Error code or zero on success @@ -1735,16 +1746,12 @@ MDBX_INTERNAL_FUNC int osal_rdt_lock(MDBX_env *env); /// \brief Unlocks LCK-file or/and table of readers after (de)registering. MDBX_INTERNAL_FUNC void osal_rdt_unlock(MDBX_env *env); -/// \brief Acquires lock for DB change (on writing transaction start) -/// Reading transactions will not be blocked. -/// Declared as LIBMDBX_API because it is used in mdbx_chk. +/// \brief Acquires write-transaction lock. 
/// \return Error code or zero on success -LIBMDBX_API int mdbx_txn_lock(MDBX_env *env, bool dont_wait); +MDBX_INTERNAL_FUNC int osal_txn_lock(MDBX_env *env, bool dont_wait); -/// \brief Releases lock once DB changes is made (after writing transaction -/// has finished). -/// Declared as LIBMDBX_API because it is used in mdbx_chk. -LIBMDBX_API void mdbx_txn_unlock(MDBX_env *env); +/// \brief Releases write-transaction lock.. +MDBX_INTERNAL_FUNC void osal_txn_unlock(MDBX_env *env); /// \brief Sets alive-flag of reader presence (indicative lock) for PID of /// the current process. The function does no more than needed for @@ -1773,7 +1780,7 @@ MDBX_INTERNAL_FUNC int osal_rpid_check(MDBX_env *env, uint32_t pid); MDBX_INTERNAL_FUNC int osal_mb2w(const char *const src, wchar_t **const pdst); typedef void(WINAPI *osal_srwlock_t_function)(osal_srwlock_t *); -MDBX_INTERNAL_VAR osal_srwlock_t_function osal_srwlock_Init, +MDBX_INTERNAL_VAR_PROTO osal_srwlock_t_function osal_srwlock_Init, osal_srwlock_AcquireShared, osal_srwlock_ReleaseShared, osal_srwlock_AcquireExclusive, osal_srwlock_ReleaseExclusive; @@ -1826,7 +1833,7 @@ typedef struct _FILE_REMOTE_PROTOCOL_INFO { typedef BOOL(WINAPI *MDBX_GetFileInformationByHandleEx)( _In_ HANDLE hFile, _In_ FILE_INFO_BY_HANDLE_CLASS FileInformationClass, _Out_ LPVOID lpFileInformation, _In_ DWORD dwBufferSize); -MDBX_INTERNAL_VAR MDBX_GetFileInformationByHandleEx +MDBX_INTERNAL_VAR_PROTO MDBX_GetFileInformationByHandleEx mdbx_GetFileInformationByHandleEx; typedef BOOL(WINAPI *MDBX_GetVolumeInformationByHandleW)( @@ -1835,19 +1842,20 @@ typedef BOOL(WINAPI *MDBX_GetVolumeInformationByHandleW)( _Out_opt_ LPDWORD lpMaximumComponentLength, _Out_opt_ LPDWORD lpFileSystemFlags, _Out_opt_ LPWSTR lpFileSystemNameBuffer, _In_ DWORD nFileSystemNameSize); -MDBX_INTERNAL_VAR MDBX_GetVolumeInformationByHandleW +MDBX_INTERNAL_VAR_PROTO MDBX_GetVolumeInformationByHandleW mdbx_GetVolumeInformationByHandleW; typedef DWORD(WINAPI 
*MDBX_GetFinalPathNameByHandleW)(_In_ HANDLE hFile, _Out_ LPWSTR lpszFilePath, _In_ DWORD cchFilePath, _In_ DWORD dwFlags); -MDBX_INTERNAL_VAR MDBX_GetFinalPathNameByHandleW mdbx_GetFinalPathNameByHandleW; +MDBX_INTERNAL_VAR_PROTO MDBX_GetFinalPathNameByHandleW + mdbx_GetFinalPathNameByHandleW; typedef BOOL(WINAPI *MDBX_SetFileInformationByHandle)( _In_ HANDLE hFile, _In_ FILE_INFO_BY_HANDLE_CLASS FileInformationClass, _Out_ LPVOID lpFileInformation, _In_ DWORD dwBufferSize); -MDBX_INTERNAL_VAR MDBX_SetFileInformationByHandle +MDBX_INTERNAL_VAR_PROTO MDBX_SetFileInformationByHandle mdbx_SetFileInformationByHandle; typedef NTSTATUS(NTAPI *MDBX_NtFsControlFile)( @@ -1856,10 +1864,10 @@ typedef NTSTATUS(NTAPI *MDBX_NtFsControlFile)( OUT PIO_STATUS_BLOCK IoStatusBlock, IN ULONG FsControlCode, IN OUT PVOID InputBuffer, IN ULONG InputBufferLength, OUT OPTIONAL PVOID OutputBuffer, IN ULONG OutputBufferLength); -MDBX_INTERNAL_VAR MDBX_NtFsControlFile mdbx_NtFsControlFile; +MDBX_INTERNAL_VAR_PROTO MDBX_NtFsControlFile mdbx_NtFsControlFile; typedef uint64_t(WINAPI *MDBX_GetTickCount64)(void); -MDBX_INTERNAL_VAR MDBX_GetTickCount64 mdbx_GetTickCount64; +MDBX_INTERNAL_VAR_PROTO MDBX_GetTickCount64 mdbx_GetTickCount64; #if !defined(_WIN32_WINNT_WIN8) || _WIN32_WINNT < _WIN32_WINNT_WIN8 typedef struct _WIN32_MEMORY_RANGE_ENTRY { @@ -1871,13 +1879,13 @@ typedef struct _WIN32_MEMORY_RANGE_ENTRY { typedef BOOL(WINAPI *MDBX_PrefetchVirtualMemory)( HANDLE hProcess, ULONG_PTR NumberOfEntries, PWIN32_MEMORY_RANGE_ENTRY VirtualAddresses, ULONG Flags); -MDBX_INTERNAL_VAR MDBX_PrefetchVirtualMemory mdbx_PrefetchVirtualMemory; +MDBX_INTERNAL_VAR_PROTO MDBX_PrefetchVirtualMemory mdbx_PrefetchVirtualMemory; typedef enum _SECTION_INHERIT { ViewShare = 1, ViewUnmap = 2 } SECTION_INHERIT; typedef NTSTATUS(NTAPI *MDBX_NtExtendSection)(IN HANDLE SectionHandle, IN PLARGE_INTEGER NewSectionSize); -MDBX_INTERNAL_VAR MDBX_NtExtendSection mdbx_NtExtendSection; +MDBX_INTERNAL_VAR_PROTO 
MDBX_NtExtendSection mdbx_NtExtendSection; static __inline bool mdbx_RunningUnderWine(void) { return !mdbx_NtExtendSection; @@ -1887,14 +1895,15 @@ typedef LSTATUS(WINAPI *MDBX_RegGetValueA)(HKEY hkey, LPCSTR lpSubKey, LPCSTR lpValue, DWORD dwFlags, LPDWORD pdwType, PVOID pvData, LPDWORD pcbData); -MDBX_INTERNAL_VAR MDBX_RegGetValueA mdbx_RegGetValueA; +MDBX_INTERNAL_VAR_PROTO MDBX_RegGetValueA mdbx_RegGetValueA; NTSYSAPI ULONG RtlRandomEx(PULONG Seed); typedef BOOL(WINAPI *MDBX_SetFileIoOverlappedRange)(HANDLE FileHandle, PUCHAR OverlappedRangeStart, ULONG Length); -MDBX_INTERNAL_VAR MDBX_SetFileIoOverlappedRange mdbx_SetFileIoOverlappedRange; +MDBX_INTERNAL_VAR_PROTO MDBX_SetFileIoOverlappedRange + mdbx_SetFileIoOverlappedRange; #endif /* Windows */ @@ -2124,7 +2133,7 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor; #error MDBX_DPL_PREALLOC_FOR_RADIXSORT must be defined as 0 or 1 #endif /* MDBX_DPL_PREALLOC_FOR_RADIXSORT */ -/** Controls dirty pages tracking, spilling and persisting in MDBX_WRITEMAP +/** Controls dirty pages tracking, spilling and persisting in `MDBX_WRITEMAP` * mode. 0/OFF = Don't track dirty pages at all, don't spill ones, and use * msync() to persist data. This is by-default on Linux and other systems where * kernel provides properly LRU tracking and effective flushing on-demand. 1/ON @@ -2141,6 +2150,22 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor; #error MDBX_AVOID_MSYNC must be defined as 0 or 1 #endif /* MDBX_AVOID_MSYNC */ +/** Управляет механизмом поддержки разреженных наборов DBI-хендлов для снижения + * накладных расходов при запуске и обработке транзакций. 
*/ +#ifndef MDBX_ENABLE_DBI_SPARSE +#define MDBX_ENABLE_DBI_SPARSE 1 +#elif !(MDBX_ENABLE_DBI_SPARSE == 0 || MDBX_ENABLE_DBI_SPARSE == 1) +#error MDBX_ENABLE_DBI_SPARSE must be defined as 0 or 1 +#endif /* MDBX_ENABLE_DBI_SPARSE */ + +/** Управляет механизмом отложенного освобождения и поддержки пути быстрого + * открытия DBI-хендлов без захвата блокировок. */ +#ifndef MDBX_ENABLE_DBI_LOCKFREE +#define MDBX_ENABLE_DBI_LOCKFREE 1 +#elif !(MDBX_ENABLE_DBI_LOCKFREE == 0 || MDBX_ENABLE_DBI_LOCKFREE == 1) +#error MDBX_ENABLE_DBI_LOCKFREE must be defined as 0 or 1 +#endif /* MDBX_ENABLE_DBI_LOCKFREE */ + /** Controls sort order of internal page number lists. * This mostly experimental/advanced option with not for regular MDBX users. * \warning The database format depend on this option and libmdbx built with @@ -2188,8 +2213,8 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor; /** If defined then enables integration with Valgrind, * a memory analyzing tool. */ -#ifndef MDBX_USE_VALGRIND -#endif /* MDBX_USE_VALGRIND */ +#ifndef ENABLE_MEMCHECK +#endif /* ENABLE_MEMCHECK */ /** If defined then enables use C11 atomics, * otherwise detects ones availability automatically. 
*/ @@ -2512,13 +2537,23 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor; #define MDBX_RUNTIME_FLAGS_INIT \ ((MDBX_DEBUG) > 0) * MDBX_DBG_ASSERT + ((MDBX_DEBUG) > 1) * MDBX_DBG_AUDIT -extern uint8_t runtime_flags; -extern uint8_t loglevel; -extern MDBX_debug_func *debug_logger; +union logger_union { + void *ptr; + MDBX_debug_func *fmt; + MDBX_debug_func_nofmt *nofmt; +}; + +MDBX_INTERNAL_VAR_PROTO struct mdbx_static { + uint8_t flags; + uint8_t loglevel; + union logger_union logger; + size_t logger_buffer_size; + char *logger_buffer; +} mdbx_static; MDBX_MAYBE_UNUSED static __inline void jitter4testing(bool tiny) { #if MDBX_DEBUG - if (MDBX_DBG_JITTER & runtime_flags) + if (MDBX_DBG_JITTER & mdbx_static.flags) osal_jitter(tiny); #else (void)tiny; @@ -2532,17 +2567,17 @@ MDBX_INTERNAL_FUNC void debug_log_va(int level, const char *function, int line, const char *fmt, va_list args); #if MDBX_DEBUG -#define LOG_ENABLED(msg) unlikely(msg <= loglevel) -#define AUDIT_ENABLED() unlikely((runtime_flags & MDBX_DBG_AUDIT)) +#define LOG_ENABLED(msg) unlikely(msg <= mdbx_static.loglevel) +#define AUDIT_ENABLED() unlikely((mdbx_static.flags & MDBX_DBG_AUDIT)) #else /* MDBX_DEBUG */ -#define LOG_ENABLED(msg) (msg < MDBX_LOG_VERBOSE && msg <= loglevel) +#define LOG_ENABLED(msg) (msg < MDBX_LOG_VERBOSE && msg <= mdbx_static.loglevel) #define AUDIT_ENABLED() (0) #endif /* MDBX_DEBUG */ #if MDBX_FORCE_ASSERTIONS #define ASSERT_ENABLED() (1) #elif MDBX_DEBUG -#define ASSERT_ENABLED() likely((runtime_flags & MDBX_DBG_ASSERT)) +#define ASSERT_ENABLED() likely((mdbx_static.flags & MDBX_DBG_ASSERT)) #else #define ASSERT_ENABLED() (0) #endif /* assertions */ @@ -2977,7 +3012,8 @@ typedef struct MDBX_page { #define PAGETYPE_WHOLE(p) ((uint8_t)(p)->mp_flags) -/* Drop legacy P_DIRTY flag for sub-pages for compatilibity */ +/* Drop legacy P_DIRTY flag for sub-pages for compatilibity, + * for assertions only. */ #define PAGETYPE_COMPAT(p) \ (unlikely(PAGETYPE_WHOLE(p) & P_SUBP) \ ? 
PAGETYPE_WHOLE(p) & ~(P_SUBP | P_LEGACY_DIRTY) \ @@ -3086,7 +3122,7 @@ typedef sem_t osal_ipclock_t; #endif /* MDBX_LOCKING */ #if MDBX_LOCKING > MDBX_LOCKING_SYSV && !defined(__cplusplus) -MDBX_INTERNAL_FUNC int osal_ipclock_stub(osal_ipclock_t *ipc); +MDBX_INTERNAL_FUNC int osal_ipclock_stubinit(osal_ipclock_t *ipc); MDBX_INTERNAL_FUNC int osal_ipclock_destroy(osal_ipclock_t *ipc); #endif /* MDBX_LOCKING */ @@ -3099,8 +3135,9 @@ MDBX_INTERNAL_FUNC int osal_ipclock_destroy(osal_ipclock_t *ipc); * read transactions started by the same thread need no further locking to * proceed. * - * If MDBX_NOTLS is set, the slot address is not saved in thread-specific data. - * No reader table is used if the database is on a read-only filesystem. + * If MDBX_NOSTICKYTHREADS is set, the slot address is not saved in + * thread-specific data. No reader table is used if the database is on a + * read-only filesystem. * * Since the database uses multi-version concurrency control, readers don't * actually need any locking. 
This table is used to keep track of which @@ -3410,10 +3447,10 @@ typedef struct troika { #if MDBX_WORDBITS > 32 /* Workaround for false-positives from Valgrind */ uint32_t unused_pad; #endif -#define TROIKA_HAVE_STEADY(troika) ((troika)->fsm & 7) -#define TROIKA_STRICT_VALID(troika) ((troika)->tail_and_flags & 64) -#define TROIKA_VALID(troika) ((troika)->tail_and_flags & 128) -#define TROIKA_TAIL(troika) ((troika)->tail_and_flags & 3) +#define TROIKA_HAVE_STEADY(troika) ((troika)->fsm & 7u) +#define TROIKA_STRICT_VALID(troika) ((troika)->tail_and_flags & 64u) +#define TROIKA_VALID(troika) ((troika)->tail_and_flags & 128u) +#define TROIKA_TAIL(troika) ((troika)->tail_and_flags & 3u) txnid_t txnid[NUM_METAS]; } meta_troika_t; @@ -3443,6 +3480,8 @@ struct MDBX_txn { #error "Oops, some txn flags overlapped or wrong" #endif uint32_t mt_flags; + unsigned mt_numdbs; + size_t mt_owner; /* thread ID that owns this transaction */ MDBX_txn *mt_parent; /* parent of a nested txn */ /* Nested txn under this txn, set together with flag MDBX_TXN_HAS_CHILD */ @@ -3460,31 +3499,30 @@ struct MDBX_txn { txnid_t mt_front; MDBX_env *mt_env; /* the DB environment */ - /* Array of records for each DB known in the environment. 
*/ - MDBX_dbx *mt_dbxs; /* Array of MDBX_db records for each known DB */ MDBX_db *mt_dbs; - /* Array of sequence numbers for each DB handle */ - MDBX_atomic_uint32_t *mt_dbiseqs; - - /* Transaction DBI Flags */ -#define DBI_DIRTY MDBX_DBI_DIRTY /* DB was written in this txn */ -#define DBI_STALE MDBX_DBI_STALE /* Named-DB record is older than txnID */ -#define DBI_FRESH MDBX_DBI_FRESH /* Named-DB handle opened in this txn */ -#define DBI_CREAT MDBX_DBI_CREAT /* Named-DB handle created in this txn */ -#define DBI_VALID 0x10 /* DB handle is valid, see also DB_VALID */ -#define DBI_USRVALID 0x20 /* As DB_VALID, but not set for FREE_DBI */ -#define DBI_AUDITED 0x40 /* Internal flag for accounting during audit */ - /* Array of flags for each DB */ - uint8_t *mt_dbistate; - /* Number of DB records in use, or 0 when the txn is finished. - * This number only ever increments until the txn finishes; we - * don't decrement it when individual DB handles are closed. */ - MDBX_dbi mt_numdbs; - size_t mt_owner; /* thread ID that owns this transaction */ + +#if MDBX_ENABLE_DBI_SPARSE + unsigned *__restrict mt_dbi_sparse; +#endif /* MDBX_ENABLE_DBI_SPARSE */ + + /* Non-shared DBI state flags inside transaction */ +#define DBI_DIRTY 0x01 /* DB was written in this txn */ +#define DBI_STALE 0x02 /* Named-DB record is older than txnID */ +#define DBI_FRESH 0x04 /* Named-DB handle opened in this txn */ +#define DBI_CREAT 0x08 /* Named-DB handle created in this txn */ +#define DBI_VALID 0x10 /* Handle is valid, see also DB_VALID */ +#define DBI_OLDEN 0x40 /* Handle was closed/reopened outside txn */ +#define DBI_LINDO 0x80 /* Lazy initialization done for DBI-slot */ + /* Array of non-shared txn's flags of DBI */ + uint8_t *__restrict mt_dbi_state; + + /* Array of sequence numbers for each DB handle. 
*/ + uint32_t *__restrict mt_dbi_seqs; + MDBX_cursor **mt_cursors; + MDBX_canary mt_canary; void *mt_userctx; /* User-settable context */ - MDBX_cursor **mt_cursors; union { struct { @@ -3494,8 +3532,8 @@ struct MDBX_txn { struct { meta_troika_t troika; /* In write txns, array of cursors for each DB */ - MDBX_PNL relist; /* Reclaimed GC pages */ - txnid_t last_reclaimed; /* ID of last used record */ + MDBX_PNL __restrict relist; /* Reclaimed GC pages */ + txnid_t last_reclaimed; /* ID of last used record */ #if MDBX_ENABLE_REFUND pgno_t loose_refund_wl /* FIXME: describe */; #endif /* MDBX_ENABLE_REFUND */ @@ -3507,14 +3545,14 @@ struct MDBX_txn { * dirtylist into mt_parent after freeing hidden mt_parent pages. */ size_t dirtyroom; /* For write txns: Modified pages. Sorted when not MDBX_WRITEMAP. */ - MDBX_dpl *dirtylist; + MDBX_dpl *__restrict dirtylist; /* The list of reclaimed txns from GC */ - MDBX_TXL lifo_reclaimed; + MDBX_TXL __restrict lifo_reclaimed; /* The list of pages that became unused during this transaction. */ - MDBX_PNL retired_pages; + MDBX_PNL __restrict retired_pages; /* The list of loose pages that became unused and may be reused * in this transaction, linked through `mp_next`. */ - MDBX_page *loose_pages; + MDBX_page *__restrict loose_pages; /* Number of loose pages (tw.loose_pages) */ size_t loose_count; union { @@ -3523,11 +3561,12 @@ struct MDBX_txn { /* The sorted list of dirty pages we temporarily wrote to disk * because the dirty list was full. page numbers in here are * shifted left by 1, deleted slots have the LSB set. 
*/ - MDBX_PNL list; + MDBX_PNL __restrict list; } spilled; size_t writemap_dirty_npages; size_t writemap_spilled_npages; }; + uint64_t gc_time_acc; } tw; }; }; @@ -3566,8 +3605,8 @@ struct MDBX_cursor { MDBX_db *mc_db; /* The database auxiliary record for this cursor */ MDBX_dbx *mc_dbx; - /* The mt_dbistate for this database */ - uint8_t *mc_dbistate; + /* The mt_dbi_state[] for this DBI */ + uint8_t *__restrict mc_dbi_state; uint8_t mc_snum; /* number of pushed pages */ uint8_t mc_top; /* index of top page, normally mc_snum-1 */ @@ -3620,6 +3659,11 @@ typedef struct MDBX_cursor_couple { MDBX_xcursor inner; } MDBX_cursor_couple; +struct mdbx_defer_free_item { + struct mdbx_defer_free_item *next; + uint64_t timestamp; +}; + /* The database environment. */ struct MDBX_env { /* ----------------------------------------------------- mostly static part */ @@ -3637,6 +3681,7 @@ struct MDBX_env { #define MDBX_DEPRECATED_COALESCE UINT32_C(0x2000000) #define ENV_INTERNAL_FLAGS (MDBX_FATAL_ERROR | MDBX_ENV_ACTIVE | MDBX_ENV_TXKEY) uint32_t me_flags; + unsigned me_psize; /* DB page size, initialized from me_os_psize */ osal_mmap_t me_dxb_mmap; /* The main data file */ #define me_map me_dxb_mmap.base #define me_lazy_fd me_dxb_mmap.fd @@ -3649,7 +3694,6 @@ struct MDBX_env { #define me_lfd me_lck_mmap.fd struct MDBX_lockinfo *me_lck; - unsigned me_psize; /* DB page size, initialized from me_os_psize */ uint16_t me_leaf_nodemax; /* max size of a leaf-node */ uint16_t me_branch_nodemax; /* max size of a branch-node */ uint16_t me_subpage_limit; @@ -3667,13 +3711,15 @@ struct MDBX_env { MDBX_dbi me_maxdbs; /* size of the DB table */ uint32_t me_pid; /* process ID of this env */ osal_thread_key_t me_txkey; /* thread-key for readers */ - pathchar_t *me_pathname; /* path to the DB files */ - void *me_pbuf; /* scratch area for DUPSORT put() */ - MDBX_txn *me_txn0; /* preallocated write transaction */ - - MDBX_dbx *me_dbxs; /* array of static DB info */ - uint16_t *me_dbflags; /* array 
of flags from MDBX_db.md_flags */ - MDBX_atomic_uint32_t *me_dbiseqs; /* array of dbi sequence numbers */ + struct { /* path to the DB files */ + pathchar_t *lck, *dxb, *specified; + void *buffer; + } me_pathname; + void *me_pbuf; /* scratch area for DUPSORT put() */ + MDBX_txn *me_txn0; /* preallocated write transaction */ + MDBX_dbx *me_dbxs; /* array of static DB info */ + uint16_t *__restrict me_db_flags; /* array of flags from MDBX_db.md_flags */ + MDBX_atomic_uint32_t *me_dbi_seqs; /* array of dbi sequence numbers */ unsigned me_maxgc_ov1page; /* Number of pgno_t fit in a single overflow page */ unsigned me_maxgc_per_branch; @@ -3687,6 +3733,7 @@ struct MDBX_env { unsigned rp_augment_limit; unsigned dp_limit; unsigned dp_initial; + uint64_t gc_time_limit; uint8_t dp_loose_limit; uint8_t spill_max_denominator; uint8_t spill_min_denominator; @@ -3696,6 +3743,8 @@ struct MDBX_env { unsigned writethrough_threshold; #endif /* Windows */ bool prefault_write; + bool prefer_waf_insteadof_balance; /* Strive to minimize WAF instead of + balancing pages fullment */ union { unsigned all; /* tracks options with non-auto values but tuned by user */ @@ -3725,20 +3774,23 @@ struct MDBX_env { } me_sysv_ipc; #endif /* MDBX_LOCKING == MDBX_LOCKING_SYSV */ bool me_incore; + bool me_prefault_write; - MDBX_env *me_lcklist_next; +#if MDBX_ENABLE_DBI_LOCKFREE + struct mdbx_defer_free_item *me_defer_free; +#endif /* MDBX_ENABLE_DBI_LOCKFREE */ /* --------------------------------------------------- mostly volatile part */ MDBX_txn *me_txn; /* current write transaction */ osal_fastmutex_t me_dbi_lock; - MDBX_dbi me_numdbs; /* number of DBs opened */ - bool me_prefault_write; + unsigned me_numdbs; /* number of DBs opened */ - MDBX_page *me_dp_reserve; /* list of malloc'ed blocks for re-use */ unsigned me_dp_reserve_len; + MDBX_page *__restrict me_dp_reserve; /* list of malloc'ed blocks for re-use */ + /* PNL of pages that became unused in a write txn */ - MDBX_PNL me_retired_pages; + 
MDBX_PNL __restrict me_retired_pages; osal_ioring_t me_ioring; #if defined(_WIN32) || defined(_WIN64) @@ -3756,13 +3808,12 @@ struct MDBX_env { #if MDBX_DEBUG MDBX_assert_func *me_assert_func; /* Callback for assertion failures */ #endif -#ifdef MDBX_USE_VALGRIND +#ifdef ENABLE_MEMCHECK int me_valgrind_handle; #endif -#if defined(MDBX_USE_VALGRIND) || defined(__SANITIZE_ADDRESS__) - MDBX_atomic_uint32_t me_ignore_EDEADLK; +#if defined(ENABLE_MEMCHECK) || defined(__SANITIZE_ADDRESS__) pgno_t me_poison_edge; -#endif /* MDBX_USE_VALGRIND || __SANITIZE_ADDRESS__ */ +#endif /* ENABLE_MEMCHECK || __SANITIZE_ADDRESS__ */ #ifndef xMDBX_DEBUG_SPILLING #define xMDBX_DEBUG_SPILLING 0 @@ -3822,10 +3873,6 @@ osal_flush_incoherent_mmap(const void *addr, size_t nbytes, MDBX_INTERNAL_FUNC int cleanup_dead_readers(MDBX_env *env, int rlocked, int *dead); -MDBX_INTERNAL_FUNC int rthc_alloc(osal_thread_key_t *key, MDBX_reader *begin, - MDBX_reader *end); -MDBX_INTERNAL_FUNC void rthc_remove(const osal_thread_key_t key); - MDBX_INTERNAL_FUNC void global_ctor(void); MDBX_INTERNAL_FUNC void osal_ctor(void); MDBX_INTERNAL_FUNC void global_dtor(void); @@ -3940,7 +3987,8 @@ typedef struct MDBX_node { /* mdbx_dbi_open() flags */ #define DB_USABLE_FLAGS (DB_PERSISTENT_FLAGS | MDBX_CREATE | MDBX_DB_ACCEDE) -#define DB_VALID 0x8000 /* DB handle is valid, for me_dbflags */ +#define DB_VALID 0x8000u /* DB handle is valid, for me_db_flags */ +#define DB_POISON 0x7fffu /* update pending */ #define DB_INTERNAL_FLAGS DB_VALID #if DB_INTERNAL_FLAGS & DB_USABLE_FLAGS @@ -4029,11 +4077,11 @@ log2n_powerof2(size_t value_uintptr) { * environment and re-opening it with the new flags. 
*/ #define ENV_CHANGEABLE_FLAGS \ (MDBX_SAFE_NOSYNC | MDBX_NOMETASYNC | MDBX_DEPRECATED_MAPASYNC | \ - MDBX_NOMEMINIT | MDBX_COALESCE | MDBX_PAGEPERTURB | MDBX_ACCEDE | \ - MDBX_VALIDATION) + MDBX_NOMEMINIT | MDBX_DEPRECATED_COALESCE | MDBX_PAGEPERTURB | \ + MDBX_ACCEDE | MDBX_VALIDATION) #define ENV_CHANGELESS_FLAGS \ - (MDBX_NOSUBDIR | MDBX_RDONLY | MDBX_WRITEMAP | MDBX_NOTLS | MDBX_NORDAHEAD | \ - MDBX_LIFORECLAIM | MDBX_EXCLUSIVE) + (MDBX_NOSUBDIR | MDBX_RDONLY | MDBX_WRITEMAP | MDBX_NOSTICKYTHREADS | \ + MDBX_NORDAHEAD | MDBX_LIFORECLAIM | MDBX_EXCLUSIVE) #define ENV_USABLE_FLAGS (ENV_CHANGEABLE_FLAGS | ENV_CHANGELESS_FLAGS) #if !defined(__cplusplus) || CONSTEXPR_ENUM_FLAGS_OPERATIONS @@ -4067,6 +4115,36 @@ MDBX_MAYBE_UNUSED static void static_checks(void) { ASAN_UNPOISON_MEMORY_REGION(addr, size); \ } while (0) +/******************************************************************************/ + +/** \brief Page types for traverse the b-tree. + * \see mdbx_env_pgwalk() \see MDBX_pgvisitor_func */ +enum MDBX_page_type_t { + MDBX_page_broken, + MDBX_page_large, + MDBX_page_branch, + MDBX_page_leaf, + MDBX_page_dupfixed_leaf, + MDBX_subpage_leaf, + MDBX_subpage_dupfixed_leaf, + MDBX_subpage_broken, +}; +typedef enum MDBX_page_type_t MDBX_page_type_t; + +typedef struct MDBX_walk_sdb { + MDBX_val name; + struct MDBX_db *internal, *nested; +} MDBX_walk_sdb_t; + +/** \brief Callback function for traverse the b-tree. 
\see mdbx_env_pgwalk() */ +typedef int +MDBX_pgvisitor_func(const size_t pgno, const unsigned number, void *const ctx, + const int deep, const MDBX_walk_sdb_t *subdb, + const size_t page_size, const MDBX_page_type_t page_type, + const MDBX_error_t err, const size_t nentries, + const size_t payload_bytes, const size_t header_bytes, + const size_t unused_bytes); + #include #define PRINT 1 diff --git a/mdbxdist/mdbx_load.c b/mdbxdist/mdbx_load.c index b7eb75d..666b27d 100644 --- a/mdbxdist/mdbx_load.c +++ b/mdbxdist/mdbx_load.c @@ -1,7 +1,7 @@ /* mdbx_load.c - memory-mapped database load tool */ /* - * Copyright 2015-2023 Leonid Yuriev + * Copyright 2015-2024 Leonid Yuriev * and other libmdbx authors: please see AUTHORS file. * All rights reserved. * @@ -22,7 +22,7 @@ #define xMDBX_TOOLS /* Avoid using internal eASSERT() */ /* - * Copyright 2015-2023 Leonid Yuriev + * Copyright 2015-2024 Leonid Yuriev * and other libmdbx authors: please see AUTHORS file. * All rights reserved. * @@ -34,7 +34,7 @@ * top-level directory of the distribution or, alternatively, at * . 
*/ -#define MDBX_BUILD_SOURCERY 0c86daff919dc09ab5e99c8e5bfb1d51e1367babca9c630c97af92a2aa988ea9_v0_12_9_16_gfff3fbd8 +#define MDBX_BUILD_SOURCERY 91ff5b5423830ee44fca4b70dcb298f233338a17a3185c44df67ec16d3034af3_v0_13_0_38_gf1975363 #ifdef MDBX_CONFIG_H #include MDBX_CONFIG_H #endif @@ -47,11 +47,13 @@ #ifdef xMDBX_ALLOY /* Amalgamated build */ #define MDBX_INTERNAL_FUNC static -#define MDBX_INTERNAL_VAR static +#define MDBX_INTERNAL_VAR_PROTO static +#define MDBX_INTERNAL_VAR_INSTA static #else /* Non-amalgamated build */ #define MDBX_INTERNAL_FUNC -#define MDBX_INTERNAL_VAR extern +#define MDBX_INTERNAL_VAR_PROTO extern +#define MDBX_INTERNAL_VAR_INSTA #endif /* xMDBX_ALLOY */ /*----------------------------------------------------------------------------*/ @@ -116,6 +118,10 @@ disable : 5105) /* winbase.h(9531): warning C5105: macro expansion \ producing 'defined' has undefined behavior */ #endif +#if _MSC_VER < 1920 +/* avoid "error C2219: syntax error: type qualifier must be after '*'" */ +#define __restrict +#endif #if _MSC_VER > 1930 #pragma warning(disable : 6235) /* is always a constant */ #pragma warning(disable : 6237) /* is never evaluated and might \ @@ -161,7 +167,7 @@ #include "mdbx.h" /* - * Copyright 2015-2023 Leonid Yuriev + * Copyright 2015-2024 Leonid Yuriev * and other libmdbx authors: please see AUTHORS file. * All rights reserved. 
* @@ -209,6 +215,7 @@ #include #include +#include #include #include #include @@ -846,7 +853,7 @@ __extern_C key_t ftok(const char *, int); /*----------------------------------------------------------------------------*/ -#if defined(MDBX_USE_VALGRIND) +#if defined(ENABLE_MEMCHECK) #include #ifndef VALGRIND_DISABLE_ADDR_ERROR_REPORTING_IN_RANGE /* LY: available since Valgrind 3.10 */ @@ -868,7 +875,7 @@ __extern_C key_t ftok(const char *, int); #define VALGRIND_CHECK_MEM_IS_ADDRESSABLE(a, s) (0) #define VALGRIND_CHECK_MEM_IS_DEFINED(a, s) (0) #define RUNNING_ON_VALGRIND (0) -#endif /* MDBX_USE_VALGRIND */ +#endif /* ENABLE_MEMCHECK */ #ifdef __SANITIZE_ADDRESS__ #include @@ -1015,7 +1022,7 @@ extern "C" { /* https://en.wikipedia.org/wiki/Operating_system_abstraction_layer */ /* - * Copyright 2015-2023 Leonid Yuriev + * Copyright 2015-2024 Leonid Yuriev * and other libmdbx authors: please see AUTHORS file. * All rights reserved. * @@ -1219,8 +1226,8 @@ typedef pthread_mutex_t osal_fastmutex_t; /*----------------------------------------------------------------------------*/ /* OS abstraction layer stuff */ -MDBX_INTERNAL_VAR unsigned sys_pagesize; -MDBX_MAYBE_UNUSED MDBX_INTERNAL_VAR unsigned sys_pagesize_ln2, +MDBX_INTERNAL_VAR_PROTO unsigned sys_pagesize; +MDBX_MAYBE_UNUSED MDBX_INTERNAL_VAR_PROTO unsigned sys_pagesize_ln2, sys_allocation_granularity; /* Get the size of a memory page for the system. 
@@ -1484,8 +1491,9 @@ MDBX_MAYBE_UNUSED static __inline void jitter4testing(bool tiny); #endif #if defined(__linux__) || defined(__gnu_linux__) -MDBX_INTERNAL_VAR uint32_t linux_kernel_version; -MDBX_INTERNAL_VAR bool mdbx_RunningOnWSL1 /* Windows Subsystem 1 for Linux */; +MDBX_INTERNAL_VAR_PROTO uint32_t linux_kernel_version; +MDBX_INTERNAL_VAR_PROTO bool + mdbx_RunningOnWSL1 /* Windows Subsystem 1 for Linux */; #endif /* Linux */ #ifndef osal_strdup @@ -1699,7 +1707,8 @@ MDBX_INTERNAL_FUNC int osal_lck_init(MDBX_env *env, /// restore POSIX-fcntl locks after the closing of file descriptors. /// \return Error code (MDBX_PANIC) or zero on success. MDBX_INTERNAL_FUNC int osal_lck_destroy(MDBX_env *env, - MDBX_env *inprocess_neighbor); + MDBX_env *inprocess_neighbor, + const uint32_t current_pid); /// \brief Connects to shared interprocess locking objects and tries to acquire /// the maximum lock level (shared if exclusive is not available) @@ -1727,6 +1736,8 @@ MDBX_INTERNAL_FUNC int osal_lck_seize(MDBX_env *env); /// operational lock. /// \return Error code or zero on success MDBX_INTERNAL_FUNC int osal_lck_downgrade(MDBX_env *env); +MDBX_MAYBE_UNUSED MDBX_INTERNAL_FUNC int osal_lck_upgrade(MDBX_env *env, + bool dont_wait); /// \brief Locks LCK-file or/and table of readers for (de)registering. /// \return Error code or zero on success @@ -1735,16 +1746,12 @@ MDBX_INTERNAL_FUNC int osal_rdt_lock(MDBX_env *env); /// \brief Unlocks LCK-file or/and table of readers after (de)registering. MDBX_INTERNAL_FUNC void osal_rdt_unlock(MDBX_env *env); -/// \brief Acquires lock for DB change (on writing transaction start) -/// Reading transactions will not be blocked. -/// Declared as LIBMDBX_API because it is used in mdbx_chk. +/// \brief Acquires write-transaction lock. 
/// \return Error code or zero on success -LIBMDBX_API int mdbx_txn_lock(MDBX_env *env, bool dont_wait); +MDBX_INTERNAL_FUNC int osal_txn_lock(MDBX_env *env, bool dont_wait); -/// \brief Releases lock once DB changes is made (after writing transaction -/// has finished). -/// Declared as LIBMDBX_API because it is used in mdbx_chk. -LIBMDBX_API void mdbx_txn_unlock(MDBX_env *env); +/// \brief Releases write-transaction lock.. +MDBX_INTERNAL_FUNC void osal_txn_unlock(MDBX_env *env); /// \brief Sets alive-flag of reader presence (indicative lock) for PID of /// the current process. The function does no more than needed for @@ -1773,7 +1780,7 @@ MDBX_INTERNAL_FUNC int osal_rpid_check(MDBX_env *env, uint32_t pid); MDBX_INTERNAL_FUNC int osal_mb2w(const char *const src, wchar_t **const pdst); typedef void(WINAPI *osal_srwlock_t_function)(osal_srwlock_t *); -MDBX_INTERNAL_VAR osal_srwlock_t_function osal_srwlock_Init, +MDBX_INTERNAL_VAR_PROTO osal_srwlock_t_function osal_srwlock_Init, osal_srwlock_AcquireShared, osal_srwlock_ReleaseShared, osal_srwlock_AcquireExclusive, osal_srwlock_ReleaseExclusive; @@ -1826,7 +1833,7 @@ typedef struct _FILE_REMOTE_PROTOCOL_INFO { typedef BOOL(WINAPI *MDBX_GetFileInformationByHandleEx)( _In_ HANDLE hFile, _In_ FILE_INFO_BY_HANDLE_CLASS FileInformationClass, _Out_ LPVOID lpFileInformation, _In_ DWORD dwBufferSize); -MDBX_INTERNAL_VAR MDBX_GetFileInformationByHandleEx +MDBX_INTERNAL_VAR_PROTO MDBX_GetFileInformationByHandleEx mdbx_GetFileInformationByHandleEx; typedef BOOL(WINAPI *MDBX_GetVolumeInformationByHandleW)( @@ -1835,19 +1842,20 @@ typedef BOOL(WINAPI *MDBX_GetVolumeInformationByHandleW)( _Out_opt_ LPDWORD lpMaximumComponentLength, _Out_opt_ LPDWORD lpFileSystemFlags, _Out_opt_ LPWSTR lpFileSystemNameBuffer, _In_ DWORD nFileSystemNameSize); -MDBX_INTERNAL_VAR MDBX_GetVolumeInformationByHandleW +MDBX_INTERNAL_VAR_PROTO MDBX_GetVolumeInformationByHandleW mdbx_GetVolumeInformationByHandleW; typedef DWORD(WINAPI 
*MDBX_GetFinalPathNameByHandleW)(_In_ HANDLE hFile, _Out_ LPWSTR lpszFilePath, _In_ DWORD cchFilePath, _In_ DWORD dwFlags); -MDBX_INTERNAL_VAR MDBX_GetFinalPathNameByHandleW mdbx_GetFinalPathNameByHandleW; +MDBX_INTERNAL_VAR_PROTO MDBX_GetFinalPathNameByHandleW + mdbx_GetFinalPathNameByHandleW; typedef BOOL(WINAPI *MDBX_SetFileInformationByHandle)( _In_ HANDLE hFile, _In_ FILE_INFO_BY_HANDLE_CLASS FileInformationClass, _Out_ LPVOID lpFileInformation, _In_ DWORD dwBufferSize); -MDBX_INTERNAL_VAR MDBX_SetFileInformationByHandle +MDBX_INTERNAL_VAR_PROTO MDBX_SetFileInformationByHandle mdbx_SetFileInformationByHandle; typedef NTSTATUS(NTAPI *MDBX_NtFsControlFile)( @@ -1856,10 +1864,10 @@ typedef NTSTATUS(NTAPI *MDBX_NtFsControlFile)( OUT PIO_STATUS_BLOCK IoStatusBlock, IN ULONG FsControlCode, IN OUT PVOID InputBuffer, IN ULONG InputBufferLength, OUT OPTIONAL PVOID OutputBuffer, IN ULONG OutputBufferLength); -MDBX_INTERNAL_VAR MDBX_NtFsControlFile mdbx_NtFsControlFile; +MDBX_INTERNAL_VAR_PROTO MDBX_NtFsControlFile mdbx_NtFsControlFile; typedef uint64_t(WINAPI *MDBX_GetTickCount64)(void); -MDBX_INTERNAL_VAR MDBX_GetTickCount64 mdbx_GetTickCount64; +MDBX_INTERNAL_VAR_PROTO MDBX_GetTickCount64 mdbx_GetTickCount64; #if !defined(_WIN32_WINNT_WIN8) || _WIN32_WINNT < _WIN32_WINNT_WIN8 typedef struct _WIN32_MEMORY_RANGE_ENTRY { @@ -1871,13 +1879,13 @@ typedef struct _WIN32_MEMORY_RANGE_ENTRY { typedef BOOL(WINAPI *MDBX_PrefetchVirtualMemory)( HANDLE hProcess, ULONG_PTR NumberOfEntries, PWIN32_MEMORY_RANGE_ENTRY VirtualAddresses, ULONG Flags); -MDBX_INTERNAL_VAR MDBX_PrefetchVirtualMemory mdbx_PrefetchVirtualMemory; +MDBX_INTERNAL_VAR_PROTO MDBX_PrefetchVirtualMemory mdbx_PrefetchVirtualMemory; typedef enum _SECTION_INHERIT { ViewShare = 1, ViewUnmap = 2 } SECTION_INHERIT; typedef NTSTATUS(NTAPI *MDBX_NtExtendSection)(IN HANDLE SectionHandle, IN PLARGE_INTEGER NewSectionSize); -MDBX_INTERNAL_VAR MDBX_NtExtendSection mdbx_NtExtendSection; +MDBX_INTERNAL_VAR_PROTO 
MDBX_NtExtendSection mdbx_NtExtendSection; static __inline bool mdbx_RunningUnderWine(void) { return !mdbx_NtExtendSection; @@ -1887,14 +1895,15 @@ typedef LSTATUS(WINAPI *MDBX_RegGetValueA)(HKEY hkey, LPCSTR lpSubKey, LPCSTR lpValue, DWORD dwFlags, LPDWORD pdwType, PVOID pvData, LPDWORD pcbData); -MDBX_INTERNAL_VAR MDBX_RegGetValueA mdbx_RegGetValueA; +MDBX_INTERNAL_VAR_PROTO MDBX_RegGetValueA mdbx_RegGetValueA; NTSYSAPI ULONG RtlRandomEx(PULONG Seed); typedef BOOL(WINAPI *MDBX_SetFileIoOverlappedRange)(HANDLE FileHandle, PUCHAR OverlappedRangeStart, ULONG Length); -MDBX_INTERNAL_VAR MDBX_SetFileIoOverlappedRange mdbx_SetFileIoOverlappedRange; +MDBX_INTERNAL_VAR_PROTO MDBX_SetFileIoOverlappedRange + mdbx_SetFileIoOverlappedRange; #endif /* Windows */ @@ -2124,7 +2133,7 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor; #error MDBX_DPL_PREALLOC_FOR_RADIXSORT must be defined as 0 or 1 #endif /* MDBX_DPL_PREALLOC_FOR_RADIXSORT */ -/** Controls dirty pages tracking, spilling and persisting in MDBX_WRITEMAP +/** Controls dirty pages tracking, spilling and persisting in `MDBX_WRITEMAP` * mode. 0/OFF = Don't track dirty pages at all, don't spill ones, and use * msync() to persist data. This is by-default on Linux and other systems where * kernel provides properly LRU tracking and effective flushing on-demand. 1/ON @@ -2141,6 +2150,22 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor; #error MDBX_AVOID_MSYNC must be defined as 0 or 1 #endif /* MDBX_AVOID_MSYNC */ +/** Управляет механизмом поддержки разреженных наборов DBI-хендлов для снижения + * накладных расходов при запуске и обработке транзакций. 
*/ +#ifndef MDBX_ENABLE_DBI_SPARSE +#define MDBX_ENABLE_DBI_SPARSE 1 +#elif !(MDBX_ENABLE_DBI_SPARSE == 0 || MDBX_ENABLE_DBI_SPARSE == 1) +#error MDBX_ENABLE_DBI_SPARSE must be defined as 0 or 1 +#endif /* MDBX_ENABLE_DBI_SPARSE */ + +/** Управляет механизмом отложенного освобождения и поддержки пути быстрого + * открытия DBI-хендлов без захвата блокировок. */ +#ifndef MDBX_ENABLE_DBI_LOCKFREE +#define MDBX_ENABLE_DBI_LOCKFREE 1 +#elif !(MDBX_ENABLE_DBI_LOCKFREE == 0 || MDBX_ENABLE_DBI_LOCKFREE == 1) +#error MDBX_ENABLE_DBI_LOCKFREE must be defined as 0 or 1 +#endif /* MDBX_ENABLE_DBI_LOCKFREE */ + /** Controls sort order of internal page number lists. * This mostly experimental/advanced option with not for regular MDBX users. * \warning The database format depend on this option and libmdbx built with @@ -2188,8 +2213,8 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor; /** If defined then enables integration with Valgrind, * a memory analyzing tool. */ -#ifndef MDBX_USE_VALGRIND -#endif /* MDBX_USE_VALGRIND */ +#ifndef ENABLE_MEMCHECK +#endif /* ENABLE_MEMCHECK */ /** If defined then enables use C11 atomics, * otherwise detects ones availability automatically. 
*/ @@ -2512,13 +2537,23 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor; #define MDBX_RUNTIME_FLAGS_INIT \ ((MDBX_DEBUG) > 0) * MDBX_DBG_ASSERT + ((MDBX_DEBUG) > 1) * MDBX_DBG_AUDIT -extern uint8_t runtime_flags; -extern uint8_t loglevel; -extern MDBX_debug_func *debug_logger; +union logger_union { + void *ptr; + MDBX_debug_func *fmt; + MDBX_debug_func_nofmt *nofmt; +}; + +MDBX_INTERNAL_VAR_PROTO struct mdbx_static { + uint8_t flags; + uint8_t loglevel; + union logger_union logger; + size_t logger_buffer_size; + char *logger_buffer; +} mdbx_static; MDBX_MAYBE_UNUSED static __inline void jitter4testing(bool tiny) { #if MDBX_DEBUG - if (MDBX_DBG_JITTER & runtime_flags) + if (MDBX_DBG_JITTER & mdbx_static.flags) osal_jitter(tiny); #else (void)tiny; @@ -2532,17 +2567,17 @@ MDBX_INTERNAL_FUNC void debug_log_va(int level, const char *function, int line, const char *fmt, va_list args); #if MDBX_DEBUG -#define LOG_ENABLED(msg) unlikely(msg <= loglevel) -#define AUDIT_ENABLED() unlikely((runtime_flags & MDBX_DBG_AUDIT)) +#define LOG_ENABLED(msg) unlikely(msg <= mdbx_static.loglevel) +#define AUDIT_ENABLED() unlikely((mdbx_static.flags & MDBX_DBG_AUDIT)) #else /* MDBX_DEBUG */ -#define LOG_ENABLED(msg) (msg < MDBX_LOG_VERBOSE && msg <= loglevel) +#define LOG_ENABLED(msg) (msg < MDBX_LOG_VERBOSE && msg <= mdbx_static.loglevel) #define AUDIT_ENABLED() (0) #endif /* MDBX_DEBUG */ #if MDBX_FORCE_ASSERTIONS #define ASSERT_ENABLED() (1) #elif MDBX_DEBUG -#define ASSERT_ENABLED() likely((runtime_flags & MDBX_DBG_ASSERT)) +#define ASSERT_ENABLED() likely((mdbx_static.flags & MDBX_DBG_ASSERT)) #else #define ASSERT_ENABLED() (0) #endif /* assertions */ @@ -2977,7 +3012,8 @@ typedef struct MDBX_page { #define PAGETYPE_WHOLE(p) ((uint8_t)(p)->mp_flags) -/* Drop legacy P_DIRTY flag for sub-pages for compatilibity */ +/* Drop legacy P_DIRTY flag for sub-pages for compatilibity, + * for assertions only. */ #define PAGETYPE_COMPAT(p) \ (unlikely(PAGETYPE_WHOLE(p) & P_SUBP) \ ? 
PAGETYPE_WHOLE(p) & ~(P_SUBP | P_LEGACY_DIRTY) \ @@ -3086,7 +3122,7 @@ typedef sem_t osal_ipclock_t; #endif /* MDBX_LOCKING */ #if MDBX_LOCKING > MDBX_LOCKING_SYSV && !defined(__cplusplus) -MDBX_INTERNAL_FUNC int osal_ipclock_stub(osal_ipclock_t *ipc); +MDBX_INTERNAL_FUNC int osal_ipclock_stubinit(osal_ipclock_t *ipc); MDBX_INTERNAL_FUNC int osal_ipclock_destroy(osal_ipclock_t *ipc); #endif /* MDBX_LOCKING */ @@ -3099,8 +3135,9 @@ MDBX_INTERNAL_FUNC int osal_ipclock_destroy(osal_ipclock_t *ipc); * read transactions started by the same thread need no further locking to * proceed. * - * If MDBX_NOTLS is set, the slot address is not saved in thread-specific data. - * No reader table is used if the database is on a read-only filesystem. + * If MDBX_NOSTICKYTHREADS is set, the slot address is not saved in + * thread-specific data. No reader table is used if the database is on a + * read-only filesystem. * * Since the database uses multi-version concurrency control, readers don't * actually need any locking. 
This table is used to keep track of which @@ -3410,10 +3447,10 @@ typedef struct troika { #if MDBX_WORDBITS > 32 /* Workaround for false-positives from Valgrind */ uint32_t unused_pad; #endif -#define TROIKA_HAVE_STEADY(troika) ((troika)->fsm & 7) -#define TROIKA_STRICT_VALID(troika) ((troika)->tail_and_flags & 64) -#define TROIKA_VALID(troika) ((troika)->tail_and_flags & 128) -#define TROIKA_TAIL(troika) ((troika)->tail_and_flags & 3) +#define TROIKA_HAVE_STEADY(troika) ((troika)->fsm & 7u) +#define TROIKA_STRICT_VALID(troika) ((troika)->tail_and_flags & 64u) +#define TROIKA_VALID(troika) ((troika)->tail_and_flags & 128u) +#define TROIKA_TAIL(troika) ((troika)->tail_and_flags & 3u) txnid_t txnid[NUM_METAS]; } meta_troika_t; @@ -3443,6 +3480,8 @@ struct MDBX_txn { #error "Oops, some txn flags overlapped or wrong" #endif uint32_t mt_flags; + unsigned mt_numdbs; + size_t mt_owner; /* thread ID that owns this transaction */ MDBX_txn *mt_parent; /* parent of a nested txn */ /* Nested txn under this txn, set together with flag MDBX_TXN_HAS_CHILD */ @@ -3460,31 +3499,30 @@ struct MDBX_txn { txnid_t mt_front; MDBX_env *mt_env; /* the DB environment */ - /* Array of records for each DB known in the environment. 
*/ - MDBX_dbx *mt_dbxs; /* Array of MDBX_db records for each known DB */ MDBX_db *mt_dbs; - /* Array of sequence numbers for each DB handle */ - MDBX_atomic_uint32_t *mt_dbiseqs; - - /* Transaction DBI Flags */ -#define DBI_DIRTY MDBX_DBI_DIRTY /* DB was written in this txn */ -#define DBI_STALE MDBX_DBI_STALE /* Named-DB record is older than txnID */ -#define DBI_FRESH MDBX_DBI_FRESH /* Named-DB handle opened in this txn */ -#define DBI_CREAT MDBX_DBI_CREAT /* Named-DB handle created in this txn */ -#define DBI_VALID 0x10 /* DB handle is valid, see also DB_VALID */ -#define DBI_USRVALID 0x20 /* As DB_VALID, but not set for FREE_DBI */ -#define DBI_AUDITED 0x40 /* Internal flag for accounting during audit */ - /* Array of flags for each DB */ - uint8_t *mt_dbistate; - /* Number of DB records in use, or 0 when the txn is finished. - * This number only ever increments until the txn finishes; we - * don't decrement it when individual DB handles are closed. */ - MDBX_dbi mt_numdbs; - size_t mt_owner; /* thread ID that owns this transaction */ + +#if MDBX_ENABLE_DBI_SPARSE + unsigned *__restrict mt_dbi_sparse; +#endif /* MDBX_ENABLE_DBI_SPARSE */ + + /* Non-shared DBI state flags inside transaction */ +#define DBI_DIRTY 0x01 /* DB was written in this txn */ +#define DBI_STALE 0x02 /* Named-DB record is older than txnID */ +#define DBI_FRESH 0x04 /* Named-DB handle opened in this txn */ +#define DBI_CREAT 0x08 /* Named-DB handle created in this txn */ +#define DBI_VALID 0x10 /* Handle is valid, see also DB_VALID */ +#define DBI_OLDEN 0x40 /* Handle was closed/reopened outside txn */ +#define DBI_LINDO 0x80 /* Lazy initialization done for DBI-slot */ + /* Array of non-shared txn's flags of DBI */ + uint8_t *__restrict mt_dbi_state; + + /* Array of sequence numbers for each DB handle. 
*/ + uint32_t *__restrict mt_dbi_seqs; + MDBX_cursor **mt_cursors; + MDBX_canary mt_canary; void *mt_userctx; /* User-settable context */ - MDBX_cursor **mt_cursors; union { struct { @@ -3494,8 +3532,8 @@ struct MDBX_txn { struct { meta_troika_t troika; /* In write txns, array of cursors for each DB */ - MDBX_PNL relist; /* Reclaimed GC pages */ - txnid_t last_reclaimed; /* ID of last used record */ + MDBX_PNL __restrict relist; /* Reclaimed GC pages */ + txnid_t last_reclaimed; /* ID of last used record */ #if MDBX_ENABLE_REFUND pgno_t loose_refund_wl /* FIXME: describe */; #endif /* MDBX_ENABLE_REFUND */ @@ -3507,14 +3545,14 @@ struct MDBX_txn { * dirtylist into mt_parent after freeing hidden mt_parent pages. */ size_t dirtyroom; /* For write txns: Modified pages. Sorted when not MDBX_WRITEMAP. */ - MDBX_dpl *dirtylist; + MDBX_dpl *__restrict dirtylist; /* The list of reclaimed txns from GC */ - MDBX_TXL lifo_reclaimed; + MDBX_TXL __restrict lifo_reclaimed; /* The list of pages that became unused during this transaction. */ - MDBX_PNL retired_pages; + MDBX_PNL __restrict retired_pages; /* The list of loose pages that became unused and may be reused * in this transaction, linked through `mp_next`. */ - MDBX_page *loose_pages; + MDBX_page *__restrict loose_pages; /* Number of loose pages (tw.loose_pages) */ size_t loose_count; union { @@ -3523,11 +3561,12 @@ struct MDBX_txn { /* The sorted list of dirty pages we temporarily wrote to disk * because the dirty list was full. page numbers in here are * shifted left by 1, deleted slots have the LSB set. 
*/ - MDBX_PNL list; + MDBX_PNL __restrict list; } spilled; size_t writemap_dirty_npages; size_t writemap_spilled_npages; }; + uint64_t gc_time_acc; } tw; }; }; @@ -3566,8 +3605,8 @@ struct MDBX_cursor { MDBX_db *mc_db; /* The database auxiliary record for this cursor */ MDBX_dbx *mc_dbx; - /* The mt_dbistate for this database */ - uint8_t *mc_dbistate; + /* The mt_dbi_state[] for this DBI */ + uint8_t *__restrict mc_dbi_state; uint8_t mc_snum; /* number of pushed pages */ uint8_t mc_top; /* index of top page, normally mc_snum-1 */ @@ -3620,6 +3659,11 @@ typedef struct MDBX_cursor_couple { MDBX_xcursor inner; } MDBX_cursor_couple; +struct mdbx_defer_free_item { + struct mdbx_defer_free_item *next; + uint64_t timestamp; +}; + /* The database environment. */ struct MDBX_env { /* ----------------------------------------------------- mostly static part */ @@ -3637,6 +3681,7 @@ struct MDBX_env { #define MDBX_DEPRECATED_COALESCE UINT32_C(0x2000000) #define ENV_INTERNAL_FLAGS (MDBX_FATAL_ERROR | MDBX_ENV_ACTIVE | MDBX_ENV_TXKEY) uint32_t me_flags; + unsigned me_psize; /* DB page size, initialized from me_os_psize */ osal_mmap_t me_dxb_mmap; /* The main data file */ #define me_map me_dxb_mmap.base #define me_lazy_fd me_dxb_mmap.fd @@ -3649,7 +3694,6 @@ struct MDBX_env { #define me_lfd me_lck_mmap.fd struct MDBX_lockinfo *me_lck; - unsigned me_psize; /* DB page size, initialized from me_os_psize */ uint16_t me_leaf_nodemax; /* max size of a leaf-node */ uint16_t me_branch_nodemax; /* max size of a branch-node */ uint16_t me_subpage_limit; @@ -3667,13 +3711,15 @@ struct MDBX_env { MDBX_dbi me_maxdbs; /* size of the DB table */ uint32_t me_pid; /* process ID of this env */ osal_thread_key_t me_txkey; /* thread-key for readers */ - pathchar_t *me_pathname; /* path to the DB files */ - void *me_pbuf; /* scratch area for DUPSORT put() */ - MDBX_txn *me_txn0; /* preallocated write transaction */ - - MDBX_dbx *me_dbxs; /* array of static DB info */ - uint16_t *me_dbflags; /* array 
of flags from MDBX_db.md_flags */ - MDBX_atomic_uint32_t *me_dbiseqs; /* array of dbi sequence numbers */ + struct { /* path to the DB files */ + pathchar_t *lck, *dxb, *specified; + void *buffer; + } me_pathname; + void *me_pbuf; /* scratch area for DUPSORT put() */ + MDBX_txn *me_txn0; /* preallocated write transaction */ + MDBX_dbx *me_dbxs; /* array of static DB info */ + uint16_t *__restrict me_db_flags; /* array of flags from MDBX_db.md_flags */ + MDBX_atomic_uint32_t *me_dbi_seqs; /* array of dbi sequence numbers */ unsigned me_maxgc_ov1page; /* Number of pgno_t fit in a single overflow page */ unsigned me_maxgc_per_branch; @@ -3687,6 +3733,7 @@ struct MDBX_env { unsigned rp_augment_limit; unsigned dp_limit; unsigned dp_initial; + uint64_t gc_time_limit; uint8_t dp_loose_limit; uint8_t spill_max_denominator; uint8_t spill_min_denominator; @@ -3696,6 +3743,8 @@ struct MDBX_env { unsigned writethrough_threshold; #endif /* Windows */ bool prefault_write; + bool prefer_waf_insteadof_balance; /* Strive to minimize WAF instead of + balancing pages fullment */ union { unsigned all; /* tracks options with non-auto values but tuned by user */ @@ -3725,20 +3774,23 @@ struct MDBX_env { } me_sysv_ipc; #endif /* MDBX_LOCKING == MDBX_LOCKING_SYSV */ bool me_incore; + bool me_prefault_write; - MDBX_env *me_lcklist_next; +#if MDBX_ENABLE_DBI_LOCKFREE + struct mdbx_defer_free_item *me_defer_free; +#endif /* MDBX_ENABLE_DBI_LOCKFREE */ /* --------------------------------------------------- mostly volatile part */ MDBX_txn *me_txn; /* current write transaction */ osal_fastmutex_t me_dbi_lock; - MDBX_dbi me_numdbs; /* number of DBs opened */ - bool me_prefault_write; + unsigned me_numdbs; /* number of DBs opened */ - MDBX_page *me_dp_reserve; /* list of malloc'ed blocks for re-use */ unsigned me_dp_reserve_len; + MDBX_page *__restrict me_dp_reserve; /* list of malloc'ed blocks for re-use */ + /* PNL of pages that became unused in a write txn */ - MDBX_PNL me_retired_pages; + 
MDBX_PNL __restrict me_retired_pages; osal_ioring_t me_ioring; #if defined(_WIN32) || defined(_WIN64) @@ -3756,13 +3808,12 @@ struct MDBX_env { #if MDBX_DEBUG MDBX_assert_func *me_assert_func; /* Callback for assertion failures */ #endif -#ifdef MDBX_USE_VALGRIND +#ifdef ENABLE_MEMCHECK int me_valgrind_handle; #endif -#if defined(MDBX_USE_VALGRIND) || defined(__SANITIZE_ADDRESS__) - MDBX_atomic_uint32_t me_ignore_EDEADLK; +#if defined(ENABLE_MEMCHECK) || defined(__SANITIZE_ADDRESS__) pgno_t me_poison_edge; -#endif /* MDBX_USE_VALGRIND || __SANITIZE_ADDRESS__ */ +#endif /* ENABLE_MEMCHECK || __SANITIZE_ADDRESS__ */ #ifndef xMDBX_DEBUG_SPILLING #define xMDBX_DEBUG_SPILLING 0 @@ -3822,10 +3873,6 @@ osal_flush_incoherent_mmap(const void *addr, size_t nbytes, MDBX_INTERNAL_FUNC int cleanup_dead_readers(MDBX_env *env, int rlocked, int *dead); -MDBX_INTERNAL_FUNC int rthc_alloc(osal_thread_key_t *key, MDBX_reader *begin, - MDBX_reader *end); -MDBX_INTERNAL_FUNC void rthc_remove(const osal_thread_key_t key); - MDBX_INTERNAL_FUNC void global_ctor(void); MDBX_INTERNAL_FUNC void osal_ctor(void); MDBX_INTERNAL_FUNC void global_dtor(void); @@ -3940,7 +3987,8 @@ typedef struct MDBX_node { /* mdbx_dbi_open() flags */ #define DB_USABLE_FLAGS (DB_PERSISTENT_FLAGS | MDBX_CREATE | MDBX_DB_ACCEDE) -#define DB_VALID 0x8000 /* DB handle is valid, for me_dbflags */ +#define DB_VALID 0x8000u /* DB handle is valid, for me_db_flags */ +#define DB_POISON 0x7fffu /* update pending */ #define DB_INTERNAL_FLAGS DB_VALID #if DB_INTERNAL_FLAGS & DB_USABLE_FLAGS @@ -4029,11 +4077,11 @@ log2n_powerof2(size_t value_uintptr) { * environment and re-opening it with the new flags. 
*/ #define ENV_CHANGEABLE_FLAGS \ (MDBX_SAFE_NOSYNC | MDBX_NOMETASYNC | MDBX_DEPRECATED_MAPASYNC | \ - MDBX_NOMEMINIT | MDBX_COALESCE | MDBX_PAGEPERTURB | MDBX_ACCEDE | \ - MDBX_VALIDATION) + MDBX_NOMEMINIT | MDBX_DEPRECATED_COALESCE | MDBX_PAGEPERTURB | \ + MDBX_ACCEDE | MDBX_VALIDATION) #define ENV_CHANGELESS_FLAGS \ - (MDBX_NOSUBDIR | MDBX_RDONLY | MDBX_WRITEMAP | MDBX_NOTLS | MDBX_NORDAHEAD | \ - MDBX_LIFORECLAIM | MDBX_EXCLUSIVE) + (MDBX_NOSUBDIR | MDBX_RDONLY | MDBX_WRITEMAP | MDBX_NOSTICKYTHREADS | \ + MDBX_NORDAHEAD | MDBX_LIFORECLAIM | MDBX_EXCLUSIVE) #define ENV_USABLE_FLAGS (ENV_CHANGEABLE_FLAGS | ENV_CHANGELESS_FLAGS) #if !defined(__cplusplus) || CONSTEXPR_ENUM_FLAGS_OPERATIONS @@ -4067,6 +4115,36 @@ MDBX_MAYBE_UNUSED static void static_checks(void) { ASAN_UNPOISON_MEMORY_REGION(addr, size); \ } while (0) +/******************************************************************************/ + +/** \brief Page types for traverse the b-tree. + * \see mdbx_env_pgwalk() \see MDBX_pgvisitor_func */ +enum MDBX_page_type_t { + MDBX_page_broken, + MDBX_page_large, + MDBX_page_branch, + MDBX_page_leaf, + MDBX_page_dupfixed_leaf, + MDBX_subpage_leaf, + MDBX_subpage_dupfixed_leaf, + MDBX_subpage_broken, +}; +typedef enum MDBX_page_type_t MDBX_page_type_t; + +typedef struct MDBX_walk_sdb { + MDBX_val name; + struct MDBX_db *internal, *nested; +} MDBX_walk_sdb_t; + +/** \brief Callback function for traverse the b-tree. 
\see mdbx_env_pgwalk() */ +typedef int +MDBX_pgvisitor_func(const size_t pgno, const unsigned number, void *const ctx, + const int deep, const MDBX_walk_sdb_t *subdb, + const size_t page_size, const MDBX_page_type_t page_type, + const MDBX_error_t err, const size_t nentries, + const size_t payload_bytes, const size_t header_bytes, + const size_t unused_bytes); + #include #if defined(_WIN32) || defined(_WIN64) @@ -4635,7 +4713,7 @@ static int equal_or_greater(const MDBX_val *a, const MDBX_val *b) { } int main(int argc, char *argv[]) { - int i, rc; + int i, err; MDBX_env *env = nullptr; MDBX_txn *txn = nullptr; MDBX_cursor *mc = nullptr; @@ -4738,40 +4816,45 @@ int main(int argc, char *argv[]) { dbuf.iov_len = 4096; dbuf.iov_base = osal_malloc(dbuf.iov_len); if (!dbuf.iov_base) { - rc = MDBX_ENOMEM; - error("value-buffer", rc); - goto env_close; + err = MDBX_ENOMEM; + error("value-buffer", err); + goto bailout; } /* read first header for mapsize= */ if (!(mode & NOHDR)) { - rc = readhdr(); - if (unlikely(rc != MDBX_SUCCESS)) { - if (rc == EOF) - rc = MDBX_ENODATA; - error("readheader", rc); - goto env_close; + err = readhdr(); + if (unlikely(err != MDBX_SUCCESS)) { + if (err == EOF) + err = MDBX_ENODATA; + error("readheader", err); + goto bailout; } } - rc = mdbx_env_create(&env); - if (unlikely(rc != MDBX_SUCCESS)) { - error("mdbx_env_create", rc); - return EXIT_FAILURE; + err = mdbx_env_create(&env); + if (unlikely(err != MDBX_SUCCESS)) { + error("mdbx_env_create", err); + goto bailout; + } + + err = mdbx_env_set_maxdbs(env, 2); + if (unlikely(err != MDBX_SUCCESS)) { + error("mdbx_env_set_maxdbs", err); + goto bailout; } - mdbx_env_set_maxdbs(env, 2); if (envinfo.mi_maxreaders) { - rc = mdbx_env_set_maxreaders(env, envinfo.mi_maxreaders); - if (unlikely(rc != MDBX_SUCCESS)) { - error("mdbx_env_set_maxreaders", rc); - goto env_close; + err = mdbx_env_set_maxreaders(env, envinfo.mi_maxreaders); + if (unlikely(err != MDBX_SUCCESS)) { + error("mdbx_env_set_maxreaders", 
err); + goto bailout; } } if (envinfo.mi_geo.current | envinfo.mi_mapsize) { if (envinfo.mi_geo.current) { - rc = mdbx_env_set_geometry( + err = mdbx_env_set_geometry( env, (intptr_t)envinfo.mi_geo.lower, (intptr_t)envinfo.mi_geo.current, (intptr_t)envinfo.mi_geo.upper, (intptr_t)envinfo.mi_geo.shrink, (intptr_t)envinfo.mi_geo.grow, @@ -4784,23 +4867,23 @@ int main(int argc, char *argv[]) { "Database size is too large for current system (mapsize=%" PRIu64 " is great than system-limit %zu)\n", envinfo.mi_mapsize, (size_t)MAX_MAPSIZE); - goto env_close; + goto bailout; } - rc = mdbx_env_set_geometry( + err = mdbx_env_set_geometry( env, (intptr_t)envinfo.mi_mapsize, (intptr_t)envinfo.mi_mapsize, (intptr_t)envinfo.mi_mapsize, 0, 0, envinfo.mi_dxb_pagesize ? (intptr_t)envinfo.mi_dxb_pagesize : -1); } - if (unlikely(rc != MDBX_SUCCESS)) { - error("mdbx_env_set_geometry", rc); - goto env_close; + if (unlikely(err != MDBX_SUCCESS)) { + error("mdbx_env_set_geometry", err); + goto bailout; } } - rc = mdbx_env_open(env, envname, envflags, 0664); - if (unlikely(rc != MDBX_SUCCESS)) { - error("mdbx_env_open", rc); - goto env_close; + err = mdbx_env_open(env, envname, envflags, 0664); + if (unlikely(err != MDBX_SUCCESS)) { + error("mdbx_env_open", err); + goto bailout; } kbuf.iov_len = mdbx_env_get_maxvalsize_ex(env, 0) + (size_t)1; @@ -4808,54 +4891,54 @@ int main(int argc, char *argv[]) { if (!quiet) fprintf(stderr, "mdbx_env_get_maxkeysize() failed, returns %zu\n", kbuf.iov_len); - goto env_close; + goto bailout; } kbuf.iov_base = malloc(kbuf.iov_len); if (!kbuf.iov_base) { - rc = MDBX_ENOMEM; - error("key-buffer", rc); - goto env_close; + err = MDBX_ENOMEM; + error("key-buffer", err); + goto bailout; } - while (rc == MDBX_SUCCESS) { + while (err == MDBX_SUCCESS) { if (user_break) { - rc = MDBX_EINTR; + err = MDBX_EINTR; break; } - rc = mdbx_txn_begin(env, nullptr, 0, &txn); - if (unlikely(rc != MDBX_SUCCESS)) { - error("mdbx_txn_begin", rc); - goto env_close; + err = 
mdbx_txn_begin(env, nullptr, 0, &txn); + if (unlikely(err != MDBX_SUCCESS)) { + error("mdbx_txn_begin", err); + goto bailout; } if (mode & GLOBAL) { mode -= GLOBAL; if (canary.v | canary.x | canary.y | canary.z) { - rc = mdbx_canary_put(txn, &canary); - if (unlikely(rc != MDBX_SUCCESS)) { - error("mdbx_canary_put", rc); - goto txn_abort; + err = mdbx_canary_put(txn, &canary); + if (unlikely(err != MDBX_SUCCESS)) { + error("mdbx_canary_put", err); + goto bailout; } } } const char *const dbi_name = subname ? subname : "@MAIN"; - rc = + err = mdbx_dbi_open_ex(txn, subname, dbi_flags | MDBX_CREATE, &dbi, (putflags & MDBX_APPEND) ? equal_or_greater : nullptr, (putflags & MDBX_APPEND) ? equal_or_greater : nullptr); - if (unlikely(rc != MDBX_SUCCESS)) { - error("mdbx_dbi_open_ex", rc); - goto txn_abort; + if (unlikely(err != MDBX_SUCCESS)) { + error("mdbx_dbi_open_ex", err); + goto bailout; } uint64_t present_sequence; - rc = mdbx_dbi_sequence(txn, dbi, &present_sequence, 0); - if (unlikely(rc != MDBX_SUCCESS)) { - error("mdbx_dbi_sequence", rc); - goto txn_abort; + err = mdbx_dbi_sequence(txn, dbi, &present_sequence, 0); + if (unlikely(err != MDBX_SUCCESS)) { + error("mdbx_dbi_sequence", err); + goto bailout; } if (present_sequence > sequence) { if (!quiet) @@ -4863,22 +4946,22 @@ int main(int argc, char *argv[]) { "present sequence for '%s' value (%" PRIu64 ") is greater than loaded (%" PRIu64 ")\n", dbi_name, present_sequence, sequence); - rc = MDBX_RESULT_TRUE; - goto txn_abort; + err = MDBX_RESULT_TRUE; + goto bailout; } if (present_sequence < sequence) { - rc = mdbx_dbi_sequence(txn, dbi, nullptr, sequence - present_sequence); - if (unlikely(rc != MDBX_SUCCESS)) { - error("mdbx_dbi_sequence", rc); - goto txn_abort; + err = mdbx_dbi_sequence(txn, dbi, nullptr, sequence - present_sequence); + if (unlikely(err != MDBX_SUCCESS)) { + error("mdbx_dbi_sequence", err); + goto bailout; } } if (purge) { - rc = mdbx_drop(txn, dbi, false); - if (unlikely(rc != MDBX_SUCCESS)) { 
- error("mdbx_drop", rc); - goto txn_abort; + err = mdbx_drop(txn, dbi, false); + if (unlikely(err != MDBX_SUCCESS)) { + error("mdbx_drop", err); + goto bailout; } } @@ -4886,85 +4969,85 @@ int main(int argc, char *argv[]) { putflags = (dbi_flags & MDBX_DUPSORT) ? putflags | MDBX_APPENDDUP : putflags & ~MDBX_APPENDDUP; - rc = mdbx_cursor_open(txn, dbi, &mc); - if (unlikely(rc != MDBX_SUCCESS)) { - error("mdbx_cursor_open", rc); - goto txn_abort; + err = mdbx_cursor_open(txn, dbi, &mc); + if (unlikely(err != MDBX_SUCCESS)) { + error("mdbx_cursor_open", err); + goto bailout; } int batch = 0; - while (rc == MDBX_SUCCESS) { + while (err == MDBX_SUCCESS) { MDBX_val key, data; - rc = readline(&key, &kbuf); - if (rc == EOF) + err = readline(&key, &kbuf); + if (err == EOF) break; - if (rc == MDBX_SUCCESS) - rc = readline(&data, &dbuf); - if (rc) { + if (err == MDBX_SUCCESS) + err = readline(&data, &dbuf); + if (err) { if (!quiet) fprintf(stderr, "%s: line %" PRIiSIZE ": failed to read key value\n", prog, lineno); - goto txn_abort; + goto bailout; } - rc = mdbx_cursor_put(mc, &key, &data, putflags); - if (rc == MDBX_KEYEXIST && putflags) + err = mdbx_cursor_put(mc, &key, &data, putflags); + if (err == MDBX_KEYEXIST && putflags) continue; - if (rc == MDBX_BAD_VALSIZE && rescue) { + if (err == MDBX_BAD_VALSIZE && rescue) { if (!quiet) fprintf(stderr, "%s: skip line %" PRIiSIZE ": due %s\n", prog, lineno, - mdbx_strerror(rc)); + mdbx_strerror(err)); continue; } - if (unlikely(rc != MDBX_SUCCESS)) { - error("mdbx_cursor_put", rc); - goto txn_abort; + if (unlikely(err != MDBX_SUCCESS)) { + error("mdbx_cursor_put", err); + goto bailout; } batch++; MDBX_txn_info txn_info; - rc = mdbx_txn_info(txn, &txn_info, false); - if (unlikely(rc != MDBX_SUCCESS)) { - error("mdbx_txn_info", rc); - goto txn_abort; + err = mdbx_txn_info(txn, &txn_info, false); + if (unlikely(err != MDBX_SUCCESS)) { + error("mdbx_txn_info", err); + goto bailout; } if (batch == 10000 || txn_info.txn_space_dirty > 
MEGABYTE * 256) { - rc = mdbx_txn_commit(txn); - if (unlikely(rc != MDBX_SUCCESS)) { - error("mdbx_txn_commit", rc); - goto env_close; + err = mdbx_txn_commit(txn); + if (unlikely(err != MDBX_SUCCESS)) { + error("mdbx_txn_commit", err); + goto bailout; } batch = 0; - rc = mdbx_txn_begin(env, nullptr, 0, &txn); - if (unlikely(rc != MDBX_SUCCESS)) { - error("mdbx_txn_begin", rc); - goto env_close; + err = mdbx_txn_begin(env, nullptr, 0, &txn); + if (unlikely(err != MDBX_SUCCESS)) { + error("mdbx_txn_begin", err); + goto bailout; } - rc = mdbx_cursor_bind(txn, mc, dbi); - if (unlikely(rc != MDBX_SUCCESS)) { - error("mdbx_cursor_bind", rc); - goto txn_abort; + err = mdbx_cursor_bind(txn, mc, dbi); + if (unlikely(err != MDBX_SUCCESS)) { + error("mdbx_cursor_bind", err); + goto bailout; } } } mdbx_cursor_close(mc); mc = nullptr; - rc = mdbx_txn_commit(txn); + err = mdbx_txn_commit(txn); txn = nullptr; - if (unlikely(rc != MDBX_SUCCESS)) { - error("mdbx_txn_commit", rc); - goto env_close; + if (unlikely(err != MDBX_SUCCESS)) { + error("mdbx_txn_commit", err); + goto bailout; } if (subname) { assert(dbi != MAIN_DBI); - rc = mdbx_dbi_close(env, dbi); - if (unlikely(rc != MDBX_SUCCESS)) { - error("mdbx_dbi_close", rc); - goto env_close; + err = mdbx_dbi_close(env, dbi); + if (unlikely(err != MDBX_SUCCESS)) { + error("mdbx_dbi_close", err); + goto bailout; } } else { assert(dbi == MAIN_DBI); @@ -4972,14 +5055,14 @@ int main(int argc, char *argv[]) { /* try read next header */ if (!(mode & NOHDR)) - rc = readhdr(); + err = readhdr(); else if (ferror(stdin) || feof(stdin)) break; } - switch (rc) { + switch (err) { case EOF: - rc = MDBX_SUCCESS; + err = MDBX_SUCCESS; case MDBX_SUCCESS: break; case MDBX_EINTR: @@ -4987,17 +5070,19 @@ int main(int argc, char *argv[]) { fprintf(stderr, "Interrupted by signal/user\n"); break; default: - if (unlikely(rc != MDBX_SUCCESS)) - error("readline", rc); + if (unlikely(err != MDBX_SUCCESS)) + error("readline", err); } -txn_abort: - 
mdbx_cursor_close(mc); - mdbx_txn_abort(txn); -env_close: - mdbx_env_close(env); +bailout: + if (mc) + mdbx_cursor_close(mc); + if (txn) + mdbx_txn_abort(txn); + if (env) + mdbx_env_close(env); free(kbuf.iov_base); free(dbuf.iov_base); - return rc ? EXIT_FAILURE : EXIT_SUCCESS; + return err ? EXIT_FAILURE : EXIT_SUCCESS; } diff --git a/mdbxdist/mdbx_stat.c b/mdbxdist/mdbx_stat.c index f844322..8fe3f55 100644 --- a/mdbxdist/mdbx_stat.c +++ b/mdbxdist/mdbx_stat.c @@ -1,7 +1,7 @@ /* mdbx_stat.c - memory-mapped database status tool */ /* - * Copyright 2015-2023 Leonid Yuriev + * Copyright 2015-2024 Leonid Yuriev * and other libmdbx authors: please see AUTHORS file. * All rights reserved. * @@ -22,7 +22,7 @@ #define xMDBX_TOOLS /* Avoid using internal eASSERT() */ /* - * Copyright 2015-2023 Leonid Yuriev + * Copyright 2015-2024 Leonid Yuriev * and other libmdbx authors: please see AUTHORS file. * All rights reserved. * @@ -34,7 +34,7 @@ * top-level directory of the distribution or, alternatively, at * . 
*/ -#define MDBX_BUILD_SOURCERY 0c86daff919dc09ab5e99c8e5bfb1d51e1367babca9c630c97af92a2aa988ea9_v0_12_9_16_gfff3fbd8 +#define MDBX_BUILD_SOURCERY 91ff5b5423830ee44fca4b70dcb298f233338a17a3185c44df67ec16d3034af3_v0_13_0_38_gf1975363 #ifdef MDBX_CONFIG_H #include MDBX_CONFIG_H #endif @@ -47,11 +47,13 @@ #ifdef xMDBX_ALLOY /* Amalgamated build */ #define MDBX_INTERNAL_FUNC static -#define MDBX_INTERNAL_VAR static +#define MDBX_INTERNAL_VAR_PROTO static +#define MDBX_INTERNAL_VAR_INSTA static #else /* Non-amalgamated build */ #define MDBX_INTERNAL_FUNC -#define MDBX_INTERNAL_VAR extern +#define MDBX_INTERNAL_VAR_PROTO extern +#define MDBX_INTERNAL_VAR_INSTA #endif /* xMDBX_ALLOY */ /*----------------------------------------------------------------------------*/ @@ -116,6 +118,10 @@ disable : 5105) /* winbase.h(9531): warning C5105: macro expansion \ producing 'defined' has undefined behavior */ #endif +#if _MSC_VER < 1920 +/* avoid "error C2219: syntax error: type qualifier must be after '*'" */ +#define __restrict +#endif #if _MSC_VER > 1930 #pragma warning(disable : 6235) /* is always a constant */ #pragma warning(disable : 6237) /* is never evaluated and might \ @@ -161,7 +167,7 @@ #include "mdbx.h" /* - * Copyright 2015-2023 Leonid Yuriev + * Copyright 2015-2024 Leonid Yuriev * and other libmdbx authors: please see AUTHORS file. * All rights reserved. 
* @@ -209,6 +215,7 @@ #include #include +#include #include #include #include @@ -846,7 +853,7 @@ __extern_C key_t ftok(const char *, int); /*----------------------------------------------------------------------------*/ -#if defined(MDBX_USE_VALGRIND) +#if defined(ENABLE_MEMCHECK) #include #ifndef VALGRIND_DISABLE_ADDR_ERROR_REPORTING_IN_RANGE /* LY: available since Valgrind 3.10 */ @@ -868,7 +875,7 @@ __extern_C key_t ftok(const char *, int); #define VALGRIND_CHECK_MEM_IS_ADDRESSABLE(a, s) (0) #define VALGRIND_CHECK_MEM_IS_DEFINED(a, s) (0) #define RUNNING_ON_VALGRIND (0) -#endif /* MDBX_USE_VALGRIND */ +#endif /* ENABLE_MEMCHECK */ #ifdef __SANITIZE_ADDRESS__ #include @@ -1015,7 +1022,7 @@ extern "C" { /* https://en.wikipedia.org/wiki/Operating_system_abstraction_layer */ /* - * Copyright 2015-2023 Leonid Yuriev + * Copyright 2015-2024 Leonid Yuriev * and other libmdbx authors: please see AUTHORS file. * All rights reserved. * @@ -1219,8 +1226,8 @@ typedef pthread_mutex_t osal_fastmutex_t; /*----------------------------------------------------------------------------*/ /* OS abstraction layer stuff */ -MDBX_INTERNAL_VAR unsigned sys_pagesize; -MDBX_MAYBE_UNUSED MDBX_INTERNAL_VAR unsigned sys_pagesize_ln2, +MDBX_INTERNAL_VAR_PROTO unsigned sys_pagesize; +MDBX_MAYBE_UNUSED MDBX_INTERNAL_VAR_PROTO unsigned sys_pagesize_ln2, sys_allocation_granularity; /* Get the size of a memory page for the system. 
@@ -1484,8 +1491,9 @@ MDBX_MAYBE_UNUSED static __inline void jitter4testing(bool tiny); #endif #if defined(__linux__) || defined(__gnu_linux__) -MDBX_INTERNAL_VAR uint32_t linux_kernel_version; -MDBX_INTERNAL_VAR bool mdbx_RunningOnWSL1 /* Windows Subsystem 1 for Linux */; +MDBX_INTERNAL_VAR_PROTO uint32_t linux_kernel_version; +MDBX_INTERNAL_VAR_PROTO bool + mdbx_RunningOnWSL1 /* Windows Subsystem 1 for Linux */; #endif /* Linux */ #ifndef osal_strdup @@ -1699,7 +1707,8 @@ MDBX_INTERNAL_FUNC int osal_lck_init(MDBX_env *env, /// restore POSIX-fcntl locks after the closing of file descriptors. /// \return Error code (MDBX_PANIC) or zero on success. MDBX_INTERNAL_FUNC int osal_lck_destroy(MDBX_env *env, - MDBX_env *inprocess_neighbor); + MDBX_env *inprocess_neighbor, + const uint32_t current_pid); /// \brief Connects to shared interprocess locking objects and tries to acquire /// the maximum lock level (shared if exclusive is not available) @@ -1727,6 +1736,8 @@ MDBX_INTERNAL_FUNC int osal_lck_seize(MDBX_env *env); /// operational lock. /// \return Error code or zero on success MDBX_INTERNAL_FUNC int osal_lck_downgrade(MDBX_env *env); +MDBX_MAYBE_UNUSED MDBX_INTERNAL_FUNC int osal_lck_upgrade(MDBX_env *env, + bool dont_wait); /// \brief Locks LCK-file or/and table of readers for (de)registering. /// \return Error code or zero on success @@ -1735,16 +1746,12 @@ MDBX_INTERNAL_FUNC int osal_rdt_lock(MDBX_env *env); /// \brief Unlocks LCK-file or/and table of readers after (de)registering. MDBX_INTERNAL_FUNC void osal_rdt_unlock(MDBX_env *env); -/// \brief Acquires lock for DB change (on writing transaction start) -/// Reading transactions will not be blocked. -/// Declared as LIBMDBX_API because it is used in mdbx_chk. +/// \brief Acquires write-transaction lock. 
/// \return Error code or zero on success -LIBMDBX_API int mdbx_txn_lock(MDBX_env *env, bool dont_wait); +MDBX_INTERNAL_FUNC int osal_txn_lock(MDBX_env *env, bool dont_wait); -/// \brief Releases lock once DB changes is made (after writing transaction -/// has finished). -/// Declared as LIBMDBX_API because it is used in mdbx_chk. -LIBMDBX_API void mdbx_txn_unlock(MDBX_env *env); +/// \brief Releases write-transaction lock.. +MDBX_INTERNAL_FUNC void osal_txn_unlock(MDBX_env *env); /// \brief Sets alive-flag of reader presence (indicative lock) for PID of /// the current process. The function does no more than needed for @@ -1773,7 +1780,7 @@ MDBX_INTERNAL_FUNC int osal_rpid_check(MDBX_env *env, uint32_t pid); MDBX_INTERNAL_FUNC int osal_mb2w(const char *const src, wchar_t **const pdst); typedef void(WINAPI *osal_srwlock_t_function)(osal_srwlock_t *); -MDBX_INTERNAL_VAR osal_srwlock_t_function osal_srwlock_Init, +MDBX_INTERNAL_VAR_PROTO osal_srwlock_t_function osal_srwlock_Init, osal_srwlock_AcquireShared, osal_srwlock_ReleaseShared, osal_srwlock_AcquireExclusive, osal_srwlock_ReleaseExclusive; @@ -1826,7 +1833,7 @@ typedef struct _FILE_REMOTE_PROTOCOL_INFO { typedef BOOL(WINAPI *MDBX_GetFileInformationByHandleEx)( _In_ HANDLE hFile, _In_ FILE_INFO_BY_HANDLE_CLASS FileInformationClass, _Out_ LPVOID lpFileInformation, _In_ DWORD dwBufferSize); -MDBX_INTERNAL_VAR MDBX_GetFileInformationByHandleEx +MDBX_INTERNAL_VAR_PROTO MDBX_GetFileInformationByHandleEx mdbx_GetFileInformationByHandleEx; typedef BOOL(WINAPI *MDBX_GetVolumeInformationByHandleW)( @@ -1835,19 +1842,20 @@ typedef BOOL(WINAPI *MDBX_GetVolumeInformationByHandleW)( _Out_opt_ LPDWORD lpMaximumComponentLength, _Out_opt_ LPDWORD lpFileSystemFlags, _Out_opt_ LPWSTR lpFileSystemNameBuffer, _In_ DWORD nFileSystemNameSize); -MDBX_INTERNAL_VAR MDBX_GetVolumeInformationByHandleW +MDBX_INTERNAL_VAR_PROTO MDBX_GetVolumeInformationByHandleW mdbx_GetVolumeInformationByHandleW; typedef DWORD(WINAPI 
*MDBX_GetFinalPathNameByHandleW)(_In_ HANDLE hFile, _Out_ LPWSTR lpszFilePath, _In_ DWORD cchFilePath, _In_ DWORD dwFlags); -MDBX_INTERNAL_VAR MDBX_GetFinalPathNameByHandleW mdbx_GetFinalPathNameByHandleW; +MDBX_INTERNAL_VAR_PROTO MDBX_GetFinalPathNameByHandleW + mdbx_GetFinalPathNameByHandleW; typedef BOOL(WINAPI *MDBX_SetFileInformationByHandle)( _In_ HANDLE hFile, _In_ FILE_INFO_BY_HANDLE_CLASS FileInformationClass, _Out_ LPVOID lpFileInformation, _In_ DWORD dwBufferSize); -MDBX_INTERNAL_VAR MDBX_SetFileInformationByHandle +MDBX_INTERNAL_VAR_PROTO MDBX_SetFileInformationByHandle mdbx_SetFileInformationByHandle; typedef NTSTATUS(NTAPI *MDBX_NtFsControlFile)( @@ -1856,10 +1864,10 @@ typedef NTSTATUS(NTAPI *MDBX_NtFsControlFile)( OUT PIO_STATUS_BLOCK IoStatusBlock, IN ULONG FsControlCode, IN OUT PVOID InputBuffer, IN ULONG InputBufferLength, OUT OPTIONAL PVOID OutputBuffer, IN ULONG OutputBufferLength); -MDBX_INTERNAL_VAR MDBX_NtFsControlFile mdbx_NtFsControlFile; +MDBX_INTERNAL_VAR_PROTO MDBX_NtFsControlFile mdbx_NtFsControlFile; typedef uint64_t(WINAPI *MDBX_GetTickCount64)(void); -MDBX_INTERNAL_VAR MDBX_GetTickCount64 mdbx_GetTickCount64; +MDBX_INTERNAL_VAR_PROTO MDBX_GetTickCount64 mdbx_GetTickCount64; #if !defined(_WIN32_WINNT_WIN8) || _WIN32_WINNT < _WIN32_WINNT_WIN8 typedef struct _WIN32_MEMORY_RANGE_ENTRY { @@ -1871,13 +1879,13 @@ typedef struct _WIN32_MEMORY_RANGE_ENTRY { typedef BOOL(WINAPI *MDBX_PrefetchVirtualMemory)( HANDLE hProcess, ULONG_PTR NumberOfEntries, PWIN32_MEMORY_RANGE_ENTRY VirtualAddresses, ULONG Flags); -MDBX_INTERNAL_VAR MDBX_PrefetchVirtualMemory mdbx_PrefetchVirtualMemory; +MDBX_INTERNAL_VAR_PROTO MDBX_PrefetchVirtualMemory mdbx_PrefetchVirtualMemory; typedef enum _SECTION_INHERIT { ViewShare = 1, ViewUnmap = 2 } SECTION_INHERIT; typedef NTSTATUS(NTAPI *MDBX_NtExtendSection)(IN HANDLE SectionHandle, IN PLARGE_INTEGER NewSectionSize); -MDBX_INTERNAL_VAR MDBX_NtExtendSection mdbx_NtExtendSection; +MDBX_INTERNAL_VAR_PROTO 
MDBX_NtExtendSection mdbx_NtExtendSection; static __inline bool mdbx_RunningUnderWine(void) { return !mdbx_NtExtendSection; @@ -1887,14 +1895,15 @@ typedef LSTATUS(WINAPI *MDBX_RegGetValueA)(HKEY hkey, LPCSTR lpSubKey, LPCSTR lpValue, DWORD dwFlags, LPDWORD pdwType, PVOID pvData, LPDWORD pcbData); -MDBX_INTERNAL_VAR MDBX_RegGetValueA mdbx_RegGetValueA; +MDBX_INTERNAL_VAR_PROTO MDBX_RegGetValueA mdbx_RegGetValueA; NTSYSAPI ULONG RtlRandomEx(PULONG Seed); typedef BOOL(WINAPI *MDBX_SetFileIoOverlappedRange)(HANDLE FileHandle, PUCHAR OverlappedRangeStart, ULONG Length); -MDBX_INTERNAL_VAR MDBX_SetFileIoOverlappedRange mdbx_SetFileIoOverlappedRange; +MDBX_INTERNAL_VAR_PROTO MDBX_SetFileIoOverlappedRange + mdbx_SetFileIoOverlappedRange; #endif /* Windows */ @@ -2124,7 +2133,7 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor; #error MDBX_DPL_PREALLOC_FOR_RADIXSORT must be defined as 0 or 1 #endif /* MDBX_DPL_PREALLOC_FOR_RADIXSORT */ -/** Controls dirty pages tracking, spilling and persisting in MDBX_WRITEMAP +/** Controls dirty pages tracking, spilling and persisting in `MDBX_WRITEMAP` * mode. 0/OFF = Don't track dirty pages at all, don't spill ones, and use * msync() to persist data. This is by-default on Linux and other systems where * kernel provides properly LRU tracking and effective flushing on-demand. 1/ON @@ -2141,6 +2150,22 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor; #error MDBX_AVOID_MSYNC must be defined as 0 or 1 #endif /* MDBX_AVOID_MSYNC */ +/** Управляет механизмом поддержки разреженных наборов DBI-хендлов для снижения + * накладных расходов при запуске и обработке транзакций. 
*/ +#ifndef MDBX_ENABLE_DBI_SPARSE +#define MDBX_ENABLE_DBI_SPARSE 1 +#elif !(MDBX_ENABLE_DBI_SPARSE == 0 || MDBX_ENABLE_DBI_SPARSE == 1) +#error MDBX_ENABLE_DBI_SPARSE must be defined as 0 or 1 +#endif /* MDBX_ENABLE_DBI_SPARSE */ + +/** Управляет механизмом отложенного освобождения и поддержки пути быстрого + * открытия DBI-хендлов без захвата блокировок. */ +#ifndef MDBX_ENABLE_DBI_LOCKFREE +#define MDBX_ENABLE_DBI_LOCKFREE 1 +#elif !(MDBX_ENABLE_DBI_LOCKFREE == 0 || MDBX_ENABLE_DBI_LOCKFREE == 1) +#error MDBX_ENABLE_DBI_LOCKFREE must be defined as 0 or 1 +#endif /* MDBX_ENABLE_DBI_LOCKFREE */ + /** Controls sort order of internal page number lists. * This mostly experimental/advanced option with not for regular MDBX users. * \warning The database format depend on this option and libmdbx built with @@ -2188,8 +2213,8 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor; /** If defined then enables integration with Valgrind, * a memory analyzing tool. */ -#ifndef MDBX_USE_VALGRIND -#endif /* MDBX_USE_VALGRIND */ +#ifndef ENABLE_MEMCHECK +#endif /* ENABLE_MEMCHECK */ /** If defined then enables use C11 atomics, * otherwise detects ones availability automatically. 
*/ @@ -2512,13 +2537,23 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor; #define MDBX_RUNTIME_FLAGS_INIT \ ((MDBX_DEBUG) > 0) * MDBX_DBG_ASSERT + ((MDBX_DEBUG) > 1) * MDBX_DBG_AUDIT -extern uint8_t runtime_flags; -extern uint8_t loglevel; -extern MDBX_debug_func *debug_logger; +union logger_union { + void *ptr; + MDBX_debug_func *fmt; + MDBX_debug_func_nofmt *nofmt; +}; + +MDBX_INTERNAL_VAR_PROTO struct mdbx_static { + uint8_t flags; + uint8_t loglevel; + union logger_union logger; + size_t logger_buffer_size; + char *logger_buffer; +} mdbx_static; MDBX_MAYBE_UNUSED static __inline void jitter4testing(bool tiny) { #if MDBX_DEBUG - if (MDBX_DBG_JITTER & runtime_flags) + if (MDBX_DBG_JITTER & mdbx_static.flags) osal_jitter(tiny); #else (void)tiny; @@ -2532,17 +2567,17 @@ MDBX_INTERNAL_FUNC void debug_log_va(int level, const char *function, int line, const char *fmt, va_list args); #if MDBX_DEBUG -#define LOG_ENABLED(msg) unlikely(msg <= loglevel) -#define AUDIT_ENABLED() unlikely((runtime_flags & MDBX_DBG_AUDIT)) +#define LOG_ENABLED(msg) unlikely(msg <= mdbx_static.loglevel) +#define AUDIT_ENABLED() unlikely((mdbx_static.flags & MDBX_DBG_AUDIT)) #else /* MDBX_DEBUG */ -#define LOG_ENABLED(msg) (msg < MDBX_LOG_VERBOSE && msg <= loglevel) +#define LOG_ENABLED(msg) (msg < MDBX_LOG_VERBOSE && msg <= mdbx_static.loglevel) #define AUDIT_ENABLED() (0) #endif /* MDBX_DEBUG */ #if MDBX_FORCE_ASSERTIONS #define ASSERT_ENABLED() (1) #elif MDBX_DEBUG -#define ASSERT_ENABLED() likely((runtime_flags & MDBX_DBG_ASSERT)) +#define ASSERT_ENABLED() likely((mdbx_static.flags & MDBX_DBG_ASSERT)) #else #define ASSERT_ENABLED() (0) #endif /* assertions */ @@ -2977,7 +3012,8 @@ typedef struct MDBX_page { #define PAGETYPE_WHOLE(p) ((uint8_t)(p)->mp_flags) -/* Drop legacy P_DIRTY flag for sub-pages for compatilibity */ +/* Drop legacy P_DIRTY flag for sub-pages for compatilibity, + * for assertions only. */ #define PAGETYPE_COMPAT(p) \ (unlikely(PAGETYPE_WHOLE(p) & P_SUBP) \ ? 
PAGETYPE_WHOLE(p) & ~(P_SUBP | P_LEGACY_DIRTY) \ @@ -3086,7 +3122,7 @@ typedef sem_t osal_ipclock_t; #endif /* MDBX_LOCKING */ #if MDBX_LOCKING > MDBX_LOCKING_SYSV && !defined(__cplusplus) -MDBX_INTERNAL_FUNC int osal_ipclock_stub(osal_ipclock_t *ipc); +MDBX_INTERNAL_FUNC int osal_ipclock_stubinit(osal_ipclock_t *ipc); MDBX_INTERNAL_FUNC int osal_ipclock_destroy(osal_ipclock_t *ipc); #endif /* MDBX_LOCKING */ @@ -3099,8 +3135,9 @@ MDBX_INTERNAL_FUNC int osal_ipclock_destroy(osal_ipclock_t *ipc); * read transactions started by the same thread need no further locking to * proceed. * - * If MDBX_NOTLS is set, the slot address is not saved in thread-specific data. - * No reader table is used if the database is on a read-only filesystem. + * If MDBX_NOSTICKYTHREADS is set, the slot address is not saved in + * thread-specific data. No reader table is used if the database is on a + * read-only filesystem. * * Since the database uses multi-version concurrency control, readers don't * actually need any locking. 
This table is used to keep track of which @@ -3410,10 +3447,10 @@ typedef struct troika { #if MDBX_WORDBITS > 32 /* Workaround for false-positives from Valgrind */ uint32_t unused_pad; #endif -#define TROIKA_HAVE_STEADY(troika) ((troika)->fsm & 7) -#define TROIKA_STRICT_VALID(troika) ((troika)->tail_and_flags & 64) -#define TROIKA_VALID(troika) ((troika)->tail_and_flags & 128) -#define TROIKA_TAIL(troika) ((troika)->tail_and_flags & 3) +#define TROIKA_HAVE_STEADY(troika) ((troika)->fsm & 7u) +#define TROIKA_STRICT_VALID(troika) ((troika)->tail_and_flags & 64u) +#define TROIKA_VALID(troika) ((troika)->tail_and_flags & 128u) +#define TROIKA_TAIL(troika) ((troika)->tail_and_flags & 3u) txnid_t txnid[NUM_METAS]; } meta_troika_t; @@ -3443,6 +3480,8 @@ struct MDBX_txn { #error "Oops, some txn flags overlapped or wrong" #endif uint32_t mt_flags; + unsigned mt_numdbs; + size_t mt_owner; /* thread ID that owns this transaction */ MDBX_txn *mt_parent; /* parent of a nested txn */ /* Nested txn under this txn, set together with flag MDBX_TXN_HAS_CHILD */ @@ -3460,31 +3499,30 @@ struct MDBX_txn { txnid_t mt_front; MDBX_env *mt_env; /* the DB environment */ - /* Array of records for each DB known in the environment. 
*/ - MDBX_dbx *mt_dbxs; /* Array of MDBX_db records for each known DB */ MDBX_db *mt_dbs; - /* Array of sequence numbers for each DB handle */ - MDBX_atomic_uint32_t *mt_dbiseqs; - - /* Transaction DBI Flags */ -#define DBI_DIRTY MDBX_DBI_DIRTY /* DB was written in this txn */ -#define DBI_STALE MDBX_DBI_STALE /* Named-DB record is older than txnID */ -#define DBI_FRESH MDBX_DBI_FRESH /* Named-DB handle opened in this txn */ -#define DBI_CREAT MDBX_DBI_CREAT /* Named-DB handle created in this txn */ -#define DBI_VALID 0x10 /* DB handle is valid, see also DB_VALID */ -#define DBI_USRVALID 0x20 /* As DB_VALID, but not set for FREE_DBI */ -#define DBI_AUDITED 0x40 /* Internal flag for accounting during audit */ - /* Array of flags for each DB */ - uint8_t *mt_dbistate; - /* Number of DB records in use, or 0 when the txn is finished. - * This number only ever increments until the txn finishes; we - * don't decrement it when individual DB handles are closed. */ - MDBX_dbi mt_numdbs; - size_t mt_owner; /* thread ID that owns this transaction */ + +#if MDBX_ENABLE_DBI_SPARSE + unsigned *__restrict mt_dbi_sparse; +#endif /* MDBX_ENABLE_DBI_SPARSE */ + + /* Non-shared DBI state flags inside transaction */ +#define DBI_DIRTY 0x01 /* DB was written in this txn */ +#define DBI_STALE 0x02 /* Named-DB record is older than txnID */ +#define DBI_FRESH 0x04 /* Named-DB handle opened in this txn */ +#define DBI_CREAT 0x08 /* Named-DB handle created in this txn */ +#define DBI_VALID 0x10 /* Handle is valid, see also DB_VALID */ +#define DBI_OLDEN 0x40 /* Handle was closed/reopened outside txn */ +#define DBI_LINDO 0x80 /* Lazy initialization done for DBI-slot */ + /* Array of non-shared txn's flags of DBI */ + uint8_t *__restrict mt_dbi_state; + + /* Array of sequence numbers for each DB handle. 
*/ + uint32_t *__restrict mt_dbi_seqs; + MDBX_cursor **mt_cursors; + MDBX_canary mt_canary; void *mt_userctx; /* User-settable context */ - MDBX_cursor **mt_cursors; union { struct { @@ -3494,8 +3532,8 @@ struct MDBX_txn { struct { meta_troika_t troika; /* In write txns, array of cursors for each DB */ - MDBX_PNL relist; /* Reclaimed GC pages */ - txnid_t last_reclaimed; /* ID of last used record */ + MDBX_PNL __restrict relist; /* Reclaimed GC pages */ + txnid_t last_reclaimed; /* ID of last used record */ #if MDBX_ENABLE_REFUND pgno_t loose_refund_wl /* FIXME: describe */; #endif /* MDBX_ENABLE_REFUND */ @@ -3507,14 +3545,14 @@ struct MDBX_txn { * dirtylist into mt_parent after freeing hidden mt_parent pages. */ size_t dirtyroom; /* For write txns: Modified pages. Sorted when not MDBX_WRITEMAP. */ - MDBX_dpl *dirtylist; + MDBX_dpl *__restrict dirtylist; /* The list of reclaimed txns from GC */ - MDBX_TXL lifo_reclaimed; + MDBX_TXL __restrict lifo_reclaimed; /* The list of pages that became unused during this transaction. */ - MDBX_PNL retired_pages; + MDBX_PNL __restrict retired_pages; /* The list of loose pages that became unused and may be reused * in this transaction, linked through `mp_next`. */ - MDBX_page *loose_pages; + MDBX_page *__restrict loose_pages; /* Number of loose pages (tw.loose_pages) */ size_t loose_count; union { @@ -3523,11 +3561,12 @@ struct MDBX_txn { /* The sorted list of dirty pages we temporarily wrote to disk * because the dirty list was full. page numbers in here are * shifted left by 1, deleted slots have the LSB set. 
*/ - MDBX_PNL list; + MDBX_PNL __restrict list; } spilled; size_t writemap_dirty_npages; size_t writemap_spilled_npages; }; + uint64_t gc_time_acc; } tw; }; }; @@ -3566,8 +3605,8 @@ struct MDBX_cursor { MDBX_db *mc_db; /* The database auxiliary record for this cursor */ MDBX_dbx *mc_dbx; - /* The mt_dbistate for this database */ - uint8_t *mc_dbistate; + /* The mt_dbi_state[] for this DBI */ + uint8_t *__restrict mc_dbi_state; uint8_t mc_snum; /* number of pushed pages */ uint8_t mc_top; /* index of top page, normally mc_snum-1 */ @@ -3620,6 +3659,11 @@ typedef struct MDBX_cursor_couple { MDBX_xcursor inner; } MDBX_cursor_couple; +struct mdbx_defer_free_item { + struct mdbx_defer_free_item *next; + uint64_t timestamp; +}; + /* The database environment. */ struct MDBX_env { /* ----------------------------------------------------- mostly static part */ @@ -3637,6 +3681,7 @@ struct MDBX_env { #define MDBX_DEPRECATED_COALESCE UINT32_C(0x2000000) #define ENV_INTERNAL_FLAGS (MDBX_FATAL_ERROR | MDBX_ENV_ACTIVE | MDBX_ENV_TXKEY) uint32_t me_flags; + unsigned me_psize; /* DB page size, initialized from me_os_psize */ osal_mmap_t me_dxb_mmap; /* The main data file */ #define me_map me_dxb_mmap.base #define me_lazy_fd me_dxb_mmap.fd @@ -3649,7 +3694,6 @@ struct MDBX_env { #define me_lfd me_lck_mmap.fd struct MDBX_lockinfo *me_lck; - unsigned me_psize; /* DB page size, initialized from me_os_psize */ uint16_t me_leaf_nodemax; /* max size of a leaf-node */ uint16_t me_branch_nodemax; /* max size of a branch-node */ uint16_t me_subpage_limit; @@ -3667,13 +3711,15 @@ struct MDBX_env { MDBX_dbi me_maxdbs; /* size of the DB table */ uint32_t me_pid; /* process ID of this env */ osal_thread_key_t me_txkey; /* thread-key for readers */ - pathchar_t *me_pathname; /* path to the DB files */ - void *me_pbuf; /* scratch area for DUPSORT put() */ - MDBX_txn *me_txn0; /* preallocated write transaction */ - - MDBX_dbx *me_dbxs; /* array of static DB info */ - uint16_t *me_dbflags; /* array 
of flags from MDBX_db.md_flags */ - MDBX_atomic_uint32_t *me_dbiseqs; /* array of dbi sequence numbers */ + struct { /* path to the DB files */ + pathchar_t *lck, *dxb, *specified; + void *buffer; + } me_pathname; + void *me_pbuf; /* scratch area for DUPSORT put() */ + MDBX_txn *me_txn0; /* preallocated write transaction */ + MDBX_dbx *me_dbxs; /* array of static DB info */ + uint16_t *__restrict me_db_flags; /* array of flags from MDBX_db.md_flags */ + MDBX_atomic_uint32_t *me_dbi_seqs; /* array of dbi sequence numbers */ unsigned me_maxgc_ov1page; /* Number of pgno_t fit in a single overflow page */ unsigned me_maxgc_per_branch; @@ -3687,6 +3733,7 @@ struct MDBX_env { unsigned rp_augment_limit; unsigned dp_limit; unsigned dp_initial; + uint64_t gc_time_limit; uint8_t dp_loose_limit; uint8_t spill_max_denominator; uint8_t spill_min_denominator; @@ -3696,6 +3743,8 @@ struct MDBX_env { unsigned writethrough_threshold; #endif /* Windows */ bool prefault_write; + bool prefer_waf_insteadof_balance; /* Strive to minimize WAF instead of + balancing pages fullment */ union { unsigned all; /* tracks options with non-auto values but tuned by user */ @@ -3725,20 +3774,23 @@ struct MDBX_env { } me_sysv_ipc; #endif /* MDBX_LOCKING == MDBX_LOCKING_SYSV */ bool me_incore; + bool me_prefault_write; - MDBX_env *me_lcklist_next; +#if MDBX_ENABLE_DBI_LOCKFREE + struct mdbx_defer_free_item *me_defer_free; +#endif /* MDBX_ENABLE_DBI_LOCKFREE */ /* --------------------------------------------------- mostly volatile part */ MDBX_txn *me_txn; /* current write transaction */ osal_fastmutex_t me_dbi_lock; - MDBX_dbi me_numdbs; /* number of DBs opened */ - bool me_prefault_write; + unsigned me_numdbs; /* number of DBs opened */ - MDBX_page *me_dp_reserve; /* list of malloc'ed blocks for re-use */ unsigned me_dp_reserve_len; + MDBX_page *__restrict me_dp_reserve; /* list of malloc'ed blocks for re-use */ + /* PNL of pages that became unused in a write txn */ - MDBX_PNL me_retired_pages; + 
MDBX_PNL __restrict me_retired_pages; osal_ioring_t me_ioring; #if defined(_WIN32) || defined(_WIN64) @@ -3756,13 +3808,12 @@ struct MDBX_env { #if MDBX_DEBUG MDBX_assert_func *me_assert_func; /* Callback for assertion failures */ #endif -#ifdef MDBX_USE_VALGRIND +#ifdef ENABLE_MEMCHECK int me_valgrind_handle; #endif -#if defined(MDBX_USE_VALGRIND) || defined(__SANITIZE_ADDRESS__) - MDBX_atomic_uint32_t me_ignore_EDEADLK; +#if defined(ENABLE_MEMCHECK) || defined(__SANITIZE_ADDRESS__) pgno_t me_poison_edge; -#endif /* MDBX_USE_VALGRIND || __SANITIZE_ADDRESS__ */ +#endif /* ENABLE_MEMCHECK || __SANITIZE_ADDRESS__ */ #ifndef xMDBX_DEBUG_SPILLING #define xMDBX_DEBUG_SPILLING 0 @@ -3822,10 +3873,6 @@ osal_flush_incoherent_mmap(const void *addr, size_t nbytes, MDBX_INTERNAL_FUNC int cleanup_dead_readers(MDBX_env *env, int rlocked, int *dead); -MDBX_INTERNAL_FUNC int rthc_alloc(osal_thread_key_t *key, MDBX_reader *begin, - MDBX_reader *end); -MDBX_INTERNAL_FUNC void rthc_remove(const osal_thread_key_t key); - MDBX_INTERNAL_FUNC void global_ctor(void); MDBX_INTERNAL_FUNC void osal_ctor(void); MDBX_INTERNAL_FUNC void global_dtor(void); @@ -3940,7 +3987,8 @@ typedef struct MDBX_node { /* mdbx_dbi_open() flags */ #define DB_USABLE_FLAGS (DB_PERSISTENT_FLAGS | MDBX_CREATE | MDBX_DB_ACCEDE) -#define DB_VALID 0x8000 /* DB handle is valid, for me_dbflags */ +#define DB_VALID 0x8000u /* DB handle is valid, for me_db_flags */ +#define DB_POISON 0x7fffu /* update pending */ #define DB_INTERNAL_FLAGS DB_VALID #if DB_INTERNAL_FLAGS & DB_USABLE_FLAGS @@ -4029,11 +4077,11 @@ log2n_powerof2(size_t value_uintptr) { * environment and re-opening it with the new flags. 
*/ #define ENV_CHANGEABLE_FLAGS \ (MDBX_SAFE_NOSYNC | MDBX_NOMETASYNC | MDBX_DEPRECATED_MAPASYNC | \ - MDBX_NOMEMINIT | MDBX_COALESCE | MDBX_PAGEPERTURB | MDBX_ACCEDE | \ - MDBX_VALIDATION) + MDBX_NOMEMINIT | MDBX_DEPRECATED_COALESCE | MDBX_PAGEPERTURB | \ + MDBX_ACCEDE | MDBX_VALIDATION) #define ENV_CHANGELESS_FLAGS \ - (MDBX_NOSUBDIR | MDBX_RDONLY | MDBX_WRITEMAP | MDBX_NOTLS | MDBX_NORDAHEAD | \ - MDBX_LIFORECLAIM | MDBX_EXCLUSIVE) + (MDBX_NOSUBDIR | MDBX_RDONLY | MDBX_WRITEMAP | MDBX_NOSTICKYTHREADS | \ + MDBX_NORDAHEAD | MDBX_LIFORECLAIM | MDBX_EXCLUSIVE) #define ENV_USABLE_FLAGS (ENV_CHANGEABLE_FLAGS | ENV_CHANGELESS_FLAGS) #if !defined(__cplusplus) || CONSTEXPR_ENUM_FLAGS_OPERATIONS @@ -4067,6 +4115,36 @@ MDBX_MAYBE_UNUSED static void static_checks(void) { ASAN_UNPOISON_MEMORY_REGION(addr, size); \ } while (0) +/******************************************************************************/ + +/** \brief Page types for traverse the b-tree. + * \see mdbx_env_pgwalk() \see MDBX_pgvisitor_func */ +enum MDBX_page_type_t { + MDBX_page_broken, + MDBX_page_large, + MDBX_page_branch, + MDBX_page_leaf, + MDBX_page_dupfixed_leaf, + MDBX_subpage_leaf, + MDBX_subpage_dupfixed_leaf, + MDBX_subpage_broken, +}; +typedef enum MDBX_page_type_t MDBX_page_type_t; + +typedef struct MDBX_walk_sdb { + MDBX_val name; + struct MDBX_db *internal, *nested; +} MDBX_walk_sdb_t; + +/** \brief Callback function for traverse the b-tree. \see mdbx_env_pgwalk() */ +typedef int +MDBX_pgvisitor_func(const size_t pgno, const unsigned number, void *const ctx, + const int deep, const MDBX_walk_sdb_t *subdb, + const size_t page_size, const MDBX_page_type_t page_type, + const MDBX_error_t err, const size_t nentries, + const size_t payload_bytes, const size_t header_bytes, + const size_t unused_bytes); + #if defined(_WIN32) || defined(_WIN64) /* * POSIX getopt for Windows