diff --git a/contrib/db/liblmdb/CHANGES b/contrib/db/liblmdb/CHANGES index f00efbb9..aabb1adf 100644 --- a/contrib/db/liblmdb/CHANGES +++ b/contrib/db/liblmdb/CHANGES @@ -1,70 +1,14 @@ LMDB 0.9 Change Log -LMDB 0.9.24 Release (2019/07/24) - ITS#8969 Tweak mdb_page_split - ITS#8975 WIN32 fix writemap set_mapsize crash - ITS#9007 Fix loose pages in WRITEMAP - -LMDB 0.9.23 Release (2018/12/19) - ITS#8756 Fix loose pages in dirty list - ITS#8831 Fix mdb_load flag init - ITS#8844 Fix mdb_env_close in forked process - Documentation - ITS#8857 mdb_cursor_del doesn't invalidate cursor - ITS#8908 GET_MULTIPLE etc don't change passed in key - -LMDB 0.9.22 Release (2018/03/22) - Fix MDB_DUPSORT alignment bug (ITS#8819) - Fix regression with new db from 0.9.19 (ITS#8760) - Fix liblmdb to build on Solaris (ITS#8612) - Fix delete behavior with DUPSORT DB (ITS#8622) - Fix mdb_cursor_get/mdb_cursor_del behavior (ITS#8722) - -LMDB 0.9.21 Release (2017/06/01) - Fix xcursor after cursor_del (ITS#8622) - -LMDB 0.9.20 (Withdrawn) - Fix mdb_load with escaped plaintext (ITS#8558) - Fix mdb_cursor_last / mdb_put interaction (ITS#8557) - -LMDB 0.9.19 Release (2016/12/28) - Fix mdb_env_cwalk cursor init (ITS#8424) - Fix robust mutexes on Solaris 10/11 (ITS#8339) - Tweak Win32 error message buffer - Fix MDB_GET_BOTH on non-dup record (ITS#8393) - Optimize mdb_drop - Fix xcursors after mdb_cursor_del (ITS#8406) - Fix MDB_NEXT_DUP after mdb_cursor_del (ITS#8412) - Fix mdb_cursor_put resetting C_EOF (ITS#8489) - Fix mdb_env_copyfd2 to return EPIPE on SIGPIPE (ITS#8504) - Fix mdb_env_copy with empty DB (ITS#8209) - Fix behaviors with fork (ITS#8505) - Fix mdb_dbi_open with mainDB cursors (ITS#8542) - Fix robust mutexes on kFreeBSD (ITS#8554) - Fix utf8_to_utf16 error checks (ITS#7992) - Fix F_NOCACHE on MacOS, error is non-fatal (ITS#7682) - Build - Make shared lib suffix overridable (ITS#8481) - Documentation - Cleanup doxygen nits - Note reserved vs actual mem/disk usage - - -LMDB 0.9.18 
Release (2016/02/05) +LMDB 0.9.18 Release Engineering Fix robust mutex detection on glibc 2.10-11 (ITS#8330) - Fix page_search_root assert on FreeDB (ITS#8336) - Fix MDB_APPENDDUP vs. rewrite(single item) (ITS#8334) - Fix mdb_copy of large files on Windows - Fix subcursor move after delete (ITS#8355) - Fix mdb_midl_shirnk off-by-one (ITS#8363) Check for utf8_to_utf16 failures (ITS#7992) Catch strdup failure in mdb_dbi_open Build Additional makefile var tweaks (ITS#8169) Documentation Add Getting Started page - Update WRITEMAP description - + LMDB 0.9.17 Release (2015/11/30) Fix ITS#7377 catch calloc failure diff --git a/contrib/db/liblmdb/COPYRIGHT b/contrib/db/liblmdb/COPYRIGHT index f076556e..722d1a51 100644 --- a/contrib/db/liblmdb/COPYRIGHT +++ b/contrib/db/liblmdb/COPYRIGHT @@ -1,4 +1,4 @@ -Copyright 2011-2019 Howard Chu, Symas Corp. +Copyright 2011-2015 Howard Chu, Symas Corp. All rights reserved. Redistribution and use in source and binary forms, with or without diff --git a/contrib/db/liblmdb/Doxyfile b/contrib/db/liblmdb/Doxyfile index 5ca2cfe8..5047c0bb 100644 --- a/contrib/db/liblmdb/Doxyfile +++ b/contrib/db/liblmdb/Doxyfile @@ -253,7 +253,7 @@ IDL_PROPERTY_SUPPORT = YES # member in the group (if any) for the other members of the group. By default # all members of a group must be documented explicitly. -DISTRIBUTE_GROUP_DOC = YES +DISTRIBUTE_GROUP_DOC = NO # Set the SUBGROUPING tag to YES (the default) to allow class member groups of # the same type (for instance a group of public functions) to be put as a diff --git a/contrib/db/liblmdb/Makefile b/contrib/db/liblmdb/Makefile index f254511f..f3c93a2f 100644 --- a/contrib/db/liblmdb/Makefile +++ b/contrib/db/liblmdb/Makefile @@ -8,7 +8,7 @@ # platforms; you should not need to change any of these. 
# Read their descriptions in mdb.c if you do: # -# - MDB_USE_POSIX_SEM +# - MDB_USE_POSIX_MUTEX, MDB_USE_POSIX_SEM, MDB_USE_SYSV_SEM # - MDB_DSYNC # - MDB_FDATASYNC # - MDB_FDATASYNC_WORKS @@ -24,9 +24,8 @@ W = -W -Wall -Wno-unused-parameter -Wbad-function-cast -Wuninitialized THREADS = -pthread OPT = -O2 -g CFLAGS = $(THREADS) $(OPT) $(W) $(XCFLAGS) -LDLIBS = -SOLIBS = -SOEXT = .so +LDLIBS = # -lntdll # Windows needs ntdll +SOLIBS = # -lntdll prefix = /usr/local exec_prefix = $(prefix) bindir = $(exec_prefix)/bin @@ -38,7 +37,7 @@ mandir = $(datarootdir)/man ######################################################################## IHDRS = lmdb.h -ILIBS = liblmdb.a liblmdb$(SOEXT) +ILIBS = liblmdb.a liblmdb.so IPROGS = mdb_stat mdb_copy mdb_dump mdb_load IDOCS = mdb_stat.1 mdb_copy.1 mdb_dump.1 mdb_load.1 PROGS = $(IPROGS) mtest mtest2 mtest3 mtest4 mtest5 @@ -64,7 +63,7 @@ test: all liblmdb.a: mdb.o midl.o $(AR) rs $@ mdb.o midl.o -liblmdb$(SOEXT): mdb.lo midl.lo +liblmdb.so: mdb.lo midl.lo # $(CC) $(LDFLAGS) -pthread -shared -Wl,-Bsymbolic -o $@ mdb.o midl.o $(SOLIBS) $(CC) $(LDFLAGS) -pthread -shared -o $@ mdb.lo midl.lo $(SOLIBS) diff --git a/contrib/db/liblmdb/intro.doc b/contrib/db/liblmdb/intro.doc index 64dfcaad..870c7bb8 100644 --- a/contrib/db/liblmdb/intro.doc +++ b/contrib/db/liblmdb/intro.doc @@ -1,5 +1,5 @@ /* - * Copyright 2015-2018 Howard Chu, Symas Corp. + * Copyright 2015 Howard Chu, Symas Corp. * All rights reserved. * * Redistribution and use in source and binary forms, with or without diff --git a/contrib/db/liblmdb/lmdb.h b/contrib/db/liblmdb/lmdb.h index 2f55290d..0bca3eb7 100644 --- a/contrib/db/liblmdb/lmdb.h +++ b/contrib/db/liblmdb/lmdb.h @@ -53,14 +53,15 @@ * * Fix: Check for stale readers periodically, using the * #mdb_reader_check function or the \ref mdb_stat_1 "mdb_stat" tool. 
- * Stale writers will be cleared automatically on some systems: + * Stale writers will be cleared automatically on most systems: * - Windows - automatic + * - BSD, systems using SysV semaphores - automatic * - Linux, systems using POSIX mutexes with Robust option - automatic - * - not on BSD, systems using POSIX semaphores. * Otherwise just make all programs using the database close it; * the lockfile is always reset on first open of the environment. * - * - On BSD systems or others configured with MDB_USE_POSIX_SEM, + * - On BSD systems or others configured with MDB_USE_SYSV_SEM or + * MDB_USE_POSIX_SEM, * startup can fail due to semaphores owned by another userid. * * Fix: Open and close the database as the user which owns the @@ -96,12 +97,11 @@ * transactions. Each transaction belongs to one thread. See below. * The #MDB_NOTLS flag changes this for read-only transactions. * - * - Use an MDB_env* in the process which opened it, not after fork(). + * - Use an MDB_env* in the process which opened it, without fork()ing. * * - Do not have open an LMDB database twice in the same process at * the same time. Not even from a plain open() call - close()ing it - * breaks fcntl() advisory locking. (It is OK to reopen it after - * fork() - exec*(), since the lockfile has FD_CLOEXEC set.) + * breaks flock() advisory locking. * * - Avoid long-lived transactions. Read transactions prevent * reuse of pages freed by newer write transactions, thus the @@ -135,7 +135,7 @@ * * @author Howard Chu, Symas Corporation. * - * @copyright Copyright 2011-2019 Howard Chu, Symas Corp. All rights reserved. + * @copyright Copyright 2011-2016 Howard Chu, Symas Corp. All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted only as authorized by the OpenLDAP @@ -166,6 +166,7 @@ #define _LMDB_H_ #include +#include #ifdef __cplusplus extern "C" { @@ -178,6 +179,13 @@ typedef int mdb_mode_t; typedef mode_t mdb_mode_t; #endif +#ifdef MDB_VL32 +typedef uint64_t mdb_size_t; +#define mdb_env_create mdb_env_create_vl32 /**< Prevent mixing with non-VL32 builds */ +#else +typedef size_t mdb_size_t; +#endif + /** An abstraction for a file handle. * On POSIX systems file handles are small integers. On Windows * they're opaque pointers. @@ -200,7 +208,7 @@ typedef int mdb_filehandle_t; /** Library minor version */ #define MDB_VERSION_MINOR 9 /** Library patch version */ -#define MDB_VERSION_PATCH 24 +#define MDB_VERSION_PATCH 70 /** Combine args a,b,c into a single integer for easy version comparisons */ #define MDB_VERINT(a,b,c) (((a) << 24) | ((b) << 16) | (c)) @@ -210,7 +218,7 @@ typedef int mdb_filehandle_t; MDB_VERINT(MDB_VERSION_MAJOR,MDB_VERSION_MINOR,MDB_VERSION_PATCH) /** The release date of this library version */ -#define MDB_VERSION_DATE "July 24, 2019" +#define MDB_VERSION_DATE "December 19, 2015" /** A stringifier for the version info */ #define MDB_VERSTR(a,b,c,d) "LMDB " #a "." #b "." #c ": (" d ")" @@ -303,6 +311,8 @@ typedef void (MDB_rel_func)(MDB_val *item, void *oldptr, void *newptr, void *rel #define MDB_NORDAHEAD 0x800000 /** don't initialize malloc'd memory before writing to datafile */ #define MDB_NOMEMINIT 0x1000000 + /** use the previous snapshot rather than the latest one */ +#define MDB_PREVSNAPSHOT 0x2000000 /** @} */ /** @defgroup mdb_dbi_open Database Flags @@ -370,7 +380,7 @@ typedef enum MDB_cursor_op { MDB_GET_BOTH, /**< Position at key/data pair. Only for #MDB_DUPSORT */ MDB_GET_BOTH_RANGE, /**< position at key, nearest data. 
Only for #MDB_DUPSORT */ MDB_GET_CURRENT, /**< Return key/data at current cursor position */ - MDB_GET_MULTIPLE, /**< Return up to a page of duplicate data items + MDB_GET_MULTIPLE, /**< Return key and up to a page of duplicate data items from current cursor position. Move cursor to prepare for #MDB_NEXT_MULTIPLE. Only for #MDB_DUPFIXED */ MDB_LAST, /**< Position at last key/data item */ @@ -379,7 +389,7 @@ typedef enum MDB_cursor_op { MDB_NEXT, /**< Position at next data item */ MDB_NEXT_DUP, /**< Position at next data item of current key. Only for #MDB_DUPSORT */ - MDB_NEXT_MULTIPLE, /**< Return up to a page of duplicate data items + MDB_NEXT_MULTIPLE, /**< Return key and up to a page of duplicate data items from next cursor position. Move cursor to prepare for #MDB_NEXT_MULTIPLE. Only for #MDB_DUPFIXED */ MDB_NEXT_NODUP, /**< Position at first data item of next key */ @@ -390,7 +400,7 @@ typedef enum MDB_cursor_op { MDB_SET, /**< Position at specified key */ MDB_SET_KEY, /**< Position at specified key, return key + data */ MDB_SET_RANGE, /**< Position at first key greater than or equal to specified key. */ - MDB_PREV_MULTIPLE /**< Position at previous page and return up to + MDB_PREV_MULTIPLE /**< Position at previous page and return key and up to a page of duplicate data items. Only for #MDB_DUPFIXED */ } MDB_cursor_op; @@ -457,18 +467,18 @@ typedef struct MDB_stat { unsigned int ms_psize; /**< Size of a database page. This is currently the same for all databases. 
*/ unsigned int ms_depth; /**< Depth (height) of the B-tree */ - size_t ms_branch_pages; /**< Number of internal (non-leaf) pages */ - size_t ms_leaf_pages; /**< Number of leaf pages */ - size_t ms_overflow_pages; /**< Number of overflow pages */ - size_t ms_entries; /**< Number of data items */ + mdb_size_t ms_branch_pages; /**< Number of internal (non-leaf) pages */ + mdb_size_t ms_leaf_pages; /**< Number of leaf pages */ + mdb_size_t ms_overflow_pages; /**< Number of overflow pages */ + mdb_size_t ms_entries; /**< Number of data items */ } MDB_stat; /** @brief Information about the environment */ typedef struct MDB_envinfo { void *me_mapaddr; /**< Address of map, if fixed */ - size_t me_mapsize; /**< Size of the data memory map */ - size_t me_last_pgno; /**< ID of the last used page */ - size_t me_last_txnid; /**< ID of the last committed transaction */ + mdb_size_t me_mapsize; /**< Size of the data memory map */ + mdb_size_t me_last_pgno; /**< ID of the last used page */ + mdb_size_t me_last_txnid; /**< ID of the last committed transaction */ unsigned int me_maxreaders; /**< max reader slots in the environment */ unsigned int me_numreaders; /**< max reader slots used in the environment */ } MDB_envinfo; @@ -614,6 +624,12 @@ int mdb_env_create(MDB_env **env); * caller is expected to overwrite all of the memory that was * reserved in that case. * This flag may be changed at any time using #mdb_env_set_flags(). + *
  • #MDB_PREVSNAPSHOT + * Open the environment with the previous snapshot rather than the latest + * one. This loses the latest transaction, but may help work around some + * types of corruption. If opened with write access, this must be the + * only process using the environment. This flag is automatically reset + * after a write transaction is successfully committed. * * @param[in] mode The UNIX permissions to set on created files and semaphores. * This parameter is ignored on Windows. @@ -680,7 +696,6 @@ int mdb_env_copyfd(MDB_env *env, mdb_filehandle_t fd); *
  • #MDB_CP_COMPACT - Perform compaction while copying: omit free * pages and sequentially renumber all pages in output. This option * consumes more CPU and runs more slowly than the default. - * Currently it fails if the environment has suffered a page leak. * * @return A non-zero error value on failure and 0 on success. */ @@ -795,10 +810,6 @@ int mdb_env_get_flags(MDB_env *env, unsigned int *flags); int mdb_env_get_path(MDB_env *env, const char **path); /** @brief Return the filedescriptor for the given environment. - * - * This function may be called after fork(), so the descriptor can be - * closed before exec*(). Other LMDB file descriptors have FD_CLOEXEC. - * (Until LMDB 0.9.18, only the lockfile had that.) * * @param[in] env An environment handle returned by #mdb_env_create() * @param[out] fd Address of a mdb_filehandle_t to contain the descriptor. @@ -842,7 +853,7 @@ int mdb_env_get_fd(MDB_env *env, mdb_filehandle_t *fd); * an active write transaction. * */ -int mdb_env_set_mapsize(MDB_env *env, size_t size); +int mdb_env_set_mapsize(MDB_env *env, mdb_size_t size); /** @brief Set the maximum number of threads/reader slots for the environment. * @@ -955,6 +966,10 @@ int mdb_env_set_assert(MDB_env *env, MDB_assert_func *func); *
      *
    • #MDB_RDONLY * This transaction will not perform any write operations. + *
    • #MDB_NOSYNC + * Don't flush system buffers to disk when committing this transaction. + *
    • #MDB_NOMETASYNC + * Flush system buffers but omit metadata flush when committing this transaction. *
    * @param[out] txn Address where the new #MDB_txn handle will be stored * @return A non-zero error value on failure and 0 on success. Some possible @@ -987,7 +1002,7 @@ MDB_env *mdb_txn_env(MDB_txn *txn); * @param[in] txn A transaction handle returned by #mdb_txn_begin() * @return A transaction ID, valid if input is an active transaction. */ -size_t mdb_txn_id(MDB_txn *txn); +mdb_size_t mdb_txn_id(MDB_txn *txn); /** @brief Commit all the operations of a transaction into the database. * @@ -1103,9 +1118,8 @@ int mdb_txn_renew(MDB_txn *txn); * This flag may only be used in combination with #MDB_DUPSORT. This option * tells the library that the data items for this database are all the same * size, which allows further optimizations in storage and retrieval. When - * all data items are the same size, the #MDB_GET_MULTIPLE, #MDB_NEXT_MULTIPLE - * and #MDB_PREV_MULTIPLE cursor operations may be used to retrieve multiple - * items at once. + * all data items are the same size, the #MDB_GET_MULTIPLE and #MDB_NEXT_MULTIPLE + * cursor operations may be used to retrieve multiple items at once. *
  • #MDB_INTEGERDUP * This option specifies that duplicate data items are binary integers, * similar to #MDB_INTEGERKEY keys. @@ -1510,10 +1524,6 @@ int mdb_cursor_put(MDB_cursor *cursor, MDB_val *key, MDB_val *data, /** @brief Delete current key/data pair * * This function deletes the key/data pair to which the cursor refers. - * This does not invalidate the cursor, so operations such as MDB_NEXT - * can still be used on it. - * Both MDB_NEXT and MDB_GET_CURRENT will return the same record after - * this operation. * @param[in] cursor A cursor handle returned by #mdb_cursor_open() * @param[in] flags Options for this operation. This parameter * must be set to 0 or one of the values described here. @@ -1542,7 +1552,7 @@ int mdb_cursor_del(MDB_cursor *cursor, unsigned int flags); *
  • EINVAL - cursor is not initialized, or an invalid parameter was specified. * */ -int mdb_cursor_count(MDB_cursor *cursor, size_t *countp); +int mdb_cursor_count(MDB_cursor *cursor, mdb_size_t *countp); /** @brief Compare two data items according to a particular database. * diff --git a/contrib/db/liblmdb/mdb.c b/contrib/db/liblmdb/mdb.c index 692feaa3..ca9f3b12 100644 --- a/contrib/db/liblmdb/mdb.c +++ b/contrib/db/liblmdb/mdb.c @@ -5,7 +5,7 @@ * BerkeleyDB API, but much simplified. */ /* - * Copyright 2011-2019 Howard Chu, Symas Corp. + * Copyright 2011-2016 Howard Chu, Symas Corp. * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -35,13 +35,50 @@ #ifndef _GNU_SOURCE #define _GNU_SOURCE 1 #endif -#if defined(__WIN64__) +#if defined(MDB_VL32) || defined(__WIN64__) #define _FILE_OFFSET_BITS 64 #endif #ifdef _WIN32 #include #include -#include /* get wcscpy() */ +#include + +#ifndef _NTDEF_ +typedef _Return_type_success_(return >= 0) LONG NTSTATUS; +typedef NTSTATUS *PNTSTATUS; +#endif + +/** Visual studio big files support +*/ + +/* We use native NT APIs to setup the memory map, so that we can + * let the DB file grow incrementally instead of always preallocating + * the full size. These APIs are defined in <wdm.h> and <ntifs.h> + * but those headers are meant for driver-level development and + * conflict with the regular user-level headers, so we explicitly + * declare them here. Using these APIs also means we must link to + * ntdll.dll, which is not linked by default in user code. 
+ */ +NTSTATUS WINAPI +NtCreateSection(OUT PHANDLE sh, IN ACCESS_MASK acc, + IN void * oa OPTIONAL, + IN PLARGE_INTEGER ms OPTIONAL, + IN ULONG pp, IN ULONG aa, IN HANDLE fh OPTIONAL); + +typedef enum _SECTION_INHERIT { + ViewShare = 1, + ViewUnmap = 2 +} SECTION_INHERIT; + +NTSTATUS WINAPI +NtMapViewOfSection(IN PHANDLE sh, IN HANDLE ph, + IN OUT PVOID *addr, IN ULONG_PTR zbits, + IN SIZE_T cs, IN OUT PLARGE_INTEGER off OPTIONAL, + IN OUT PSIZE_T vs, IN SECTION_INHERIT ih, + IN ULONG at, IN ULONG pp); + +NTSTATUS WINAPI +NtClose(HANDLE h); /** getpid() returns int; MinGW defines pid_t but MinGW64 typedefs it * as int64 which is wrong. MSVC doesn't define it at all, so just @@ -93,6 +130,13 @@ extern int cacheflush(char *addr, int nbytes, int cache); #define BROKEN_FDATASYNC #endif +#ifdef _WIN32 +typedef int64_t off64_t; +#else +typedef off_t off64_t; +#endif + + #include #include #include @@ -109,14 +153,10 @@ typedef SSIZE_T ssize_t; #include #endif -#if defined(__sun) || defined(ANDROID) +#if defined(__sun) || defined(__ANDROID__) /* Most platforms have posix_memalign, older may only have memalign */ #define HAVE_MEMALIGN 1 #include -/* On Solaris, we need the POSIX sigwait function */ -#if defined (__sun) -# define _POSIX_PTHREAD_SEMANTICS 1 -#endif #endif #if !(defined(BYTE_ORDER) || defined(__BYTE_ORDER)) @@ -124,25 +164,36 @@ typedef SSIZE_T ssize_t; #include /* defines BYTE_ORDER on HPUX and Solaris */ #endif -#if defined(__APPLE__) || defined (BSD) || defined(__FreeBSD_kernel__) -# define MDB_USE_POSIX_SEM 1 +#if defined(__APPLE__) || defined (BSD) +# if !(defined(MDB_USE_POSIX_MUTEX) || defined(MDB_USE_POSIX_SEM)) +# define MDB_USE_SYSV_SEM 1 +# endif # define MDB_FDATASYNC fsync -#elif defined(ANDROID) +#elif defined(__ANDROID__) # define MDB_FDATASYNC fsync #endif #ifndef _WIN32 #include -#include #ifdef MDB_USE_POSIX_SEM # define MDB_USE_HASH 1 #include +#elif defined(MDB_USE_SYSV_SEM) +#include +#include +#ifdef _SEM_SEMUN_UNDEFINED +union semun { + 
int val; + struct semid_ds *buf; + unsigned short *array; +}; +#endif /* _SEM_SEMUN_UNDEFINED */ #else #define MDB_USE_POSIX_MUTEX 1 -#endif -#endif +#endif /* MDB_USE_POSIX_SEM */ +#endif /* !_WIN32 */ -#if defined(_WIN32) + defined(MDB_USE_POSIX_SEM) \ +#if defined(_WIN32) + defined(MDB_USE_POSIX_SEM) + defined(MDB_USE_SYSV_SEM) \ + defined(MDB_USE_POSIX_MUTEX) != 1 # error "Ambiguous shared-lock implementation" #endif @@ -244,6 +295,8 @@ typedef SSIZE_T ssize_t; #define MDB_NO_ROOT (MDB_LAST_ERRCODE + 10) #ifdef _WIN32 #define MDB_OWNERDEAD ((int) WAIT_ABANDONED) +#elif defined MDB_USE_SYSV_SEM +#define MDB_OWNERDEAD (MDB_LAST_ERRCODE + 11) #elif defined(MDB_USE_POSIX_MUTEX) && defined(EOWNERDEAD) #define MDB_OWNERDEAD EOWNERDEAD /**< #LOCK_MUTEX0() result if dead owner */ #endif @@ -254,31 +307,28 @@ typedef SSIZE_T ssize_t; /** Some platforms define the EOWNERDEAD error code * even though they don't support Robust Mutexes. * Compile with -DMDB_USE_ROBUST=0, or use some other - * mechanism like -DMDB_USE_POSIX_SEM instead of - * -DMDB_USE_POSIX_MUTEX. - * (Posix semaphores are not robust.) + * mechanism like -DMDB_USE_SYSV_SEM instead of + * -DMDB_USE_POSIX_MUTEX. (SysV semaphores are + * also Robust, but some systems don't support them + * either.) */ #ifndef MDB_USE_ROBUST /* Android currently lacks Robust Mutex support. So does glibc < 2.4. 
*/ -# if defined(MDB_USE_POSIX_MUTEX) && (defined(ANDROID) || \ +# if defined(MDB_USE_POSIX_MUTEX) && (defined(__ANDROID__) || \ (defined(__GLIBC__) && GLIBC_VER < 0x020004)) # define MDB_USE_ROBUST 0 # else # define MDB_USE_ROBUST 1 -# endif -#endif /* !MDB_USE_ROBUST */ - -#if defined(MDB_USE_POSIX_MUTEX) && (MDB_USE_ROBUST) /* glibc < 2.12 only provided _np API */ -# if (defined(__GLIBC__) && GLIBC_VER < 0x02000c) || \ - (defined(PTHREAD_MUTEX_ROBUST_NP) && !defined(PTHREAD_MUTEX_ROBUST)) +# if defined(__GLIBC__) && GLIBC_VER < 0x02000c # define PTHREAD_MUTEX_ROBUST PTHREAD_MUTEX_ROBUST_NP # define pthread_mutexattr_setrobust(attr, flag) pthread_mutexattr_setrobust_np(attr, flag) # define pthread_mutex_consistent(mutex) pthread_mutex_consistent_np(mutex) # endif -#endif /* MDB_USE_POSIX_MUTEX && MDB_USE_ROBUST */ +# endif +#endif /* MDB_USE_ROBUST */ -#if defined(MDB_OWNERDEAD) && (MDB_USE_ROBUST) +#if defined(MDB_OWNERDEAD) && MDB_USE_ROBUST #define MDB_ROBUST_SUPPORTED 1 #endif @@ -301,10 +351,8 @@ typedef HANDLE mdb_mutex_t, mdb_mutexref_t; #define pthread_mutex_lock(x) WaitForSingleObject(*x, INFINITE) #define pthread_cond_signal(x) SetEvent(*x) #define pthread_cond_wait(cond,mutex) do{SignalObjectAndWait(*mutex, *cond, INFINITE, FALSE); WaitForSingleObject(*mutex, INFINITE);}while(0) -#define THREAD_CREATE(thr,start,arg) \ - (((thr) = CreateThread(NULL, 0, start, arg, 0, NULL)) ? 0 : ErrCode()) -#define THREAD_FINISH(thr) \ - (WaitForSingleObject(thr, INFINITE) ? 
ErrCode() : 0) +#define THREAD_CREATE(thr,start,arg) thr=CreateThread(NULL,0,start,arg,0,NULL) +#define THREAD_FINISH(thr) WaitForSingleObject(thr, INFINITE) #define LOCK_MUTEX0(mutex) WaitForSingleObject(mutex, INFINITE) #define UNLOCK_MUTEX(mutex) ReleaseMutex(mutex) #define mdb_mutex_consistent(mutex) 0 @@ -344,16 +392,50 @@ mdb_sem_wait(sem_t *sem) return rc; } +#elif defined MDB_USE_SYSV_SEM + +typedef struct mdb_mutex { + int semid; + int semnum; + int *locked; +} mdb_mutex_t[1], *mdb_mutexref_t; + +#define LOCK_MUTEX0(mutex) mdb_sem_wait(mutex) +#define UNLOCK_MUTEX(mutex) do { \ + struct sembuf sb = { 0, 1, SEM_UNDO }; \ + sb.sem_num = (mutex)->semnum; \ + *(mutex)->locked = 0; \ + semop((mutex)->semid, &sb, 1); \ +} while(0) + +static int +mdb_sem_wait(mdb_mutexref_t sem) +{ + int rc, *locked = sem->locked; + struct sembuf sb = { 0, -1, SEM_UNDO }; + sb.sem_num = sem->semnum; + do { + if (!semop(sem->semid, &sb, 1)) { + rc = *locked ? MDB_OWNERDEAD : MDB_SUCCESS; + *locked = 1; + break; + } + } while ((rc = errno) == EINTR); + return rc; +} + +#define mdb_mutex_consistent(mutex) 0 + #else /* MDB_USE_POSIX_MUTEX: */ - /** Shared mutex/semaphore as the original is stored. + /** Shared mutex/semaphore as it is stored (mdb_mutex_t), and as + * local variables keep it (mdb_mutexref_t). * - * Not for copies. Instead it can be assigned to an #mdb_mutexref_t. - * When mdb_mutexref_t is a pointer and mdb_mutex_t is not, then it - * is array[size 1] so it can be assigned to the pointer. + * An mdb_mutex_t can be assigned to an mdb_mutexref_t. They can + * be the same, or an array[size 1] and a pointer. + * @{ */ -typedef pthread_mutex_t mdb_mutex_t[1]; - /** Reference to an #mdb_mutex_t */ -typedef pthread_mutex_t *mdb_mutexref_t; +typedef pthread_mutex_t mdb_mutex_t[1], *mdb_mutexref_t; + /* @} */ /** Lock the reader or writer mutex. * Returns 0 or a code to give #mdb_mutex_failed(), as in #LOCK_MUTEX(). 
*/ @@ -364,7 +446,7 @@ typedef pthread_mutex_t *mdb_mutexref_t; /** Mark mutex-protected data as repaired, after death of previous owner. */ #define mdb_mutex_consistent(mutex) pthread_mutex_consistent(mutex) -#endif /* MDB_USE_POSIX_SEM */ +#endif /* MDB_USE_POSIX_SEM || MDB_USE_SYSV_SEM */ /** Get the error code for the last failed system function. */ @@ -389,12 +471,30 @@ typedef pthread_mutex_t *mdb_mutexref_t; #define GET_PAGESIZE(x) ((x) = sysconf(_SC_PAGE_SIZE)) #endif +#ifdef MDB_VL32 +#ifdef _WIN32 +#define Y "I64" +#else +#define Y "ll" +#endif +#else +#define Y Z +#endif + #if defined(_WIN32) || defined(MDB_USE_POSIX_SEM) #define MNAME_LEN 32 +#elif defined(MDB_USE_SYSV_SEM) +#define MNAME_LEN (sizeof(int)) #else #define MNAME_LEN (sizeof(pthread_mutex_t)) #endif +#ifdef MDB_USE_SYSV_SEM +#define SYSV_SEM_FLAG 1 /**< SysV sems in lockfile format */ +#else +#define SYSV_SEM_FLAG 0 +#endif + /** @} */ #ifdef MDB_ROBUST_SUPPORTED @@ -536,7 +636,7 @@ static txnid_t mdb_debug_start; /** The version number for a database's datafile format. */ #define MDB_DATA_VERSION ((MDB_DEVEL) ? 999 : 1) /** The version number for a database's lockfile format. */ -#define MDB_LOCK_VERSION 1 +#define MDB_LOCK_VERSION ((MDB_DEVEL) ? 999 : 1) /** @brief The max size of a key we can write, or 0 for computed max. * @@ -725,14 +825,6 @@ typedef struct MDB_txbody { uint32_t mtb_magic; /** Format of this lock file. Must be set to #MDB_LOCK_FORMAT. */ uint32_t mtb_format; -#if defined(_WIN32) || defined(MDB_USE_POSIX_SEM) - char mtb_rmname[MNAME_LEN]; -#else - /** Mutex protecting access to this table. - * This is the reader table lock used with LOCK_MUTEX(). - */ - mdb_mutex_t mtb_rmutex; -#endif /** The ID of the last transaction committed to the database. * This is recorded here only for convenience; the value can always * be determined by reading the main database meta pages. @@ -743,6 +835,17 @@ typedef struct MDB_txbody { * when readers release their slots. 
*/ volatile unsigned mtb_numreaders; +#if defined(_WIN32) || defined(MDB_USE_POSIX_SEM) + char mtb_rmname[MNAME_LEN]; +#elif defined(MDB_USE_SYSV_SEM) + int mtb_semid; + int mtb_rlocked; +#else + /** Mutex protecting access to this table. + * This is the reader table lock used with LOCK_MUTEX(). + */ + mdb_mutex_t mtb_rmutex; +#endif } MDB_txbody; /** The actual reader table definition. */ @@ -755,12 +858,19 @@ typedef struct MDB_txninfo { #define mti_rmname mt1.mtb.mtb_rmname #define mti_txnid mt1.mtb.mtb_txnid #define mti_numreaders mt1.mtb.mtb_numreaders +#ifdef MDB_USE_SYSV_SEM +#define mti_semid mt1.mtb.mtb_semid +#define mti_rlocked mt1.mtb.mtb_rlocked +#endif char pad[(sizeof(MDB_txbody)+CACHELINE-1) & ~(CACHELINE-1)]; } mt1; union { #if defined(_WIN32) || defined(MDB_USE_POSIX_SEM) char mt2_wmname[MNAME_LEN]; #define mti_wmname mt2.mt2_wmname +#elif defined MDB_USE_SYSV_SEM + int mt2_wlocked; +#define mti_wlocked mt2.mt2_wlocked #else mdb_mutex_t mt2_wmutex; #define mti_wmutex mt2.mt2_wmutex @@ -775,26 +885,13 @@ typedef struct MDB_txninfo { ((uint32_t) \ ((MDB_LOCK_VERSION) \ /* Flags which describe functionality */ \ + + (SYSV_SEM_FLAG << 18) \ + (((MDB_PIDLOCK) != 0) << 16))) /** @} */ -/** Common header for all page types. The page type depends on #mp_flags. - * - * #P_BRANCH and #P_LEAF pages have unsorted '#MDB_node's at the end, with - * sorted #mp_ptrs[] entries referring to them. Exception: #P_LEAF2 pages - * omit mp_ptrs and pack sorted #MDB_DUPFIXED values after the page header. - * - * #P_OVERFLOW records occupy one or more contiguous pages where only the - * first has a page header. They hold the real data of #F_BIGDATA nodes. - * - * #P_SUBP sub-pages are small leaf "pages" with duplicate data. - * A node with flag #F_DUPDATA but not #F_SUBDATA contains a sub-page. - * (Duplicate data can also go in sub-databases, which use normal pages.) - * - * #P_META pages contain #MDB_meta, the start point of an LMDB snapshot. 
- * - * Each non-metapage up to #MDB_meta.%mm_last_pg is reachable exactly once - * in the snapshot: Either used by a database or listed in a freeDB record. +/** Common header for all page types. + * Overflow records occupy a number of contiguous pages with no + * headers on any page after the first. */ typedef struct MDB_page { #define mp_pgno mp_p.p_pgno @@ -803,7 +900,7 @@ typedef struct MDB_page { pgno_t p_pgno; /**< page number */ struct MDB_page *p_next; /**< for in-memory list of freed pages */ } mp_p; - uint16_t mp_pad; /**< key size if this is a LEAF2 page */ + uint16_t mp_pad; /** @defgroup mdb_page Page Flags * @ingroup internal * Flags for the page headers. @@ -870,34 +967,25 @@ typedef struct MDB_page { /** The number of overflow pages needed to store the given size. */ #define OVPAGES(size, psize) ((PAGEHDRSZ-1 + (size)) / (psize) + 1) - /** Link in #MDB_txn.%mt_loose_pgs list. - * Kept outside the page header, which is needed when reusing the page. - */ + /** Link in #MDB_txn.%mt_loose_pgs list */ #define NEXT_LOOSE_PAGE(p) (*(MDB_page **)((p) + 2)) /** Header for a single key/data pair within a page. * Used in pages of type #P_BRANCH and #P_LEAF without #P_LEAF2. * We guarantee 2-byte alignment for 'MDB_node's. - * - * #mn_lo and #mn_hi are used for data size on leaf nodes, and for child - * pgno on branch nodes. On 64 bit platforms, #mn_flags is also used - * for pgno. (Branch nodes have no flags). Lo and hi are in host byte - * order in case some accesses can be optimized to 32-bit word access. - * - * Leaf node flags describe node contents. #F_BIGDATA says the node's - * data part is the page number of an overflow page with actual data. - * #F_DUPDATA and #F_SUBDATA can be combined giving duplicate data in - * a sub-page/sub-database, and named databases (just #F_SUBDATA). */ typedef struct MDB_node { - /** part of data size or pgno - * @{ */ + /** lo and hi are used for data size on leaf nodes and for + * child pgno on branch nodes. 
On 64 bit platforms, flags + * is also used for pgno. (Branch nodes have no flags). + * They are in host byte order in case that lets some + * accesses be optimized into a 32-bit word access. + */ #if BYTE_ORDER == LITTLE_ENDIAN - unsigned short mn_lo, mn_hi; + unsigned short mn_lo, mn_hi; /**< part of data size or pgno */ #else unsigned short mn_hi, mn_lo; #endif - /** @} */ /** @defgroup mdb_node Node Flags * @ingroup internal * Flags for node headers. @@ -1003,13 +1091,13 @@ typedef struct MDB_db { pgno_t md_branch_pages; /**< number of internal pages */ pgno_t md_leaf_pages; /**< number of leaf pages */ pgno_t md_overflow_pages; /**< number of overflow pages */ - size_t md_entries; /**< number of data items */ + mdb_size_t md_entries; /**< number of data items */ pgno_t md_root; /**< the root page of this tree */ } MDB_db; + /** mdb_dbi_open flags */ #define MDB_VALID 0x8000 /**< DB handle is valid, for me_dbflags */ #define PERSISTENT_FLAGS (0xffff & ~(MDB_VALID)) - /** #mdb_dbi_open() flags */ #define VALID_FLAGS (MDB_REVERSEKEY|MDB_DUPSORT|MDB_INTEGERKEY|MDB_DUPFIXED|\ MDB_INTEGERDUP|MDB_REVERSEDUP|MDB_CREATE) @@ -1033,17 +1121,22 @@ typedef struct MDB_meta { uint32_t mm_magic; /** Version number of this file. Must be set to #MDB_DATA_VERSION. */ uint32_t mm_version; +#ifdef MDB_VL32 + union { /* always zero since we don't support fixed mapping in MDB_VL32 */ + MDB_ID mmun_ull; + void *mmun_address; + } mm_un; +#define mm_address mm_un.mmun_address +#else void *mm_address; /**< address for fixed mapping */ - size_t mm_mapsize; /**< size of mmap region */ +#endif + pgno_t mm_mapsize; /**< size of mmap region */ MDB_db mm_dbs[CORE_DBS]; /**< first is free space, 2nd is main db */ /** The size of pages used in this DB */ #define mm_psize mm_dbs[FREE_DBI].md_pad /** Any persistent environment flags. @ref mdb_env */ #define mm_flags mm_dbs[FREE_DBI].md_flags - /** Last used page in the datafile. 
- * Actually the file may be shorter if the freeDB lists the final pages. - */ - pgno_t mm_last_pg; + pgno_t mm_last_pg; /**< last used page in file */ volatile txnid_t mm_txnid; /**< txnid that committed this page */ } MDB_meta; @@ -1080,6 +1173,9 @@ struct MDB_txn { /** Nested txn under this txn, set together with flag #MDB_TXN_HAS_CHILD */ MDB_txn *mt_child; pgno_t mt_next_pgno; /**< next unallocated page */ +#ifdef MDB_VL32 + pgno_t mt_last_pgno; /**< last written page */ +#endif /** The ID of this transaction. IDs are integers incrementing from 1. * Only committed write transactions increment the ID. If a transaction * aborts, the ID may be re-used by the next writer. @@ -1093,7 +1189,7 @@ struct MDB_txn { * in this transaction, linked through #NEXT_LOOSE_PAGE(page). */ MDB_page *mt_loose_pgs; - /** Number of loose pages (#mt_loose_pgs) */ + /* #Number of loose pages (#mt_loose_pgs) */ int mt_loose_count; /** The sorted list of dirty pages we temporarily wrote to disk * because the dirty list was full. page numbers in here are @@ -1116,17 +1212,29 @@ struct MDB_txn { * @ingroup internal * @{ */ -#define DB_DIRTY 0x01 /**< DB was written in this txn */ +#define DB_DIRTY 0x01 /**< DB was modified or is DUPSORT data */ #define DB_STALE 0x02 /**< Named-DB record is older than txnID */ #define DB_NEW 0x04 /**< Named-DB handle opened in this txn */ #define DB_VALID 0x08 /**< DB handle is valid, see also #MDB_VALID */ #define DB_USRVALID 0x10 /**< As #DB_VALID, but not set for #FREE_DBI */ -#define DB_DUPDATA 0x20 /**< DB is #MDB_DUPSORT data */ /** @} */ /** In write txns, array of cursors for each DB */ MDB_cursor **mt_cursors; /** Array of flags for each DB */ unsigned char *mt_dbflags; +#ifdef MDB_VL32 + /** List of read-only pages (actually chunks) */ + MDB_ID3L mt_rpages; + /** We map chunks of 16 pages. Even though Windows uses 4KB pages, all + * mappings must begin on 64KB boundaries. So we round off all pgnos to + * a chunk boundary. 
We do the same on Linux for symmetry, and also to + * reduce the frequency of mmap/munmap calls. + */ +#define MDB_RPAGE_CHUNK 16 +#define MDB_TRPAGE_SIZE 4096 /**< size of #mt_rpages array of chunks */ +#define MDB_TRPAGE_MAX (MDB_TRPAGE_SIZE-1) /**< maximum chunk index */ + unsigned int mt_rpcheck; /**< threshold for reclaiming unref'd chunks */ +#endif /** Number of DB records in use, or 0 when the txn is finished. * This number only ever increments until the txn finishes; we * don't decrement it when individual DB handles are closed. @@ -1138,7 +1246,9 @@ struct MDB_txn { * @{ */ /** #mdb_txn_begin() flags */ -#define MDB_TXN_BEGIN_FLAGS MDB_RDONLY +#define MDB_TXN_BEGIN_FLAGS (MDB_NOMETASYNC|MDB_NOSYNC|MDB_RDONLY) +#define MDB_TXN_NOMETASYNC MDB_NOMETASYNC /**< don't sync meta for this txn on commit */ +#define MDB_TXN_NOSYNC MDB_NOSYNC /**< don't sync this txn on commit */ #define MDB_TXN_RDONLY MDB_RDONLY /**< read-only transaction */ /* internal txn flags */ #define MDB_TXN_WRITEMAP MDB_WRITEMAP /**< copy of #MDB_env flag in writers */ @@ -1204,10 +1314,19 @@ struct MDB_cursor { #define C_SUB 0x04 /**< Cursor is a sub-cursor */ #define C_DEL 0x08 /**< last op was a cursor_del */ #define C_UNTRACK 0x40 /**< Un-track cursor when closing */ +#define C_WRITEMAP MDB_TXN_WRITEMAP /**< Copy of txn flag */ +/** Read-only cursor into the txn's original snapshot in the map. + * Set for read-only txns, and in #mdb_page_alloc() for #FREE_DBI when + * #MDB_DEVEL & 2. Only implements code which is necessary for this. + */ +#define C_ORIG_RDONLY MDB_TXN_RDONLY /** @} */ unsigned int mc_flags; /**< @ref mdb_cursor */ MDB_page *mc_pg[CURSOR_STACK]; /**< stack of pushed pages */ indx_t mc_ki[CURSOR_STACK]; /**< stack of page indices */ +#ifdef MDB_VL32 + MDB_page *mc_ovpg; /**< a referenced overflow page */ +#endif }; /** Context for sorted-dup records. 
@@ -1226,23 +1345,6 @@ typedef struct MDB_xcursor { unsigned char mx_dbflag; } MDB_xcursor; - /** Check if there is an inited xcursor */ -#define XCURSOR_INITED(mc) \ - ((mc)->mc_xcursor && ((mc)->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED)) - - /** Update the xcursor's sub-page pointer, if any, in \b mc. Needed - * when the node which contains the sub-page may have moved. Called - * with leaf page \b mp = mc->mc_pg[\b top]. - */ -#define XCURSOR_REFRESH(mc, top, mp) do { \ - MDB_page *xr_pg = (mp); \ - MDB_node *xr_node; \ - if (!XCURSOR_INITED(mc) || (mc)->mc_ki[top] >= NUMKEYS(xr_pg)) break; \ - xr_node = NODEPTR(xr_pg, (mc)->mc_ki[top]); \ - if ((xr_node->mn_flags & (F_DUPDATA|F_SUBDATA)) == F_DUPDATA) \ - (mc)->mc_xcursor->mx_cursor.mc_pg[0] = NODEDATA(xr_node); \ -} while (0) - /** State of FreeDB old pages, stored in the MDB_env */ typedef struct MDB_pgstate { pgno_t *mf_pghead; /**< Reclaimed freeDB pages, or NULL before use */ @@ -1253,7 +1355,10 @@ typedef struct MDB_pgstate { struct MDB_env { HANDLE me_fd; /**< The main data file */ HANDLE me_lfd; /**< The lock file */ - HANDLE me_mfd; /**< For writing and syncing the meta pages */ + HANDLE me_mfd; /**< just for writing the meta pages */ +#if defined(MDB_VL32) && defined(_WIN32) + HANDLE me_fmh; /**< File Mapping handle */ +#endif /** Failed to update the meta page. Probably an I/O error. */ #define MDB_FATAL_ERROR 0x80000000U /** Some fields are initialized. 
*/ @@ -1278,8 +1383,8 @@ struct MDB_env { void *me_pbuf; /**< scratch area for DUPSORT put() */ MDB_txn *me_txn; /**< current write transaction */ MDB_txn *me_txn0; /**< prealloc'd write transaction */ - size_t me_mapsize; /**< size of the data memory map */ - off_t me_size; /**< current file size */ + mdb_size_t me_mapsize; /**< size of the data memory map */ + off64_t me_size; /**< current file size */ pgno_t me_maxpg; /**< me_mapsize / me_psize */ MDB_dbx *me_dbxs; /**< array of static DB info */ uint16_t *me_dbflags; /**< array of flags from MDB_db.md_flags */ @@ -1311,6 +1416,13 @@ struct MDB_env { #else mdb_mutex_t me_rmutex; mdb_mutex_t me_wmutex; +#endif +#ifdef MDB_VL32 + MDB_ID3L me_rpages; /**< like #mt_rpages, but global to env */ + pthread_mutex_t me_rpmutex; /**< control access to #me_rpages */ +#define MDB_ERPAGE_SIZE 16384 +#define MDB_ERPAGE_MAX (MDB_ERPAGE_SIZE-1) + unsigned int me_rpcheck; #endif void *me_userctx; /**< User-settable context */ MDB_assert_func *me_assert_func; /**< Callback for assertion failures */ @@ -1372,7 +1484,7 @@ static int mdb_page_merge(MDB_cursor *csrc, MDB_cursor *cdst); static int mdb_page_split(MDB_cursor *mc, MDB_val *newkey, MDB_val *newdata, pgno_t newpgno, unsigned int nflags); -static int mdb_env_read_header(MDB_env *env, MDB_meta *meta); +static int mdb_env_read_header(MDB_env *env, int prev, MDB_meta *meta); static MDB_meta *mdb_env_pick_meta(const MDB_env *env); static int mdb_env_write_meta(MDB_txn *txn); #ifdef MDB_USE_POSIX_MUTEX /* Drop unused excl arg */ @@ -1431,8 +1543,7 @@ static SECURITY_DESCRIPTOR mdb_null_sd; static SECURITY_ATTRIBUTES mdb_all_sa; static int mdb_sec_inited; -struct MDB_name; -static int utf8_to_utf16(const char *src, struct MDB_name *dst, int xtra); +static int utf8_to_utf16(const char *src, int srcsize, wchar_t **dst, int *dstsize); #endif /** Return the library version info. 
*/ @@ -1611,20 +1722,20 @@ mdb_page_list(MDB_page *mp) case P_LEAF|P_LEAF2: type = "LEAF2 page"; break; case P_LEAF|P_LEAF2|P_SUBP: type = "LEAF2 sub-page"; break; case P_OVERFLOW: - fprintf(stderr, "Overflow page %"Z"u pages %u%s\n", + fprintf(stderr, "Overflow page %"Y"u pages %u%s\n", pgno, mp->mp_pages, state); return; case P_META: - fprintf(stderr, "Meta-page %"Z"u txnid %"Z"u\n", + fprintf(stderr, "Meta-page %"Y"u txnid %"Y"u\n", pgno, ((MDB_meta *)METADATA(mp))->mm_txnid); return; default: - fprintf(stderr, "Bad page %"Z"u flags 0x%X\n", pgno, mp->mp_flags); + fprintf(stderr, "Bad page %"Y"u flags 0x%u\n", pgno, mp->mp_flags); return; } nkeys = NUMKEYS(mp); - fprintf(stderr, "%s %"Z"u numkeys %d%s\n", type, pgno, nkeys, state); + fprintf(stderr, "%s %"Y"u numkeys %d%s\n", type, pgno, nkeys, state); for (i=0; imn_data; nsize = NODESIZE + key.mv_size; if (IS_BRANCH(mp)) { - fprintf(stderr, "key %d: page %"Z"u, %s\n", i, NODEPGNO(node), + fprintf(stderr, "key %d: page %"Y"u, %s\n", i, NODEPGNO(node), DKEY(&key)); total += nsize; } else { @@ -1674,7 +1785,7 @@ mdb_cursor_chk(MDB_cursor *mc) } if (mc->mc_ki[i] >= NUMKEYS(mc->mc_pg[i])) printf("ack!\n"); - if (XCURSOR_INITED(mc)) { + if (mc->mc_xcursor && (mc->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED)) { node = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); if (((node->mn_flags & (F_DUPDATA|F_SUBDATA)) == F_DUPDATA) && mc->mc_xcursor->mx_cursor.mc_pg[0] != NODEDATA(node)) { @@ -1735,7 +1846,7 @@ static void mdb_audit(MDB_txn *txn) } } if (freecount + count + NUM_METAS != txn->mt_next_pgno) { - fprintf(stderr, "audit: %"Z"u freecount: %"Z"u count: %"Z"u total: %"Z"u next_pgno: %"Z"u\n", + fprintf(stderr, "audit: %"Y"u freecount: %"Y"u count: %"Y"u total: %"Y"u next_pgno: %"Y"u\n", txn->mt_txnid, freecount, count+NUM_METAS, freecount+count+NUM_METAS, txn->mt_next_pgno); } @@ -1752,8 +1863,8 @@ int mdb_dcmp(MDB_txn *txn, MDB_dbi dbi, const MDB_val *a, const MDB_val *b) { MDB_cmp_func *dcmp = 
txn->mt_dbxs[dbi].md_dcmp; -#if UINT_MAX < SIZE_MAX - if (dcmp == mdb_cmp_int && a->mv_size == sizeof(size_t)) +#if UINT_MAX < SIZE_MAX || defined(MDB_VL32) + if (dcmp == mdb_cmp_int && a->mv_size == sizeof(mdb_size_t)) dcmp = mdb_cmp_clong; #endif return dcmp(a, b); @@ -1761,7 +1872,6 @@ mdb_dcmp(MDB_txn *txn, MDB_dbi dbi, const MDB_val *a, const MDB_val *b) /** Allocate memory for a page. * Re-use old malloc'd pages first for singletons, otherwise just malloc. - * Set #MDB_TXN_ERROR on failure. */ static MDB_page * mdb_page_malloc(MDB_txn *txn, unsigned num) @@ -1836,6 +1946,47 @@ mdb_dlist_free(MDB_txn *txn) dl[0].mid = 0; } +#ifdef MDB_VL32 +static void +mdb_page_unref(MDB_txn *txn, MDB_page *mp) +{ + pgno_t pgno; + MDB_ID3L tl = txn->mt_rpages; + unsigned x, rem; + if (mp->mp_flags & (P_SUBP|P_DIRTY)) + return; + rem = mp->mp_pgno & (MDB_RPAGE_CHUNK-1); + pgno = mp->mp_pgno ^ rem; + x = mdb_mid3l_search(tl, pgno); + if (x != tl[0].mid && tl[x+1].mid == mp->mp_pgno) + x++; + if (tl[x].mref) + tl[x].mref--; +} +#define MDB_PAGE_UNREF(txn, mp) mdb_page_unref(txn, mp) + +static void +mdb_cursor_unref(MDB_cursor *mc) +{ + int i; + if (mc->mc_txn->mt_rpages[0].mid) { + if (!mc->mc_snum || !mc->mc_pg[0] || IS_SUBP(mc->mc_pg[0])) + return; + for (i=0; imc_snum; i++) + mdb_page_unref(mc->mc_txn, mc->mc_pg[i]); + if (mc->mc_ovpg) { + mdb_page_unref(mc->mc_txn, mc->mc_ovpg); + mc->mc_ovpg = 0; + } + } + mc->mc_snum = mc->mc_top = 0; + mc->mc_pg[0] = NULL; + mc->mc_flags &= ~C_INITIALIZED; +} +#else +#define MDB_PAGE_UNREF(txn, mp) +#endif /* MDB_VL32 */ + /** Loosen or free a single page. * Saves single pages to a list for future reuse * in this same txn. 
It has been pulled from the freeDB @@ -1877,7 +2028,7 @@ mdb_page_loose(MDB_cursor *mc, MDB_page *mp) } } if (loose) { - DPRINTF(("loosen db %d page %"Z"u", DDBI(mc), + DPRINTF(("loosen db %d page %"Y"u", DDBI(mc), mp->mp_pgno)); NEXT_LOOSE_PAGE(mp) = txn->mt_loose_pgs; txn->mt_loose_pgs = mp; @@ -2129,13 +2280,15 @@ mdb_page_dirty(MDB_txn *txn, MDB_page *mp) } /** Allocate page numbers and memory for writing. Maintain me_pglast, - * me_pghead and mt_next_pgno. Set #MDB_TXN_ERROR on failure. + * me_pghead and mt_next_pgno. * * If there are free pages available from older transactions, they * are re-used first. Otherwise allocate a new page at mt_next_pgno. * Do not modify the freedB, just merge freeDB records into me_pghead[] * and move me_pglast to say which records were consumed. Only this * function can create me_pghead and move me_pglast/mt_next_pgno. + * When #MDB_DEVEL & 2, it is not affected by #mdb_freelist_save(): it + * then uses the transaction's original snapshot of the freeDB. * @param[in] mc cursor A cursor handle identifying the transaction and * database for which we are allocating. * @param[in] num the number of pages to allocate. @@ -2173,7 +2326,7 @@ mdb_page_alloc(MDB_cursor *mc, int num, MDB_page **mp) np = txn->mt_loose_pgs; txn->mt_loose_pgs = NEXT_LOOSE_PAGE(np); txn->mt_loose_count--; - DPRINTF(("db %d use loose page %"Z"u", DDBI(mc), + DPRINTF(("db %d use loose page %"Y"u", DDBI(mc), np->mp_pgno)); *mp = np; return MDB_SUCCESS; @@ -2211,6 +2364,14 @@ mdb_page_alloc(MDB_cursor *mc, int num, MDB_page **mp) last = env->me_pglast; oldest = env->me_pgoldest; mdb_cursor_init(&m2, txn, FREE_DBI, NULL); +#if (MDB_DEVEL) & 2 /* "& 2" so MDB_DEVEL=1 won't hide bugs breaking freeDB */ + /* Use original snapshot. TODO: Should need less care in code + * which modifies the database. Maybe we can delete some code? 
+ */ + m2.mc_flags |= C_ORIG_RDONLY; + m2.mc_db = &env->me_metas[(txn->mt_txnid-1) & 1]->mm_dbs[FREE_DBI]; + m2.mc_dbflag = (unsigned char *)""; /* probably unnecessary */ +#endif if (last) { op = MDB_SET_RANGE; key.mv_data = &last; /* will look up last+1 */ @@ -2252,7 +2413,7 @@ mdb_page_alloc(MDB_cursor *mc, int num, MDB_page **mp) np = m2.mc_pg[m2.mc_top]; leaf = NODEPTR(np, m2.mc_ki[m2.mc_top]); if ((rc = mdb_node_read(&m2, leaf, &data)) != MDB_SUCCESS) - goto fail; + return rc; idl = (MDB_ID *) data.mv_data; i = idl[0]; @@ -2268,10 +2429,10 @@ mdb_page_alloc(MDB_cursor *mc, int num, MDB_page **mp) } env->me_pglast = last; #if (MDB_DEBUG) > 1 - DPRINTF(("IDL read txn %"Z"u root %"Z"u num %u", + DPRINTF(("IDL read txn %"Y"u root %"Y"u num %u", last, txn->mt_dbs[FREE_DBI].md_root, i)); for (j = i; j; j--) - DPRINTF(("IDL %"Z"u", idl[j])); + DPRINTF(("IDL %"Y"u", idl[j])); #endif /* Merge in descending sorted order */ mdb_midl_xmerge(mop, idl); @@ -2286,6 +2447,20 @@ mdb_page_alloc(MDB_cursor *mc, int num, MDB_page **mp) rc = MDB_MAP_FULL; goto fail; } +#if defined(_WIN32) && !defined(MDB_VL32) + if (!(env->me_flags & MDB_RDONLY)) { + void *p; + p = (MDB_page *)(env->me_map + env->me_psize * pgno); + p = VirtualAlloc(p, env->me_psize * num, MEM_COMMIT, + (env->me_flags & MDB_WRITEMAP) ? PAGE_READWRITE: + PAGE_READONLY); + if (!p) { + DPUTS("VirtualAlloc failed"); + rc = ErrCode(); + goto fail; + } + } +#endif search_done: if (env->me_flags & MDB_WRITEMAP) { @@ -2402,7 +2577,6 @@ mdb_page_unspill(MDB_txn *txn, MDB_page *mp, MDB_page **ret) } /** Touch a page: make it dirty and re-insert into tree with updated pgno. - * Set #MDB_TXN_ERROR on failure. * @param[in] mc cursor pointing to the page to be touched * @return 0 on success, non-zero on failure. 
*/ @@ -2428,7 +2602,7 @@ mdb_page_touch(MDB_cursor *mc) (rc = mdb_page_alloc(mc, 1, &np))) goto fail; pgno = np->mp_pgno; - DPRINTF(("touched db %d page %"Z"u -> %"Z"u", DDBI(mc), + DPRINTF(("touched db %d page %"Y"u -> %"Y"u", DDBI(mc), mp->mp_pgno, pgno)); mdb_cassert(mc, mp->mp_pgno != pgno); mdb_midl_xappend(txn->mt_free_pgs, mp->mp_pgno); @@ -2491,11 +2665,18 @@ done: if (m2 == mc) continue; if (m2->mc_pg[mc->mc_top] == mp) { m2->mc_pg[mc->mc_top] = np; - if (IS_LEAF(np)) - XCURSOR_REFRESH(m2, mc->mc_top, np); + if ((mc->mc_db->md_flags & MDB_DUPSORT) && + IS_LEAF(np) && + (m2->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED)) + { + MDB_node *leaf = NODEPTR(np, m2->mc_ki[mc->mc_top]); + if ((leaf->mn_flags & (F_DUPDATA|F_SUBDATA)) == F_DUPDATA) + m2->mc_xcursor->mx_cursor.mc_pg[0] = NODEDATA(leaf); + } } } } + MDB_PAGE_UNREF(mc->mc_txn, mp); return 0; fail: @@ -2504,7 +2685,7 @@ fail: } int -mdb_env_sync(MDB_env *env, int force) +mdb_env_sync0(MDB_env *env, int force, pgno_t numpgs) { int rc = 0; if (env->me_flags & MDB_RDONLY) @@ -2513,7 +2694,7 @@ mdb_env_sync(MDB_env *env, int force) if (env->me_flags & MDB_WRITEMAP) { int flags = ((env->me_flags & MDB_MAPASYNC) && !force) ? 
MS_ASYNC : MS_SYNC; - if (MDB_MSYNC(env->me_map, env->me_mapsize, flags)) + if (MDB_MSYNC(env->me_map, env->me_psize * numpgs, flags)) rc = ErrCode(); #ifdef _WIN32 else if (flags == MS_SYNC && MDB_FDATASYNC(env->me_fd)) @@ -2533,6 +2714,13 @@ mdb_env_sync(MDB_env *env, int force) return rc; } +int +mdb_env_sync(MDB_env *env, int force) +{ + MDB_meta *m = mdb_env_pick_meta(env); + return mdb_env_sync0(env, force, m->mm_last_pg+1); +} + /** Back up parent txn's cursors, then grab the originals for tracking */ static int mdb_cursor_shadow(MDB_txn *src, MDB_txn *dst) @@ -2777,6 +2965,9 @@ mdb_txn_renew0(MDB_txn *txn) /* Moved to here to avoid a data race in read TXNs */ txn->mt_next_pgno = meta->mm_last_pg+1; +#ifdef MDB_VL32 + txn->mt_last_pgno = txn->mt_next_pgno - 1; +#endif txn->mt_flags = flags; @@ -2812,7 +3003,7 @@ mdb_txn_renew(MDB_txn *txn) rc = mdb_txn_renew0(txn); if (rc == MDB_SUCCESS) { - DPRINTF(("renew txn %"Z"u%c %p on mdbenv %p, root page %"Z"u", + DPRINTF(("renew txn %"Y"u%c %p on mdbenv %p, root page %"Y"u", txn->mt_txnid, (txn->mt_flags & MDB_TXN_RDONLY) ? 
'r' : 'w', (void *)txn, (void *)txn->mt_env, txn->mt_dbs[MAIN_DBI].md_root)); } @@ -2855,6 +3046,17 @@ mdb_txn_begin(MDB_env *env, MDB_txn *parent, unsigned int flags, MDB_txn **ret) DPRINTF(("calloc: %s", strerror(errno))); return ENOMEM; } +#ifdef MDB_VL32 + if (!parent) { + txn->mt_rpages = malloc(MDB_TRPAGE_SIZE * sizeof(MDB_ID3)); + if (!txn->mt_rpages) { + free(txn); + return ENOMEM; + } + txn->mt_rpages[0].mid = 0; + txn->mt_rpcheck = MDB_TRPAGE_SIZE/2; + } +#endif txn->mt_dbxs = env->me_dbxs; /* static */ txn->mt_dbs = (MDB_db *) ((char *)txn + tsize); txn->mt_dbflags = (unsigned char *)txn + size - env->me_maxdbs; @@ -2882,6 +3084,9 @@ mdb_txn_begin(MDB_env *env, MDB_txn *parent, unsigned int flags, MDB_txn **ret) parent->mt_child = txn; txn->mt_parent = parent; txn->mt_numdbs = parent->mt_numdbs; +#ifdef MDB_VL32 + txn->mt_rpages = parent->mt_rpages; +#endif memcpy(txn->mt_dbs, parent->mt_dbs, txn->mt_numdbs * sizeof(MDB_db)); /* Copy parent's mt_dbflags, but clear DB_NEW */ for (i=0; imt_numdbs; i++) @@ -2907,12 +3112,16 @@ renew: rc = mdb_txn_renew0(txn); } if (rc) { - if (txn != env->me_txn0) + if (txn != env->me_txn0) { +#ifdef MDB_VL32 + free(txn->mt_rpages); +#endif free(txn); + } } else { txn->mt_flags |= flags; /* could not change txn=me_txn0 earlier */ *ret = txn; - DPRINTF(("begin txn %"Z"u%c %p on mdbenv %p, root page %"Z"u", + DPRINTF(("begin txn %"Y"u%c %p on mdbenv %p, root page %"Y"u", txn->mt_txnid, (flags & MDB_RDONLY) ? 
'r' : 'w', (void *) txn, (void *) env, txn->mt_dbs[MAIN_DBI].md_root)); } @@ -2927,7 +3136,7 @@ mdb_txn_env(MDB_txn *txn) return txn->mt_env; } -size_t +mdb_size_t mdb_txn_id(MDB_txn *txn) { if(!txn) return 0; @@ -2979,7 +3188,7 @@ mdb_txn_end(MDB_txn *txn, unsigned mode) /* Export or close DBI handles opened in this txn */ mdb_dbis_update(txn, mode & MDB_END_UPDATE); - DPRINTF(("%s txn %"Z"u%c %p on mdbenv %p, root page %"Z"u", + DPRINTF(("%s txn %"Y"u%c %p on mdbenv %p, root page %"Y"u", names[mode & MDB_END_OPMASK], txn->mt_txnid, (txn->mt_flags & MDB_TXN_RDONLY) ? 'r' : 'w', (void *) txn, (void *)env, txn->mt_dbs[MAIN_DBI].md_root)); @@ -3033,7 +3242,31 @@ mdb_txn_end(MDB_txn *txn, unsigned mode) mdb_midl_free(pghead); } - +#ifdef MDB_VL32 + if (!txn->mt_parent) { + MDB_ID3L el = env->me_rpages, tl = txn->mt_rpages; + unsigned i, x, n = tl[0].mid; + pthread_mutex_lock(&env->me_rpmutex); + for (i = 1; i <= n; i++) { + if (tl[i].mid & (MDB_RPAGE_CHUNK-1)) { + /* tmp overflow pages that we didn't share in env */ + munmap(tl[i].mptr, tl[i].mcnt * env->me_psize); + } else { + x = mdb_mid3l_search(el, tl[i].mid); + if (tl[i].mptr == el[x].mptr) { + el[x].mref--; + } else { + /* another tmp overflow page */ + munmap(tl[i].mptr, tl[i].mcnt * env->me_psize); + } + } + } + pthread_mutex_unlock(&env->me_rpmutex); + tl[0].mid = 0; + if (mode & MDB_END_FREE) + free(tl); + } +#endif if (mode & MDB_END_FREE) free(txn); } @@ -3065,6 +3298,9 @@ mdb_txn_abort(MDB_txn *txn) /** Save the freelist as of this transaction to the freeDB. * This changes the freelist. Keep trying until it stabilizes. + * + * When (MDB_DEVEL) & 2, the changes do not affect #mdb_page_alloc(), + * it then uses the transaction's original snapshot of the freeDB. */ static int mdb_freelist_save(MDB_txn *txn) @@ -3094,41 +3330,10 @@ mdb_freelist_save(MDB_txn *txn) * we may be unable to return them to me_pghead. 
*/ MDB_page *mp = txn->mt_loose_pgs; - MDB_ID2 *dl = txn->mt_u.dirty_list; - unsigned x; if ((rc = mdb_midl_need(&txn->mt_free_pgs, txn->mt_loose_count)) != 0) return rc; - for (; mp; mp = NEXT_LOOSE_PAGE(mp)) { + for (; mp; mp = NEXT_LOOSE_PAGE(mp)) mdb_midl_xappend(txn->mt_free_pgs, mp->mp_pgno); - /* must also remove from dirty list */ - if (txn->mt_flags & MDB_TXN_WRITEMAP) { - for (x=1; x<=dl[0].mid; x++) - if (dl[x].mid == mp->mp_pgno) - break; - mdb_tassert(txn, x <= dl[0].mid); - } else { - x = mdb_mid2l_search(dl, mp->mp_pgno); - mdb_tassert(txn, dl[x].mid == mp->mp_pgno); - mdb_dpage_free(env, mp); - } - dl[x].mptr = NULL; - } - { - /* squash freed slots out of the dirty list */ - unsigned y; - for (y=1; dl[y].mptr && y <= dl[0].mid; y++); - if (y <= dl[0].mid) { - for(x=y, y++;;) { - while (!dl[y].mptr && y <= dl[0].mid) y++; - if (y > dl[0].mid) break; - dl[x++] = dl[y++]; - } - dl[0].mid = x-1; - } else { - /* all slots freed */ - dl[0].mid = 0; - } - } txn->mt_loose_pgs = NULL; txn->mt_loose_count = 0; } @@ -3184,10 +3389,10 @@ mdb_freelist_save(MDB_txn *txn) #if (MDB_DEBUG) > 1 { unsigned int i = free_pgs[0]; - DPRINTF(("IDL write txn %"Z"u root %"Z"u num %u", + DPRINTF(("IDL write txn %"Y"u root %"Y"u num %u", txn->mt_txnid, txn->mt_dbs[FREE_DBI].md_root, i)); for (; i; i--) - DPRINTF(("IDL %"Z"u", free_pgs[i])); + DPRINTF(("IDL %"Y"u", free_pgs[i])); } #endif continue; @@ -3298,15 +3503,16 @@ mdb_page_flush(MDB_txn *txn, int keep) MDB_ID2L dl = txn->mt_u.dirty_list; unsigned psize = env->me_psize, j; int i, pagecount = dl[0].mid, rc; - size_t size = 0, pos = 0; + size_t size = 0; + off64_t pos = 0; pgno_t pgno = 0; MDB_page *dp = NULL; #ifdef _WIN32 OVERLAPPED ov; #else struct iovec iov[MDB_COMMIT_PAGES]; - ssize_t wpos = 0, wsize = 0, wres; - size_t next_pos = 1; /* impossible pos, so pos != next_pos */ + ssize_t wsize = 0, wres; + off64_t wpos = 0, next_pos = 1; /* impossible pos, so pos != next_pos */ int n = 0; #endif @@ -3405,7 +3611,7 @@ 
retry_seek: wpos = pos; wsize = 0; } - DPRINTF(("committing page %"Z"u", pgno)); + DPRINTF(("committing page %"Y"u", pgno)); next_pos = pos + size; iov[n].iov_len = size; iov[n].iov_base = (char *)dp; @@ -3413,6 +3619,10 @@ retry_seek: n++; #endif /* _WIN32 */ } +#ifdef MDB_VL32 + if (pgno > txn->mt_last_pgno) + txn->mt_last_pgno = pgno; +#endif /* MIPS has cache coherency issues, this is a no-op everywhere else * Note: for any size >= on-chip cache size, entire on-chip cache is @@ -3438,6 +3648,8 @@ done: return MDB_SUCCESS; } +static int ESECT mdb_env_share_locks(MDB_env *env, int *excl); + int mdb_txn_commit(MDB_txn *txn) { @@ -3614,7 +3826,7 @@ mdb_txn_commit(MDB_txn *txn) !(txn->mt_flags & (MDB_TXN_DIRTY|MDB_TXN_SPILLS))) goto done; - DPRINTF(("committing txn %"Z"u %p on mdbenv %p, root page %"Z"u", + DPRINTF(("committing txn %"Y"u %p on mdbenv %p, root page %"Y"u", txn->mt_txnid, (void*)txn, (void*)env, txn->mt_dbs[MAIN_DBI].md_root)); /* Update DB root pointers */ @@ -3652,11 +3864,23 @@ mdb_txn_commit(MDB_txn *txn) mdb_audit(txn); #endif - if ((rc = mdb_page_flush(txn, 0)) || - (rc = mdb_env_sync(env, 0)) || - (rc = mdb_env_write_meta(txn))) + if ((rc = mdb_page_flush(txn, 0))) + goto fail; + if (!F_ISSET(txn->mt_flags, MDB_TXN_NOSYNC) && + (rc = mdb_env_sync0(env, 0, txn->mt_next_pgno))) + goto fail; + if ((rc = mdb_env_write_meta(txn))) goto fail; end_mode = MDB_END_COMMITTED|MDB_END_UPDATE; + if (env->me_flags & MDB_PREVSNAPSHOT) { + if (!(env->me_flags & MDB_NOLOCK)) { + int excl; + rc = mdb_env_share_locks(env, &excl); + if (rc) + goto fail; + } + env->me_flags ^= MDB_PREVSNAPSHOT; + } done: mdb_txn_end(txn, end_mode); @@ -3670,11 +3894,12 @@ fail: /** Read the environment parameters of a DB environment before * mapping it into memory. * @param[in] env the environment handle + * @param[in] prev whether to read the backup meta page * @param[out] meta address of where to store the meta information * @return 0 on success, non-zero on failure. 
*/ static int ESECT -mdb_env_read_header(MDB_env *env, MDB_meta *meta) +mdb_env_read_header(MDB_env *env, int prev, MDB_meta *meta) { MDB_metabuf pbuf; MDB_page *p; @@ -3709,7 +3934,7 @@ mdb_env_read_header(MDB_env *env, MDB_meta *meta) p = (MDB_page *)&pbuf; if (!F_ISSET(p->mp_flags, P_META)) { - DPRINTF(("page %"Z"u not a meta page", p->mp_pgno)); + DPRINTF(("page %"Y"u not a meta page", p->mp_pgno)); return MDB_INVALID; } @@ -3725,7 +3950,7 @@ mdb_env_read_header(MDB_env *env, MDB_meta *meta) return MDB_VERSION_MISMATCH; } - if (off == 0 || m->mm_txnid > meta->mm_txnid) + if (off == 0 || (prev ? m->mm_txnid < meta->mm_txnid : m->mm_txnid > meta->mm_txnid)) *meta = *m; } return 0; @@ -3779,7 +4004,6 @@ mdb_env_init_meta(MDB_env *env, MDB_meta *meta) p = calloc(NUM_METAS, psize); if (!p) return ENOMEM; - p->mp_pgno = 0; p->mp_flags = P_META; *(MDB_meta *)METADATA(p) = *meta; @@ -3810,8 +4034,8 @@ mdb_env_write_meta(MDB_txn *txn) MDB_env *env; MDB_meta meta, metab, *mp; unsigned flags; - size_t mapsize; - off_t off; + mdb_size_t mapsize; + off64_t off; int rc, len, toggle; char *ptr; HANDLE mfd; @@ -3822,11 +4046,11 @@ mdb_env_write_meta(MDB_txn *txn) #endif toggle = txn->mt_txnid & 1; - DPRINTF(("writing meta page %d for root page %"Z"u", + DPRINTF(("writing meta page %d for root page %"Y"u", toggle, txn->mt_dbs[MAIN_DBI].md_root)); env = txn->mt_env; - flags = env->me_flags; + flags = txn->mt_flags | env->me_flags; mp = env->me_metas[toggle]; mapsize = env->me_metas[toggle ^ 1]->mm_mapsize; /* Persist any increases of mapsize config */ @@ -3874,10 +4098,7 @@ mdb_env_write_meta(MDB_txn *txn) len = sizeof(MDB_meta) - off; off += (char *)mp - env->me_map; - /* Write to the SYNC fd unless MDB_NOSYNC/MDB_NOMETASYNC. - * (me_mfd goes to the same file as me_fd, but writing to it - * also syncs to disk. Avoids a separate fdatasync() call.) - */ + /* Write to the SYNC fd */ mfd = (flags & (MDB_NOSYNC|MDB_NOMETASYNC)) ? 
env->me_fd : env->me_mfd; #ifdef _WIN32 { @@ -3938,7 +4159,8 @@ static MDB_meta * mdb_env_pick_meta(const MDB_env *env) { MDB_meta *const *metas = env->me_metas; - return metas[ metas[0]->mm_txnid < metas[1]->mm_txnid ]; + return metas[ (metas[0]->mm_txnid < metas[1]->mm_txnid) ^ + ((env->me_flags & MDB_PREVSNAPSHOT) != 0) ]; } int ESECT @@ -3958,6 +4180,9 @@ mdb_env_create(MDB_env **env) #ifdef MDB_USE_POSIX_SEM e->me_rmutex = SEM_FAILED; e->me_wmutex = SEM_FAILED; +#elif defined MDB_USE_SYSV_SEM + e->me_rmutex->semid = -1; + e->me_wmutex->semid = -1; #endif e->me_pid = getpid(); GET_PAGESIZE(e->me_os_psize); @@ -3966,6 +4191,19 @@ mdb_env_create(MDB_env **env) return MDB_SUCCESS; } +#ifdef _WIN32 +/** @brief Map a result from an NTAPI call to WIN32. */ +static DWORD +mdb_nt2win32(NTSTATUS st) +{ + OVERLAPPED o = {0}; + DWORD br; + o.Internal = st; + GetOverlappedResult(NULL, &o, &br, FALSE); + return GetLastError(); +} +#endif + static int ESECT mdb_env_map(MDB_env *env, void *addr) { @@ -3973,42 +4211,51 @@ mdb_env_map(MDB_env *env, void *addr) unsigned int flags = env->me_flags; #ifdef _WIN32 int rc; + int access = SECTION_MAP_READ; HANDLE mh; - LONG sizelo, sizehi; - size_t msize; + void *map; + SIZE_T msize; + ULONG pageprot = PAGE_READONLY, secprot, alloctype; + if (flags & MDB_WRITEMAP) { + access |= SECTION_MAP_WRITE; + pageprot = PAGE_READWRITE; + } if (flags & MDB_RDONLY) { - /* Don't set explicit map size, use whatever exists */ + secprot = PAGE_READONLY; msize = 0; - sizelo = 0; - sizehi = 0; + alloctype = 0; } else { + secprot = PAGE_READWRITE; msize = env->me_mapsize; - sizelo = msize & 0xffffffff; - sizehi = msize >> 16 >> 16; /* only needed on Win64 */ - - /* Windows won't create mappings for zero length files. - * and won't map more than the file size. - * Just set the maxsize right now. 
- */ - if (!(flags & MDB_WRITEMAP) && (SetFilePointer(env->me_fd, sizelo, &sizehi, 0) != (DWORD)sizelo - || !SetEndOfFile(env->me_fd) - || SetFilePointer(env->me_fd, 0, NULL, 0) != 0)) - return ErrCode(); + alloctype = MEM_RESERVE; } - mh = CreateFileMapping(env->me_fd, NULL, flags & MDB_WRITEMAP ? - PAGE_READWRITE : PAGE_READONLY, - sizehi, sizelo, NULL); - if (!mh) - return ErrCode(); - env->me_map = MapViewOfFileEx(mh, flags & MDB_WRITEMAP ? - FILE_MAP_WRITE : FILE_MAP_READ, - 0, 0, msize, addr); - rc = env->me_map ? 0 : ErrCode(); - CloseHandle(mh); + rc = NtCreateSection(&mh, access, NULL, NULL, secprot, SEC_RESERVE, env->me_fd); if (rc) - return rc; + return mdb_nt2win32(rc); + map = addr; +#ifdef MDB_VL32 + msize = NUM_METAS * env->me_psize; +#endif + rc = NtMapViewOfSection(mh, GetCurrentProcess(), &map, 0, 0, NULL, &msize, ViewUnmap, alloctype, pageprot); +#ifdef MDB_VL32 + env->me_fmh = mh; +#else + NtClose(mh); +#endif + if (rc) + return mdb_nt2win32(rc); + env->me_map = map; +#else +#ifdef MDB_VL32 + (void) flags; + env->me_map = mmap(addr, NUM_METAS * env->me_psize, PROT_READ, MAP_SHARED, + env->me_fd, 0); + if (env->me_map == MAP_FAILED) { + env->me_map = NULL; + return ErrCode(); + } #else int prot = PROT_READ; if (flags & MDB_WRITEMAP) { @@ -4042,6 +4289,7 @@ mdb_env_map(MDB_env *env, void *addr) */ if (addr && env->me_map != addr) return EBUSY; /* TODO: Make a new MDB_* error code? */ +#endif p = (MDB_page *)env->me_map; env->me_metas[0] = METADATA(p); @@ -4051,15 +4299,17 @@ mdb_env_map(MDB_env *env, void *addr) } int ESECT -mdb_env_set_mapsize(MDB_env *env, size_t size) +mdb_env_set_mapsize(MDB_env *env, mdb_size_t size) { /* If env is already open, caller is responsible for making * sure there are no active txns. 
*/ if (env->me_map) { - int rc; MDB_meta *meta; +#ifndef MDB_VL32 void *old; + int rc; +#endif if (env->me_txn) return EINVAL; meta = mdb_env_pick_meta(env); @@ -4067,16 +4317,21 @@ mdb_env_set_mapsize(MDB_env *env, size_t size) size = meta->mm_mapsize; { /* Silently round up to minimum if the size is too small */ - size_t minsize = (meta->mm_last_pg + 1) * env->me_psize; + mdb_size_t minsize = (meta->mm_last_pg + 1) * env->me_psize; if (size < minsize) size = minsize; } +#ifndef MDB_VL32 + /* For MDB_VL32 this bit is a noop since we dynamically remap + * chunks of the DB anyway. + */ munmap(env->me_map, env->me_mapsize); env->me_mapsize = size; old = (env->me_flags & MDB_FIXEDMAP) ? env->me_map : NULL; rc = mdb_env_map(env, old); if (rc) return rc; +#endif /* !MDB_VL32 */ } env->me_mapsize = size; if (env->me_psize) @@ -4112,7 +4367,7 @@ mdb_env_get_maxreaders(MDB_env *env, unsigned int *readers) } static int ESECT -mdb_fsize(HANDLE fd, size_t *size) +mdb_fsize(HANDLE fd, mdb_size_t *size) { #ifdef _WIN32 LARGE_INTEGER fsize; @@ -4132,189 +4387,6 @@ mdb_fsize(HANDLE fd, size_t *size) return MDB_SUCCESS; } - -#ifdef _WIN32 -typedef wchar_t mdb_nchar_t; -# define MDB_NAME(str) L##str -# define mdb_name_cpy wcscpy -#else -/** Character type for file names: char on Unix, wchar_t on Windows */ -typedef char mdb_nchar_t; -# define MDB_NAME(str) str /**< #mdb_nchar_t[] string literal */ -# define mdb_name_cpy strcpy /**< Copy name (#mdb_nchar_t string) */ -#endif - -/** Filename - string of #mdb_nchar_t[] */ -typedef struct MDB_name { - int mn_len; /**< Length */ - int mn_alloced; /**< True if #mn_val was malloced */ - mdb_nchar_t *mn_val; /**< Contents */ -} MDB_name; - -/** Filename suffixes [datafile,lockfile][without,with MDB_NOSUBDIR] */ -static const mdb_nchar_t *const mdb_suffixes[2][2] = { - { MDB_NAME("/data.mdb"), MDB_NAME("") }, - { MDB_NAME("/lock.mdb"), MDB_NAME("-lock") } -}; - -#define MDB_SUFFLEN 9 /**< Max string length in #mdb_suffixes[] */ - -/** Set 
up filename + scratch area for filename suffix, for opening files. - * It should be freed with #mdb_fname_destroy(). - * On Windows, paths are converted from char *UTF-8 to wchar_t *UTF-16. - * - * @param[in] path Pathname for #mdb_env_open(). - * @param[in] envflags Whether a subdir and/or lockfile will be used. - * @param[out] fname Resulting filename, with room for a suffix if necessary. - */ -static int ESECT -mdb_fname_init(const char *path, unsigned envflags, MDB_name *fname) -{ - int no_suffix = F_ISSET(envflags, MDB_NOSUBDIR|MDB_NOLOCK); - fname->mn_alloced = 0; -#ifdef _WIN32 - return utf8_to_utf16(path, fname, no_suffix ? 0 : MDB_SUFFLEN); -#else - fname->mn_len = strlen(path); - if (no_suffix) - fname->mn_val = (char *) path; - else if ((fname->mn_val = malloc(fname->mn_len + MDB_SUFFLEN+1)) != NULL) { - fname->mn_alloced = 1; - strcpy(fname->mn_val, path); - } - else - return ENOMEM; - return MDB_SUCCESS; -#endif -} - -/** Destroy \b fname from #mdb_fname_init() */ -#define mdb_fname_destroy(fname) \ - do { if ((fname).mn_alloced) free((fname).mn_val); } while (0) - -#ifdef O_CLOEXEC /* POSIX.1-2008: Set FD_CLOEXEC atomically at open() */ -# define MDB_CLOEXEC O_CLOEXEC -#else -# define MDB_CLOEXEC 0 -#endif - -/** File type, access mode etc. for #mdb_fopen() */ -enum mdb_fopen_type { -#ifdef _WIN32 - MDB_O_RDONLY, MDB_O_RDWR, MDB_O_META, MDB_O_COPY, MDB_O_LOCKS -#else - /* A comment in mdb_fopen() explains some O_* flag choices. */ - MDB_O_RDONLY= O_RDONLY, /**< for RDONLY me_fd */ - MDB_O_RDWR = O_RDWR |O_CREAT, /**< for me_fd */ - MDB_O_META = O_WRONLY|MDB_DSYNC |MDB_CLOEXEC, /**< for me_mfd */ - MDB_O_COPY = O_WRONLY|O_CREAT|O_EXCL|MDB_CLOEXEC, /**< for #mdb_env_copy() */ - /** Bitmask for open() flags in enum #mdb_fopen_type. The other bits - * distinguish otherwise-equal MDB_O_* constants from each other. 
- */ - MDB_O_MASK = MDB_O_RDWR|MDB_CLOEXEC | MDB_O_RDONLY|MDB_O_META|MDB_O_COPY, - MDB_O_LOCKS = MDB_O_RDWR|MDB_CLOEXEC | ((MDB_O_MASK+1) & ~MDB_O_MASK) /**< for me_lfd */ -#endif -}; - -/** Open an LMDB file. - * @param[in] env The LMDB environment. - * @param[in,out] fname Path from from #mdb_fname_init(). A suffix is - * appended if necessary to create the filename, without changing mn_len. - * @param[in] which Determines file type, access mode, etc. - * @param[in] mode The Unix permissions for the file, if we create it. - * @param[out] res Resulting file handle. - * @return 0 on success, non-zero on failure. - */ -static int ESECT -mdb_fopen(const MDB_env *env, MDB_name *fname, - enum mdb_fopen_type which, mdb_mode_t mode, - HANDLE *res) -{ - int rc = MDB_SUCCESS; - HANDLE fd; -#ifdef _WIN32 - DWORD acc, share, disp, attrs; -#else - int flags; -#endif - - if (fname->mn_alloced) /* modifiable copy */ - mdb_name_cpy(fname->mn_val + fname->mn_len, - mdb_suffixes[which==MDB_O_LOCKS][F_ISSET(env->me_flags, MDB_NOSUBDIR)]); - - /* The directory must already exist. Usually the file need not. - * MDB_O_META requires the file because we already created it using - * MDB_O_RDWR. MDB_O_COPY must not overwrite an existing file. - * - * With MDB_O_COPY we do not want the OS to cache the writes, since - * the source data is already in the OS cache. - * - * The lockfile needs FD_CLOEXEC (close file descriptor on exec*()) - * to avoid the flock() issues noted under Caveats in lmdb.h. - * Also set it for other filehandles which the user cannot get at - * and close himself, which he may need after fork(). I.e. all but - * me_fd, which programs do use via mdb_env_get_fd(). 
- */ - -#ifdef _WIN32 - acc = GENERIC_READ|GENERIC_WRITE; - share = FILE_SHARE_READ|FILE_SHARE_WRITE; - disp = OPEN_ALWAYS; - attrs = FILE_ATTRIBUTE_NORMAL; - switch (which) { - case MDB_O_RDONLY: /* read-only datafile */ - acc = GENERIC_READ; - disp = OPEN_EXISTING; - break; - case MDB_O_META: /* for writing metapages */ - acc = GENERIC_WRITE; - disp = OPEN_EXISTING; - attrs = FILE_ATTRIBUTE_NORMAL|FILE_FLAG_WRITE_THROUGH; - break; - case MDB_O_COPY: /* mdb_env_copy() & co */ - acc = GENERIC_WRITE; - share = 0; - disp = CREATE_NEW; - attrs = FILE_FLAG_NO_BUFFERING|FILE_FLAG_WRITE_THROUGH; - break; - default: break; /* silence gcc -Wswitch (not all enum values handled) */ - } - fd = CreateFileW(fname->mn_val, acc, share, NULL, disp, attrs, NULL); -#else - fd = open(fname->mn_val, which & MDB_O_MASK, mode); -#endif - - if (fd == INVALID_HANDLE_VALUE) - rc = ErrCode(); -#ifndef _WIN32 - else { - if (which != MDB_O_RDONLY && which != MDB_O_RDWR) { - /* Set CLOEXEC if we could not pass it to open() */ - if (!MDB_CLOEXEC && (flags = fcntl(fd, F_GETFD)) != -1) - (void) fcntl(fd, F_SETFD, flags | FD_CLOEXEC); - } - if (which == MDB_O_COPY && env->me_psize >= env->me_os_psize) { - /* This may require buffer alignment. There is no portable - * way to ask how much, so we require OS pagesize alignment. - */ -# ifdef F_NOCACHE /* __APPLE__ */ - (void) fcntl(fd, F_NOCACHE, 1); -# elif defined O_DIRECT - /* open(...O_DIRECT...) would break on filesystems without - * O_DIRECT support (ITS#7682). Try to set it here instead. 
- */ - if ((flags = fcntl(fd, F_GETFL)) != -1) - (void) fcntl(fd, F_SETFL, flags | O_DIRECT); -# endif - } - } -#endif /* !_WIN32 */ - - *res = fd; - return rc; -} - - #ifdef BROKEN_FDATASYNC #include #include @@ -4323,7 +4395,7 @@ mdb_fopen(const MDB_env *env, MDB_name *fname, /** Further setup required for opening an LMDB environment */ static int ESECT -mdb_env_open2(MDB_env *env) +mdb_env_open2(MDB_env *env, int prev) { unsigned int flags = env->me_flags; int i, newenv = 0, rc; @@ -4386,7 +4458,7 @@ mdb_env_open2(MDB_env *env) } #endif - if ((i = mdb_env_read_header(env, &meta)) != 0) { + if ((i = mdb_env_read_header(env, prev, &meta)) != 0) { if (i != ENOENT) return i; DPUTS("new mdbenv"); @@ -4409,7 +4481,7 @@ mdb_env_open2(MDB_env *env) /* Make sure mapsize >= committed data size. Even when using * mm_mapsize, which could be broken in old files (ITS#7789). */ - size_t minsize = (meta.mm_last_pg + 1) * meta.mm_psize; + mdb_size_t minsize = (meta.mm_last_pg + 1) * meta.mm_psize; if (env->me_mapsize < minsize) env->me_mapsize = minsize; } @@ -4428,6 +4500,18 @@ mdb_env_open2(MDB_env *env) return rc; newenv = 0; } +#ifdef _WIN32 + /* For FIXEDMAP, make sure the file is non-empty before we attempt to map it */ + if (newenv) { + char dummy = 0; + DWORD len; + rc = WriteFile(env->me_fd, &dummy, 1, &len, NULL); + if (!rc) { + rc = ErrCode(); + return rc; + } + } +#endif rc = mdb_env_map(env, (flags & MDB_FIXEDMAP) ? 
meta.mm_address : NULL); if (rc) @@ -4450,6 +4534,9 @@ mdb_env_open2(MDB_env *env) #endif env->me_maxpg = env->me_mapsize / env->me_psize; + if (env->me_txns) + env->me_txns->mti_txnid = meta.mm_txnid; + #if MDB_DEBUG { MDB_meta *meta = mdb_env_pick_meta(env); @@ -4459,11 +4546,11 @@ mdb_env_open2(MDB_env *env) meta->mm_version, env->me_psize)); DPRINTF(("using meta page %d", (int) (meta->mm_txnid & 1))); DPRINTF(("depth: %u", db->md_depth)); - DPRINTF(("entries: %"Z"u", db->md_entries)); - DPRINTF(("branch pages: %"Z"u", db->md_branch_pages)); - DPRINTF(("leaf pages: %"Z"u", db->md_leaf_pages)); - DPRINTF(("overflow pages: %"Z"u", db->md_overflow_pages)); - DPRINTF(("root: %"Z"u", db->md_root)); + DPRINTF(("entries: %"Y"u", db->md_entries)); + DPRINTF(("branch pages: %"Y"u", db->md_branch_pages)); + DPRINTF(("leaf pages: %"Y"u", db->md_leaf_pages)); + DPRINTF(("overflow pages: %"Y"u", db->md_overflow_pages)); + DPRINTF(("root: %"Y"u", db->md_root)); } #endif @@ -4480,11 +4567,7 @@ mdb_env_reader_dest(void *ptr) { MDB_reader *reader = ptr; -#ifndef _WIN32 - if (reader->mr_pid == getpid()) /* catch pthread_exit() in child process */ -#endif - /* We omit the mutex, so do this atomically (i.e. skip mr_txnid) */ - reader->mr_pid = 0; + reader->mr_pid = 0; } #ifdef _WIN32 @@ -4549,9 +4632,6 @@ static int ESECT mdb_env_share_locks(MDB_env *env, int *excl) { int rc = 0; - MDB_meta *meta = mdb_env_pick_meta(env); - - env->me_txns->mti_txnid = meta->mm_txnid; #ifdef _WIN32 { @@ -4724,30 +4804,56 @@ mdb_hash_enc(MDB_val *val, char *encbuf) /** Open and/or initialize the lock region for the environment. * @param[in] env The LMDB environment. - * @param[in] fname Filename + scratch area, from #mdb_fname_init(). + * @param[in] lpath The pathname of the file used for the lock region. * @param[in] mode The Unix permissions for the file, if we create it. * @param[in,out] excl In -1, out lock type: -1 none, 0 shared, 1 exclusive * @return 0 on success, non-zero on failure. 
*/ static int ESECT -mdb_env_setup_locks(MDB_env *env, MDB_name *fname, int mode, int *excl) +mdb_env_setup_locks(MDB_env *env, char *lpath, int mode, int *excl) { #ifdef _WIN32 # define MDB_ERRCODE_ROFS ERROR_WRITE_PROTECT #else # define MDB_ERRCODE_ROFS EROFS +#ifdef O_CLOEXEC /* Linux: Open file and set FD_CLOEXEC atomically */ +# define MDB_CLOEXEC O_CLOEXEC +#else + int fdflags; +# define MDB_CLOEXEC 0 +#endif +#endif +#ifdef MDB_USE_SYSV_SEM + int semid; + union semun semu; #endif int rc; - off_t size, rsize; + off64_t size, rsize; - rc = mdb_fopen(env, fname, MDB_O_LOCKS, mode, &env->me_lfd); - if (rc) { - /* Omit lockfile if read-only env on read-only filesystem */ +#ifdef _WIN32 + wchar_t *wlpath; + rc = utf8_to_utf16(lpath, -1, &wlpath, NULL); + if (rc) + return rc; + env->me_lfd = CreateFileW(wlpath, GENERIC_READ|GENERIC_WRITE, + FILE_SHARE_READ|FILE_SHARE_WRITE, NULL, OPEN_ALWAYS, + FILE_ATTRIBUTE_NORMAL, NULL); + free(wlpath); +#else + env->me_lfd = open(lpath, O_RDWR|O_CREAT|MDB_CLOEXEC, mode); +#endif + if (env->me_lfd == INVALID_HANDLE_VALUE) { + rc = ErrCode(); if (rc == MDB_ERRCODE_ROFS && (env->me_flags & MDB_RDONLY)) { return MDB_SUCCESS; } - goto fail; + goto fail_errno; } +#if ! 
((MDB_CLOEXEC) || defined(_WIN32)) + /* Lose record locks when exec*() */ + if ((fdflags = fcntl(env->me_lfd, F_GETFD) | FD_CLOEXEC) >= 0) + fcntl(env->me_lfd, F_SETFD, fdflags); +#endif if (!(env->me_flags & MDB_NOTLS)) { rc = pthread_key_create(&env->me_txkey, mdb_env_reader_dest); @@ -4872,29 +4978,31 @@ mdb_env_setup_locks(MDB_env *env, MDB_name *fname, int mode, int *excl) env->me_wmutex = sem_open(env->me_txns->mti_wmname, O_CREAT|O_EXCL, mode, 1); if (env->me_wmutex == SEM_FAILED) goto fail_errno; +#elif defined(MDB_USE_SYSV_SEM) + unsigned short vals[2] = {1, 1}; + key_t key = ftok(lpath, 'M'); + if (key == -1) + goto fail_errno; + semid = semget(key, 2, (mode & 0777) | IPC_CREAT); + if (semid < 0) + goto fail_errno; + semu.array = vals; + if (semctl(semid, 0, SETALL, semu) < 0) + goto fail_errno; + env->me_txns->mti_semid = semid; #else /* MDB_USE_POSIX_MUTEX: */ pthread_mutexattr_t mattr; - /* Solaris needs this before initing a robust mutex. Otherwise - * it may skip the init and return EBUSY "seems someone already - * inited" or EINVAL "it was inited differently". 
- */ - memset(env->me_txns->mti_rmutex, 0, sizeof(*env->me_txns->mti_rmutex)); - memset(env->me_txns->mti_wmutex, 0, sizeof(*env->me_txns->mti_wmutex)); - - if ((rc = pthread_mutexattr_init(&mattr))) - goto fail; - - rc = pthread_mutexattr_setpshared(&mattr, PTHREAD_PROCESS_SHARED); + if ((rc = pthread_mutexattr_init(&mattr)) + || (rc = pthread_mutexattr_setpshared(&mattr, PTHREAD_PROCESS_SHARED)) #ifdef MDB_ROBUST_SUPPORTED - if (!rc) rc = pthread_mutexattr_setrobust(&mattr, PTHREAD_MUTEX_ROBUST); + || (rc = pthread_mutexattr_setrobust(&mattr, PTHREAD_MUTEX_ROBUST)) #endif - if (!rc) rc = pthread_mutex_init(env->me_txns->mti_rmutex, &mattr); - if (!rc) rc = pthread_mutex_init(env->me_txns->mti_wmutex, &mattr); - pthread_mutexattr_destroy(&mattr); - if (rc) + || (rc = pthread_mutex_init(env->me_txns->mti_rmutex, &mattr)) + || (rc = pthread_mutex_init(env->me_txns->mti_wmutex, &mattr))) goto fail; -#endif /* _WIN32 || MDB_USE_POSIX_SEM */ + pthread_mutexattr_destroy(&mattr); +#endif /* _WIN32 || ... 
*/ env->me_txns->mti_magic = MDB_MAGIC; env->me_txns->mti_format = MDB_LOCK_FORMAT; @@ -4902,6 +5010,9 @@ mdb_env_setup_locks(MDB_env *env, MDB_name *fname, int mode, int *excl) env->me_txns->mti_numreaders = 0; } else { +#ifdef MDB_USE_SYSV_SEM + struct semid_ds buf; +#endif if (env->me_txns->mti_magic != MDB_MAGIC) { DPUTS("lock region has invalid magic"); rc = MDB_INVALID; @@ -4927,8 +5038,33 @@ mdb_env_setup_locks(MDB_env *env, MDB_name *fname, int mode, int *excl) if (env->me_rmutex == SEM_FAILED) goto fail_errno; env->me_wmutex = sem_open(env->me_txns->mti_wmname, 0); if (env->me_wmutex == SEM_FAILED) goto fail_errno; +#elif defined(MDB_USE_SYSV_SEM) + semid = env->me_txns->mti_semid; + semu.buf = &buf; + /* check for read access */ + if (semctl(semid, 0, IPC_STAT, semu) < 0) + goto fail_errno; + /* check for write access */ + if (semctl(semid, 0, IPC_SET, semu) < 0) + goto fail_errno; #endif } +#ifdef MDB_USE_SYSV_SEM + env->me_rmutex->semid = semid; + env->me_wmutex->semid = semid; + env->me_rmutex->semnum = 0; + env->me_wmutex->semnum = 1; + env->me_rmutex->locked = &env->me_txns->mti_rlocked; + env->me_wmutex->locked = &env->me_txns->mti_wlocked; +#endif +#ifdef MDB_VL32 +#ifdef _WIN32 + env->me_rpmutex = CreateMutex(NULL, FALSE, NULL); +#else + pthread_mutex_init(&env->me_rpmutex, NULL); +#endif +#endif + return MDB_SUCCESS; fail_errno: @@ -4937,13 +5073,19 @@ fail: return rc; } + /** The name of the lock file in the DB environment */ +#define LOCKNAME "/lock.mdb" + /** The name of the data file in the DB environment */ +#define DATANAME "/data.mdb" + /** The suffix of the lock file when no subdir is used */ +#define LOCKSUFF "-lock" /** Only a subset of the @ref mdb_env flags can be changed * at runtime. Changing other flags requires closing the * environment and re-opening it with the new flags. 
*/ #define CHANGEABLE (MDB_NOSYNC|MDB_NOMETASYNC|MDB_MAPASYNC|MDB_NOMEMINIT) #define CHANGELESS (MDB_FIXEDMAP|MDB_NOSUBDIR|MDB_RDONLY| \ - MDB_WRITEMAP|MDB_NOTLS|MDB_NOLOCK|MDB_NORDAHEAD) + MDB_WRITEMAP|MDB_NOTLS|MDB_NOLOCK|MDB_NORDAHEAD|MDB_PREVSNAPSHOT) #if VALID_FLAGS & PERSISTENT_FLAGS & (CHANGEABLE|CHANGELESS) # error "Persistent DB flags & env flags overlap, but both go in mm_flags" @@ -4952,18 +5094,47 @@ fail: int ESECT mdb_env_open(MDB_env *env, const char *path, unsigned int flags, mdb_mode_t mode) { - int rc, excl = -1; - MDB_name fname; + int oflags, rc, len, excl = -1; + char *lpath, *dpath; +#ifdef _WIN32 + wchar_t *wpath; +#endif if (env->me_fd!=INVALID_HANDLE_VALUE || (flags & ~(CHANGEABLE|CHANGELESS))) return EINVAL; +#ifdef MDB_VL32 + if (flags & MDB_WRITEMAP) { + /* silently ignore WRITEMAP in 32 bit mode */ + flags ^= MDB_WRITEMAP; + } + if (flags & MDB_FIXEDMAP) { + /* cannot support FIXEDMAP */ + return EINVAL; + } +#endif + + len = strlen(path); + if (flags & MDB_NOSUBDIR) { + rc = len + sizeof(LOCKSUFF) + len + 1; + } else { + rc = len + sizeof(LOCKNAME) + len + sizeof(DATANAME); + } + lpath = malloc(rc); + if (!lpath) + return ENOMEM; + if (flags & MDB_NOSUBDIR) { + dpath = lpath + len + sizeof(LOCKSUFF); + sprintf(lpath, "%s" LOCKSUFF, path); + strcpy(dpath, path); + } else { + dpath = lpath + len + sizeof(LOCKNAME); + sprintf(lpath, "%s" LOCKNAME, path); + sprintf(dpath, "%s" DATANAME, path); + } + + rc = MDB_SUCCESS; flags |= env->me_flags; - - rc = mdb_fname_init(path, flags, &fname); - if (rc) - return rc; - if (flags & MDB_RDONLY) { /* silently ignore WRITEMAP when we're only getting read access */ flags &= ~MDB_WRITEMAP; @@ -4972,6 +5143,17 @@ mdb_env_open(MDB_env *env, const char *path, unsigned int flags, mdb_mode_t mode (env->me_dirty_list = calloc(MDB_IDL_UM_SIZE, sizeof(MDB_ID2))))) rc = ENOMEM; } +#ifdef MDB_VL32 + if (!rc) { + env->me_rpages = malloc(MDB_ERPAGE_SIZE * sizeof(MDB_ID3)); + if (!env->me_rpages) { + rc = ENOMEM; + 
goto leave; + } + env->me_rpages[0].mid = 0; + env->me_rpcheck = MDB_ERPAGE_SIZE/2; + } +#endif env->me_flags = flags |= MDB_ENV_ACTIVE; if (rc) goto leave; @@ -4988,34 +5170,76 @@ mdb_env_open(MDB_env *env, const char *path, unsigned int flags, mdb_mode_t mode /* For RDONLY, get lockfile after we know datafile exists */ if (!(flags & (MDB_RDONLY|MDB_NOLOCK))) { - rc = mdb_env_setup_locks(env, &fname, mode, &excl); + rc = mdb_env_setup_locks(env, lpath, mode, &excl); if (rc) goto leave; } - rc = mdb_fopen(env, &fname, - (flags & MDB_RDONLY) ? MDB_O_RDONLY : MDB_O_RDWR, - mode, &env->me_fd); +#ifdef _WIN32 + if (F_ISSET(flags, MDB_RDONLY)) { + oflags = GENERIC_READ; + len = OPEN_EXISTING; + } else { + oflags = GENERIC_READ|GENERIC_WRITE; + len = OPEN_ALWAYS; + } + mode = FILE_ATTRIBUTE_NORMAL; + rc = utf8_to_utf16(dpath, -1, &wpath, NULL); if (rc) goto leave; + env->me_fd = CreateFileW(wpath, oflags, FILE_SHARE_READ|FILE_SHARE_WRITE, + NULL, len, mode, NULL); + free(wpath); +#else + if (F_ISSET(flags, MDB_RDONLY)) + oflags = O_RDONLY; + else + oflags = O_RDWR | O_CREAT; - if ((flags & (MDB_RDONLY|MDB_NOLOCK)) == MDB_RDONLY) { - rc = mdb_env_setup_locks(env, &fname, mode, &excl); - if (rc) - goto leave; + env->me_fd = open(dpath, oflags, mode); +#endif + if (env->me_fd == INVALID_HANDLE_VALUE) { + rc = ErrCode(); + goto leave; } - if ((rc = mdb_env_open2(env)) == MDB_SUCCESS) { - if (!(flags & (MDB_RDONLY|MDB_WRITEMAP))) { + if ((flags & (MDB_RDONLY|MDB_NOLOCK)) == MDB_RDONLY) { + rc = mdb_env_setup_locks(env, lpath, mode, &excl); + if (rc) + goto leave; + if ((flags & MDB_PREVSNAPSHOT) && !excl) { + rc = EAGAIN; + goto leave; + } + } + + if ((rc = mdb_env_open2(env, flags & MDB_PREVSNAPSHOT)) == MDB_SUCCESS) { + if (flags & (MDB_RDONLY|MDB_WRITEMAP)) { + env->me_mfd = env->me_fd; + } else { /* Synchronous fd for meta writes. Needed even with * MDB_NOSYNC/MDB_NOMETASYNC, in case these get reset. 
*/ - rc = mdb_fopen(env, &fname, MDB_O_META, mode, &env->me_mfd); +#ifdef _WIN32 + len = OPEN_EXISTING; + rc = utf8_to_utf16(dpath, -1, &wpath, NULL); if (rc) goto leave; + env->me_mfd = CreateFileW(wpath, oflags, + FILE_SHARE_READ|FILE_SHARE_WRITE, NULL, len, + mode | FILE_FLAG_WRITE_THROUGH, NULL); + free(wpath); +#else + oflags &= ~O_CREAT; + env->me_mfd = open(dpath, oflags | MDB_DSYNC, mode); +#endif + if (env->me_mfd == INVALID_HANDLE_VALUE) { + rc = ErrCode(); + goto leave; + } } DPRINTF(("opened dbenv %p", (void *) env)); - if (excl > 0) { + if (excl > 0 && !(flags & MDB_PREVSNAPSHOT)) { rc = mdb_env_share_locks(env, &excl); if (rc) goto leave; @@ -5032,6 +5256,16 @@ mdb_env_open(MDB_env *env, const char *path, unsigned int flags, mdb_mode_t mode txn->mt_dbiseqs = (unsigned int *)(txn->mt_cursors + env->me_maxdbs); txn->mt_dbflags = (unsigned char *)(txn->mt_dbiseqs + env->me_maxdbs); txn->mt_env = env; +#ifdef MDB_VL32 + txn->mt_rpages = malloc(MDB_TRPAGE_SIZE * sizeof(MDB_ID3)); + if (!txn->mt_rpages) { + free(txn); + rc = ENOMEM; + goto leave; + } + txn->mt_rpages[0].mid = 0; + txn->mt_rpcheck = MDB_TRPAGE_SIZE/2; +#endif txn->mt_dbxs = env->me_dbxs; txn->mt_flags = MDB_TXN_FINISHED; env->me_txn0 = txn; @@ -5045,7 +5279,7 @@ leave: if (rc) { mdb_env_close0(env, excl); } - mdb_fname_destroy(fname); + free(lpath); return rc; } @@ -5070,6 +5304,15 @@ mdb_env_close0(MDB_env *env, int excl) free(env->me_dbflags); free(env->me_path); free(env->me_dirty_list); +#ifdef MDB_VL32 + if (env->me_txn0 && env->me_txn0->mt_rpages) + free(env->me_txn0->mt_rpages); + { unsigned int x; + for (x=1; x<=env->me_rpages[0].mid; x++) + munmap(env->me_rpages[x].mptr, env->me_rpages[x].mcnt * env->me_psize); + } + free(env->me_rpages); +#endif free(env->me_txn0); mdb_midl_free(env->me_free_pgs); @@ -5087,14 +5330,18 @@ mdb_env_close0(MDB_env *env, int excl) } if (env->me_map) { +#ifdef MDB_VL32 + munmap(env->me_map, NUM_METAS*env->me_psize); +#else munmap(env->me_map, 
env->me_mapsize); +#endif } - if (env->me_mfd != INVALID_HANDLE_VALUE) + if (env->me_mfd != env->me_fd && env->me_mfd != INVALID_HANDLE_VALUE) (void) close(env->me_mfd); if (env->me_fd != INVALID_HANDLE_VALUE) (void) close(env->me_fd); if (env->me_txns) { - MDB_PID_T pid = getpid(); + MDB_PID_T pid = env->me_pid; /* Clearing readers is done in this function because * me_txkey with its destructor must be disabled first. * @@ -5128,6 +5375,16 @@ mdb_env_close0(MDB_env *env, int excl) sem_unlink(env->me_txns->mti_wmname); } } +#elif defined(MDB_USE_SYSV_SEM) + if (env->me_rmutex->semid != -1) { + /* If we have the filelock: If we are the + * only remaining user, clean up semaphores. + */ + if (excl == 0) + mdb_env_excl_lock(env, &excl); + if (excl > 0) + semctl(env->me_rmutex->semid, 0, IPC_RMID); + } #endif munmap((void *)env->me_txns, (env->me_maxreaders-1)*sizeof(MDB_reader)+sizeof(MDB_txninfo)); } @@ -5142,6 +5399,14 @@ mdb_env_close0(MDB_env *env, int excl) #endif (void) close(env->me_lfd); } +#ifdef MDB_VL32 +#ifdef _WIN32 + if (env->me_fmh) CloseHandle(env->me_fmh); + if (env->me_rpmutex) CloseHandle(env->me_rpmutex); +#else + pthread_mutex_destroy(&env->me_rpmutex); +#endif +#endif env->me_flags &= ~(MDB_ENV_ACTIVE|MDB_ENV_TXKEY); } @@ -5165,18 +5430,18 @@ mdb_env_close(MDB_env *env) free(env); } -/** Compare two items pointing at aligned size_t's */ +/** Compare two items pointing at aligned mdb_size_t's */ static int mdb_cmp_long(const MDB_val *a, const MDB_val *b) { - return (*(size_t *)a->mv_data < *(size_t *)b->mv_data) ? -1 : - *(size_t *)a->mv_data > *(size_t *)b->mv_data; + return (*(mdb_size_t *)a->mv_data < *(mdb_size_t *)b->mv_data) ? -1 : + *(mdb_size_t *)a->mv_data > *(mdb_size_t *)b->mv_data; } /** Compare two items pointing at aligned unsigned int's. * * This is also set as #MDB_INTEGERDUP|#MDB_DUPFIXED's #MDB_dbx.%md_dcmp, - * but #mdb_cmp_clong() is called instead if the data type is size_t. 
+ * but #mdb_cmp_clong() is called instead if the data type is mdb_size_t. */ static int mdb_cmp_int(const MDB_val *a, const MDB_val *b) @@ -5281,7 +5546,7 @@ mdb_node_search(MDB_cursor *mc, MDB_val *key, int *exactp) nkeys = NUMKEYS(mp); - DPRINTF(("searching %u keys in %s %spage %"Z"u", + DPRINTF(("searching %u keys in %s %spage %"Y"u", nkeys, IS_LEAF(mp) ? "leaf" : "branch", IS_SUBP(mp) ? "sub-" : "", mdb_dbg_pgno(mp))); @@ -5293,7 +5558,7 @@ mdb_node_search(MDB_cursor *mc, MDB_val *key, int *exactp) * alignment is guaranteed. Use faster mdb_cmp_int. */ if (cmp == mdb_cmp_cint && IS_BRANCH(mp)) { - if (NODEPTR(mp, 1)->mn_ksize == sizeof(size_t)) + if (NODEPTR(mp, 1)->mn_ksize == sizeof(mdb_size_t)) cmp = mdb_cmp_long; else cmp = mdb_cmp_int; @@ -5329,7 +5594,7 @@ mdb_node_search(MDB_cursor *mc, MDB_val *key, int *exactp) DPRINTF(("found leaf index %u [%s], rc = %i", i, DKEY(&nodekey), rc)); else - DPRINTF(("found branch index %u [%s -> %"Z"u], rc = %i", + DPRINTF(("found branch index %u [%s -> %"Y"u], rc = %i", i, DKEY(&nodekey), NODEPGNO(node), rc)); #endif if (rc == 0) @@ -5377,7 +5642,7 @@ static void mdb_cursor_pop(MDB_cursor *mc) { if (mc->mc_snum) { - DPRINTF(("popping page %"Z"u off db %d cursor %p", + DPRINTF(("popping page %"Y"u off db %d cursor %p", mc->mc_pg[mc->mc_top]->mp_pgno, DDBI(mc), (void *) mc)); mc->mc_snum--; @@ -5389,13 +5654,11 @@ mdb_cursor_pop(MDB_cursor *mc) } } -/** Push a page onto the top of the cursor's stack. - * Set #MDB_TXN_ERROR on failure. - */ +/** Push a page onto the top of the cursor's stack. */ static int mdb_cursor_push(MDB_cursor *mc, MDB_page *mp) { - DPRINTF(("pushing page %"Z"u on db %d cursor %p", mp->mp_pgno, + DPRINTF(("pushing page %"Y"u on db %d cursor %p", mp->mp_pgno, DDBI(mc), (void *) mc)); if (mc->mc_snum >= CURSOR_STACK) { @@ -5410,8 +5673,295 @@ mdb_cursor_push(MDB_cursor *mc, MDB_page *mp) return MDB_SUCCESS; } +#ifdef MDB_VL32 +/** Map a read-only page. 
+ * There are two levels of tracking in use, a per-txn list and a per-env list. + * ref'ing and unref'ing the per-txn list is faster since it requires no + * locking. Pages are cached in the per-env list for global reuse, and a lock + * is required. Pages are not immediately unmapped when their refcnt goes to + * zero; they hang around in case they will be reused again soon. + * + * When the per-txn list gets full, all pages with refcnt=0 are purged from the + * list and their refcnts in the per-env list are decremented. + * + * When the per-env list gets full, all pages with refcnt=0 are purged from the + * list and their pages are unmapped. + * + * @note "full" means the list has reached its respective rpcheck threshold. + * This threshold slowly raises if no pages could be purged on a given check, + * and returns to its original value when enough pages were purged. + * + * If purging doesn't free any slots, filling the per-txn list will return + * MDB_TXN_FULL, and filling the per-env list returns MDB_MAP_FULL. + * + * Reference tracking in a txn is imperfect, pages can linger with non-zero + * refcnt even without active references. It was deemed to be too invasive + * to add unrefs in every required location. However, all pages are unref'd + * at the end of the transaction. This guarantees that no stale references + * linger in the per-env list. + * + * Usually we map chunks of 16 pages at a time, but if an overflow page begins + * at the tail of the chunk we extend the chunk to include the entire overflow + * page. Unfortunately, pages can be turned into overflow pages after their + * chunk was already mapped. In that case we must remap the chunk if the + * overflow page is referenced. If the chunk's refcnt is 0 we can just remap + * it, otherwise we temporarily map a new chunk just for the overflow page. 
+ * + * @note this chunk handling means we cannot guarantee that a data item + * returned from the DB will stay alive for the duration of the transaction: + * We unref pages as soon as a cursor moves away from the page + * A subsequent op may cause a purge, which may unmap any unref'd chunks + * The caller must copy the data if it must be used later in the same txn. + * + * Also - our reference counting revolves around cursors, but overflow pages + * aren't pointed to by a cursor's page stack. We have to remember them + * explicitly, in the added mc_ovpg field. A single cursor can only hold a + * reference to one overflow page at a time. + * + * @param[in] txn the transaction for this access. + * @param[in] pgno the page number for the page to retrieve. + * @param[out] ret address of a pointer where the page's address will be stored. + * @return 0 on success, non-zero on failure. + */ +static int +mdb_rpage_get(MDB_txn *txn, pgno_t pg0, MDB_page **ret) +{ + MDB_env *env = txn->mt_env; + MDB_page *p; + MDB_ID3L tl = txn->mt_rpages; + MDB_ID3L el = env->me_rpages; + MDB_ID3 id3; + unsigned x, rem; + pgno_t pgno; + int rc, retries = 1; +#ifdef _WIN32 + LARGE_INTEGER off; + SIZE_T len; +#define SET_OFF(off,val) off.QuadPart = val +#define MAP(rc,env,addr,len,off) \ + addr = NULL; \ + rc = NtMapViewOfSection(env->me_fmh, GetCurrentProcess(), &addr, 0, \ + len, &off, &len, ViewUnmap, (env->me_flags & MDB_RDONLY) ? 0 : MEM_RESERVE, PAGE_READONLY); \ + if (rc) rc = mdb_nt2win32(rc) +#else + off64_t off; + size_t len; +#define SET_OFF(off,val) off = val +#define MAP(rc,env,addr,len,off) \ + addr = mmap(NULL, len, PROT_READ, MAP_SHARED, env->me_fd, off); \ + rc = (addr == MAP_FAILED) ? errno : 0 +#endif + + /* remember the offset of the actual page number, so we can + * return the correct pointer at the end. 
+ */ + rem = pg0 & (MDB_RPAGE_CHUNK-1); + pgno = pg0 ^ rem; + + id3.mid = 0; + x = mdb_mid3l_search(tl, pgno); + if (x <= tl[0].mid && tl[x].mid == pgno) { + if (x != tl[0].mid && tl[x+1].mid == pg0) + x++; + /* check for overflow size */ + p = (MDB_page *)((char *)tl[x].mptr + rem * env->me_psize); + if (IS_OVERFLOW(p) && p->mp_pages + rem > tl[x].mcnt) { + id3.mcnt = p->mp_pages + rem; + len = id3.mcnt * env->me_psize; + SET_OFF(off, pgno * env->me_psize); + MAP(rc, env, id3.mptr, len, off); + if (rc) + return rc; + /* check for local-only page */ + if (rem) { + mdb_tassert(txn, tl[x].mid != pg0); + /* hope there's room to insert this locally. + * setting mid here tells later code to just insert + * this id3 instead of searching for a match. + */ + id3.mid = pg0; + goto notlocal; + } else { + /* ignore the mapping we got from env, use new one */ + tl[x].mptr = id3.mptr; + tl[x].mcnt = id3.mcnt; + /* if no active ref, see if we can replace in env */ + if (!tl[x].mref) { + unsigned i; + pthread_mutex_lock(&env->me_rpmutex); + i = mdb_mid3l_search(el, tl[x].mid); + if (el[i].mref == 1) { + /* just us, replace it */ + munmap(el[i].mptr, el[i].mcnt * env->me_psize); + el[i].mptr = tl[x].mptr; + el[i].mcnt = tl[x].mcnt; + } else { + /* there are others, remove ourself */ + el[i].mref--; + } + pthread_mutex_unlock(&env->me_rpmutex); + } + } + } + id3.mptr = tl[x].mptr; + id3.mcnt = tl[x].mcnt; + tl[x].mref++; + goto ok; + } + +notlocal: + if (tl[0].mid >= MDB_TRPAGE_MAX - txn->mt_rpcheck) { + unsigned i, y; + /* purge unref'd pages from our list and unref in env */ + pthread_mutex_lock(&env->me_rpmutex); +retry: + y = 0; + for (i=1; i<=tl[0].mid; i++) { + if (!tl[i].mref) { + if (!y) y = i; + /* tmp overflow pages don't go to env */ + if (tl[i].mid & (MDB_RPAGE_CHUNK-1)) { + munmap(tl[i].mptr, tl[i].mcnt * env->me_psize); + continue; + } + x = mdb_mid3l_search(el, tl[i].mid); + el[x].mref--; + } + } + pthread_mutex_unlock(&env->me_rpmutex); + if (!y) { + /* we didn't 
find any unref'd chunks. + * if we're out of room, fail. + */ + if (tl[0].mid >= MDB_TRPAGE_MAX) + return MDB_TXN_FULL; + /* otherwise, raise threshold for next time around + * and let this go. + */ + txn->mt_rpcheck /= 2; + } else { + /* we found some unused; consolidate the list */ + for (i=y+1; i<= tl[0].mid; i++) + if (tl[i].mref) + tl[y++] = tl[i]; + tl[0].mid = y-1; + /* decrease the check threshold toward its original value */ + if (!txn->mt_rpcheck) + txn->mt_rpcheck = 1; + while (txn->mt_rpcheck < tl[0].mid && txn->mt_rpcheck < MDB_TRPAGE_SIZE/2) + txn->mt_rpcheck *= 2; + } + } + if (tl[0].mid < MDB_TRPAGE_SIZE) { + id3.mref = 1; + if (id3.mid) + goto found; + /* don't map past last written page in read-only envs */ + if ((env->me_flags & MDB_RDONLY) && pgno + MDB_RPAGE_CHUNK-1 > txn->mt_last_pgno) + id3.mcnt = txn->mt_last_pgno + 1 - pgno; + else + id3.mcnt = MDB_RPAGE_CHUNK; + len = id3.mcnt * env->me_psize; + id3.mid = pgno; + + /* search for page in env */ + pthread_mutex_lock(&env->me_rpmutex); + x = mdb_mid3l_search(el, pgno); + if (x <= el[0].mid && el[x].mid == pgno) { + id3.mptr = el[x].mptr; + id3.mcnt = el[x].mcnt; + /* check for overflow size */ + p = (MDB_page *)((char *)id3.mptr + rem * env->me_psize); + if (IS_OVERFLOW(p) && p->mp_pages + rem > id3.mcnt) { + id3.mcnt = p->mp_pages + rem; + len = id3.mcnt * env->me_psize; + SET_OFF(off, pgno * env->me_psize); + MAP(rc, env, id3.mptr, len, off); + if (rc) + goto fail; + if (!el[x].mref) { + munmap(el[x].mptr, env->me_psize * el[x].mcnt); + el[x].mptr = id3.mptr; + el[x].mcnt = id3.mcnt; + } else { + id3.mid = pg0; + pthread_mutex_unlock(&env->me_rpmutex); + goto found; + } + } + el[x].mref++; + pthread_mutex_unlock(&env->me_rpmutex); + goto found; + } + if (el[0].mid >= MDB_ERPAGE_MAX - env->me_rpcheck) { + /* purge unref'd pages */ + unsigned i, y = 0; + for (i=1; i<=el[0].mid; i++) { + if (!el[i].mref) { + if (!y) y = i; + munmap(el[i].mptr, env->me_psize * el[i].mcnt); + } + } + if (!y) { + 
if (retries) { + /* see if we can unref some local pages */ + retries--; + id3.mid = 0; + goto retry; + } + if (el[0].mid >= MDB_ERPAGE_MAX) { + pthread_mutex_unlock(&env->me_rpmutex); + return MDB_MAP_FULL; + } + env->me_rpcheck /= 2; + } else { + for (i=y+1; i<= el[0].mid; i++) + if (el[i].mref) + el[y++] = el[i]; + el[0].mid = y-1; + if (!env->me_rpcheck) + env->me_rpcheck = 1; + while (env->me_rpcheck < el[0].mid && env->me_rpcheck < MDB_ERPAGE_SIZE/2) + env->me_rpcheck *= 2; + } + } + SET_OFF(off, pgno * env->me_psize); + MAP(rc, env, id3.mptr, len, off); + if (rc) { +fail: + pthread_mutex_unlock(&env->me_rpmutex); + return rc; + } + /* check for overflow size */ + p = (MDB_page *)((char *)id3.mptr + rem * env->me_psize); + if (IS_OVERFLOW(p) && p->mp_pages + rem > id3.mcnt) { + id3.mcnt = p->mp_pages + rem; + munmap(id3.mptr, len); + len = id3.mcnt * env->me_psize; + MAP(rc, env, id3.mptr, len, off); + if (rc) + goto fail; + } + mdb_mid3l_insert(el, &id3); + pthread_mutex_unlock(&env->me_rpmutex); +found: + mdb_mid3l_insert(tl, &id3); + } else { + return MDB_TXN_FULL; + } +ok: + p = (MDB_page *)((char *)id3.mptr + rem * env->me_psize); +#if MDB_DEBUG /* we don't need this check any more */ + if (IS_OVERFLOW(p)) { + mdb_tassert(txn, p->mp_pages + rem <= id3.mcnt); + } +#endif + *ret = p; + return MDB_SUCCESS; +} +#endif + /** Find the address of the page corresponding to a given page number. - * Set #MDB_TXN_ERROR on failure. * @param[in] mc the cursor accessing the page. * @param[in] pgno the page number for the page to retrieve. * @param[out] ret address of a pointer where the page's address will be stored. @@ -5422,11 +5972,13 @@ static int mdb_page_get(MDB_cursor *mc, pgno_t pgno, MDB_page **ret, int *lvl) { MDB_txn *txn = mc->mc_txn; +#ifndef MDB_VL32 MDB_env *env = txn->mt_env; +#endif MDB_page *p = NULL; int level; - if (! (txn->mt_flags & (MDB_TXN_RDONLY|MDB_TXN_WRITEMAP))) { + if (! 
(mc->mc_flags & (C_ORIG_RDONLY|C_WRITEMAP))) { MDB_txn *tx2 = txn; level = 1; do { @@ -5441,7 +5993,13 @@ mdb_page_get(MDB_cursor *mc, pgno_t pgno, MDB_page **ret, int *lvl) MDB_ID pn = pgno << 1; x = mdb_midl_search(tx2->mt_spill_pgs, pn); if (x <= tx2->mt_spill_pgs[0] && tx2->mt_spill_pgs[x] == pn) { +#ifdef MDB_VL32 + int rc = mdb_rpage_get(txn, pgno, &p); + if (rc) + return rc; +#else p = (MDB_page *)(env->me_map + env->me_psize * pgno); +#endif goto done; } } @@ -5458,9 +6016,17 @@ mdb_page_get(MDB_cursor *mc, pgno_t pgno, MDB_page **ret, int *lvl) if (pgno < txn->mt_next_pgno) { level = 0; +#ifdef MDB_VL32 + { + int rc = mdb_rpage_get(txn, pgno, &p); + if (rc) + return rc; + } +#else p = (MDB_page *)(env->me_map + env->me_psize * pgno); +#endif } else { - DPRINTF(("page %"Z"u not found", pgno)); + DPRINTF(("page %"Y"u not found", pgno)); txn->mt_flags |= MDB_TXN_ERROR; return MDB_PAGE_NOTFOUND; } @@ -5486,27 +6052,18 @@ mdb_page_search_root(MDB_cursor *mc, MDB_val *key, int flags) MDB_node *node; indx_t i; - DPRINTF(("branch page %"Z"u has %u keys", mp->mp_pgno, NUMKEYS(mp))); + DPRINTF(("branch page %"Y"u has %u keys", mp->mp_pgno, NUMKEYS(mp))); /* Don't assert on branch pages in the FreeDB. We can get here * while in the process of rebalancing a FreeDB branch page; we must * let that proceed. 
ITS#8336 */ mdb_cassert(mc, !mc->mc_dbi || NUMKEYS(mp) > 1); - DPRINTF(("found index 0 to page %"Z"u", NODEPGNO(NODEPTR(mp, 0)))); + DPRINTF(("found index 0 to page %"Y"u", NODEPGNO(NODEPTR(mp, 0)))); if (flags & (MDB_PS_FIRST|MDB_PS_LAST)) { i = 0; - if (flags & MDB_PS_LAST) { + if (flags & MDB_PS_LAST) i = NUMKEYS(mp) - 1; - /* if already init'd, see if we're already in right place */ - if (mc->mc_flags & C_INITIALIZED) { - if (mc->mc_ki[mc->mc_top] == i) { - mc->mc_top = mc->mc_snum++; - mp = mc->mc_pg[mc->mc_top]; - goto ready; - } - } - } } else { int exact; node = mdb_node_search(mc, key, &exact); @@ -5532,7 +6089,6 @@ mdb_page_search_root(MDB_cursor *mc, MDB_val *key, int flags) if ((rc = mdb_cursor_push(mc, mp))) return rc; -ready: if (flags & MDB_PS_MODIFY) { if ((rc = mdb_page_touch(mc)) != 0) return rc; @@ -5547,7 +6103,7 @@ ready: return MDB_CORRUPTED; } - DPRINTF(("found leaf page %"Z"u for key [%s]", mp->mp_pgno, + DPRINTF(("found leaf page %"Y"u for key [%s]", mp->mp_pgno, key ? 
DKEY(key) : "null")); mc->mc_flags |= C_INITIALIZED; mc->mc_flags &= ~C_EOF; @@ -5643,14 +6199,26 @@ mdb_page_search(MDB_cursor *mc, MDB_val *key, int flags) } mdb_cassert(mc, root > 1); - if (!mc->mc_pg[0] || mc->mc_pg[0]->mp_pgno != root) + if (!mc->mc_pg[0] || mc->mc_pg[0]->mp_pgno != root) { +#ifdef MDB_VL32 + if (mc->mc_pg[0]) + MDB_PAGE_UNREF(mc->mc_txn, mc->mc_pg[0]); +#endif if ((rc = mdb_page_get(mc, root, &mc->mc_pg[0], NULL)) != 0) return rc; + } +#ifdef MDB_VL32 + { + int i; + for (i=1; imc_snum; i++) + MDB_PAGE_UNREF(mc->mc_txn, mc->mc_pg[i]); + } +#endif mc->mc_snum = 1; mc->mc_top = 0; - DPRINTF(("db %d root page %"Z"u has flags 0x%X", + DPRINTF(("db %d root page %"Y"u has flags 0x%X", DDBI(mc), root, mc->mc_pg[0]->mp_flags)); if (flags & MDB_PS_MODIFY) { @@ -5675,7 +6243,7 @@ mdb_ovpage_free(MDB_cursor *mc, MDB_page *mp) MDB_ID pn = pg << 1; int rc; - DPRINTF(("free ov page %"Z"u (%d)", pg, ovpages)); + DPRINTF(("free ov page %"Y"u (%d)", pg, ovpages)); /* If the page is dirty or on the spill list we just acquired it, * so we should give it back to our current free list, if any. * Otherwise put it onto the list of pages we freed in this txn. 
@@ -5736,6 +6304,10 @@ release: if (rc) return rc; } +#ifdef MDB_VL32 + if (mc->mc_ovpg == mp) + mc->mc_ovpg = NULL; +#endif mc->mc_db->md_overflow_pages -= ovpages; return 0; } @@ -5753,6 +6325,12 @@ mdb_node_read(MDB_cursor *mc, MDB_node *leaf, MDB_val *data) pgno_t pgno; int rc; +#ifdef MDB_VL32 + if (mc->mc_ovpg) { + MDB_PAGE_UNREF(mc->mc_txn, mc->mc_ovpg); + mc->mc_ovpg = 0; + } +#endif if (!F_ISSET(leaf->mn_flags, F_BIGDATA)) { data->mv_size = NODEDSZ(leaf); data->mv_data = NODEDATA(leaf); @@ -5764,10 +6342,13 @@ mdb_node_read(MDB_cursor *mc, MDB_node *leaf, MDB_val *data) data->mv_size = NODEDSZ(leaf); memcpy(&pgno, NODEDATA(leaf), sizeof(pgno)); if ((rc = mdb_page_get(mc, pgno, &omp, NULL)) != 0) { - DPRINTF(("read overflow page %"Z"u failed", pgno)); + DPRINTF(("read overflow page %"Y"u failed", pgno)); return rc; } data->mv_data = METADATA(omp); +#ifdef MDB_VL32 + mc->mc_ovpg = omp; +#endif return MDB_SUCCESS; } @@ -5778,7 +6359,7 @@ mdb_get(MDB_txn *txn, MDB_dbi dbi, { MDB_cursor mc; MDB_xcursor mx; - int exact = 0; + int exact = 0, rc; DKBUF; DPRINTF(("===> get db %u key [%s]", dbi, DKEY(key))); @@ -5790,7 +6371,16 @@ mdb_get(MDB_txn *txn, MDB_dbi dbi, return MDB_BAD_TXN; mdb_cursor_init(&mc, txn, dbi, &mx); - return mdb_cursor_set(&mc, key, data, MDB_SET, &exact); + rc = mdb_cursor_set(&mc, key, data, MDB_SET, &exact); +#ifdef MDB_VL32 + { + /* unref all the pages - caller must copy the data + * before doing anything else + */ + mdb_cursor_unref(&mc); + } +#endif + return rc; } /** Find a sibling for a page. 
@@ -5807,13 +6397,19 @@ mdb_cursor_sibling(MDB_cursor *mc, int move_right) int rc; MDB_node *indx; MDB_page *mp; +#ifdef MDB_VL32 + MDB_page *op; +#endif if (mc->mc_snum < 2) { return MDB_NOTFOUND; /* root has no siblings */ } +#ifdef MDB_VL32 + op = mc->mc_pg[mc->mc_top]; +#endif mdb_cursor_pop(mc); - DPRINTF(("parent page is page %"Z"u, index %u", + DPRINTF(("parent page is page %"Y"u, index %u", mc->mc_pg[mc->mc_top]->mp_pgno, mc->mc_ki[mc->mc_top])); if (move_right ? (mc->mc_ki[mc->mc_top] + 1u >= NUMKEYS(mc->mc_pg[mc->mc_top])) @@ -5836,6 +6432,8 @@ mdb_cursor_sibling(MDB_cursor *mc, int move_right) } mdb_cassert(mc, IS_BRANCH(mc->mc_pg[mc->mc_top])); + MDB_PAGE_UNREF(mc->mc_txn, op); + indx = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); if ((rc = mdb_page_get(mc, NODEPGNO(indx), &mp, NULL)) != 0) { /* mc will be inconsistent if caller does mc_snum++ as above */ @@ -5858,20 +6456,14 @@ mdb_cursor_next(MDB_cursor *mc, MDB_val *key, MDB_val *data, MDB_cursor_op op) MDB_node *leaf; int rc; - if ((mc->mc_flags & C_DEL && op == MDB_NEXT_DUP)) + if (mc->mc_flags & C_EOF) { return MDB_NOTFOUND; + } - if (!(mc->mc_flags & C_INITIALIZED)) - return mdb_cursor_first(mc, key, data); + mdb_cassert(mc, mc->mc_flags & C_INITIALIZED); mp = mc->mc_pg[mc->mc_top]; - if (mc->mc_flags & C_EOF) { - if (mc->mc_ki[mc->mc_top] >= NUMKEYS(mp)-1) - return MDB_NOTFOUND; - mc->mc_flags ^= C_EOF; - } - if (mc->mc_db->md_flags & MDB_DUPSORT) { leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]); if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { @@ -5883,6 +6475,13 @@ mdb_cursor_next(MDB_cursor *mc, MDB_val *key, MDB_val *data, MDB_cursor_op op) return rc; } } +#ifdef MDB_VL32 + else { + if (mc->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED) { + mdb_cursor_unref(&mc->mc_xcursor->mx_cursor); + } + } +#endif } else { mc->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED|C_EOF); if (op == MDB_NEXT_DUP) @@ -5890,7 +6489,7 @@ mdb_cursor_next(MDB_cursor *mc, MDB_val *key, MDB_val *data, MDB_cursor_op 
op) } } - DPRINTF(("cursor_next: top page is %"Z"u in cursor %p", + DPRINTF(("cursor_next: top page is %"Y"u in cursor %p", mdb_dbg_pgno(mp), (void *) mc)); if (mc->mc_flags & C_DEL) { mc->mc_flags ^= C_DEL; @@ -5904,12 +6503,12 @@ mdb_cursor_next(MDB_cursor *mc, MDB_val *key, MDB_val *data, MDB_cursor_op op) return rc; } mp = mc->mc_pg[mc->mc_top]; - DPRINTF(("next page is %"Z"u, key index %u", mp->mp_pgno, mc->mc_ki[mc->mc_top])); + DPRINTF(("next page is %"Y"u, key index %u", mp->mp_pgno, mc->mc_ki[mc->mc_top])); } else mc->mc_ki[mc->mc_top]++; skip: - DPRINTF(("==> cursor points to page %"Z"u with %u keys, key index %u", + DPRINTF(("==> cursor points to page %"Y"u with %u keys, key index %u", mdb_dbg_pgno(mp), NUMKEYS(mp), mc->mc_ki[mc->mc_top])); if (IS_LEAF2(mp)) { @@ -5947,12 +6546,7 @@ mdb_cursor_prev(MDB_cursor *mc, MDB_val *key, MDB_val *data, MDB_cursor_op op) MDB_node *leaf; int rc; - if (!(mc->mc_flags & C_INITIALIZED)) { - rc = mdb_cursor_last(mc, key, data); - if (rc) - return rc; - mc->mc_ki[mc->mc_top]++; - } + mdb_cassert(mc, mc->mc_flags & C_INITIALIZED); mp = mc->mc_pg[mc->mc_top]; @@ -5969,6 +6563,13 @@ mdb_cursor_prev(MDB_cursor *mc, MDB_val *key, MDB_val *data, MDB_cursor_op op) return rc; } } +#ifdef MDB_VL32 + else { + if (mc->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED) { + mdb_cursor_unref(&mc->mc_xcursor->mx_cursor); + } + } +#endif } else { mc->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED|C_EOF); if (op == MDB_PREV_DUP) @@ -5976,7 +6577,7 @@ mdb_cursor_prev(MDB_cursor *mc, MDB_val *key, MDB_val *data, MDB_cursor_op op) } } - DPRINTF(("cursor_prev: top page is %"Z"u in cursor %p", + DPRINTF(("cursor_prev: top page is %"Y"u in cursor %p", mdb_dbg_pgno(mp), (void *) mc)); mc->mc_flags &= ~(C_EOF|C_DEL); @@ -5988,11 +6589,13 @@ mdb_cursor_prev(MDB_cursor *mc, MDB_val *key, MDB_val *data, MDB_cursor_op op) } mp = mc->mc_pg[mc->mc_top]; mc->mc_ki[mc->mc_top] = NUMKEYS(mp) - 1; - DPRINTF(("prev page is %"Z"u, key index %u", 
mp->mp_pgno, mc->mc_ki[mc->mc_top])); + DPRINTF(("prev page is %"Y"u, key index %u", mp->mp_pgno, mc->mc_ki[mc->mc_top])); } else mc->mc_ki[mc->mc_top]--; - DPRINTF(("==> cursor points to page %"Z"u with %u keys, key index %u", + mc->mc_flags &= ~C_EOF; + + DPRINTF(("==> cursor points to page %"Y"u with %u keys, key index %u", mdb_dbg_pgno(mp), NUMKEYS(mp), mc->mc_ki[mc->mc_top])); if (IS_LEAF2(mp)) { @@ -6035,8 +6638,14 @@ mdb_cursor_set(MDB_cursor *mc, MDB_val *key, MDB_val *data, if (key->mv_size == 0) return MDB_BAD_VALSIZE; - if (mc->mc_xcursor) + if (mc->mc_xcursor) { +#ifdef MDB_VL32 + if (mc->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED) { + mdb_cursor_unref(&mc->mc_xcursor->mx_cursor); + } +#endif mc->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED|C_EOF); + } /* See if we're already on the right page */ if (mc->mc_flags & C_INITIALIZED) { @@ -6102,7 +6711,6 @@ mdb_cursor_set(MDB_cursor *mc, MDB_val *key, MDB_val *data, } } rc = 0; - mc->mc_flags &= ~C_EOF; goto set2; } } @@ -6194,8 +6802,8 @@ set1: if ((rc = mdb_node_read(mc, leaf, &olddata)) != MDB_SUCCESS) return rc; dcmp = mc->mc_dbx->md_dcmp; -#if UINT_MAX < SIZE_MAX - if (dcmp == mdb_cmp_int && olddata.mv_size == sizeof(size_t)) +#if UINT_MAX < SIZE_MAX || defined(MDB_VL32) + if (dcmp == mdb_cmp_int && olddata.mv_size == sizeof(mdb_size_t)) dcmp = mdb_cmp_clong; #endif rc = dcmp(data, &olddata); @@ -6229,8 +6837,14 @@ mdb_cursor_first(MDB_cursor *mc, MDB_val *key, MDB_val *data) int rc; MDB_node *leaf; - if (mc->mc_xcursor) + if (mc->mc_xcursor) { +#ifdef MDB_VL32 + if (mc->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED) { + mdb_cursor_unref(&mc->mc_xcursor->mx_cursor); + } +#endif mc->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED|C_EOF); + } if (!(mc->mc_flags & C_INITIALIZED) || mc->mc_top) { rc = mdb_page_search(mc, NULL, MDB_PS_FIRST); @@ -6273,16 +6887,25 @@ mdb_cursor_last(MDB_cursor *mc, MDB_val *key, MDB_val *data) int rc; MDB_node *leaf; - if (mc->mc_xcursor) + if (mc->mc_xcursor) { 
+#ifdef MDB_VL32 + if (mc->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED) { + mdb_cursor_unref(&mc->mc_xcursor->mx_cursor); + } +#endif mc->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED|C_EOF); - - if (!(mc->mc_flags & C_INITIALIZED) || mc->mc_top) { - rc = mdb_page_search(mc, NULL, MDB_PS_LAST); - if (rc != MDB_SUCCESS) - return rc; } - mdb_cassert(mc, IS_LEAF(mc->mc_pg[mc->mc_top])); + if (!(mc->mc_flags & C_EOF)) { + + if (!(mc->mc_flags & C_INITIALIZED) || mc->mc_top) { + rc = mdb_page_search(mc, NULL, MDB_PS_LAST); + if (rc != MDB_SUCCESS) + return rc; + } + mdb_cassert(mc, IS_LEAF(mc->mc_pg[mc->mc_top])); + + } mc->mc_ki[mc->mc_top] = NUMKEYS(mc->mc_pg[mc->mc_top]) - 1; mc->mc_flags |= C_INITIALIZED|C_EOF; leaf = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); @@ -6396,7 +7019,10 @@ mdb_cursor_get(MDB_cursor *mc, MDB_val *key, MDB_val *data, rc = MDB_INCOMPATIBLE; break; } - rc = mdb_cursor_next(mc, key, data, MDB_NEXT_DUP); + if (!(mc->mc_flags & C_INITIALIZED)) + rc = mdb_cursor_first(mc, key, data); + else + rc = mdb_cursor_next(mc, key, data, MDB_NEXT_DUP); if (rc == MDB_SUCCESS) { if (mc->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED) { MDB_cursor *mx; @@ -6438,11 +7064,21 @@ fetchm: case MDB_NEXT: case MDB_NEXT_DUP: case MDB_NEXT_NODUP: - rc = mdb_cursor_next(mc, key, data, op); + if (!(mc->mc_flags & C_INITIALIZED)) + rc = mdb_cursor_first(mc, key, data); + else + rc = mdb_cursor_next(mc, key, data, op); break; case MDB_PREV: case MDB_PREV_DUP: case MDB_PREV_NODUP: + if (!(mc->mc_flags & C_INITIALIZED)) { + rc = mdb_cursor_last(mc, key, data); + if (rc) + break; + mc->mc_flags |= C_INITIALIZED; + mc->mc_ki[mc->mc_top]++; + } rc = mdb_cursor_prev(mc, key, data, op); break; case MDB_FIRST: @@ -6459,11 +7095,6 @@ fetchm: rc = MDB_INCOMPATIBLE; break; } - if (mc->mc_ki[mc->mc_top] >= NUMKEYS(mc->mc_pg[mc->mc_top])) { - mc->mc_ki[mc->mc_top] = NUMKEYS(mc->mc_pg[mc->mc_top]); - rc = MDB_NOTFOUND; - break; - } { MDB_node *leaf = 
NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); if (!F_ISSET(leaf->mn_flags, F_DUPDATA)) { @@ -6505,8 +7136,7 @@ mdb_cursor_touch(MDB_cursor *mc) { int rc = MDB_SUCCESS; - if (mc->mc_dbi >= CORE_DBS && !(*mc->mc_dbflag & (DB_DIRTY|DB_DUPDATA))) { - /* Touch DB record of named DB */ + if (mc->mc_dbi >= CORE_DBS && !(*mc->mc_dbflag & DB_DIRTY)) { MDB_cursor mc2; MDB_xcursor mcx; if (TXN_DBI_CHANGED(mc->mc_txn, mc->mc_dbi)) @@ -6731,8 +7361,8 @@ more: if (flags == MDB_CURRENT) goto current; dcmp = mc->mc_dbx->md_dcmp; -#if UINT_MAX < SIZE_MAX - if (dcmp == mdb_cmp_int && olddata.mv_size == sizeof(size_t)) +#if UINT_MAX < SIZE_MAX || defined(MDB_VL32) + if (dcmp == mdb_cmp_int && olddata.mv_size == sizeof(mdb_size_t)) dcmp = mdb_cmp_clong; #endif /* does data match? */ @@ -6830,9 +7460,8 @@ prep_subDB: } else { memcpy((char *)mp + mp->mp_upper + PAGEBASE, (char *)fp + fp->mp_upper + PAGEBASE, olddata.mv_size - fp->mp_upper - PAGEBASE); - memcpy((char *)(&mp->mp_ptrs), (char *)(&fp->mp_ptrs), NUMKEYS(fp) * sizeof(mp->mp_ptrs[0])); for (i=0; imp_ptrs[i] += offset; + mp->mp_ptrs[i] = fp->mp_ptrs[i] + offset; } } @@ -6886,13 +7515,8 @@ current: /* Note - this page is already counted in parent's dirty_room */ rc2 = mdb_mid2l_insert(mc->mc_txn->mt_u.dirty_list, &id2); mdb_cassert(mc, rc2 == 0); - /* Currently we make the page look as with put() in the - * parent txn, in case the user peeks at MDB_RESERVEd - * or unused parts. Some users treat ovpages specially. - */ if (!(flags & MDB_RESERVE)) { - /* Skip the part where LMDB will put *data. - * Copy end of page, adjusting alignment so + /* Copy end of page, adjusting alignment so * compiler may copy words instead of bytes. 
*/ off = (PAGEHDRSZ + data->mv_size) & -sizeof(size_t); @@ -6961,7 +7585,11 @@ new_sub: if (m3->mc_ki[i] >= mc->mc_ki[i] && insert_key) { m3->mc_ki[i]++; } - XCURSOR_REFRESH(m3, i, mp); + if (m3->mc_xcursor && (m3->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED)) { + MDB_node *n2 = NODEPTR(mp, m3->mc_ki[i]); + if ((n2->mn_flags & (F_SUBDATA|F_DUPDATA)) == F_DUPDATA) + m3->mc_xcursor->mx_cursor.mc_pg[0] = NODEDATA(n2); + } } } } @@ -6974,7 +7602,7 @@ new_sub: */ if (do_sub) { int xflags, new_dupdata; - size_t ecount; + mdb_size_t ecount; put_sub: xdata.mv_size = 0; xdata.mv_data = ""; @@ -7003,6 +7631,7 @@ put_sub: MDB_xcursor *mx = mc->mc_xcursor; unsigned i = mc->mc_top; MDB_page *mp = mc->mc_pg[i]; + int nkeys = NUMKEYS(mp); for (m2 = mc->mc_txn->mt_cursors[mc->mc_dbi]; m2; m2=m2->mc_next) { if (m2 == mc || m2->mc_snum < mc->mc_snum) continue; @@ -7010,8 +7639,10 @@ put_sub: if (m2->mc_pg[i] == mp) { if (m2->mc_ki[i] == mc->mc_ki[i]) { mdb_xcursor_init2(m2, mx, new_dupdata); - } else if (!insert_key) { - XCURSOR_REFRESH(m2, i, mp); + } else if (!insert_key && m2->mc_ki[i] < nkeys) { + MDB_node *n2 = NODEPTR(mp, m2->mc_ki[i]); + if ((n2->mn_flags & (F_SUBDATA|F_DUPDATA)) == F_DUPDATA) + m2->mc_xcursor->mx_cursor.mc_pg[0] = NODEDATA(n2); } } } @@ -7116,7 +7747,13 @@ mdb_cursor_del(MDB_cursor *mc, unsigned int flags) if (m2 == mc || m2->mc_snum < mc->mc_snum) continue; if (!(m2->mc_flags & C_INITIALIZED)) continue; if (m2->mc_pg[mc->mc_top] == mp) { - XCURSOR_REFRESH(m2, mc->mc_top, mp); + if (m2->mc_ki[mc->mc_top] == mc->mc_ki[mc->mc_top]) { + m2->mc_xcursor->mx_cursor.mc_pg[0] = NODEDATA(leaf); + } else { + MDB_node *n2 = NODEPTR(mp, m2->mc_ki[mc->mc_top]); + if (!(n2->mn_flags & F_SUBDATA)) + m2->mc_xcursor->mx_cursor.mc_pg[0] = NODEDATA(n2); + } } } } @@ -7161,7 +7798,6 @@ fail: } /** Allocate and initialize new pages for a database. - * Set #MDB_TXN_ERROR on failure. * @param[in] mc a cursor on the database being added to. 
* @param[in] flags flags defining what type of page is being allocated. * @param[in] num the number of pages to allocate. This is usually 1, @@ -7177,7 +7813,7 @@ mdb_page_new(MDB_cursor *mc, uint32_t flags, int num, MDB_page **mp) if ((rc = mdb_page_alloc(mc, num, &np))) return rc; - DPRINTF(("allocated new mpage %"Z"u, page size %u", + DPRINTF(("allocated new mpage %"Y"u, page size %u", np->mp_pgno, mc->mc_txn->mt_env->me_psize)); np->mp_flags = flags | P_DIRTY; np->mp_lower = (PAGEHDRSZ-PAGEBASE); @@ -7247,7 +7883,6 @@ mdb_branch_size(MDB_env *env, MDB_val *key) } /** Add a node to the page pointed to by the cursor. - * Set #MDB_TXN_ERROR on failure. * @param[in] mc The cursor for this operation. * @param[in] indx The index on the page where the new node should be added. * @param[in] key The key for the new node. @@ -7278,7 +7913,7 @@ mdb_node_add(MDB_cursor *mc, indx_t indx, mdb_cassert(mc, mp->mp_upper >= mp->mp_lower); - DPRINTF(("add to %s %spage %"Z"u index %i, data size %"Z"u key size %"Z"u [%s]", + DPRINTF(("add to %s %spage %"Y"u index %i, data size %"Z"u key size %"Z"u [%s]", IS_LEAF(mp) ? "leaf" : "branch", IS_SUBP(mp) ? "sub-" : "", mdb_dbg_pgno(mp), indx, data ? 
data->mv_size : 0, @@ -7319,7 +7954,7 @@ mdb_node_add(MDB_cursor *mc, indx_t indx, goto full; if ((rc = mdb_page_new(mc, P_OVERFLOW, ovpages, &ofp))) return rc; - DPRINTF(("allocated overflow page %"Z"u", ofp->mp_pgno)); + DPRINTF(("allocated overflow page %"Y"u", ofp->mp_pgno)); flags |= F_BIGDATA; goto update; } else { @@ -7376,7 +8011,7 @@ update: return MDB_SUCCESS; full: - DPRINTF(("not enough room in page %"Z"u, got %u ptrs", + DPRINTF(("not enough room in page %"Y"u, got %u ptrs", mdb_dbg_pgno(mp), NUMKEYS(mp))); DPRINTF(("upper-lower = %u - %u = %"Z"d", mp->mp_upper,mp->mp_lower,room)); DPRINTF(("node size = %"Z"u", node_size)); @@ -7399,7 +8034,7 @@ mdb_node_del(MDB_cursor *mc, int ksize) MDB_node *node; char *base; - DPRINTF(("delete node %u on %s page %"Z"u", indx, + DPRINTF(("delete node %u on %s page %"Y"u", indx, IS_LEAF(mp) ? "leaf" : "branch", mdb_dbg_pgno(mp))); numkeys = NUMKEYS(mp); mdb_cassert(mc, indx < numkeys); @@ -7508,7 +8143,10 @@ mdb_xcursor_init0(MDB_cursor *mc) mx->mx_cursor.mc_dbflag = &mx->mx_dbflag; mx->mx_cursor.mc_snum = 0; mx->mx_cursor.mc_top = 0; - mx->mx_cursor.mc_flags = C_SUB; +#ifdef MDB_VL32 + mx->mx_cursor.mc_ovpg = 0; +#endif + mx->mx_cursor.mc_flags = C_SUB | (mc->mc_flags & (C_ORIG_RDONLY|C_WRITEMAP)); mx->mx_dbx.md_name.mv_size = 0; mx->mx_dbx.md_name.mv_data = NULL; mx->mx_dbx.md_cmp = mc->mc_dbx->md_dcmp; @@ -7527,12 +8165,12 @@ mdb_xcursor_init1(MDB_cursor *mc, MDB_node *node) { MDB_xcursor *mx = mc->mc_xcursor; + mx->mx_cursor.mc_flags &= C_SUB|C_ORIG_RDONLY|C_WRITEMAP; if (node->mn_flags & F_SUBDATA) { memcpy(&mx->mx_db, NODEDATA(node), sizeof(MDB_db)); mx->mx_cursor.mc_pg[0] = 0; mx->mx_cursor.mc_snum = 0; mx->mx_cursor.mc_top = 0; - mx->mx_cursor.mc_flags = C_SUB; } else { MDB_page *fp = NODEDATA(node); mx->mx_db.md_pad = 0; @@ -7545,7 +8183,7 @@ mdb_xcursor_init1(MDB_cursor *mc, MDB_node *node) COPY_PGNO(mx->mx_db.md_root, fp->mp_pgno); mx->mx_cursor.mc_snum = 1; mx->mx_cursor.mc_top = 0; - 
mx->mx_cursor.mc_flags = C_INITIALIZED|C_SUB; + mx->mx_cursor.mc_flags |= C_INITIALIZED; mx->mx_cursor.mc_pg[0] = fp; mx->mx_cursor.mc_ki[0] = 0; if (mc->mc_db->md_flags & MDB_DUPFIXED) { @@ -7555,11 +8193,11 @@ mdb_xcursor_init1(MDB_cursor *mc, MDB_node *node) mx->mx_db.md_flags |= MDB_INTEGERKEY; } } - DPRINTF(("Sub-db -%u root page %"Z"u", mx->mx_cursor.mc_dbi, + DPRINTF(("Sub-db -%u root page %"Y"u", mx->mx_cursor.mc_dbi, mx->mx_db.md_root)); - mx->mx_dbflag = DB_VALID|DB_USRVALID|DB_DUPDATA; -#if UINT_MAX < SIZE_MAX - if (mx->mx_dbx.md_cmp == mdb_cmp_int && mx->mx_db.md_pad == sizeof(size_t)) + mx->mx_dbflag = DB_VALID|DB_USRVALID|DB_DIRTY; /* DB_DIRTY guides mdb_cursor_touch */ +#if UINT_MAX < SIZE_MAX || defined(MDB_VL32) + if (mx->mx_dbx.md_cmp == mdb_cmp_int && mx->mx_db.md_pad == sizeof(mdb_size_t)) mx->mx_dbx.md_cmp = mdb_cmp_clong; #endif } @@ -7583,7 +8221,7 @@ mdb_xcursor_init2(MDB_cursor *mc, MDB_xcursor *src_mx, int new_dupdata) mx->mx_cursor.mc_top = 0; mx->mx_cursor.mc_flags |= C_INITIALIZED; mx->mx_cursor.mc_ki[0] = 0; - mx->mx_dbflag = DB_VALID|DB_USRVALID|DB_DUPDATA; + mx->mx_dbflag = DB_VALID|DB_USRVALID|DB_DIRTY; /* DB_DIRTY guides mdb_cursor_touch */ #if UINT_MAX < SIZE_MAX mx->mx_dbx.md_cmp = src_mx->mx_dbx.md_cmp; #endif @@ -7592,7 +8230,7 @@ mdb_xcursor_init2(MDB_cursor *mc, MDB_xcursor *src_mx, int new_dupdata) } mx->mx_db = src_mx->mx_db; mx->mx_cursor.mc_pg[0] = src_mx->mx_cursor.mc_pg[0]; - DPRINTF(("Sub-db -%u root page %"Z"u", mx->mx_cursor.mc_dbi, + DPRINTF(("Sub-db -%u root page %"Y"u", mx->mx_cursor.mc_dbi, mx->mx_db.md_root)); } @@ -7611,7 +8249,10 @@ mdb_cursor_init(MDB_cursor *mc, MDB_txn *txn, MDB_dbi dbi, MDB_xcursor *mx) mc->mc_top = 0; mc->mc_pg[0] = 0; mc->mc_ki[0] = 0; - mc->mc_flags = 0; +#ifdef MDB_VL32 + mc->mc_ovpg = 0; +#endif + mc->mc_flags = txn->mt_flags & (C_ORIG_RDONLY|C_WRITEMAP); if (txn->mt_dbs[dbi].md_flags & MDB_DUPSORT) { mdb_tassert(txn, mx != NULL); mc->mc_xcursor = mx; @@ -7676,7 +8317,7 @@ 
mdb_cursor_renew(MDB_txn *txn, MDB_cursor *mc) /* Return the count of duplicate data items for the current key */ int -mdb_cursor_count(MDB_cursor *mc, size_t *countp) +mdb_cursor_count(MDB_cursor *mc, mdb_size_t *countp) { MDB_node *leaf; @@ -7692,15 +8333,9 @@ mdb_cursor_count(MDB_cursor *mc, size_t *countp) if (!(mc->mc_flags & C_INITIALIZED)) return EINVAL; - if (!mc->mc_snum) + if (!mc->mc_snum || (mc->mc_flags & C_EOF)) return MDB_NOTFOUND; - if (mc->mc_flags & C_EOF) { - if (mc->mc_ki[mc->mc_top] >= NUMKEYS(mc->mc_pg[mc->mc_top])) - return MDB_NOTFOUND; - mc->mc_flags ^= C_EOF; - } - leaf = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); if (!F_ISSET(leaf->mn_flags, F_DUPDATA)) { *countp = 1; @@ -7716,6 +8351,11 @@ mdb_cursor_count(MDB_cursor *mc, size_t *countp) void mdb_cursor_close(MDB_cursor *mc) { +#ifdef MDB_VL32 + if (mc) { + mdb_cursor_unref(mc); + } +#endif if (mc && !mc->mc_backup) { /* remove from txn, if tracked */ if ((mc->mc_flags & C_UNTRACK) && mc->mc_txn->mt_cursors) { @@ -7742,7 +8382,6 @@ mdb_cursor_dbi(MDB_cursor *mc) } /** Replace the key for a branch node with a new key. - * Set #MDB_TXN_ERROR on failure. * @param[in] mc Cursor pointing to the node to operate on. * @param[in] key The new key to use. * @return 0 on success, non-zero on failure. @@ -7768,7 +8407,7 @@ mdb_update_key(MDB_cursor *mc, MDB_val *key) char kbuf2[DKBUF_MAXKEYSIZE*2+1]; k2.mv_data = NODEKEY(node); k2.mv_size = node->mn_ksize; - DPRINTF(("update key %u (ofs %u) [%s] to [%s] on page %"Z"u", + DPRINTF(("update key %u (ofs %u) [%s] to [%s] on page %"Y"u", indx, ptr, mdb_dkey(&k2, kbuf2), DKEY(key), @@ -7916,7 +8555,7 @@ mdb_node_move(MDB_cursor *csrc, MDB_cursor *cdst, int fromleft) return rc; } - DPRINTF(("moving %s node %u [%s] on page %"Z"u to node %u on page %"Z"u", + DPRINTF(("moving %s node %u [%s] on page %"Y"u to node %u on page %"Y"u", IS_LEAF(csrc->mc_pg[csrc->mc_top]) ? 
"leaf" : "branch", csrc->mc_ki[csrc->mc_top], DKEY(&key), @@ -7962,8 +8601,12 @@ mdb_node_move(MDB_cursor *csrc, MDB_cursor *cdst, int fromleft) m3->mc_ki[csrc->mc_top] = cdst->mc_ki[cdst->mc_top]; m3->mc_ki[csrc->mc_top-1]++; } - if (IS_LEAF(mps)) - XCURSOR_REFRESH(m3, csrc->mc_top, m3->mc_pg[csrc->mc_top]); + if (m3->mc_xcursor && (m3->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED) && + IS_LEAF(mps)) { + MDB_node *node = NODEPTR(m3->mc_pg[csrc->mc_top], m3->mc_ki[csrc->mc_top]); + if ((node->mn_flags & (F_DUPDATA|F_SUBDATA)) == F_DUPDATA) + m3->mc_xcursor->mx_cursor.mc_pg[0] = NODEDATA(node); + } } } else /* Adding on the right, bump others down */ @@ -7984,8 +8627,12 @@ mdb_node_move(MDB_cursor *csrc, MDB_cursor *cdst, int fromleft) } else { m3->mc_ki[csrc->mc_top]--; } - if (IS_LEAF(mps)) - XCURSOR_REFRESH(m3, csrc->mc_top, m3->mc_pg[csrc->mc_top]); + if (m3->mc_xcursor && (m3->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED) && + IS_LEAF(mps)) { + MDB_node *node = NODEPTR(m3->mc_pg[csrc->mc_top], m3->mc_ki[csrc->mc_top]); + if ((node->mn_flags & (F_DUPDATA|F_SUBDATA)) == F_DUPDATA) + m3->mc_xcursor->mx_cursor.mc_pg[0] = NODEDATA(node); + } } } } @@ -8002,7 +8649,7 @@ mdb_node_move(MDB_cursor *csrc, MDB_cursor *cdst, int fromleft) key.mv_size = NODEKSZ(srcnode); key.mv_data = NODEKEY(srcnode); } - DPRINTF(("update separator for source page %"Z"u to [%s]", + DPRINTF(("update separator for source page %"Y"u to [%s]", csrc->mc_pg[csrc->mc_top]->mp_pgno, DKEY(&key))); mdb_cursor_copy(csrc, &mn); mn.mc_snum--; @@ -8033,7 +8680,7 @@ mdb_node_move(MDB_cursor *csrc, MDB_cursor *cdst, int fromleft) key.mv_size = NODEKSZ(srcnode); key.mv_data = NODEKEY(srcnode); } - DPRINTF(("update separator for destination page %"Z"u to [%s]", + DPRINTF(("update separator for destination page %"Y"u to [%s]", cdst->mc_pg[cdst->mc_top]->mp_pgno, DKEY(&key))); mdb_cursor_copy(cdst, &mn); mn.mc_snum--; @@ -8079,7 +8726,7 @@ mdb_page_merge(MDB_cursor *csrc, MDB_cursor *cdst) psrc = 
csrc->mc_pg[csrc->mc_top]; pdst = cdst->mc_pg[cdst->mc_top]; - DPRINTF(("merging page %"Z"u into %"Z"u", psrc->mp_pgno, pdst->mp_pgno)); + DPRINTF(("merging page %"Y"u into %"Y"u", psrc->mp_pgno, pdst->mp_pgno)); mdb_cassert(csrc, csrc->mc_snum > 1); /* can't merge root page */ mdb_cassert(csrc, cdst->mc_snum > 1); @@ -8136,7 +8783,7 @@ mdb_page_merge(MDB_cursor *csrc, MDB_cursor *cdst) } } - DPRINTF(("dst page %"Z"u now has %u keys (%.1f%% filled)", + DPRINTF(("dst page %"Y"u now has %u keys (%.1f%% filled)", pdst->mp_pgno, NUMKEYS(pdst), (float)PAGEFILL(cdst->mc_txn->mt_env, pdst) / 10)); @@ -8186,8 +8833,12 @@ mdb_page_merge(MDB_cursor *csrc, MDB_cursor *cdst) m3->mc_ki[top-1] > csrc->mc_ki[top-1]) { m3->mc_ki[top-1]--; } - if (IS_LEAF(psrc)) - XCURSOR_REFRESH(m3, top, m3->mc_pg[top]); + if (m3->mc_xcursor && (m3->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED) && + IS_LEAF(psrc)) { + MDB_node *node = NODEPTR(m3->mc_pg[top], m3->mc_ki[top]); + if ((node->mn_flags & (F_DUPDATA|F_SUBDATA)) == F_DUPDATA) + m3->mc_xcursor->mx_cursor.mc_pg[0] = NODEDATA(node); + } } } { @@ -8220,6 +8871,9 @@ mdb_cursor_copy(const MDB_cursor *csrc, MDB_cursor *cdst) cdst->mc_snum = csrc->mc_snum; cdst->mc_top = csrc->mc_top; cdst->mc_flags = csrc->mc_flags; +#ifdef MDB_VL32 + cdst->mc_ovpg = csrc->mc_ovpg; +#endif for (i=0; imc_snum; i++) { cdst->mc_pg[i] = csrc->mc_pg[i]; @@ -8248,14 +8902,14 @@ mdb_rebalance(MDB_cursor *mc) minkeys = 1; thresh = FILL_THRESHOLD; } - DPRINTF(("rebalancing %s page %"Z"u (has %u keys, %.1f%% full)", + DPRINTF(("rebalancing %s page %"Y"u (has %u keys, %.1f%% full)", IS_LEAF(mc->mc_pg[mc->mc_top]) ? 
"leaf" : "branch", mdb_dbg_pgno(mc->mc_pg[mc->mc_top]), NUMKEYS(mc->mc_pg[mc->mc_top]), (float)PAGEFILL(mc->mc_txn->mt_env, mc->mc_pg[mc->mc_top]) / 10)); if (PAGEFILL(mc->mc_txn->mt_env, mc->mc_pg[mc->mc_top]) >= thresh && NUMKEYS(mc->mc_pg[mc->mc_top]) >= minkeys) { - DPRINTF(("no need to rebalance page %"Z"u, above fill threshold", + DPRINTF(("no need to rebalance page %"Y"u, above fill threshold", mdb_dbg_pgno(mc->mc_pg[mc->mc_top]))); return MDB_SUCCESS; } @@ -8384,7 +9038,7 @@ mdb_rebalance(MDB_cursor *mc) fromleft = 1; } - DPRINTF(("found neighbor page %"Z"u (%u keys, %.1f%% full)", + DPRINTF(("found neighbor page %"Y"u (%u keys, %.1f%% full)", mn.mc_pg[mn.mc_top]->mp_pgno, NUMKEYS(mn.mc_pg[mn.mc_top]), (float)PAGEFILL(mc->mc_txn->mt_env, mn.mc_pg[mn.mc_top]) / 10)); @@ -8441,15 +9095,16 @@ mdb_cursor_del0(MDB_cursor *mc) if (m3->mc_pg[mc->mc_top] == mp) { if (m3->mc_ki[mc->mc_top] == ki) { m3->mc_flags |= C_DEL; - if (mc->mc_db->md_flags & MDB_DUPSORT) { - /* Sub-cursor referred into dataset which is gone */ - m3->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED|C_EOF); - } - continue; + if (mc->mc_db->md_flags & MDB_DUPSORT) + m3->mc_xcursor->mx_cursor.mc_flags &= ~C_INITIALIZED; } else if (m3->mc_ki[mc->mc_top] > ki) { m3->mc_ki[mc->mc_top]--; } - XCURSOR_REFRESH(m3, mc->mc_top, mp); + if (m3->mc_xcursor && (m3->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED)) { + MDB_node *node = NODEPTR(m3->mc_pg[mc->mc_top], m3->mc_ki[mc->mc_top]); + if ((node->mn_flags & (F_DUPDATA|F_SUBDATA)) == F_DUPDATA) + m3->mc_xcursor->mx_cursor.mc_pg[0] = NODEDATA(node); + } } } } @@ -8475,32 +9130,11 @@ mdb_cursor_del0(MDB_cursor *mc) continue; if (m3->mc_pg[mc->mc_top] == mp) { /* if m3 points past last node in page, find next sibling */ - if (m3->mc_ki[mc->mc_top] >= mc->mc_ki[mc->mc_top]) { - if (m3->mc_ki[mc->mc_top] >= nkeys) { - rc = mdb_cursor_sibling(m3, 1); - if (rc == MDB_NOTFOUND) { - m3->mc_flags |= C_EOF; - rc = MDB_SUCCESS; - continue; - } - } - if 
(mc->mc_db->md_flags & MDB_DUPSORT) { - MDB_node *node = NODEPTR(m3->mc_pg[m3->mc_top], m3->mc_ki[m3->mc_top]); - /* If this node has dupdata, it may need to be reinited - * because its data has moved. - * If the xcursor was not initd it must be reinited. - * Else if node points to a subDB, nothing is needed. - * Else (xcursor was initd, not a subDB) needs mc_pg[0] reset. - */ - if (node->mn_flags & F_DUPDATA) { - if (m3->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED) { - if (!(node->mn_flags & F_SUBDATA)) - m3->mc_xcursor->mx_cursor.mc_pg[0] = NODEDATA(node); - } else { - mdb_xcursor_init1(m3, node); - m3->mc_xcursor->mx_cursor.mc_flags |= C_DEL; - } - } + if (m3->mc_ki[mc->mc_top] >= nkeys) { + rc = mdb_cursor_sibling(m3, 1); + if (rc == MDB_NOTFOUND) { + m3->mc_flags |= C_EOF; + rc = MDB_SUCCESS; } } } @@ -8575,7 +9209,6 @@ mdb_del0(MDB_txn *txn, MDB_dbi dbi, } /** Split a page and insert a new node. - * Set #MDB_TXN_ERROR on failure. * @param[in,out] mc Cursor pointing to the page and desired insertion index. * The cursor will be updated to point to the actual page and index where * the node got inserted after the split. @@ -8607,7 +9240,7 @@ mdb_page_split(MDB_cursor *mc, MDB_val *newkey, MDB_val *newdata, pgno_t newpgno newindx = mc->mc_ki[mc->mc_top]; nkeys = NUMKEYS(mp); - DPRINTF(("-----> splitting %s page %"Z"u and adding [%s] at index %i/%i", + DPRINTF(("-----> splitting %s page %"Y"u and adding [%s] at index %i/%i", IS_LEAF(mp) ? "leaf" : "branch", mp->mp_pgno, DKEY(newkey), mc->mc_ki[mc->mc_top], nkeys)); @@ -8615,7 +9248,7 @@ mdb_page_split(MDB_cursor *mc, MDB_val *newkey, MDB_val *newdata, pgno_t newpgno if ((rc = mdb_page_new(mc, mp->mp_flags, 1, &rp))) return rc; rp->mp_pad = mp->mp_pad; - DPRINTF(("new right sibling: page %"Z"u", rp->mp_pgno)); + DPRINTF(("new right sibling: page %"Y"u", rp->mp_pgno)); /* Usually when splitting the root page, the cursor * height is 1. 
But when called from mdb_update_key, @@ -8633,7 +9266,7 @@ mdb_page_split(MDB_cursor *mc, MDB_val *newkey, MDB_val *newdata, pgno_t newpgno mc->mc_pg[0] = pp; mc->mc_ki[0] = 0; mc->mc_db->md_root = pp->mp_pgno; - DPRINTF(("root split! new root = %"Z"u", pp->mp_pgno)); + DPRINTF(("root split! new root = %"Y"u", pp->mp_pgno)); new_root = mc->mc_db->md_depth++; /* Add left (implicit) pointer. */ @@ -8650,7 +9283,7 @@ mdb_page_split(MDB_cursor *mc, MDB_val *newkey, MDB_val *newdata, pgno_t newpgno ptop = 0; } else { ptop = mc->mc_top-1; - DPRINTF(("parent branch page is %"Z"u", mc->mc_pg[ptop]->mp_pgno)); + DPRINTF(("parent branch page is %"Y"u", mc->mc_pg[ptop]->mp_pgno)); } mdb_cursor_copy(mc, &mn); @@ -8749,7 +9382,7 @@ mdb_page_split(MDB_cursor *mc, MDB_val *newkey, MDB_val *newdata, pgno_t newpgno * the split so the new page is emptier than the old page. * This yields better packing during sequential inserts. */ - if (nkeys < 32 || nsize > pmax/16 || newindx >= nkeys) { + if (nkeys < 20 || nsize > pmax/16 || newindx >= nkeys) { /* Find split point */ psize = 0; if (newindx <= split_indx || newindx >= nkeys) { @@ -8985,8 +9618,12 @@ mdb_page_split(MDB_cursor *mc, MDB_val *newkey, MDB_val *newdata, pgno_t newpgno m3->mc_ki[ptop] >= mc->mc_ki[ptop]) { m3->mc_ki[ptop]++; } - if (IS_LEAF(mp)) - XCURSOR_REFRESH(m3, mc->mc_top, m3->mc_pg[mc->mc_top]); + if (m3->mc_xcursor && (m3->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED) && + IS_LEAF(mp)) { + MDB_node *node = NODEPTR(m3->mc_pg[mc->mc_top], m3->mc_ki[mc->mc_top]); + if ((node->mn_flags & (F_DUPDATA|F_SUBDATA)) == F_DUPDATA) + m3->mc_xcursor->mx_cursor.mc_pg[0] = NODEDATA(node); + } } } DPRINTF(("mp left: %d, rp left: %d", SIZELEFT(mp), SIZELEFT(rp))); @@ -9027,26 +9664,23 @@ mdb_put(MDB_txn *txn, MDB_dbi dbi, #ifndef MDB_WBUF #define MDB_WBUF (1024*1024) #endif -#define MDB_EOF 0x10 /**< #mdb_env_copyfd1() is done reading */ - /** State needed for a double-buffering compacting copy. 
*/ + /** State needed for a compacting copy. */ typedef struct mdb_copy { - MDB_env *mc_env; - MDB_txn *mc_txn; pthread_mutex_t mc_mutex; - pthread_cond_t mc_cond; /**< Condition variable for #mc_new */ + pthread_cond_t mc_cond; char *mc_wbuf[2]; char *mc_over[2]; + MDB_env *mc_env; + MDB_txn *mc_txn; int mc_wlen[2]; int mc_olen[2]; pgno_t mc_next_pgno; HANDLE mc_fd; - int mc_toggle; /**< Buffer number in provider */ - int mc_new; /**< (0-2 buffers to write) | (#MDB_EOF at end) */ - /** Error code. Never cleared if set. Both threads can set nonzero - * to fail the copy. Not mutex-protected, LMDB expects atomic int. - */ - volatile int mc_error; + int mc_status; + volatile int mc_new; + int mc_toggle; + } mdb_copy; /** Dedicated writer thread for compacting copy. */ @@ -9062,38 +9696,26 @@ mdb_env_copythr(void *arg) #else int len; #define DO_WRITE(rc, fd, ptr, w2, len) len = write(fd, ptr, w2); rc = (len >= 0) -#ifdef SIGPIPE - sigset_t set; - sigemptyset(&set); - sigaddset(&set, SIGPIPE); - if ((rc = pthread_sigmask(SIG_BLOCK, &set, NULL)) != 0) - my->mc_error = rc; -#endif #endif pthread_mutex_lock(&my->mc_mutex); + my->mc_new = 0; + pthread_cond_signal(&my->mc_cond); for(;;) { while (!my->mc_new) pthread_cond_wait(&my->mc_cond, &my->mc_mutex); - if (my->mc_new == 0 + MDB_EOF) /* 0 buffers, just EOF */ + if (my->mc_new < 0) { + my->mc_new = 0; break; + } + my->mc_new = 0; wsize = my->mc_wlen[toggle]; ptr = my->mc_wbuf[toggle]; again: - rc = MDB_SUCCESS; - while (wsize > 0 && !my->mc_error) { + while (wsize > 0) { DO_WRITE(rc, my->mc_fd, ptr, wsize, len); if (!rc) { rc = ErrCode(); -#if defined(SIGPIPE) && !defined(_WIN32) - if (rc == EPIPE) { - /* Collect the pending SIGPIPE, otherwise at least OS X - * gives it to the process on thread-exit (ITS#8504). 
- */ - int tmp; - sigwait(&set, &tmp); - } -#endif break; } else if (len > 0) { rc = MDB_SUCCESS; @@ -9106,7 +9728,8 @@ again: } } if (rc) { - my->mc_error = rc; + my->mc_status = rc; + break; } /* If there's an overflow page tail, write it too */ if (my->mc_olen[toggle]) { @@ -9117,45 +9740,38 @@ again: } my->mc_wlen[toggle] = 0; toggle ^= 1; - /* Return the empty buffer to provider */ - my->mc_new--; pthread_cond_signal(&my->mc_cond); } + pthread_cond_signal(&my->mc_cond); pthread_mutex_unlock(&my->mc_mutex); return (THREAD_RET)0; #undef DO_WRITE } - /** Give buffer and/or #MDB_EOF to writer thread, await unused buffer. - * - * @param[in] my control structure. - * @param[in] adjust (1 to hand off 1 buffer) | (MDB_EOF when ending). - */ + /** Tell the writer thread there's a buffer ready to write */ static int ESECT -mdb_env_cthr_toggle(mdb_copy *my, int adjust) +mdb_env_cthr_toggle(mdb_copy *my, int st) { + int toggle = my->mc_toggle ^ 1; pthread_mutex_lock(&my->mc_mutex); - my->mc_new += adjust; - pthread_cond_signal(&my->mc_cond); - while (my->mc_new & 2) /* both buffers in use */ + if (my->mc_status) { + pthread_mutex_unlock(&my->mc_mutex); + return my->mc_status; + } + while (my->mc_new == 1) pthread_cond_wait(&my->mc_cond, &my->mc_mutex); + my->mc_new = st; + my->mc_toggle = toggle; + pthread_cond_signal(&my->mc_cond); pthread_mutex_unlock(&my->mc_mutex); - - my->mc_toggle ^= (adjust & 1); - /* Both threads reset mc_wlen, to be safe from threading errors */ - my->mc_wlen[my->mc_toggle] = 0; - return my->mc_error; + return 0; } - /** Depth-first tree traversal for compacting copy. - * @param[in] my control structure. - * @param[in,out] pg database root. - * @param[in] flags includes #F_DUPDATA if it is a sorted-duplicate sub-DB. - */ + /** Depth-first tree traversal for compacting copy. 
*/ static int ESECT mdb_env_cwalk(mdb_copy *my, pgno_t *pg, int flags) { - MDB_cursor mc = {0}; + MDB_cursor mc; MDB_node *ni; MDB_page *mo, *mp, *leaf; char *buf, *ptr; @@ -9167,6 +9783,7 @@ mdb_env_cwalk(mdb_copy *my, pgno_t *pg, int flags) return MDB_SUCCESS; mc.mc_snum = 1; + mc.mc_top = 0; mc.mc_txn = my->mc_txn; rc = mdb_page_get(&mc, *pg, &mc.mc_pg[0], NULL); @@ -9213,7 +9830,6 @@ mdb_env_cwalk(mdb_copy *my, pgno_t *pg, int flags) } memcpy(&pg, NODEDATA(ni), sizeof(pg)); - memcpy(NODEDATA(ni), &my->mc_next_pgno, sizeof(pgno_t)); rc = mdb_page_get(&mc, pg, &omp, NULL); if (rc) goto done; @@ -9236,6 +9852,7 @@ mdb_env_cwalk(mdb_copy *my, pgno_t *pg, int flags) goto done; toggle = my->mc_toggle; } + memcpy(NODEDATA(ni), &mo->mp_pgno, sizeof(pgno_t)); } else if (ni->mn_flags & F_SUBDATA) { MDB_db db; @@ -9313,56 +9930,47 @@ mdb_env_copyfd1(MDB_env *env, HANDLE fd) { MDB_meta *mm; MDB_page *mp; - mdb_copy my = {0}; + mdb_copy my; MDB_txn *txn = NULL; pthread_t thr; - pgno_t root, new_root; - int rc = MDB_SUCCESS; + int rc; #ifdef _WIN32 - if (!(my.mc_mutex = CreateMutex(NULL, FALSE, NULL)) || - !(my.mc_cond = CreateEvent(NULL, FALSE, FALSE, NULL))) { - rc = ErrCode(); - goto done; - } + my.mc_mutex = CreateMutex(NULL, FALSE, NULL); + my.mc_cond = CreateEvent(NULL, FALSE, FALSE, NULL); my.mc_wbuf[0] = _aligned_malloc(MDB_WBUF*2, env->me_os_psize); - if (my.mc_wbuf[0] == NULL) { - /* _aligned_malloc() sets errno, but we use Windows error codes */ - rc = ERROR_NOT_ENOUGH_MEMORY; - goto done; - } + if (my.mc_wbuf[0] == NULL) + return errno; #else - if ((rc = pthread_mutex_init(&my.mc_mutex, NULL)) != 0) - return rc; - if ((rc = pthread_cond_init(&my.mc_cond, NULL)) != 0) - goto done2; + pthread_mutex_init(&my.mc_mutex, NULL); + pthread_cond_init(&my.mc_cond, NULL); #ifdef HAVE_MEMALIGN my.mc_wbuf[0] = memalign(env->me_os_psize, MDB_WBUF*2); - if (my.mc_wbuf[0] == NULL) { - rc = errno; - goto done; - } + if (my.mc_wbuf[0] == NULL) + return errno; #else - { - void *p; 
- if ((rc = posix_memalign(&p, env->me_os_psize, MDB_WBUF*2)) != 0) - goto done; - my.mc_wbuf[0] = p; - } + rc = posix_memalign((void **)&my.mc_wbuf[0], env->me_os_psize, MDB_WBUF*2); + if (rc) + return rc; #endif #endif memset(my.mc_wbuf[0], 0, MDB_WBUF*2); my.mc_wbuf[1] = my.mc_wbuf[0] + MDB_WBUF; + my.mc_wlen[0] = 0; + my.mc_wlen[1] = 0; + my.mc_olen[0] = 0; + my.mc_olen[1] = 0; my.mc_next_pgno = NUM_METAS; + my.mc_status = 0; + my.mc_new = 1; + my.mc_toggle = 0; my.mc_env = env; my.mc_fd = fd; - rc = THREAD_CREATE(thr, mdb_env_copythr, &my); - if (rc) - goto done; + THREAD_CREATE(thr, mdb_env_copythr, &my); rc = mdb_txn_begin(env, NULL, MDB_RDONLY, &txn); if (rc) - goto finish; + return rc; mp = (MDB_page *)my.mc_wbuf[0]; memset(mp, 0, NUM_METAS * env->me_psize); @@ -9378,64 +9986,57 @@ mdb_env_copyfd1(MDB_env *env, HANDLE fd) *(MDB_meta *)METADATA(mp) = *mm; mm = (MDB_meta *)METADATA(mp); - /* Set metapage 1 with current main DB */ - root = new_root = txn->mt_dbs[MAIN_DBI].md_root; - if (root != P_INVALID) { - /* Count free pages + freeDB pages. Subtract from last_pg - * to find the new last_pg, which also becomes the new root. - */ + /* Count the number of free pages, subtract from lastpg to find + * number of active pages + */ + { MDB_ID freecount = 0; MDB_cursor mc; MDB_val key, data; mdb_cursor_init(&mc, txn, FREE_DBI, NULL); while ((rc = mdb_cursor_get(&mc, &key, &data, MDB_NEXT)) == 0) freecount += *(MDB_ID *)data.mv_data; - if (rc != MDB_NOTFOUND) - goto finish; freecount += txn->mt_dbs[FREE_DBI].md_branch_pages + txn->mt_dbs[FREE_DBI].md_leaf_pages + txn->mt_dbs[FREE_DBI].md_overflow_pages; - new_root = txn->mt_next_pgno - 1 - freecount; - mm->mm_last_pg = new_root; + /* Set metapage 1 */ + mm->mm_last_pg = txn->mt_next_pgno - freecount - 1; mm->mm_dbs[MAIN_DBI] = txn->mt_dbs[MAIN_DBI]; - mm->mm_dbs[MAIN_DBI].md_root = new_root; - } else { - /* When the DB is empty, handle it specially to - * fix any breakage like page leaks from ITS#8174. 
- */ - mm->mm_dbs[MAIN_DBI].md_flags = txn->mt_dbs[MAIN_DBI].md_flags; + if (mm->mm_last_pg > NUM_METAS-1) { + mm->mm_dbs[MAIN_DBI].md_root = mm->mm_last_pg; + mm->mm_txnid = 1; + } else { + mm->mm_dbs[MAIN_DBI].md_root = P_INVALID; + } } - if (root != P_INVALID || mm->mm_dbs[MAIN_DBI].md_flags) { - mm->mm_txnid = 1; /* use metapage 1 */ - } - my.mc_wlen[0] = env->me_psize * NUM_METAS; my.mc_txn = txn; - rc = mdb_env_cwalk(&my, &root, 0); - if (rc == MDB_SUCCESS && root != new_root) { - rc = MDB_INCOMPATIBLE; /* page leak or corrupt DB */ - } + pthread_mutex_lock(&my.mc_mutex); + while(my.mc_new) + pthread_cond_wait(&my.mc_cond, &my.mc_mutex); + pthread_mutex_unlock(&my.mc_mutex); + rc = mdb_env_cwalk(&my, &txn->mt_dbs[MAIN_DBI].md_root, 0); + if (rc == MDB_SUCCESS && my.mc_wlen[my.mc_toggle]) + rc = mdb_env_cthr_toggle(&my, 1); + mdb_env_cthr_toggle(&my, -1); + pthread_mutex_lock(&my.mc_mutex); + while(my.mc_new) + pthread_cond_wait(&my.mc_cond, &my.mc_mutex); + pthread_mutex_unlock(&my.mc_mutex); + THREAD_FINISH(thr); -finish: - if (rc) - my.mc_error = rc; - mdb_env_cthr_toggle(&my, 1 | MDB_EOF); - rc = THREAD_FINISH(thr); mdb_txn_abort(txn); - -done: #ifdef _WIN32 - if (my.mc_wbuf[0]) _aligned_free(my.mc_wbuf[0]); - if (my.mc_cond) CloseHandle(my.mc_cond); - if (my.mc_mutex) CloseHandle(my.mc_mutex); + CloseHandle(my.mc_cond); + CloseHandle(my.mc_mutex); + _aligned_free(my.mc_wbuf[0]); #else - free(my.mc_wbuf[0]); pthread_cond_destroy(&my.mc_cond); -done2: pthread_mutex_destroy(&my.mc_mutex); + free(my.mc_wbuf[0]); #endif - return rc ? rc : my.mc_error; + return rc; } /** Copy environment as-is. 
*/ @@ -9445,7 +10046,7 @@ mdb_env_copyfd0(MDB_env *env, HANDLE fd) MDB_txn *txn = NULL; mdb_mutexref_t wmutex = NULL; int rc; - size_t wsize, w3; + mdb_size_t wsize, w3; char *ptr; #ifdef _WIN32 DWORD len, w2; @@ -9506,7 +10107,7 @@ mdb_env_copyfd0(MDB_env *env, HANDLE fd) w3 = txn->mt_next_pgno * env->me_psize; { - size_t fsize = 0; + mdb_size_t fsize = 0; if ((rc = mdb_fsize(env->me_fd, &fsize))) goto leave; if (w3 > fsize) @@ -9556,20 +10157,67 @@ mdb_env_copyfd(MDB_env *env, HANDLE fd) int ESECT mdb_env_copy2(MDB_env *env, const char *path, unsigned int flags) { - int rc; - MDB_name fname; + int rc, len; + char *lpath; HANDLE newfd = INVALID_HANDLE_VALUE; +#ifdef _WIN32 + wchar_t *wpath; +#endif - rc = mdb_fname_init(path, env->me_flags | MDB_NOLOCK, &fname); - if (rc == MDB_SUCCESS) { - rc = mdb_fopen(env, &fname, MDB_O_COPY, 0666, &newfd); - mdb_fname_destroy(fname); + if (env->me_flags & MDB_NOSUBDIR) { + lpath = (char *)path; + } else { + len = strlen(path); + len += sizeof(DATANAME); + lpath = malloc(len); + if (!lpath) + return ENOMEM; + sprintf(lpath, "%s" DATANAME, path); } - if (rc == MDB_SUCCESS) { - rc = mdb_env_copyfd2(env, newfd, flags); + + /* The destination path must exist, but the destination file must not. + * We don't want the OS to cache the writes, since the source data is + * already in the OS cache. 
+ */ +#ifdef _WIN32 + rc = utf8_to_utf16(lpath, -1, &wpath, NULL); + if (rc) + goto leave; + newfd = CreateFileW(wpath, GENERIC_WRITE, 0, NULL, CREATE_NEW, + FILE_FLAG_NO_BUFFERING|FILE_FLAG_WRITE_THROUGH, NULL); + free(wpath); +#else + newfd = open(lpath, O_WRONLY|O_CREAT|O_EXCL, 0666); +#endif + if (newfd == INVALID_HANDLE_VALUE) { + rc = ErrCode(); + goto leave; + } + + if (env->me_psize >= env->me_os_psize) { +#ifdef O_DIRECT + /* Set O_DIRECT if the file system supports it */ + if ((rc = fcntl(newfd, F_GETFL)) != -1) + (void) fcntl(newfd, F_SETFL, rc | O_DIRECT); +#endif +#ifdef F_NOCACHE /* __APPLE__ */ + rc = fcntl(newfd, F_NOCACHE, 1); + if (rc) { + rc = ErrCode(); + goto leave; + } +#endif + } + + rc = mdb_env_copyfd2(env, newfd, flags); + +leave: + if (!(env->me_flags & MDB_NOSUBDIR)) + free(lpath); + if (newfd != INVALID_HANDLE_VALUE) if (close(newfd) < 0 && rc == MDB_SUCCESS) rc = ErrCode(); - } + return rc; } @@ -9791,11 +10439,8 @@ int mdb_dbi_open(MDB_txn *txn, const char *name, unsigned int flags, MDB_dbi *db MDB_node *node = NODEPTR(mc.mc_pg[mc.mc_top], mc.mc_ki[mc.mc_top]); if ((node->mn_flags & (F_DUPDATA|F_SUBDATA)) != F_SUBDATA) return MDB_INCOMPATIBLE; - } else { - if (rc != MDB_NOTFOUND || !(flags & MDB_CREATE)) - return rc; - if (F_ISSET(txn->mt_flags, MDB_TXN_RDONLY)) - return EACCES; + } else if (! 
(rc == MDB_NOTFOUND && (flags & MDB_CREATE))) { + return rc; } /* Done here so we cannot fail after creating a new DB */ @@ -9809,8 +10454,7 @@ int mdb_dbi_open(MDB_txn *txn, const char *name, unsigned int flags, MDB_dbi *db memset(&dummy, 0, sizeof(dummy)); dummy.md_root = P_INVALID; dummy.md_flags = flags & PERSISTENT_FLAGS; - WITH_CURSOR_TRACKING(mc, - rc = mdb_cursor_put(&mc, &key, &data, F_SUBDATA)); + rc = mdb_cursor_put(&mc, &key, &data, F_SUBDATA); dbflag |= DB_DIRTY; } @@ -9910,6 +10554,11 @@ mdb_drop0(MDB_cursor *mc, int subs) mdb_cursor_pop(mc); mdb_cursor_copy(mc, &mx); +#ifdef MDB_VL32 + /* bump refcount for mx's pages */ + for (i=0; imc_snum; i++) + mdb_page_get(&mx, mc->mc_pg[i]->mp_pgno, &mx.mc_pg[i], NULL); +#endif while (mc->mc_snum > 0) { MDB_page *mp = mc->mc_pg[mc->mc_top]; unsigned n = NUMKEYS(mp); @@ -9975,6 +10624,10 @@ pop: done: if (rc) txn->mt_flags |= MDB_TXN_ERROR; +#ifdef MDB_VL32 + /* drop refcount for mx's pages */ + mdb_cursor_unref(&mx); +#endif } else if (rc == MDB_NOTFOUND) { rc = MDB_SUCCESS; } @@ -10094,7 +10747,7 @@ mdb_reader_list(MDB_env *env, MDB_msg_func *func, void *ctx) if (mr[i].mr_pid) { txnid_t txnid = mr[i].mr_txnid; sprintf(buf, txnid == (txnid_t)-1 ? - "%10d %"Z"x -\n" : "%10d %"Z"x %"Z"u\n", + "%10d %"Z"x -\n" : "%10d %"Z"x %"Y"u\n", (int)mr[i].mr_pid, (size_t)mr[i].mr_tid, txnid); if (first) { first = 0; @@ -10163,7 +10816,7 @@ mdb_reader_check(MDB_env *env, int *dead) return env->me_txns ? mdb_reader_check0(env, 0, dead) : MDB_SUCCESS; } -/** As #mdb_reader_check(). \b rlocked is set if caller locked #me_rmutex. */ +/** As #mdb_reader_check(). rlocked = . 
*/ static int ESECT mdb_reader_check0(MDB_env *env, int rlocked, int *dead) { @@ -10199,7 +10852,7 @@ mdb_reader_check0(MDB_env *env, int rlocked, int *dead) } for (; jmn_alloced = 1; - dst->mn_len = need - 1; - dst->mn_val = result; - return MDB_SUCCESS; - } + int need; + wchar_t *result; + need = MultiByteToWideChar(CP_UTF8, 0, src, srcsize, NULL, 0); + if (need == 0xFFFD) + return EILSEQ; + if (need == 0) + return EINVAL; + result = malloc(sizeof(wchar_t) * need); + if (!result) + return ENOMEM; + MultiByteToWideChar(CP_UTF8, 0, src, srcsize, result, need); + if (dstsize) + *dstsize = need; + *dst = result; + return 0; } #endif /* defined(_WIN32) */ -/** @} */ diff --git a/contrib/db/liblmdb/mdb_copy.1 b/contrib/db/liblmdb/mdb_copy.1 index 594ff124..401e47ab 100644 --- a/contrib/db/liblmdb/mdb_copy.1 +++ b/contrib/db/liblmdb/mdb_copy.1 @@ -1,5 +1,5 @@ -.TH MDB_COPY 1 "2014/07/01" "LMDB 0.9.14" -.\" Copyright 2012-2018 Howard Chu, Symas Corp. All Rights Reserved. +.TH MDB_COPY 1 "2014/06/20" "LMDB 0.9.14" +.\" Copyright 2012-2015 Howard Chu, Symas Corp. All Rights Reserved. .\" Copying restrictions apply. See COPYRIGHT/LICENSE. .SH NAME mdb_copy \- LMDB environment copy tool @@ -11,6 +11,8 @@ mdb_copy \- LMDB environment copy tool .BR \-c ] [\c .BR \-n ] +[\c +.BR \-v ] .B srcpath [\c .BR dstpath ] @@ -36,10 +38,13 @@ Write the library version number to the standard output, and exit. Compact while copying. Only current data pages will be copied; freed or unused pages will be omitted from the copy. This option will slow down the backup process as it is more CPU-intensive. -Currently it fails if the environment has suffered a page leak. .TP .BR \-n Open LDMB environment(s) which do not use subdirectories. +.TP +.BR \-v +Use the previous environment state instead of the latest state. +This may be useful if the latest state has been corrupted. .SH DIAGNOSTICS Exit status is zero if no errors occur. 
diff --git a/contrib/db/liblmdb/mdb_copy.c b/contrib/db/liblmdb/mdb_copy.c index 1b89396e..95a6e713 100644 --- a/contrib/db/liblmdb/mdb_copy.c +++ b/contrib/db/liblmdb/mdb_copy.c @@ -1,6 +1,6 @@ /* mdb_copy.c - memory-mapped database backup tool */ /* - * Copyright 2012-2018 Howard Chu, Symas Corp. + * Copyright 2012-2015 Howard Chu, Symas Corp. * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -38,6 +38,8 @@ int main(int argc,char * argv[]) for (; argc > 1 && argv[1][0] == '-'; argc--, argv++) { if (argv[1][1] == 'n' && argv[1][2] == '\0') flags |= MDB_NOSUBDIR; + else if (argv[1][1] == 'v' && argv[1][2] == '\0') + flags |= MDB_PREVSNAPSHOT; else if (argv[1][1] == 'c' && argv[1][2] == '\0') cpflags |= MDB_CP_COMPACT; else if (argv[1][1] == 'V' && argv[1][2] == '\0') { @@ -48,7 +50,7 @@ int main(int argc,char * argv[]) } if (argc<2 || argc>3) { - fprintf(stderr, "usage: %s [-V] [-c] [-n] srcpath [dstpath]\n", progname); + fprintf(stderr, "usage: %s [-V] [-c] [-n] [-v] srcpath [dstpath]\n", progname); exit(EXIT_FAILURE); } diff --git a/contrib/db/liblmdb/mdb_dump.1 b/contrib/db/liblmdb/mdb_dump.1 index 72cf6ca8..a25fb92e 100644 --- a/contrib/db/liblmdb/mdb_dump.1 +++ b/contrib/db/liblmdb/mdb_dump.1 @@ -1,5 +1,5 @@ -.TH MDB_DUMP 1 "2015/09/30" "LMDB 0.9.17" -.\" Copyright 2014-2018 Howard Chu, Symas Corp. All Rights Reserved. +.TH MDB_DUMP 1 "2014/06/20" "LMDB 0.9.14" +.\" Copyright 2014-2015 Howard Chu, Symas Corp. All Rights Reserved. .\" Copying restrictions apply. See COPYRIGHT/LICENSE. .SH NAME mdb_dump \- LMDB environment export tool @@ -14,6 +14,8 @@ mdb_dump \- LMDB environment export tool [\c .BR \-n ] [\c +.BR \-v ] +[\c .BR \-p ] [\c .BR \-a \ | @@ -42,6 +44,10 @@ names will be listed, no data will be output. .BR \-n Dump an LMDB database which does not use subdirectories. .TP +.BR \-v +Use the previous environment state instead of the latest state. +This may be useful if the latest state has been corrupted. 
+.TP .BR \-p If characters in either the key or data items are printing characters (as defined by isprint(3)), output them directly. This option permits users to diff --git a/contrib/db/liblmdb/mdb_dump.c b/contrib/db/liblmdb/mdb_dump.c index 9df5dc0b..7a42bc0b 100644 --- a/contrib/db/liblmdb/mdb_dump.c +++ b/contrib/db/liblmdb/mdb_dump.c @@ -1,6 +1,6 @@ /* mdb_dump.c - memory-mapped database dump tool */ /* - * Copyright 2011-2018 Howard Chu, Symas Corp. + * Copyright 2011-2015 Howard Chu, Symas Corp. * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -25,6 +25,15 @@ #else #define Z "z" #endif +#ifdef MDB_VL32 +#ifdef _WIN32 +#define Y "I64" +#else +#define Y "ll" +#endif +#else +#define Y Z +#endif #define PRINT 1 static int mode; @@ -115,7 +124,7 @@ static int dumpit(MDB_txn *txn, MDB_dbi dbi, char *name) if (name) printf("database=%s\n", name); printf("type=btree\n"); - printf("mapsize=%" Z "u\n", info.me_mapsize); + printf("mapsize=%" Y "u\n", info.me_mapsize); if (info.me_mapaddr) printf("mapaddr=%p\n", info.me_mapaddr); printf("maxreaders=%u\n", info.me_maxreaders); @@ -155,7 +164,7 @@ static int dumpit(MDB_txn *txn, MDB_dbi dbi, char *name) static void usage(char *prog) { - fprintf(stderr, "usage: %s [-V] [-f output] [-l] [-n] [-p] [-a|-s subdb] dbpath\n", prog); + fprintf(stderr, "usage: %s [-V] [-f output] [-l] [-n] [-p] [-v] [-a|-s subdb] dbpath\n", prog); exit(EXIT_FAILURE); } @@ -179,6 +188,7 @@ int main(int argc, char *argv[]) * -n: use NOSUBDIR flag on env_open * -p: use printable characters * -f: write to file instead of stdout + * -v: use previous snapshot * -V: print version and exit * (default) dump only the main DB */ @@ -206,6 +216,9 @@ int main(int argc, char *argv[]) case 'n': envflags |= MDB_NOSUBDIR; break; + case 'v': + envflags |= MDB_PREVSNAPSHOT; + break; case 'p': mode |= PRINT; break; diff --git a/contrib/db/liblmdb/mdb_load.1 b/contrib/db/liblmdb/mdb_load.1 index 998acc12..ede3702d 100644 
--- a/contrib/db/liblmdb/mdb_load.1 +++ b/contrib/db/liblmdb/mdb_load.1 @@ -1,5 +1,5 @@ -.TH MDB_LOAD 1 "2015/09/30" "LMDB 0.9.17" -.\" Copyright 2014-2018 Howard Chu, Symas Corp. All Rights Reserved. +.TH MDB_LOAD 1 "2014/06/20" "LMDB 0.9.14" +.\" Copyright 2014-2015 Howard Chu, Symas Corp. All Rights Reserved. .\" Copying restrictions apply. See COPYRIGHT/LICENSE. .SH NAME mdb_load \- LMDB environment import tool @@ -37,6 +37,13 @@ option below. .BR \-V Write the library version number to the standard output, and exit. .TP +.BR \-a +Append all records in the order they appear in the input. The input is assumed to already be +in correctly sorted order and no sorting or checking for redundant values will be performed. +This option must be used to reload data that was produced by running +.B mdb_dump +on a database that uses custom compare functions. +.TP .BR \-f \ file Read from the specified file instead of from the standard input. .TP diff --git a/contrib/db/liblmdb/mdb_load.c b/contrib/db/liblmdb/mdb_load.c index 0f177f1e..797c2f97 100644 --- a/contrib/db/liblmdb/mdb_load.c +++ b/contrib/db/liblmdb/mdb_load.c @@ -1,6 +1,6 @@ /* mdb_load.c - memory-mapped database load tool */ /* - * Copyright 2011-2018 Howard Chu, Symas Corp. + * Copyright 2011-2015 Howard Chu, Symas Corp. * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without @@ -37,12 +37,22 @@ static int Eof; static MDB_envinfo info; static MDB_val kbuf, dbuf; +static MDB_val k0buf; #ifdef _WIN32 #define Z "I" #else #define Z "z" #endif +#ifdef MDB_VL32 +#ifdef _WIN32 +#define Y "I64" +#else +#define Y "ll" +#endif +#else +#define Y Z +#endif #define STRLENOF(s) (sizeof(s)-1) @@ -68,7 +78,6 @@ static void readhdr(void) { char *ptr; - flags = 0; while (fgets(dbuf.mv_data, dbuf.mv_size, stdin) != NULL) { lineno++; if (!strncmp(dbuf.mv_data, "VERSION=", STRLENOF("VERSION="))) { @@ -113,7 +122,7 @@ static void readhdr(void) int i; ptr = memchr(dbuf.mv_data, '\n', dbuf.mv_size); if (ptr) *ptr = '\0'; - i = sscanf((char *)dbuf.mv_data+STRLENOF("mapsize="), "%" Z "u", &info.me_mapsize); + i = sscanf((char *)dbuf.mv_data+STRLENOF("mapsize="), "%" Y "u", &info.me_mapsize); if (i != 1) { fprintf(stderr, "%s: line %" Z "d: invalid mapsize %s\n", prog, lineno, (char *)dbuf.mv_data+STRLENOF("mapsize=")); @@ -249,8 +258,7 @@ badend: c2 += 2; } } else { - /* copies are redundant when no escapes were used */ - *c1++ = *c2++; + c1++; c2++; } } } else { @@ -278,10 +286,15 @@ badend: static void usage(void) { - fprintf(stderr, "usage: %s [-V] [-f input] [-n] [-s name] [-N] [-T] dbpath\n", prog); + fprintf(stderr, "usage: %s [-V] [-a] [-f input] [-n] [-s name] [-N] [-T] dbpath\n", prog); exit(EXIT_FAILURE); } +static int greater(const MDB_val *a, const MDB_val *b) +{ + return 1; +} + int main(int argc, char *argv[]) { int i, rc; @@ -291,7 +304,8 @@ int main(int argc, char *argv[]) MDB_dbi dbi; char *envname; int envflags = 0, putflags = 0; - int dohdr = 0; + int dohdr = 0, append = 0; + MDB_val prevk; prog = argv[0]; @@ -299,19 +313,23 @@ int main(int argc, char *argv[]) usage(); } - /* -f: load file instead of stdin + /* -a: append records in input order + * -f: load file instead of stdin * -n: use NOSUBDIR flag on env_open * -s: load into named subDB * -N: use NOOVERWRITE on puts * 
-T: read plaintext * -V: print version and exit */ - while ((i = getopt(argc, argv, "f:ns:NTV")) != EOF) { + while ((i = getopt(argc, argv, "af:ns:NTV")) != EOF) { switch(i) { case 'V': printf("%s\n", MDB_VERSION_STRING); exit(0); break; + case 'a': + append = 1; + break; case 'f': if (freopen(optarg, "r", stdin) == NULL) { fprintf(stderr, "%s: %s: reopen: %s\n", @@ -370,11 +388,17 @@ int main(int argc, char *argv[]) } kbuf.mv_size = mdb_env_get_maxkeysize(env) * 2 + 2; - kbuf.mv_data = malloc(kbuf.mv_size); + kbuf.mv_data = malloc(kbuf.mv_size * 2); + k0buf.mv_size = kbuf.mv_size; + k0buf.mv_data = (char *)kbuf.mv_data + kbuf.mv_size; + prevk.mv_size = 0; + prevk.mv_data = k0buf.mv_data; while(!Eof) { MDB_val key, data; int batch = 0; + flags = 0; + int appflag; if (!dohdr) { dohdr = 1; @@ -392,6 +416,11 @@ int main(int argc, char *argv[]) fprintf(stderr, "mdb_open failed, error %d %s\n", rc, mdb_strerror(rc)); goto txn_abort; } + if (append) { + mdb_set_compare(txn, dbi, greater); + if (flags & MDB_DUPSORT) + mdb_set_dupsort(txn, dbi, greater); + } rc = mdb_cursor_open(txn, dbi, &mc); if (rc) { @@ -410,7 +439,20 @@ int main(int argc, char *argv[]) goto txn_abort; } - rc = mdb_cursor_put(mc, &key, &data, putflags); + if (append) { + appflag = MDB_APPEND; + if (flags & MDB_DUPSORT) { + if (prevk.mv_size == key.mv_size && !memcmp(prevk.mv_data, key.mv_data, key.mv_size)) + appflag = MDB_APPENDDUP; + else { + memcpy(prevk.mv_data, key.mv_data, key.mv_size); + prevk.mv_size = key.mv_size; + } + } + } else { + appflag = 0; + } + rc = mdb_cursor_put(mc, &key, &data, putflags|appflag); if (rc == MDB_KEYEXIST && putflags) continue; if (rc) { diff --git a/contrib/db/liblmdb/mdb_stat.1 b/contrib/db/liblmdb/mdb_stat.1 index 7c3f2846..bf49bd3b 100644 --- a/contrib/db/liblmdb/mdb_stat.1 +++ b/contrib/db/liblmdb/mdb_stat.1 @@ -1,5 +1,5 @@ -.TH MDB_STAT 1 "2015/09/30" "LMDB 0.9.17" -.\" Copyright 2012-2018 Howard Chu, Symas Corp. All Rights Reserved. 
+.TH MDB_STAT 1 "2014/06/20" "LMDB 0.9.14" +.\" Copyright 2012-2015 Howard Chu, Symas Corp. All Rights Reserved. .\" Copying restrictions apply. See COPYRIGHT/LICENSE. .SH NAME mdb_stat \- LMDB environment status tool @@ -14,6 +14,8 @@ mdb_stat \- LMDB environment status tool [\c .BR \-n ] [\c +.BR \-v ] +[\c .BR \-r [ r ]] [\c .BR \-a \ | @@ -39,6 +41,10 @@ If \fB\-fff\fP is given, display the full list of page IDs in the freelist. .BR \-n Display the status of an LMDB database which does not use subdirectories. .TP +.BR \-v +Use the previous environment state instead of the latest state. +This may be useful if the latest state has been corrupted. +.TP .BR \-r Display information about the environment reader table. Shows the process ID, thread ID, and transaction ID for each active diff --git a/contrib/db/liblmdb/mdb_stat.c b/contrib/db/liblmdb/mdb_stat.c index f4c0dc1e..30ec81fe 100644 --- a/contrib/db/liblmdb/mdb_stat.c +++ b/contrib/db/liblmdb/mdb_stat.c @@ -1,6 +1,6 @@ /* mdb_stat.c - memory-mapped database status tool */ /* - * Copyright 2011-2018 Howard Chu, Symas Corp. + * Copyright 2011-2015 Howard Chu, Symas Corp. * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without @@ -22,6 +22,15 @@ #else #define Z "z" #endif +#ifdef MDB_VL32 +#ifdef _WIN32 +#define Y "I64" +#else +#define Y "ll" +#endif +#else +#define Y Z +#endif static void prstat(MDB_stat *ms) { @@ -29,15 +38,15 @@ static void prstat(MDB_stat *ms) printf(" Page size: %u\n", ms->ms_psize); #endif printf(" Tree depth: %u\n", ms->ms_depth); - printf(" Branch pages: %"Z"u\n", ms->ms_branch_pages); - printf(" Leaf pages: %"Z"u\n", ms->ms_leaf_pages); - printf(" Overflow pages: %"Z"u\n", ms->ms_overflow_pages); - printf(" Entries: %"Z"u\n", ms->ms_entries); + printf(" Branch pages: %"Y"u\n", ms->ms_branch_pages); + printf(" Leaf pages: %"Y"u\n", ms->ms_leaf_pages); + printf(" Overflow pages: %"Y"u\n", ms->ms_overflow_pages); + printf(" Entries: %"Y"u\n", ms->ms_entries); } static void usage(char *prog) { - fprintf(stderr, "usage: %s [-V] [-n] [-e] [-r[r]] [-f[f[f]]] [-a|-s subdb] dbpath\n", prog); + fprintf(stderr, "usage: %s [-V] [-n] [-e] [-r[r]] [-f[f[f]]] [-v] [-a|-s subdb] dbpath\n", prog); exit(EXIT_FAILURE); } @@ -64,6 +73,7 @@ int main(int argc, char *argv[]) * -f: print freelist info * -r: print reader info * -n: use NOSUBDIR flag on env_open + * -v: use previous snapshot * -V: print version and exit * (default) print stat of only the main DB */ @@ -87,6 +97,9 @@ int main(int argc, char *argv[]) case 'n': envflags |= MDB_NOSUBDIR; break; + case 'v': + envflags |= MDB_PREVSNAPSHOT; + break; case 'r': rdrinfo++; break; @@ -125,11 +138,11 @@ int main(int argc, char *argv[]) (void)mdb_env_info(env, &mei); printf("Environment Info\n"); printf(" Map address: %p\n", mei.me_mapaddr); - printf(" Map size: %"Z"u\n", mei.me_mapsize); + printf(" Map size: %"Y"u\n", mei.me_mapsize); printf(" Page size: %u\n", mst.ms_psize); - printf(" Max pages: %"Z"u\n", mei.me_mapsize / mst.ms_psize); - printf(" Number of pages used: %"Z"u\n", mei.me_last_pgno+1); - printf(" Last transaction ID: %"Z"u\n", mei.me_last_txnid); 
+ printf(" Max pages: %"Y"u\n", mei.me_mapsize / mst.ms_psize); + printf(" Number of pages used: %"Y"u\n", mei.me_last_pgno+1); + printf(" Last transaction ID: %"Y"u\n", mei.me_last_txnid); printf(" Max readers: %u\n", mei.me_maxreaders); printf(" Number of readers used: %u\n", mei.me_numreaders); } diff --git a/contrib/db/liblmdb/midl.c b/contrib/db/liblmdb/midl.c index 75420c41..152a1ec0 100644 --- a/contrib/db/liblmdb/midl.c +++ b/contrib/db/liblmdb/midl.c @@ -3,8 +3,7 @@ /* $OpenLDAP$ */ /* This work is part of OpenLDAP Software . * - * Copyright 2000-2019 The OpenLDAP Foundation. - * Portions Copyright 2001-2018 Howard Chu, Symas Corp. + * Copyright 2000-2015 The OpenLDAP Foundation. * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -355,5 +354,67 @@ int mdb_mid2l_append( MDB_ID2L ids, MDB_ID2 *id ) return 0; } +#ifdef MDB_VL32 +unsigned mdb_mid3l_search( MDB_ID3L ids, MDB_ID id ) +{ + /* + * binary search of id in ids + * if found, returns position of id + * if not found, returns first position greater than id + */ + unsigned base = 0; + unsigned cursor = 1; + int val = 0; + unsigned n = (unsigned)ids[0].mid; + + while( 0 < n ) { + unsigned pivot = n >> 1; + cursor = base + pivot + 1; + val = CMP( id, ids[cursor].mid ); + + if( val < 0 ) { + n = pivot; + + } else if ( val > 0 ) { + base = cursor; + n -= pivot + 1; + + } else { + return cursor; + } + } + + if( val > 0 ) { + ++cursor; + } + return cursor; +} + +int mdb_mid3l_insert( MDB_ID3L ids, MDB_ID3 *id ) +{ + unsigned x, i; + + x = mdb_mid3l_search( ids, id->mid ); + + if( x < 1 ) { + /* internal error */ + return -2; + } + + if ( x <= ids[0].mid && ids[x].mid == id->mid ) { + /* duplicate */ + return -1; + } + + /* insert id */ + ids[0].mid++; + for (i=(unsigned)ids[0].mid; i>x; i--) + ids[i] = ids[i-1]; + ids[x] = *id; + + return 0; +} +#endif /* MDB_VL32 */ + /** @} */ /** @} */ diff --git a/contrib/db/liblmdb/midl.h b/contrib/db/liblmdb/midl.h index 
462c3497..1555ecb1 100644 --- a/contrib/db/liblmdb/midl.h +++ b/contrib/db/liblmdb/midl.h @@ -11,8 +11,7 @@ /* $OpenLDAP$ */ /* This work is part of OpenLDAP Software . * - * Copyright 2000-2019 The OpenLDAP Foundation. - * Portions Copyright 2001-2018 Howard Chu, Symas Corp. + * Copyright 2000-2015 The OpenLDAP Foundation. * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -28,6 +27,7 @@ #define _MDB_MIDL_H_ #include +#include #ifdef __cplusplus extern "C" { @@ -43,7 +43,11 @@ extern "C" { /** A generic unsigned ID number. These were entryIDs in back-bdb. * Preferably it should have the same size as a pointer. */ +#ifdef MDB_VL32 +typedef uint64_t MDB_ID; +#else typedef size_t MDB_ID; +#endif /** An IDL is an ID List, a sorted array of IDs. The first * element of the array is a counter for how many actual @@ -178,6 +182,20 @@ int mdb_mid2l_insert( MDB_ID2L ids, MDB_ID2 *id ); */ int mdb_mid2l_append( MDB_ID2L ids, MDB_ID2 *id ); +#ifdef MDB_VL32 +typedef struct MDB_ID3 { + MDB_ID mid; /**< The ID */ + void *mptr; /**< The pointer */ + unsigned int mcnt; /**< Number of pages */ + unsigned int mref; /**< Refcounter */ +} MDB_ID3; + +typedef MDB_ID3 *MDB_ID3L; + +unsigned mdb_mid3l_search( MDB_ID3L ids, MDB_ID id ); +int mdb_mid3l_insert( MDB_ID3L ids, MDB_ID3 *id ); + +#endif /* MDB_VL32 */ /** @} */ /** @} */ #ifdef __cplusplus diff --git a/contrib/db/liblmdb/mtest.c b/contrib/db/liblmdb/mtest.c index 6fc5840c..9d15088b 100644 --- a/contrib/db/liblmdb/mtest.c +++ b/contrib/db/liblmdb/mtest.c @@ -1,6 +1,6 @@ /* mtest.c - memory-mapped database tester/toy */ /* - * Copyright 2011-2018 Howard Chu, Symas Corp. + * Copyright 2011-2015 Howard Chu, Symas Corp. * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without diff --git a/contrib/db/liblmdb/mtest2.c b/contrib/db/liblmdb/mtest2.c index 64b742aa..eacbe59d 100644 --- a/contrib/db/liblmdb/mtest2.c +++ b/contrib/db/liblmdb/mtest2.c @@ -1,6 +1,6 @@ /* mtest2.c - memory-mapped database tester/toy */ /* - * Copyright 2011-2018 Howard Chu, Symas Corp. + * Copyright 2011-2015 Howard Chu, Symas Corp. * All rights reserved. * * Redistribution and use in source and binary forms, with or without diff --git a/contrib/db/liblmdb/mtest3.c b/contrib/db/liblmdb/mtest3.c index 81e4bbf9..9db79e62 100644 --- a/contrib/db/liblmdb/mtest3.c +++ b/contrib/db/liblmdb/mtest3.c @@ -1,6 +1,6 @@ /* mtest3.c - memory-mapped database tester/toy */ /* - * Copyright 2011-2018 Howard Chu, Symas Corp. + * Copyright 2011-2015 Howard Chu, Symas Corp. * All rights reserved. * * Redistribution and use in source and binary forms, with or without diff --git a/contrib/db/liblmdb/mtest4.c b/contrib/db/liblmdb/mtest4.c index c355cf10..6df890e2 100644 --- a/contrib/db/liblmdb/mtest4.c +++ b/contrib/db/liblmdb/mtest4.c @@ -1,6 +1,6 @@ /* mtest4.c - memory-mapped database tester/toy */ /* - * Copyright 2011-2018 Howard Chu, Symas Corp. + * Copyright 2011-2015 Howard Chu, Symas Corp. * All rights reserved. * * Redistribution and use in source and binary forms, with or without diff --git a/contrib/db/liblmdb/mtest5.c b/contrib/db/liblmdb/mtest5.c index 95793ec1..14e3c0da 100644 --- a/contrib/db/liblmdb/mtest5.c +++ b/contrib/db/liblmdb/mtest5.c @@ -1,6 +1,6 @@ /* mtest5.c - memory-mapped database tester/toy */ /* - * Copyright 2011-2018 Howard Chu, Symas Corp. + * Copyright 2011-2015 Howard Chu, Symas Corp. * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without diff --git a/contrib/db/liblmdb/mtest6.c b/contrib/db/liblmdb/mtest6.c index cb0d4d73..ae3c7f26 100644 --- a/contrib/db/liblmdb/mtest6.c +++ b/contrib/db/liblmdb/mtest6.c @@ -1,6 +1,6 @@ /* mtest6.c - memory-mapped database tester/toy */ /* - * Copyright 2011-2018 Howard Chu, Symas Corp. + * Copyright 2011-2015 Howard Chu, Symas Corp. * All rights reserved. * * Redistribution and use in source and binary forms, with or without diff --git a/contrib/db/liblmdb/sample-bdb.txt b/contrib/db/liblmdb/sample-bdb.txt index 97220f0e..563807a2 100644 --- a/contrib/db/liblmdb/sample-bdb.txt +++ b/contrib/db/liblmdb/sample-bdb.txt @@ -3,7 +3,7 @@ * Do a line-by-line comparison of this and sample-mdb.txt */ /* - * Copyright 2012-2018 Howard Chu, Symas Corp. + * Copyright 2012-2015 Howard Chu, Symas Corp. * All rights reserved. * * Redistribution and use in source and binary forms, with or without diff --git a/contrib/db/liblmdb/sample-mdb.txt b/contrib/db/liblmdb/sample-mdb.txt index 1d20ed3d..10a25687 100644 --- a/contrib/db/liblmdb/sample-mdb.txt +++ b/contrib/db/liblmdb/sample-mdb.txt @@ -3,7 +3,7 @@ * Do a line-by-line comparison of this and sample-bdb.txt */ /* - * Copyright 2012-2018 Howard Chu, Symas Corp. + * Copyright 2012-2015 Howard Chu, Symas Corp. * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without diff --git a/src/common/db_abstract_accessor.h b/src/common/db_abstract_accessor.h index 9b84c9f8..d74937ee 100644 --- a/src/common/db_abstract_accessor.h +++ b/src/common/db_abstract_accessor.h @@ -241,9 +241,9 @@ namespace tools m_is_open = false; return m_backend->close(); } - bool open(const std::string& path, uint64_t flags = 0) + bool open(const std::string& path, uint64_t cache_sz = CACHE_SIZE) { - bool r = m_backend->open(path, flags); + bool r = m_backend->open(path, cache_sz); if(r) m_is_open = true; @@ -558,10 +558,10 @@ namespace tools bool init(const std::string& container_name) { #ifdef ENABLE_PROFILING - m_get_profiler.m_name = container_name +":get"; - m_set_profiler.m_name = container_name + ":set"; - m_explicit_get_profiler.m_name = container_name + ":explicit_get"; - m_explicit_set_profiler.m_name = container_name + ":explicit_set"; + m_get_profiler.m_name = container_name +":get"; + m_set_profiler.m_name = container_name + ":set"; + m_explicit_get_profiler.m_name = container_name + ":explicit_get"; + m_explicit_set_profiler.m_name = container_name + ":explicit_set"; m_commit_profiler.m_name = container_name + ":commit"; #endif return bdb.get_backend()->open_container(container_name, m_h); diff --git a/src/common/db_backend_base.h b/src/common/db_backend_base.h index e3187c82..538abd62 100644 --- a/src/common/db_backend_base.h +++ b/src/common/db_backend_base.h @@ -5,6 +5,13 @@ #pragma once +#ifndef ENV32BIT +#define CACHE_SIZE uint64_t(uint64_t(1UL * 128UL) * 1024UL * 1024UL * 1024UL) +#else +#define CACHE_SIZE (1 * 1024UL * 1024UL * 1024UL) +#endif + + namespace tools { namespace db @@ -25,20 +32,20 @@ namespace tools struct i_db_backend { - virtual bool close() = 0; + virtual bool close()=0; virtual bool begin_transaction(bool read_only = false) = 0; - virtual bool commit_transaction() = 0; - virtual void abort_transaction() = 0; - virtual bool open(const std::string& path, 
uint64_t flags = 0) = 0; - virtual bool open_container(const std::string& name, container_handle& h) = 0; + virtual bool commit_transaction()=0; + virtual void abort_transaction()=0; + virtual bool open(const std::string& path, uint64_t cache_sz = CACHE_SIZE) = 0; + virtual bool open_container(const std::string& name, container_handle& h)=0; virtual bool erase(container_handle h, const char* k, size_t s) = 0; virtual uint64_t size(container_handle h) = 0; virtual bool get(container_handle h, const char* k, size_t s, std::string& res_buff) = 0; virtual bool set(container_handle h, const char* k, size_t s, const char* v, size_t vs) = 0; virtual bool clear(container_handle h) = 0; - virtual bool enumerate(container_handle h, i_db_callback* pcb) = 0; + virtual bool enumerate(container_handle h, i_db_callback* pcb)=0; virtual bool get_stat_info(stat_info& si) = 0; - virtual ~i_db_backend() {}; + virtual ~i_db_backend(){}; }; } -} +} \ No newline at end of file diff --git a/src/currency_core/blockchain_storage.cpp b/src/currency_core/blockchain_storage.cpp index 61cf312b..827adc65 100644 --- a/src/currency_core/blockchain_storage.cpp +++ b/src/currency_core/blockchain_storage.cpp @@ -76,6 +76,7 @@ DISABLE_VS_WARNINGS(4267) namespace { + const command_line::arg_descriptor arg_db_cache_l1 = { "db-cache-l1", "Specify size of memory mapped db cache file", 0, true }; const command_line::arg_descriptor arg_db_cache_l2 = { "db-cache-l2", "Specify cached elements in db helpers", 0, true }; } @@ -151,6 +152,7 @@ std::shared_ptr blockchain_storage::get_tx(const crypto::hash &id) //------------------------------------------------------------------ void blockchain_storage::init_options(boost::program_options::options_description& desc) { + command_line::add_arg(desc, arg_db_cache_l1); command_line::add_arg(desc, arg_db_cache_l2); } //------------------------------------------------------------------ @@ -213,20 +215,36 @@ bool blockchain_storage::init(const std::string& 
config_folder, const boost::pro return false; } + uint64_t cache_size_l1 = CACHE_SIZE; + if (command_line::has_arg(vm, arg_db_cache_l1)) + { + cache_size_l1 = command_line::get_arg(vm, arg_db_cache_l1); + } + LOG_PRINT_GREEN("Using db file cache size(L1): " << cache_size_l1, LOG_LEVEL_0); + m_config_folder = config_folder; + + // remove old incompartible DB + const std::string old_db_folder_path = m_config_folder + "/" CURRENCY_BLOCKCHAINDATA_FOLDERNAME_OLD; + if (boost::filesystem::exists(old_db_folder_path)) + { + LOG_PRINT_YELLOW("Removing old DB in " << old_db_folder_path << "...", LOG_LEVEL_0); + boost::filesystem::remove_all(old_db_folder_path); + } + const std::string db_folder_path = m_config_folder + "/" CURRENCY_BLOCKCHAINDATA_FOLDERNAME; LOG_PRINT_L0("Loading blockchain from " << db_folder_path); bool db_opened_okay = false; for(size_t loading_attempt_no = 0; loading_attempt_no < 2; ++loading_attempt_no) { - bool res = m_db.open(db_folder_path); + bool res = m_db.open(db_folder_path, cache_size_l1); if (!res) { // if DB could not be opened -- try to remove the whole folder and re-open DB LOG_PRINT_YELLOW("Failed to initialize database in folder: " << db_folder_path << ", first attempt", LOG_LEVEL_0); boost::filesystem::remove_all(db_folder_path); - res = m_db.open(db_folder_path); + res = m_db.open(db_folder_path, cache_size_l1); CHECK_AND_ASSERT_MES(res, false, "Failed to initialize database in folder: " << db_folder_path << ", second attempt"); } diff --git a/src/currency_core/currency_config.h b/src/currency_core/currency_config.h index cb9b0dcb..11d2ec06 100644 --- a/src/currency_core/currency_config.h +++ b/src/currency_core/currency_config.h @@ -187,8 +187,10 @@ #define CURRENCY_CORE_INSTANCE_LOCK_FILE "lock.lck" -#define CURRENCY_POOLDATA_FOLDERNAME "poolstate" -#define CURRENCY_BLOCKCHAINDATA_FOLDERNAME "blockchain" +#define CURRENCY_POOLDATA_FOLDERNAME_OLD "poolstate" +#define CURRENCY_BLOCKCHAINDATA_FOLDERNAME_OLD "blockchain" +#define 
CURRENCY_POOLDATA_FOLDERNAME "poolstate_lmdb_v1" +#define CURRENCY_BLOCKCHAINDATA_FOLDERNAME "blockchain_lmdb_v1" #define P2P_NET_DATA_FILENAME "p2pstate.bin" #define MINER_CONFIG_FILENAME "miner_conf.json" #define GUI_SECURE_CONFIG_FILENAME "gui_secure_conf.bin" diff --git a/src/currency_core/tx_pool.cpp b/src/currency_core/tx_pool.cpp index 99838ead..6803ac16 100644 --- a/src/currency_core/tx_pool.cpp +++ b/src/currency_core/tx_pool.cpp @@ -1120,19 +1120,30 @@ namespace currency { m_config_folder = config_folder; - LOG_PRINT_L0("Loading blockchain..."); + uint64_t cache_size_l1 = CACHE_SIZE; + LOG_PRINT_GREEN("Using pool db file cache size(L1): " << cache_size_l1, LOG_LEVEL_0); + + // remove old incompartible DB + const std::string old_db_folder_path = m_config_folder + "/" CURRENCY_POOLDATA_FOLDERNAME_OLD; + if (boost::filesystem::exists(old_db_folder_path)) + { + LOG_PRINT_YELLOW("Removing old DB in " << old_db_folder_path << "...", LOG_LEVEL_0); + boost::filesystem::remove_all(old_db_folder_path); + } + const std::string db_folder_path = m_config_folder + "/" CURRENCY_POOLDATA_FOLDERNAME; - + LOG_PRINT_L0("Loading blockchain from " << db_folder_path << "..."); + bool db_opened_okay = false; for(size_t loading_attempt_no = 0; loading_attempt_no < 2; ++loading_attempt_no) { - bool res = m_db.open(db_folder_path); + bool res = m_db.open(db_folder_path, cache_size_l1); if (!res) { // if DB could not be opened -- try to remove the whole folder and re-open DB LOG_PRINT_YELLOW("Failed to initialize database in folder: " << db_folder_path << ", first attempt", LOG_LEVEL_0); boost::filesystem::remove_all(db_folder_path); - res = m_db.open(db_folder_path); + res = m_db.open(db_folder_path, cache_size_l1); CHECK_AND_ASSERT_MES(res, false, "Failed to initialize database in folder: " << db_folder_path << ", second attempt"); } diff --git a/src/daemon/daemon.cpp b/src/daemon/daemon.cpp index 9851a677..65b58cb1 100644 --- a/src/daemon/daemon.cpp +++ b/src/daemon/daemon.cpp 
@@ -100,7 +100,7 @@ int main(int argc, char* argv[]) #endif log_space::get_set_log_detalisation_level(true, LOG_LEVEL_2); log_space::log_singletone::add_logger(LOGGER_CONSOLE, NULL, NULL); - log_space::log_singletone::enable_channels("core,currency_protocol,tx_pool,wallet,lmdb"); + log_space::log_singletone::enable_channels("core,currency_protocol,tx_pool,wallet"); LOG_PRINT_L0("Starting..."); tools::signal_handler::install_fatal([](int sig_number, void* address) { diff --git a/tests/unit_tests/db_accessors.cpp b/tests/unit_tests/db_accessors.cpp index 078f1ac5..826eb0a6 100644 --- a/tests/unit_tests/db_accessors.cpp +++ b/tests/unit_tests/db_accessors.cpp @@ -29,8 +29,9 @@ TEST(db_accessor_tests, cached_key_value_accessor_test) tools::db::cached_key_value_accessor m_container(m_db); const std::string folder_name = "./TEST_cached_key_value_accessor_test"; tools::create_directories_if_necessary(folder_name); + uint64_t cache_size = CACHE_SIZE; - ASSERT_TRUE(m_db.open(folder_name)); + ASSERT_TRUE(m_db.open(folder_name, cache_size)); ASSERT_TRUE(m_container.init("container")); ... TODO ... 
@@ -46,7 +47,8 @@ TEST(db_accessor_tests_2, recoursive_tx_test) const std::string folder_name = "./TEST_db_recursive_tx"; tools::create_directories_if_necessary(folder_name); - ASSERT_TRUE(m_db.open(folder_name)); + uint64_t cache_size = CACHE_SIZE; + ASSERT_TRUE(m_db.open(folder_name, cache_size)); ASSERT_TRUE(m_container.init("zzzz") ); bool tx_result = m_container.begin_transaction(); @@ -317,7 +319,8 @@ TEST(db_accessor_tests, median_db_cache_test) const std::string folder_name = "./TEST_median_db_cache"; const std::string naive_median_serialization_filename = folder_name + "/naive_median"; tools::create_directories_if_necessary(folder_name); - ASSERT_TRUE(m_db.open(folder_name)); + uint64_t cache_size = CACHE_SIZE; + ASSERT_TRUE(m_db.open(folder_name, cache_size)); ASSERT_TRUE(m_tx_fee_median.init("median_fee")); m_db.begin_transaction(); diff --git a/tests/unit_tests/lmdb_tests.cpp b/tests/unit_tests/lmdb_tests.cpp index 7e03cf13..10dc1229 100644 --- a/tests/unit_tests/lmdb_tests.cpp +++ b/tests/unit_tests/lmdb_tests.cpp @@ -30,7 +30,7 @@ namespace lmdb_test // write data // - r = bdba.open(db_file_path); + r = bdba.open(db_file_path, CACHE_SIZE); ASSERT_TRUE(r); db::container_handle h;