diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 00000000000..530192a3b20 --- /dev/null +++ b/.dockerignore @@ -0,0 +1,5 @@ +.git +.vscode +.circleci +tmp_install +compute_build diff --git a/configure b/configure index 57607d79dff..376394da894 100755 --- a/configure +++ b/configure @@ -717,6 +717,7 @@ with_libxml with_uuid with_readline with_systemd +with_libseccomp with_selinux with_ldap with_krb_srvnam @@ -864,6 +865,7 @@ with_bsd_auth with_ldap with_bonjour with_selinux +with_libseccomp with_systemd with_readline with_libedit_preferred @@ -1573,6 +1575,7 @@ Optional Packages: --with-ldap build with LDAP support --with-bonjour build with Bonjour support --with-selinux build with SELinux support + --with-libseccomp build with libseccomp support --with-systemd build with systemd support --without-readline do not use GNU Readline nor BSD Libedit for editing --with-libedit-preferred @@ -8631,6 +8634,39 @@ fi { $as_echo "$as_me:${as_lineno-$LINENO}: result: $with_selinux" >&5 $as_echo "$with_selinux" >&6; } +# +# libseccomp +# +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether to build with libseccomp support" >&5 +$as_echo_n "checking whether to build with libseccomp support... " >&6; } + + + +# Check whether --with-libseccomp was given. +if test "${with_libseccomp+set}" = set; then : + withval=$with_libseccomp; + case $withval in + yes) + : + ;; + no) + : + ;; + *) + as_fn_error $? "no argument expected for --with-libseccomp option" "$LINENO" 5 + ;; + esac + +else + with_libseccomp=no + +fi + + + +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $with_libseccomp" >&5 +$as_echo "$with_libseccomp" >&6; } + # # Systemd # @@ -14587,6 +14623,56 @@ else fi +fi + +if test "$with_libseccomp" = yes ; then + { $as_echo "$as_me:${as_lineno-$LINENO}: checking for seccomp_init in -lseccomp" >&5 +$as_echo_n "checking for seccomp_init in -lseccomp... " >&6; } +if ${ac_cv_lib_seccomp_seccomp_init+:} false; then : + $as_echo_n "(cached) " >&6 +else + ac_check_lib_save_LIBS=$LIBS +LIBS="-lseccomp $LIBS" +cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ + +/* Override any GCC internal prototype to avoid an error. + Use char because int might match the return type of a GCC + builtin and then its argument prototype would still apply. */ +#ifdef __cplusplus +extern "C" +#endif +char seccomp_init (); +int +main () +{ +return seccomp_init (); + ; + return 0; +} +_ACEOF +if ac_fn_c_try_link "$LINENO"; then : + ac_cv_lib_seccomp_seccomp_init=yes +else + ac_cv_lib_seccomp_seccomp_init=no +fi +rm -f core conftest.err conftest.$ac_objext \ + conftest$ac_exeext conftest.$ac_ext +LIBS=$ac_check_lib_save_LIBS +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_lib_seccomp_seccomp_init" >&5 +$as_echo "$ac_cv_lib_seccomp_seccomp_init" >&6; } +if test "x$ac_cv_lib_seccomp_seccomp_init" = xyes; then : + cat >>confdefs.h <<_ACEOF +#define HAVE_LIBSECCOMP 1 +_ACEOF + + LIBS="-lseccomp $LIBS" + +else + as_fn_error $? 
"library 'libseccomp' is required for Seccomp BPF support" "$LINENO" 5 +fi + fi # for contrib/uuid-ossp diff --git a/configure.ac b/configure.ac index c216ac4447a..0a5e5110335 100644 --- a/configure.ac +++ b/configure.ac @@ -927,6 +927,14 @@ PGAC_ARG_BOOL(with, selinux, no, [build with SELinux support]) AC_SUBST(with_selinux) AC_MSG_RESULT([$with_selinux]) +# +# libseccomp +# +AC_MSG_CHECKING([whether to build with libseccomp support]) +PGAC_ARG_BOOL(with, libseccomp, no, [build with libseccomp support]) +AC_SUBST(with_libseccomp) +AC_MSG_RESULT([$with_libseccomp]) + # # Systemd # @@ -1613,6 +1621,11 @@ dnl If you want to use Apple's own Bonjour code on another platform, dnl just add -ldns_sd to LIBS manually. fi +if test "$with_libseccomp" = yes ; then + AC_CHECK_LIB(seccomp, seccomp_init, [], + [AC_MSG_ERROR([library 'libseccomp' is required for Seccomp BPF support])]) +fi + # for contrib/uuid-ossp if test "$with_uuid" = bsd ; then AC_CHECK_HEADERS(uuid.h, diff --git a/contrib/pg_prewarm/autoprewarm.c b/contrib/pg_prewarm/autoprewarm.c index c0c4f5d9ca7..b0f9c2e235d 100644 --- a/contrib/pg_prewarm/autoprewarm.c +++ b/contrib/pg_prewarm/autoprewarm.c @@ -54,6 +54,7 @@ #include "utils/rel.h" #include "utils/relfilenodemap.h" #include "utils/resowner.h" +#include "utils/spccache.h" #define AUTOPREWARM_FILE "autoprewarm.blocks" @@ -449,10 +450,12 @@ void autoprewarm_database_main(Datum main_arg) { int pos; + int io_concurrency; BlockInfoRecord *block_info; Relation rel = NULL; BlockNumber nblocks = 0; BlockInfoRecord *old_blk = NULL; + BlockInfoRecord *prefetch_blk = NULL; dsm_segment *seg; /* Establish signal handlers; once that's done, unblock signals. */ @@ -499,6 +502,7 @@ autoprewarm_database_main(Datum main_arg) { relation_close(rel, AccessShareLock); rel = NULL; + io_concurrency = -1; CommitTransactionCommand(); } @@ -518,6 +522,8 @@ autoprewarm_database_main(Datum main_arg) if (!rel) CommitTransactionCommand(); + else + io_concurrency = get_tablespace_maintenance_io_concurrency(rel->rd_rel->reltablespace); } if (!rel) { @@ -550,6 +556,35 @@ autoprewarm_database_main(Datum main_arg) continue; } + /* if prefetching is enabled for this relation */ + if (io_concurrency > 0) + { + /* make prefetch_blk catch up */ + if (blk > prefetch_blk) + { + prefetch_blk = blk; + } + + /* now, prefetch all following blocks */ + while (prefetch_blk <= &block_info[apw_state->prewarm_stop_idx]) + { + /* unless they're of a different relfilenode */ + if (prefetch_blk->filenode != blk->filenode || + prefetch_blk->forknum != blk->forknum || + prefetch_blk->blocknum >= nblocks) + break; + + /* or unless they are more than io_concurrency blocks ahead */ + if (blk + io_concurrency <= prefetch_blk) + break; + + PrefetchBuffer(rel, prefetch_blk->forknum, prefetch_blk->blocknum); + + /* continue with the next block */ + prefetch_blk++; + } + } + /* Prewarm buffer. 
 */
		buf = ReadBufferExtended(rel, blk->forknum, blk->blocknum,
								 RBM_NORMAL, NULL);
diff --git a/contrib/pg_prewarm/pg_prewarm.c b/contrib/pg_prewarm/pg_prewarm.c
index caff5c4a80f..b68f81d34d3 100644
--- a/contrib/pg_prewarm/pg_prewarm.c
+++ b/contrib/pg_prewarm/pg_prewarm.c
@@ -18,12 +18,14 @@
 #include "access/relation.h"
 #include "fmgr.h"
 #include "miscadmin.h"
+#include "optimizer/cost.h"
 #include "storage/bufmgr.h"
 #include "storage/smgr.h"
 #include "utils/acl.h"
 #include "utils/builtins.h"
 #include "utils/lsyscache.h"
 #include "utils/rel.h"
+#include "utils/spccache.h"
 
 PG_MODULE_MAGIC;
 
@@ -183,14 +185,26 @@ pg_prewarm(PG_FUNCTION_ARGS)
 	}
 	else if (ptype == PREWARM_BUFFER)
 	{
+		BlockNumber prefetch_block = first_block;
+		Oid			spcOid;
+		int			io_concurrency;
+
+		spcOid = rel->rd_rel->reltablespace;
+		io_concurrency = get_tablespace_maintenance_io_concurrency(spcOid);
+
 		/*
 		 * In buffer mode, we actually pull the data into shared_buffers.
 		 */
 		for (block = first_block; block <= last_block; ++block)
 		{
-			Buffer		buf;
-
+			Buffer		buf;
+			BlockNumber prefetch_stop = block + Min(last_block - block + 1,
+													io_concurrency);
 			CHECK_FOR_INTERRUPTS();
+			while (prefetch_block < prefetch_stop)
+			{
+				PrefetchBuffer(rel, forkNumber, prefetch_block++);
+			}
 			buf = ReadBufferExtended(rel, forkNumber, block, RBM_NORMAL, NULL);
 			ReleaseBuffer(buf);
 			++blocks_done;
diff --git a/contrib/pg_prewarm/pg_prewarm.control b/contrib/pg_prewarm/pg_prewarm.control
index 40e3add4810..d40d1a000b7 100644
--- a/contrib/pg_prewarm/pg_prewarm.control
+++ b/contrib/pg_prewarm/pg_prewarm.control
@@ -3,3 +3,4 @@ comment = 'prewarm relation data'
 default_version = '1.2'
 module_pathname = '$libdir/pg_prewarm'
 relocatable = true
+trusted = true
diff --git a/src/Makefile.global.in b/src/Makefile.global.in
index 7d5e08c667d..738cb15cfb4 100644
--- a/src/Makefile.global.in
+++ b/src/Makefile.global.in
@@ -186,6 +186,7 @@ with_tcl = @with_tcl@
 with_ssl = @with_ssl@
 with_readline = @with_readline@
 with_selinux = @with_selinux@
+with_libseccomp = @with_libseccomp@
 with_systemd = @with_systemd@
 with_gssapi = @with_gssapi@
 with_krb_srvnam = @with_krb_srvnam@
diff --git a/src/backend/access/brin/brin_xlog.c b/src/backend/access/brin/brin_xlog.c
index af6949882a1..9d55d783488 100644
--- a/src/backend/access/brin/brin_xlog.c
+++ b/src/backend/access/brin/brin_xlog.c
@@ -69,7 +69,8 @@ brin_xlog_insert_update(XLogReaderState *record,
 	}
 
 	/* need this page's blkno to store in revmap */
-	regpgno = BufferGetBlockNumber(buffer);
+	/* ZENITH XXX: don't use BufferGetBlockNumber, because wal-redo doesn't pin the buffer */
+	XLogRecGetBlockTag(record, 0, NULL, NULL, &regpgno);
 
 	/* insert the index item into the page */
 	if (action == BLK_NEEDS_REDO)
diff --git a/src/backend/access/gin/gininsert.c b/src/backend/access/gin/gininsert.c
index ea1c4184fbf..f2e6147e307 100644
--- a/src/backend/access/gin/gininsert.c
+++ b/src/backend/access/gin/gininsert.c
@@ -335,6 +335,8 @@ ginbuild(Relation heap, Relation index, IndexInfo *indexInfo)
 		elog(ERROR, "index \"%s\" already contains data",
 			 RelationGetRelationName(index));
 
+	smgr_start_unlogged_build(index->rd_smgr);
+
 	initGinState(&buildstate.ginstate, index);
 	buildstate.indtuples = 0;
 	memset(&buildstate.buildStats, 0, sizeof(GinStatsData));
@@ -408,6 +410,8 @@ ginbuild(Relation heap, Relation index, IndexInfo *indexInfo)
 	buildstate.buildStats.nTotalPages = RelationGetNumberOfBlocks(index);
 	ginUpdateStats(index, &buildstate.buildStats, true);
 
+	smgr_finish_unlogged_build_phase_1(index->rd_smgr);
+
 	/*
 	 * We didn't write WAL records as we built the index, so if WAL-logging is
 	 * required, write all pages to the WAL now.
@@ -417,8 +421,12 @@ ginbuild(Relation heap, Relation index, IndexInfo *indexInfo)
 		log_newpage_range(index, MAIN_FORKNUM,
 						  0, RelationGetNumberOfBlocks(index),
 						  true);
+		SetLastWrittenLSNForBlockRange(XactLastRecEnd, index->rd_smgr->smgr_rnode.node, MAIN_FORKNUM, 0, RelationGetNumberOfBlocks(index));
+		SetLastWrittenLSNForRelation(XactLastRecEnd, index->rd_smgr->smgr_rnode.node, MAIN_FORKNUM);
 	}
 
+	smgr_end_unlogged_build(index->rd_smgr);
+
 	/*
 	 * Return statistics
 	 */
diff --git a/src/backend/access/gin/ginxlog.c b/src/backend/access/gin/ginxlog.c
index 87e8366642f..d240eb829aa 100644
--- a/src/backend/access/gin/ginxlog.c
+++ b/src/backend/access/gin/ginxlog.c
@@ -407,6 +407,7 @@ ginRedoSplit(XLogReaderState *record)
 				rootbuf;
 	bool		isLeaf = (data->flags & GIN_INSERT_ISLEAF) != 0;
 	bool		isRoot = (data->flags & GIN_SPLIT_ROOT) != 0;
+	XLogRedoAction action;
 
 	/*
 	 * First clear incomplete-split flag on child page if this finishes a
@@ -415,21 +416,27 @@ ginRedoSplit(XLogReaderState *record)
 	if (!isLeaf)
 		ginRedoClearIncompleteSplit(record, 3);
 
-	if (XLogReadBufferForRedo(record, 0, &lbuffer) != BLK_RESTORED)
+	action = XLogReadBufferForRedo(record, 0, &lbuffer);
+	if (action != BLK_RESTORED && action != BLK_DONE)
 		elog(ERROR, "GIN split record did not contain a full-page image of left page");
 
-	if (XLogReadBufferForRedo(record, 1, &rbuffer) != BLK_RESTORED)
+	action = XLogReadBufferForRedo(record, 1, &rbuffer);
+	if (action != BLK_RESTORED && action != BLK_DONE)
 		elog(ERROR, "GIN split record did not contain a full-page image of right page");
 
 	if (isRoot)
 	{
-		if (XLogReadBufferForRedo(record, 2, &rootbuf) != BLK_RESTORED)
+		action = XLogReadBufferForRedo(record, 2, &rootbuf);
+		if (action != BLK_RESTORED && action != BLK_DONE)
 			elog(ERROR, "GIN split record did not contain a full-page image of root page");
 
-		UnlockReleaseBuffer(rootbuf);
+		if (rootbuf != InvalidBuffer)
+			UnlockReleaseBuffer(rootbuf);
 	}
 
-	UnlockReleaseBuffer(rbuffer);
-	UnlockReleaseBuffer(lbuffer);
+	if (rbuffer != InvalidBuffer)
+		UnlockReleaseBuffer(rbuffer);
+	if (lbuffer != InvalidBuffer)
+		UnlockReleaseBuffer(lbuffer);
 }
 
 /*
diff --git a/src/backend/access/gist/gistbuild.c b/src/backend/access/gist/gistbuild.c
index be9b91be859..aaa7ab8acc0 100644
--- a/src/backend/access/gist/gistbuild.c
+++ b/src/backend/access/gist/gistbuild.c
@@ -40,6 +40,7 @@
 #include "access/tableam.h"
 #include "access/xloginsert.h"
 #include "catalog/index.h"
+#include "catalog/storage.h"
 #include "miscadmin.h"
 #include
"optimizer/optimizer.h" #include "storage/bufmgr.h" @@ -296,6 +297,8 @@ gistbuild(Relation heap, Relation index, IndexInfo *indexInfo) Buffer buffer; Page page; + smgr_start_unlogged_build(index->rd_smgr); + /* initialize the root page */ buffer = gistNewBuffer(index); Assert(BufferGetBlockNumber(buffer) == GIST_ROOT_BLKNO); @@ -328,6 +331,8 @@ gistbuild(Relation heap, Relation index, IndexInfo *indexInfo) gistFreeBuildBuffers(buildstate.gfbb); } + smgr_finish_unlogged_build_phase_1(index->rd_smgr); + /* * We didn't write WAL records as we built the index, so if * WAL-logging is required, write all pages to the WAL now. @@ -337,7 +342,12 @@ gistbuild(Relation heap, Relation index, IndexInfo *indexInfo) log_newpage_range(index, MAIN_FORKNUM, 0, RelationGetNumberOfBlocks(index), true); + SetLastWrittenLSNForBlockRange(XactLastRecEnd, + index->rd_smgr->smgr_rnode.node, + MAIN_FORKNUM, 0, RelationGetNumberOfBlocks(index)); + SetLastWrittenLSNForRelation(XactLastRecEnd, index->rd_smgr->smgr_rnode.node, MAIN_FORKNUM); } + smgr_end_unlogged_build(index->rd_smgr); } /* okay, all heap tuples are indexed */ @@ -462,8 +472,15 @@ gist_indexsortbuild(GISTBuildState *state) smgrwrite(RelationGetSmgr(state->indexrel), MAIN_FORKNUM, GIST_ROOT_BLKNO, levelstate->pages[0], true); if (RelationNeedsWAL(state->indexrel)) - log_newpage(&state->indexrel->rd_node, MAIN_FORKNUM, GIST_ROOT_BLKNO, - levelstate->pages[0], true); + { + XLogRecPtr lsn; + + lsn = log_newpage(&state->indexrel->rd_node, MAIN_FORKNUM, GIST_ROOT_BLKNO, + levelstate->pages[0], true); + SetLastWrittenLSNForBlock(lsn, state->indexrel->rd_smgr->smgr_rnode.node, + MAIN_FORKNUM, GIST_ROOT_BLKNO); + SetLastWrittenLSNForRelation(lsn, state->indexrel->rd_smgr->smgr_rnode.node, MAIN_FORKNUM); + } pfree(levelstate->pages[0]); pfree(levelstate); diff --git a/src/backend/access/gist/gistvacuum.c b/src/backend/access/gist/gistvacuum.c index f190decdff2..3af31e9dcae 100644 --- a/src/backend/access/gist/gistvacuum.c +++ b/src/backend/access/gist/gistvacuum.c @@ -23,6 +23,7 @@ #include "storage/indexfsm.h" #include "storage/lmgr.h" #include "utils/memutils.h" +#include "utils/spccache.h" /* Working state needed by gistbulkdelete */ typedef struct @@ -130,8 +131,14 @@ gistvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, BlockNumber num_pages; bool needLock; BlockNumber blkno; + BlockNumber prefetch_blkno; + int io_concurrency; MemoryContext oldctx; + io_concurrency = get_tablespace_maintenance_io_concurrency( + rel->rd_rel->reltablespace + ); + /* * Reset fields that track information about the entire index now. 
This * avoids double-counting in the case where a single VACUUM command @@ -209,6 +216,7 @@ gistvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, needLock = !RELATION_IS_LOCAL(rel); blkno = GIST_ROOT_BLKNO; + prefetch_blkno = blkno; for (;;) { /* Get the current relation length */ @@ -221,9 +229,21 @@ gistvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, /* Quit if we've scanned the whole relation */ if (blkno >= num_pages) break; + + if (prefetch_blkno < blkno) + prefetch_blkno = blkno; + for (; prefetch_blkno < num_pages && + prefetch_blkno < blkno + io_concurrency; prefetch_blkno++) + PrefetchBuffer(rel, MAIN_FORKNUM, prefetch_blkno); + /* Iterate over pages, then loop back to recheck length */ for (; blkno < num_pages; blkno++) + { + if (io_concurrency > 0 && prefetch_blkno < num_pages) + PrefetchBuffer(rel, MAIN_FORKNUM, prefetch_blkno++); + gistvacuumpage(&vstate, blkno, blkno); + } } /* diff --git a/src/backend/access/hash/hash.c b/src/backend/access/hash/hash.c index c361509d68d..07dc943361d 100644 --- a/src/backend/access/hash/hash.c +++ b/src/backend/access/hash/hash.c @@ -32,6 +32,7 @@ #include "utils/builtins.h" #include "utils/index_selfuncs.h" #include "utils/rel.h" +#include "utils/spccache.h" /* Working state for hashbuild and its callback */ typedef struct @@ -466,13 +467,17 @@ hashbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, Bucket orig_maxbucket; Bucket cur_maxbucket; Bucket cur_bucket; + Bucket prf_bucket; Buffer metabuf = InvalidBuffer; HashMetaPage metap; HashMetaPage cachedmetap; + int io_concurrency; tuples_removed = 0; num_index_tuples = 0; + io_concurrency = get_tablespace_maintenance_io_concurrency(rel->rd_rel->reltablespace); + /* * We need a copy of the metapage so that we can use its hashm_spares[] * values to compute bucket page addresses, but a cached copy should be @@ -487,9 +492,14 @@ hashbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, /* Scan the buckets that we know exist */ cur_bucket = 0; + prf_bucket = cur_bucket; cur_maxbucket = orig_maxbucket; loop_top: + for (; prf_bucket <= cur_maxbucket && + prf_bucket < cur_bucket + io_concurrency; prf_bucket++) + PrefetchBuffer(rel, MAIN_FORKNUM, BUCKET_TO_BLKNO(cachedmetap, prf_bucket)); + while (cur_bucket <= cur_maxbucket) { BlockNumber bucket_blkno; @@ -500,6 +510,12 @@ hashbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, Page page; bool split_cleanup = false; + if (io_concurrency > 0 && prf_bucket <= cur_maxbucket) + { + PrefetchBuffer(rel, MAIN_FORKNUM, BUCKET_TO_BLKNO(cachedmetap, prf_bucket)); + prf_bucket++; + } + /* Get address of bucket's start page */ bucket_blkno = BUCKET_TO_BLKNO(cachedmetap, cur_bucket); diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c index c74fbd01049..b46fc6428a6 100644 --- a/src/backend/access/heap/heapam.c +++ b/src/backend/access/heap/heapam.c @@ -53,6 +53,7 @@ #include "access/xlogutils.h" #include "catalog/catalog.h" #include "miscadmin.h" +#include "optimizer/cost.h" #include "pgstat.h" #include "port/atomics.h" #include "port/pg_bitutils.h" @@ -316,6 +317,27 @@ initscan(HeapScanDesc scan, ScanKey key, bool keep_startblock) scan->rs_startblock = 0; } + if (enable_seqscan_prefetch) + { + /* + * Do not use tablespace setting for catalog scans, as we might have + * the tablespace settings in the catalogs locked already, which + * might result in a deadlock. 
+		 */
+		if (IsCatalogRelation(scan->rs_base.rs_rd))
+			scan->rs_prefetch_maximum = effective_io_concurrency;
+		else
+			scan->rs_prefetch_maximum =
+				get_tablespace_io_concurrency(scan->rs_base.rs_rd->rd_rel->reltablespace);
+
+		scan->rs_prefetch_target = 1;
+	}
+	else
+	{
+		scan->rs_prefetch_maximum = -1;
+		scan->rs_prefetch_target = -1;
+	}
+
 	scan->rs_numblocks = InvalidBlockNumber;
 	scan->rs_inited = false;
 	scan->rs_ctup.t_data = NULL;
@@ -398,6 +420,103 @@ heapgetpage(TableScanDesc sscan, BlockNumber page)
 	 */
 	CHECK_FOR_INTERRUPTS();
 
+	/* Prefetch up to rs_prefetch_maximum blocks ahead */
+	if (scan->rs_prefetch_maximum > 0 && scan->rs_nblocks > 1)
+	{
+		int64		nblocks;
+		int64		rel_scan_start;
+		int64		rel_scan_end;	/* blockno of end of scan (mod scan->rs_nblocks) */
+		int64		scan_pageoff;	/* page, but adjusted for scan position as above */
+
+		int64		prefetch_start; /* start block of prefetch requests this iteration */
+		int64		prefetch_end;	/* end block of prefetch requests this iteration, if applicable */
+		ParallelBlockTableScanWorker pbscanwork = scan->rs_parallelworkerdata;
+		ParallelBlockTableScanDesc pbscandesc = (ParallelBlockTableScanDesc) sscan->rs_parallel;
+
+		/*
+		 * Parallel scans look like repeated sequential table scans for
+		 * prefetching, with a scan start at nallocated + chunk_remaining -
+		 * chunk_size.
+		 */
+		if (pbscanwork != NULL)
+		{
+			uint64		start_offset,
+						end_offset;
+
+			Assert(pbscandesc != NULL);
+			start_offset = pbscanwork->phsw_nallocated
+				+ pbscanwork->phsw_chunk_remaining + 1
+				- pbscanwork->phsw_chunk_size;
+			end_offset = Min(pbscanwork->phsw_nallocated
+							 + pbscanwork->phsw_chunk_remaining + 1,
+							 pbscandesc->phs_nblocks);
+
+			rel_scan_start = (int64) (pbscandesc->phs_startblock) + start_offset;
+			rel_scan_end = (int64) (pbscandesc->phs_startblock) + end_offset;
+			nblocks = pbscandesc->phs_nblocks;
+		}
+		else
+		{
+			rel_scan_start = scan->rs_startblock;
+			rel_scan_end = scan->rs_startblock + scan->rs_nblocks;
+			nblocks = scan->rs_nblocks;
+		}
+
+		prefetch_end = rel_scan_end;
+
+		if ((uint64) page < rel_scan_start)
+			scan_pageoff = page + nblocks;
+		else
+			scan_pageoff = page;
+
+		Assert(rel_scan_start <= scan_pageoff && scan_pageoff <= rel_scan_end);
+
+		/*
+		 * If this is the first page of this seqscan, initiate prefetch of
+		 * pages page..page + n. On each subsequent call, prefetch the next
+		 * page that we haven't prefetched yet, at page + n. The checks below
+		 * then clamp the prefetch window so that we never prefetch past the
+		 * end of the scan.
+		 */
+		if (rel_scan_start != page)
+		{
+			prefetch_start = scan_pageoff + (int64) scan->rs_prefetch_target - 1;
+			prefetch_end = prefetch_start + 1;
+		}
+		else
+		{
+			prefetch_start = scan_pageoff;
+			prefetch_end = rel_scan_end;
+		}
+
+		/* do not prefetch if the only page we're trying to prefetch is past the end of our scan window */
+		if (prefetch_start > rel_scan_end)
+			prefetch_end = 0;
+
+		if (prefetch_end > prefetch_start + scan->rs_prefetch_target)
+			prefetch_end = prefetch_start + scan->rs_prefetch_target;
+
+		if (prefetch_end > rel_scan_end)
+			prefetch_end = rel_scan_end;
+
+		while (prefetch_start < prefetch_end)
+		{
+			BlockNumber blckno = (prefetch_start % nblocks);
+			Assert(blckno < nblocks);
+			Assert(blckno < INT_MAX);
+			PrefetchBuffer(scan->rs_base.rs_rd, MAIN_FORKNUM, blckno);
+			prefetch_start += 1;
+		}
+
+		/*
+		 * Use exponential growth of readahead up to prefetch_maximum, to
+		 * make sure that a low LIMIT does not result in high IO overhead,
+		 * but operations in general are still very fast.
+ */ + if (scan->rs_prefetch_target < scan->rs_prefetch_maximum / 2) + scan->rs_prefetch_target *= 2; + else if (scan->rs_prefetch_target < scan->rs_prefetch_maximum) + scan->rs_prefetch_target = scan->rs_prefetch_maximum; + } + /* read page using selected strategy */ scan->rs_cbuf = ReadBufferExtended(scan->rs_base.rs_rd, MAIN_FORKNUM, page, RBM_NORMAL, scan->rs_strategy); @@ -2155,6 +2274,7 @@ heap_insert(Relation relation, HeapTuple tup, CommandId cid, xlhdr.t_infomask2 = heaptup->t_data->t_infomask2; xlhdr.t_infomask = heaptup->t_data->t_infomask; xlhdr.t_hoff = heaptup->t_data->t_hoff; + xlhdr.t_cid = HeapTupleHeaderGetRawCommandId(heaptup->t_data); /* * note we mark xlhdr as belonging to buffer; if XLogInsert decides to @@ -2178,7 +2298,18 @@ heap_insert(Relation relation, HeapTuple tup, CommandId cid, END_CRIT_SECTION(); - UnlockReleaseBuffer(buffer); + if (options & HEAP_INSERT_SPECULATIVE) + { + /* + * NEON: speculative token is not stored in WAL, so if the page is evicted + * from the buffer cache, the token will be lost. To prevent that, we keep the + * buffer pinned. It will be unpinned in heapam_tuple_finish/abort_speculative. + */ + LockBuffer(buffer, BUFFER_LOCK_UNLOCK); + } + else + UnlockReleaseBuffer(buffer); + if (vmbuffer != InvalidBuffer) ReleaseBuffer(vmbuffer); @@ -2473,6 +2604,7 @@ heap_multi_insert(Relation relation, TupleTableSlot **slots, int ntuples, tuphdr->t_infomask2 = heaptup->t_data->t_infomask2; tuphdr->t_infomask = heaptup->t_data->t_infomask; + tuphdr->t_cid = HeapTupleHeaderGetRawCommandId(heaptup->t_data); tuphdr->t_hoff = heaptup->t_data->t_hoff; /* write bitmap [+ padding] [+ oid] + data */ @@ -2985,7 +3117,7 @@ heap_delete(Relation relation, ItemPointer tid, tp.t_data->t_infomask2); xlrec.offnum = ItemPointerGetOffsetNumber(&tp.t_self); xlrec.xmax = new_xmax; - + xlrec.t_cid = HeapTupleHeaderGetRawCommandId(tp.t_data); if (old_key_tuple != NULL) { if (relation->rd_rel->relreplident == REPLICA_IDENTITY_FULL) @@ -3006,6 +3138,7 @@ heap_delete(Relation relation, ItemPointer tid, { xlhdr.t_infomask2 = old_key_tuple->t_data->t_infomask2; xlhdr.t_infomask = old_key_tuple->t_data->t_infomask; + xlhdr.t_cid = HeapTupleHeaderGetRawCommandId(old_key_tuple->t_data); xlhdr.t_hoff = old_key_tuple->t_data->t_hoff; XLogRegisterData((char *) &xlhdr, SizeOfHeapHeader); @@ -3713,6 +3846,7 @@ heap_update(Relation relation, ItemPointer otid, HeapTuple newtup, oldtup.t_data->t_infomask2); xlrec.flags = cleared_all_frozen ? XLH_LOCK_ALL_FROZEN_CLEARED : 0; + xlrec.t_cid = HeapTupleHeaderGetRawCommandId(oldtup.t_data); XLogRegisterData((char *) &xlrec, SizeOfHeapLock); recptr = XLogInsert(RM_HEAP_ID, XLOG_HEAP_LOCK); PageSetLSN(page, recptr); @@ -4900,6 +5034,7 @@ heap_lock_tuple(Relation relation, HeapTuple tuple, xlrec.infobits_set = compute_infobits(new_infomask, tuple->t_data->t_infomask2); xlrec.flags = cleared_all_frozen ? 
 			XLH_LOCK_ALL_FROZEN_CLEARED : 0;
+		xlrec.t_cid = HeapTupleHeaderGetRawCommandId(tuple->t_data);
 		XLogRegisterData((char *) &xlrec, SizeOfHeapLock);
 
 		/* we don't decode row locks atm, so no need to log the origin */
@@ -5820,6 +5955,7 @@ heap_finish_speculative(Relation relation, ItemPointer tid)
 
 	END_CRIT_SECTION();
 
+	ReleaseBuffer(buffer); /* NEON: release buffer pinned by heap_insert */
 	UnlockReleaseBuffer(buffer);
 }
 
@@ -5892,6 +6028,16 @@ heap_abort_speculative(Relation relation, ItemPointer tid)
 		elog(ERROR, "attempted to kill a non-speculative tuple");
 	Assert(!HeapTupleHeaderIsHeapOnly(tp.t_data));
 
+	/*
+	 * NEON: release buffer pinned by heap_insert
+	 *
+	 * This function is also used on the toast tuples of an aborted speculative
+	 * insertion. For those, there is no token on the tuple, and we didn't keep
+	 * the pin.
+	 */
+	if (HeapTupleHeaderIsSpeculative(tp.t_data))
+		ReleaseBuffer(buffer);
+
 	/*
 	 * No need to check for serializable conflicts here. There is never a
 	 * need for a combo CID, either. No need to extract replica identity, or
@@ -5949,6 +6095,7 @@ heap_abort_speculative(Relation relation, ItemPointer tid)
 										  tp.t_data->t_infomask2);
 	xlrec.offnum = ItemPointerGetOffsetNumber(&tp.t_self);
 	xlrec.xmax = xid;
+	xlrec.t_cid = HeapTupleHeaderGetRawCommandId(tp.t_data);
 
 	XLogBeginInsert();
 	XLogRegisterData((char *) &xlrec, SizeOfHeapDelete);
@@ -8350,7 +8497,7 @@ log_heap_update(Relation reln, Buffer oldbuf,
 	/* Prepare WAL data for the new page */
 	xlrec.new_offnum = ItemPointerGetOffsetNumber(&newtup->t_self);
 	xlrec.new_xmax = HeapTupleHeaderGetRawXmax(newtup->t_data);
-
+	xlrec.t_cid = HeapTupleHeaderGetRawCommandId(newtup->t_data);
 	bufflags = REGBUF_STANDARD;
 	if (init)
 		bufflags |= REGBUF_WILL_INIT;
@@ -8387,6 +8534,7 @@ log_heap_update(Relation reln, Buffer oldbuf,
 		xlhdr.t_infomask2 = newtup->t_data->t_infomask2;
 		xlhdr.t_infomask = newtup->t_data->t_infomask;
 		xlhdr.t_hoff = newtup->t_data->t_hoff;
+		xlhdr.t_cid = HeapTupleHeaderGetRawCommandId(newtup->t_data);
 
 		Assert(SizeofHeapTupleHeader + prefixlen + suffixlen <= newtup->t_len);
 
 		/*
@@ -8428,6 +8576,7 @@ log_heap_update(Relation reln, Buffer oldbuf,
 		xlhdr_idx.t_infomask2 = old_key_tuple->t_data->t_infomask2;
 		xlhdr_idx.t_infomask = old_key_tuple->t_data->t_infomask;
 		xlhdr_idx.t_hoff = old_key_tuple->t_data->t_hoff;
+		xlhdr_idx.t_cid = HeapTupleHeaderGetRawCommandId(old_key_tuple->t_data);
 
 		XLogRegisterData((char *) &xlhdr_idx, SizeOfHeapHeader);
 
@@ -8837,8 +8986,16 @@ heap_xlog_visible(XLogReaderState *record)
 
 		PageSetAllVisible(page);
 
-		if (XLogHintBitIsNeeded())
-			PageSetLSN(page, lsn);
+		/*
+		 * NEON: despite the comment above, we need to update the page LSN here.
+		 * See the discussion at hackers: https://www.postgresql.org/message-id/flat/039076d4f6cdd871691686361f83cb8a6913a86a.camel%40j-davis.com#101ba42b004f9988e3d54fce26fb3462
+		 * For Neon this assignment is critical, because otherwise the last written LSN
+		 * tracked at the compute doesn't match the page LSN assigned by WAL-redo and,
+		 * as a result, the prefetched page is rejected.
+		 *
+		 * It is fixed upstream in https://github.com/neondatabase/postgres/commit/7bf713dd2d0739fbcd4103971ed69c17ebe677ea
+		 * but until it is merged we still need to carry a patch here.
+ */ + PageSetLSN(page, lsn); MarkBufferDirty(buffer); } @@ -9060,7 +9217,7 @@ heap_xlog_delete(XLogReaderState *record) HeapTupleHeaderSetXmax(htup, xlrec->xmax); else HeapTupleHeaderSetXmin(htup, InvalidTransactionId); - HeapTupleHeaderSetCmax(htup, FirstCommandId, false); + HeapTupleHeaderSetCmax(htup, xlrec->t_cid, false); /* Mark the page as a candidate for pruning */ PageSetPrunable(page, XLogRecGetXid(record)); @@ -9161,7 +9318,7 @@ heap_xlog_insert(XLogReaderState *record) htup->t_infomask = xlhdr.t_infomask; htup->t_hoff = xlhdr.t_hoff; HeapTupleHeaderSetXmin(htup, XLogRecGetXid(record)); - HeapTupleHeaderSetCmin(htup, FirstCommandId); + HeapTupleHeaderSetCmin(htup, xlhdr.t_cid); htup->t_ctid = target_tid; if (PageAddItem(page, (Item) htup, newlen, xlrec->offnum, @@ -9304,7 +9461,7 @@ heap_xlog_multi_insert(XLogReaderState *record) htup->t_infomask = xlhdr->t_infomask; htup->t_hoff = xlhdr->t_hoff; HeapTupleHeaderSetXmin(htup, XLogRecGetXid(record)); - HeapTupleHeaderSetCmin(htup, FirstCommandId); + HeapTupleHeaderSetCmin(htup, xlhdr->t_cid); ItemPointerSetBlockNumber(&htup->t_ctid, blkno); ItemPointerSetOffsetNumber(&htup->t_ctid, offnum); @@ -9444,7 +9601,7 @@ heap_xlog_update(XLogReaderState *record, bool hot_update) fix_infomask_from_infobits(xlrec->old_infobits_set, &htup->t_infomask, &htup->t_infomask2); HeapTupleHeaderSetXmax(htup, xlrec->old_xmax); - HeapTupleHeaderSetCmax(htup, FirstCommandId, false); + HeapTupleHeaderSetCmax(htup, xlrec->t_cid, false); /* Set forward chain link in t_ctid */ htup->t_ctid = newtid; @@ -9577,7 +9734,7 @@ heap_xlog_update(XLogReaderState *record, bool hot_update) htup->t_hoff = xlhdr.t_hoff; HeapTupleHeaderSetXmin(htup, XLogRecGetXid(record)); - HeapTupleHeaderSetCmin(htup, FirstCommandId); + HeapTupleHeaderSetCmin(htup, xlhdr.t_cid); HeapTupleHeaderSetXmax(htup, xlrec->new_xmax); /* Make sure there is no forward chain link in t_ctid */ htup->t_ctid = newtid; @@ -9718,7 +9875,7 @@ heap_xlog_lock(XLogReaderState *record) offnum); } HeapTupleHeaderSetXmax(htup, xlrec->locking_xid); - HeapTupleHeaderSetCmax(htup, FirstCommandId, false); + HeapTupleHeaderSetCmax(htup, xlrec->t_cid, false); PageSetLSN(page, lsn); MarkBufferDirty(buffer); } diff --git a/src/backend/access/heap/heapam_handler.c b/src/backend/access/heap/heapam_handler.c index 21682592478..73aacf4ba9a 100644 --- a/src/backend/access/heap/heapam_handler.c +++ b/src/backend/access/heap/heapam_handler.c @@ -626,7 +626,7 @@ heapam_relation_copy_data(Relation rel, const RelFileNode *newrnode) { SMgrRelation dstrel; - dstrel = smgropen(*newrnode, rel->rd_backend); + dstrel = smgropen(*newrnode, rel->rd_backend, rel->rd_rel->relpersistence); /* * Since we copy the file directly without looking at the shared buffers, diff --git a/src/backend/access/heap/rewriteheap.c b/src/backend/access/heap/rewriteheap.c index 2a53826736e..5e743f85363 100644 --- a/src/backend/access/heap/rewriteheap.c +++ b/src/backend/access/heap/rewriteheap.c @@ -117,6 +117,7 @@ #include "miscadmin.h" #include "pgstat.h" #include "replication/logical.h" +#include "replication/message.h" #include "replication/slot.h" #include "storage/bufmgr.h" #include "storage/fd.h" @@ -785,6 +786,36 @@ raw_heap_insert(RewriteState state, HeapTuple tup) * ------------------------------------------------------------------------ */ +/* + * NEON: we need to persist mapping file in WAL + */ +static void +wallog_mapping_file(char const* path, int fd) +{ + char prefix[MAXPGPATH]; + snprintf(prefix, sizeof(prefix), "neon-file:%s", path); + if 
(fd < 0) + { + elog(DEBUG1, "neon: deleting contents of rewrite file %s", path); + /* unlink file */ + LogLogicalMessage(prefix, NULL, 0, false); + } + else + { + off_t size = lseek(fd, 0, SEEK_END); + char* buf; + elog(DEBUG1, "neon: writing contents of rewrite file %s, size %ld", path, (long)size); + if (size < 0) + elog(ERROR, "Failed to get size of mapping file: %m"); + buf = palloc((size_t)size); + lseek(fd, 0, SEEK_SET); + if (read(fd, buf, (size_t)size) != size) + elog(ERROR, "Failed to read mapping file: %m"); + LogLogicalMessage(prefix, buf, (size_t)size, false); + pfree(buf); + } +} + /* * Do preparations for logging logical mappings during a rewrite if * necessary. If we detect that we don't need to log anything we'll prevent @@ -920,6 +951,7 @@ logical_heap_rewrite_flush_mappings(RewriteState state) errmsg("could not write to file \"%s\", wrote %d of %d: %m", src->path, written, len))); src->off += len; + wallog_mapping_file(src->path, FileGetRawDesc(src->vfd)); XLogBeginInsert(); XLogRegisterData((char *) (&xlrec), sizeof(xlrec)); @@ -1006,7 +1038,7 @@ logical_rewrite_log_mapping(RewriteState state, TransactionId xid, src->off = 0; memcpy(src->path, path, sizeof(path)); src->vfd = PathNameOpenFile(path, - O_CREAT | O_EXCL | O_WRONLY | PG_BINARY); + O_CREAT | O_EXCL | O_RDWR | PG_BINARY); if (src->vfd < 0) ereport(ERROR, (errcode_for_file_access(), @@ -1172,6 +1204,8 @@ heap_xlog_logical_rewrite(XLogReaderState *r) errmsg("could not fsync file \"%s\": %m", path))); pgstat_report_wait_end(); + wallog_mapping_file(path, fd); + if (CloseTransientFile(fd) != 0) ereport(ERROR, (errcode_for_file_access(), @@ -1247,6 +1281,7 @@ CheckPointLogicalRewriteHeap(void) ereport(ERROR, (errcode_for_file_access(), errmsg("could not remove file \"%s\": %m", path))); + wallog_mapping_file(path, -1); } else { @@ -1275,6 +1310,8 @@ CheckPointLogicalRewriteHeap(void) errmsg("could not fsync file \"%s\": %m", path))); pgstat_report_wait_end(); + wallog_mapping_file(path, fd); + if (CloseTransientFile(fd) != 0) ereport(ERROR, (errcode_for_file_access(), diff --git a/src/backend/access/heap/vacuumlazy.c b/src/backend/access/heap/vacuumlazy.c index b802ed247e7..86a24a630d7 100644 --- a/src/backend/access/heap/vacuumlazy.c +++ b/src/backend/access/heap/vacuumlazy.c @@ -52,6 +52,7 @@ #include "commands/vacuum.h" #include "executor/instrument.h" #include "miscadmin.h" +#include "optimizer/cost.h" #include "optimizer/paths.h" #include "pgstat.h" #include "portability/instr_time.h" @@ -64,6 +65,7 @@ #include "utils/memutils.h" #include "utils/pg_rusage.h" #include "utils/timestamp.h" +#include "utils/spccache.h" /* @@ -144,6 +146,9 @@ typedef struct LVRelState Relation *indrels; int nindexes; + /* prefetch */ + int io_concurrency; + /* Aggressive VACUUM? (must set relfrozenxid >= FreezeLimit) */ bool aggressive; /* Use visibility map to skip? 
(disabled by DISABLE_PAGE_SKIPPING) */ @@ -416,6 +421,8 @@ heap_vacuum_rel(Relation rel, VacuumParams *params, /* Set up high level stuff about rel and its indexes */ vacrel->rel = rel; + vacrel->io_concurrency = + get_tablespace_maintenance_io_concurrency(rel->rd_rel->reltablespace); vac_open_indexes(vacrel->rel, RowExclusiveLock, &vacrel->nindexes, &vacrel->indrels); if (instrument && vacrel->nindexes > 0) @@ -847,6 +854,7 @@ lazy_scan_heap(LVRelState *vacrel) BlockNumber rel_pages = vacrel->rel_pages, blkno, next_unskippable_block, + next_prefetch_block, next_failsafe_block = 0, next_fsm_block_to_vacuum = 0; VacDeadItems *dead_items = vacrel->dead_items; @@ -870,6 +878,7 @@ lazy_scan_heap(LVRelState *vacrel) next_unskippable_block = lazy_scan_skip(vacrel, &vmbuffer, 0, &next_unskippable_allvis, &skipping_current_range); + next_prefetch_block = 0; for (blkno = 0; blkno < rel_pages; blkno++) { Buffer buf; @@ -972,6 +981,33 @@ lazy_scan_heap(LVRelState *vacrel) */ visibilitymap_pin(vacrel->rel, blkno, &vmbuffer); + if (vacrel->io_concurrency > 0) + { + /* + * Prefetch io_concurrency blocks ahead + */ + uint32 prefetch_budget = vacrel->io_concurrency; + + /* never trail behind the current scan */ + if (next_prefetch_block < blkno) + next_prefetch_block = blkno; + + /* but only up to the end of the relation */ + if (prefetch_budget > rel_pages - next_prefetch_block) + prefetch_budget = rel_pages - next_prefetch_block; + + /* And only up to io_concurrency ahead of the current vacuum scan */ + if (next_prefetch_block + prefetch_budget > blkno + vacrel->io_concurrency) + prefetch_budget = blkno + vacrel->io_concurrency - next_prefetch_block; + + /* And only up to the next unskippable block */ + if (next_prefetch_block + prefetch_budget > next_unskippable_block) + prefetch_budget = next_unskippable_block - next_prefetch_block; + + for (; prefetch_budget-- > 0; next_prefetch_block++) + PrefetchBuffer(vacrel->rel, MAIN_FORKNUM, next_prefetch_block); + } + /* Finished preparatory checks. Actually scan the page. */ buf = ReadBufferExtended(vacrel->rel, MAIN_FORKNUM, blkno, RBM_NORMAL, vacrel->bstrategy); @@ -2395,7 +2431,8 @@ lazy_vacuum_all_indexes(LVRelState *vacrel) static void lazy_vacuum_heap_rel(LVRelState *vacrel) { - int index; + int index, + pindex; BlockNumber vacuumed_pages; Buffer vmbuffer = InvalidBuffer; LVSavedErrInfo saved_err_info; @@ -2416,6 +2453,7 @@ lazy_vacuum_heap_rel(LVRelState *vacrel) vacuumed_pages = 0; index = 0; + pindex = 0; while (index < vacrel->dead_items->num_items) { BlockNumber tblk; @@ -2426,6 +2464,48 @@ lazy_vacuum_heap_rel(LVRelState *vacrel) vacuum_delay_point(); tblk = ItemPointerGetBlockNumber(&vacrel->dead_items->items[index]); + + if (vacrel->io_concurrency > 0) + { + /* + * If we're just starting out, prefetch N consecutive blocks. 
+			 * Otherwise, prefetch just the next block.
+			 */
+			if (pindex == 0)
+			{
+				int			prefetch_budget = Min(vacrel->dead_items->num_items,
+												  Min(vacrel->rel_pages,
+													  vacrel->io_concurrency));
+				BlockNumber prev_prefetch = ItemPointerGetBlockNumber(&vacrel->dead_items->items[pindex]);
+
+				PrefetchBuffer(vacrel->rel, MAIN_FORKNUM, prev_prefetch);
+
+				while (++pindex < vacrel->dead_items->num_items &&
+					   prefetch_budget > 0)
+				{
+					ItemPointer ptr = &vacrel->dead_items->items[pindex];
+
+					if (ItemPointerGetBlockNumber(ptr) != prev_prefetch)
+					{
+						prev_prefetch = ItemPointerGetBlockNumber(ptr);
+						prefetch_budget -= 1;
+						PrefetchBuffer(vacrel->rel, MAIN_FORKNUM, prev_prefetch);
+					}
+				}
+			}
+			else if (pindex < vacrel->dead_items->num_items)
+			{
+				BlockNumber previous = ItemPointerGetBlockNumber(&vacrel->dead_items->items[pindex]);
+
+				while (++pindex < vacrel->dead_items->num_items)
+				{
+					BlockNumber toPrefetch = ItemPointerGetBlockNumber(&vacrel->dead_items->items[pindex]);
+
+					if (previous != toPrefetch)
+					{
+						PrefetchBuffer(vacrel->rel, MAIN_FORKNUM, toPrefetch);
+						break;
+					}
+				}
+			}
+		}
+
 		vacrel->blkno = tblk;
 		buf = ReadBufferExtended(vacrel->rel, MAIN_FORKNUM, tblk, RBM_NORMAL,
 								 vacrel->bstrategy);
diff --git a/src/backend/access/heap/visibilitymap.c b/src/backend/access/heap/visibilitymap.c
index e09f25a684c..669a65b04fc 100644
--- a/src/backend/access/heap/visibilitymap.c
+++ b/src/backend/access/heap/visibilitymap.c
@@ -290,7 +290,9 @@ visibilitymap_set(Relation rel, BlockNumber heapBlk, Buffer heapBuf,
 		 * If data checksums are enabled (or wal_log_hints=on), we
 		 * need to protect the heap page from being torn.
 		 */
+		/* NEON: we have to update the page LSN even if wal_log_hints=off
 		if (XLogHintBitIsNeeded())
+		*/
 		{
 			Page		heapPage = BufferGetPage(heapBuf);
 
@@ -655,9 +657,18 @@ vm_extend(Relation rel, BlockNumber vm_nblocks)
 	/* Now extend the file */
 	while (vm_nblocks_now < vm_nblocks)
 	{
-		PageSetChecksumInplace((Page) pg.data, vm_nblocks_now);
-
-		smgrextend(reln, VISIBILITYMAP_FORKNUM, vm_nblocks_now, pg.data, false);
+		/*
+		 * ZENITH: Initialize VM pages through the buffer cache to prevent
+		 * loading them from the pageserver.
+		 */
+		Buffer		buffer = ReadBufferExtended(rel, VISIBILITYMAP_FORKNUM, P_NEW,
+												RBM_ZERO_AND_LOCK, NULL);
+		Page		page = BufferGetPage(buffer);
+
+		PageInit((Page) page, BLCKSZ, 0);
+		PageSetChecksumInplace(page, vm_nblocks_now);
+		MarkBufferDirty(buffer);
+		UnlockReleaseBuffer(buffer);
 
 		vm_nblocks_now++;
 	}
diff --git a/src/backend/access/nbtree/README b/src/backend/access/nbtree/README
index 5529afc1fed..5cd7578e27f 100644
--- a/src/backend/access/nbtree/README
+++ b/src/backend/access/nbtree/README
@@ -1081,3 +1081,47 @@ item is irrelevant, and need not be stored at all.  This arrangement
 corresponds to the fact that an L&Y non-leaf page has one more pointer
 than key.  Suffix truncation's negative infinity attributes behave in
 the same way.
+
+Notes About Index Scan Prefetch
+-------------------------------
+
+Prefetch can significantly improve the speed of OLAP queries. To be
+able to perform prefetch, we need to know which pages will be accessed
+during the scan. This is trivial for heap and bitmap scans, but
+requires more effort for index scans: to implement prefetch for index
+scans, we need to find out the subsequent leaf pages.
+
+Postgres links all pages at the same level of the B-Tree in a doubly
+linked list and uses this list for forward and backward iteration.
+This list, however, cannot trivially be used for prefetching, because
+locating the next page requires the current page to be loaded first.
+To prefetch more than just the next page, we can use the parent page's
+downlinks instead, as they reference most of the target page's
+siblings.
+
+Because Postgres' nbtree pages have no reference to their parent page,
+we need to remember the parent page when descending the btree and use
+it to prefetch subsequent pages. We use the parent level's own sibling
+links to keep this prefetch scheme going past the key range of the
+current parent page.
+
+We should prefetch not only leaf pages, but also the next parent page.
+The trick is to issue that prefetch request at the right moment: not
+once prefetch requests for all children of the current parent page
+have been issued, but as soon as only effective_io_concurrency line
+pointers are left to prefetch from the page.
+
+Currently there are two different prefetch implementations, one for
+index-only scans and one for plain index scans. An index-only scan
+doesn't need to access heap tuples, so it prefetches only B-Tree leaf
+pages (and their parents). Prefetch for index-only scans is performed
+only if no parallel plan is used: a parallel index scan obtains the
+next page inside a critical section, and the leaf page is loaded
+within that critical section, so if most of the time is spent loading
+pages, this eliminates any concurrency and makes prefetch useless.
+For relatively small tables Postgres will not choose a parallel plan
+anyway, and for large tables a serial plan can be enforced by setting
+max_parallel_workers_per_gather=0.
+
+Prefetch for a normal (not index-only) index scan targets the heap
+tuples referenced from the current leaf page. The average number of
+items per page is about 100, which is comparable with the default
+value of effective_io_concurrency, so there is little point in also
+prefetching the next leaf page.
+
+Because it is difficult to estimate the number of entries an index
+scan will traverse, we prefer not to prefetch a large number of pages
+from the very beginning; such useless prefetch can reduce the
+performance of point lookups. Instead, we start with the smallest
+prefetch distance and increase it by INCREASE_PREFETCH_DISTANCE_STEP
+after each processed item, until it reaches effective_io_concurrency.
+For an index-only scan the prefetch distance is increased after
+processing each leaf page, and for an index scan after processing each
+tuple. The only exception is the case when no key bounds are
+specified: then we traverse the whole relation, and it makes sense to
+start with the largest possible prefetch distance from the very
+beginning.
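+
+The distance ramp-up can be summarized with the following sketch (an
+illustrative restatement of the logic in _bt_first()/_bt_next(), not
+the code itself; "distance" stands for so->current_prefetch_distance):
+
+    distance = (no key bounds) ? prefetch_maximum : 0;
+    for each item processed by the scan:
+        if (distance + INCREASE_PREFETCH_DISTANCE_STEP <= prefetch_maximum)
+            distance += INCREASE_PREFETCH_DISTANCE_STEP;
+        else
+            distance = prefetch_maximum;
+        issue prefetch requests until "distance" of them are in flight;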
diff --git a/src/backend/access/nbtree/nbtinsert.c b/src/backend/access/nbtree/nbtinsert.c index f6f4af8bfe3..6bb34d2f4f7 100644 --- a/src/backend/access/nbtree/nbtinsert.c +++ b/src/backend/access/nbtree/nbtinsert.c @@ -2157,7 +2157,7 @@ _bt_insert_parent(Relation rel, BlockNumberIsValid(RelationGetTargetBlock(rel)))); /* Find the leftmost page at the next level up */ - pbuf = _bt_get_endpoint(rel, opaque->btpo_level + 1, false, NULL); + pbuf = _bt_get_endpoint(rel, opaque->btpo_level + 1, false, NULL, NULL); /* Set up a phony stack entry pointing there */ stack = &fakestack; stack->bts_blkno = BufferGetBlockNumber(pbuf); diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c index 1419476d704..11e346af508 100644 --- a/src/backend/access/nbtree/nbtree.c +++ b/src/backend/access/nbtree/nbtree.c @@ -37,6 +37,7 @@ #include "utils/builtins.h" #include "utils/index_selfuncs.h" #include "utils/memutils.h" +#include "utils/spccache.h" /* @@ -367,6 +368,7 @@ btbeginscan(Relation rel, int nkeys, int norderbys) so->killedItems = NULL; /* until needed */ so->numKilled = 0; + so->prefetch_maximum = 0; /* disable prefetch */ /* * We don't know yet whether the scan will be index-only, so we do not @@ -908,6 +910,8 @@ btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, BTVacState vstate; BlockNumber num_pages; BlockNumber scanblkno; + BlockNumber prefetch_blkno; + int io_concurrency; bool needLock; /* @@ -947,6 +951,9 @@ btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, vstate.maxbufsize = 0; vstate.pendingpages = NULL; vstate.npendingpages = 0; + + io_concurrency = get_tablespace_maintenance_io_concurrency(rel->rd_rel->reltablespace); + /* Consider applying _bt_pendingfsm_finalize optimization */ _bt_pendingfsm_init(rel, &vstate, (callback == NULL)); @@ -975,6 +982,8 @@ btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, needLock = !RELATION_IS_LOCAL(rel); scanblkno = BTREE_METAPAGE + 1; + prefetch_blkno = scanblkno; + for (;;) { /* Get the current relation length */ @@ -991,9 +1000,19 @@ btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, /* Quit if we've scanned the whole relation */ if (scanblkno >= num_pages) break; + + if (prefetch_blkno < scanblkno) + prefetch_blkno = scanblkno; + for (; prefetch_blkno < num_pages && + prefetch_blkno < scanblkno + io_concurrency; prefetch_blkno++) + PrefetchBuffer(rel, MAIN_FORKNUM, prefetch_blkno); + /* Iterate over pages, then loop back to recheck length */ for (; scanblkno < num_pages; scanblkno++) { + if (io_concurrency > 0 && prefetch_blkno < num_pages) + PrefetchBuffer(rel, MAIN_FORKNUM, prefetch_blkno++); + btvacuumpage(&vstate, scanblkno); if (info->report_progress) pgstat_progress_update_param(PROGRESS_SCAN_BLOCKS_DONE, diff --git a/src/backend/access/nbtree/nbtsearch.c b/src/backend/access/nbtree/nbtsearch.c index baab42a9da4..a35d68e395a 100644 --- a/src/backend/access/nbtree/nbtsearch.c +++ b/src/backend/access/nbtree/nbtsearch.c @@ -18,12 +18,14 @@ #include "access/nbtree.h" #include "access/relscan.h" #include "access/xact.h" +#include "catalog/catalog.h" #include "miscadmin.h" +#include "optimizer/cost.h" #include "pgstat.h" #include "storage/predicate.h" #include "utils/lsyscache.h" #include "utils/rel.h" - +#include "utils/spccache.h" static void _bt_drop_lock_and_maybe_pin(IndexScanDesc scan, BTScanPos sp); static OffsetNumber _bt_binsrch(Relation rel, BTScanInsert key, Buffer buf); @@ -47,6 +49,7 @@ static Buffer _bt_walk_left(Relation rel, Buffer buf, 
 							Snapshot snapshot);
 static bool _bt_endpoint(IndexScanDesc scan, ScanDirection dir);
 static inline void _bt_initialize_more_data(BTScanOpaque so, ScanDirection dir);
 
+#define INCREASE_PREFETCH_DISTANCE_STEP 1
 
 /*
  * _bt_drop_lock_and_maybe_pin()
@@ -837,6 +840,70 @@ _bt_compare(Relation rel,
 	return 0;
 }
 
+
+/*
+ * _bt_read_parent_for_prefetch - read the parent page and extract references
+ * to children for prefetch. This function returns the offset of the first item.
+ */
+static int
+_bt_read_parent_for_prefetch(IndexScanDesc scan, BlockNumber parent, ScanDirection dir)
+{
+	Relation	rel = scan->indexRelation;
+	BTScanOpaque so = (BTScanOpaque) scan->opaque;
+	Buffer		buf;
+	Page		page;
+	BTPageOpaque opaque;
+	OffsetNumber offnum;
+	OffsetNumber n_child;
+	int			next_parent_prefetch_index;
+	int			i, j;
+
+	buf = _bt_getbuf(rel, parent, BT_READ);
+	page = BufferGetPage(buf);
+	opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+	offnum = P_FIRSTDATAKEY(opaque);
+	n_child = PageGetMaxOffsetNumber(page) - offnum + 1;
+
+	/* Position where we should insert the prefetch of the next parent page:
+	 * we intentionally use prefetch_maximum here instead of
+	 * current_prefetch_distance, assuming that it will reach prefetch_maximum
+	 * before we reach the end of the parent page
+	 */
+	next_parent_prefetch_index = (n_child > so->prefetch_maximum)
+		? n_child - so->prefetch_maximum : 0;
+
+	if (ScanDirectionIsForward(dir))
+	{
+		so->next_parent = opaque->btpo_next;
+		if (so->next_parent == P_NONE)
+			next_parent_prefetch_index = -1;
+		for (i = 0, j = 0; i < n_child; i++)
+		{
+			ItemId		itemid = PageGetItemId(page, offnum + i);
+			IndexTuple	itup = (IndexTuple) PageGetItem(page, itemid);
+
+			if (i == next_parent_prefetch_index)
+				so->prefetch_blocks[j++] = so->next_parent; /* time to prefetch the next parent page */
+			so->prefetch_blocks[j++] = BTreeTupleGetDownLink(itup);
+		}
+	}
+	else
+	{
+		so->next_parent = opaque->btpo_prev;
+		if (so->next_parent == P_NONE)
+			next_parent_prefetch_index = -1;
+		for (i = 0, j = 0; i < n_child; i++)
+		{
+			ItemId		itemid = PageGetItemId(page, offnum + n_child - i - 1);
+			IndexTuple	itup = (IndexTuple) PageGetItem(page, itemid);
+
+			if (i == next_parent_prefetch_index)
+				so->prefetch_blocks[j++] = so->next_parent; /* time to prefetch the next parent page */
+			so->prefetch_blocks[j++] = BTreeTupleGetDownLink(itup);
+		}
+	}
+	so->n_prefetch_blocks = j;
+	so->last_prefetch_index = 0;
+	_bt_relbuf(rel, buf);
+	return offnum;
+}
+
 /*
  * _bt_first() -- Find the first item in a scan.
  *
@@ -1096,6 +1163,37 @@ _bt_first(IndexScanDesc scan, ScanDirection dir)
 		}
 	}
 
+	/* Neon: initialize prefetch */
+	so->n_prefetch_requests = 0;
+	so->n_prefetch_blocks = 0;
+	so->last_prefetch_index = 0;
+	so->next_parent = P_NONE;
+	so->prefetch_maximum = IsCatalogRelation(rel)
+		? effective_io_concurrency
+		: get_tablespace_io_concurrency(rel->rd_rel->reltablespace);
+
+	if (scan->xs_want_itup) /* index-only scan */
+	{
+		if (enable_indexonlyscan_prefetch)
+		{
+			/* We disable prefetch for parallel index-only scans.
+			 * Neon prefetch is efficient only if the prefetched blocks are
+			 * accessed by the same worker that issued the prefetch request;
+			 * the way pages are split between parallel workers in an index
+			 * scan cannot satisfy this requirement.
+			 * Also, prefetch of leaf pages is useless if the expected number
+			 * of rows fits in one page.
+			 */
+			if (scan->parallel_scan)
+				so->prefetch_maximum = 0; /* disable prefetch */
+		}
+		else
+			so->prefetch_maximum = 0; /* disable prefetch */
+	}
+	else if (!enable_indexscan_prefetch || !scan->heapRelation)
+		so->prefetch_maximum = 0; /* disable prefetch */
+
+	/*
+	 * If key bounds are not specified, then we will scan the whole relation
+	 * and it makes sense to start with the largest possible prefetch distance.
+	 */
+	so->current_prefetch_distance = (keysCount == 0) ? so->prefetch_maximum : 0;
+
 	/*
 	 * If we found no usable boundary keys, we have to start from one end of
 	 * the tree.  Walk down that edge to the first or last key, and scan from
@@ -1366,6 +1464,21 @@ _bt_first(IndexScanDesc scan, ScanDirection dir)
 	 */
 	stack = _bt_search(rel, &inskey, &buf, BT_READ, scan->xs_snapshot);
 
+	/* Start prefetching for index-only scan */
+	if (so->prefetch_maximum > 0 && stack != NULL && scan->xs_want_itup) /* index-only scan */
+	{
+		int			first_offset = _bt_read_parent_for_prefetch(scan, stack->bts_blkno, dir);
+		int			skip = ScanDirectionIsForward(dir)
+			? stack->bts_offset - first_offset
+			: first_offset + so->n_prefetch_blocks - 1 - stack->bts_offset;
+
+		Assert(so->n_prefetch_blocks >= skip);
+		so->current_prefetch_distance = INCREASE_PREFETCH_DISTANCE_STEP;
+		so->n_prefetch_requests = Min(so->current_prefetch_distance, so->n_prefetch_blocks - skip);
+		so->last_prefetch_index = skip + so->n_prefetch_requests;
+		for (int i = skip; i < so->last_prefetch_index; i++)
+			PrefetchBuffer(rel, MAIN_FORKNUM, so->prefetch_blocks[i]);
+	}
+
 	/* don't need to keep the stack around... */
 	_bt_freestack(stack);
 
@@ -1505,9 +1618,63 @@ _bt_next(IndexScanDesc scan, ScanDirection dir)
 	/* OK, itemIndex says what to return */
 	currItem = &so->currPos.items[so->currPos.itemIndex];
 	scan->xs_heaptid = currItem->heapTid;
-	if (scan->xs_want_itup)
+	if (scan->xs_want_itup) /* index-only scan */
+	{
 		scan->xs_itup = (IndexTuple) (so->currTuples + currItem->tupleOffset);
+	}
+	else if (so->prefetch_maximum > 0)
+	{
+		int			prefetchLimit, prefetchDistance;
+
+		/* Neon: prefetch referenced heap pages.
+		 * Because it is difficult to predict how many items an index scan
+		 * will return, we do not want to prefetch many heap pages from the
+		 * very beginning, because they may not be needed. So we increase the
+		 * prefetch distance by INCREASE_PREFETCH_DISTANCE_STEP at each index
+		 * scan iteration until it reaches prefetch_maximum.
+		 */
+
+		/* Advance prefetch distance until it reaches prefetch_maximum */
+		if (so->current_prefetch_distance + INCREASE_PREFETCH_DISTANCE_STEP <= so->prefetch_maximum)
+			so->current_prefetch_distance += INCREASE_PREFETCH_DISTANCE_STEP;
+		else
+			so->current_prefetch_distance = so->prefetch_maximum;
+
+		/* How many pages we can prefetch */
+		prefetchLimit = Min(so->current_prefetch_distance, so->currPos.lastItem - so->currPos.firstItem + 1);
+
+		/* Active prefetch requests */
+		prefetchDistance = so->n_prefetch_requests;
+
+		/*
+		 * Consume one prefetch request (if any)
+		 */
+		if (prefetchDistance != 0)
+			prefetchDistance -= 1;
+
+		/* Keep the number of active prefetch requests equal to the current prefetch distance.
+		 * When the prefetch distance reaches prefetch_maximum, this loop
+		 * performs at most one iteration, but at the beginning of an index
+		 * scan it performs up to INCREASE_PREFETCH_DISTANCE_STEP+1 iterations.
+		 */
+		if (ScanDirectionIsForward(dir))
+		{
+			while (prefetchDistance < prefetchLimit && so->currPos.itemIndex + prefetchDistance <= so->currPos.lastItem)
+			{
+				BlockNumber blkno = BlockIdGetBlockNumber(&so->currPos.items[so->currPos.itemIndex + prefetchDistance].heapTid.ip_blkid);
+				PrefetchBuffer(scan->heapRelation, MAIN_FORKNUM, blkno);
+				prefetchDistance += 1;
+			}
+		}
+		else
+		{
+			while (prefetchDistance < prefetchLimit && so->currPos.itemIndex - prefetchDistance >= so->currPos.firstItem)
+			{
+				BlockNumber blkno = BlockIdGetBlockNumber(&so->currPos.items[so->currPos.itemIndex - prefetchDistance].heapTid.ip_blkid);
+				PrefetchBuffer(scan->heapRelation, MAIN_FORKNUM, blkno);
+				prefetchDistance += 1;
+			}
+		}
+		so->n_prefetch_requests = prefetchDistance; /* update the number of active prefetch requests */
+	}
 
 	return true;
 }
@@ -1914,6 +2081,30 @@ _bt_steppage(IndexScanDesc scan, ScanDirection dir)
 		so->markItemIndex = -1;
 	}
 
+	if (scan->xs_want_itup && so->prefetch_maximum > 0) /* prefetching of leaf pages for index-only scan */
+	{
+		/* Advance prefetch distance until it reaches prefetch_maximum */
+		if (so->current_prefetch_distance + INCREASE_PREFETCH_DISTANCE_STEP <= so->prefetch_maximum)
+			so->current_prefetch_distance += INCREASE_PREFETCH_DISTANCE_STEP;
+
+		so->n_prefetch_requests -= 1; /* we are about to load the next leaf page, so decrement the number of active prefetch requests */
+
+		/* Check if there are more children to prefetch on the current parent page */
+		if (so->last_prefetch_index == so->n_prefetch_blocks && so->next_parent != P_NONE)
+		{
+			/* we have prefetched all items from the current parent page; move to the next parent page */
+			_bt_read_parent_for_prefetch(scan, so->next_parent, dir);
+			so->n_prefetch_requests -= 1; /* loading the parent page consumes one more prefetch request */
+		}
+
+		/* Try to keep the number of active prefetch requests equal to the current prefetch distance */
+		while (so->n_prefetch_requests < so->current_prefetch_distance && so->last_prefetch_index < so->n_prefetch_blocks)
+		{
+			so->n_prefetch_requests += 1;
+			PrefetchBuffer(scan->indexRelation, MAIN_FORKNUM, so->prefetch_blocks[so->last_prefetch_index++]);
+		}
+	}
+
 	if (ScanDirectionIsForward(dir))
 	{
 		/* Walk right to the next page with data */
@@ -2318,6 +2509,7 @@ _bt_walk_left(Relation rel, Buffer buf, Snapshot snapshot)
  */
 Buffer
 _bt_get_endpoint(Relation rel, uint32 level, bool rightmost,
+				 BlockNumber *parent,
 				 Snapshot snapshot)
 {
 	Buffer		buf;
@@ -2326,6 +2518,7 @@ _bt_get_endpoint(Relation rel, uint32 level, bool rightmost,
 	OffsetNumber offnum;
 	BlockNumber blkno;
 	IndexTuple	itup;
+	BlockNumber parent_blocknum = P_NONE;
 
 	/*
 	 * If we are looking for a leaf page, okay to descend from fast root;
@@ -2343,6 +2536,7 @@ _bt_get_endpoint(Relation rel, uint32 level, bool rightmost,
 	page = BufferGetPage(buf);
 	TestForOldSnapshot(snapshot, rel, page);
 	opaque = BTPageGetOpaque(page);
+	blkno = BufferGetBlockNumber(buf);
 
 	for (;;)
 	{
@@ -2381,12 +2575,15 @@ _bt_get_endpoint(Relation rel, uint32 level, bool rightmost,
 			offnum = P_FIRSTDATAKEY(opaque);
 
 		itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
+		parent_blocknum = blkno;
 		blkno = BTreeTupleGetDownLink(itup);
 
 		buf = _bt_relandgetbuf(rel, buf, blkno, BT_READ);
 		page = BufferGetPage(buf);
 		opaque = BTPageGetOpaque(page);
 	}
 
+	if (parent)
+		*parent = parent_blocknum;
 	return buf;
 }
 
@@
-2410,13 +2607,13 @@ _bt_endpoint(IndexScanDesc scan, ScanDirection dir)
 	BTPageOpaque opaque;
 	OffsetNumber start;
 	BTScanPosItem *currItem;
-
+	BlockNumber parent;
 	/*
 	 * Scan down to the leftmost or rightmost leaf page.  This is a simplified
 	 * version of _bt_search().  We don't maintain a stack since we know we
 	 * won't need it.
 	 */
-	buf = _bt_get_endpoint(rel, 0, ScanDirectionIsBackward(dir), scan->xs_snapshot);
+	buf = _bt_get_endpoint(rel, 0, ScanDirectionIsBackward(dir), &parent, scan->xs_snapshot);
 
 	if (!BufferIsValid(buf))
 	{
@@ -2429,6 +2626,15 @@ _bt_endpoint(IndexScanDesc scan, ScanDirection dir)
 		return false;
 	}
 
+	/* Start prefetching for index-only scan */
+	if (so->prefetch_maximum > 0 && parent != P_NONE && scan->xs_want_itup) /* index-only scan */
+	{
+		_bt_read_parent_for_prefetch(scan, parent, dir);
+		so->n_prefetch_requests = so->last_prefetch_index = Min(so->prefetch_maximum, so->n_prefetch_blocks);
+		for (int i = 0; i < so->last_prefetch_index; i++)
+			PrefetchBuffer(rel, MAIN_FORKNUM, so->prefetch_blocks[i]);
+	}
+
 	PredicateLockPage(rel, BufferGetBlockNumber(buf), scan->xs_snapshot);
 	page = BufferGetPage(buf);
 	opaque = BTPageGetOpaque(page);
diff --git a/src/backend/access/spgist/spginsert.c b/src/backend/access/spgist/spginsert.c
index bfb74049d0c..6628958c522 100644
--- a/src/backend/access/spgist/spginsert.c
+++ b/src/backend/access/spgist/spginsert.c
@@ -85,6 +85,8 @@ spgbuild(Relation heap, Relation index, IndexInfo *indexInfo)
 		elog(ERROR, "index \"%s\" already contains data",
 			 RelationGetRelationName(index));
 
+	smgr_start_unlogged_build(index->rd_smgr);
+
 	/*
 	 * Initialize the meta page and root pages
 	 */
@@ -131,6 +133,8 @@ spgbuild(Relation heap, Relation index, IndexInfo *indexInfo)
 
 	SpGistUpdateMetaPage(index);
 
+	smgr_finish_unlogged_build_phase_1(index->rd_smgr);
+
 	/*
 	 * We didn't write WAL records as we built the index, so if WAL-logging is
 	 * required, write all pages to the WAL now.
@@ -140,8 +144,13 @@ spgbuild(Relation heap, Relation index, IndexInfo *indexInfo)
 		log_newpage_range(index, MAIN_FORKNUM,
 						  0, RelationGetNumberOfBlocks(index),
 						  true);
+		SetLastWrittenLSNForBlockRange(XactLastRecEnd, index->rd_smgr->smgr_rnode.node,
+									   MAIN_FORKNUM, 0, RelationGetNumberOfBlocks(index));
+		SetLastWrittenLSNForRelation(XactLastRecEnd, index->rd_smgr->smgr_rnode.node, MAIN_FORKNUM);
 	}
 
+	smgr_end_unlogged_build(index->rd_smgr);
+
 	result = (IndexBuildResult *) palloc0(sizeof(IndexBuildResult));
 	result->heap_tuples = reltuples;
 	result->index_tuples = buildstate.indtuples;
diff --git a/src/backend/access/spgist/spgvacuum.c b/src/backend/access/spgist/spgvacuum.c
index 00496305320..74b8f988e63 100644
--- a/src/backend/access/spgist/spgvacuum.c
+++ b/src/backend/access/spgist/spgvacuum.c
@@ -27,6 +27,7 @@
 #include "storage/indexfsm.h"
 #include "storage/lmgr.h"
 #include "utils/snapmgr.h"
+#include "utils/spccache.h"
 
 
 /* Entry in pending-list of TIDs we need to revisit */
@@ -796,7 +797,14 @@ spgvacuumscan(spgBulkDeleteState *bds)
 	Relation	index = bds->info->index;
 	bool		needLock;
 	BlockNumber num_pages,
-				blkno;
+				blkno,
+				prefetch_blkno;
+	int			io_concurrency;
+
+	/* look up the maintenance_io_concurrency setting for this tablespace */
+	io_concurrency = get_tablespace_maintenance_io_concurrency(
+		index->rd_rel->reltablespace
+	);
 
 	/* Finish setting up spgBulkDeleteState */
 	initSpGistState(&bds->spgstate, index);
@@ -824,6 +832,8 @@ spgvacuumscan(spgBulkDeleteState *bds)
 	 * in btvacuumscan().
 	 */
 	blkno = SPGIST_METAPAGE_BLKNO + 1;
+	prefetch_blkno = blkno;
+
 	for (;;)
 	{
 		/* Get the current relation length */
@@ -836,9 +846,19 @@ spgvacuumscan(spgBulkDeleteState *bds)
 		/* Quit if we've scanned the whole relation */
 		if (blkno >= num_pages)
 			break;
+
+		if (prefetch_blkno < blkno)
+			prefetch_blkno = blkno;
+		for (; prefetch_blkno < num_pages &&
+			 prefetch_blkno < blkno + io_concurrency; prefetch_blkno++)
+			PrefetchBuffer(index, MAIN_FORKNUM, prefetch_blkno);
+
 		/* Iterate over pages, then loop back to recheck length */
 		for (; blkno < num_pages; blkno++)
 		{
+			if (io_concurrency > 0 && prefetch_blkno < num_pages)
+				PrefetchBuffer(index, MAIN_FORKNUM, prefetch_blkno++);
+
 			spgvacuumpage(bds, blkno);
 			/* empty the pending-list after each page */
 			if (bds->pendingList != NULL)
diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
index 59f94b05d4a..57033aad0ba 100644
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -85,6 +85,7 @@
 #include "replication/walreceiver.h"
 #include "replication/walsender.h"
 #include "storage/bufmgr.h"
+#include "storage/buf_internals.h"
 #include "storage/fd.h"
 #include "storage/ipc.h"
 #include "storage/large_object.h"
@@ -136,6 +137,8 @@ int			wal_retrieve_retry_interval = 5000;
 int			max_slot_wal_keep_size_mb = -1;
 int			wal_decode_buffer_size = 512 * 1024;
 bool		track_wal_io_timing = false;
+uint64		predefined_sysidentifier;
+int			lastWrittenLsnCacheSize;
 #ifdef WAL_DEBUG
 bool		XLOG_DEBUG = false;
@@ -198,6 +201,25 @@ const struct config_enum_entry archive_mode_options[] = {
 	{NULL, 0, false}
 };
+typedef struct LastWrittenLsnCacheEntry
+{
+	BufferTag	key;
+	XLogRecPtr	lsn;
+	/* doubly linked list for the LRU replacement algorithm */
+	dlist_node	lru_node;
+} LastWrittenLsnCacheEntry;
+
+
+/*
+ * Cache of the last written LSN for each relation page.
+ * It also provides the request LSN for smgrnblocks and smgrexists: the
+ * pseudokey InvalidBlockId stores the LSN of the last relation metadata
+ * update.
+ * The size of the cache is limited by the GUC variable lastWrittenLsnCacheSize
+ * ("lsn_cache_size"); pages are replaced using an LRU policy based on a doubly
+ * linked list.
+ * Access to this cache is protected by 'LastWrittenLsnLock'.
+ */
+static HTAB *lastWrittenLsnCache;
+
 /*
 * Statistics for current checkpoint are collected in this global struct.
 * Because only the checkpointer or a stand-alone backend can perform
@@ -552,6 +574,25 @@ typedef struct XLogCtlData
 	 */
 	XLogRecPtr	lastFpwDisableRecPtr;
+	/*
+	 * Maximal last written LSN for pages not present in lastWrittenLsnCache
+	 */
+	XLogRecPtr	maxLastWrittenLsn;
+
+	/*
+	 * Doubly linked list implementing the LRU replacement policy for the last
+	 * written LSN cache. Access to this list, as well as to the cache itself,
+	 * is protected by 'LastWrittenLsnLock'.
+	 */
+	dlist_head	lastWrittenLsnLRU;
+
+	/* neon: copy of startup's RedoStartLSN for walproposer's use */
+	XLogRecPtr	RedoStartLSN;
+
+	/*
+	 * Size of a timeline in the zenith pageserver;
+	 * used to enforce the timeline size limit.
+	 */
+	uint64		zenithCurrentClusterSize;
 	slock_t		info_lck;		/* locks shared variables shown above */
 } XLogCtlData;
@@ -634,6 +675,15 @@ static bool holdingAllLocks = false;
 static MemoryContext walDebugCxt = NULL;
 #endif
+
+/*
+ * Variables read from the 'zenith.signal' file.
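+ *
+ * For illustration, the file consists of a single line in one of three forms
+ * (see readZenithSignalFile() below, which parses exactly these):
+ *
+ *     PREV LSN: 0/16B3748     prev LSN of the basebackup; read-write start allowed
+ *     PREV LSN: none          no prev LSN known, but read-write start still allowed
+ *     PREV LSN: invalid       no prev LSN; starting in read-write mode forbidden
+ *
+ * The LSN value "0/16B3748" here is only an example.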
+ */
+bool		ZenithRecoveryRequested = false;
+XLogRecPtr	zenithLastRec = InvalidXLogRecPtr;
+bool		zenithWriteOk = false;
+
+
 static void CleanupAfterArchiveRecovery(TimeLineID EndOfLogTLI,
 										XLogRecPtr EndOfLog,
 										TimeLineID newTLI);
@@ -644,6 +694,7 @@ static void CreateEndOfRecoveryRecord(void);
 static XLogRecPtr CreateOverwriteContrecordRecord(XLogRecPtr aborted_lsn,
 												  XLogRecPtr missingContrecPtr,
 												  TimeLineID newTLI);
+static void PreCheckPointGuts(int flags);
 static void CheckPointGuts(XLogRecPtr checkPointRedo, int flags);
 static void KeepLogSeg(XLogRecPtr recptr, XLogSegNo *logSegNo);
 static XLogRecPtr XLogGetReplicationSlotMinimumLSN(void);
@@ -3295,13 +3346,16 @@ InstallXLogFileSegment(XLogSegNo *segno, char *tmppath,
 	XLogFilePath(path, tli, *segno, wal_segment_size);
-	LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
-	if (!XLogCtl->InstallXLogFileSegmentActive)
+	if (XLogCtl)
 	{
-		LWLockRelease(ControlFileLock);
-		return false;
+		/* Neon: in the case of sync-safekeepers, shared memory is not initialized */
+		LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
+		if (!XLogCtl->InstallXLogFileSegmentActive)
+		{
+			LWLockRelease(ControlFileLock);
+			return false;
+		}
 	}
-
 	if (!find_free)
 	{
 		/* Force installation: get rid of any pre-existing segment file */
@@ -3315,7 +3369,8 @@ InstallXLogFileSegment(XLogSegNo *segno, char *tmppath,
 			if ((*segno) >= max_segno)
 			{
 				/* Failed to find a free slot within specified range */
-				LWLockRelease(ControlFileLock);
+				if (XLogCtl)
+					LWLockRelease(ControlFileLock);
 				return false;
 			}
 			(*segno)++;
@@ -3329,12 +3384,14 @@ InstallXLogFileSegment(XLogSegNo *segno, char *tmppath,
 	 */
 	if (durable_rename_excl(tmppath, path, LOG) != 0)
 	{
-		LWLockRelease(ControlFileLock);
+		if (XLogCtl)
+			LWLockRelease(ControlFileLock);
 		/* durable_rename_excl already emitted log message */
 		return false;
 	}
-	LWLockRelease(ControlFileLock);
+	if (XLogCtl)
+		LWLockRelease(ControlFileLock);
 	return true;
 }
@@ -4330,11 +4387,8 @@ LocalProcessControlFile(bool reset)
 	ReadControlFile();
 }
-/*
- * Initialization of shared memory for XLOG
- */
-Size
-XLOGShmemSize(void)
+static Size
+XLOGCtlShmemSize(void)
 {
 	Size		size;
@@ -4383,6 +4437,16 @@ XLOGShmemSize(void)
 	return size;
 }
+/*
+ * Initialization of shared memory for XLOG
+ */
+Size
+XLOGShmemSize(void)
+{
+	return XLOGCtlShmemSize() +
+		hash_estimate_size(lastWrittenLsnCacheSize, sizeof(LastWrittenLsnCacheEntry));
+}
+
 void
 XLOGShmemInit(void)
 {
@@ -4410,7 +4474,18 @@ XLOGShmemInit(void)
 	XLogCtl = (XLogCtlData *)
-		ShmemInitStruct("XLOG Ctl", XLOGShmemSize(), &foundXLog);
+		ShmemInitStruct("XLOG Ctl", XLOGCtlShmemSize(), &foundXLog);
+
+	if (lastWrittenLsnCacheSize > 0)
+	{
+		static HASHCTL info;
+		info.keysize = sizeof(BufferTag);
+		info.entrysize = sizeof(LastWrittenLsnCacheEntry);
+		lastWrittenLsnCache = ShmemInitHash("last_written_lsn_cache",
+											lastWrittenLsnCacheSize, lastWrittenLsnCacheSize,
+											&info,
+											HASH_ELEM | HASH_BLOBS);
+	}
 	localControlFile = ControlFile;
 	ControlFile = (ControlFileData *)
@@ -4523,9 +4598,16 @@ BootStrapXLOG(void)
 	 * perhaps be useful sometimes.
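 	 *
 	 * For reference, the generated identifier packs the boot time and the
 	 * postmaster PID into a single 64-bit value:
 	 *
 	 *     bits 63..32  tv_sec from gettimeofday()
 	 *     bits 31..12  tv_usec (fits in 20 bits)
 	 *     bits 11..0   low 12 bits of getpid()
 	 *
 	 * The change below allows bootstrap mode to override this with a
 	 * predefined value (see the new -s option in bootstrap.c), so that a
 	 * compute node can be recreated with a known system identifier.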
 	 */
 	gettimeofday(&tv, NULL);
-	sysidentifier = ((uint64) tv.tv_sec) << 32;
-	sysidentifier |= ((uint64) tv.tv_usec) << 12;
-	sysidentifier |= getpid() & 0xFFF;
+	if (predefined_sysidentifier != 0)
+	{
+		sysidentifier = predefined_sysidentifier;
+	}
+	else
+	{
+		sysidentifier = ((uint64) tv.tv_sec) << 32;
+		sysidentifier |= ((uint64) tv.tv_usec) << 12;
+		sysidentifier |= getpid() & 0xFFF;
+	}
 	/* page buffer must be aligned suitably for O_DIRECT */
 	buffer = (char *) palloc(XLOG_BLCKSZ + XLOG_BLCKSZ);
@@ -4879,6 +4961,81 @@ CheckRequiredParameterValues(void)
 	}
 }
+static void
+readZenithSignalFile(void)
+{
+	int			fd;
+
+	fd = BasicOpenFile(ZENITH_SIGNAL_FILE, O_RDONLY | PG_BINARY);
+	if (fd >= 0)
+	{
+		struct stat statbuf;
+		char	   *content;
+		char		prev_lsn_str[20];
+
+		/* Slurp the file into a string */
+		if (stat(ZENITH_SIGNAL_FILE, &statbuf) != 0)
+			ereport(ERROR,
+					(errcode_for_file_access(),
+					 errmsg("could not stat file \"%s\": %m",
+							ZENITH_SIGNAL_FILE)));
+		content = palloc(statbuf.st_size + 1);
+		if (read(fd, content, statbuf.st_size) != statbuf.st_size)
+			ereport(ERROR,
+					(errcode_for_file_access(),
+					 errmsg("could not read file \"%s\": %m",
+							ZENITH_SIGNAL_FILE)));
+		content[statbuf.st_size] = '\0';
+
+		/* Parse it */
+		if (sscanf(content, "PREV LSN: %19s", prev_lsn_str) != 1)
+			ereport(ERROR,
+					(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+					 errmsg("invalid data in file \"%s\"", ZENITH_SIGNAL_FILE)));
+
+		if (strcmp(prev_lsn_str, "invalid") == 0)
+		{
+			/* No prev LSN. Forbid starting up in read-write mode */
+			zenithLastRec = InvalidXLogRecPtr;
+			zenithWriteOk = false;
+		}
+		else if (strcmp(prev_lsn_str, "none") == 0)
+		{
+			/*
+			 * The page server had no valid prev LSN, but assured that it's ok
+			 * to start without it. This happens when you start the compute
+			 * node for the first time on a new branch.
+			 */
+			zenithLastRec = InvalidXLogRecPtr;
+			zenithWriteOk = true;
+		}
+		else
+		{
+			uint32		hi,
+						lo;
+
+			if (sscanf(prev_lsn_str, "%X/%X", &hi, &lo) != 2)
+				ereport(ERROR,
+						(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+						 errmsg("invalid data in file \"%s\"", ZENITH_SIGNAL_FILE)));
+			zenithLastRec = ((uint64) hi) << 32 | lo;
+
+			/* If prev LSN is given, it better be valid */
+			if (zenithLastRec == InvalidXLogRecPtr)
+				ereport(ERROR,
+						(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+						 errmsg("invalid prev-LSN in file \"%s\"", ZENITH_SIGNAL_FILE)));
+			zenithWriteOk = true;
+		}
+		ZenithRecoveryRequested = true;
+		close(fd);
+
+		elog(LOG,
+			 "[ZENITH] found 'zenith.signal' file. setting prev LSN to %X/%X",
+			 LSN_FORMAT_ARGS(zenithLastRec));
+	}
+}
+
 /*
 * This must be called ONCE during postmaster or standalone-backend startup
 */
@@ -4910,10 +5067,15 @@ StartupXLOG(void)
 		   CurrentResourceOwner == AuxProcessResourceOwner);
 	CurrentResourceOwner = AuxProcessResourceOwner;
+	/*
+	 * Read zenith.signal before anything else.
+	 */
+	readZenithSignalFile();
+
 	/*
 	 * Check that contents look valid.
 	 */
-	if (!XRecOffIsValid(ControlFile->checkPoint))
+	if (!XRecOffIsValid(ControlFile->checkPoint) && !ZenithRecoveryRequested)
 		ereport(FATAL,
 				(errmsg("control file contains invalid checkpoint location")));
@@ -5142,6 +5304,14 @@ StartupXLOG(void)
 	RedoRecPtr = XLogCtl->RedoRecPtr = XLogCtl->Insert.RedoRecPtr = checkPoint.redo;
 	doPageWrites = lastFullPageWrites;
+	/*
+	 * Set up the last written LSN cache and the max written LSN.
+	 * Starting from here, we could be modifying pages through REDO, which
+	 * requires the existence of maxLwLsn and the LwLsn LRU.
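+	 *
+	 * Initializing maxLastWrittenLsn to RedoRecPtr makes the cache
+	 * conservative from the start: a page that has no cache entry yet
+	 * reports the redo pointer as its last-written LSN, which is always a
+	 * safe (if pessimistic) LSN to use when requesting the page.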
+	 */
+	XLogCtl->maxLastWrittenLsn = RedoRecPtr;
+	dlist_init(&XLogCtl->lastWrittenLsnLRU);
+
 	/* REDO */
 	if (InRecovery)
 	{
@@ -5936,6 +6106,183 @@ GetInsertRecPtr(void)
 	return recptr;
 }
+/*
+ * GetLastWrittenLSN -- Returns the maximal LSN of a written page.
+ * It returns an upper bound for the last written LSN of a given page,
+ * either from a cached last written LSN or the global maximum last written LSN.
+ * If rnode is InvalidOid, then we calculate the maximum among all cached LSNs
+ * and maxLastWrittenLsn.
+ * If the cache is large enough, iterating through all hash items may be rather
+ * expensive. But GetLastWrittenLSN(InvalidOid) is used only by zenith_dbsize,
+ * which is not performance critical.
+ */
+XLogRecPtr
+GetLastWrittenLSN(RelFileNode rnode, ForkNumber forknum, BlockNumber blkno)
+{
+	XLogRecPtr	lsn;
+	LastWrittenLsnCacheEntry *entry;
+
+	Assert(lastWrittenLsnCacheSize != 0);
+
+	LWLockAcquire(LastWrittenLsnLock, LW_SHARED);
+
+	/* Maximal last written LSN among all non-cached pages */
+	lsn = XLogCtl->maxLastWrittenLsn;
+
+	if (rnode.relNode != InvalidOid)
+	{
+		BufferTag	key;
+		key.rnode = rnode;
+		key.forkNum = forknum;
+		key.blockNum = blkno;
+		entry = hash_search(lastWrittenLsnCache, &key, HASH_FIND, NULL);
+		if (entry != NULL)
+			lsn = entry->lsn;
+	}
+	else
+	{
+		HASH_SEQ_STATUS seq;
+		/* Find the maximum of all cached LSNs */
+		hash_seq_init(&seq, lastWrittenLsnCache);
+		while ((entry = (LastWrittenLsnCacheEntry *) hash_seq_search(&seq)) != NULL)
+		{
+			if (entry->lsn > lsn)
+				lsn = entry->lsn;
+		}
+	}
+	LWLockRelease(LastWrittenLsnLock);
+
+	return lsn;
+}
+
+/*
+ * SetLastWrittenLSNForBlockRange -- Set the maximal LSN for a range of written
+ * pages. We maintain a cache of last written LSNs with limited size and an LRU
+ * replacement policy. Keeping the last written LSN for each page allows using
+ * an old LSN when requesting pages of unchanged or appended relations. It is
+ * also critical for efficient prefetching in the case of massive update
+ * operations (like vacuum or bulk delete).
+ *
+ * rnode.relNode can be InvalidOid, in which case maxLastWrittenLsn is updated.
+ * SetLastWrittenLsn with a dummy rnode is used by the createdb and dbase_redo
+ * functions.
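+ *
+ * As a sketch of the eviction invariant (with an imagined cache of size 2
+ * for brevity): after set(A, lsn=10), set(B, lsn=20), set(C, lsn=30), entry A
+ * is the LRU victim, and before it is removed its LSN is folded into
+ * maxLastWrittenLsn. A later lookup of A therefore still gets a value >= 10,
+ * an upper bound rather than an underestimate.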
+ */
+void
+SetLastWrittenLSNForBlockRange(XLogRecPtr lsn, RelFileNode rnode, ForkNumber forknum, BlockNumber from, BlockNumber n_blocks)
+{
+	if (lsn == InvalidXLogRecPtr || n_blocks == 0 || lastWrittenLsnCacheSize == 0)
+		return;
+
+	LWLockAcquire(LastWrittenLsnLock, LW_EXCLUSIVE);
+	if (rnode.relNode == InvalidOid)
+	{
+		if (lsn > XLogCtl->maxLastWrittenLsn)
+			XLogCtl->maxLastWrittenLsn = lsn;
+	}
+	else
+	{
+		LastWrittenLsnCacheEntry *entry;
+		BufferTag	key;
+		bool		found;
+		BlockNumber i;
+
+		key.rnode = rnode;
+		key.forkNum = forknum;
+		for (i = 0; i < n_blocks; i++)
+		{
+			key.blockNum = from + i;
+			entry = hash_search(lastWrittenLsnCache, &key, HASH_ENTER, &found);
+			if (found)
+			{
+				if (lsn > entry->lsn)
+					entry->lsn = lsn;
+				/* Unlink from the LRU list */
+				dlist_delete(&entry->lru_node);
+			}
+			else
+			{
+				entry->lsn = lsn;
+				if (hash_get_num_entries(lastWrittenLsnCache) > lastWrittenLsnCacheSize)
+				{
+					/* Replace the least recently used entry */
+					LastWrittenLsnCacheEntry *victim = dlist_container(LastWrittenLsnCacheEntry, lru_node, dlist_pop_head_node(&XLogCtl->lastWrittenLsnLRU));
+					/* Adjust the max LSN for non-cached relations/chunks if needed */
+					if (victim->lsn > XLogCtl->maxLastWrittenLsn)
+						XLogCtl->maxLastWrittenLsn = victim->lsn;
+
+					hash_search(lastWrittenLsnCache, victim, HASH_REMOVE, NULL);
+				}
+			}
+			/* Link to the end of the LRU list */
+			dlist_push_tail(&XLogCtl->lastWrittenLsnLRU, &entry->lru_node);
+		}
+	}
+	LWLockRelease(LastWrittenLsnLock);
+}
+
+/*
+ * SetLastWrittenLSNForBlock -- Set the maximal LSN for a block
+ */
+void
+SetLastWrittenLSNForBlock(XLogRecPtr lsn, RelFileNode rnode, ForkNumber forknum, BlockNumber blkno)
+{
+	SetLastWrittenLSNForBlockRange(lsn, rnode, forknum, blkno, 1);
+}
+
+/*
+ * SetLastWrittenLSNForRelation -- Set the maximal LSN for relation metadata
+ */
+void
+SetLastWrittenLSNForRelation(XLogRecPtr lsn, RelFileNode rnode, ForkNumber forknum)
+{
+	SetLastWrittenLSNForBlock(lsn, rnode, forknum, REL_METADATA_PSEUDO_BLOCKNO);
+}
+
+/*
+ * SetLastWrittenLSNForDatabase -- Set the maximal LSN for the whole database
+ */
+void
+SetLastWrittenLSNForDatabase(XLogRecPtr lsn)
+{
+	RelFileNode dummyNode = {InvalidOid, InvalidOid, InvalidOid};
+	SetLastWrittenLSNForBlock(lsn, dummyNode, MAIN_FORKNUM, 0);
+}
+
+void
+SetRedoStartLsn(XLogRecPtr RedoStartLSN)
+{
+	XLogCtl->RedoStartLSN = RedoStartLSN;
+}
+
+/*
+ * RedoStartLsn is set only once by the startup process; locking is not
+ * required after its exit.
+ */
+XLogRecPtr
+GetRedoStartLsn(void)
+{
+	return XLogCtl->RedoStartLSN;
+}
+
+
+uint64
+GetZenithCurrentClusterSize(void)
+{
+	uint64		size;
+	SpinLockAcquire(&XLogCtl->info_lck);
+	size = XLogCtl->zenithCurrentClusterSize;
+	SpinLockRelease(&XLogCtl->info_lck);
+
+	return size;
+}
+
+
+void
+SetZenithCurrentClusterSize(uint64 size)
+{
+	SpinLockAcquire(&XLogCtl->info_lck);
+	XLogCtl->zenithCurrentClusterSize = size;
+	SpinLockRelease(&XLogCtl->info_lck);
+}
+
+
+
 /*
 * GetFlushRecPtr -- Returns the current flush position, ie, the last WAL
 * position known to be fsync'd to disk. This should only be used on a
@@ -5944,8 +6291,6 @@ GetInsertRecPtr(void)
 XLogRecPtr
 GetFlushRecPtr(TimeLineID *insertTLI)
 {
-	Assert(XLogCtl->SharedRecoveryState == RECOVERY_STATE_DONE);
-
 	SpinLockAcquire(&XLogCtl->info_lck);
 	LogwrtResult = XLogCtl->LogwrtResult;
 	SpinLockRelease(&XLogCtl->info_lck);
@@ -6340,6 +6685,11 @@ CreateCheckPoint(int flags)
 	 */
 	SyncPreCheckpoint();
+	/*
+	 * NEON: perform checkpoint actions requiring writes to the WAL before we
+	 * determine the REDO pointer.
+	 */
+	PreCheckPointGuts(flags);
+
 	/*
 	 * Use a critical section to force system panic if we have trouble.
 	 */
@@ -6847,6 +7197,28 @@ CreateOverwriteContrecordRecord(XLogRecPtr aborted_lsn, XLogRecPtr pagePtr,
 	return recptr;
 }
+static void
+CheckPointReplicationState(void)
+{
+	CheckPointRelationMap();
+	CheckPointReplicationSlots();
+	CheckPointSnapBuild();
+	CheckPointLogicalRewriteHeap();
+	CheckPointReplicationOrigin();
+}
+
+/*
+ * NEON: we use logical records to persist information about slots, origins,
+ * the relation map, and so on. If that is done inside the shutdown checkpoint,
+ * then Postgres panics: "concurrent write-ahead log activity while database
+ * system is shutting down". So it is done before the checkpoint REDO position
+ * is determined.
+ */
+static void
+PreCheckPointGuts(int flags)
+{
+	if (flags & CHECKPOINT_IS_SHUTDOWN)
+		CheckPointReplicationState();
+}
+
 /*
 * Flush all data in shared memory to disk, and fsync
 *
@@ -6856,11 +7228,8 @@ CreateOverwriteContrecordRecord(XLogRecPtr aborted_lsn, XLogRecPtr pagePtr,
 static void
 CheckPointGuts(XLogRecPtr checkPointRedo, int flags)
 {
-	CheckPointRelationMap();
-	CheckPointReplicationSlots();
-	CheckPointSnapBuild();
-	CheckPointLogicalRewriteHeap();
-	CheckPointReplicationOrigin();
+	if (!(flags & CHECKPOINT_IS_SHUTDOWN))
+		CheckPointReplicationState();
 	/* Write out all dirty data in SLRUs and the main buffer pool */
 	TRACE_POSTGRESQL_BUFFER_CHECKPOINT_START(flags);
@@ -7800,6 +8169,8 @@ xlog_redo(XLogReaderState *record)
 		for (uint8 block_id = 0; block_id <= XLogRecMaxBlockId(record); block_id++)
 		{
 			Buffer		buffer;
+			XLogRedoAction result;
+
 			if (!XLogRecHasBlockImage(record, block_id))
 			{
@@ -7807,10 +8178,22 @@ xlog_redo(XLogReaderState *record)
 					elog(ERROR, "XLOG_FPI record did not contain a full-page image");
 				continue;
 			}
-
-			if (XLogReadBufferForRedo(record, block_id, &buffer) != BLK_RESTORED)
+			result = XLogReadBufferForRedo(record, block_id, &buffer);
+			if (result == BLK_DONE && (!IsUnderPostmaster || StandbyMode))
+			{
+				/*
+				 * NEON: In the special WAL redo process, blocks that are being
+				 * ignored return BLK_DONE. Accept that.
+				 * Additionally, in standby mode, blocks that are not present
+				 * in shared buffers are ignored during replay, so we also
+				 * ignore those blocks.
+ */ + } + else if (result != BLK_RESTORED) elog(ERROR, "unexpected XLogReadBufferForRedo result when restoring backup block"); - UnlockReleaseBuffer(buffer); + + if (buffer != InvalidBuffer) + UnlockReleaseBuffer(buffer); } } else if (info == XLOG_BACKUP_END) diff --git a/src/backend/access/transam/xloginsert.c b/src/backend/access/transam/xloginsert.c index 35cc0559f9d..030e4de7123 100644 --- a/src/backend/access/transam/xloginsert.c +++ b/src/backend/access/transam/xloginsert.c @@ -37,9 +37,11 @@ #include "miscadmin.h" #include "pg_trace.h" #include "replication/origin.h" +#include "replication/walsender.h" #include "storage/bufmgr.h" #include "storage/proc.h" #include "utils/memutils.h" +#include "utils/wait_event.h" /* * Guess the maximum buffer size required to store a compressed version of @@ -87,6 +89,11 @@ typedef struct char compressed_page[COMPRESS_BUFSIZE]; } registered_buffer; +/* GUCs */ +int max_replication_apply_lag; +int max_replication_flush_lag; +int max_replication_write_lag; + static registered_buffer *registered_buffers; static int max_registered_buffers; /* allocated size */ static int max_registered_block_id = 0; /* highest block_id + 1 currently @@ -470,6 +477,11 @@ XLogInsert(RmgrId rmid, uint8 info) return EndPos; } + if (delay_backend_us != NULL && delay_backend_us() > 0) + { + InterruptPending = true; + } + do { XLogRecPtr RedoRecPtr; diff --git a/src/backend/access/transam/xlogprefetcher.c b/src/backend/access/transam/xlogprefetcher.c index b98b3192cf5..88b470d6f23 100644 --- a/src/backend/access/transam/xlogprefetcher.c +++ b/src/backend/access/transam/xlogprefetcher.c @@ -720,8 +720,10 @@ XLogPrefetcherNextBlock(uintptr_t pgsr_private, XLogRecPtr *lsn) * We could try to have a fast path for repeated references to the * same relation (with some scheme to handle invalidations * safely), but for now we'll call smgropen() every time. + * + * Only permanent relations are WAL-logged, so RELPERSISTENCE_PERMANENT. */ - reln = smgropen(block->rnode, InvalidBackendId); + reln = smgropen(block->rnode, InvalidBackendId, RELPERSISTENCE_PERMANENT); /* * If the relation file doesn't exist on disk, for example because diff --git a/src/backend/access/transam/xlogreader.c b/src/backend/access/transam/xlogreader.c index 03fad82bc1b..f4f4326a5e2 100644 --- a/src/backend/access/transam/xlogreader.c +++ b/src/backend/access/transam/xlogreader.c @@ -44,8 +44,6 @@ static void report_invalid_record(XLogReaderState *state, const char *fmt,...) pg_attribute_printf(2, 3); static bool allocate_recordbuf(XLogReaderState *state, uint32 reclength); -static int ReadPageInternal(XLogReaderState *state, XLogRecPtr pageptr, - int reqLen); static void XLogReaderInvalReadState(XLogReaderState *state); static XLogPageReadResult XLogDecodeNextRecord(XLogReaderState *state, bool non_blocking); static bool ValidXLogRecordHeader(XLogReaderState *state, XLogRecPtr RecPtr, @@ -263,7 +261,7 @@ WALOpenSegmentInit(WALOpenSegment *seg, WALSegmentContext *segcxt, void XLogBeginRead(XLogReaderState *state, XLogRecPtr RecPtr) { - Assert(!XLogRecPtrIsInvalid(RecPtr)); + Assert(!XLogRecPtrIsInvalid(RecPtr) || state->skip_lsn_checks); ResetDecoder(state); @@ -287,6 +285,14 @@ XLogReleasePreviousRecord(XLogReaderState *state) if (!state->record) return InvalidXLogRecPtr; +#define SKIP_INVALID_RECORD(rec_ptr) do { \ + rec_ptr = MAXALIGN(rec_ptr + 1); \ + if (rec_ptr % XLOG_BLCKSZ <= MAXALIGN(1)) \ + goto restart; \ + else \ + goto skip_invalid; \ + } while (0); + /* * Remove it from the decoded record queue. 
It must be the oldest item * decoded, decode_queue_head. @@ -467,7 +473,7 @@ XLogReadRecord(XLogReaderState *state, char **errormsg) * Return NULL if there is no space in the decode buffer and allow_oversized * is false, or if memory allocation fails for an oversized buffer. */ -static DecodedXLogRecord * +DecodedXLogRecord * XLogReadRecordAlloc(XLogReaderState *state, size_t xl_tot_len, bool allow_oversized) { size_t required_space = DecodeXLogRecordRequiredSpace(xl_tot_len); @@ -583,7 +589,7 @@ XLogDecodeNextRecord(XLogReaderState *state, bool nonblocking) * In this case, NextRecPtr should already be pointing to a valid * record starting position. */ - Assert(XRecOffIsValid(RecPtr)); + Assert(XRecOffIsValid(RecPtr) || state->skip_lsn_checks); randAccess = true; } @@ -622,17 +628,23 @@ XLogDecodeNextRecord(XLogReaderState *state, bool nonblocking) } else if (targetRecOff < pageHeaderSize) { - report_invalid_record(state, "invalid record offset at %X/%X", + if(!state->skip_page_validation) + { + report_invalid_record(state, "invalid record offset at %X/%X", LSN_FORMAT_ARGS(RecPtr)); - goto err; + goto err; + } } if ((((XLogPageHeader) state->readBuf)->xlp_info & XLP_FIRST_IS_CONTRECORD) && targetRecOff == pageHeaderSize) { - report_invalid_record(state, "contrecord is requested by %X/%X", + if(!state->skip_page_validation) + { + report_invalid_record(state, "contrecord is requested by %X/%X", LSN_FORMAT_ARGS(RecPtr)); - goto err; + goto err; + } } /* ReadPageInternal has verified the page header */ @@ -647,6 +659,7 @@ XLogDecodeNextRecord(XLogReaderState *state, bool nonblocking) * cannot access any other fields until we've verified that we got the * whole header. */ +skip_invalid: record = (XLogRecord *) (state->readBuf + RecPtr % XLOG_BLCKSZ); total_len = record->xl_tot_len; @@ -662,7 +675,13 @@ XLogDecodeNextRecord(XLogReaderState *state, bool nonblocking) { if (!ValidXLogRecordHeader(state, RecPtr, state->DecodeRecPtr, record, randAccess)) - goto err; + { + if(!state->skip_invalid_records) + goto err; + + SKIP_INVALID_RECORD(RecPtr); + } + gotheader = true; } else @@ -670,12 +689,19 @@ XLogDecodeNextRecord(XLogReaderState *state, bool nonblocking) /* XXX: more validation should be done here */ if (total_len < SizeOfXLogRecord) { - report_invalid_record(state, - "invalid record length at %X/%X: wanted %u, got %u", - LSN_FORMAT_ARGS(RecPtr), - (uint32) SizeOfXLogRecord, total_len); - goto err; + if(!state->skip_invalid_records) + { + report_invalid_record(state, + "invalid record length at %X/%X: wanted %u, got %u", + LSN_FORMAT_ARGS(RecPtr), + (uint32) SizeOfXLogRecord, total_len); + + goto err; + } + + SKIP_INVALID_RECORD(RecPtr); } + gotheader = false; } @@ -721,10 +747,16 @@ XLogDecodeNextRecord(XLogReaderState *state, bool nonblocking) if (total_len > state->readRecordBufSize && !allocate_recordbuf(state, total_len)) { - /* We treat this as a "bogus data" condition */ - report_invalid_record(state, "record length %u at %X/%X too long", - total_len, LSN_FORMAT_ARGS(RecPtr)); - goto err; + + if(!state->skip_invalid_records) + { + /* We treat this as a "bogus data" condition */ + report_invalid_record(state, "record length %u at %X/%X too long", + total_len, LSN_FORMAT_ARGS(RecPtr)); + goto err; + } + + SKIP_INVALID_RECORD(RecPtr); } /* Copy the first fragment of the record from the first page. 
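 	 *
 	 * As a concrete example of the layout being reassembled here: a record
 	 * with xl_tot_len = 100 that starts 40 bytes before the end of a page
 	 * leaves a 40-byte fragment there; the next page then begins with a
 	 * header carrying XLP_FIRST_IS_CONTRECORD and xlp_rem_len = 60. The
 	 * checks below (now skippable via SKIP_INVALID_RECORD) verify exactly
 	 * this arithmetic: total_len == xlp_rem_len + gotlen.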
*/ @@ -770,10 +802,15 @@ XLogDecodeNextRecord(XLogReaderState *state, bool nonblocking) /* Check that the continuation on next page looks valid */ if (!(pageHeader->xlp_info & XLP_FIRST_IS_CONTRECORD)) { - report_invalid_record(state, + if(!state->skip_invalid_records) + { + report_invalid_record(state, "there is no contrecord flag at %X/%X", LSN_FORMAT_ARGS(RecPtr)); - goto err; + goto err; + } + + SKIP_INVALID_RECORD(RecPtr); } /* @@ -783,12 +820,17 @@ XLogDecodeNextRecord(XLogReaderState *state, bool nonblocking) if (pageHeader->xlp_rem_len == 0 || total_len != (pageHeader->xlp_rem_len + gotlen)) { - report_invalid_record(state, + if(!state->skip_invalid_records) + { + report_invalid_record(state, "invalid contrecord length %u (expected %lld) at %X/%X", pageHeader->xlp_rem_len, ((long long) total_len) - gotlen, LSN_FORMAT_ARGS(RecPtr)); - goto err; + goto err; + } + + SKIP_INVALID_RECORD(RecPtr); } /* Append the continuation from this page to the buffer */ @@ -819,7 +861,13 @@ XLogDecodeNextRecord(XLogReaderState *state, bool nonblocking) record = (XLogRecord *) state->readRecordBuf; if (!ValidXLogRecordHeader(state, RecPtr, state->DecodeRecPtr, record, randAccess)) - goto err; + { + if(!state->skip_invalid_records) + goto err; + + SKIP_INVALID_RECORD(RecPtr); + } + gotheader = true; } } while (gotlen < total_len); @@ -828,7 +876,12 @@ XLogDecodeNextRecord(XLogReaderState *state, bool nonblocking) record = (XLogRecord *) state->readRecordBuf; if (!ValidXLogRecord(state, record, RecPtr)) - goto err; + { + if(!state->skip_invalid_records) + goto err; + + SKIP_INVALID_RECORD(RecPtr); + } pageHeaderSize = XLogPageHeaderSize((XLogPageHeader) state->readBuf); state->DecodeRecPtr = RecPtr; @@ -847,7 +900,12 @@ XLogDecodeNextRecord(XLogReaderState *state, bool nonblocking) /* Record does not cross a page boundary */ if (!ValidXLogRecord(state, record, RecPtr)) - goto err; + { + if(!state->skip_invalid_records) + goto err; + + SKIP_INVALID_RECORD(RecPtr); + } state->NextRecPtr = RecPtr + MAXALIGN(total_len); @@ -979,7 +1037,7 @@ XLogReadAhead(XLogReaderState *state, bool nonblocking) * We fetch the page from a reader-local cache if we know we have the required * data and if there hasn't been any error since caching the data. */ -static int +int ReadPageInternal(XLogReaderState *state, XLogRecPtr pageptr, int reqLen) { int readLen; @@ -1031,8 +1089,7 @@ ReadPageInternal(XLogReaderState *state, XLogRecPtr pageptr, int reqLen) /* we can be sure to have enough WAL available, we scrolled back */ Assert(readLen == XLOG_BLCKSZ); - if (!XLogReaderValidatePageHeader(state, targetSegmentPtr, - state->readBuf)) + if (!XLogReaderValidatePageHeader(state, targetSegmentPtr, state->readBuf) && !state->skip_page_validation) goto err; } @@ -1073,7 +1130,7 @@ ReadPageInternal(XLogReaderState *state, XLogRecPtr pageptr, int reqLen) /* * Now that we know we have the full header, validate it. */ - if (!XLogReaderValidatePageHeader(state, pageptr, (char *) hdr)) + if (!XLogReaderValidatePageHeader(state, pageptr, (char *) hdr) && !state->skip_page_validation) goto err; /* update read state information */ @@ -1132,7 +1189,7 @@ ValidXLogRecordHeader(XLogReaderState *state, XLogRecPtr RecPtr, * We can't exactly verify the prev-link, but surely it should be less * than the record's own address. 
 	 */
-	if (!(record->xl_prev < RecPtr))
+	if (!(record->xl_prev < RecPtr) && !state->skip_lsn_checks)
 	{
 		report_invalid_record(state,
 							  "record with incorrect prev-link %X/%X at %X/%X",
@@ -1148,7 +1205,7 @@ ValidXLogRecordHeader(XLogReaderState *state, XLogRecPtr RecPtr,
 		 * check guards against torn WAL pages where a stale but valid-looking
 		 * WAL record starts on a sector boundary.
 		 */
-		if (record->xl_prev != PrevRecPtr)
+		if (record->xl_prev != PrevRecPtr && !state->skip_lsn_checks)
 		{
 			report_invalid_record(state,
 								  "record with incorrect prev-link %X/%X at %X/%X",
@@ -1291,7 +1348,7 @@ XLogReaderValidatePageHeader(XLogReaderState *state, XLogRecPtr recptr,
 	 * check typically fails when an old WAL segment is recycled, and hasn't
 	 * yet been overwritten with new data yet.
 	 */
-	if (hdr->xlp_pageaddr != recaddr)
+	if (hdr->xlp_pageaddr != recaddr && !state->skip_lsn_checks)
 	{
 		char		fname[MAXFNAMELEN];
diff --git a/src/backend/access/transam/xlogrecovery.c b/src/backend/access/transam/xlogrecovery.c
index 166f7b7b793..713df8b7dde 100644
--- a/src/backend/access/transam/xlogrecovery.c
+++ b/src/backend/access/transam/xlogrecovery.c
@@ -336,6 +337,7 @@ typedef struct XLogRecoveryCtlData
 	XLogRecPtr	lastReplayedReadRecPtr; /* start position */
 	XLogRecPtr	lastReplayedEndRecPtr;	/* end+1 position */
 	TimeLineID	lastReplayedTLI;	/* timeline */
+	ConditionVariable replayProgressCV; /* CV for waiters */
 	/*
 	 * When we're currently replaying a record, ie. in a redo function,
@@ -465,6 +466,7 @@ XLogRecoveryShmemInit(void)
 	SpinLockInit(&XLogRecoveryCtl->info_lck);
 	InitSharedLatch(&XLogRecoveryCtl->recoveryWakeupLatch);
+	ConditionVariableInit(&XLogRecoveryCtl->replayProgressCV);
 	ConditionVariableInit(&XLogRecoveryCtl->recoveryNotPausedCV);
 }
@@ -486,6 +488,64 @@ EnableStandbyMode(void)
 	disable_startup_progress_timeout();
 }
+/*
+ * Wait for recovery to complete replaying all WAL up to and including
+ * redoEndRecPtr.
+ *
+ * This gets woken up for every WAL record replayed, so make sure you're not
+ * trying to wait for an LSN that is too far in the future.
+ */
+void
+XLogWaitForReplayOf(XLogRecPtr redoEndRecPtr)
+{
+	static XLogRecPtr replayRecPtr = 0;
+
+	if (!RecoveryInProgress())
+		return;
+
+	/*
+	 * Check the backend-local variable first; we may be able to skip
+	 * accessing shared memory (which requires locking)
+	 */
+	if (redoEndRecPtr <= replayRecPtr)
+		return;
+
+	replayRecPtr = GetXLogReplayRecPtr(NULL);
+
+	/*
+	 * Check again if we're going to need to wait, now that we've updated
+	 * the local cached variable.
+	 */
+	if (redoEndRecPtr <= replayRecPtr)
+		return;
+
+	/*
+	 * We need to wait for the variable, so prepare for that.
+	 *
+	 * Note: This wakes up every time a WAL record is replayed, so this can
+	 * be expensive.
+	 */
+	ConditionVariablePrepareToSleep(&XLogRecoveryCtl->replayProgressCV);
+
+	while (redoEndRecPtr > replayRecPtr)
+	{
+		bool		timeout;
+		timeout = ConditionVariableTimedSleep(&XLogRecoveryCtl->replayProgressCV,
+											  10000, /* 10 seconds; the timeout argument is in milliseconds */
+											  WAIT_EVENT_RECOVERY_WAL_STREAM);
+
+		replayRecPtr = GetXLogReplayRecPtr(NULL);
+
+		if (timeout)
+			ereport(LOG,
+					(errmsg("Waiting for recovery to catch up to %X/%X (currently %X/%X)",
+							LSN_FORMAT_ARGS(redoEndRecPtr),
+							LSN_FORMAT_ARGS(replayRecPtr))));
+	}
+
+	ConditionVariableCancelSleep();
+}
+
 /*
 * Prepare the system for WAL recovery, if needed.
* @@ -562,6 +622,9 @@ InitWalRecovery(ControlFileData *ControlFile, bool *wasShutdown_ptr, else if (recoveryTarget == RECOVERY_TARGET_IMMEDIATE) ereport(LOG, (errmsg("starting point-in-time recovery to earliest consistent point"))); + else if (ZenithRecoveryRequested) + ereport(LOG, + (errmsg("starting zenith recovery"))); else ereport(LOG, (errmsg("starting archive recovery"))); @@ -702,6 +765,33 @@ InitWalRecovery(ControlFileData *ControlFile, bool *wasShutdown_ptr, /* tell the caller to delete it later */ haveBackupLabel = true; } + else if (ZenithRecoveryRequested) + { + /* + * Zenith hacks to spawn compute node without WAL. Pretend that we + * just finished reading the record that started at 'zenithLastRec' + * and ended at checkpoint.redo + */ + elog(LOG, "starting with zenith basebackup at LSN %X/%X, prev %X/%X", + LSN_FORMAT_ARGS(ControlFile->checkPointCopy.redo), + LSN_FORMAT_ARGS(zenithLastRec)); + + CheckPointLoc = zenithLastRec; + CheckPointTLI = ControlFile->checkPointCopy.ThisTimeLineID; + RedoStartLSN = ControlFile->checkPointCopy.redo; + // FIXME needs review. rebase of ff41b709abea6a9c42100a4fcb0ff434b2c846c9 + // Is it still relevant? + /* make basebackup LSN available for walproposer */ + SetRedoStartLsn(RedoStartLSN); + //EndRecPtr = ControlFile->checkPointCopy.redo; + + memcpy(&checkPoint, &ControlFile->checkPointCopy, sizeof(CheckPoint)); + wasShutdown = true; + + /* Initialize expectedTLEs, like ReadRecord() does */ + expectedTLEs = readTimeLineHistory(checkPoint.ThisTimeLineID); + XLogPrefetcherBeginRead(xlogprefetcher, ControlFile->checkPointCopy.redo); + } else { /* @@ -763,6 +853,7 @@ InitWalRecovery(ControlFileData *ControlFile, bool *wasShutdown_ptr, CheckPointLoc = ControlFile->checkPoint; CheckPointTLI = ControlFile->checkPointCopy.ThisTimeLineID; RedoStartLSN = ControlFile->checkPointCopy.redo; + SetRedoStartLsn(RedoStartLSN); RedoStartTLI = ControlFile->checkPointCopy.ThisTimeLineID; record = ReadCheckpointRecord(xlogprefetcher, CheckPointLoc, 1, true, CheckPointTLI); @@ -852,7 +943,7 @@ InitWalRecovery(ControlFileData *ControlFile, bool *wasShutdown_ptr, (errmsg("invalid next transaction ID"))); /* sanity check */ - if (checkPoint.redo > CheckPointLoc) + if (checkPoint.redo > CheckPointLoc && !ZenithRecoveryRequested) ereport(PANIC, (errmsg("invalid redo in checkpoint record"))); @@ -1450,8 +1541,12 @@ FinishWalRecovery(void) lastRec = XLogRecoveryCtl->lastReplayedReadRecPtr; lastRecTLI = XLogRecoveryCtl->lastReplayedTLI; } - XLogPrefetcherBeginRead(xlogprefetcher, lastRec); - (void) ReadRecord(xlogprefetcher, PANIC, false, lastRecTLI); + + if (!ZenithRecoveryRequested) + { + XLogPrefetcherBeginRead(xlogprefetcher, lastRec); + (void) ReadRecord(xlogprefetcher, PANIC, false, lastRecTLI); + } endOfLog = xlogreader->EndRecPtr; /* @@ -1489,7 +1584,61 @@ FinishWalRecovery(void) * Copy the last partial block to the caller, for initializing the WAL * buffer for appending new WAL. */ - if (endOfLog % XLOG_BLCKSZ != 0) + /* + * When starting from a zenith base backup, we don't have WAL. Initialize + * the WAL page where we will start writing new records from scratch, + * instead. + */ + result->lastPageBeginPtr = endOfLog; + result->lastPage = NULL; + if (ZenithRecoveryRequested) + { + if (!zenithWriteOk) + { + /* + * We cannot start generating new WAL if we don't have a valid prev-LSN + * to use for the first new WAL record. (Shouldn't happen.) 
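+	 *
+	 * In the read-write path below there is no WAL to read back, so the
+	 * last WAL page is fabricated in memory instead: if new WAL starts at
+	 * an offset into the page, the synthesized header advertises a
+	 * continuation record of xlp_rem_len = offs - lastPageSize bytes, so
+	 * the page looks as if a record had just ended exactly where writing
+	 * resumes.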
+			 */
+			ereport(ERROR,
+					(errmsg("cannot start in read-write mode from this base backup")));
+		}
+		else
+		{
+			int			offs = endOfLog % XLOG_BLCKSZ;
+			char	   *page = palloc0(offs);
+			XLogRecPtr	pageBeginPtr = endOfLog - offs;
+			int			lastPageSize = ((pageBeginPtr % wal_segment_size) == 0) ? SizeOfXLogLongPHD : SizeOfXLogShortPHD;
+
+			XLogPageHeader xlogPageHdr = (XLogPageHeader) (page);
+
+			if (ReadPageInternal(xlogreader, pageBeginPtr, SizeOfXLogShortPHD) != SizeOfXLogShortPHD)
+			{
+				elog(LOG, "Initialize page header %X/%X xlp_rem_len=%d", LSN_FORMAT_ARGS(pageBeginPtr), offs - lastPageSize);
+				xlogPageHdr->xlp_pageaddr = pageBeginPtr;
+				xlogPageHdr->xlp_magic = XLOG_PAGE_MAGIC;
+				xlogPageHdr->xlp_tli = recoveryTargetTLI;
+				/*
+				 * If we start writing with an offset from the page beginning,
+				 * pretend in the page header that there is a record ending
+				 * where the actual data will start.
+				 */
+				xlogPageHdr->xlp_rem_len = offs - lastPageSize;
+				xlogPageHdr->xlp_info = (xlogPageHdr->xlp_rem_len > 0) ? XLP_FIRST_IS_CONTRECORD : 0;
+			}
+			else
+			{
+				memcpy(xlogPageHdr, xlogreader->readBuf, SizeOfXLogShortPHD);
+			}
+			readOff = XLogSegmentOffset(pageBeginPtr, wal_segment_size);
+
+			result->lastPageBeginPtr = pageBeginPtr;
+			result->lastPage = page;
+			elog(LOG, "Continue writing WAL at %X/%X", LSN_FORMAT_ARGS(xlogreader->EndRecPtr));
+
+			// FIXME: should we unlink zenith.signal?
+		}
+	}
+	if (result->lastPage == NULL && endOfLog % XLOG_BLCKSZ != 0)
 	{
 		char	   *page;
 		int			len;
@@ -1506,12 +1655,6 @@ FinishWalRecovery(void)
 		result->lastPageBeginPtr = pageBeginPtr;
 		result->lastPage = page;
 	}
-	else
-	{
-		/* There is no partial block to copy. */
-		result->lastPageBeginPtr = endOfLog;
-		result->lastPage = NULL;
-	}
 	/*
 	 * Create a comment for the history file to explain why and where timeline
@@ -1541,7 +1684,10 @@ ShutdownWalRecovery(void)
 	char		recoveryPath[MAXPGPATH];
 	/* Final update of pg_stat_recovery_prefetch. */
-	XLogPrefetcherComputeStats(xlogprefetcher);
+	if (!ZenithRecoveryRequested)
+	{
+		XLogPrefetcherComputeStats(xlogprefetcher);
+	}
 	/* Shut down xlogreader */
 	if (readFile >= 0)
@@ -1550,7 +1696,11 @@ ShutdownWalRecovery(void)
 		readFile = -1;
 	}
 	XLogReaderFree(xlogreader);
-	XLogPrefetcherFree(xlogprefetcher);
+
+	if (!ZenithRecoveryRequested)
+	{
+		XLogPrefetcherFree(xlogprefetcher);
+	}
 	if (ArchiveRecoveryRequested)
 	{
@@ -1640,7 +1790,10 @@ PerformWalRecovery(void)
 	else
 	{
 		/* just have to read next record after CheckPoint */
-		Assert(xlogreader->ReadRecPtr == CheckPointLoc);
+		if (ZenithRecoveryRequested)
+			xlogreader->ReadRecPtr = CheckPointLoc;
+		else
+			Assert(xlogreader->ReadRecPtr == CheckPointLoc);
 		replayTLI = CheckPointTLI;
 		record = ReadRecord(xlogprefetcher, LOG, false, replayTLI);
 	}
@@ -1965,6 +2118,8 @@ ApplyWalRecord(XLogReaderState *xlogreader, XLogRecord *record, TimeLineID *repl
 		/* Reset the prefetcher.
*/ XLogPrefetchReconfigure(); } + + ConditionVariableBroadcast(&XLogRecoveryCtl->replayProgressCV); } /* diff --git a/src/backend/access/transam/xlogutils.c b/src/backend/access/transam/xlogutils.c index 702c8c14e12..1d4e9992956 100644 --- a/src/backend/access/transam/xlogutils.c +++ b/src/backend/access/transam/xlogutils.c @@ -33,6 +33,8 @@ #include "utils/rel.h" +bool (*redo_read_buffer_filter) (XLogReaderState *record, uint8 block_id); + /* GUC variable */ bool ignore_invalid_pages = false; @@ -372,6 +374,21 @@ XLogReadBufferForRedoExtended(XLogReaderState *record, block_id); } + if (redo_read_buffer_filter && redo_read_buffer_filter(record, block_id)) + { + if (mode == RBM_ZERO_AND_LOCK || mode == RBM_ZERO_AND_CLEANUP_LOCK) + { + *buf = ReadBufferWithoutRelcache(rnode, forknum, + blkno, mode, NULL, true); + return BLK_DONE; + } + else + { + *buf = InvalidBuffer; + return BLK_DONE; + } + } + /* * Make sure that if the block is marked with WILL_INIT, the caller is * going to initialize it. And vice versa. @@ -490,7 +507,7 @@ XLogReadBufferExtended(RelFileNode rnode, ForkNumber forknum, } /* Open the relation at smgr level */ - smgr = smgropen(rnode, InvalidBackendId); + smgr = smgropen(rnode, InvalidBackendId, RELPERSISTENCE_PERMANENT); /* * Create the target file if it doesn't already exist. This lets us cope diff --git a/src/backend/bootstrap/bootstrap.c b/src/backend/bootstrap/bootstrap.c index 48ff9483af2..149b2f408aa 100644 --- a/src/backend/bootstrap/bootstrap.c +++ b/src/backend/bootstrap/bootstrap.c @@ -47,6 +47,7 @@ uint32 bootstrap_data_checksum_version = 0; /* No checksum */ +extern uint64 predefined_sysidentifier; static void CheckerModeMain(void); static void bootstrap_signals(void); @@ -221,7 +222,7 @@ BootstrapModeMain(int argc, char *argv[], bool check_only) argv++; argc--; - while ((flag = getopt(argc, argv, "B:c:d:D:Fkr:X:-:")) != -1) + while ((flag = getopt(argc, argv, "B:c:d:D:Fkr:s:X:-:")) != -1) { switch (flag) { @@ -265,6 +266,16 @@ BootstrapModeMain(int argc, char *argv[], bool check_only) PGC_S_DYNAMIC_DEFAULT); } break; + case 's': + { + char* endptr; +#ifdef HAVE_STRTOULL + predefined_sysidentifier = strtoull(optarg, &endptr, 10); +#else + predefined_sysidentifier = strtoul(optarg, &endptr, 10); +#endif + break; + } case 'c': case '-': { diff --git a/src/backend/catalog/storage.c b/src/backend/catalog/storage.c index c06e414a38f..530bc3b6e1f 100644 --- a/src/backend/catalog/storage.c +++ b/src/backend/catalog/storage.c @@ -145,7 +145,7 @@ RelationCreateStorage(RelFileNode rnode, char relpersistence, return NULL; /* placate compiler */ } - srel = smgropen(rnode, backend); + srel = smgropen(rnode, backend, relpersistence); smgrcreate(srel, MAIN_FORKNUM, false); if (needs_wal) @@ -185,6 +185,7 @@ void log_smgrcreate(const RelFileNode *rnode, ForkNumber forkNum) { xl_smgr_create xlrec; + XLogRecPtr lsn; /* * Make an XLOG entry reporting the file creation. 
@@ -194,7 +195,8 @@ log_smgrcreate(const RelFileNode *rnode, ForkNumber forkNum) XLogBeginInsert(); XLogRegisterData((char *) &xlrec, sizeof(xlrec)); - XLogInsert(RM_SMGR_ID, XLOG_SMGR_CREATE | XLR_SPECIAL_REL_UPDATE); + lsn = XLogInsert(RM_SMGR_ID, XLOG_SMGR_CREATE | XLR_SPECIAL_REL_UPDATE); + SetLastWrittenLSNForRelation(lsn, *rnode, forkNum); } /* @@ -677,7 +679,7 @@ smgrDoPendingDeletes(bool isCommit) { SMgrRelation srel; - srel = smgropen(pending->relnode, pending->backend); + srel = smgropen(pending->relnode, pending->backend, 0); /* allocate the initial array, or extend it, if needed */ if (maxrels == 0) @@ -758,7 +760,7 @@ smgrDoPendingSyncs(bool isCommit, bool isParallelWorker) BlockNumber total_blocks = 0; SMgrRelation srel; - srel = smgropen(pendingsync->rnode, InvalidBackendId); + srel = smgropen(pendingsync->rnode, InvalidBackendId, 0); /* * We emit newpage WAL records for smaller relations. @@ -967,7 +969,7 @@ smgr_redo(XLogReaderState *record) xl_smgr_create *xlrec = (xl_smgr_create *) XLogRecGetData(record); SMgrRelation reln; - reln = smgropen(xlrec->rnode, InvalidBackendId); + reln = smgropen(xlrec->rnode, InvalidBackendId, RELPERSISTENCE_PERMANENT); smgrcreate(reln, xlrec->forkNum, true); } else if (info == XLOG_SMGR_TRUNCATE) @@ -980,7 +982,7 @@ smgr_redo(XLogReaderState *record) int nforks = 0; bool need_fsm_vacuum = false; - reln = smgropen(xlrec->rnode, InvalidBackendId); + reln = smgropen(xlrec->rnode, InvalidBackendId, RELPERSISTENCE_PERMANENT); /* * Forcibly create relation if it doesn't exist (which suggests that diff --git a/src/backend/commands/dbcommands.c b/src/backend/commands/dbcommands.c index 93f0c739e55..9eb51bbdbd4 100644 --- a/src/backend/commands/dbcommands.c +++ b/src/backend/commands/dbcommands.c @@ -275,7 +275,7 @@ ScanSourceDatabasePgClass(Oid tbid, Oid dbid, char *srcpath) rnode.dbNode = dbid; rnode.relNode = relfilenode; - smgr = smgropen(rnode, InvalidBackendId); + smgr = smgropen(rnode, InvalidBackendId, RELPERSISTENCE_PERMANENT); nblocks = smgrnblocks(smgr, MAIN_FORKNUM); smgrclose(smgr); @@ -466,8 +466,8 @@ CreateDirAndVersionFile(char *dbpath, Oid dbid, Oid tsid, bool isRedo) * Note that we don't have to copy this from the source database; there's * only one legal value. */ - sprintf(buf, "%s\n", PG_MAJORVERSION); - nbytes = strlen(PG_MAJORVERSION) + 1; + sprintf(buf, "%s", PG_MAJORVERSION); + nbytes = strlen(PG_MAJORVERSION); /* If we are not in WAL replay then write the WAL. */ if (!isRedo) @@ -1398,6 +1398,11 @@ createdb(ParseState *pstate, const CreatedbStmt *stmt) CreateDatabaseUsingFileCopy(src_dboid, dboid, src_deftablespace, dst_deftablespace); + /* + * Update global last written LSN after wal-logging create database command + */ + SetLastWrittenLSNForDatabase(XactLastRecEnd); + /* * Close pg_database, but keep lock till commit. */ @@ -2038,6 +2043,7 @@ movedb(const char *dbname, const char *tblspcname) */ { xl_dbase_create_file_copy_rec xlrec; + XLogRecPtr lsn; xlrec.db_id = db_id; xlrec.tablespace_id = dst_tblspcoid; @@ -2048,8 +2054,10 @@ movedb(const char *dbname, const char *tblspcname) XLogRegisterData((char *) &xlrec, sizeof(xl_dbase_create_file_copy_rec)); - (void) XLogInsert(RM_DBASE_ID, + lsn = XLogInsert(RM_DBASE_ID, XLOG_DBASE_CREATE_FILE_COPY | XLR_SPECIAL_REL_UPDATE); + // TODO: Do we really need to set the LSN here? 
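+		// Mirrors createdb(): after WAL-logging XLOG_DBASE_CREATE_FILE_COPY,
+		// the database-wide last-written LSN is bumped so that later
+		// pageserver requests for the moved files are issued at an LSN no
+		// older than this record.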
+ SetLastWrittenLSNForDatabase(lsn); } /* @@ -3197,6 +3205,15 @@ dbase_redo(XLogReaderState *record) */ copydir(src_path, dst_path, false); + /* + * Make sure any future requests to the page server see the new + * database. + */ + { + XLogRecPtr lsn = record->EndRecPtr; + SetLastWrittenLSNForDatabase(lsn); + } + pfree(src_path); pfree(dst_path); } @@ -3218,6 +3235,14 @@ dbase_redo(XLogReaderState *record) CreateDirAndVersionFile(dbpath, xlrec->db_id, xlrec->tablespace_id, true); pfree(dbpath); + /* + * Make sure any future requests to the page server see the new + * database. + */ + { + XLogRecPtr lsn = record->EndRecPtr; + SetLastWrittenLSNForDatabase(lsn); + } } else if (info == XLOG_DBASE_DROP) { diff --git a/src/backend/commands/explain.c b/src/backend/commands/explain.c index 060c6186ddd..1bd8fe7616a 100644 --- a/src/backend/commands/explain.c +++ b/src/backend/commands/explain.c @@ -47,7 +47,6 @@ ExplainOneQuery_hook_type ExplainOneQuery_hook = NULL; /* Hook for plugins to get control in explain_get_index_name() */ explain_get_index_name_hook_type explain_get_index_name_hook = NULL; - /* OR-able flags for ExplainXMLTag() */ #define X_OPENING 0 #define X_CLOSING 1 @@ -121,6 +120,7 @@ static void show_eval_params(Bitmapset *bms_params, ExplainState *es); static const char *explain_get_index_name(Oid indexId); static void show_buffer_usage(ExplainState *es, const BufferUsage *usage, bool planning); +static void show_prefetch_info(ExplainState *es, const PrefetchInfo* prefetch_info); static void show_wal_usage(ExplainState *es, const WalUsage *usage); static void ExplainIndexScanDetails(Oid indexid, ScanDirection indexorderdir, ExplainState *es); @@ -186,6 +186,8 @@ ExplainQuery(ParseState *pstate, ExplainStmt *stmt, es->costs = defGetBoolean(opt); else if (strcmp(opt->defname, "buffers") == 0) es->buffers = defGetBoolean(opt); + else if (strcmp(opt->defname, "prefetch") == 0) + es->prefetch = defGetBoolean(opt); else if (strcmp(opt->defname, "wal") == 0) es->wal = defGetBoolean(opt); else if (strcmp(opt->defname, "settings") == 0) @@ -534,7 +536,7 @@ ExplainOnePlan(PlannedStmt *plannedstmt, IntoClause *into, ExplainState *es, else if (es->analyze) instrument_option |= INSTRUMENT_ROWS; - if (es->buffers) + if (es->buffers || es->prefetch) instrument_option |= INSTRUMENT_BUFFERS; if (es->wal) instrument_option |= INSTRUMENT_WAL; @@ -2066,6 +2068,10 @@ ExplainNode(PlanState *planstate, List *ancestors, if (es->wal && planstate->instrument) show_wal_usage(es, &planstate->instrument->walusage); + /* Show prefetch usage */ + if (es->prefetch && planstate->instrument) + show_prefetch_info(es, &planstate->instrument->bufusage.prefetch); + /* Prepare per-worker buffer/WAL usage */ if (es->workers_state && (es->buffers || es->wal) && es->verbose) { @@ -3501,6 +3507,34 @@ explain_get_index_name(Oid indexId) return result; } +/* + * Show prefetch statistics + */ +static void +show_prefetch_info(ExplainState *es, const PrefetchInfo* prefetch_info) +{ + if (es->format == EXPLAIN_FORMAT_TEXT) + { + ExplainIndentText(es); + appendStringInfo(es->str, "Prefetch: hits=%lld misses=%lld expired=%lld duplicates=%lld\n", + (long long) prefetch_info->hits, + (long long) prefetch_info->misses, + (long long) prefetch_info->expired, + (long long) prefetch_info->duplicates); + } + else + { + ExplainPropertyInteger("Prefetch Hits", NULL, + prefetch_info->hits, es); + ExplainPropertyInteger("Prefetch Misses", NULL, + prefetch_info->misses, es); + ExplainPropertyInteger("Prefetch Expired Requests", NULL, + 
 							   prefetch_info->expired, es);
+		ExplainPropertyInteger("Prefetch Duplicated Requests", NULL,
+							   prefetch_info->duplicates, es);
+	}
+}
+
 /*
 * Show buffer usage details.
 */
diff --git a/src/backend/commands/extension.c b/src/backend/commands/extension.c
index df6f021c300..74cc7379e77 100644
--- a/src/backend/commands/extension.c
+++ b/src/backend/commands/extension.c
@@ -399,6 +399,7 @@ get_extension_script_directory(ExtensionControlFile *control)
 {
 	char		sharepath[MAXPGPATH];
 	char	   *result;
+	struct stat fst;
 	/*
 	 * The directory parameter can be omitted, absolute, or relative to the
@@ -414,6 +415,16 @@ get_extension_script_directory(ExtensionControlFile *control)
 	result = (char *) palloc(MAXPGPATH);
 	snprintf(result, MAXPGPATH, "%s/%s", sharepath, control->directory);
+	// If the directory does not exist, check remote extension storage
+	if (stat(result, &fst) < 0)
+	{
+		// request download of extension files for control->directory
+		if (download_extension_file_hook != NULL)
+		{
+			download_extension_file_hook(control->directory, false);
+		}
+	}
+
 	return result;
 }
@@ -1453,6 +1464,13 @@ CreateExtensionInternal(char *extensionName,
 	 * will get us there.
 	 */
 	filename = get_extension_script_filename(pcontrol, NULL, versionName);
+
+	// request download of extension files from compute_ctl
+	if (download_extension_file_hook != NULL)
+	{
+		download_extension_file_hook(extensionName, false);
+	}
+
 	if (stat(filename, &fst) == 0)
 	{
 		/* Easy, no extra scripts */
diff --git a/src/backend/commands/sequence.c b/src/backend/commands/sequence.c
index acaf660c68e..88e0e3a270a 100644
--- a/src/backend/commands/sequence.c
+++ b/src/backend/commands/sequence.c
@@ -54,7 +54,9 @@
 * so we pre-log a few fetches in advance. In the event of
 * crash we can lose (skip over) as many values as we pre-logged.
 */
-#define SEQ_LOG_VALS 32
+/* Zenith XXX: to preserve the ordering of sequence values in Zenith, we need to WAL-log each sequence update. */
+/* #define SEQ_LOG_VALS 32 */
+#define SEQ_LOG_VALS 0
 /*
 * The "special area" of a sequence's buffer page looks like this.
@@ -355,7 +357,7 @@ fill_seq_with_data(Relation rel, HeapTuple tuple) { SMgrRelation srel; - srel = smgropen(rel->rd_node, InvalidBackendId); + srel = smgropen(rel->rd_node, InvalidBackendId, rel->rd_rel->relpersistence); smgrcreate(srel, INIT_FORKNUM, false); log_smgrcreate(&rel->rd_node, INIT_FORKNUM); fill_seq_fork_with_data(rel, tuple, INIT_FORKNUM); diff --git a/src/backend/commands/tablecmds.c b/src/backend/commands/tablecmds.c index 88b253dbcd6..6f306a5cdce 100644 --- a/src/backend/commands/tablecmds.c +++ b/src/backend/commands/tablecmds.c @@ -14760,7 +14760,7 @@ index_copy_data(Relation rel, RelFileNode newrnode) { SMgrRelation dstrel; - dstrel = smgropen(newrnode, rel->rd_backend); + dstrel = smgropen(newrnode, rel->rd_backend, rel->rd_rel->relpersistence); /* * Since we copy the file directly without looking at the shared buffers, diff --git a/src/backend/commands/user.c b/src/backend/commands/user.c index cba8e1979b6..d3ed92b7b75 100644 --- a/src/backend/commands/user.c +++ b/src/backend/commands/user.c @@ -55,7 +55,8 @@ static void AddRoleMems(const char *rolename, Oid roleid, static void DelRoleMems(const char *rolename, Oid roleid, List *memberSpecs, List *memberIds, bool admin_opt); - +static void check_role_membership_authorization(Oid currentUserId, Oid roleid, + bool is_grant); /* Check if current user has createrole privileges */ static bool @@ -459,6 +460,37 @@ CreateRole(ParseState *pstate, CreateRoleStmt *stmt) } } + /* + * If the current user isn't a superuser, make them an admin of the new + * role so that they can administer the new object they just created. + * Superusers will be able to do that anyway. + * + * The grantor of record for this implicit grant is the bootstrap + * superuser, which means that the CREATEROLE user cannot revoke the + * grant. They can however grant the created role back to themselves with + * different options, since they enjoy ADMIN OPTION on it. + */ + if (!superuser()) + { + RoleSpec *current_role = makeNode(RoleSpec); + List *memberSpecs; + List *memberIds = list_make1_oid(GetUserId()); + + current_role->roletype = ROLESPEC_CURRENT_ROLE; + current_role->location = -1; + memberSpecs = list_make1(current_role); + + AddRoleMems(stmt->role, roleid, + memberSpecs, memberIds, + BOOTSTRAP_SUPERUSERID, true); + + /* + * We must make the implicit grant visible to the code below, else the + * additional grants will fail. + */ + CommandCounterIncrement(); + } + /* * Add the specified members to this new role. adminmembers get the admin * option, rolemembers don't. @@ -1388,6 +1420,8 @@ AddRoleMems(const char *rolename, Oid roleid, if (!memberIds) return; + check_role_membership_authorization(grantorId, roleid, true); + /* * Check permissions: must have createrole or admin option on the role to * be changed. To mess with a superuser role, you gotta be superuser. @@ -1426,7 +1460,7 @@ AddRoleMems(const char *rolename, Oid roleid, * present. Nonetheless, inasmuch as users might look to it for a crude * audit trail, let only superusers impute the grant to a third party. 
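 	 *
 	 * The BOOTSTRAP_SUPERUSERID exemption added below exists because the
 	 * implicit ADMIN OPTION grant recorded by CreateRole() for a
 	 * non-superuser creator names the bootstrap superuser as grantor;
 	 * without it, that internal grant would be rejected by this check.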
*/ - if (grantorId != GetUserId() && !superuser()) + if (grantorId != GetUserId() && grantorId != BOOTSTRAP_SUPERUSERID && !superuser()) ereport(ERROR, (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), errmsg("must be superuser to set grantor"))); @@ -1561,6 +1595,8 @@ DelRoleMems(const char *rolename, Oid roleid, if (!memberIds) return; + check_role_membership_authorization(GetUserId(), roleid, false); + /* * Check permissions: must have createrole or admin option on the role to * be changed. To mess with a superuser role, you gotta be superuser. @@ -1643,3 +1679,71 @@ DelRoleMems(const char *rolename, Oid roleid, */ table_close(pg_authmem_rel, NoLock); } + +/* + * Check that currentUserId has permission to modify the membership list for + * roleid. Throw an error if not. + */ +static void +check_role_membership_authorization(Oid currentUserId, Oid roleid, + bool is_grant) +{ + /* + * The charter of pg_database_owner is to have exactly one, implicit, + * situation-dependent member. There's no technical need for this + * restriction. (One could lift it and take the further step of making + * object_ownercheck(DatabaseRelationId, ...) equivalent to + * has_privs_of_role(roleid, ROLE_PG_DATABASE_OWNER), in which case + * explicit, situation-independent members could act as the owner of any + * database.) + */ + if (is_grant && roleid == ROLE_PG_DATABASE_OWNER) + ereport(ERROR, + errmsg("role \"%s\" cannot have explicit members", + GetUserNameFromId(roleid, false))); + + /* To mess with a superuser role, you gotta be superuser. */ + if (superuser_arg(roleid)) + { + if (!superuser_arg(currentUserId)) + { + if (is_grant) + ereport(ERROR, + (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), + errmsg("permission denied to grant role \"%s\"", + GetUserNameFromId(roleid, false)), + errdetail("Only roles with the %s attribute may grant roles with the %s attribute.", + "SUPERUSER", "SUPERUSER"))); + else + ereport(ERROR, + (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), + errmsg("permission denied to revoke role \"%s\"", + GetUserNameFromId(roleid, false)), + errdetail("Only roles with the %s attribute may revoke roles with the %s attribute.", + "SUPERUSER", "SUPERUSER"))); + } + } + else + { + /* + * Otherwise, must have admin option on the role to be changed. 
+ */ + if (!is_admin_of_role(currentUserId, roleid)) + { + if (is_grant) + ereport(ERROR, + (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), + errmsg("permission denied to grant role \"%s\"", + GetUserNameFromId(roleid, false)), + errdetail("Only roles with the %s option on role \"%s\" may grant this role.", + "ADMIN", GetUserNameFromId(roleid, false)))); + else + ereport(ERROR, + (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), + errmsg("permission denied to revoke role \"%s\"", + GetUserNameFromId(roleid, false)), + errdetail("Only roles with the %s option on role \"%s\" may revoke this role.", + "ADMIN", GetUserNameFromId(roleid, false)))); + } + } +} diff --git a/src/backend/executor/instrument.c b/src/backend/executor/instrument.c index ceff4727d4a..e7ddea8861a 100644 --- a/src/backend/executor/instrument.c +++ b/src/backend/executor/instrument.c @@ -235,6 +235,10 @@ BufferUsageAdd(BufferUsage *dst, const BufferUsage *add) dst->local_blks_written += add->local_blks_written; dst->temp_blks_read += add->temp_blks_read; dst->temp_blks_written += add->temp_blks_written; + dst->prefetch.hits += add->prefetch.hits; + dst->prefetch.misses += add->prefetch.misses; + dst->prefetch.expired += add->prefetch.expired; + dst->prefetch.duplicates += add->prefetch.duplicates; INSTR_TIME_ADD(dst->blk_read_time, add->blk_read_time); INSTR_TIME_ADD(dst->blk_write_time, add->blk_write_time); INSTR_TIME_ADD(dst->temp_blk_read_time, add->temp_blk_read_time); @@ -257,6 +261,10 @@ BufferUsageAccumDiff(BufferUsage *dst, dst->local_blks_written += add->local_blks_written - sub->local_blks_written; dst->temp_blks_read += add->temp_blks_read - sub->temp_blks_read; dst->temp_blks_written += add->temp_blks_written - sub->temp_blks_written; + dst->prefetch.hits += add->prefetch.hits - sub->prefetch.hits; + dst->prefetch.misses += add->prefetch.misses - sub->prefetch.misses; + dst->prefetch.expired += add->prefetch.expired - sub->prefetch.expired; + dst->prefetch.duplicates += add->prefetch.duplicates - sub->prefetch.duplicates; INSTR_TIME_ACCUM_DIFF(dst->blk_read_time, add->blk_read_time, sub->blk_read_time); INSTR_TIME_ACCUM_DIFF(dst->blk_write_time, diff --git a/src/backend/executor/nodeBitmapHeapscan.c b/src/backend/executor/nodeBitmapHeapscan.c index f6fe07ad703..95dfef466f6 100644 --- a/src/backend/executor/nodeBitmapHeapscan.c +++ b/src/backend/executor/nodeBitmapHeapscan.c @@ -149,37 +149,21 @@ BitmapHeapNext(BitmapHeapScanState *node) * multiple processes to iterate jointly. */ pstate->tbmiterator = tbm_prepare_shared_iterate(tbm); -#ifdef USE_PREFETCH - if (node->prefetch_maximum > 0) - { - pstate->prefetch_iterator = - tbm_prepare_shared_iterate(tbm); - - /* - * We don't need the mutex here as we haven't yet woke up - * others. - */ - pstate->prefetch_pages = 0; - pstate->prefetch_target = -1; - } -#endif /* We have initialized the shared state so wake up others. 
 */
 			BitmapDoneInitializingSharedState(pstate);
 		}
+#ifdef USE_PREFETCH
+		node->prefetch_head = 0;
+		node->prefetch_pages = 0;
+		node->prefetch_target = -1;
+#endif
 
 		/* Allocate a private iterator and attach the shared state to it */
 		node->shared_tbmiterator = shared_tbmiterator =
 			tbm_attach_shared_iterate(dsa, pstate->tbmiterator);
 		node->tbmres = tbmres = NULL;
-#ifdef USE_PREFETCH
-		if (node->prefetch_maximum > 0)
-		{
-			node->shared_prefetch_iterator =
-				tbm_attach_shared_iterate(dsa, pstate->prefetch_iterator);
-		}
-#endif							/* USE_PREFETCH */
 	}
 	node->initialized = true;
 }
@@ -196,9 +180,25 @@ BitmapHeapNext(BitmapHeapScanState *node)
 		if (tbmres == NULL)
 		{
 			if (!pstate)
-				node->tbmres = tbmres = tbm_iterate(tbmiterator);
+				tbmres = tbm_iterate(tbmiterator);
 			else
-				node->tbmres = tbmres = tbm_shared_iterate(shared_tbmiterator);
+			{
+				if (node->prefetch_pages != 0)
+				{
+					tbmres = (TBMIterateResult *) &node->prefetch_requests[node->prefetch_head];
+					node->prefetch_pages -= 1;
+					node->prefetch_head = (node->prefetch_head + 1) % MAX_IO_CONCURRENCY;
+				}
+				else
+					tbmres = tbm_shared_iterate(shared_tbmiterator);
+				if (tbmres)
+				{
+					/* Copy the result: the iterator may be reused for prefetching, and the vacated slot in the prefetch ring buffer may be overwritten before this entry is consumed */
+					memcpy(&node->tbmres_copy, tbmres, offsetof(TBMIterateResult, offsets) + sizeof(OffsetNumber)*Max(tbmres->ntuples, 0));
+					tbmres = (TBMIterateResult *) &node->tbmres_copy;
+				}
+			}
+			node->tbmres = tbmres;
 			if (tbmres == NULL)
 			{
 				/* no more entries in the bitmap */
@@ -237,7 +237,6 @@ BitmapHeapNext(BitmapHeapScanState *node)
 				/* AM doesn't think this block is valid, skip */
 				continue;
 			}
-
 			if (tbmres->ntuples >= 0)
 				node->exact_pages++;
 			else
@@ -258,19 +257,8 @@ BitmapHeapNext(BitmapHeapScanState *node)
 			 * Try to prefetch at least a few pages even before we get to the
 			 * second page if we don't stop reading after the first tuple.
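With the shared prefetch iterator gone, each backend of a parallel bitmap heap scan now keeps the pages it has prefetched in a small private ring buffer. A sketch of the invariant, using the field names from the diff (prefetch_requests, tbmres_copy and MAX_IO_CONCURRENCY are assumed to be declared in the scan-state header, which this excerpt does not show):

    // prefetch_requests[MAX_IO_CONCURRENCY] holds pages pulled from the shared
    // iterator ahead of consumption; prefetch_head indexes the oldest entry
    // and prefetch_pages counts the valid ones.

    // pop -- consume the next page (BitmapHeapNext, above):
    tbmres = (TBMIterateResult *) &node->prefetch_requests[node->prefetch_head];
    node->prefetch_pages--;
    node->prefetch_head = (node->prefetch_head + 1) % MAX_IO_CONCURRENCY;

    // push -- record a freshly prefetched page (BitmapPrefetch, below):
    memcpy(&node->prefetch_requests[(node->prefetch_head + node->prefetch_pages) % MAX_IO_CONCURRENCY],
           tbmpre,
           offsetof(TBMIterateResult, offsets) + sizeof(OffsetNumber) * Max(tbmpre->ntuples, 0));
    node->prefetch_pages++;

Because a popped slot can be recycled by the very next push, BitmapHeapNext copies the popped entry into node->tbmres_copy before using it.
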
*/ - if (!pstate) - { - if (node->prefetch_target < node->prefetch_maximum) - node->prefetch_target++; - } - else if (pstate->prefetch_target < node->prefetch_maximum) - { - /* take spinlock while updating shared state */ - SpinLockAcquire(&pstate->mutex); - if (pstate->prefetch_target < node->prefetch_maximum) - pstate->prefetch_target++; - SpinLockRelease(&pstate->mutex); - } + if (node->prefetch_target < node->prefetch_maximum) + node->prefetch_target++; #endif /* USE_PREFETCH */ } @@ -361,54 +349,24 @@ BitmapAdjustPrefetchIterator(BitmapHeapScanState *node, TBMIterateResult *tbmres) { #ifdef USE_PREFETCH - ParallelBitmapHeapState *pstate = node->pstate; + TBMIterator *prefetch_iterator = node->prefetch_iterator; - if (pstate == NULL) - { - TBMIterator *prefetch_iterator = node->prefetch_iterator; - - if (node->prefetch_pages > 0) - { - /* The main iterator has closed the distance by one page */ - node->prefetch_pages--; - } - else if (prefetch_iterator) - { - /* Do not let the prefetch iterator get behind the main one */ - TBMIterateResult *tbmpre = tbm_iterate(prefetch_iterator); - - if (tbmpre == NULL || tbmpre->blockno != tbmres->blockno) - elog(ERROR, "prefetch and main iterators are out of sync"); - } + /* NEON: we are not using prefetch iterator for parallel plan so no need to adjust it */ + if (node->pstate != NULL) return; - } - if (node->prefetch_maximum > 0) + if (node->prefetch_pages > 0) { - TBMSharedIterator *prefetch_iterator = node->shared_prefetch_iterator; - - SpinLockAcquire(&pstate->mutex); - if (pstate->prefetch_pages > 0) - { - pstate->prefetch_pages--; - SpinLockRelease(&pstate->mutex); - } - else - { - /* Release the mutex before iterating */ - SpinLockRelease(&pstate->mutex); + /* The main iterator has closed the distance by one page */ + node->prefetch_pages--; + } + else if (prefetch_iterator) + { + /* Do not let the prefetch iterator get behind the main one */ + TBMIterateResult *tbmpre = tbm_iterate(prefetch_iterator); - /* - * In case of shared mode, we can not ensure that the current - * blockno of the main iterator and that of the prefetch iterator - * are same. It's possible that whatever blockno we are - * prefetching will be processed by another process. Therefore, - * we don't validate the blockno here as we do in non-parallel - * case. - */ - if (prefetch_iterator) - tbm_shared_iterate(prefetch_iterator); - } + if (tbmpre == NULL || tbmpre->blockno != tbmres->blockno) + elog(ERROR, "prefetch and main iterators are out of sync"); } #endif /* USE_PREFETCH */ } @@ -425,35 +383,14 @@ static inline void BitmapAdjustPrefetchTarget(BitmapHeapScanState *node) { #ifdef USE_PREFETCH - ParallelBitmapHeapState *pstate = node->pstate; - - if (pstate == NULL) - { - if (node->prefetch_target >= node->prefetch_maximum) - /* don't increase any further */ ; - else if (node->prefetch_target >= node->prefetch_maximum / 2) - node->prefetch_target = node->prefetch_maximum; - else if (node->prefetch_target > 0) - node->prefetch_target *= 2; - else - node->prefetch_target++; - return; - } - - /* Do an unlocked check first to save spinlock acquisitions. 
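The ramp-up that now applies to both the serial and the parallel case (see BitmapAdjustPrefetchTarget in the next hunk) is the former serial-only logic: increment once, then double until half of prefetch_maximum is reached, then jump straight to the maximum. A worked trace, assuming prefetch_maximum = 16:

    // prefetch_target: -1 (prefetch disabled until the first page is returned)
    //                  -> 0 -> 1 -> 2 -> 4 -> 8 -> 16   (8 >= 16/2 jumps to max)
    //                  -> 16 -> ...                     (clamped thereafter)
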
*/ - if (pstate->prefetch_target < node->prefetch_maximum) - { - SpinLockAcquire(&pstate->mutex); - if (pstate->prefetch_target >= node->prefetch_maximum) - /* don't increase any further */ ; - else if (pstate->prefetch_target >= node->prefetch_maximum / 2) - pstate->prefetch_target = node->prefetch_maximum; - else if (pstate->prefetch_target > 0) - pstate->prefetch_target *= 2; - else - pstate->prefetch_target++; - SpinLockRelease(&pstate->mutex); - } + if (node->prefetch_target >= node->prefetch_maximum) + /* don't increase any further */ ; + else if (node->prefetch_target >= node->prefetch_maximum / 2) + node->prefetch_target = node->prefetch_maximum; + else if (node->prefetch_target > 0) + node->prefetch_target *= 2; + else + node->prefetch_target++; #endif /* USE_PREFETCH */ } @@ -507,56 +444,47 @@ BitmapPrefetch(BitmapHeapScanState *node, TableScanDesc scan) PrefetchBuffer(scan->rs_rd, MAIN_FORKNUM, tbmpre->blockno); } } - - return; } - - if (pstate->prefetch_pages < pstate->prefetch_target) + else { - TBMSharedIterator *prefetch_iterator = node->shared_prefetch_iterator; - - if (prefetch_iterator) + while (1) { - while (1) - { - TBMIterateResult *tbmpre; - bool do_prefetch = false; - bool skip_fetch; + TBMIterateResult *tbmpre; + bool do_prefetch = false; + bool skip_fetch; - /* - * Recheck under the mutex. If some other process has already - * done enough prefetching then we need not to do anything. - */ - SpinLockAcquire(&pstate->mutex); - if (pstate->prefetch_pages < pstate->prefetch_target) - { - pstate->prefetch_pages++; - do_prefetch = true; - } - SpinLockRelease(&pstate->mutex); + if (node->prefetch_pages < node->prefetch_target) + { + Assert(node->prefetch_pages < MAX_IO_CONCURRENCY); + do_prefetch = true; + } - if (!do_prefetch) - return; + if (!do_prefetch) + return; - tbmpre = tbm_shared_iterate(prefetch_iterator); - if (tbmpre == NULL) - { - /* No more pages to prefetch */ - tbm_end_shared_iterate(prefetch_iterator); - node->shared_prefetch_iterator = NULL; - break; - } + tbmpre = tbm_shared_iterate(node->shared_tbmiterator); + if (tbmpre != NULL) + { + memcpy(&node->prefetch_requests[(node->prefetch_head + node->prefetch_pages) % MAX_IO_CONCURRENCY], + tbmpre, + offsetof(TBMIterateResult, offsets) + sizeof(OffsetNumber)*Max(tbmpre->ntuples, 0)); + node->prefetch_pages += 1; + } + else + { + /* No more pages to prefetch */ + break; + } - /* As above, skip prefetch if we expect not to need page */ - skip_fetch = (node->can_skip_fetch && - (node->tbmres ? 
!node->tbmres->recheck : false) && - VM_ALL_VISIBLE(node->ss.ss_currentRelation, - tbmpre->blockno, - &node->pvmbuffer)); + /* As above, skip prefetch if we expect not to need page */ + skip_fetch = (node->can_skip_fetch && + !tbmpre->recheck && + VM_ALL_VISIBLE(node->ss.ss_currentRelation, + tbmpre->blockno, + &node->pvmbuffer)); - if (!skip_fetch) - PrefetchBuffer(scan->rs_rd, MAIN_FORKNUM, tbmpre->blockno); - } + if (!skip_fetch) + PrefetchBuffer(scan->rs_rd, MAIN_FORKNUM, tbmpre->blockno); } } #endif /* USE_PREFETCH */ @@ -613,8 +541,6 @@ ExecReScanBitmapHeapScan(BitmapHeapScanState *node) tbm_end_iterate(node->prefetch_iterator); if (node->shared_tbmiterator) tbm_end_shared_iterate(node->shared_tbmiterator); - if (node->shared_prefetch_iterator) - tbm_end_shared_iterate(node->shared_prefetch_iterator); if (node->tbm) tbm_free(node->tbm); if (node->vmbuffer != InvalidBuffer) @@ -627,7 +553,6 @@ ExecReScanBitmapHeapScan(BitmapHeapScanState *node) node->prefetch_iterator = NULL; node->initialized = false; node->shared_tbmiterator = NULL; - node->shared_prefetch_iterator = NULL; node->vmbuffer = InvalidBuffer; node->pvmbuffer = InvalidBuffer; @@ -683,8 +608,6 @@ ExecEndBitmapHeapScan(BitmapHeapScanState *node) tbm_free(node->tbm); if (node->shared_tbmiterator) tbm_end_shared_iterate(node->shared_tbmiterator); - if (node->shared_prefetch_iterator) - tbm_end_shared_iterate(node->shared_prefetch_iterator); if (node->vmbuffer != InvalidBuffer) ReleaseBuffer(node->vmbuffer); if (node->pvmbuffer != InvalidBuffer) @@ -739,7 +662,6 @@ ExecInitBitmapHeapScan(BitmapHeapScan *node, EState *estate, int eflags) scanstate->pscan_len = 0; scanstate->initialized = false; scanstate->shared_tbmiterator = NULL; - scanstate->shared_prefetch_iterator = NULL; scanstate->pstate = NULL; /* @@ -794,8 +716,7 @@ ExecInitBitmapHeapScan(BitmapHeapScan *node, EState *estate, int eflags) * Maximum number of prefetches for the tablespace if configured, * otherwise the current value of the effective_io_concurrency GUC. */ - scanstate->prefetch_maximum = - get_tablespace_io_concurrency(currentRelation->rd_rel->reltablespace); + scanstate->prefetch_maximum = get_tablespace_io_concurrency(currentRelation->rd_rel->reltablespace); scanstate->ss.ss_currentRelation = currentRelation; diff --git a/src/backend/main/main.c b/src/backend/main/main.c index f8f7ebbd445..79d79feb0d4 100644 --- a/src/backend/main/main.c +++ b/src/backend/main/main.c @@ -37,6 +37,7 @@ #include "bootstrap/bootstrap.h" #include "common/username.h" +#include "miscadmin.h" #include "port/atomics.h" #include "postmaster/postmaster.h" #include "storage/spin.h" @@ -55,6 +56,41 @@ static void init_locale(const char *categoryname, int category, const char *loca static void help(const char *progname); static void check_root(const char *progname); +typedef int (*MainFunc) (int argc, char *argv[]); + +static int +CallExtMain(char *library_name, char *main_func_name, int argc, char *argv[], bool load_config) +{ + MainFunc main_func; + + /* + * Perform just enough initialization that we can load external libraries + */ + InitStandaloneProcess(argv[0]); + + SetProcessingMode(InitProcessing); + + /* + * Set default values for command-line options. + */ + InitializeGUCOptions(); + + /* Acquire configuration parameters */ + if (load_config && !SelectConfigFiles(NULL, progname)) + exit(1); + + /* + * Imitate we are early in bootstrap loading shared_preload_libraries; + * neon extension sets PGC_POSTMASTER gucs requiring this. 
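CallExtMain, added above, hands control to an entry point exported by an extension library. Given the MainFunc typedef, the two libraries wired up in main() just below are expected to export functions of this shape (the signatures are inferred from the typedef; only the symbol names appear in this diff):

    /* in neon_walredo.so */
    int         WalRedoMain(int argc, char *argv[]);

    /* in neon.so */
    int         WalProposerSync(int argc, char *argv[]);

load_external_function() resolves the symbol with dlsym(), so plain C linkage and the MainFunc signature are the entire contract.
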
+ */ + process_shared_preload_libraries_in_progress = true; + + main_func = load_external_function(library_name, main_func_name, true, NULL); + + process_shared_preload_libraries_in_progress = false; + + return main_func(argc, argv); +} /* * Any Postgres server process begins execution here. @@ -198,6 +234,10 @@ main(int argc, char *argv[]) else if (argc > 1 && strcmp(argv[1], "--single") == 0) PostgresSingleUserMain(argc, argv, strdup(get_user_name_or_exit(progname))); + else if (argc > 1 && strcmp(argv[1], "--wal-redo") == 0) + CallExtMain("neon_walredo", "WalRedoMain", argc, argv, false); + else if (argc > 1 && strcmp(argv[1], "--sync-safekeepers") == 0) + CallExtMain("neon", "WalProposerSync", argc, argv, true); else PostmasterMain(argc, argv); /* the functions above should not return */ diff --git a/src/backend/optimizer/path/costsize.c b/src/backend/optimizer/path/costsize.c index 0ba26b207b0..bf67672a743 100644 --- a/src/backend/optimizer/path/costsize.c +++ b/src/backend/optimizer/path/costsize.c @@ -152,6 +152,9 @@ bool enable_parallel_append = true; bool enable_parallel_hash = true; bool enable_partition_pruning = true; bool enable_async_append = true; +bool enable_seqscan_prefetch = true; +bool enable_indexscan_prefetch = true; +bool enable_indexonlyscan_prefetch = true; typedef struct { diff --git a/src/backend/postmaster/postmaster.c b/src/backend/postmaster/postmaster.c index 892d42c63ee..5c72554bf70 100644 --- a/src/backend/postmaster/postmaster.c +++ b/src/backend/postmaster/postmaster.c @@ -1023,6 +1023,7 @@ PostmasterMain(int argc, char *argv[]) /* * process any libraries that should be preloaded at postmaster start */ + ereport(LOG, (errmsg("postgres processing shared_preload_libraries"))); process_shared_preload_libraries(); /* @@ -1045,7 +1046,9 @@ PostmasterMain(int argc, char *argv[]) /* * Give preloaded libraries a chance to request additional shared memory. */ + ereport(LOG, (errmsg("postgres processing shmem request"))); process_shmem_requests(); + ereport(LOG, (errmsg("postgres done processing shmem request"))); /* * Now that loadable modules have had their chance to request additional @@ -1083,7 +1086,9 @@ PostmasterMain(int argc, char *argv[]) /* * Set up shared memory and semaphores. */ + ereport(LOG, (errmsg("postgres setting up shared memory"))); reset_shared(); + ereport(LOG, (errmsg("postgres done setting up shared memory"))); /* * Estimate number of openable files. 
This must happen after setting up diff --git a/src/backend/replication/logical/origin.c b/src/backend/replication/logical/origin.c index cd860076016..c9a077ba8be 100644 --- a/src/backend/replication/logical/origin.c +++ b/src/backend/replication/logical/origin.c @@ -82,6 +82,7 @@ #include "nodes/execnodes.h" #include "pgstat.h" #include "replication/logical.h" +#include "replication/message.h" #include "replication/origin.h" #include "storage/condition_variable.h" #include "storage/copydir.h" @@ -562,10 +563,14 @@ CheckPointReplicationOrigin(void) int i; uint32 magic = REPLICATION_STATE_MAGIC; pg_crc32c crc; + char *buf; + size_t chkp_size; if (max_replication_slots == 0) return; + buf = palloc(sizeof(magic) + max_replication_slots*sizeof(ReplicationStateOnDisk) + sizeof(crc)); + INIT_CRC32C(crc); /* make sure no old temp file is remaining */ @@ -599,6 +604,9 @@ CheckPointReplicationOrigin(void) errmsg("could not write to file \"%s\": %m", tmppath))); } + memcpy(buf, &magic, sizeof magic); + chkp_size = sizeof(magic); + COMP_CRC32C(crc, &magic, sizeof(magic)); /* prevent concurrent creations/drops */ @@ -641,6 +649,8 @@ CheckPointReplicationOrigin(void) errmsg("could not write to file \"%s\": %m", tmppath))); } + memcpy(buf + chkp_size, &disk_state, sizeof(disk_state)); + chkp_size += sizeof(disk_state); COMP_CRC32C(crc, &disk_state, sizeof(disk_state)); } @@ -660,6 +670,15 @@ CheckPointReplicationOrigin(void) errmsg("could not write to file \"%s\": %m", tmppath))); } + if (chkp_size != sizeof(magic)) /* has some valid origins */ + { + memcpy(buf + chkp_size, &crc, sizeof crc); + chkp_size += sizeof(crc); + + /* NEON specific: persist snapshot in storage using logical message */ + LogLogicalMessage("neon-file:pg_logical/replorigin_checkpoint", buf, chkp_size, false); + } + pfree(buf); if (CloseTransientFile(tmpfd) != 0) ereport(PANIC, diff --git a/src/backend/replication/logical/snapbuild.c b/src/backend/replication/logical/snapbuild.c index 4c951678c0a..24c2deb9e11 100644 --- a/src/backend/replication/logical/snapbuild.c +++ b/src/backend/replication/logical/snapbuild.c @@ -126,6 +126,7 @@ #include "miscadmin.h" #include "pgstat.h" #include "replication/logical.h" +#include "replication/message.h" #include "replication/reorderbuffer.h" #include "replication/snapbuild.h" #include "storage/block.h" /* debugging output */ @@ -1599,6 +1600,7 @@ SnapBuildSerialize(SnapBuild *builder, XLogRecPtr lsn) int fd; char tmppath[MAXPGPATH]; char path[MAXPGPATH]; + char prefix[MAXPGPATH]; int ret; struct stat stat_buf; Size sz; @@ -1721,6 +1723,10 @@ SnapBuildSerialize(SnapBuild *builder, XLogRecPtr lsn) (errcode_for_file_access(), errmsg("could not open file \"%s\": %m", tmppath))); + /* NEON specific: persist snapshot in storage using logical message */ + snprintf(prefix, sizeof(prefix), "neon-file:%s", path); + LogLogicalMessage(prefix, (char*)ondisk, needed_length, false); + errno = 0; pgstat_report_wait_start(WAIT_EVENT_SNAPBUILD_WRITE); if ((write(fd, ondisk, needed_length)) != needed_length) @@ -2027,6 +2033,7 @@ CheckPointSnapBuild(void) DIR *snap_dir; struct dirent *snap_de; char path[MAXPGPATH + 21]; + char prefix[MAXPGPATH + 31]; /* * We start off with a minimum of the last redo pointer. 
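The origin.c change above, and the snapbuild.c and slot.c changes that follow, all use one convention: replication state that stock PostgreSQL persists as a local file is mirrored into Neon storage via a logical WAL message whose prefix carries the file path. In sketch form:

    /* persist the file's content in storage */
    LogLogicalMessage("neon-file:<path>", data, size, false);

    /* delete the file from storage */
    LogLogicalMessage("neon-file:<path>", NULL, 0, false);

A storage-side consumer that materializes these files again at compute startup is implied by this scheme but is not part of the diff.
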
No new @@ -2085,6 +2092,10 @@ CheckPointSnapBuild(void) { elog(DEBUG1, "removing snapbuild snapshot %s", path); + /* NEON specific: delete file from storage using logical message */ + snprintf(prefix, sizeof(prefix), "neon-file:%s", path); + LogLogicalMessage(prefix, NULL, 0, false); + /* * It's not particularly harmful, though strange, if we can't * remove the file here. Don't prevent the checkpoint from diff --git a/src/backend/replication/slot.c b/src/backend/replication/slot.c index 80d96db8eb9..698be8f581a 100644 --- a/src/backend/replication/slot.c +++ b/src/backend/replication/slot.c @@ -45,6 +45,7 @@ #include "miscadmin.h" #include "pgstat.h" #include "replication/slot.h" +#include "replication/message.h" #include "storage/fd.h" #include "storage/ipc.h" #include "storage/proc.h" @@ -683,6 +684,15 @@ ReplicationSlotDropPtr(ReplicationSlot *slot) sprintf(path, "pg_replslot/%s", NameStr(slot->data.name)); sprintf(tmppath, "pg_replslot/%s.tmp", NameStr(slot->data.name)); + if (SlotIsLogical(slot)) + { + /* NEON specific: delete slot from storage using logical message */ + char prefix[MAXPGPATH]; + snprintf(prefix, sizeof(prefix), "neon-file:%s/state", path); + elog(LOG, "Drop replication slot %s", path); + LogLogicalMessage(prefix, NULL, 0, false); + } + /* * Rename the slot directory on disk, so that we'll no longer recognize * this as a valid slot. Note that if this fails, we've got to mark the @@ -1649,6 +1659,15 @@ SaveSlotToPath(ReplicationSlot *slot, const char *dir, int elevel) ReplicationSlotOnDiskChecksummedSize); FIN_CRC32C(cp.checksum); + if (SlotIsLogical(slot) && cp.slotdata.restart_lsn != InvalidXLogRecPtr) + { + /* NEON specific: persist slot in storage using logical message */ + char prefix[MAXPGPATH]; + snprintf(prefix, sizeof(prefix), "neon-file:%s", path); + elog(LOG, "Save replication slot at %s restart_lsn=%X/%X", path, LSN_FORMAT_ARGS(cp.slotdata.restart_lsn)); + LogLogicalMessage(prefix, (char*)&cp, sizeof cp, false); + } + errno = 0; pgstat_report_wait_start(WAIT_EVENT_REPLICATION_SLOT_WRITE); if ((write(fd, &cp, sizeof(cp))) != sizeof(cp)) diff --git a/src/backend/replication/walreceiver.c b/src/backend/replication/walreceiver.c index 9452932d590..5c53a3c1086 100644 --- a/src/backend/replication/walreceiver.c +++ b/src/backend/replication/walreceiver.c @@ -497,6 +497,13 @@ WalReceiverMain(void) if (endofwal) break; + /* + * Update WAL statistics, which are produced inside + * issue_xlog_fsync function. This is useful for counting + * WAL flushes, by querying pg_stat_wal. + */ + pgstat_report_wal(true); + /* * Ideally we would reuse a WaitEventSet object repeatedly * here to avoid the overheads of WaitLatchOrSocket on epoll diff --git a/src/backend/replication/walsender.c b/src/backend/replication/walsender.c index 269914bce28..d06582346dd 100644 --- a/src/backend/replication/walsender.c +++ b/src/backend/replication/walsender.c @@ -54,6 +54,7 @@ #include "access/transam.h" #include "access/xact.h" #include "access/xlog_internal.h" +#include "access/xloginsert.h" #include "access/xlogreader.h" #include "access/xlogrecovery.h" #include "access/xlogutils.h" @@ -130,6 +131,11 @@ bool log_replication_commands = false; */ bool wake_wal_senders = false; +/* + * Backpressure hook, detecting how much we should delay. + */ +uint64 (*delay_backend_us)(void) = NULL; + /* * xlogreader used for replication. 
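delay_backend_us, declared above, is how the extension throttles ordinary backends when replicas fall too far behind; the max_replication_*_lag GUCs added to guc.c later in this diff configure the thresholds. A plausible consumer, sketched on the assumption that it runs from the ProcessInterrupts callback added in postgres.c below:

    if (delay_backend_us != NULL)
    {
        uint64      delay = delay_backend_us();

        if (delay > 0)
        {
            pgstat_report_wait_start(WAIT_EVENT_BACK_PRESSURE);
            pg_usleep(delay);
            pgstat_report_wait_end();
        }
    }

WAIT_EVENT_BACK_PRESSURE is the wait event this diff registers in wait_event.c.
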
Note that a WAL sender doing physical * replication does not need xlogreader to read WAL, but it needs one to @@ -253,8 +259,6 @@ static void WalSndWriteData(LogicalDecodingContext *ctx, XLogRecPtr lsn, Transac static void WalSndUpdateProgress(LogicalDecodingContext *ctx, XLogRecPtr lsn, TransactionId xid, bool skipped_xact); static XLogRecPtr WalSndWaitForWal(XLogRecPtr loc); -static void LagTrackerWrite(XLogRecPtr lsn, TimestampTz local_flush_time); -static TimeOffset LagTrackerRead(int head, XLogRecPtr lsn, TimestampTz now); static bool TransactionIdInRecentPast(TransactionId xid, uint32 epoch); static void WalSndSegmentOpen(XLogReaderState *state, XLogSegNo nextSegNo, @@ -2039,7 +2043,7 @@ ProcessStandbyMessage(void) /* * Remember that a walreceiver just confirmed receipt of lsn `lsn`. */ -static void +void PhysicalConfirmReceivedLocation(XLogRecPtr lsn) { bool changed = false; @@ -2078,21 +2082,41 @@ ProcessStandbyReplyMessage(void) flushPtr, applyPtr; bool replyRequested; - TimeOffset writeLag, - flushLag, - applyLag; - bool clearLagTimes; - TimestampTz now; TimestampTz replyTime; - static bool fullyAppliedLastTime = false; - /* the caller already consumed the msgtype byte */ writePtr = pq_getmsgint64(&reply_message); flushPtr = pq_getmsgint64(&reply_message); applyPtr = pq_getmsgint64(&reply_message); replyTime = pq_getmsgint64(&reply_message); replyRequested = pq_getmsgbyte(&reply_message); + ProcessStandbyReply(writePtr, + flushPtr, + applyPtr, + replyTime, + replyRequested); + + elog(LOG, "ProcessStandbyReplyMessage: writelsn %X/%X", + LSN_FORMAT_ARGS(writePtr)); + elog(LOG, "ProcessStandbyReplyMessage: flushlsn %X/%X", + LSN_FORMAT_ARGS(flushPtr)); + elog(LOG, "ProcessStandbyReplyMessage: applylsn %X/%X", + LSN_FORMAT_ARGS(applyPtr)); +} + +void +ProcessStandbyReply(XLogRecPtr writePtr, + XLogRecPtr flushPtr, + XLogRecPtr applyPtr, + TimestampTz replyTime, + bool replyRequested) +{ + TimeOffset writeLag, + flushLag, + applyLag; + bool clearLagTimes; + TimestampTz now; + static bool fullyAppliedLastTime = false; if (message_level_is_interesting(DEBUG2)) { @@ -2275,7 +2299,16 @@ ProcessStandbyHSFeedbackMessage(void) feedbackEpoch = pq_getmsgint(&reply_message, 4); feedbackCatalogXmin = pq_getmsgint(&reply_message, 4); feedbackCatalogEpoch = pq_getmsgint(&reply_message, 4); + ProcessStandbyHSFeedback(replyTime, feedbackXmin, feedbackEpoch, feedbackCatalogXmin, feedbackCatalogEpoch); +} +void +ProcessStandbyHSFeedback(TimestampTz replyTime, + TransactionId feedbackXmin, + uint32 feedbackEpoch, + TransactionId feedbackCatalogXmin, + uint32 feedbackCatalogEpoch) +{ if (message_level_is_interesting(DEBUG2)) { char *replyTimeStr; @@ -2943,9 +2976,12 @@ XLogSendPhysical(void) /* * OK to read and send the slice. */ - resetStringInfo(&output_message); - pq_sendbyte(&output_message, 'w'); + if (output_message.data) + resetStringInfo(&output_message); + else + initStringInfo(&output_message); + pq_sendbyte(&output_message, 'w'); pq_sendint64(&output_message, startptr); /* dataStart */ pq_sendint64(&output_message, SendRqstPtr); /* walEnd */ pq_sendint64(&output_message, 0); /* sendtime, filled in last */ @@ -3127,8 +3163,8 @@ WalSndDone(WalSndSendDataCallback send_data) * flush location if valid, write otherwise. Tools like pg_receivewal will * usually (unless in synchronous mode) return an invalid flush location. */ - replicatedPtr = XLogRecPtrIsInvalid(MyWalSnd->flush) ? 
- MyWalSnd->write : MyWalSnd->flush; + // XXX Zenith uses flush_lsn to pass extra payload, so use write_lsn here + replicatedPtr = MyWalSnd->write; if (WalSndCaughtUp && sentPtr == replicatedPtr && !pq_is_send_pending()) @@ -3700,7 +3736,7 @@ WalSndKeepaliveIfNecessary(void) * eventually reported to have been written, flushed and applied by the * standby in a reply message. */ -static void +void LagTrackerWrite(XLogRecPtr lsn, TimestampTz local_flush_time) { bool buffer_full; @@ -3765,7 +3801,7 @@ LagTrackerWrite(XLogRecPtr lsn, TimestampTz local_flush_time) * Return -1 if no new sample data is available, and otherwise the elapsed * time in microseconds. */ -static TimeOffset +TimeOffset LagTrackerRead(int head, XLogRecPtr lsn, TimestampTz now) { TimestampTz time = 0; diff --git a/src/backend/storage/buffer/buf_init.c b/src/backend/storage/buffer/buf_init.c index 2862e9e412c..bd772e3ff05 100644 --- a/src/backend/storage/buffer/buf_init.c +++ b/src/backend/storage/buffer/buf_init.c @@ -24,6 +24,14 @@ ConditionVariableMinimallyPadded *BufferIOCVArray; WritebackContext BackendWritebackContext; CkptSortItem *CkptBufferIds; +/* + * Buffer with target WAL redo page. + * We must not evict this page from the buffer pool, but we cannot just keep it pinned because + * some WAL redo functions expect the page to not be pinned. So we have a special check in + * localbuf.c to prevent this buffer from being evicted. + */ +Buffer wal_redo_buffer; +bool am_wal_redo_postgres = false; /* * Data Structures: diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c index 9fcb3d6e194..3b83b109f15 100644 --- a/src/backend/storage/buffer/bufmgr.c +++ b/src/backend/storage/buffer/bufmgr.c @@ -56,6 +56,7 @@ #include "utils/rel.h" #include "utils/resowner_private.h" #include "utils/timestamp.h" +#include "replication/walsender.h" /* Note: these two macros only work on shared buffers, not local ones! */ @@ -159,6 +160,9 @@ int checkpoint_flush_after = 0; int bgwriter_flush_after = 0; int backend_flush_after = 0; +/* Evict unpinned pages (for better test coverage) */ +bool zenith_test_evict = false; + /* local state for StartBufferIO and related functions */ static BufferDesc *InProgressBuf = NULL; static bool IsForInput; @@ -802,7 +806,8 @@ ReadBufferWithoutRelcache(RelFileNode rnode, ForkNumber forkNum, { bool hit; - SMgrRelation smgr = smgropen(rnode, InvalidBackendId); + SMgrRelation smgr = smgropen(rnode, InvalidBackendId, + RELPERSISTENCE_PERMANENT); return ReadBuffer_common(smgr, permanent ? RELPERSISTENCE_PERMANENT : RELPERSISTENCE_UNLOGGED, forkNum, blockNum, @@ -824,7 +829,11 @@ ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum, Block bufBlock; bool found; bool isExtend; - bool isLocalBuf = SmgrIsTemp(smgr); + /* + * wal_redo postgres is working in single user mode, we do not need to synchronize access to shared buffer, + * so let's use local buffers instead + */ + bool isLocalBuf = SmgrIsTemp(smgr) || am_wal_redo_postgres; *hit = false; @@ -934,11 +943,14 @@ ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum, */ bufBlock = isLocalBuf ? 
LocalBufHdrGetBlock(bufHdr) : BufHdrGetBlock(bufHdr); if (!PageIsNew((Page) bufBlock)) - ereport(ERROR, + { + // XXX-ZENITH + MemSet((char *) bufBlock, 0, BLCKSZ); + ereport(DEBUG1, (errmsg("unexpected data beyond EOF in block %u of relation %s", blockNum, relpath(smgr->smgr_rnode, forkNum)), errhint("This has been seen to occur with buggy kernels; consider updating your system."))); - + } /* * We *must* do smgrextend before succeeding, else the page will not * be reserved by the kernel, and the next P_NEW call will decide to @@ -1927,6 +1939,32 @@ UnpinBuffer(BufferDesc *buf, bool fixOwner) UnlockBufHdr(buf, buf_state); } ForgetPrivateRefCountEntry(ref); + + if (zenith_test_evict && !InRecovery) + { + buf_state = LockBufHdr(buf); + if (BUF_STATE_GET_REFCOUNT(buf_state) == 0) + { + if (buf_state & BM_DIRTY) + { + ReservePrivateRefCountEntry(); + PinBuffer_Locked(buf); + if (LWLockConditionalAcquire(BufferDescriptorGetContentLock(buf), + LW_SHARED)) + { + FlushOneBuffer(b); + LWLockRelease(BufferDescriptorGetContentLock(buf)); + } + UnpinBuffer(buf, true); + } + else + { + InvalidateBuffer(buf); + } + } + else + UnlockBufHdr(buf, buf_state); + } } } @@ -2848,7 +2886,7 @@ FlushBuffer(BufferDesc *buf, SMgrRelation reln) /* Find smgr relation for buffer */ if (reln == NULL) - reln = smgropen(buf->tag.rnode, InvalidBackendId); + reln = smgropen(buf->tag.rnode, InvalidBackendId, 0); TRACE_POSTGRESQL_BUFFER_FLUSH_START(buf->tag.forkNum, buf->tag.blockNum, @@ -3725,7 +3763,9 @@ RelationCopyStorageUsingBuffer(RelFileNode srcnode, use_wal = XLogIsNeeded() && (permanent || forkNum == INIT_FORKNUM); /* Get number of blocks in the source relation. */ - nblocks = smgrnblocks(smgropen(srcnode, InvalidBackendId), + nblocks = smgrnblocks(smgropen(srcnode, InvalidBackendId, + permanent ? RELPERSISTENCE_PERMANENT + :RELPERSISTENCE_UNLOGGED), forkNum); /* Nothing to copy; just return. */ @@ -3737,7 +3777,8 @@ RelationCopyStorageUsingBuffer(RelFileNode srcnode, * relation before starting to copy block by block. */ memset(buf.data, 0, BLCKSZ); - smgrextend(smgropen(dstnode, InvalidBackendId), forkNum, nblocks - 1, + smgrextend(smgropen(dstnode, InvalidBackendId, permanent ? RELPERSISTENCE_PERMANENT + :RELPERSISTENCE_UNLOGGED), forkNum, nblocks - 1, buf.data, true); /* This is a bulk operation, so use buffer access strategies. 
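The new third argument of smgropen(), used throughout these hunks, follows a single convention that the smgropen() comment in smgr.c (further down in this diff) spells out:

    smgropen(rnode, backend, RELPERSISTENCE_PERMANENT);  /* persistence known */
    smgropen(rnode, backend, RELPERSISTENCE_UNLOGGED);
    smgropen(rnode, backend, 0);    /* unknown: legal only for operations that
                                     * can do without it, e.g. smgrwriteback() */

Passing 0 merely defers the decision: smgropen() back-fills the stored value the first time the same relation is opened by a caller that does know its persistence.
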
*/ @@ -3819,9 +3860,9 @@ CreateAndCopyRelationData(RelFileNode src_rnode, RelFileNode dst_rnode, for (ForkNumber forkNum = MAIN_FORKNUM + 1; forkNum <= MAX_FORKNUM; forkNum++) { - if (smgrexists(smgropen(src_rnode, InvalidBackendId), forkNum)) + if (smgrexists(smgropen(src_rnode, InvalidBackendId, relpersistence), forkNum)) { - smgrcreate(smgropen(dst_rnode, InvalidBackendId), forkNum, false); + smgrcreate(smgropen(dst_rnode, InvalidBackendId, relpersistence), forkNum, false); /* * WAL log creation if the relation is persistent, or this is the @@ -4992,7 +5033,7 @@ IssuePendingWritebacks(WritebackContext *context) i += ahead; /* and finally tell the kernel to write the data to storage */ - reln = smgropen(tag.rnode, InvalidBackendId); + reln = smgropen(tag.rnode, InvalidBackendId, 0); smgrwriteback(reln, tag.forkNum, tag.blockNum, nblocks); } diff --git a/src/backend/storage/buffer/localbuf.c b/src/backend/storage/buffer/localbuf.c index e71f95ac1ff..b99ed777b68 100644 --- a/src/backend/storage/buffer/localbuf.c +++ b/src/backend/storage/buffer/localbuf.c @@ -18,12 +18,15 @@ #include "access/parallel.h" #include "catalog/catalog.h" #include "executor/instrument.h" +#include "miscadmin.h" #include "storage/buf_internals.h" #include "storage/bufmgr.h" #include "utils/guc.h" #include "utils/memutils.h" #include "utils/resowner_private.h" +/* ZENITH: prevent eviction of the buffer of target page */ +extern Buffer wal_redo_buffer; /*#define LBDEBUG*/ @@ -182,6 +185,12 @@ LocalBufferAlloc(SMgrRelation smgr, ForkNumber forkNum, BlockNumber blockNum, if (LocalRefCount[b] == 0) { + if (-b - 1 == wal_redo_buffer) + { + /* ZENITH: Prevent eviction of the buffer with target wal redo page */ + continue; + } + buf_state = pg_atomic_read_u32(&bufHdr->state); if (BUF_STATE_GET_USAGECOUNT(buf_state) > 0) @@ -215,7 +224,10 @@ LocalBufferAlloc(SMgrRelation smgr, ForkNumber forkNum, BlockNumber blockNum, Page localpage = (char *) LocalBufHdrGetBlock(bufHdr); /* Find smgr relation for buffer */ - oreln = smgropen(bufHdr->tag.rnode, MyBackendId); + if (am_wal_redo_postgres && MyBackendId == InvalidBackendId) + oreln = smgropen(bufHdr->tag.rnode, MyBackendId, RELPERSISTENCE_PERMANENT); + else + oreln = smgropen(bufHdr->tag.rnode, MyBackendId, RELPERSISTENCE_TEMP); PageSetChecksumInplace(localpage, bufHdr->tag.blockNum); diff --git a/src/backend/storage/freespace/freespace.c b/src/backend/storage/freespace/freespace.c index d41ae37090a..a3c6a55f5d7 100644 --- a/src/backend/storage/freespace/freespace.c +++ b/src/backend/storage/freespace/freespace.c @@ -649,10 +649,18 @@ fsm_extend(Relation rel, BlockNumber fsm_nblocks) /* Extend as needed. */ while (fsm_nblocks_now < fsm_nblocks) { - PageSetChecksumInplace((Page) pg.data, fsm_nblocks_now); + /* + * ZENITH: Initialize FSM pages through buffer cache to prevent loading + * them from pageserver. 
+	 */
+	Buffer		buffer = ReadBufferExtended(rel, FSM_FORKNUM, P_NEW, RBM_ZERO_AND_LOCK, NULL);
+	Page		page = BufferGetPage(buffer);
+
+	PageInit((Page) page, BLCKSZ, 0);
+	PageSetChecksumInplace(page, fsm_nblocks_now);
+	MarkBufferDirty(buffer);
+	UnlockReleaseBuffer(buffer);
 
-		smgrextend(reln, FSM_FORKNUM, fsm_nblocks_now,
-				   pg.data, false);
 		fsm_nblocks_now++;
 	}
 
diff --git a/src/backend/storage/lmgr/lwlocknames.txt b/src/backend/storage/lmgr/lwlocknames.txt
index 6c7cf6c2956..b4652c33ff6 100644
--- a/src/backend/storage/lmgr/lwlocknames.txt
+++ b/src/backend/storage/lmgr/lwlocknames.txt
@@ -53,3 +53,4 @@ XactTruncationLock			44
 # 45 was XactTruncationLock until removal of BackendRandomLock
 WrapLimitsVacuumLock		46
 NotifyQueueTailLock			47
+LastWrittenLsnLock			48
diff --git a/src/backend/storage/smgr/md.c b/src/backend/storage/smgr/md.c
index f14c48da6cf..34dfe7ff97e 100644
--- a/src/backend/storage/smgr/md.c
+++ b/src/backend/storage/smgr/md.c
@@ -612,6 +612,14 @@ mdprefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum)
 	return true;
 }
 
+/*
+ *	md_reset_prefetch() -- Cancel all previously issued prefetch requests
+ */
+void
+md_reset_prefetch(SMgrRelation reln)
+{
+}
+
 /*
  *	mdwriteback() -- Tell the kernel to write pages back to storage.
  *
@@ -1096,7 +1104,7 @@ DropRelationFiles(RelFileNode *delrels, int ndelrels, bool isRedo)
 	srels = palloc(sizeof(SMgrRelation) * ndelrels);
 	for (i = 0; i < ndelrels; i++)
 	{
-		SMgrRelation srel = smgropen(delrels[i], InvalidBackendId);
+		SMgrRelation srel = smgropen(delrels[i], InvalidBackendId, 0);
 
 		if (isRedo)
 		{
@@ -1379,7 +1387,7 @@ _mdnblocks(SMgrRelation reln, ForkNumber forknum, MdfdVec *seg)
 int
 mdsyncfiletag(const FileTag *ftag, char *path)
 {
-	SMgrRelation reln = smgropen(ftag->rnode, InvalidBackendId);
+	SMgrRelation reln = smgropen(ftag->rnode, InvalidBackendId, 0);
 	File		file;
 	bool		need_to_close;
 	int			result,
diff --git a/src/backend/storage/smgr/smgr.c b/src/backend/storage/smgr/smgr.c
index a477f70f0e3..87260673bc8 100644
--- a/src/backend/storage/smgr/smgr.c
+++ b/src/backend/storage/smgr/smgr.c
@@ -18,6 +18,7 @@
 #include "postgres.h"
 
 #include "access/xlogutils.h"
+#include "catalog/pg_tablespace.h"
 #include "lib/ilist.h"
 #include "storage/bufmgr.h"
 #include "storage/ipc.h"
@@ -26,47 +27,8 @@
 #include "utils/hsearch.h"
 #include "utils/inval.h"
 
-
-/*
- * This struct of function pointers defines the API between smgr.c and
- * any individual storage manager module.  Note that smgr subfunctions are
- * generally expected to report problems via elog(ERROR).  An exception is
- * that smgr_unlink should use elog(WARNING), rather than erroring out,
- * because we normally unlink relations during post-commit/abort cleanup,
- * and so it's too late to raise an error.  Also, various conditions that
- * would normally be errors should be allowed during bootstrap and/or WAL
- * recovery --- see comments in md.c for details.
- */ -typedef struct f_smgr -{ - void (*smgr_init) (void); /* may be NULL */ - void (*smgr_shutdown) (void); /* may be NULL */ - void (*smgr_open) (SMgrRelation reln); - void (*smgr_close) (SMgrRelation reln, ForkNumber forknum); - void (*smgr_create) (SMgrRelation reln, ForkNumber forknum, - bool isRedo); - bool (*smgr_exists) (SMgrRelation reln, ForkNumber forknum); - void (*smgr_unlink) (RelFileNodeBackend rnode, ForkNumber forknum, - bool isRedo); - void (*smgr_extend) (SMgrRelation reln, ForkNumber forknum, - BlockNumber blocknum, char *buffer, bool skipFsync); - bool (*smgr_prefetch) (SMgrRelation reln, ForkNumber forknum, - BlockNumber blocknum); - void (*smgr_read) (SMgrRelation reln, ForkNumber forknum, - BlockNumber blocknum, char *buffer); - void (*smgr_write) (SMgrRelation reln, ForkNumber forknum, - BlockNumber blocknum, char *buffer, bool skipFsync); - void (*smgr_writeback) (SMgrRelation reln, ForkNumber forknum, - BlockNumber blocknum, BlockNumber nblocks); - BlockNumber (*smgr_nblocks) (SMgrRelation reln, ForkNumber forknum); - void (*smgr_truncate) (SMgrRelation reln, ForkNumber forknum, - BlockNumber nblocks); - void (*smgr_immedsync) (SMgrRelation reln, ForkNumber forknum); -} f_smgr; - -static const f_smgr smgrsw[] = { +static const f_smgr smgr_md = { /* magnetic disk */ - { .smgr_init = mdinit, .smgr_shutdown = NULL, .smgr_open = mdopen, @@ -82,11 +44,8 @@ static const f_smgr smgrsw[] = { .smgr_nblocks = mdnblocks, .smgr_truncate = mdtruncate, .smgr_immedsync = mdimmedsync, - } }; -static const int NSmgr = lengthof(smgrsw); - /* * Each backend has a hashtable that stores all extant SMgrRelation objects. * In addition, "unowned" SMgrRelation objects are chained together in a list. @@ -96,7 +55,7 @@ static HTAB *SMgrRelationHash = NULL; static dlist_head unowned_relns; /* local function prototypes */ -static void smgrshutdown(int code, Datum arg); +//static void smgrshutdown(int code, Datum arg); /* @@ -110,40 +69,80 @@ static void smgrshutdown(int code, Datum arg); void smgrinit(void) { - int i; + (*smgr_init_hook)(); - for (i = 0; i < NSmgr; i++) - { - if (smgrsw[i].smgr_init) - smgrsw[i].smgr_init(); - } + /* + * ZENITH XXX + * This doesn't work with inmem_smgr, so temporarily disable. + * Anyway, we don't have any real smgrshutdown function. + */ + // /* register the shutdown proc */ + // on_proc_exit(smgrshutdown, 0); +} - /* register the shutdown proc */ - on_proc_exit(smgrshutdown, 0); +//ZENITH XXX See comment above. Silence compiler warning. 
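smgr_hook, declared just below, is the switch point that lets the Neon extension substitute its own storage manager for md.c. A sketch of how an extension would install one (every neon_* name here is hypothetical):

    static const f_smgr neon_smgr = {
        .smgr_open = neon_open,
        .smgr_read = neon_read,
        /* ... remaining callbacks ... */
    };

    static const f_smgr *
    neon_smgr_selector(BackendId backend, RelFileNode rnode)
    {
        return &neon_smgr;
    }

    void
    _PG_init(void)
    {
        smgr_hook = neon_smgr_selector;
    }

smgr() consults the hook on every smgropen() and falls back to smgr_standard(), i.e. md.c, when no hook is installed.
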
+// /* +// * on_proc_exit hook for smgr cleanup during backend shutdown +// */ +// static void +// smgrshutdown(int code, Datum arg) +// { +// if (smgr_shutdown_hook) +// (*smgr_shutdown_hook)(); + +// smgr_shutdown_standard(); +// } + +/* Hook for plugins to get control in smgr */ +smgr_hook_type smgr_hook = NULL; +smgr_init_hook_type smgr_init_hook = smgr_init_standard; +smgr_shutdown_hook_type smgr_shutdown_hook = NULL; + +const f_smgr * +smgr_standard(BackendId backend, RelFileNode rnode) +{ + return &smgr_md; } -/* - * on_proc_exit hook for smgr cleanup during backend shutdown - */ -static void -smgrshutdown(int code, Datum arg) +void +smgr_init_standard(void) { - int i; + mdinit(); +} - for (i = 0; i < NSmgr; i++) +void +smgr_shutdown_standard(void) +{ +} + +const f_smgr * +smgr(BackendId backend, RelFileNode rnode) +{ + const f_smgr *result; + + if (smgr_hook) { - if (smgrsw[i].smgr_shutdown) - smgrsw[i].smgr_shutdown(); + result = (*smgr_hook)(backend, rnode); } + else + result = smgr_standard(backend, rnode); + + return result; } + /* * smgropen() -- Return an SMgrRelation object, creating it if need be. * * This does not attempt to actually open the underlying file. + * + * The caller should pass the value of pg_class.relpersistence, if they know + * it, or 0 if unknown. Some operations, like smgrwrite() and smgrunlink() + * are allowed when relpersistence is not known, but others like smgrread() + * require it. */ SMgrRelation -smgropen(RelFileNode rnode, BackendId backend) +smgropen(RelFileNode rnode, BackendId backend, char relpersistence) { RelFileNodeBackend brnode; SMgrRelation reln; @@ -174,16 +173,33 @@ smgropen(RelFileNode rnode, BackendId backend) /* hash_search already filled in the lookup key */ reln->smgr_owner = NULL; reln->smgr_targblock = InvalidBlockNumber; + reln->smgr_relpersistence = relpersistence; for (int i = 0; i <= MAX_FORKNUM; ++i) reln->smgr_cached_nblocks[i] = InvalidBlockNumber; - reln->smgr_which = 0; /* we only have md.c at present */ + + reln->smgr = smgr(backend, rnode); /* implementation-specific initialization */ - smgrsw[reln->smgr_which].smgr_open(reln); + (*reln->smgr).smgr_open(reln); /* it has no owner yet */ dlist_push_tail(&unowned_relns, &reln->node); } + else + { + /* + * If the caller passed a valid 'relpersistence', and it was unknown + * before, update it. 
+ */ + if (reln->smgr_relpersistence == 0) + reln->smgr_relpersistence = relpersistence; + else + { + if (!(relpersistence == 0 || reln->smgr_relpersistence == relpersistence)) + elog(ERROR, "relpersistence mismatch: smgropen %c vs SmgrRelation %c", + relpersistence, reln->smgr_relpersistence); + } + } return reln; } @@ -246,7 +262,7 @@ smgrclearowner(SMgrRelation *owner, SMgrRelation reln) bool smgrexists(SMgrRelation reln, ForkNumber forknum) { - return smgrsw[reln->smgr_which].smgr_exists(reln, forknum); + return (*reln->smgr).smgr_exists(reln, forknum); } /* @@ -259,7 +275,7 @@ smgrclose(SMgrRelation reln) ForkNumber forknum; for (forknum = 0; forknum <= MAX_FORKNUM; forknum++) - smgrsw[reln->smgr_which].smgr_close(reln, forknum); + (*reln->smgr).smgr_close(reln, forknum); owner = reln->smgr_owner; @@ -289,7 +305,7 @@ smgrrelease(SMgrRelation reln) { for (ForkNumber forknum = 0; forknum <= MAX_FORKNUM; forknum++) { - smgrsw[reln->smgr_which].smgr_close(reln, forknum); + (*reln->smgr).smgr_close(reln, forknum); reln->smgr_cached_nblocks[forknum] = InvalidBlockNumber; } } @@ -368,7 +384,7 @@ smgrclosenode(RelFileNodeBackend rnode) void smgrcreate(SMgrRelation reln, ForkNumber forknum, bool isRedo) { - smgrsw[reln->smgr_which].smgr_create(reln, forknum, isRedo); + (*reln->smgr).smgr_create(reln, forknum, isRedo); } /* @@ -396,12 +412,10 @@ smgrdosyncall(SMgrRelation *rels, int nrels) */ for (i = 0; i < nrels; i++) { - int which = rels[i]->smgr_which; - for (forknum = 0; forknum <= MAX_FORKNUM; forknum++) { - if (smgrsw[which].smgr_exists(rels[i], forknum)) - smgrsw[which].smgr_immedsync(rels[i], forknum); + if ((*rels[i]->smgr).smgr_exists(rels[i], forknum)) + (*rels[i]->smgr).smgr_immedsync(rels[i], forknum); } } } @@ -440,13 +454,12 @@ smgrdounlinkall(SMgrRelation *rels, int nrels, bool isRedo) for (i = 0; i < nrels; i++) { RelFileNodeBackend rnode = rels[i]->smgr_rnode; - int which = rels[i]->smgr_which; rnodes[i] = rnode; /* Close the forks at smgr level */ for (forknum = 0; forknum <= MAX_FORKNUM; forknum++) - smgrsw[which].smgr_close(rels[i], forknum); + (*rels[i]->smgr).smgr_close(rels[i], forknum); } /* @@ -470,10 +483,8 @@ smgrdounlinkall(SMgrRelation *rels, int nrels, bool isRedo) for (i = 0; i < nrels; i++) { - int which = rels[i]->smgr_which; - for (forknum = 0; forknum <= MAX_FORKNUM; forknum++) - smgrsw[which].smgr_unlink(rnodes[i], forknum, isRedo); + (*rels[i]->smgr).smgr_unlink(rnodes[i], forknum, isRedo); } pfree(rnodes); @@ -493,7 +504,7 @@ void smgrextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, char *buffer, bool skipFsync) { - smgrsw[reln->smgr_which].smgr_extend(reln, forknum, blocknum, + (*reln->smgr).smgr_extend(reln, forknum, blocknum, buffer, skipFsync); /* @@ -517,7 +528,7 @@ smgrextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, bool smgrprefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum) { - return smgrsw[reln->smgr_which].smgr_prefetch(reln, forknum, blocknum); + return (*reln->smgr).smgr_prefetch(reln, forknum, blocknum); } /* @@ -532,7 +543,7 @@ void smgrread(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, char *buffer) { - smgrsw[reln->smgr_which].smgr_read(reln, forknum, blocknum, buffer); + (*reln->smgr).smgr_read(reln, forknum, blocknum, buffer); } /* @@ -554,7 +565,7 @@ void smgrwrite(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, char *buffer, bool skipFsync) { - smgrsw[reln->smgr_which].smgr_write(reln, forknum, blocknum, + (*reln->smgr).smgr_write(reln, forknum, 
blocknum, buffer, skipFsync); } @@ -567,7 +578,7 @@ void smgrwriteback(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, BlockNumber nblocks) { - smgrsw[reln->smgr_which].smgr_writeback(reln, forknum, blocknum, + (*reln->smgr).smgr_writeback(reln, forknum, blocknum, nblocks); } @@ -585,7 +596,7 @@ smgrnblocks(SMgrRelation reln, ForkNumber forknum) if (result != InvalidBlockNumber) return result; - result = smgrsw[reln->smgr_which].smgr_nblocks(reln, forknum); + result = (*reln->smgr).smgr_nblocks(reln, forknum); reln->smgr_cached_nblocks[forknum] = result; @@ -651,7 +662,7 @@ smgrtruncate(SMgrRelation reln, ForkNumber *forknum, int nforks, BlockNumber *nb /* Make the cached size is invalid if we encounter an error. */ reln->smgr_cached_nblocks[forknum[i]] = InvalidBlockNumber; - smgrsw[reln->smgr_which].smgr_truncate(reln, forknum[i], nblocks[i]); + (*reln->smgr).smgr_truncate(reln, forknum[i], nblocks[i]); /* * We might as well update the local smgr_cached_nblocks values. The @@ -690,7 +701,31 @@ smgrtruncate(SMgrRelation reln, ForkNumber *forknum, int nforks, BlockNumber *nb void smgrimmedsync(SMgrRelation reln, ForkNumber forknum) { - smgrsw[reln->smgr_which].smgr_immedsync(reln, forknum); + (*reln->smgr).smgr_immedsync(reln, forknum); +} + +/* + * Zenith-added functions to mark the phases of an unlogged index build. + */ +void +smgr_start_unlogged_build(SMgrRelation reln) +{ + if ((*reln->smgr).smgr_start_unlogged_build) + (*reln->smgr).smgr_start_unlogged_build(reln); +} + +void +smgr_finish_unlogged_build_phase_1(SMgrRelation reln) +{ + if ((*reln->smgr).smgr_finish_unlogged_build_phase_1) + (*reln->smgr).smgr_finish_unlogged_build_phase_1(reln); +} + +void +smgr_end_unlogged_build(SMgrRelation reln) +{ + if ((*reln->smgr).smgr_end_unlogged_build) + (*reln->smgr).smgr_end_unlogged_build(reln); } /* diff --git a/src/backend/tcop/postgres.c b/src/backend/tcop/postgres.c index 66294ab4c8b..65bb69ca3e5 100644 --- a/src/backend/tcop/postgres.c +++ b/src/backend/tcop/postgres.c @@ -180,6 +180,8 @@ static ProcSignalReason RecoveryConflictReason; static MemoryContext row_description_context = NULL; static StringInfoData row_description_buf; +process_interrupts_callback_t ProcessInterruptsCallback; + /* ---------------------------------------------------------------- * decls for routines only used in this file * ---------------------------------------------------------------- @@ -3160,6 +3162,7 @@ ProcessInterrupts(void) return; InterruptPending = false; + Retry: if (ProcDiePending) { ProcDiePending = false; @@ -3404,6 +3407,13 @@ ProcessInterrupts(void) if (LogMemoryContextPending) ProcessLogMemoryContextInterrupt(); + + /* Call registered callback if any */ + if (ProcessInterruptsCallback) + { + if (ProcessInterruptsCallback()) + goto Retry; + } } diff --git a/src/backend/utils/activity/wait_event.c b/src/backend/utils/activity/wait_event.c index 87c15b9c6f3..5c11993a6d5 100644 --- a/src/backend/utils/activity/wait_event.c +++ b/src/backend/utils/activity/wait_event.c @@ -503,6 +503,9 @@ pgstat_get_wait_timeout(WaitEventTimeout w) case WAIT_EVENT_VACUUM_TRUNCATE: event_name = "VacuumTruncate"; break; + case WAIT_EVENT_BACK_PRESSURE: + event_name = "BackPressure"; + break; /* no default case, so that compiler will warn */ } diff --git a/src/backend/utils/adt/dbsize.c b/src/backend/utils/adt/dbsize.c index b4a2c8d2197..7e5b9da4032 100644 --- a/src/backend/utils/adt/dbsize.c +++ b/src/backend/utils/adt/dbsize.c @@ -23,6 +23,7 @@ #include "commands/tablespace.h" #include 
"miscadmin.h" #include "storage/fd.h" +#include "storage/smgr.h" #include "utils/acl.h" #include "utils/builtins.h" #include "utils/numeric.h" @@ -98,6 +99,8 @@ db_dir_size(const char *path) return dirsize; } +dbsize_hook_type dbsize_hook = NULL; + /* * calculate size of database in all tablespaces */ @@ -127,6 +130,13 @@ calculate_database_size(Oid dbOid) /* Include pg_default storage */ snprintf(pathname, sizeof(pathname), "base/%u", dbOid); + + if (dbsize_hook) + { + totalsize = (*dbsize_hook)(dbOid); + return totalsize; + } + totalsize = db_dir_size(pathname); /* Scan the non-default tablespaces */ @@ -292,41 +302,17 @@ pg_tablespace_size_name(PG_FUNCTION_ARGS) * is no check here or at the call sites for that. */ static int64 -calculate_relation_size(RelFileNode *rfn, BackendId backend, ForkNumber forknum) +calculate_relation_size(RelFileNode *rfn, BackendId backend, ForkNumber forknum, char relpersistence) { - int64 totalsize = 0; - char *relationpath; - char pathname[MAXPGPATH]; - unsigned int segcount = 0; + SMgrRelation srel = smgropen(*rfn, backend, relpersistence); - relationpath = relpathbackend(*rfn, backend, forknum); - - for (segcount = 0;; segcount++) + if (smgrexists(srel, forknum)) { - struct stat fst; - - CHECK_FOR_INTERRUPTS(); - - if (segcount == 0) - snprintf(pathname, MAXPGPATH, "%s", - relationpath); - else - snprintf(pathname, MAXPGPATH, "%s.%u", - relationpath, segcount); + BlockNumber n = smgrnblocks(srel, forknum); - if (stat(pathname, &fst) < 0) - { - if (errno == ENOENT) - break; - else - ereport(ERROR, - (errcode_for_file_access(), - errmsg("could not stat file \"%s\": %m", pathname))); - } - totalsize += fst.st_size; + return (int64) n * BLCKSZ; } - - return totalsize; + return 0; } Datum @@ -350,7 +336,8 @@ pg_relation_size(PG_FUNCTION_ARGS) PG_RETURN_NULL(); size = calculate_relation_size(&(rel->rd_node), rel->rd_backend, - forkname_to_number(text_to_cstring(forkName))); + forkname_to_number(text_to_cstring(forkName)), + rel->rd_rel->relpersistence); relation_close(rel, AccessShareLock); @@ -375,7 +362,8 @@ calculate_toast_table_size(Oid toastrelid) /* toast heap size, including FSM and VM size */ for (forkNum = 0; forkNum <= MAX_FORKNUM; forkNum++) size += calculate_relation_size(&(toastRel->rd_node), - toastRel->rd_backend, forkNum); + toastRel->rd_backend, forkNum, + toastRel->rd_rel->relpersistence); /* toast index size, including FSM and VM size */ indexlist = RelationGetIndexList(toastRel); @@ -389,7 +377,8 @@ calculate_toast_table_size(Oid toastrelid) AccessShareLock); for (forkNum = 0; forkNum <= MAX_FORKNUM; forkNum++) size += calculate_relation_size(&(toastIdxRel->rd_node), - toastIdxRel->rd_backend, forkNum); + toastIdxRel->rd_backend, forkNum, + toastIdxRel->rd_rel->relpersistence); relation_close(toastIdxRel, AccessShareLock); } @@ -418,7 +407,8 @@ calculate_table_size(Relation rel) */ for (forkNum = 0; forkNum <= MAX_FORKNUM; forkNum++) size += calculate_relation_size(&(rel->rd_node), rel->rd_backend, - forkNum); + forkNum, + rel->rd_rel->relpersistence); /* * Size of toast relation @@ -458,7 +448,8 @@ calculate_indexes_size(Relation rel) for (forkNum = 0; forkNum <= MAX_FORKNUM; forkNum++) size += calculate_relation_size(&(idxRel->rd_node), idxRel->rd_backend, - forkNum); + forkNum, + idxRel->rd_rel->relpersistence); relation_close(idxRel, AccessShareLock); } diff --git a/src/backend/utils/cache/relcache.c b/src/backend/utils/cache/relcache.c index 0ce4400b321..2416116e97c 100644 --- a/src/backend/utils/cache/relcache.c +++ 
b/src/backend/utils/cache/relcache.c @@ -3796,7 +3796,7 @@ RelationSetNewRelfilenode(Relation relation, char persistence) * fails at this stage, the new cluster will need to be recreated * anyway. */ - srel = smgropen(relation->rd_node, relation->rd_backend); + srel = smgropen(relation->rd_node, relation->rd_backend, persistence); smgrdounlinkall(&srel, 1, false); smgrclose(srel); } diff --git a/src/backend/utils/fmgr/dfmgr.c b/src/backend/utils/fmgr/dfmgr.c index 7f9ea972804..b289644a852 100644 --- a/src/backend/utils/fmgr/dfmgr.c +++ b/src/backend/utils/fmgr/dfmgr.c @@ -36,6 +36,7 @@ #include "storage/shmem.h" #include "utils/hsearch.h" +download_extension_file_hook_type download_extension_file_hook = NULL; /* signature for PostgreSQL-specific library init function */ typedef void (*PG_init_t) (void); @@ -79,11 +80,13 @@ static void *internal_load_library(const char *libname); static void incompatible_module_error(const char *libname, const Pg_magic_struct *module_magic_data) pg_attribute_noreturn(); static bool file_exists(const char *name); -static char *expand_dynamic_library_name(const char *name); +static char *expand_dynamic_library_name(const char *name, bool *is_found); static void check_restricted_library_name(const char *name); static char *substitute_libpath_macro(const char *name); static char *find_in_dynamic_libpath(const char *basename); +static void neon_try_load(const char *name); + /* Magic structure that module needs to match to be accepted */ static const Pg_magic_struct magic_data = PG_MODULE_MAGIC_DATA; @@ -108,9 +111,20 @@ load_external_function(const char *filename, const char *funcname, char *fullname; void *lib_handle; void *retval; + bool is_found = true; /* Expand the possibly-abbreviated filename to an exact path name */ - fullname = expand_dynamic_library_name(filename); + fullname = expand_dynamic_library_name(filename, &is_found); + + // if file is not found, try to download it from compute_ctl + if (!is_found && download_extension_file_hook != NULL) + { + // try to download the file + elog(DEBUG3, "load_external_function: try to download file: %s", fullname); + neon_try_load(fullname); + // try to find file locally once again + fullname = expand_dynamic_library_name(filename, &is_found); + } /* Load the shared library, unless we already did */ lib_handle = internal_load_library(fullname); @@ -132,6 +146,47 @@ load_external_function(const char *filename, const char *funcname, return retval; } +void +neon_try_load(const char *name) +{ + bool have_slash; + char *request_name; + + // add .so suffix if it is not present + if (strstr(name, DLSUFFIX) == NULL) + { + request_name = psprintf("%s%s", name, DLSUFFIX); + elog(DEBUG3, "neon_try_load: add DLSUFFIX: %s", request_name); + } + else + { + request_name = pstrdup(name); + elog(DEBUG3, "neon_try_load: DLSUFFIX already present: %s", request_name); + } + + have_slash = (first_dir_separator(request_name) != NULL); + + if (strncmp(request_name, "$libdir/", strlen("$libdir/")) == 0) + { + char *new_request_name = psprintf("%s", request_name + strlen("$libdir/")); + pfree(request_name); + request_name = new_request_name; + + elog(DEBUG3, "neon_try_load: omit $libdir/: %s", request_name); + } + else if (have_slash) + { + ereport(ERROR, + (errcode(ERRCODE_INVALID_NAME), + errmsg("unexpected path in dynamic library name: %s", + name))); + } + + elog(DEBUG3, "neon_try_load: final request_name: %s", request_name); + + download_extension_file_hook(request_name, true); +} + /* * This function loads a shlib file without 
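neon_try_load(), shown above, normalizes the library name before asking compute_ctl for the file. Worked examples of the transformation (the extension names are illustrative; DLSUFFIX is ".so" on Linux):

    // "$libdir/postgis-3"       -> requests "postgis-3.so"
    // "postgis-3"               -> requests "postgis-3.so"
    // "postgis-3.so"            -> requests "postgis-3.so"
    // "/abs/path/postgis-3.so"  -> ERROR: unexpected path in dynamic library name

That is: append DLSUFFIX when it is missing, strip a leading "$libdir/", and reject any other name that still contains a directory separator.
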
looking up any particular * function in it. If the same shlib has previously been loaded, @@ -144,13 +199,24 @@ void load_file(const char *filename, bool restricted) { char *fullname; + bool is_found = true; /* Apply security restriction if requested */ if (restricted) check_restricted_library_name(filename); /* Expand the possibly-abbreviated filename to an exact path name */ - fullname = expand_dynamic_library_name(filename); + fullname = expand_dynamic_library_name(filename, &is_found); + + // if file is not found, try to download it from compute_ctl + if (!is_found && download_extension_file_hook != NULL) + { + // try to download the file + elog(DEBUG3, "load_file: try to download file: %s", fullname); + neon_try_load(fullname); + // try to find file locally once again + fullname = expand_dynamic_library_name(filename, &is_found); + } /* Load the shared library */ (void) internal_load_library(fullname); @@ -168,7 +234,6 @@ lookup_external_function(void *filehandle, const char *funcname) return dlsym(filehandle, funcname); } - /* * Load the specified dynamic-link library file, unless it already is * loaded. Return the pg_dl* handle for the file. @@ -209,6 +274,7 @@ internal_load_library(const char *libname) errmsg("could not access file \"%s\": %m", libname))); + for (file_scanner = file_list; file_scanner != NULL && !SAME_INODE(stat_buf, *file_scanner); @@ -428,7 +494,7 @@ file_exists(const char *name) * The result will always be freshly palloc'd. */ static char * -expand_dynamic_library_name(const char *name) +expand_dynamic_library_name(const char *name, bool *is_found) { bool have_slash; char *new; @@ -474,9 +540,11 @@ expand_dynamic_library_name(const char *name) * If we can't find the file, just return the string as-is. The ensuing * load attempt will fail and report a suitable message. */ + *is_found = false; return pstrdup(name); } + /* * Check a restricted library name. 
It must begin with "$libdir/plugins/" and there must not be any directory separators after that (this is
diff --git a/src/backend/utils/init/miscinit.c b/src/backend/utils/init/miscinit.c
index b25bd0e5838..602153b26c4 100644
--- a/src/backend/utils/init/miscinit.c
+++ b/src/backend/utils/init/miscinit.c
@@ -1666,7 +1666,7 @@ load_libraries(const char *libraries, const char *gucname, bool restricted)
 			filename = expanded;
 		}
 		load_file(filename, restricted);
-		ereport(DEBUG1,
+		ereport(LOG,
 				(errmsg_internal("loaded library \"%s\"", filename)));
 		if (expanded)
 			pfree(expanded);
diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c
index c410ba532d2..8d05ffa1705 100644
--- a/src/backend/utils/misc/guc.c
+++ b/src/backend/utils/misc/guc.c
@@ -40,6 +40,7 @@
 #include "access/transam.h"
 #include "access/twophase.h"
 #include "access/xact.h"
+#include "access/xloginsert.h"
 #include "access/xlog_internal.h"
 #include "access/xlogprefetcher.h"
 #include "access/xlogrecovery.h"
@@ -92,6 +93,7 @@
 #include "storage/pg_shmem.h"
 #include "storage/predicate.h"
 #include "storage/proc.h"
+#include "storage/smgr.h"
 #include "storage/standby.h"
 #include "tcop/tcopprot.h"
 #include "tsearch/ts_cache.h"
@@ -1012,6 +1014,36 @@ static const unit_conversion time_unit_conversion_table[] =
 static struct config_bool ConfigureNamesBool[] =
 {
+	{
+		{"enable_seqscan_prefetch", PGC_USERSET, RESOURCES_ASYNCHRONOUS,
+			gettext_noop("Enables prefetching of next pages in sequential scans."),
+			NULL,
+			GUC_EXPLAIN
+		},
+		&enable_seqscan_prefetch,
+		true,
+		NULL, NULL, NULL
+	},
+	{
+		{"enable_indexscan_prefetch", PGC_USERSET, RESOURCES_ASYNCHRONOUS,
+			gettext_noop("Enables prefetching of heap pages in index scans."),
+			NULL,
+			GUC_EXPLAIN
+		},
+		&enable_indexscan_prefetch,
+		true,
+		NULL, NULL, NULL
+	},
+	{
+		{"enable_indexonlyscan_prefetch", PGC_USERSET, RESOURCES_ASYNCHRONOUS,
+			gettext_noop("Enables prefetching of leaf pages in index-only scans."),
+			NULL,
+			GUC_EXPLAIN
+		},
+		&enable_indexonlyscan_prefetch,
+		true,
+		NULL, NULL, NULL
+	},
 	{
 		{"enable_seqscan", PGC_USERSET, QUERY_TUNING_METHOD,
 			gettext_noop("Enables the planner's use of sequential-scan plans."),
@@ -2172,6 +2204,16 @@ static struct config_bool ConfigureNamesBool[] =
 		NULL, NULL, NULL
 	},
 
+	{
+		{"neon_test_evict", PGC_POSTMASTER, UNGROUPED,
+			gettext_noop("Evict unpinned pages (for better test coverage)."),
+		},
+		&zenith_test_evict,
+		false,
+		NULL, NULL, NULL
+	},
+
 	/* End-of-list marker */
 	{
 		{NULL, 0, 0, NULL, NULL}, NULL, false, NULL, NULL, NULL
@@ -2416,6 +2458,16 @@ static struct config_int ConfigureNamesInt[] =
 		NULL, NULL, NULL
 	},
 
+	{
+		{"lsn_cache_size", PGC_POSTMASTER, UNGROUPED,
+			gettext_noop("Size of last written LSN cache used by Neon."),
+			NULL
+		},
+		&lastWrittenLsnCacheSize,
+		128*1024, 1024, INT_MAX,
+		NULL, NULL, NULL
+	},
+
 	{
 		{"temp_buffers", PGC_USERSET, RESOURCES_MEM,
 			gettext_noop("Sets the maximum number of temporary buffers used by each session."),
@@ -2973,6 +3025,42 @@ static struct config_int ConfigureNamesInt[] =
 		NULL, NULL, NULL
 	},
 
+	{
+		{"max_replication_apply_lag", PGC_POSTMASTER, REPLICATION_SENDING,
+			gettext_noop("Maximal apply lag between master and replicas."),
+			gettext_noop("When the lag between the minimal apply position of a replica "
+						 "and the current LSN exceeds this value, backends are blocked."),
+			GUC_UNIT_MB,
+		},
+		&max_replication_apply_lag,
+		-1, -1, INT_MAX,	/* it should not be smaller than the maximal size of a WAL record */
+		NULL, NULL, NULL
+	},
+
+	{
+		{"max_replication_flush_lag", PGC_POSTMASTER, REPLICATION_SENDING,
REPLICATION_SENDING, + gettext_noop("Maximum flush lag between master and replicas."), + gettext_noop("When the lag between the minimal flush position of replicas and the current LSN exceeds this value, " + "backends are blocked."), + GUC_UNIT_MB, + }, + &max_replication_flush_lag, + -1, -1, INT_MAX, /* it should not be smaller than the maximum size of a WAL record */ + NULL, NULL, NULL + }, + + { + {"max_replication_write_lag", PGC_POSTMASTER, REPLICATION_SENDING, + gettext_noop("Maximum write lag between master and replicas."), + gettext_noop("When the lag between the minimal write position of replicas and the current LSN exceeds this value, " + "backends are blocked."), + GUC_UNIT_MB, + }, + &max_replication_write_lag, + -1, -1, INT_MAX, /* it should not be smaller than the maximum size of a WAL record */ + NULL, NULL, NULL + }, + { {"max_slot_wal_keep_size", PGC_SIGHUP, REPLICATION_SENDING, gettext_noop("Sets the maximum WAL size that can be reserved by replication slots."),
diff --git a/src/bin/initdb/initdb.c b/src/bin/initdb/initdb.c index 429844fdd35..a20326694f4 100644 --- a/src/bin/initdb/initdb.c +++ b/src/bin/initdb/initdb.c @@ -2859,6 +2859,7 @@ main(int argc, char *argv[]) {"discard-caches", no_argument, NULL, 14}, {"locale-provider", required_argument, NULL, 15}, {"icu-locale", required_argument, NULL, 16}, + {"sysid", required_argument, NULL, 17}, {NULL, 0, NULL, 0} }; @@ -3016,6 +3017,9 @@ main(int argc, char *argv[]) case 16: icu_locale = pg_strdup(optarg); break; + case 17: + boot_options = psprintf("%s -s %s", boot_options, optarg); + break; default: /* getopt_long already emitted a complaint */ pg_log_error_hint("Try \"%s --help\" for more information.", progname);
diff --git a/src/bin/pg_waldump/pg_waldump.c b/src/bin/pg_waldump/pg_waldump.c index 5dc60109b12..d408315ec25 100644 --- a/src/bin/pg_waldump/pg_waldump.c +++ b/src/bin/pg_waldump/pg_waldump.c @@ -13,6 +13,7 @@ #include "postgres.h" #include <dirent.h> +#include <limits.h> #include <signal.h> #include <sys/stat.h> #include <unistd.h> @@ -25,8 +26,11 @@ #include "common/fe_memutils.h" #include "common/logging.h" #include "getopt_long.h" +#include "port/pg_bitutils.h" #include "rmgrdesc.h" +#define OFFSET_INVALID ((size_t)-1) + /* * NOTE: For any code change or issue fix here, it is highly recommended to * give a thought about doing the same in pg_walinspect contrib module as well.
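Before the pg_waldump changes continue below, a note on the three max_replication_*_lag GUCs added above: they are expressed in megabytes (GUC_UNIT_MB) and default to -1, meaning disabled. As a rough illustration only (not this patch set's actual implementation, which sits behind the delay_backend_us hook declared in walsender.h later in this diff), a backpressure predicate over those GUCs could look like the sketch below; my_insert_lsn, min_write, min_flush, and min_apply are hypothetical stand-ins for the positions the walproposer tracks.

#include "access/xlog.h"		/* XLogRecPtr */
#include "access/xloginsert.h"	/* the three lag GUCs added above */

/* Convert a GUC value in MB to bytes. */
#define LAG_BYTES(mb) ((uint64) (mb) * 1024 * 1024)

static bool
backpressure_needed(XLogRecPtr my_insert_lsn, XLogRecPtr min_write,
					XLogRecPtr min_flush, XLogRecPtr min_apply)
{
	if (max_replication_write_lag > 0 &&
		my_insert_lsn > min_write + LAG_BYTES(max_replication_write_lag))
		return true;			/* replicas too far behind in received WAL */
	if (max_replication_flush_lag > 0 &&
		my_insert_lsn > min_flush + LAG_BYTES(max_replication_flush_lag))
		return true;			/* replicas too far behind in durable WAL */
	if (max_replication_apply_lag > 0 &&
		my_insert_lsn > min_apply + LAG_BYTES(max_replication_apply_lag))
		return true;			/* replicas too far behind in replayed WAL */
	return false;
}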
@@ -45,8 +49,10 @@ typedef struct XLogDumpPrivate XLogRecPtr startptr; XLogRecPtr endptr; bool endptr_reached; + char *input_filename; } XLogDumpPrivate; + typedef struct XLogDumpConfig { /* display options */ @@ -58,6 +64,8 @@ typedef struct XLogDumpConfig bool stats; bool stats_per_record; + bool ignore_format_errors; + /* filter options */ bool filter_by_rmgr[RM_MAX_ID + 1]; bool filter_by_rmgr_enabled; @@ -86,6 +94,34 @@ sigint_handler(int signum) } #endif +/* calculate ceil(log base 2) of num */ +static int +my_log2(long num) +{ + /* + * guard against too-large input, which would be invalid for + * pg_ceil_log2_*() + */ + if (num > LONG_MAX / 2) + num = LONG_MAX / 2; + +#if SIZEOF_LONG < 8 + return pg_ceil_log2_32(num); +#else + return pg_ceil_log2_64(num); +#endif +} + +/* calculate first power of 2 >= num, bounded to what will fit in an int */ +static int +next_pow2_int(long num) +{ + if (num > INT_MAX / 2) + num = INT_MAX / 2; + return 1 << my_log2(num); +} + + static void print_rmgr_list(void) { @@ -298,6 +334,18 @@ WALDumpOpenSegment(XLogReaderState *state, XLogSegNo nextSegNo, TimeLineID tli = *tli_p; char fname[MAXPGPATH]; int tries; + XLogDumpPrivate *private = state->private_data; + + if (private->input_filename) + { + Assert(nextSegNo == 0); + + state->seg.ws_file = open_file_in_directory(state->segcxt.ws_dir, private->input_filename); + if (state->seg.ws_file >= 0) + return; + + pg_fatal("could not open file \"%s\": %m", private->input_filename); + } XLogFileName(fname, tli, nextSegNo, state->segcxt.ws_segsize); @@ -368,6 +416,7 @@ WALDumpReadPage(XLogReaderState *state, XLogRecPtr targetPagePtr, int reqLen, { WALOpenSegment *seg = &errinfo.wre_seg; char fname[MAXPGPATH]; + char *actual_fname = private->input_filename ? private->input_filename : fname; XLogFileName(fname, seg->ws_tli, seg->ws_segno, state->segcxt.ws_segsize); @@ -376,11 +425,11 @@ WALDumpReadPage(XLogReaderState *state, XLogRecPtr targetPagePtr, int reqLen, { errno = errinfo.wre_errno; pg_fatal("could not read from file %s, offset %d: %m", - fname, errinfo.wre_off); + actual_fname, errinfo.wre_off); } else pg_fatal("could not read from file %s, offset %d: read %d of %d", - fname, errinfo.wre_off, errinfo.wre_read, + actual_fname, errinfo.wre_off, errinfo.wre_read, errinfo.wre_req); } @@ -451,16 +500,26 @@ XLogDumpDisplayRecord(XLogDumpConfig *config, XLogReaderState *record) uint32 fpi_len; uint8 info = XLogRecGetInfo(record); XLogRecPtr xl_prev = XLogRecGetPrev(record); + XLogDumpPrivate *private = record->private_data; StringInfoData s; XLogRecGetLen(record, &rec_len, &fpi_len); - printf("rmgr: %-11s len (rec/tot): %6u/%6u, tx: %10u, lsn: %X/%08X, prev %X/%08X, ", - desc->rm_name, - rec_len, XLogRecGetTotalLen(record), - XLogRecGetXid(record), - LSN_FORMAT_ARGS(record->ReadRecPtr), - LSN_FORMAT_ARGS(xl_prev)); + if (private->input_filename) + printf("rmgr: %-11s len (rec/tot): %6u/%6u, tx: %10u, offset: 0x%lX, prev %X/%08X, ", + desc->rm_name, + rec_len, XLogRecGetTotalLen(record), + XLogRecGetXid(record), + record->ReadRecPtr, + LSN_FORMAT_ARGS(xl_prev)); + else + printf("rmgr: %-11s len (rec/tot): %6u/%6u, tx: %10u, lsn: %X/%08X, prev %X/%08X, ", + desc->rm_name, + rec_len, XLogRecGetTotalLen(record), + XLogRecGetXid(record), + LSN_FORMAT_ARGS(record->ReadRecPtr), + LSN_FORMAT_ARGS(xl_prev)); + id = desc->rm_identify(info); if (id == NULL) @@ -666,7 +725,10 @@ usage(void) printf(_(" -f, --follow keep retrying after reaching end of WAL\n")); printf(_(" -F, --fork=FORK only show records that modify blocks in fork FORK;\n" " valid names are main, fsm, vm, init\n")); + printf(_(" -i, --ignore ignore format errors, skip invalid structures\n")); + printf(_(" -N, --file=FNAME dump log records from a single file\n")); printf(_(" -n, --limit=N number of records to display\n")); + printf(_(" -o, --offset=OFFSET offset of the first record in the file to dump\n")); printf(_(" -p, --path=PATH directory in which to find log segment files or a\n" " directory with a ./pg_wal that contains such files\n" " (default: current directory, ./pg_wal, $PGDATA/pg_wal)\n")); @@ -700,6 +762,9 @@ main(int argc, char **argv) XLogRecPtr first_record; char *waldir = NULL; char *errormsg; + char *fname = NULL; + bool single_file = false; + size_t start_offset = OFFSET_INVALID; static struct option long_options[] = { {"bkp-details", no_argument, NULL, 'b'}, @@ -707,6 +772,9 @@ main(int argc, char **argv) {"end", required_argument, NULL, 'e'}, {"follow", no_argument, NULL, 'f'}, {"fork", required_argument, NULL, 'F'}, + {"file", required_argument, NULL, 'N'}, + {"ignore", no_argument, NULL, 'i'}, + {"offset", required_argument, NULL, 'o'}, {"fullpage", no_argument, NULL, 'w'}, {"help", no_argument, NULL, '?'}, {"limit", required_argument, NULL, 'n'}, @@ -755,6 +823,7 @@ main(int argc, char **argv) private.startptr = InvalidXLogRecPtr; private.endptr = InvalidXLogRecPtr; private.endptr_reached = false; + private.input_filename = NULL; config.quiet = false; config.bkp_details = false; @@ -772,6 +841,7 @@ main(int argc, char **argv) config.filter_by_fpw = false; config.stats = false; config.stats_per_record = false; + config.ignore_format_errors = false; stats.startptr = InvalidXLogRecPtr; stats.endptr = InvalidXLogRecPtr; @@ -782,7 +852,7 @@ main(int argc, char **argv) goto bad_argument; } - while ((option = getopt_long(argc, argv, "bB:e:fF:n:p:qr:R:s:t:wx:z", + while ((option = getopt_long(argc, argv, "bB:e:fF:in:N:o:p:qr:R:s:t:wx:z", long_options, &optindex)) != -1) { switch (option) @@ -821,6 +891,13 @@ main(int argc, char **argv) } config.filter_by_extended = true; break; + case 'N': + fname = pg_strdup(optarg); + single_file = true; + break; + case 'i': + config.ignore_format_errors = true; + break; case 'n': if (sscanf(optarg, "%d", &config.stop_after_records) != 1) { @@ -828,6 +905,13 @@ main(int argc, char **argv) goto bad_argument; } break; + case 'o': + if (sscanf(optarg, "%zu", &start_offset) != 1) + { + pg_log_error("could not parse offset \"%s\"", optarg); + goto bad_argument; + } + break; case 'p': waldir = pg_strdup(optarg); break; @@ -962,6 +1046,73 @@ main(int argc, char **argv) goto bad_argument; } + if (start_offset != OFFSET_INVALID) + { + if (!XLogRecPtrIsInvalid(private.startptr) || !XLogRecPtrIsInvalid(private.endptr)) + { + pg_log_error("cannot specify both an offset and start/end WAL locations"); + goto bad_argument; + } + + if (!single_file) + { + pg_log_error("option --offset can only be used together with --file"); + goto bad_argument; + } + + /* Log records are maxaligned, start at the closest next position */ + private.startptr = MAXALIGN(start_offset); + } + + if (single_file) + { + char *directory = NULL; + int fd; + struct stat statbuf; + + if (config.follow) + { + pg_log_error("option --follow cannot be used in single-file mode"); + goto bad_argument; + } + + if (waldir != NULL) + { + pg_log_error("cannot specify both a single file and a WAL directory"); + goto bad_argument; + } + + split_path(fname, &directory, &private.input_filename); + waldir = directory; + + if (waldir == NULL) + { + char *cwd = pg_malloc(MAXPGPATH); + + if (!getcwd(cwd, MAXPGPATH)) + pg_fatal("could not identify current directory: %m"); + + waldir = cwd; + } + + if (!verify_directory(waldir)) + pg_fatal("could not open directory \"%s\": %m", waldir); + + fd = open_file_in_directory(waldir, private.input_filename); + if (fd < 0) + pg_fatal("could not open file \"%s\"", private.input_filename); + + if (fstat(fd, &statbuf) != 0) + pg_fatal("could not stat file \"%s\"", private.input_filename); + + private.endptr = statbuf.st_size; + + /* Round up the segment size to the next power of 2, with a 1MB minimum */ + WalSegSz = Max(next_pow2_int(private.endptr), 1024 * 1024); + + close(fd); + } + if (waldir != NULL) { /* validate path points to directory */ @@ -980,6 +1131,12 @@ main(int argc, char **argv) int fd; XLogSegNo segno; + if (single_file) + { + pg_log_error("cannot specify both a single file and WAL segment boundaries"); + goto bad_argument; + } + split_path(argv[optind], &directory, &fname); if (waldir == NULL && directory != NULL) @@ -1052,10 +1209,11 @@ main(int argc, char **argv) } } else - waldir = identify_target_directory(waldir, NULL); + if (!single_file) + waldir = identify_target_directory(waldir, NULL); /* we don't know what to print */ - if (XLogRecPtrIsInvalid(private.startptr)) + if (XLogRecPtrIsInvalid(private.startptr) && !single_file) { pg_log_error("no start WAL location given"); goto bad_argument; @@ -1073,13 +1231,27 @@ main(int argc, char **argv) if (!xlogreader_state) pg_fatal("out of memory while allocating a WAL reading processor"); - /* first find a valid recptr to start from */ - first_record = XLogFindNextRecord(xlogreader_state, private.startptr); + if (single_file) + { + if (config.ignore_format_errors) + { + xlogreader_state->skip_page_validation = true; + xlogreader_state->skip_invalid_records = true; + } - if (first_record == InvalidXLogRecPtr) - pg_fatal("could not find a valid record after %X/%X", - LSN_FORMAT_ARGS(private.startptr)); + xlogreader_state->skip_lsn_checks = true; + first_record = private.startptr; + XLogBeginRead(xlogreader_state, first_record); + } + else + { + /* first find a valid recptr to start from */ + first_record = XLogFindNextRecord(xlogreader_state, private.startptr); + if (first_record == InvalidXLogRecPtr) + pg_fatal("could not find a valid record after %X/%X", + LSN_FORMAT_ARGS(private.startptr)); + } /* * Display a message that we're skipping data if `from` wasn't a pointer * to the start of a record and also wasn't a pointer to the beginning of
diff --git a/src/include/Makefile b/src/include/Makefile index 5f257a958c8..ac9ccf71d19 100644 --- a/src/include/Makefile +++ b/src/include/Makefile @@ -47,22 +47,15 @@ install: all installdirs $(INSTALL_DATA) utils/errcodes.h '$(DESTDIR)$(includedir_server)/utils' $(INSTALL_DATA) utils/fmgroids.h '$(DESTDIR)$(includedir_server)/utils' $(INSTALL_DATA) utils/fmgrprotos.h '$(DESTDIR)$(includedir_server)/utils' -# We don't use INSTALL_DATA for performance reasons --- there are a lot of files -# (in fact, we have to take some pains to avoid overlength shell commands here) - cp $(srcdir)/*.h '$(DESTDIR)$(includedir_server)'/ + $(INSTALL_DATA) $(srcdir)/*.h '$(DESTDIR)$(includedir_server)' for dir in $(SUBDIRS); do \ - cp $(srcdir)/$$dir/*.h '$(DESTDIR)$(includedir_server)'/$$dir/ || exit; \ + $(INSTALL_DATA) $(srcdir)/$$dir/*.h '$(DESTDIR)$(includedir_server)'/$$dir || exit; \ done ifeq ($(vpath_build),yes) for file in catalog/schemapg.h catalog/system_fk_info.h catalog/pg_*_d.h parser/gram.h storage/lwlocknames.h utils/probes.h; do \ - cp $$file
'$(DESTDIR)$(includedir_server)'/$$file || exit; \ + $(INSTALL_DATA) $$file '$(DESTDIR)$(includedir_server)'/$$file || exit; \ done endif - cd '$(DESTDIR)$(includedir_server)' && chmod $(INSTALL_DATA_MODE) *.h - for dir in $(SUBDIRS); do \ - cd '$(DESTDIR)$(includedir_server)'/$$dir || exit; \ - chmod $(INSTALL_DATA_MODE) *.h || exit; \ - done installdirs: $(MKDIR_P) '$(DESTDIR)$(includedir)/libpq' '$(DESTDIR)$(includedir_internal)/libpq' diff --git a/src/include/access/heapam.h b/src/include/access/heapam.h index abf62d9df79..32d2340b795 100644 --- a/src/include/access/heapam.h +++ b/src/include/access/heapam.h @@ -71,6 +71,10 @@ typedef struct HeapScanDescData */ ParallelBlockTableScanWorkerData *rs_parallelworkerdata; + /* prefetch info */ + int rs_prefetch_maximum; /* io_concurrency of tablespace */ + int rs_prefetch_target; /* current readahead target */ + /* these fields only used in page-at-a-time mode and for bitmap scans */ int rs_cindex; /* current tuple's index in vistuples */ int rs_ntuples; /* number of visible tuples on page */ diff --git a/src/include/access/heapam_xlog.h b/src/include/access/heapam_xlog.h index 2d8a7f62706..3cdd4c8a75c 100644 --- a/src/include/access/heapam_xlog.h +++ b/src/include/access/heapam_xlog.h @@ -108,6 +108,7 @@ typedef struct xl_heap_delete { TransactionId xmax; /* xmax of the deleted tuple */ OffsetNumber offnum; /* deleted tuple's offset */ + uint32 t_cid; uint8 infobits_set; /* infomask bits */ uint8 flags; } xl_heap_delete; @@ -145,6 +146,7 @@ typedef struct xl_heap_header { uint16 t_infomask2; uint16 t_infomask; + uint32 t_cid; uint8 t_hoff; } xl_heap_header; @@ -186,6 +188,7 @@ typedef struct xl_multi_insert_tuple uint16 datalen; /* size of tuple data that follows */ uint16 t_infomask2; uint16 t_infomask; + uint32 t_cid; uint8 t_hoff; /* TUPLE DATA FOLLOWS AT END OF STRUCT */ } xl_multi_insert_tuple; @@ -215,9 +218,9 @@ typedef struct xl_heap_update OffsetNumber old_offnum; /* old tuple's offset */ uint8 old_infobits_set; /* infomask bits to set on old tuple */ uint8 flags; + uint32 t_cid; TransactionId new_xmax; /* xmax of the new tuple */ OffsetNumber new_offnum; /* new tuple's offset */ - /* * If XLH_UPDATE_CONTAINS_OLD_TUPLE or XLH_UPDATE_CONTAINS_OLD_KEY flags * are set, xl_heap_header and tuple data for the old tuple follow. 
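The rs_prefetch_maximum and rs_prefetch_target fields added to HeapScanDescData above carry the tablespace's io_concurrency limit and the current readahead distance for sequential scans. A minimal sketch of the usual ramp-up pattern follows; the helper name and the ramp details are illustrative assumptions, not the patch's actual heapam.c code, and a real implementation would also remember the last block already requested instead of re-issuing requests on every call.

/*
 * Illustrative only: grow the readahead window gradually so that short
 * scans do not flood the storage manager with useless prefetch requests.
 */
static void
heap_scan_prefetch_sketch(HeapScanDesc scan, BlockNumber cur_blkno)
{
	BlockNumber blkno;

	if (scan->rs_prefetch_maximum <= 0)
		return;					/* prefetching disabled for this tablespace */

	/* widen the window, doubling per step up to the maximum */
	scan->rs_prefetch_target = Min(scan->rs_prefetch_maximum,
								   Max(1, scan->rs_prefetch_target * 2));

	/* issue requests for the blocks ahead of the current position */
	for (blkno = cur_blkno + 1;
		 blkno < scan->rs_nblocks &&
		 blkno <= cur_blkno + scan->rs_prefetch_target;
		 blkno++)
		PrefetchBuffer(scan->rs_base.rs_rd, MAIN_FORKNUM, blkno);
}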
@@ -279,6 +282,7 @@ typedef struct xl_heap_lock { TransactionId locking_xid; /* might be a MultiXactId not xid */ OffsetNumber offnum; /* locked tuple's offset on page */ + uint32 t_cid; int8 infobits_set; /* infomask and infomask2 bits to set */ uint8 flags; /* XLH_LOCK_* flag bits */ } xl_heap_lock;
diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h index 93f8267b483..f50c43cfadd 100644 --- a/src/include/access/nbtree.h +++ b/src/include/access/nbtree.h @@ -1069,6 +1069,22 @@ typedef struct BTScanOpaqueData /* keep these last in struct for efficiency */ BTScanPosData currPos; /* current position data */ BTScanPosData markPos; /* marked position, if any */ + + /* Neon: prefetch state */ + int prefetch_maximum; /* maximum number of prefetch requests */ + + /* Prefetch of referenced heap pages for index scans */ + /* To minimize wasted prefetch requests, we start with prefetch distance 0 + * and increase it until it reaches prefetch_maximum + */ + int current_prefetch_distance; + + /* Prefetch of leaf pages of the B-Tree for index-only scans */ + int n_prefetch_requests; /* number of active prefetch requests */ + int n_prefetch_blocks; /* number of elements in prefetch_blocks */ + int last_prefetch_index; /* current position in prefetch_blocks (prefetch_blocks[0..last_prefetch_index] are already requested) */ + BlockNumber next_parent; /* pointer to next parent page */ + BlockNumber prefetch_blocks[MaxTIDsPerBTreePage + 1]; /* leaves + parent page */ } BTScanOpaqueData; typedef BTScanOpaqueData *BTScanOpaque; @@ -1232,6 +1248,7 @@ extern int32 _bt_compare(Relation rel, BTScanInsert key, Page page, OffsetNumber extern bool _bt_first(IndexScanDesc scan, ScanDirection dir); extern bool _bt_next(IndexScanDesc scan, ScanDirection dir); extern Buffer _bt_get_endpoint(Relation rel, uint32 level, bool rightmost, + BlockNumber *parent, Snapshot snapshot); /*
diff --git a/src/include/access/xlog.h b/src/include/access/xlog.h index cd674c3c23f..727f674b8bf 100644 --- a/src/include/access/xlog.h +++ b/src/include/access/xlog.h @@ -30,6 +30,15 @@ extern PGDLLIMPORT XLogRecPtr ProcLastRecPtr; extern PGDLLIMPORT XLogRecPtr XactLastRecEnd; extern PGDLLIMPORT XLogRecPtr XactLastCommitEnd; +/* + * Pseudo block number used to associate LSN with relation metadata (relation size) + */ +#define REL_METADATA_PSEUDO_BLOCKNO InvalidBlockNumber + +extern bool ZenithRecoveryRequested; +extern XLogRecPtr zenithLastRec; +extern bool zenithWriteOk; + /* these variables are GUC parameters related to XLOG */ extern PGDLLIMPORT int wal_segment_size; extern PGDLLIMPORT int min_wal_size_mb; @@ -53,6 +62,8 @@ extern PGDLLIMPORT bool track_wal_io_timing; extern PGDLLIMPORT int wal_decode_buffer_size; extern PGDLLIMPORT int CheckPointSegments; +extern int lastWrittenLsnCacheSize; + /* Archive modes */ typedef enum ArchiveMode @@ -243,6 +254,21 @@ extern XLogRecPtr GetFlushRecPtr(TimeLineID *insertTLI); extern TimeLineID GetWALInsertionTimeLine(void); extern XLogRecPtr GetLastImportantRecPtr(void); +/* neon specifics */ + +extern void SetLastWrittenLSNForBlock(XLogRecPtr lsn, RelFileNode relfilenode, ForkNumber forknum, BlockNumber blkno); +extern void SetLastWrittenLSNForBlockRange(XLogRecPtr lsn, RelFileNode relfilenode, ForkNumber forknum, BlockNumber from, BlockNumber n_blocks); +extern void SetLastWrittenLSNForDatabase(XLogRecPtr lsn); +extern void SetLastWrittenLSNForRelation(XLogRecPtr lsn, RelFileNode relfilenode, ForkNumber forknum); +extern XLogRecPtr GetLastWrittenLSN(RelFileNode relfilenode, ForkNumber
forknum, BlockNumber blkno); + +extern void SetRedoStartLsn(XLogRecPtr RedoStartLSN); +extern XLogRecPtr GetRedoStartLsn(void); + +extern void SetZenithCurrentClusterSize(uint64 size); +extern uint64 GetZenithCurrentClusterSize(void); + + extern void SetWalWriterSleeping(bool sleeping); extern void assign_max_wal_size(int newval, void *extra); @@ -297,6 +323,8 @@ extern SessionBackupState get_backup_status(void); #define TABLESPACE_MAP "tablespace_map" #define TABLESPACE_MAP_OLD "tablespace_map.old" +#define ZENITH_SIGNAL_FILE "zenith.signal" + /* files to signal promotion to primary */ #define PROMOTE_SIGNAL_FILE "promote" diff --git a/src/include/access/xlog_internal.h b/src/include/access/xlog_internal.h index fae0bef8f5d..1abfbfe95a3 100644 --- a/src/include/access/xlog_internal.h +++ b/src/include/access/xlog_internal.h @@ -354,6 +354,9 @@ extern void XLogRecGetBlockRefInfo(XLogReaderState *record, bool pretty, bool detailed_format, StringInfo buf, uint32 *fpi_len); +extern int ReadPageInternal(XLogReaderState *state, XLogRecPtr pageptr, + int reqLen); + /* * Exported for the functions in timeline.c and xlogarchive.c. Only valid * in the startup process. diff --git a/src/include/access/xloginsert.h b/src/include/access/xloginsert.h index 5fc340c434b..65c6247bd0f 100644 --- a/src/include/access/xloginsert.h +++ b/src/include/access/xloginsert.h @@ -38,6 +38,10 @@ #define REGBUF_KEEP_DATA 0x10 /* include data even if a full-page image * is taken */ +extern int max_replication_apply_lag; +extern int max_replication_flush_lag; +extern int max_replication_write_lag; + /* prototypes for public functions in xloginsert.c: */ extern void XLogBeginInsert(void); extern void XLogSetRecordFlags(uint8 flags); diff --git a/src/include/access/xlogreader.h b/src/include/access/xlogreader.h index 9e63162e429..835428fafd9 100644 --- a/src/include/access/xlogreader.h +++ b/src/include/access/xlogreader.h @@ -216,6 +216,10 @@ struct XLogReaderState /* Set when XLP_FIRST_IS_OVERWRITE_CONTRECORD is found */ XLogRecPtr overwrittenRecPtr; + /* Disable validation to allow dumping corrupt WAL */ + bool skip_page_validation; + bool skip_invalid_records; + bool skip_lsn_checks; /* ---------------------------------------- * Decoded representation of current record @@ -439,5 +443,7 @@ extern bool XLogRecGetBlockTagExtended(XLogReaderState *record, uint8 block_id, RelFileNode *rnode, ForkNumber *forknum, BlockNumber *blknum, Buffer *prefetch_buffer); +extern DecodedXLogRecord * +XLogReadRecordAlloc(XLogReaderState *state, size_t xl_tot_len, bool allow_oversized); #endif /* XLOGREADER_H */ diff --git a/src/include/access/xlogrecovery.h b/src/include/access/xlogrecovery.h index 0aa85d90e89..48eaa8bcbf1 100644 --- a/src/include/access/xlogrecovery.h +++ b/src/include/access/xlogrecovery.h @@ -135,6 +135,7 @@ extern void ShutdownWalRecovery(void); extern void RemovePromoteSignalFiles(void); extern bool HotStandbyActive(void); +extern void XLogWaitForReplayOf(XLogRecPtr redoEndRecPtr); extern XLogRecPtr GetXLogReplayRecPtr(TimeLineID *replayTLI); extern RecoveryPauseState GetRecoveryPauseState(void); extern void SetRecoveryPause(bool recoveryPause); diff --git a/src/include/access/xlogutils.h b/src/include/access/xlogutils.h index c9d0b75a01b..15f155238a0 100644 --- a/src/include/access/xlogutils.h +++ b/src/include/access/xlogutils.h @@ -81,6 +81,12 @@ typedef struct ReadLocalXLogPageNoWaitPrivate bool end_of_wal; /* true, when end of WAL is reached */ } ReadLocalXLogPageNoWaitPrivate; +/* + * Returns true if we 
shouldn't do REDO on that block in the record indicated by + * block_id; false otherwise. + */ +extern bool (*redo_read_buffer_filter) (XLogReaderState *record, uint8 block_id); + extern XLogRedoAction XLogReadBufferForRedo(XLogReaderState *record, uint8 buffer_id, Buffer *buf); extern Buffer XLogInitBufferForRedo(XLogReaderState *record, uint8 block_id);
diff --git a/src/include/commands/explain.h b/src/include/commands/explain.h index 666977fb1f8..642e267f500 100644 --- a/src/include/commands/explain.h +++ b/src/include/commands/explain.h @@ -46,6 +46,7 @@ typedef struct ExplainState bool timing; /* print detailed node timing */ bool summary; /* print total planning and execution timing */ bool settings; /* print modified settings */ + bool prefetch; /* print prefetch statistics */ ExplainFormat format; /* output format */ /* state for output formatting --- not reset for each new plan tree */ int indent; /* current indentation level */
diff --git a/src/include/executor/instrument.h b/src/include/executor/instrument.h index 2945cce3a97..ba396caf18b 100644 --- a/src/include/executor/instrument.h +++ b/src/include/executor/instrument.h @@ -15,6 +15,14 @@ #include "portability/instr_time.h" +/* Prefetch statistics */ +typedef struct +{ + int64 hits; + int64 misses; + int64 expired; + int64 duplicates; +} PrefetchInfo; /* * BufferUsage and WalUsage counters keep being incremented infinitely, @@ -37,6 +45,7 @@ typedef struct BufferUsage instr_time blk_write_time; /* time spent writing blocks */ instr_time temp_blk_read_time; /* time spent reading temp blocks */ instr_time temp_blk_write_time; /* time spent writing temp blocks */ + PrefetchInfo prefetch; /* prefetch statistics */ } BufferUsage; /*
diff --git a/src/include/fmgr.h b/src/include/fmgr.h index 5314b737052..d654047a279 100644 --- a/src/include/fmgr.h +++ b/src/include/fmgr.h @@ -778,4 +778,10 @@ extern PGDLLIMPORT fmgr_hook_type fmgr_hook; #define FmgrHookIsNeeded(fn_oid) \ (!needs_fmgr_hook ? false : (*needs_fmgr_hook)(fn_oid)) + + +/* download_extension_file_hook(filename, is_library) */ +typedef bool (*download_extension_file_hook_type) (const char *, bool); +extern PGDLLIMPORT download_extension_file_hook_type download_extension_file_hook; + #endif /* FMGR_H */
diff --git a/src/include/miscadmin.h b/src/include/miscadmin.h index 3233278b340..026d7e64786 100644 --- a/src/include/miscadmin.h +++ b/src/include/miscadmin.h @@ -107,6 +107,10 @@ extern PGDLLIMPORT volatile uint32 CritSectionCount; /* in tcop/postgres.c */ extern void ProcessInterrupts(void); +/* Callback invoked by ProcessInterrupts in a loop for as long as it returns true. */ +typedef bool (*process_interrupts_callback_t)(void); +extern process_interrupts_callback_t ProcessInterruptsCallback; + /* Test whether an interrupt is pending */ #ifndef WIN32 #define INTERRUPTS_PENDING_CONDITION() \ @@ -492,4 +496,7 @@ extern PGDLLIMPORT shmem_request_hook_type shmem_request_hook; /* in executor/nodeHash.c */ extern size_t get_hash_memory_limit(void); +/* in storage/buffer/buf_init.c */ +extern bool am_wal_redo_postgres; + #endif /* MISCADMIN_H */
diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h index 9fa23e2bb66..c373c7d2663 100644 --- a/src/include/nodes/execnodes.h +++ b/src/include/nodes/execnodes.h @@ -23,6 +23,7 @@ #include "nodes/plannodes.h" #include "nodes/tidbitmap.h" #include "partitioning/partdefs.h" +#include "storage/bufmgr.h" #include "storage/condition_variable.h" #include "utils/hsearch.h" #include "utils/queryenvironment.h" @@ -1690,6 +1691,15 @@ typedef struct ParallelBitmapHeapState char phs_snapshot_data[FLEXIBLE_ARRAY_MEMBER]; } ParallelBitmapHeapState; +typedef struct TBMIteratePrefetchResult +{ + BlockNumber blockno; /* page number containing tuples */ + int ntuples; /* -1 indicates lossy result */ + bool recheck; /* should the tuples be rechecked? */ + /* Note: recheck is always true if ntuples < 0 */ + OffsetNumber offsets[MaxHeapTuplesPerPage]; +} TBMIteratePrefetchResult; + /* ---------------- * BitmapHeapScanState information * @@ -1710,7 +1720,6 @@ typedef struct ParallelBitmapHeapState * pscan_len size of the shared memory for parallel bitmap * initialized is node is ready to iterate * shared_tbmiterator shared iterator - * shared_prefetch_iterator shared iterator for prefetching * pstate shared state for parallel bitmap scan * ---------------- */ @@ -1734,7 +1743,10 @@ typedef struct BitmapHeapScanState Size pscan_len; bool initialized; TBMSharedIterator *shared_tbmiterator; - TBMSharedIterator *shared_prefetch_iterator; + /* parallel worker's private ring buffer of prefetch requests; it lets the same worker access its own prefetch results */ + TBMIteratePrefetchResult prefetch_requests[MAX_IO_CONCURRENCY]; + TBMIteratePrefetchResult tbmres_copy; /* copy of current iterator result */ + int prefetch_head; /* head position in ring buffer */ ParallelBitmapHeapState *pstate; } BitmapHeapScanState;
diff --git a/src/include/optimizer/cost.h b/src/include/optimizer/cost.h index bc12071af6e..f7c33b7e658 100644 --- a/src/include/optimizer/cost.h +++ b/src/include/optimizer/cost.h @@ -69,6 +69,10 @@ extern PGDLLIMPORT bool enable_parallel_append; extern PGDLLIMPORT bool enable_parallel_hash; extern PGDLLIMPORT bool enable_partition_pruning; extern PGDLLIMPORT bool enable_async_append; +extern PGDLLIMPORT bool enable_seqscan_prefetch; +extern PGDLLIMPORT bool enable_indexscan_prefetch; +extern PGDLLIMPORT bool enable_indexonlyscan_prefetch; + extern PGDLLIMPORT int constraint_exclusion; extern double index_pages_fetched(double tuples_fetched, BlockNumber pages,
diff --git a/src/include/pg_config.h.in b/src/include/pg_config.h.in index d09e9f9a1c3..407280ccaf8 100644 --- a/src/include/pg_config.h.in +++ b/src/include/pg_config.h.in @@ -340,6 +340,9 @@ /* Define if you have a function readline library */ #undef HAVE_LIBREADLINE +/* Define to 1 if you have the `seccomp' library (-lseccomp). */ +#undef HAVE_LIBSECCOMP + /* Define to 1 if you have the `selinux' library (-lselinux).
*/ #undef HAVE_LIBSELINUX
diff --git a/src/include/pg_config_manual.h b/src/include/pg_config_manual.h index 8d2e3e3a57d..b0e5b6be896 100644 --- a/src/include/pg_config_manual.h +++ b/src/include/pg_config_manual.h @@ -57,7 +57,7 @@ * version. Example: "ACME Postgres/1.2". Note that the string will appear * in a user-facing error message if an ABI mismatch is detected. */ -#define FMGR_ABI_EXTRA "PostgreSQL" +#define FMGR_ABI_EXTRA "Neon Postgres" /* * Maximum number of columns in an index. There is little point in making
diff --git a/src/include/replication/walsender.h b/src/include/replication/walsender.h index d99a21b0771..96ed5ede4b5 100644 --- a/src/include/replication/walsender.h +++ b/src/include/replication/walsender.h @@ -12,6 +12,7 @@ #ifndef _WALSENDER_H #define _WALSENDER_H +#include "access/xlog.h" #include <signal.h> /* @@ -48,6 +49,25 @@ extern void WalSndWaitStopping(void); extern void HandleWalSndInitStopping(void); extern void WalSndRqstFileReload(void); +/* + * Hook to check for WAL receiving backpressure. + * Return value in microseconds + */ +extern uint64 (*delay_backend_us)(void); + +/* expose these so that they can be reused by the neon walproposer extension */ +extern void LagTrackerWrite(XLogRecPtr lsn, TimestampTz local_flush_time); +extern TimeOffset LagTrackerRead(int head, XLogRecPtr lsn, TimestampTz now); +extern void ProcessStandbyReply(XLogRecPtr writePtr, XLogRecPtr flushPtr, + XLogRecPtr applyPtr, TimestampTz replyTime, + bool replyRequested); +extern void PhysicalConfirmReceivedLocation(XLogRecPtr lsn); +extern void ProcessStandbyHSFeedback(TimestampTz replyTime, + TransactionId feedbackXmin, + uint32 feedbackEpoch, + TransactionId feedbackCatalogXmin, + uint32 feedbackCatalogEpoch); + /* * Remember that we want to wakeup walsenders later *
diff --git a/src/include/storage/buf_internals.h b/src/include/storage/buf_internals.h index a17e7b28a53..14bf8ca3df4 100644 --- a/src/include/storage/buf_internals.h +++ b/src/include/storage/buf_internals.h @@ -278,6 +278,8 @@ typedef struct WritebackContext extern PGDLLIMPORT BufferDescPadded *BufferDescriptors; extern PGDLLIMPORT WritebackContext BackendWritebackContext; +extern Buffer wal_redo_buffer; + /* in localbuf.c */ extern PGDLLIMPORT BufferDesc *LocalBufferDescriptors;
diff --git a/src/include/storage/bufmgr.h b/src/include/storage/bufmgr.h index 58391406f65..ccda751e9e0 100644 --- a/src/include/storage/bufmgr.h +++ b/src/include/storage/bufmgr.h @@ -76,6 +76,8 @@ extern PGDLLIMPORT int checkpoint_flush_after; extern PGDLLIMPORT int backend_flush_after; extern PGDLLIMPORT int bgwriter_flush_after; +extern bool zenith_test_evict; + /* in buf_init.c */ extern PGDLLIMPORT char *BufferBlocks; @@ -84,8 +86,8 @@ extern PGDLLIMPORT int NLocBuffer; extern PGDLLIMPORT Block *LocalBufferBlockPointers; extern PGDLLIMPORT int32 *LocalRefCount; -/* upper limit for effective_io_concurrency */ -#define MAX_IO_CONCURRENCY 1000 +/* upper limit for effective_io_concurrency (better to be a power of 2) */ +#define MAX_IO_CONCURRENCY 1024 /* special block number for ReadBuffer() */ #define P_NEW InvalidBlockNumber /* grow the file to get a new page */
diff --git a/src/include/storage/md.h b/src/include/storage/md.h index ffffa40db71..34fc0f2c0ff 100644 --- a/src/include/storage/md.h +++ b/src/include/storage/md.h @@ -30,6 +30,7 @@ extern void mdextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, char *buffer, bool skipFsync); extern bool mdprefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum); +extern void md_reset_prefetch(SMgrRelation reln); extern void mdread(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, char *buffer); extern void mdwrite(SMgrRelation reln, ForkNumber forknum,
diff --git a/src/include/storage/smgr.h b/src/include/storage/smgr.h index 6b63c60fbd9..2a29dcd194b 100644 --- a/src/include/storage/smgr.h +++ b/src/include/storage/smgr.h @@ -18,6 +18,14 @@ #include "storage/block.h" #include "storage/relfilenode.h" +struct f_smgr; + +/* + * Neon: extended SMGR API. + * This define can be used by extensions to determine that they are built for Neon. + */ +#define NEON_SMGR 1 + /* * smgr.c maintains a table of SMgrRelation objects, which are essentially * cached file handles. An SMgrRelation is created (if not already present) @@ -41,6 +49,9 @@ typedef struct SMgrRelationData /* rnode is the hashtable lookup key, so it must be first! */ RelFileNodeBackend smgr_rnode; /* relation physical identifier */ + /* copy of pg_class.relpersistence, or 0 if not known */ + char smgr_relpersistence; + /* pointer to owning pointer, or NULL if none */ struct SMgrRelationData **smgr_owner; @@ -59,7 +70,7 @@ typedef struct SMgrRelationData * Fields below here are intended to be private to smgr.c and its * submodules. Do not touch them from elsewhere. */ - int smgr_which; /* storage manager selector */ + const struct f_smgr *smgr; /* * for md.c; per-fork arrays of the number of open segments @@ -77,8 +88,68 @@ typedef SMgrRelationData *SMgrRelation; #define SmgrIsTemp(smgr) \ RelFileNodeBackendIsTemp((smgr)->smgr_rnode) + +/* + * This struct of function pointers defines the API between smgr.c and + * any individual storage manager module. Note that smgr subfunctions are + * generally expected to report problems via elog(ERROR). An exception is + * that smgr_unlink should use elog(WARNING), rather than erroring out, + * because we normally unlink relations during post-commit/abort cleanup, + * and so it's too late to raise an error. Also, various conditions that + * would normally be errors should be allowed during bootstrap and/or WAL + * recovery --- see comments in md.c for details. + */ +typedef struct f_smgr +{ + void (*smgr_init) (void); /* may be NULL */ + void (*smgr_shutdown) (void); /* may be NULL */ + void (*smgr_open) (SMgrRelation reln); + void (*smgr_close) (SMgrRelation reln, ForkNumber forknum); + void (*smgr_create) (SMgrRelation reln, ForkNumber forknum, + bool isRedo); + bool (*smgr_exists) (SMgrRelation reln, ForkNumber forknum); + void (*smgr_unlink) (RelFileNodeBackend rnode, ForkNumber forknum, + bool isRedo); + void (*smgr_extend) (SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum, char *buffer, bool skipFsync); + bool (*smgr_prefetch) (SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum); + void (*smgr_read) (SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum, char *buffer); + void (*smgr_write) (SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum, char *buffer, bool skipFsync); + void (*smgr_writeback) (SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum, BlockNumber nblocks); + BlockNumber (*smgr_nblocks) (SMgrRelation reln, ForkNumber forknum); + void (*smgr_truncate) (SMgrRelation reln, ForkNumber forknum, + BlockNumber nblocks); + void (*smgr_immedsync) (SMgrRelation reln, ForkNumber forknum); + + void (*smgr_reset_prefetch) (SMgrRelation reln); + void (*smgr_start_unlogged_build) (SMgrRelation reln); + void (*smgr_finish_unlogged_build_phase_1) (SMgrRelation reln); + void (*smgr_end_unlogged_build) (SMgrRelation reln); +} f_smgr; + +typedef void (*smgr_init_hook_type) (void); +typedef void (*smgr_shutdown_hook_type) (void); +extern PGDLLIMPORT smgr_init_hook_type smgr_init_hook; +extern PGDLLIMPORT smgr_shutdown_hook_type smgr_shutdown_hook; +extern void smgr_init_standard(void); +extern void smgr_shutdown_standard(void); + +/* Alternative implementation of calculate_database_size() */ +typedef int64 (*dbsize_hook_type) (Oid dbOid); +extern PGDLLIMPORT dbsize_hook_type dbsize_hook; + +typedef const f_smgr *(*smgr_hook_type) (BackendId backend, RelFileNode rnode); +extern PGDLLIMPORT smgr_hook_type smgr_hook; +extern const f_smgr *smgr_standard(BackendId backend, RelFileNode rnode); + +extern const f_smgr *smgr(BackendId backend, RelFileNode rnode); + extern void smgrinit(void); -extern SMgrRelation smgropen(RelFileNode rnode, BackendId backend); +extern SMgrRelation smgropen(RelFileNode rnode, BackendId backend, char relpersistence); extern bool smgrexists(SMgrRelation reln, ForkNumber forknum); extern void smgrsetowner(SMgrRelation *owner, SMgrRelation reln); extern void smgrclearowner(SMgrRelation *owner, SMgrRelation reln); @@ -108,4 +179,8 @@ extern void smgrimmedsync(SMgrRelation reln, ForkNumber forknum); extern void AtEOXact_SMgr(void); extern bool ProcessBarrierSmgrRelease(void); +extern void smgr_start_unlogged_build(SMgrRelation reln); +extern void smgr_finish_unlogged_build_phase_1(SMgrRelation reln); +extern void smgr_end_unlogged_build(SMgrRelation reln); + #endif /* SMGR_H */
diff --git a/src/include/utils/rel.h b/src/include/utils/rel.h index a1bc0717567..4f333a21fa1 100644 --- a/src/include/utils/rel.h +++ b/src/include/utils/rel.h @@ -556,7 +556,7 @@ static inline SMgrRelation RelationGetSmgr(Relation rel) { if (unlikely(rel->rd_smgr == NULL)) - smgrsetowner(&(rel->rd_smgr), smgropen(rel->rd_node, rel->rd_backend)); + smgrsetowner(&(rel->rd_smgr), smgropen(rel->rd_node, rel->rd_backend, rel->rd_rel->relpersistence)); return rel->rd_smgr; } #endif /* !FRONTEND */
diff --git a/src/include/utils/wait_event.h b/src/include/utils/wait_event.h index b578e2ec757..5569dee5345 100644
--- a/src/include/utils/wait_event.h +++ b/src/include/utils/wait_event.h @@ -146,7 +146,8 @@ typedef enum WAIT_EVENT_RECOVERY_RETRIEVE_RETRY_INTERVAL, WAIT_EVENT_REGISTER_SYNC_REQUEST, WAIT_EVENT_VACUUM_DELAY, - WAIT_EVENT_VACUUM_TRUNCATE + WAIT_EVENT_VACUUM_TRUNCATE, + WAIT_EVENT_BACK_PRESSURE } WaitEventTimeout; /* ---------- diff --git a/src/test/regress/expected/alter_table_1.out b/src/test/regress/expected/alter_table_1.out new file mode 100644 index 00000000000..0f116d8750d --- /dev/null +++ b/src/test/regress/expected/alter_table_1.out @@ -0,0 +1,4487 @@ +-- +-- ALTER_TABLE +-- +-- Clean up in case a prior regression run failed +SET client_min_messages TO 'warning'; +DROP ROLE IF EXISTS regress_alter_table_user1; +RESET client_min_messages; +CREATE USER regress_alter_table_user1; +-- +-- add attribute +-- +CREATE TABLE attmp (initial int4); +COMMENT ON TABLE attmp_wrong IS 'table comment'; +ERROR: relation "attmp_wrong" does not exist +COMMENT ON TABLE attmp IS 'table comment'; +COMMENT ON TABLE attmp IS NULL; +ALTER TABLE attmp ADD COLUMN xmin integer; -- fails +ERROR: column name "xmin" conflicts with a system column name +ALTER TABLE attmp ADD COLUMN a int4 default 3; +ALTER TABLE attmp ADD COLUMN b name; +ALTER TABLE attmp ADD COLUMN c text; +ALTER TABLE attmp ADD COLUMN d float8; +ALTER TABLE attmp ADD COLUMN e float4; +ALTER TABLE attmp ADD COLUMN f int2; +ALTER TABLE attmp ADD COLUMN g polygon; +ALTER TABLE attmp ADD COLUMN i char; +ALTER TABLE attmp ADD COLUMN k int4; +ALTER TABLE attmp ADD COLUMN l tid; +ALTER TABLE attmp ADD COLUMN m xid; +ALTER TABLE attmp ADD COLUMN n oidvector; +--ALTER TABLE attmp ADD COLUMN o lock; +ALTER TABLE attmp ADD COLUMN p boolean; +ALTER TABLE attmp ADD COLUMN q point; +ALTER TABLE attmp ADD COLUMN r lseg; +ALTER TABLE attmp ADD COLUMN s path; +ALTER TABLE attmp ADD COLUMN t box; +ALTER TABLE attmp ADD COLUMN v timestamp; +ALTER TABLE attmp ADD COLUMN w interval; +ALTER TABLE attmp ADD COLUMN x float8[]; +ALTER TABLE attmp ADD COLUMN y float4[]; +ALTER TABLE attmp ADD COLUMN z int2[]; +INSERT INTO attmp (a, b, c, d, e, f, g, i, k, l, m, n, p, q, r, s, t, + v, w, x, y, z) + VALUES (4, 'name', 'text', 4.1, 4.1, 2, '(4.1,4.1,3.1,3.1)', + 'c', + 314159, '(1,1)', '512', + '1 2 3 4 5 6 7 8', true, '(1.1,1.1)', '(4.1,4.1,3.1,3.1)', + '(0,2,4.1,4.1,3.1,3.1)', '(4.1,4.1,3.1,3.1)', + 'epoch', '01:00:10', '{1.0,2.0,3.0,4.0}', '{1.0,2.0,3.0,4.0}', '{1,2,3,4}'); +SELECT * FROM attmp; + initial | a | b | c | d | e | f | g | i | k | l | m | n | p | q | r | s | t | v | w | x | y | z +---------+---+------+------+-----+-----+---+-----------------------+---+--------+-------+-----+-----------------+---+-----------+-----------------------+-----------------------------+---------------------+--------------------------+------------------+-----------+-----------+----------- + | 4 | name | text | 4.1 | 4.1 | 2 | ((4.1,4.1),(3.1,3.1)) | c | 314159 | (1,1) | 512 | 1 2 3 4 5 6 7 8 | t | (1.1,1.1) | [(4.1,4.1),(3.1,3.1)] | ((0,2),(4.1,4.1),(3.1,3.1)) | (4.1,4.1),(3.1,3.1) | Thu Jan 01 00:00:00 1970 | @ 1 hour 10 secs | {1,2,3,4} | {1,2,3,4} | {1,2,3,4} +(1 row) + +DROP TABLE attmp; +-- the wolf bug - schema mods caused inconsistent row descriptors +CREATE TABLE attmp ( + initial int4 +); +ALTER TABLE attmp ADD COLUMN a int4; +ALTER TABLE attmp ADD COLUMN b name; +ALTER TABLE attmp ADD COLUMN c text; +ALTER TABLE attmp ADD COLUMN d float8; +ALTER TABLE attmp ADD COLUMN e float4; +ALTER TABLE attmp ADD COLUMN f int2; +ALTER TABLE attmp ADD COLUMN g polygon; +ALTER TABLE 
attmp ADD COLUMN i char; +ALTER TABLE attmp ADD COLUMN k int4; +ALTER TABLE attmp ADD COLUMN l tid; +ALTER TABLE attmp ADD COLUMN m xid; +ALTER TABLE attmp ADD COLUMN n oidvector; +--ALTER TABLE attmp ADD COLUMN o lock; +ALTER TABLE attmp ADD COLUMN p boolean; +ALTER TABLE attmp ADD COLUMN q point; +ALTER TABLE attmp ADD COLUMN r lseg; +ALTER TABLE attmp ADD COLUMN s path; +ALTER TABLE attmp ADD COLUMN t box; +ALTER TABLE attmp ADD COLUMN v timestamp; +ALTER TABLE attmp ADD COLUMN w interval; +ALTER TABLE attmp ADD COLUMN x float8[]; +ALTER TABLE attmp ADD COLUMN y float4[]; +ALTER TABLE attmp ADD COLUMN z int2[]; +INSERT INTO attmp (a, b, c, d, e, f, g, i, k, l, m, n, p, q, r, s, t, + v, w, x, y, z) + VALUES (4, 'name', 'text', 4.1, 4.1, 2, '(4.1,4.1,3.1,3.1)', + 'c', + 314159, '(1,1)', '512', + '1 2 3 4 5 6 7 8', true, '(1.1,1.1)', '(4.1,4.1,3.1,3.1)', + '(0,2,4.1,4.1,3.1,3.1)', '(4.1,4.1,3.1,3.1)', + 'epoch', '01:00:10', '{1.0,2.0,3.0,4.0}', '{1.0,2.0,3.0,4.0}', '{1,2,3,4}'); +SELECT * FROM attmp; + initial | a | b | c | d | e | f | g | i | k | l | m | n | p | q | r | s | t | v | w | x | y | z +---------+---+------+------+-----+-----+---+-----------------------+---+--------+-------+-----+-----------------+---+-----------+-----------------------+-----------------------------+---------------------+--------------------------+------------------+-----------+-----------+----------- + | 4 | name | text | 4.1 | 4.1 | 2 | ((4.1,4.1),(3.1,3.1)) | c | 314159 | (1,1) | 512 | 1 2 3 4 5 6 7 8 | t | (1.1,1.1) | [(4.1,4.1),(3.1,3.1)] | ((0,2),(4.1,4.1),(3.1,3.1)) | (4.1,4.1),(3.1,3.1) | Thu Jan 01 00:00:00 1970 | @ 1 hour 10 secs | {1,2,3,4} | {1,2,3,4} | {1,2,3,4} +(1 row) + +CREATE INDEX attmp_idx ON attmp (a, (d + e), b); +ALTER INDEX attmp_idx ALTER COLUMN 0 SET STATISTICS 1000; +ERROR: column number must be in range from 1 to 32767 +LINE 1: ALTER INDEX attmp_idx ALTER COLUMN 0 SET STATISTICS 1000; + ^ +ALTER INDEX attmp_idx ALTER COLUMN 1 SET STATISTICS 1000; +ERROR: cannot alter statistics on non-expression column "a" of index "attmp_idx" +HINT: Alter statistics on table column instead. +ALTER INDEX attmp_idx ALTER COLUMN 2 SET STATISTICS 1000; +\d+ attmp_idx + Index "public.attmp_idx" + Column | Type | Key? | Definition | Storage | Stats target +--------+------------------+------+------------+---------+-------------- + a | integer | yes | a | plain | + expr | double precision | yes | (d + e) | plain | 1000 + b | cstring | yes | b | plain | +btree, for table "public.attmp" + +ALTER INDEX attmp_idx ALTER COLUMN 3 SET STATISTICS 1000; +ERROR: cannot alter statistics on non-expression column "b" of index "attmp_idx" +HINT: Alter statistics on table column instead. 
+ALTER INDEX attmp_idx ALTER COLUMN 4 SET STATISTICS 1000; +ERROR: column number 4 of relation "attmp_idx" does not exist +ALTER INDEX attmp_idx ALTER COLUMN 2 SET STATISTICS -1; +DROP TABLE attmp; +-- +-- rename - check on both non-temp and temp tables +-- +CREATE TABLE attmp (regtable int); +CREATE TEMP TABLE attmp (attmptable int); +ALTER TABLE attmp RENAME TO attmp_new; +SELECT * FROM attmp; + regtable +---------- +(0 rows) + +SELECT * FROM attmp_new; + attmptable +------------ +(0 rows) + +ALTER TABLE attmp RENAME TO attmp_new2; +SELECT * FROM attmp; -- should fail +ERROR: relation "attmp" does not exist +LINE 1: SELECT * FROM attmp; + ^ +SELECT * FROM attmp_new; + attmptable +------------ +(0 rows) + +SELECT * FROM attmp_new2; + regtable +---------- +(0 rows) + +DROP TABLE attmp_new; +DROP TABLE attmp_new2; +-- check rename of partitioned tables and indexes also +CREATE TABLE part_attmp (a int primary key) partition by range (a); +CREATE TABLE part_attmp1 PARTITION OF part_attmp FOR VALUES FROM (0) TO (100); +ALTER INDEX part_attmp_pkey RENAME TO part_attmp_index; +ALTER INDEX part_attmp1_pkey RENAME TO part_attmp1_index; +ALTER TABLE part_attmp RENAME TO part_at2tmp; +ALTER TABLE part_attmp1 RENAME TO part_at2tmp1; +SET ROLE regress_alter_table_user1; +ALTER INDEX part_attmp_index RENAME TO fail; +ERROR: must be owner of index part_attmp_index +ALTER INDEX part_attmp1_index RENAME TO fail; +ERROR: must be owner of index part_attmp1_index +ALTER TABLE part_at2tmp RENAME TO fail; +ERROR: must be owner of table part_at2tmp +ALTER TABLE part_at2tmp1 RENAME TO fail; +ERROR: must be owner of table part_at2tmp1 +RESET ROLE; +DROP TABLE part_at2tmp; +-- +-- check renaming to a table's array type's autogenerated name +-- (the array type's name should get out of the way) +-- +CREATE TABLE attmp_array (id int); +CREATE TABLE attmp_array2 (id int); +SELECT typname FROM pg_type WHERE oid = 'attmp_array[]'::regtype; + typname +-------------- + _attmp_array +(1 row) + +SELECT typname FROM pg_type WHERE oid = 'attmp_array2[]'::regtype; + typname +--------------- + _attmp_array2 +(1 row) + +ALTER TABLE attmp_array2 RENAME TO _attmp_array; +SELECT typname FROM pg_type WHERE oid = 'attmp_array[]'::regtype; + typname +--------------- + __attmp_array +(1 row) + +SELECT typname FROM pg_type WHERE oid = '_attmp_array[]'::regtype; + typname +---------------- + ___attmp_array +(1 row) + +DROP TABLE _attmp_array; +DROP TABLE attmp_array; +-- renaming to table's own array type's name is an interesting corner case +CREATE TABLE attmp_array (id int); +SELECT typname FROM pg_type WHERE oid = 'attmp_array[]'::regtype; + typname +-------------- + _attmp_array +(1 row) + +ALTER TABLE attmp_array RENAME TO _attmp_array; +SELECT typname FROM pg_type WHERE oid = '_attmp_array[]'::regtype; + typname +--------------- + __attmp_array +(1 row) + +DROP TABLE _attmp_array; +-- ALTER TABLE ... 
RENAME on non-table relations +-- renaming indexes (FIXME: this should probably test the index's functionality) +ALTER INDEX IF EXISTS __onek_unique1 RENAME TO attmp_onek_unique1; +NOTICE: relation "__onek_unique1" does not exist, skipping +ALTER INDEX IF EXISTS __attmp_onek_unique1 RENAME TO onek_unique1; +NOTICE: relation "__attmp_onek_unique1" does not exist, skipping +ALTER INDEX onek_unique1 RENAME TO attmp_onek_unique1; +ALTER INDEX attmp_onek_unique1 RENAME TO onek_unique1; +SET ROLE regress_alter_table_user1; +ALTER INDEX onek_unique1 RENAME TO fail; -- permission denied +ERROR: must be owner of index onek_unique1 +RESET ROLE; +-- renaming views +CREATE VIEW attmp_view (unique1) AS SELECT unique1 FROM tenk1; +ALTER TABLE attmp_view RENAME TO attmp_view_new; +SET ROLE regress_alter_table_user1; +ALTER VIEW attmp_view_new RENAME TO fail; -- permission denied +ERROR: must be owner of view attmp_view_new +RESET ROLE; +-- hack to ensure we get an indexscan here +set enable_seqscan to off; +set enable_bitmapscan to off; +-- 5 values, sorted +SELECT unique1 FROM tenk1 WHERE unique1 < 5; + unique1 +--------- + 0 + 1 + 2 + 3 + 4 +(5 rows) + +reset enable_seqscan; +reset enable_bitmapscan; +DROP VIEW attmp_view_new; +-- toast-like relation name +alter table stud_emp rename to pg_toast_stud_emp; +alter table pg_toast_stud_emp rename to stud_emp; +-- renaming index should rename constraint as well +ALTER TABLE onek ADD CONSTRAINT onek_unique1_constraint UNIQUE (unique1); +ALTER INDEX onek_unique1_constraint RENAME TO onek_unique1_constraint_foo; +ALTER TABLE onek DROP CONSTRAINT onek_unique1_constraint_foo; +-- renaming constraint +ALTER TABLE onek ADD CONSTRAINT onek_check_constraint CHECK (unique1 >= 0); +ALTER TABLE onek RENAME CONSTRAINT onek_check_constraint TO onek_check_constraint_foo; +ALTER TABLE onek DROP CONSTRAINT onek_check_constraint_foo; +-- renaming constraint should rename index as well +ALTER TABLE onek ADD CONSTRAINT onek_unique1_constraint UNIQUE (unique1); +DROP INDEX onek_unique1_constraint; -- to see whether it's there +ERROR: cannot drop index onek_unique1_constraint because constraint onek_unique1_constraint on table onek requires it +HINT: You can drop constraint onek_unique1_constraint on table onek instead. +ALTER TABLE onek RENAME CONSTRAINT onek_unique1_constraint TO onek_unique1_constraint_foo; +DROP INDEX onek_unique1_constraint_foo; -- to see whether it's there +ERROR: cannot drop index onek_unique1_constraint_foo because constraint onek_unique1_constraint_foo on table onek requires it +HINT: You can drop constraint onek_unique1_constraint_foo on table onek instead. +ALTER TABLE onek DROP CONSTRAINT onek_unique1_constraint_foo; +-- renaming constraints vs. 
inheritance +CREATE TABLE constraint_rename_test (a int CONSTRAINT con1 CHECK (a > 0), b int, c int); +\d constraint_rename_test + Table "public.constraint_rename_test" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + a | integer | | | + b | integer | | | + c | integer | | | +Check constraints: + "con1" CHECK (a > 0) + +CREATE TABLE constraint_rename_test2 (a int CONSTRAINT con1 CHECK (a > 0), d int) INHERITS (constraint_rename_test); +NOTICE: merging column "a" with inherited definition +NOTICE: merging constraint "con1" with inherited definition +\d constraint_rename_test2 + Table "public.constraint_rename_test2" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + a | integer | | | + b | integer | | | + c | integer | | | + d | integer | | | +Check constraints: + "con1" CHECK (a > 0) +Inherits: constraint_rename_test + +ALTER TABLE constraint_rename_test2 RENAME CONSTRAINT con1 TO con1foo; -- fail +ERROR: cannot rename inherited constraint "con1" +ALTER TABLE ONLY constraint_rename_test RENAME CONSTRAINT con1 TO con1foo; -- fail +ERROR: inherited constraint "con1" must be renamed in child tables too +ALTER TABLE constraint_rename_test RENAME CONSTRAINT con1 TO con1foo; -- ok +\d constraint_rename_test + Table "public.constraint_rename_test" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + a | integer | | | + b | integer | | | + c | integer | | | +Check constraints: + "con1foo" CHECK (a > 0) +Number of child tables: 1 (Use \d+ to list them.) + +\d constraint_rename_test2 + Table "public.constraint_rename_test2" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + a | integer | | | + b | integer | | | + c | integer | | | + d | integer | | | +Check constraints: + "con1foo" CHECK (a > 0) +Inherits: constraint_rename_test + +ALTER TABLE constraint_rename_test ADD CONSTRAINT con2 CHECK (b > 0) NO INHERIT; +ALTER TABLE ONLY constraint_rename_test RENAME CONSTRAINT con2 TO con2foo; -- ok +ALTER TABLE constraint_rename_test RENAME CONSTRAINT con2foo TO con2bar; -- ok +\d constraint_rename_test + Table "public.constraint_rename_test" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + a | integer | | | + b | integer | | | + c | integer | | | +Check constraints: + "con1foo" CHECK (a > 0) + "con2bar" CHECK (b > 0) NO INHERIT +Number of child tables: 1 (Use \d+ to list them.) + +\d constraint_rename_test2 + Table "public.constraint_rename_test2" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + a | integer | | | + b | integer | | | + c | integer | | | + d | integer | | | +Check constraints: + "con1foo" CHECK (a > 0) +Inherits: constraint_rename_test + +ALTER TABLE constraint_rename_test ADD CONSTRAINT con3 PRIMARY KEY (a); +ALTER TABLE constraint_rename_test RENAME CONSTRAINT con3 TO con3foo; -- ok +\d constraint_rename_test + Table "public.constraint_rename_test" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + a | integer | | not null | + b | integer | | | + c | integer | | | +Indexes: + "con3foo" PRIMARY KEY, btree (a) +Check constraints: + "con1foo" CHECK (a > 0) + "con2bar" CHECK (b > 0) NO INHERIT +Number of child tables: 1 (Use \d+ to list them.) 
+ +\d constraint_rename_test2 + Table "public.constraint_rename_test2" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + a | integer | | not null | + b | integer | | | + c | integer | | | + d | integer | | | +Check constraints: + "con1foo" CHECK (a > 0) +Inherits: constraint_rename_test + +DROP TABLE constraint_rename_test2; +DROP TABLE constraint_rename_test; +ALTER TABLE IF EXISTS constraint_not_exist RENAME CONSTRAINT con3 TO con3foo; -- ok +NOTICE: relation "constraint_not_exist" does not exist, skipping +ALTER TABLE IF EXISTS constraint_rename_test ADD CONSTRAINT con4 UNIQUE (a); +NOTICE: relation "constraint_rename_test" does not exist, skipping +-- renaming constraints with cache reset of target relation +CREATE TABLE constraint_rename_cache (a int, + CONSTRAINT chk_a CHECK (a > 0), + PRIMARY KEY (a)); +ALTER TABLE constraint_rename_cache + RENAME CONSTRAINT chk_a TO chk_a_new; +ALTER TABLE constraint_rename_cache + RENAME CONSTRAINT constraint_rename_cache_pkey TO constraint_rename_pkey_new; +CREATE TABLE like_constraint_rename_cache + (LIKE constraint_rename_cache INCLUDING ALL); +\d like_constraint_rename_cache + Table "public.like_constraint_rename_cache" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + a | integer | | not null | +Indexes: + "like_constraint_rename_cache_pkey" PRIMARY KEY, btree (a) +Check constraints: + "chk_a_new" CHECK (a > 0) + +DROP TABLE constraint_rename_cache; +DROP TABLE like_constraint_rename_cache; +-- FOREIGN KEY CONSTRAINT adding TEST +CREATE TABLE attmp2 (a int primary key); +CREATE TABLE attmp3 (a int, b int); +CREATE TABLE attmp4 (a int, b int, unique(a,b)); +CREATE TABLE attmp5 (a int, b int); +-- Insert rows into attmp2 (pktable) +INSERT INTO attmp2 values (1); +INSERT INTO attmp2 values (2); +INSERT INTO attmp2 values (3); +INSERT INTO attmp2 values (4); +-- Insert rows into attmp3 +INSERT INTO attmp3 values (1,10); +INSERT INTO attmp3 values (1,20); +INSERT INTO attmp3 values (5,50); +-- Try (and fail) to add constraint due to invalid source columns +ALTER TABLE attmp3 add constraint attmpconstr foreign key(c) references attmp2 match full; +ERROR: column "c" referenced in foreign key constraint does not exist +-- Try (and fail) to add constraint due to invalid destination columns explicitly given +ALTER TABLE attmp3 add constraint attmpconstr foreign key(a) references attmp2(b) match full; +ERROR: column "b" referenced in foreign key constraint does not exist +-- Try (and fail) to add constraint due to invalid data +ALTER TABLE attmp3 add constraint attmpconstr foreign key (a) references attmp2 match full; +ERROR: insert or update on table "attmp3" violates foreign key constraint "attmpconstr" +DETAIL: Key (a)=(5) is not present in table "attmp2". +-- Delete failing row +DELETE FROM attmp3 where a=5; +-- Try (and succeed) +ALTER TABLE attmp3 add constraint attmpconstr foreign key (a) references attmp2 match full; +ALTER TABLE attmp3 drop constraint attmpconstr; +INSERT INTO attmp3 values (5,50); +-- Try NOT VALID and then VALIDATE CONSTRAINT, but fails. Delete failure then re-validate +ALTER TABLE attmp3 add constraint attmpconstr foreign key (a) references attmp2 match full NOT VALID; +ALTER TABLE attmp3 validate constraint attmpconstr; +ERROR: insert or update on table "attmp3" violates foreign key constraint "attmpconstr" +DETAIL: Key (a)=(5) is not present in table "attmp2". 
+-- Delete failing row +DELETE FROM attmp3 where a=5; +-- Try (and succeed) and repeat to show it works on already valid constraint +ALTER TABLE attmp3 validate constraint attmpconstr; +ALTER TABLE attmp3 validate constraint attmpconstr; +-- Try a non-verified CHECK constraint +ALTER TABLE attmp3 ADD CONSTRAINT b_greater_than_ten CHECK (b > 10); -- fail +ERROR: check constraint "b_greater_than_ten" of relation "attmp3" is violated by some row +ALTER TABLE attmp3 ADD CONSTRAINT b_greater_than_ten CHECK (b > 10) NOT VALID; -- succeeds +ALTER TABLE attmp3 VALIDATE CONSTRAINT b_greater_than_ten; -- fails +ERROR: check constraint "b_greater_than_ten" of relation "attmp3" is violated by some row +DELETE FROM attmp3 WHERE NOT b > 10; +ALTER TABLE attmp3 VALIDATE CONSTRAINT b_greater_than_ten; -- succeeds +ALTER TABLE attmp3 VALIDATE CONSTRAINT b_greater_than_ten; -- succeeds +-- Test inherited NOT VALID CHECK constraints +select * from attmp3; + a | b +---+---- + 1 | 20 +(1 row) + +CREATE TABLE attmp6 () INHERITS (attmp3); +CREATE TABLE attmp7 () INHERITS (attmp3); +INSERT INTO attmp6 VALUES (6, 30), (7, 16); +ALTER TABLE attmp3 ADD CONSTRAINT b_le_20 CHECK (b <= 20) NOT VALID; +ALTER TABLE attmp3 VALIDATE CONSTRAINT b_le_20; -- fails +ERROR: check constraint "b_le_20" of relation "attmp6" is violated by some row +DELETE FROM attmp6 WHERE b > 20; +ALTER TABLE attmp3 VALIDATE CONSTRAINT b_le_20; -- succeeds +-- An already validated constraint must not be revalidated +CREATE FUNCTION boo(int) RETURNS int IMMUTABLE STRICT LANGUAGE plpgsql AS $$ BEGIN RAISE NOTICE 'boo: %', $1; RETURN $1; END; $$; +INSERT INTO attmp7 VALUES (8, 18); +ALTER TABLE attmp7 ADD CONSTRAINT identity CHECK (b = boo(b)); +NOTICE: boo: 18 +ALTER TABLE attmp3 ADD CONSTRAINT IDENTITY check (b = boo(b)) NOT VALID; +NOTICE: merging constraint "identity" with inherited definition +ALTER TABLE attmp3 VALIDATE CONSTRAINT identity; +NOTICE: boo: 20 +NOTICE: boo: 16 +-- A NO INHERIT constraint should not be looked for in children during VALIDATE CONSTRAINT +create table parent_noinh_convalid (a int); +create table child_noinh_convalid () inherits (parent_noinh_convalid); +insert into parent_noinh_convalid values (1); +insert into child_noinh_convalid values (1); +alter table parent_noinh_convalid add constraint check_a_is_2 check (a = 2) no inherit not valid; +-- fail, because of the row in parent +alter table parent_noinh_convalid validate constraint check_a_is_2; +ERROR: check constraint "check_a_is_2" of relation "parent_noinh_convalid" is violated by some row +delete from only parent_noinh_convalid; +-- ok (parent itself contains no violating rows) +alter table parent_noinh_convalid validate constraint check_a_is_2; +select convalidated from pg_constraint where conrelid = 'parent_noinh_convalid'::regclass and conname = 'check_a_is_2'; + convalidated +-------------- + t +(1 row) + +-- cleanup +drop table parent_noinh_convalid, child_noinh_convalid; +-- Try (and fail) to create constraint from attmp5(a) to attmp4(a) - unique constraint on +-- attmp4 is a,b +ALTER TABLE attmp5 add constraint attmpconstr foreign key(a) references attmp4(a) match full; +ERROR: there is no unique constraint matching given keys for referenced table "attmp4" +DROP TABLE attmp7; +DROP TABLE attmp6; +DROP TABLE attmp5; +DROP TABLE attmp4; +DROP TABLE attmp3; +DROP TABLE attmp2; +-- NOT VALID with plan invalidation -- ensure we don't use a constraint for +-- exclusion until validated +set constraint_exclusion TO 'partition'; +create table nv_parent (d date, 
check (false) no inherit not valid); +-- not valid constraint added at creation time should automatically become valid +\d nv_parent + Table "public.nv_parent" + Column | Type | Collation | Nullable | Default +--------+------+-----------+----------+--------- + d | date | | | +Check constraints: + "nv_parent_check" CHECK (false) NO INHERIT + +create table nv_child_2010 () inherits (nv_parent); +create table nv_child_2011 () inherits (nv_parent); +alter table nv_child_2010 add check (d between '2010-01-01'::date and '2010-12-31'::date) not valid; +alter table nv_child_2011 add check (d between '2011-01-01'::date and '2011-12-31'::date) not valid; +explain (costs off) select * from nv_parent where d between '2011-08-01' and '2011-08-31'; + QUERY PLAN +--------------------------------------------------------------------------- + Append + -> Seq Scan on nv_parent nv_parent_1 + Filter: ((d >= '08-01-2011'::date) AND (d <= '08-31-2011'::date)) + -> Seq Scan on nv_child_2010 nv_parent_2 + Filter: ((d >= '08-01-2011'::date) AND (d <= '08-31-2011'::date)) + -> Seq Scan on nv_child_2011 nv_parent_3 + Filter: ((d >= '08-01-2011'::date) AND (d <= '08-31-2011'::date)) +(7 rows) + +create table nv_child_2009 (check (d between '2009-01-01'::date and '2009-12-31'::date)) inherits (nv_parent); +explain (costs off) select * from nv_parent where d between '2011-08-01'::date and '2011-08-31'::date; + QUERY PLAN +--------------------------------------------------------------------------- + Append + -> Seq Scan on nv_parent nv_parent_1 + Filter: ((d >= '08-01-2011'::date) AND (d <= '08-31-2011'::date)) + -> Seq Scan on nv_child_2010 nv_parent_2 + Filter: ((d >= '08-01-2011'::date) AND (d <= '08-31-2011'::date)) + -> Seq Scan on nv_child_2011 nv_parent_3 + Filter: ((d >= '08-01-2011'::date) AND (d <= '08-31-2011'::date)) +(7 rows) + +explain (costs off) select * from nv_parent where d between '2009-08-01'::date and '2009-08-31'::date; + QUERY PLAN +--------------------------------------------------------------------------- + Append + -> Seq Scan on nv_parent nv_parent_1 + Filter: ((d >= '08-01-2009'::date) AND (d <= '08-31-2009'::date)) + -> Seq Scan on nv_child_2010 nv_parent_2 + Filter: ((d >= '08-01-2009'::date) AND (d <= '08-31-2009'::date)) + -> Seq Scan on nv_child_2011 nv_parent_3 + Filter: ((d >= '08-01-2009'::date) AND (d <= '08-31-2009'::date)) + -> Seq Scan on nv_child_2009 nv_parent_4 + Filter: ((d >= '08-01-2009'::date) AND (d <= '08-31-2009'::date)) +(9 rows) + +-- after validation, the constraint should be used +alter table nv_child_2011 VALIDATE CONSTRAINT nv_child_2011_d_check; +explain (costs off) select * from nv_parent where d between '2009-08-01'::date and '2009-08-31'::date; + QUERY PLAN +--------------------------------------------------------------------------- + Append + -> Seq Scan on nv_parent nv_parent_1 + Filter: ((d >= '08-01-2009'::date) AND (d <= '08-31-2009'::date)) + -> Seq Scan on nv_child_2010 nv_parent_2 + Filter: ((d >= '08-01-2009'::date) AND (d <= '08-31-2009'::date)) + -> Seq Scan on nv_child_2009 nv_parent_3 + Filter: ((d >= '08-01-2009'::date) AND (d <= '08-31-2009'::date)) +(7 rows) + +-- add an inherited NOT VALID constraint +alter table nv_parent add check (d between '2001-01-01'::date and '2099-12-31'::date) not valid; +\d nv_child_2009 + Table "public.nv_child_2009" + Column | Type | Collation | Nullable | Default +--------+------+-----------+----------+--------- + d | date | | | +Check constraints: + "nv_child_2009_d_check" CHECK (d >= '01-01-2009'::date AND d <= 
'12-31-2009'::date) + "nv_parent_d_check" CHECK (d >= '01-01-2001'::date AND d <= '12-31-2099'::date) NOT VALID +Inherits: nv_parent + +-- we leave nv_parent and children around to help test pg_dump logic +-- Foreign key adding test with mixed types +-- Note: these tables are TEMP to avoid name conflicts when this test +-- is run in parallel with foreign_key.sql. +CREATE TEMP TABLE PKTABLE (ptest1 int PRIMARY KEY); +INSERT INTO PKTABLE VALUES(42); +CREATE TEMP TABLE FKTABLE (ftest1 inet); +-- This next should fail, because int=inet does not exist +ALTER TABLE FKTABLE ADD FOREIGN KEY(ftest1) references pktable; +ERROR: foreign key constraint "fktable_ftest1_fkey" cannot be implemented +DETAIL: Key columns "ftest1" and "ptest1" are of incompatible types: inet and integer. +-- This should also fail for the same reason, but here we +-- give the column name +ALTER TABLE FKTABLE ADD FOREIGN KEY(ftest1) references pktable(ptest1); +ERROR: foreign key constraint "fktable_ftest1_fkey" cannot be implemented +DETAIL: Key columns "ftest1" and "ptest1" are of incompatible types: inet and integer. +DROP TABLE FKTABLE; +-- This should succeed, even though they are different types, +-- because int=int8 exists and is a member of the integer opfamily +CREATE TEMP TABLE FKTABLE (ftest1 int8); +ALTER TABLE FKTABLE ADD FOREIGN KEY(ftest1) references pktable; +-- Check it actually works +INSERT INTO FKTABLE VALUES(42); -- should succeed +INSERT INTO FKTABLE VALUES(43); -- should fail +ERROR: insert or update on table "fktable" violates foreign key constraint "fktable_ftest1_fkey" +DETAIL: Key (ftest1)=(43) is not present in table "pktable". +DROP TABLE FKTABLE; +-- This should fail, because we'd have to cast numeric to int which is +-- not an implicit coercion (or use numeric=numeric, but that's not part +-- of the integer opfamily) +CREATE TEMP TABLE FKTABLE (ftest1 numeric); +ALTER TABLE FKTABLE ADD FOREIGN KEY(ftest1) references pktable; +ERROR: foreign key constraint "fktable_ftest1_fkey" cannot be implemented +DETAIL: Key columns "ftest1" and "ptest1" are of incompatible types: numeric and integer. +DROP TABLE FKTABLE; +DROP TABLE PKTABLE; +-- On the other hand, this should work because int implicitly promotes to +-- numeric, and we allow promotion on the FK side +CREATE TEMP TABLE PKTABLE (ptest1 numeric PRIMARY KEY); +INSERT INTO PKTABLE VALUES(42); +CREATE TEMP TABLE FKTABLE (ftest1 int); +ALTER TABLE FKTABLE ADD FOREIGN KEY(ftest1) references pktable; +-- Check it actually works +INSERT INTO FKTABLE VALUES(42); -- should succeed +INSERT INTO FKTABLE VALUES(43); -- should fail +ERROR: insert or update on table "fktable" violates foreign key constraint "fktable_ftest1_fkey" +DETAIL: Key (ftest1)=(43) is not present in table "pktable". +DROP TABLE FKTABLE; +DROP TABLE PKTABLE; +CREATE TEMP TABLE PKTABLE (ptest1 int, ptest2 inet, + PRIMARY KEY(ptest1, ptest2)); +-- This should fail, because we just chose really odd types +CREATE TEMP TABLE FKTABLE (ftest1 cidr, ftest2 timestamp); +ALTER TABLE FKTABLE ADD FOREIGN KEY(ftest1, ftest2) references pktable; +ERROR: foreign key constraint "fktable_ftest1_ftest2_fkey" cannot be implemented +DETAIL: Key columns "ftest1" and "ptest1" are of incompatible types: cidr and integer. +DROP TABLE FKTABLE; +-- Again, so should this... 
+CREATE TEMP TABLE FKTABLE (ftest1 cidr, ftest2 timestamp); +ALTER TABLE FKTABLE ADD FOREIGN KEY(ftest1, ftest2) + references pktable(ptest1, ptest2); +ERROR: foreign key constraint "fktable_ftest1_ftest2_fkey" cannot be implemented +DETAIL: Key columns "ftest1" and "ptest1" are of incompatible types: cidr and integer. +DROP TABLE FKTABLE; +-- This fails because we mixed up the column ordering +CREATE TEMP TABLE FKTABLE (ftest1 int, ftest2 inet); +ALTER TABLE FKTABLE ADD FOREIGN KEY(ftest1, ftest2) + references pktable(ptest2, ptest1); +ERROR: foreign key constraint "fktable_ftest1_ftest2_fkey" cannot be implemented +DETAIL: Key columns "ftest1" and "ptest2" are of incompatible types: integer and inet. +-- As does this... +ALTER TABLE FKTABLE ADD FOREIGN KEY(ftest2, ftest1) + references pktable(ptest1, ptest2); +ERROR: foreign key constraint "fktable_ftest2_ftest1_fkey" cannot be implemented +DETAIL: Key columns "ftest2" and "ptest1" are of incompatible types: inet and integer. +DROP TABLE FKTABLE; +DROP TABLE PKTABLE; +-- Test that ALTER CONSTRAINT updates trigger deferrability properly +CREATE TEMP TABLE PKTABLE (ptest1 int primary key); +CREATE TEMP TABLE FKTABLE (ftest1 int); +ALTER TABLE FKTABLE ADD CONSTRAINT fknd FOREIGN KEY(ftest1) REFERENCES pktable + ON DELETE CASCADE ON UPDATE NO ACTION NOT DEFERRABLE; +ALTER TABLE FKTABLE ADD CONSTRAINT fkdd FOREIGN KEY(ftest1) REFERENCES pktable + ON DELETE CASCADE ON UPDATE NO ACTION DEFERRABLE INITIALLY DEFERRED; +ALTER TABLE FKTABLE ADD CONSTRAINT fkdi FOREIGN KEY(ftest1) REFERENCES pktable + ON DELETE CASCADE ON UPDATE NO ACTION DEFERRABLE INITIALLY IMMEDIATE; +ALTER TABLE FKTABLE ADD CONSTRAINT fknd2 FOREIGN KEY(ftest1) REFERENCES pktable + ON DELETE CASCADE ON UPDATE NO ACTION DEFERRABLE INITIALLY DEFERRED; +ALTER TABLE FKTABLE ALTER CONSTRAINT fknd2 NOT DEFERRABLE; +ALTER TABLE FKTABLE ADD CONSTRAINT fkdd2 FOREIGN KEY(ftest1) REFERENCES pktable + ON DELETE CASCADE ON UPDATE NO ACTION NOT DEFERRABLE; +ALTER TABLE FKTABLE ALTER CONSTRAINT fkdd2 DEFERRABLE INITIALLY DEFERRED; +ALTER TABLE FKTABLE ADD CONSTRAINT fkdi2 FOREIGN KEY(ftest1) REFERENCES pktable + ON DELETE CASCADE ON UPDATE NO ACTION NOT DEFERRABLE; +ALTER TABLE FKTABLE ALTER CONSTRAINT fkdi2 DEFERRABLE INITIALLY IMMEDIATE; +SELECT conname, tgfoid::regproc, tgtype, tgdeferrable, tginitdeferred +FROM pg_trigger JOIN pg_constraint con ON con.oid = tgconstraint +WHERE tgrelid = 'pktable'::regclass +ORDER BY 1,2,3; + conname | tgfoid | tgtype | tgdeferrable | tginitdeferred +---------+------------------------+--------+--------------+---------------- + fkdd | "RI_FKey_cascade_del" | 9 | f | f + fkdd | "RI_FKey_noaction_upd" | 17 | t | t + fkdd2 | "RI_FKey_cascade_del" | 9 | f | f + fkdd2 | "RI_FKey_noaction_upd" | 17 | t | t + fkdi | "RI_FKey_cascade_del" | 9 | f | f + fkdi | "RI_FKey_noaction_upd" | 17 | t | f + fkdi2 | "RI_FKey_cascade_del" | 9 | f | f + fkdi2 | "RI_FKey_noaction_upd" | 17 | t | f + fknd | "RI_FKey_cascade_del" | 9 | f | f + fknd | "RI_FKey_noaction_upd" | 17 | f | f + fknd2 | "RI_FKey_cascade_del" | 9 | f | f + fknd2 | "RI_FKey_noaction_upd" | 17 | f | f +(12 rows) + +SELECT conname, tgfoid::regproc, tgtype, tgdeferrable, tginitdeferred +FROM pg_trigger JOIN pg_constraint con ON con.oid = tgconstraint +WHERE tgrelid = 'fktable'::regclass +ORDER BY 1,2,3; + conname | tgfoid | tgtype | tgdeferrable | tginitdeferred +---------+---------------------+--------+--------------+---------------- + fkdd | "RI_FKey_check_ins" | 5 | t | t + fkdd | "RI_FKey_check_upd" | 17 | t 
| t
+ fkdd2 | "RI_FKey_check_ins" | 5 | t | t
+ fkdd2 | "RI_FKey_check_upd" | 17 | t | t
+ fkdi | "RI_FKey_check_ins" | 5 | t | f
+ fkdi | "RI_FKey_check_upd" | 17 | t | f
+ fkdi2 | "RI_FKey_check_ins" | 5 | t | f
+ fkdi2 | "RI_FKey_check_upd" | 17 | t | f
+ fknd | "RI_FKey_check_ins" | 5 | f | f
+ fknd | "RI_FKey_check_upd" | 17 | f | f
+ fknd2 | "RI_FKey_check_ins" | 5 | f | f
+ fknd2 | "RI_FKey_check_upd" | 17 | f | f
+(12 rows)
+
+-- temp tables should go away by themselves, need not drop them.
+-- test check constraint adding
+create table atacc1 ( test int );
+-- add a check constraint
+alter table atacc1 add constraint atacc_test1 check (test>3);
+-- should fail
+insert into atacc1 (test) values (2);
+ERROR: new row for relation "atacc1" violates check constraint "atacc_test1"
+DETAIL: Failing row contains (2).
+-- should succeed
+insert into atacc1 (test) values (4);
+drop table atacc1;
+-- let's do one where the check fails when added
+create table atacc1 ( test int );
+-- insert a soon to be failing row
+insert into atacc1 (test) values (2);
+-- add a check constraint (fails)
+alter table atacc1 add constraint atacc_test1 check (test>3);
+ERROR: check constraint "atacc_test1" of relation "atacc1" is violated by some row
+insert into atacc1 (test) values (4);
+drop table atacc1;
+-- let's do one where the check fails because the column doesn't exist
+create table atacc1 ( test int );
+-- add a check constraint (fails)
+alter table atacc1 add constraint atacc_test1 check (test1>3);
+ERROR: column "test1" does not exist
+HINT: Perhaps you meant to reference the column "atacc1.test".
+drop table atacc1;
+-- something a little more complicated
+create table atacc1 ( test int, test2 int, test3 int);
+-- add a check constraint (fails)
+alter table atacc1 add constraint atacc_test1 check (test+test2<test3*4);
+-- should fail
+insert into atacc1 (test,test2,test3) values (4,4,2);
+ERROR: new row for relation "atacc1" violates check constraint "atacc_test1"
+DETAIL: Failing row contains (4, 4, 2).
+-- should succeed
+insert into atacc1 (test,test2,test3) values (4,4,5);
+drop table atacc1;
+-- lets do some naming tests
+create table atacc1 (test int check (test>3), test2 int);
+alter table atacc1 add check (test2>test);
+-- should fail for $2
+insert into atacc1 (test2, test) values (3, 4);
+ERROR: new row for relation "atacc1" violates check constraint "atacc1_check"
+DETAIL: Failing row contains (4, 3).
+drop table atacc1;
+-- inheritance related tests
+create table atacc1 (test int);
+create table atacc2 (test2 int);
+create table atacc3 (test3 int) inherits (atacc1, atacc2);
+alter table atacc2 add constraint foo check (test2>0);
+-- fail and then succeed on atacc2
+insert into atacc2 (test2) values (-3);
+ERROR: new row for relation "atacc2" violates check constraint "foo"
+DETAIL: Failing row contains (-3).
+insert into atacc2 (test2) values (3);
+-- fail and then succeed on atacc3
+insert into atacc3 (test2) values (-3);
+ERROR: new row for relation "atacc3" violates check constraint "foo"
+DETAIL: Failing row contains (null, -3, null).
+insert into atacc3 (test2) values (3); +drop table atacc3; +drop table atacc2; +drop table atacc1; +-- same things with one created with INHERIT +create table atacc1 (test int); +create table atacc2 (test2 int); +create table atacc3 (test3 int) inherits (atacc1, atacc2); +alter table atacc3 no inherit atacc2; +-- fail +alter table atacc3 no inherit atacc2; +ERROR: relation "atacc2" is not a parent of relation "atacc3" +-- make sure it really isn't a child +insert into atacc3 (test2) values (3); +select test2 from atacc2; + test2 +------- +(0 rows) + +-- fail due to missing constraint +alter table atacc2 add constraint foo check (test2>0); +alter table atacc3 inherit atacc2; +ERROR: child table is missing constraint "foo" +-- fail due to missing column +alter table atacc3 rename test2 to testx; +alter table atacc3 inherit atacc2; +ERROR: child table is missing column "test2" +-- fail due to mismatched data type +alter table atacc3 add test2 bool; +alter table atacc3 inherit atacc2; +ERROR: child table "atacc3" has different type for column "test2" +alter table atacc3 drop test2; +-- succeed +alter table atacc3 add test2 int; +update atacc3 set test2 = 4 where test2 is null; +alter table atacc3 add constraint foo check (test2>0); +alter table atacc3 inherit atacc2; +-- fail due to duplicates and circular inheritance +alter table atacc3 inherit atacc2; +ERROR: relation "atacc2" would be inherited from more than once +alter table atacc2 inherit atacc3; +ERROR: circular inheritance not allowed +DETAIL: "atacc3" is already a child of "atacc2". +alter table atacc2 inherit atacc2; +ERROR: circular inheritance not allowed +DETAIL: "atacc2" is already a child of "atacc2". +-- test that we really are a child now (should see 4 not 3 and cascade should go through) +select test2 from atacc2; + test2 +------- + 4 +(1 row) + +drop table atacc2 cascade; +NOTICE: drop cascades to table atacc3 +drop table atacc1; +-- adding only to a parent is allowed as of 9.2 +create table atacc1 (test int); +create table atacc2 (test2 int) inherits (atacc1); +-- ok: +alter table atacc1 add constraint foo check (test>0) no inherit; +-- check constraint is not there on child +insert into atacc2 (test) values (-3); +-- check constraint is there on parent +insert into atacc1 (test) values (-3); +ERROR: new row for relation "atacc1" violates check constraint "foo" +DETAIL: Failing row contains (-3). +insert into atacc1 (test) values (3); +-- fail, violating row: +alter table atacc2 add constraint foo check (test>0) no inherit; +ERROR: check constraint "foo" of relation "atacc2" is violated by some row +drop table atacc2; +drop table atacc1; +-- test unique constraint adding +create table atacc1 ( test int ) ; +-- add a unique constraint +alter table atacc1 add constraint atacc_test1 unique (test); +-- insert first value +insert into atacc1 (test) values (2); +-- should fail +insert into atacc1 (test) values (2); +ERROR: duplicate key value violates unique constraint "atacc_test1" +DETAIL: Key (test)=(2) already exists. +-- should succeed +insert into atacc1 (test) values (4); +-- try to create duplicates via alter table using - should fail +alter table atacc1 alter column test type integer using 0; +ERROR: could not create unique index "atacc_test1" +DETAIL: Key (test)=(0) is duplicated. 
+drop table atacc1; +-- let's do one where the unique constraint fails when added +create table atacc1 ( test int ); +-- insert soon to be failing rows +insert into atacc1 (test) values (2); +insert into atacc1 (test) values (2); +-- add a unique constraint (fails) +alter table atacc1 add constraint atacc_test1 unique (test); +ERROR: could not create unique index "atacc_test1" +DETAIL: Key (test)=(2) is duplicated. +insert into atacc1 (test) values (3); +drop table atacc1; +-- let's do one where the unique constraint fails +-- because the column doesn't exist +create table atacc1 ( test int ); +-- add a unique constraint (fails) +alter table atacc1 add constraint atacc_test1 unique (test1); +ERROR: column "test1" named in key does not exist +drop table atacc1; +-- something a little more complicated +create table atacc1 ( test int, test2 int); +-- add a unique constraint +alter table atacc1 add constraint atacc_test1 unique (test, test2); +-- insert initial value +insert into atacc1 (test,test2) values (4,4); +-- should fail +insert into atacc1 (test,test2) values (4,4); +ERROR: duplicate key value violates unique constraint "atacc_test1" +DETAIL: Key (test, test2)=(4, 4) already exists. +-- should all succeed +insert into atacc1 (test,test2) values (4,5); +insert into atacc1 (test,test2) values (5,4); +insert into atacc1 (test,test2) values (5,5); +drop table atacc1; +-- lets do some naming tests +create table atacc1 (test int, test2 int, unique(test)); +alter table atacc1 add unique (test2); +-- should fail for @@ second one @@ +insert into atacc1 (test2, test) values (3, 3); +insert into atacc1 (test2, test) values (2, 3); +ERROR: duplicate key value violates unique constraint "atacc1_test_key" +DETAIL: Key (test)=(3) already exists. +drop table atacc1; +-- test primary key constraint adding +create table atacc1 ( id serial, test int) ; +-- add a primary key constraint +alter table atacc1 add constraint atacc_test1 primary key (test); +-- insert first value +insert into atacc1 (test) values (2); +-- should fail +insert into atacc1 (test) values (2); +ERROR: duplicate key value violates unique constraint "atacc_test1" +DETAIL: Key (test)=(2) already exists. +-- should succeed +insert into atacc1 (test) values (4); +-- inserting NULL should fail +insert into atacc1 (test) values(NULL); +ERROR: null value in column "test" of relation "atacc1" violates not-null constraint +DETAIL: Failing row contains (4, null). +-- try adding a second primary key (should fail) +alter table atacc1 add constraint atacc_oid1 primary key(id); +ERROR: multiple primary keys for table "atacc1" are not allowed +-- drop first primary key constraint +alter table atacc1 drop constraint atacc_test1 restrict; +-- try adding a primary key on oid (should succeed) +alter table atacc1 add constraint atacc_oid1 primary key(id); +drop table atacc1; +-- let's do one where the primary key constraint fails when added +create table atacc1 ( test int ); +-- insert soon to be failing rows +insert into atacc1 (test) values (2); +insert into atacc1 (test) values (2); +-- add a primary key (fails) +alter table atacc1 add constraint atacc_test1 primary key (test); +ERROR: could not create unique index "atacc_test1" +DETAIL: Key (test)=(2) is duplicated. 
+insert into atacc1 (test) values (3); +drop table atacc1; +-- let's do another one where the primary key constraint fails when added +create table atacc1 ( test int ); +-- insert soon to be failing row +insert into atacc1 (test) values (NULL); +-- add a primary key (fails) +alter table atacc1 add constraint atacc_test1 primary key (test); +ERROR: column "test" of relation "atacc1" contains null values +insert into atacc1 (test) values (3); +drop table atacc1; +-- let's do one where the primary key constraint fails +-- because the column doesn't exist +create table atacc1 ( test int ); +-- add a primary key constraint (fails) +alter table atacc1 add constraint atacc_test1 primary key (test1); +ERROR: column "test1" of relation "atacc1" does not exist +drop table atacc1; +-- adding a new column as primary key to a non-empty table. +-- should fail unless the column has a non-null default value. +create table atacc1 ( test int ); +insert into atacc1 (test) values (0); +-- add a primary key column without a default (fails). +alter table atacc1 add column test2 int primary key; +ERROR: column "test2" of relation "atacc1" contains null values +-- now add a primary key column with a default (succeeds). +alter table atacc1 add column test2 int default 0 primary key; +drop table atacc1; +-- this combination used to have order-of-execution problems (bug #15580) +create table atacc1 (a int); +insert into atacc1 values(1); +alter table atacc1 + add column b float8 not null default random(), + add primary key(a); +drop table atacc1; +-- additionally, we've seen issues with foreign key validation not being +-- properly delayed until after a table rewrite. Check that works ok. +create table atacc1 (a int primary key); +alter table atacc1 add constraint atacc1_fkey foreign key (a) references atacc1 (a) not valid; +alter table atacc1 validate constraint atacc1_fkey, alter a type bigint; +drop table atacc1; +-- we've also seen issues with check constraints being validated at the wrong +-- time when there's a pending table rewrite. +create table atacc1 (a bigint, b int); +insert into atacc1 values(1,1); +alter table atacc1 add constraint atacc1_chk check(b = 1) not valid; +alter table atacc1 validate constraint atacc1_chk, alter a type int; +drop table atacc1; +-- same as above, but ensure the constraint violation is detected +create table atacc1 (a bigint, b int); +insert into atacc1 values(1,2); +alter table atacc1 add constraint atacc1_chk check(b = 1) not valid; +alter table atacc1 validate constraint atacc1_chk, alter a type int; +ERROR: check constraint "atacc1_chk" of relation "atacc1" is violated by some row +drop table atacc1; +-- something a little more complicated +create table atacc1 ( test int, test2 int); +-- add a primary key constraint +alter table atacc1 add constraint atacc_test1 primary key (test, test2); +-- try adding a second primary key - should fail +alter table atacc1 add constraint atacc_test2 primary key (test); +ERROR: multiple primary keys for table "atacc1" are not allowed +-- insert initial value +insert into atacc1 (test,test2) values (4,4); +-- should fail +insert into atacc1 (test,test2) values (4,4); +ERROR: duplicate key value violates unique constraint "atacc_test1" +DETAIL: Key (test, test2)=(4, 4) already exists. +insert into atacc1 (test,test2) values (NULL,3); +ERROR: null value in column "test" of relation "atacc1" violates not-null constraint +DETAIL: Failing row contains (null, 3). 
+insert into atacc1 (test,test2) values (3, NULL); +ERROR: null value in column "test2" of relation "atacc1" violates not-null constraint +DETAIL: Failing row contains (3, null). +insert into atacc1 (test,test2) values (NULL,NULL); +ERROR: null value in column "test" of relation "atacc1" violates not-null constraint +DETAIL: Failing row contains (null, null). +-- should all succeed +insert into atacc1 (test,test2) values (4,5); +insert into atacc1 (test,test2) values (5,4); +insert into atacc1 (test,test2) values (5,5); +drop table atacc1; +-- lets do some naming tests +create table atacc1 (test int, test2 int, primary key(test)); +-- only first should succeed +insert into atacc1 (test2, test) values (3, 3); +insert into atacc1 (test2, test) values (2, 3); +ERROR: duplicate key value violates unique constraint "atacc1_pkey" +DETAIL: Key (test)=(3) already exists. +insert into atacc1 (test2, test) values (1, NULL); +ERROR: null value in column "test" of relation "atacc1" violates not-null constraint +DETAIL: Failing row contains (null, 1). +drop table atacc1; +-- alter table / alter column [set/drop] not null tests +-- try altering system catalogs, should fail +alter table pg_class alter column relname drop not null; +ERROR: permission denied: "pg_class" is a system catalog +alter table pg_class alter relname set not null; +ERROR: permission denied: "pg_class" is a system catalog +-- try altering non-existent table, should fail +alter table non_existent alter column bar set not null; +ERROR: relation "non_existent" does not exist +alter table non_existent alter column bar drop not null; +ERROR: relation "non_existent" does not exist +-- test setting columns to null and not null and vice versa +-- test checking for null values and primary key +create table atacc1 (test int not null); +alter table atacc1 add constraint "atacc1_pkey" primary key (test); +alter table atacc1 alter column test drop not null; +ERROR: column "test" is in a primary key +alter table atacc1 drop constraint "atacc1_pkey"; +alter table atacc1 alter column test drop not null; +insert into atacc1 values (null); +alter table atacc1 alter test set not null; +ERROR: column "test" of relation "atacc1" contains null values +delete from atacc1; +alter table atacc1 alter test set not null; +-- try altering a non-existent column, should fail +alter table atacc1 alter bar set not null; +ERROR: column "bar" of relation "atacc1" does not exist +alter table atacc1 alter bar drop not null; +ERROR: column "bar" of relation "atacc1" does not exist +-- try creating a view and altering that, should fail +create view myview as select * from atacc1; +alter table myview alter column test drop not null; +ERROR: "myview" is not a table or foreign table +alter table myview alter column test set not null; +ERROR: "myview" is not a table or foreign table +drop view myview; +drop table atacc1; +-- set not null verified by constraints +create table atacc1 (test_a int, test_b int); +insert into atacc1 values (null, 1); +-- constraint not cover all values, should fail +alter table atacc1 add constraint atacc1_constr_or check(test_a is not null or test_b < 10); +alter table atacc1 alter test_a set not null; +ERROR: column "test_a" of relation "atacc1" contains null values +alter table atacc1 drop constraint atacc1_constr_or; +-- not valid constraint, should fail +alter table atacc1 add constraint atacc1_constr_invalid check(test_a is not null) not valid; +alter table atacc1 alter test_a set not null; +ERROR: column "test_a" of relation "atacc1" 
contains null values +alter table atacc1 drop constraint atacc1_constr_invalid; +-- with valid constraint +update atacc1 set test_a = 1; +alter table atacc1 add constraint atacc1_constr_a_valid check(test_a is not null); +alter table atacc1 alter test_a set not null; +delete from atacc1; +insert into atacc1 values (2, null); +alter table atacc1 alter test_a drop not null; +-- test multiple set not null at same time +-- test_a checked by atacc1_constr_a_valid, test_b should fail by table scan +alter table atacc1 alter test_a set not null, alter test_b set not null; +ERROR: column "test_b" of relation "atacc1" contains null values +-- commands order has no importance +alter table atacc1 alter test_b set not null, alter test_a set not null; +ERROR: column "test_b" of relation "atacc1" contains null values +-- valid one by table scan, one by check constraints +update atacc1 set test_b = 1; +alter table atacc1 alter test_b set not null, alter test_a set not null; +alter table atacc1 alter test_a drop not null, alter test_b drop not null; +-- both column has check constraints +alter table atacc1 add constraint atacc1_constr_b_valid check(test_b is not null); +alter table atacc1 alter test_b set not null, alter test_a set not null; +drop table atacc1; +-- test inheritance +create table parent (a int); +create table child (b varchar(255)) inherits (parent); +alter table parent alter a set not null; +insert into parent values (NULL); +ERROR: null value in column "a" of relation "parent" violates not-null constraint +DETAIL: Failing row contains (null). +insert into child (a, b) values (NULL, 'foo'); +ERROR: null value in column "a" of relation "child" violates not-null constraint +DETAIL: Failing row contains (null, foo). +alter table parent alter a drop not null; +insert into parent values (NULL); +insert into child (a, b) values (NULL, 'foo'); +alter table only parent alter a set not null; +ERROR: column "a" of relation "parent" contains null values +alter table child alter a set not null; +ERROR: column "a" of relation "child" contains null values +delete from parent; +alter table only parent alter a set not null; +insert into parent values (NULL); +ERROR: null value in column "a" of relation "parent" violates not-null constraint +DETAIL: Failing row contains (null). +alter table child alter a set not null; +insert into child (a, b) values (NULL, 'foo'); +ERROR: null value in column "a" of relation "child" violates not-null constraint +DETAIL: Failing row contains (null, foo). +delete from child; +alter table child alter a set not null; +insert into child (a, b) values (NULL, 'foo'); +ERROR: null value in column "a" of relation "child" violates not-null constraint +DETAIL: Failing row contains (null, foo). 
+drop table child; +drop table parent; +-- test setting and removing default values +create table def_test ( + c1 int4 default 5, + c2 text default 'initial_default' +); +insert into def_test default values; +alter table def_test alter column c1 drop default; +insert into def_test default values; +alter table def_test alter column c2 drop default; +insert into def_test default values; +alter table def_test alter column c1 set default 10; +alter table def_test alter column c2 set default 'new_default'; +insert into def_test default values; +select * from def_test; + c1 | c2 +----+----------------- + 5 | initial_default + | initial_default + | + 10 | new_default +(4 rows) + +-- set defaults to an incorrect type: this should fail +alter table def_test alter column c1 set default 'wrong_datatype'; +ERROR: invalid input syntax for type integer: "wrong_datatype" +alter table def_test alter column c2 set default 20; +-- set defaults on a non-existent column: this should fail +alter table def_test alter column c3 set default 30; +ERROR: column "c3" of relation "def_test" does not exist +-- set defaults on views: we need to create a view, add a rule +-- to allow insertions into it, and then alter the view to add +-- a default +create view def_view_test as select * from def_test; +create rule def_view_test_ins as + on insert to def_view_test + do instead insert into def_test select new.*; +insert into def_view_test default values; +alter table def_view_test alter column c1 set default 45; +insert into def_view_test default values; +alter table def_view_test alter column c2 set default 'view_default'; +insert into def_view_test default values; +select * from def_view_test; + c1 | c2 +----+----------------- + 5 | initial_default + | initial_default + | + 10 | new_default + | + 45 | + 45 | view_default +(7 rows) + +drop rule def_view_test_ins on def_view_test; +drop view def_view_test; +drop table def_test; +-- alter table / drop column tests +-- try altering system catalogs, should fail +alter table pg_class drop column relname; +ERROR: permission denied: "pg_class" is a system catalog +-- try altering non-existent table, should fail +alter table nosuchtable drop column bar; +ERROR: relation "nosuchtable" does not exist +-- test dropping columns +create table atacc1 (a int4 not null, b int4, c int4 not null, d int4); +insert into atacc1 values (1, 2, 3, 4); +alter table atacc1 drop a; +alter table atacc1 drop a; +ERROR: column "a" of relation "atacc1" does not exist +-- SELECTs +select * from atacc1; + b | c | d +---+---+--- + 2 | 3 | 4 +(1 row) + +select * from atacc1 order by a; +ERROR: column "a" does not exist +LINE 1: select * from atacc1 order by a; + ^ +select * from atacc1 order by "........pg.dropped.1........"; +ERROR: column "........pg.dropped.1........" does not exist +LINE 1: select * from atacc1 order by "........pg.dropped.1........"... + ^ +select * from atacc1 group by a; +ERROR: column "a" does not exist +LINE 1: select * from atacc1 group by a; + ^ +select * from atacc1 group by "........pg.dropped.1........"; +ERROR: column "........pg.dropped.1........" does not exist +LINE 1: select * from atacc1 group by "........pg.dropped.1........"... 
+ ^ +select atacc1.* from atacc1; + b | c | d +---+---+--- + 2 | 3 | 4 +(1 row) + +select a from atacc1; +ERROR: column "a" does not exist +LINE 1: select a from atacc1; + ^ +select atacc1.a from atacc1; +ERROR: column atacc1.a does not exist +LINE 1: select atacc1.a from atacc1; + ^ +select b,c,d from atacc1; + b | c | d +---+---+--- + 2 | 3 | 4 +(1 row) + +select a,b,c,d from atacc1; +ERROR: column "a" does not exist +LINE 1: select a,b,c,d from atacc1; + ^ +select * from atacc1 where a = 1; +ERROR: column "a" does not exist +LINE 1: select * from atacc1 where a = 1; + ^ +select "........pg.dropped.1........" from atacc1; +ERROR: column "........pg.dropped.1........" does not exist +LINE 1: select "........pg.dropped.1........" from atacc1; + ^ +select atacc1."........pg.dropped.1........" from atacc1; +ERROR: column atacc1.........pg.dropped.1........ does not exist +LINE 1: select atacc1."........pg.dropped.1........" from atacc1; + ^ +select "........pg.dropped.1........",b,c,d from atacc1; +ERROR: column "........pg.dropped.1........" does not exist +LINE 1: select "........pg.dropped.1........",b,c,d from atacc1; + ^ +select * from atacc1 where "........pg.dropped.1........" = 1; +ERROR: column "........pg.dropped.1........" does not exist +LINE 1: select * from atacc1 where "........pg.dropped.1........" = ... + ^ +-- UPDATEs +update atacc1 set a = 3; +ERROR: column "a" of relation "atacc1" does not exist +LINE 1: update atacc1 set a = 3; + ^ +update atacc1 set b = 2 where a = 3; +ERROR: column "a" does not exist +LINE 1: update atacc1 set b = 2 where a = 3; + ^ +update atacc1 set "........pg.dropped.1........" = 3; +ERROR: column "........pg.dropped.1........" of relation "atacc1" does not exist +LINE 1: update atacc1 set "........pg.dropped.1........" = 3; + ^ +update atacc1 set b = 2 where "........pg.dropped.1........" = 3; +ERROR: column "........pg.dropped.1........" does not exist +LINE 1: update atacc1 set b = 2 where "........pg.dropped.1........"... + ^ +-- INSERTs +insert into atacc1 values (10, 11, 12, 13); +ERROR: INSERT has more expressions than target columns +LINE 1: insert into atacc1 values (10, 11, 12, 13); + ^ +insert into atacc1 values (default, 11, 12, 13); +ERROR: INSERT has more expressions than target columns +LINE 1: insert into atacc1 values (default, 11, 12, 13); + ^ +insert into atacc1 values (11, 12, 13); +insert into atacc1 (a) values (10); +ERROR: column "a" of relation "atacc1" does not exist +LINE 1: insert into atacc1 (a) values (10); + ^ +insert into atacc1 (a) values (default); +ERROR: column "a" of relation "atacc1" does not exist +LINE 1: insert into atacc1 (a) values (default); + ^ +insert into atacc1 (a,b,c,d) values (10,11,12,13); +ERROR: column "a" of relation "atacc1" does not exist +LINE 1: insert into atacc1 (a,b,c,d) values (10,11,12,13); + ^ +insert into atacc1 (a,b,c,d) values (default,11,12,13); +ERROR: column "a" of relation "atacc1" does not exist +LINE 1: insert into atacc1 (a,b,c,d) values (default,11,12,13); + ^ +insert into atacc1 (b,c,d) values (11,12,13); +insert into atacc1 ("........pg.dropped.1........") values (10); +ERROR: column "........pg.dropped.1........" of relation "atacc1" does not exist +LINE 1: insert into atacc1 ("........pg.dropped.1........") values (... + ^ +insert into atacc1 ("........pg.dropped.1........") values (default); +ERROR: column "........pg.dropped.1........" of relation "atacc1" does not exist +LINE 1: insert into atacc1 ("........pg.dropped.1........") values (... 
+ ^ +insert into atacc1 ("........pg.dropped.1........",b,c,d) values (10,11,12,13); +ERROR: column "........pg.dropped.1........" of relation "atacc1" does not exist +LINE 1: insert into atacc1 ("........pg.dropped.1........",b,c,d) va... + ^ +insert into atacc1 ("........pg.dropped.1........",b,c,d) values (default,11,12,13); +ERROR: column "........pg.dropped.1........" of relation "atacc1" does not exist +LINE 1: insert into atacc1 ("........pg.dropped.1........",b,c,d) va... + ^ +-- DELETEs +delete from atacc1 where a = 3; +ERROR: column "a" does not exist +LINE 1: delete from atacc1 where a = 3; + ^ +delete from atacc1 where "........pg.dropped.1........" = 3; +ERROR: column "........pg.dropped.1........" does not exist +LINE 1: delete from atacc1 where "........pg.dropped.1........" = 3; + ^ +delete from atacc1; +-- try dropping a non-existent column, should fail +alter table atacc1 drop bar; +ERROR: column "bar" of relation "atacc1" does not exist +-- try removing an oid column, should succeed (as it's nonexistent) +alter table atacc1 SET WITHOUT OIDS; +-- try adding an oid column, should fail (not supported) +alter table atacc1 SET WITH OIDS; +ERROR: syntax error at or near "WITH" +LINE 1: alter table atacc1 SET WITH OIDS; + ^ +-- try dropping the xmin column, should fail +alter table atacc1 drop xmin; +ERROR: cannot drop system column "xmin" +-- try creating a view and altering that, should fail +create view myview as select * from atacc1; +select * from myview; + b | c | d +---+---+--- +(0 rows) + +alter table myview drop d; +ERROR: "myview" is not a table, composite type, or foreign table +drop view myview; +-- test some commands to make sure they fail on the dropped column +analyze atacc1(a); +ERROR: column "a" of relation "atacc1" does not exist +analyze atacc1("........pg.dropped.1........"); +ERROR: column "........pg.dropped.1........" of relation "atacc1" does not exist +vacuum analyze atacc1(a); +ERROR: column "a" of relation "atacc1" does not exist +vacuum analyze atacc1("........pg.dropped.1........"); +ERROR: column "........pg.dropped.1........" of relation "atacc1" does not exist +comment on column atacc1.a is 'testing'; +ERROR: column "a" of relation "atacc1" does not exist +comment on column atacc1."........pg.dropped.1........" is 'testing'; +ERROR: column "........pg.dropped.1........" of relation "atacc1" does not exist +alter table atacc1 alter a set storage plain; +ERROR: column "a" of relation "atacc1" does not exist +alter table atacc1 alter "........pg.dropped.1........" set storage plain; +ERROR: column "........pg.dropped.1........" of relation "atacc1" does not exist +alter table atacc1 alter a set statistics 0; +ERROR: column "a" of relation "atacc1" does not exist +alter table atacc1 alter "........pg.dropped.1........" set statistics 0; +ERROR: column "........pg.dropped.1........" of relation "atacc1" does not exist +alter table atacc1 alter a set default 3; +ERROR: column "a" of relation "atacc1" does not exist +alter table atacc1 alter "........pg.dropped.1........" set default 3; +ERROR: column "........pg.dropped.1........" of relation "atacc1" does not exist +alter table atacc1 alter a drop default; +ERROR: column "a" of relation "atacc1" does not exist +alter table atacc1 alter "........pg.dropped.1........" drop default; +ERROR: column "........pg.dropped.1........" 
of relation "atacc1" does not exist +alter table atacc1 alter a set not null; +ERROR: column "a" of relation "atacc1" does not exist +alter table atacc1 alter "........pg.dropped.1........" set not null; +ERROR: column "........pg.dropped.1........" of relation "atacc1" does not exist +alter table atacc1 alter a drop not null; +ERROR: column "a" of relation "atacc1" does not exist +alter table atacc1 alter "........pg.dropped.1........" drop not null; +ERROR: column "........pg.dropped.1........" of relation "atacc1" does not exist +alter table atacc1 rename a to x; +ERROR: column "a" does not exist +alter table atacc1 rename "........pg.dropped.1........" to x; +ERROR: column "........pg.dropped.1........" does not exist +alter table atacc1 add primary key(a); +ERROR: column "a" of relation "atacc1" does not exist +alter table atacc1 add primary key("........pg.dropped.1........"); +ERROR: column "........pg.dropped.1........" of relation "atacc1" does not exist +alter table atacc1 add unique(a); +ERROR: column "a" named in key does not exist +alter table atacc1 add unique("........pg.dropped.1........"); +ERROR: column "........pg.dropped.1........" named in key does not exist +alter table atacc1 add check (a > 3); +ERROR: column "a" does not exist +alter table atacc1 add check ("........pg.dropped.1........" > 3); +ERROR: column "........pg.dropped.1........" does not exist +create table atacc2 (id int4 unique); +alter table atacc1 add foreign key (a) references atacc2(id); +ERROR: column "a" referenced in foreign key constraint does not exist +alter table atacc1 add foreign key ("........pg.dropped.1........") references atacc2(id); +ERROR: column "........pg.dropped.1........" referenced in foreign key constraint does not exist +alter table atacc2 add foreign key (id) references atacc1(a); +ERROR: column "a" referenced in foreign key constraint does not exist +alter table atacc2 add foreign key (id) references atacc1("........pg.dropped.1........"); +ERROR: column "........pg.dropped.1........" referenced in foreign key constraint does not exist +drop table atacc2; +create index "testing_idx" on atacc1(a); +ERROR: column "a" does not exist +create index "testing_idx" on atacc1("........pg.dropped.1........"); +ERROR: column "........pg.dropped.1........" does not exist +-- test create as and select into +insert into atacc1 values (21, 22, 23); +create table attest1 as select * from atacc1; +select * from attest1; + b | c | d +----+----+---- + 21 | 22 | 23 +(1 row) + +drop table attest1; +select * into attest2 from atacc1; +select * from attest2; + b | c | d +----+----+---- + 21 | 22 | 23 +(1 row) + +drop table attest2; +-- try dropping all columns +alter table atacc1 drop c; +alter table atacc1 drop d; +alter table atacc1 drop b; +select * from atacc1; +-- +(1 row) + +drop table atacc1; +-- test constraint error reporting in presence of dropped columns +create table atacc1 (id serial primary key, value int check (value < 10)); +insert into atacc1(value) values (100); +ERROR: new row for relation "atacc1" violates check constraint "atacc1_value_check" +DETAIL: Failing row contains (1, 100). +alter table atacc1 drop column value; +alter table atacc1 add column value int check (value < 10); +insert into atacc1(value) values (100); +ERROR: new row for relation "atacc1" violates check constraint "atacc1_value_check" +DETAIL: Failing row contains (2, 100). 
+insert into atacc1(id, value) values (null, 0); +ERROR: null value in column "id" of relation "atacc1" violates not-null constraint +DETAIL: Failing row contains (null, 0). +drop table atacc1; +-- test inheritance +create table parent (a int, b int, c int); +insert into parent values (1, 2, 3); +alter table parent drop a; +create table child (d varchar(255)) inherits (parent); +insert into child values (12, 13, 'testing'); +select * from parent; + b | c +----+---- + 2 | 3 + 12 | 13 +(2 rows) + +select * from child; + b | c | d +----+----+--------- + 12 | 13 | testing +(1 row) + +alter table parent drop c; +select * from parent; + b +---- + 2 + 12 +(2 rows) + +select * from child; + b | d +----+--------- + 12 | testing +(1 row) + +drop table child; +drop table parent; +-- check error cases for inheritance column merging +create table parent (a float8, b numeric(10,4), c text collate "C"); +create table child (a float4) inherits (parent); -- fail +NOTICE: merging column "a" with inherited definition +ERROR: column "a" has a type conflict +DETAIL: double precision versus real +create table child (b decimal(10,7)) inherits (parent); -- fail +NOTICE: moving and merging column "b" with inherited definition +DETAIL: User-specified column moved to the position of the inherited column. +ERROR: column "b" has a type conflict +DETAIL: numeric(10,4) versus numeric(10,7) +create table child (c text collate "POSIX") inherits (parent); -- fail +NOTICE: moving and merging column "c" with inherited definition +DETAIL: User-specified column moved to the position of the inherited column. +ERROR: column "c" has a collation conflict +DETAIL: "C" versus "POSIX" +create table child (a double precision, b decimal(10,4)) inherits (parent); +NOTICE: merging column "a" with inherited definition +NOTICE: merging column "b" with inherited definition +drop table child; +drop table parent; +-- test copy in/out +create table attest (a int4, b int4, c int4); +insert into attest values (1,2,3); +alter table attest drop a; +copy attest to stdout; +2 3 +copy attest(a) to stdout; +ERROR: column "a" of relation "attest" does not exist +copy attest("........pg.dropped.1........") to stdout; +ERROR: column "........pg.dropped.1........" of relation "attest" does not exist +copy attest from stdin; +ERROR: extra data after last expected column +CONTEXT: COPY attest, line 1: "10 11 12" +select * from attest; + b | c +---+--- + 2 | 3 +(1 row) + +copy attest from stdin; +select * from attest; + b | c +----+---- + 2 | 3 + 21 | 22 +(2 rows) + +copy attest(a) from stdin; +ERROR: column "a" of relation "attest" does not exist +copy attest("........pg.dropped.1........") from stdin; +ERROR: column "........pg.dropped.1........" 
of relation "attest" does not exist +copy attest(b,c) from stdin; +select * from attest; + b | c +----+---- + 2 | 3 + 21 | 22 + 31 | 32 +(3 rows) + +drop table attest; +-- test inheritance +create table dropColumn (a int, b int, e int); +create table dropColumnChild (c int) inherits (dropColumn); +create table dropColumnAnother (d int) inherits (dropColumnChild); +-- these two should fail +alter table dropColumnchild drop column a; +ERROR: cannot drop inherited column "a" +alter table only dropColumnChild drop column b; +ERROR: cannot drop inherited column "b" +-- these three should work +alter table only dropColumn drop column e; +alter table dropColumnChild drop column c; +alter table dropColumn drop column a; +create table renameColumn (a int); +create table renameColumnChild (b int) inherits (renameColumn); +create table renameColumnAnother (c int) inherits (renameColumnChild); +-- these three should fail +alter table renameColumnChild rename column a to d; +ERROR: cannot rename inherited column "a" +alter table only renameColumnChild rename column a to d; +ERROR: inherited column "a" must be renamed in child tables too +alter table only renameColumn rename column a to d; +ERROR: inherited column "a" must be renamed in child tables too +-- these should work +alter table renameColumn rename column a to d; +alter table renameColumnChild rename column b to a; +-- these should work +alter table if exists doesnt_exist_tab rename column a to d; +NOTICE: relation "doesnt_exist_tab" does not exist, skipping +alter table if exists doesnt_exist_tab rename column b to a; +NOTICE: relation "doesnt_exist_tab" does not exist, skipping +-- this should work +alter table renameColumn add column w int; +-- this should fail +alter table only renameColumn add column x int; +ERROR: column must be added to child tables too +-- Test corner cases in dropping of inherited columns +create table p1 (f1 int, f2 int); +create table c1 (f1 int not null) inherits(p1); +NOTICE: merging column "f1" with inherited definition +-- should be rejected since c1.f1 is inherited +alter table c1 drop column f1; +ERROR: cannot drop inherited column "f1" +-- should work +alter table p1 drop column f1; +-- c1.f1 is still there, but no longer inherited +select f1 from c1; + f1 +---- +(0 rows) + +alter table c1 drop column f1; +select f1 from c1; +ERROR: column "f1" does not exist +LINE 1: select f1 from c1; + ^ +HINT: Perhaps you meant to reference the column "c1.f2". +drop table p1 cascade; +NOTICE: drop cascades to table c1 +create table p1 (f1 int, f2 int); +create table c1 () inherits(p1); +-- should be rejected since c1.f1 is inherited +alter table c1 drop column f1; +ERROR: cannot drop inherited column "f1" +alter table p1 drop column f1; +-- c1.f1 is dropped now, since there is no local definition for it +select f1 from c1; +ERROR: column "f1" does not exist +LINE 1: select f1 from c1; + ^ +HINT: Perhaps you meant to reference the column "c1.f2". 
+drop table p1 cascade; +NOTICE: drop cascades to table c1 +create table p1 (f1 int, f2 int); +create table c1 () inherits(p1); +-- should be rejected since c1.f1 is inherited +alter table c1 drop column f1; +ERROR: cannot drop inherited column "f1" +alter table only p1 drop column f1; +-- c1.f1 is NOT dropped, but must now be considered non-inherited +alter table c1 drop column f1; +drop table p1 cascade; +NOTICE: drop cascades to table c1 +create table p1 (f1 int, f2 int); +create table c1 (f1 int not null) inherits(p1); +NOTICE: merging column "f1" with inherited definition +-- should be rejected since c1.f1 is inherited +alter table c1 drop column f1; +ERROR: cannot drop inherited column "f1" +alter table only p1 drop column f1; +-- c1.f1 is still there, but no longer inherited +alter table c1 drop column f1; +drop table p1 cascade; +NOTICE: drop cascades to table c1 +create table p1(id int, name text); +create table p2(id2 int, name text, height int); +create table c1(age int) inherits(p1,p2); +NOTICE: merging multiple inherited definitions of column "name" +create table gc1() inherits (c1); +select relname, attname, attinhcount, attislocal +from pg_class join pg_attribute on (pg_class.oid = pg_attribute.attrelid) +where relname in ('p1','p2','c1','gc1') and attnum > 0 and not attisdropped +order by relname, attnum; + relname | attname | attinhcount | attislocal +---------+---------+-------------+------------ + c1 | id | 1 | f + c1 | name | 2 | f + c1 | id2 | 1 | f + c1 | height | 1 | f + c1 | age | 0 | t + gc1 | id | 1 | f + gc1 | name | 1 | f + gc1 | id2 | 1 | f + gc1 | height | 1 | f + gc1 | age | 1 | f + p1 | id | 0 | t + p1 | name | 0 | t + p2 | id2 | 0 | t + p2 | name | 0 | t + p2 | height | 0 | t +(15 rows) + +-- should work +alter table only p1 drop column name; +-- should work. Now c1.name is local and inhcount is 0. 
+alter table p2 drop column name; +-- should be rejected since its inherited +alter table gc1 drop column name; +ERROR: cannot drop inherited column "name" +-- should work, and drop gc1.name along +alter table c1 drop column name; +-- should fail: column does not exist +alter table gc1 drop column name; +ERROR: column "name" of relation "gc1" does not exist +-- should work and drop the attribute in all tables +alter table p2 drop column height; +-- IF EXISTS test +create table dropColumnExists (); +alter table dropColumnExists drop column non_existing; --fail +ERROR: column "non_existing" of relation "dropcolumnexists" does not exist +alter table dropColumnExists drop column if exists non_existing; --succeed +NOTICE: column "non_existing" of relation "dropcolumnexists" does not exist, skipping +select relname, attname, attinhcount, attislocal +from pg_class join pg_attribute on (pg_class.oid = pg_attribute.attrelid) +where relname in ('p1','p2','c1','gc1') and attnum > 0 and not attisdropped +order by relname, attnum; + relname | attname | attinhcount | attislocal +---------+---------+-------------+------------ + c1 | id | 1 | f + c1 | id2 | 1 | f + c1 | age | 0 | t + gc1 | id | 1 | f + gc1 | id2 | 1 | f + gc1 | age | 1 | f + p1 | id | 0 | t + p2 | id2 | 0 | t +(8 rows) + +drop table p1, p2 cascade; +NOTICE: drop cascades to 2 other objects +DETAIL: drop cascades to table c1 +drop cascades to table gc1 +-- test attinhcount tracking with merged columns +create table depth0(); +create table depth1(c text) inherits (depth0); +create table depth2() inherits (depth1); +alter table depth0 add c text; +NOTICE: merging definition of column "c" for child "depth1" +select attrelid::regclass, attname, attinhcount, attislocal +from pg_attribute +where attnum > 0 and attrelid::regclass in ('depth0', 'depth1', 'depth2') +order by attrelid::regclass::text, attnum; + attrelid | attname | attinhcount | attislocal +----------+---------+-------------+------------ + depth0 | c | 0 | t + depth1 | c | 1 | t + depth2 | c | 1 | f +(3 rows) + +-- test renumbering of child-table columns in inherited operations +create table p1 (f1 int); +create table c1 (f2 text, f3 int) inherits (p1); +alter table p1 add column a1 int check (a1 > 0); +alter table p1 add column f2 text; +NOTICE: merging definition of column "f2" for child "c1" +insert into p1 values (1,2,'abc'); +insert into c1 values(11,'xyz',33,0); -- should fail +ERROR: new row for relation "c1" violates check constraint "p1_a1_check" +DETAIL: Failing row contains (11, xyz, 33, 0). 
+insert into c1 values(11,'xyz',33,22); +select * from p1; + f1 | a1 | f2 +----+----+----- + 1 | 2 | abc + 11 | 22 | xyz +(2 rows) + +update p1 set a1 = a1 + 1, f2 = upper(f2); +select * from p1; + f1 | a1 | f2 +----+----+----- + 1 | 3 | ABC + 11 | 23 | XYZ +(2 rows) + +drop table p1 cascade; +NOTICE: drop cascades to table c1 +-- test that operations with a dropped column do not try to reference +-- its datatype +create domain mytype as text; +create temp table foo (f1 text, f2 mytype, f3 text); +insert into foo values('bb','cc','dd'); +select * from foo; + f1 | f2 | f3 +----+----+---- + bb | cc | dd +(1 row) + +drop domain mytype cascade; +NOTICE: drop cascades to column f2 of table foo +select * from foo; + f1 | f3 +----+---- + bb | dd +(1 row) + +insert into foo values('qq','rr'); +select * from foo; + f1 | f3 +----+---- + bb | dd + qq | rr +(2 rows) + +update foo set f3 = 'zz'; +select * from foo; + f1 | f3 +----+---- + bb | zz + qq | zz +(2 rows) + +select f3,max(f1) from foo group by f3; + f3 | max +----+----- + zz | qq +(1 row) + +-- Simple tests for alter table column type +alter table foo alter f1 TYPE integer; -- fails +ERROR: column "f1" cannot be cast automatically to type integer +HINT: You might need to specify "USING f1::integer". +alter table foo alter f1 TYPE varchar(10); +create table anothertab (atcol1 serial8, atcol2 boolean, + constraint anothertab_chk check (atcol1 <= 3)); +insert into anothertab (atcol1, atcol2) values (default, true); +insert into anothertab (atcol1, atcol2) values (default, false); +select * from anothertab; + atcol1 | atcol2 +--------+-------- + 1 | t + 2 | f +(2 rows) + +alter table anothertab alter column atcol1 type boolean; -- fails +ERROR: column "atcol1" cannot be cast automatically to type boolean +HINT: You might need to specify "USING atcol1::boolean". +alter table anothertab alter column atcol1 type boolean using atcol1::int; -- fails +ERROR: result of USING clause for column "atcol1" cannot be cast automatically to type boolean +HINT: You might need to add an explicit cast. +alter table anothertab alter column atcol1 type integer; +select * from anothertab; + atcol1 | atcol2 +--------+-------- + 1 | t + 2 | f +(2 rows) + +insert into anothertab (atcol1, atcol2) values (45, null); -- fails +ERROR: new row for relation "anothertab" violates check constraint "anothertab_chk" +DETAIL: Failing row contains (45, null). +insert into anothertab (atcol1, atcol2) values (default, null); +select * from anothertab; + atcol1 | atcol2 +--------+-------- + 1 | t + 2 | f + 3 | +(3 rows) + +alter table anothertab alter column atcol2 type text + using case when atcol2 is true then 'IT WAS TRUE' + when atcol2 is false then 'IT WAS FALSE' + else 'IT WAS NULL!' end; +select * from anothertab; + atcol1 | atcol2 +--------+-------------- + 1 | IT WAS TRUE + 2 | IT WAS FALSE + 3 | IT WAS NULL! +(3 rows) + +alter table anothertab alter column atcol1 type boolean + using case when atcol1 % 2 = 0 then true else false end; -- fails +ERROR: default for column "atcol1" cannot be cast automatically to type boolean +alter table anothertab alter column atcol1 drop default; +alter table anothertab alter column atcol1 type boolean + using case when atcol1 % 2 = 0 then true else false end; -- fails +ERROR: operator does not exist: boolean <= integer +HINT: No operator matches the given name and argument types. You might need to add explicit type casts. 
+alter table anothertab drop constraint anothertab_chk; +alter table anothertab drop constraint anothertab_chk; -- fails +ERROR: constraint "anothertab_chk" of relation "anothertab" does not exist +alter table anothertab drop constraint IF EXISTS anothertab_chk; -- succeeds +NOTICE: constraint "anothertab_chk" of relation "anothertab" does not exist, skipping +alter table anothertab alter column atcol1 type boolean + using case when atcol1 % 2 = 0 then true else false end; +select * from anothertab; + atcol1 | atcol2 +--------+-------------- + f | IT WAS TRUE + t | IT WAS FALSE + f | IT WAS NULL! +(3 rows) + +drop table anothertab; +-- Test index handling in alter table column type (cf. bugs #15835, #15865) +create table anothertab(f1 int primary key, f2 int unique, + f3 int, f4 int, f5 int); +alter table anothertab + add exclude using btree (f3 with =); +alter table anothertab + add exclude using btree (f4 with =) where (f4 is not null); +alter table anothertab + add exclude using btree (f4 with =) where (f5 > 0); +alter table anothertab + add unique(f1,f4); +create index on anothertab(f2,f3); +create unique index on anothertab(f4); +\d anothertab + Table "public.anothertab" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + f1 | integer | | not null | + f2 | integer | | | + f3 | integer | | | + f4 | integer | | | + f5 | integer | | | +Indexes: + "anothertab_pkey" PRIMARY KEY, btree (f1) + "anothertab_f1_f4_key" UNIQUE CONSTRAINT, btree (f1, f4) + "anothertab_f2_f3_idx" btree (f2, f3) + "anothertab_f2_key" UNIQUE CONSTRAINT, btree (f2) + "anothertab_f3_excl" EXCLUDE USING btree (f3 WITH =) + "anothertab_f4_excl" EXCLUDE USING btree (f4 WITH =) WHERE (f4 IS NOT NULL) + "anothertab_f4_excl1" EXCLUDE USING btree (f4 WITH =) WHERE (f5 > 0) + "anothertab_f4_idx" UNIQUE, btree (f4) + +alter table anothertab alter column f1 type bigint; +alter table anothertab + alter column f2 type bigint, + alter column f3 type bigint, + alter column f4 type bigint; +alter table anothertab alter column f5 type bigint; +\d anothertab + Table "public.anothertab" + Column | Type | Collation | Nullable | Default +--------+--------+-----------+----------+--------- + f1 | bigint | | not null | + f2 | bigint | | | + f3 | bigint | | | + f4 | bigint | | | + f5 | bigint | | | +Indexes: + "anothertab_pkey" PRIMARY KEY, btree (f1) + "anothertab_f1_f4_key" UNIQUE CONSTRAINT, btree (f1, f4) + "anothertab_f2_f3_idx" btree (f2, f3) + "anothertab_f2_key" UNIQUE CONSTRAINT, btree (f2) + "anothertab_f3_excl" EXCLUDE USING btree (f3 WITH =) + "anothertab_f4_excl" EXCLUDE USING btree (f4 WITH =) WHERE (f4 IS NOT NULL) + "anothertab_f4_excl1" EXCLUDE USING btree (f4 WITH =) WHERE (f5 > 0) + "anothertab_f4_idx" UNIQUE, btree (f4) + +drop table anothertab; +-- test that USING expressions are parsed before column alter type / drop steps +create table another (f1 int, f2 text, f3 text); +insert into another values(1, 'one', 'uno'); +insert into another values(2, 'two', 'due'); +insert into another values(3, 'three', 'tre'); +select * from another; + f1 | f2 | f3 +----+-------+----- + 1 | one | uno + 2 | two | due + 3 | three | tre +(3 rows) + +alter table another + alter f1 type text using f2 || ' and ' || f3 || ' more', + alter f2 type bigint using f1 * 10, + drop column f3; +select * from another; + f1 | f2 +--------------------+---- + one and uno more | 10 + two and due more | 20 + three and tre more | 30 +(3 rows) + +drop table another; +-- Create an index that skips WAL, then 
perform a SET DATA TYPE that skips +-- rewriting the index. +begin; +create table skip_wal_skip_rewrite_index (c varchar(10) primary key); +alter table skip_wal_skip_rewrite_index alter c type varchar(20); +commit; +-- table's row type +create table tab1 (a int, b text); +create table tab2 (x int, y tab1); +alter table tab1 alter column b type varchar; -- fails +ERROR: cannot alter table "tab1" because column "tab2.y" uses its row type +-- Alter column type that's part of a partitioned index +create table at_partitioned (a int, b text) partition by range (a); +create table at_part_1 partition of at_partitioned for values from (0) to (1000); +insert into at_partitioned values (512, '0.123'); +create table at_part_2 (b text, a int); +insert into at_part_2 values ('1.234', 1024); +create index on at_partitioned (b); +create index on at_partitioned (a); +\d at_part_1 + Table "public.at_part_1" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + a | integer | | | + b | text | | | +Partition of: at_partitioned FOR VALUES FROM (0) TO (1000) +Indexes: + "at_part_1_a_idx" btree (a) + "at_part_1_b_idx" btree (b) + +\d at_part_2 + Table "public.at_part_2" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + b | text | | | + a | integer | | | + +alter table at_partitioned attach partition at_part_2 for values from (1000) to (2000); +\d at_part_2 + Table "public.at_part_2" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + b | text | | | + a | integer | | | +Partition of: at_partitioned FOR VALUES FROM (1000) TO (2000) +Indexes: + "at_part_2_a_idx" btree (a) + "at_part_2_b_idx" btree (b) + +alter table at_partitioned alter column b type numeric using b::numeric; +\d at_part_1 + Table "public.at_part_1" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + a | integer | | | + b | numeric | | | +Partition of: at_partitioned FOR VALUES FROM (0) TO (1000) +Indexes: + "at_part_1_a_idx" btree (a) + "at_part_1_b_idx" btree (b) + +\d at_part_2 + Table "public.at_part_2" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + b | numeric | | | + a | integer | | | +Partition of: at_partitioned FOR VALUES FROM (1000) TO (2000) +Indexes: + "at_part_2_a_idx" btree (a) + "at_part_2_b_idx" btree (b) + +drop table at_partitioned; +-- Alter column type when no table rewrite is required +-- Also check that comments are preserved +create table at_partitioned(id int, name varchar(64), unique (id, name)) + partition by hash(id); +comment on constraint at_partitioned_id_name_key on at_partitioned is 'parent constraint'; +comment on index at_partitioned_id_name_key is 'parent index'; +create table at_partitioned_0 partition of at_partitioned + for values with (modulus 2, remainder 0); +comment on constraint at_partitioned_0_id_name_key on at_partitioned_0 is 'child 0 constraint'; +comment on index at_partitioned_0_id_name_key is 'child 0 index'; +create table at_partitioned_1 partition of at_partitioned + for values with (modulus 2, remainder 1); +comment on constraint at_partitioned_1_id_name_key on at_partitioned_1 is 'child 1 constraint'; +comment on index at_partitioned_1_id_name_key is 'child 1 index'; +insert into at_partitioned values(1, 'foo'); +insert into at_partitioned values(3, 'bar'); +create temp table old_oids as + select relname, oid as oldoid, relfilenode as 
oldfilenode + from pg_class where relname like 'at_partitioned%'; +select relname, + c.oid = oldoid as orig_oid, + case relfilenode + when 0 then 'none' + when c.oid then 'own' + when oldfilenode then 'orig' + else 'OTHER' + end as storage, + obj_description(c.oid, 'pg_class') as desc + from pg_class c left join old_oids using (relname) + where relname like 'at_partitioned%' + order by relname; + relname | orig_oid | storage | desc +------------------------------+----------+---------+--------------- + at_partitioned | t | none | + at_partitioned_0 | t | own | + at_partitioned_0_id_name_key | t | own | child 0 index + at_partitioned_1 | t | own | + at_partitioned_1_id_name_key | t | own | child 1 index + at_partitioned_id_name_key | t | none | parent index +(6 rows) + +select conname, obj_description(oid, 'pg_constraint') as desc + from pg_constraint where conname like 'at_partitioned%' + order by conname; + conname | desc +------------------------------+-------------------- + at_partitioned_0_id_name_key | child 0 constraint + at_partitioned_1_id_name_key | child 1 constraint + at_partitioned_id_name_key | parent constraint +(3 rows) + +alter table at_partitioned alter column name type varchar(127); +-- Note: these tests currently show the wrong behavior for comments :-( +select relname, + c.oid = oldoid as orig_oid, + case relfilenode + when 0 then 'none' + when c.oid then 'own' + when oldfilenode then 'orig' + else 'OTHER' + end as storage, + obj_description(c.oid, 'pg_class') as desc + from pg_class c left join old_oids using (relname) + where relname like 'at_partitioned%' + order by relname; + relname | orig_oid | storage | desc +------------------------------+----------+---------+-------------- + at_partitioned | t | none | + at_partitioned_0 | t | own | + at_partitioned_0_id_name_key | f | own | parent index + at_partitioned_1 | t | own | + at_partitioned_1_id_name_key | f | own | parent index + at_partitioned_id_name_key | f | none | parent index +(6 rows) + +select conname, obj_description(oid, 'pg_constraint') as desc + from pg_constraint where conname like 'at_partitioned%' + order by conname; + conname | desc +------------------------------+------------------- + at_partitioned_0_id_name_key | + at_partitioned_1_id_name_key | + at_partitioned_id_name_key | parent constraint +(3 rows) + +-- Don't remove this DROP, it exposes bug #15672 +drop table at_partitioned; +-- disallow recursive containment of row types +create temp table recur1 (f1 int); +alter table recur1 add column f2 recur1; -- fails +ERROR: composite type recur1 cannot be made a member of itself +alter table recur1 add column f2 recur1[]; -- fails +ERROR: composite type recur1 cannot be made a member of itself +create domain array_of_recur1 as recur1[]; +alter table recur1 add column f2 array_of_recur1; -- fails +ERROR: composite type recur1 cannot be made a member of itself +create temp table recur2 (f1 int, f2 recur1); +alter table recur1 add column f2 recur2; -- fails +ERROR: composite type recur1 cannot be made a member of itself +alter table recur1 add column f2 int; +alter table recur1 alter column f2 type recur2; -- fails +ERROR: composite type recur1 cannot be made a member of itself +-- SET STORAGE may need to add a TOAST table +create table test_storage (a text); +alter table test_storage alter a set storage plain; +alter table test_storage add b int default 0; -- rewrite table to remove its TOAST table +alter table test_storage alter a set storage extended; -- re-add TOAST table +select reltoastrelid <> 0 
as has_toast_table +from pg_class +where oid = 'test_storage'::regclass; + has_toast_table +----------------- + t +(1 row) + +-- test that SET STORAGE propagates to index correctly +create index test_storage_idx on test_storage (b, a); +alter table test_storage alter column a set storage external; +\d+ test_storage + Table "public.test_storage" + Column | Type | Collation | Nullable | Default | Storage | Stats target | Description +--------+---------+-----------+----------+---------+----------+--------------+------------- + a | text | | | | external | | + b | integer | | | 0 | plain | | +Indexes: + "test_storage_idx" btree (b, a) + +\d+ test_storage_idx + Index "public.test_storage_idx" + Column | Type | Key? | Definition | Storage | Stats target +--------+---------+------+------------+----------+-------------- + b | integer | yes | b | plain | + a | text | yes | a | external | +btree, for table "public.test_storage" + +-- ALTER COLUMN TYPE with a check constraint and a child table (bug #13779) +CREATE TABLE test_inh_check (a float check (a > 10.2), b float); +CREATE TABLE test_inh_check_child() INHERITS(test_inh_check); +\d test_inh_check + Table "public.test_inh_check" + Column | Type | Collation | Nullable | Default +--------+------------------+-----------+----------+--------- + a | double precision | | | + b | double precision | | | +Check constraints: + "test_inh_check_a_check" CHECK (a > 10.2::double precision) +Number of child tables: 1 (Use \d+ to list them.) + +\d test_inh_check_child + Table "public.test_inh_check_child" + Column | Type | Collation | Nullable | Default +--------+------------------+-----------+----------+--------- + a | double precision | | | + b | double precision | | | +Check constraints: + "test_inh_check_a_check" CHECK (a > 10.2::double precision) +Inherits: test_inh_check + +select relname, conname, coninhcount, conislocal, connoinherit + from pg_constraint c, pg_class r + where relname like 'test_inh_check%' and c.conrelid = r.oid + order by 1, 2; + relname | conname | coninhcount | conislocal | connoinherit +----------------------+------------------------+-------------+------------+-------------- + test_inh_check | test_inh_check_a_check | 0 | t | f + test_inh_check_child | test_inh_check_a_check | 1 | f | f +(2 rows) + +ALTER TABLE test_inh_check ALTER COLUMN a TYPE numeric; +\d test_inh_check + Table "public.test_inh_check" + Column | Type | Collation | Nullable | Default +--------+------------------+-----------+----------+--------- + a | numeric | | | + b | double precision | | | +Check constraints: + "test_inh_check_a_check" CHECK (a::double precision > 10.2::double precision) +Number of child tables: 1 (Use \d+ to list them.) 
+ +\d test_inh_check_child + Table "public.test_inh_check_child" + Column | Type | Collation | Nullable | Default +--------+------------------+-----------+----------+--------- + a | numeric | | | + b | double precision | | | +Check constraints: + "test_inh_check_a_check" CHECK (a::double precision > 10.2::double precision) +Inherits: test_inh_check + +select relname, conname, coninhcount, conislocal, connoinherit + from pg_constraint c, pg_class r + where relname like 'test_inh_check%' and c.conrelid = r.oid + order by 1, 2; + relname | conname | coninhcount | conislocal | connoinherit +----------------------+------------------------+-------------+------------+-------------- + test_inh_check | test_inh_check_a_check | 0 | t | f + test_inh_check_child | test_inh_check_a_check | 1 | f | f +(2 rows) + +-- also try noinherit, local, and local+inherited cases +ALTER TABLE test_inh_check ADD CONSTRAINT bnoinherit CHECK (b > 100) NO INHERIT; +ALTER TABLE test_inh_check_child ADD CONSTRAINT blocal CHECK (b < 1000); +ALTER TABLE test_inh_check_child ADD CONSTRAINT bmerged CHECK (b > 1); +ALTER TABLE test_inh_check ADD CONSTRAINT bmerged CHECK (b > 1); +NOTICE: merging constraint "bmerged" with inherited definition +\d test_inh_check + Table "public.test_inh_check" + Column | Type | Collation | Nullable | Default +--------+------------------+-----------+----------+--------- + a | numeric | | | + b | double precision | | | +Check constraints: + "bmerged" CHECK (b > 1::double precision) + "bnoinherit" CHECK (b > 100::double precision) NO INHERIT + "test_inh_check_a_check" CHECK (a::double precision > 10.2::double precision) +Number of child tables: 1 (Use \d+ to list them.) + +\d test_inh_check_child + Table "public.test_inh_check_child" + Column | Type | Collation | Nullable | Default +--------+------------------+-----------+----------+--------- + a | numeric | | | + b | double precision | | | +Check constraints: + "blocal" CHECK (b < 1000::double precision) + "bmerged" CHECK (b > 1::double precision) + "test_inh_check_a_check" CHECK (a::double precision > 10.2::double precision) +Inherits: test_inh_check + +select relname, conname, coninhcount, conislocal, connoinherit + from pg_constraint c, pg_class r + where relname like 'test_inh_check%' and c.conrelid = r.oid + order by 1, 2; + relname | conname | coninhcount | conislocal | connoinherit +----------------------+------------------------+-------------+------------+-------------- + test_inh_check | bmerged | 0 | t | f + test_inh_check | bnoinherit | 0 | t | t + test_inh_check | test_inh_check_a_check | 0 | t | f + test_inh_check_child | blocal | 0 | t | f + test_inh_check_child | bmerged | 1 | t | f + test_inh_check_child | test_inh_check_a_check | 1 | f | f +(6 rows) + +ALTER TABLE test_inh_check ALTER COLUMN b TYPE numeric; +NOTICE: merging constraint "bmerged" with inherited definition +\d test_inh_check + Table "public.test_inh_check" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + a | numeric | | | + b | numeric | | | +Check constraints: + "bmerged" CHECK (b::double precision > 1::double precision) + "bnoinherit" CHECK (b::double precision > 100::double precision) NO INHERIT + "test_inh_check_a_check" CHECK (a::double precision > 10.2::double precision) +Number of child tables: 1 (Use \d+ to list them.) 
+ +\d test_inh_check_child + Table "public.test_inh_check_child" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + a | numeric | | | + b | numeric | | | +Check constraints: + "blocal" CHECK (b::double precision < 1000::double precision) + "bmerged" CHECK (b::double precision > 1::double precision) + "test_inh_check_a_check" CHECK (a::double precision > 10.2::double precision) +Inherits: test_inh_check + +select relname, conname, coninhcount, conislocal, connoinherit + from pg_constraint c, pg_class r + where relname like 'test_inh_check%' and c.conrelid = r.oid + order by 1, 2; + relname | conname | coninhcount | conislocal | connoinherit +----------------------+------------------------+-------------+------------+-------------- + test_inh_check | bmerged | 0 | t | f + test_inh_check | bnoinherit | 0 | t | t + test_inh_check | test_inh_check_a_check | 0 | t | f + test_inh_check_child | blocal | 0 | t | f + test_inh_check_child | bmerged | 1 | t | f + test_inh_check_child | test_inh_check_a_check | 1 | f | f +(6 rows) + +-- ALTER COLUMN TYPE with different schema in children +-- Bug at https://postgr.es/m/20170102225618.GA10071@telsasoft.com +CREATE TABLE test_type_diff (f1 int); +CREATE TABLE test_type_diff_c (extra smallint) INHERITS (test_type_diff); +ALTER TABLE test_type_diff ADD COLUMN f2 int; +INSERT INTO test_type_diff_c VALUES (1, 2, 3); +ALTER TABLE test_type_diff ALTER COLUMN f2 TYPE bigint USING f2::bigint; +CREATE TABLE test_type_diff2 (int_two int2, int_four int4, int_eight int8); +CREATE TABLE test_type_diff2_c1 (int_four int4, int_eight int8, int_two int2); +CREATE TABLE test_type_diff2_c2 (int_eight int8, int_two int2, int_four int4); +CREATE TABLE test_type_diff2_c3 (int_two int2, int_four int4, int_eight int8); +ALTER TABLE test_type_diff2_c1 INHERIT test_type_diff2; +ALTER TABLE test_type_diff2_c2 INHERIT test_type_diff2; +ALTER TABLE test_type_diff2_c3 INHERIT test_type_diff2; +INSERT INTO test_type_diff2_c1 VALUES (1, 2, 3); +INSERT INTO test_type_diff2_c2 VALUES (4, 5, 6); +INSERT INTO test_type_diff2_c3 VALUES (7, 8, 9); +ALTER TABLE test_type_diff2 ALTER COLUMN int_four TYPE int8 USING int_four::int8; +-- whole-row references are disallowed +ALTER TABLE test_type_diff2 ALTER COLUMN int_four TYPE int4 USING (pg_column_size(test_type_diff2)); +ERROR: cannot convert whole-row table reference +DETAIL: USING expression contains a whole-row table reference. 
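+-- Aside (illustrative sketch, not part of the regression output): only
+-- whole-row references are rejected in USING; expressions over individual
+-- columns, including columns other than the one being altered, are fine.
+-- The table name "using_cols_demo" is hypothetical; DDL only, so no
+-- result rows are expected.
+create table using_cols_demo (a int, b int);
+alter table using_cols_demo alter column a type int8
+ using (a + coalesce(b, 0));
+drop table using_cols_demo;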
+-- check for rollback of ANALYZE corrupting table property flags (bug #11638)
+CREATE TABLE check_fk_presence_1 (id int PRIMARY KEY, t text);
+CREATE TABLE check_fk_presence_2 (id int REFERENCES check_fk_presence_1, t text);
+BEGIN;
+ALTER TABLE check_fk_presence_2 DROP CONSTRAINT check_fk_presence_2_id_fkey;
+ANALYZE check_fk_presence_2;
+ROLLBACK;
+\d check_fk_presence_2
+ Table "public.check_fk_presence_2"
+ Column | Type | Collation | Nullable | Default
+--------+---------+-----------+----------+---------
+ id | integer | | |
+ t | text | | |
+Foreign-key constraints:
+ "check_fk_presence_2_id_fkey" FOREIGN KEY (id) REFERENCES check_fk_presence_1(id)
+
+DROP TABLE check_fk_presence_1, check_fk_presence_2;
+-- check column addition within a view (bug #14876)
+create table at_base_table(id int, stuff text);
+insert into at_base_table values (23, 'skidoo');
+create view at_view_1 as select * from at_base_table bt;
+create view at_view_2 as select *, to_json(v1) as j from at_view_1 v1;
+\d+ at_view_1
+ View "public.at_view_1"
+ Column | Type | Collation | Nullable | Default | Storage | Description
+--------+---------+-----------+----------+---------+----------+-------------
+ id | integer | | | | plain |
+ stuff | text | | | | extended |
+View definition:
+ SELECT bt.id,
+ bt.stuff
+ FROM at_base_table bt;
+
+\d+ at_view_2
+ View "public.at_view_2"
+ Column | Type | Collation | Nullable | Default | Storage | Description
+--------+---------+-----------+----------+---------+----------+-------------
+ id | integer | | | | plain |
+ stuff | text | | | | extended |
+ j | json | | | | extended |
+View definition:
+ SELECT v1.id,
+ v1.stuff,
+ to_json(v1.*) AS j
+ FROM at_view_1 v1;
+
+explain (verbose, costs off) select * from at_view_2;
+ QUERY PLAN
+----------------------------------------------------------
+ Seq Scan on public.at_base_table bt
+ Output: bt.id, bt.stuff, to_json(ROW(bt.id, bt.stuff))
+(2 rows)
+
+select * from at_view_2;
+ id | stuff | j
+----+--------+----------------------------
+ 23 | skidoo | {"id":23,"stuff":"skidoo"}
+(1 row)
+
+create or replace view at_view_1 as select *, 2+2 as more from at_base_table bt;
+\d+ at_view_1
+ View "public.at_view_1"
+ Column | Type | Collation | Nullable | Default | Storage | Description
+--------+---------+-----------+----------+---------+----------+-------------
+ id | integer | | | | plain |
+ stuff | text | | | | extended |
+ more | integer | | | | plain |
+View definition:
+ SELECT bt.id,
+ bt.stuff,
+ 2 + 2 AS more
+ FROM at_base_table bt;
+
+\d+ at_view_2
+ View "public.at_view_2"
+ Column | Type | Collation | Nullable | Default | Storage | Description
+--------+---------+-----------+----------+---------+----------+-------------
+ id | integer | | | | plain |
+ stuff | text | | | | extended |
+ j | json | | | | extended |
+View definition:
+ SELECT v1.id,
+ v1.stuff,
+ to_json(v1.*) AS j
+ FROM at_view_1 v1;
+
+explain (verbose, costs off) select * from at_view_2;
+ QUERY PLAN
+----------------------------------------------------------------
+ Seq Scan on public.at_base_table bt
+ Output: bt.id, bt.stuff, to_json(ROW(bt.id, bt.stuff, NULL))
+(2 rows)
+
+select * from at_view_2;
+ id | stuff | j
+----+--------+----------------------------------------
+ 23 | skidoo | {"id":23,"stuff":"skidoo","more":null}
+(1 row)
+
+drop view at_view_2;
+drop view at_view_1;
+drop table at_base_table;
+-- check adding a column not itself requiring a rewrite, together with
+-- a column requiring a default (bug #16038)
+-- ensure that rewrites aren't
silently optimized away, removing the +-- value of the test +CREATE FUNCTION check_ddl_rewrite(p_tablename regclass, p_ddl text) +RETURNS boolean +LANGUAGE plpgsql AS $$ +DECLARE + v_relfilenode oid; +BEGIN + v_relfilenode := relfilenode FROM pg_class WHERE oid = p_tablename; + + EXECUTE p_ddl; + + RETURN v_relfilenode <> (SELECT relfilenode FROM pg_class WHERE oid = p_tablename); +END; +$$; +CREATE TABLE rewrite_test(col text); +INSERT INTO rewrite_test VALUES ('something'); +INSERT INTO rewrite_test VALUES (NULL); +-- empty[12] don't need rewrite, but notempty[12]_rewrite will force one +SELECT check_ddl_rewrite('rewrite_test', $$ + ALTER TABLE rewrite_test + ADD COLUMN empty1 text, + ADD COLUMN notempty1_rewrite serial; +$$); + check_ddl_rewrite +------------------- + t +(1 row) + +SELECT check_ddl_rewrite('rewrite_test', $$ + ALTER TABLE rewrite_test + ADD COLUMN notempty2_rewrite serial, + ADD COLUMN empty2 text; +$$); + check_ddl_rewrite +------------------- + t +(1 row) + +-- also check that fast defaults cause no problem, first without rewrite +SELECT check_ddl_rewrite('rewrite_test', $$ + ALTER TABLE rewrite_test + ADD COLUMN empty3 text, + ADD COLUMN notempty3_norewrite int default 42; +$$); + check_ddl_rewrite +------------------- + f +(1 row) + +SELECT check_ddl_rewrite('rewrite_test', $$ + ALTER TABLE rewrite_test + ADD COLUMN notempty4_norewrite int default 42, + ADD COLUMN empty4 text; +$$); + check_ddl_rewrite +------------------- + f +(1 row) + +-- then with rewrite +SELECT check_ddl_rewrite('rewrite_test', $$ + ALTER TABLE rewrite_test + ADD COLUMN empty5 text, + ADD COLUMN notempty5_norewrite int default 42, + ADD COLUMN notempty5_rewrite serial; +$$); + check_ddl_rewrite +------------------- + t +(1 row) + +SELECT check_ddl_rewrite('rewrite_test', $$ + ALTER TABLE rewrite_test + ADD COLUMN notempty6_rewrite serial, + ADD COLUMN empty6 text, + ADD COLUMN notempty6_norewrite int default 42; +$$); + check_ddl_rewrite +------------------- + t +(1 row) + +-- cleanup +DROP FUNCTION check_ddl_rewrite(regclass, text); +DROP TABLE rewrite_test; +-- +-- lock levels +-- +drop type lockmodes; +ERROR: type "lockmodes" does not exist +create type lockmodes as enum ( + 'SIReadLock' +,'AccessShareLock' +,'RowShareLock' +,'RowExclusiveLock' +,'ShareUpdateExclusiveLock' +,'ShareLock' +,'ShareRowExclusiveLock' +,'ExclusiveLock' +,'AccessExclusiveLock' +); +drop view my_locks; +ERROR: view "my_locks" does not exist +create or replace view my_locks as +select case when c.relname like 'pg_toast%' then 'pg_toast' else c.relname end, max(mode::lockmodes) as max_lockmode +from pg_locks l join pg_class c on l.relation = c.oid +where virtualtransaction = ( + select virtualtransaction + from pg_locks + where transactionid = pg_current_xact_id()::xid) +and locktype = 'relation' +and relnamespace != (select oid from pg_namespace where nspname = 'pg_catalog') +and c.relname != 'my_locks' +group by c.relname; +create table alterlock (f1 int primary key, f2 text); +insert into alterlock values (1, 'foo'); +create table alterlock2 (f3 int primary key, f1 int); +insert into alterlock2 values (1, 1); +begin; alter table alterlock alter column f2 set statistics 150; +select * from my_locks order by 1; + relname | max_lockmode +-----------+-------------------------- + alterlock | ShareUpdateExclusiveLock +(1 row) + +rollback; +begin; alter table alterlock cluster on alterlock_pkey; +select * from my_locks order by 1; + relname | max_lockmode +----------------+-------------------------- + alterlock | 
ShareUpdateExclusiveLock + alterlock_pkey | ShareUpdateExclusiveLock +(2 rows) + +commit; +begin; alter table alterlock set without cluster; +select * from my_locks order by 1; + relname | max_lockmode +-----------+-------------------------- + alterlock | ShareUpdateExclusiveLock +(1 row) + +commit; +begin; alter table alterlock set (fillfactor = 100); +select * from my_locks order by 1; + relname | max_lockmode +-----------+-------------------------- + alterlock | ShareUpdateExclusiveLock + pg_toast | ShareUpdateExclusiveLock +(2 rows) + +commit; +begin; alter table alterlock reset (fillfactor); +select * from my_locks order by 1; + relname | max_lockmode +-----------+-------------------------- + alterlock | ShareUpdateExclusiveLock + pg_toast | ShareUpdateExclusiveLock +(2 rows) + +commit; +begin; alter table alterlock set (toast.autovacuum_enabled = off); +select * from my_locks order by 1; + relname | max_lockmode +-----------+-------------------------- + alterlock | ShareUpdateExclusiveLock + pg_toast | ShareUpdateExclusiveLock +(2 rows) + +commit; +begin; alter table alterlock set (autovacuum_enabled = off); +select * from my_locks order by 1; + relname | max_lockmode +-----------+-------------------------- + alterlock | ShareUpdateExclusiveLock + pg_toast | ShareUpdateExclusiveLock +(2 rows) + +commit; +begin; alter table alterlock alter column f2 set (n_distinct = 1); +select * from my_locks order by 1; + relname | max_lockmode +-----------+-------------------------- + alterlock | ShareUpdateExclusiveLock +(1 row) + +rollback; +-- test that mixing options with different lock levels works as expected +begin; alter table alterlock set (autovacuum_enabled = off, fillfactor = 80); +select * from my_locks order by 1; + relname | max_lockmode +-----------+-------------------------- + alterlock | ShareUpdateExclusiveLock + pg_toast | ShareUpdateExclusiveLock +(2 rows) + +commit; +begin; alter table alterlock alter column f2 set storage extended; +select * from my_locks order by 1; + relname | max_lockmode +-----------+--------------------- + alterlock | AccessExclusiveLock +(1 row) + +rollback; +begin; alter table alterlock alter column f2 set default 'x'; +select * from my_locks order by 1; + relname | max_lockmode +-----------+--------------------- + alterlock | AccessExclusiveLock +(1 row) + +rollback; +begin; +create trigger ttdummy + before delete or update on alterlock + for each row + execute procedure + ttdummy (1, 1); +select * from my_locks order by 1; + relname | max_lockmode +-----------+----------------------- + alterlock | ShareRowExclusiveLock +(1 row) + +rollback; +begin; +select * from my_locks order by 1; + relname | max_lockmode +---------+-------------- +(0 rows) + +alter table alterlock2 add foreign key (f1) references alterlock (f1); +select * from my_locks order by 1; + relname | max_lockmode +-----------------+----------------------- + alterlock | ShareRowExclusiveLock + alterlock2 | ShareRowExclusiveLock + alterlock2_pkey | AccessShareLock + alterlock_pkey | AccessShareLock +(4 rows) + +rollback; +begin; +alter table alterlock2 +add constraint alterlock2nv foreign key (f1) references alterlock (f1) NOT VALID; +select * from my_locks order by 1; + relname | max_lockmode +------------+----------------------- + alterlock | ShareRowExclusiveLock + alterlock2 | ShareRowExclusiveLock +(2 rows) + +commit; +begin; +alter table alterlock2 validate constraint alterlock2nv; +select * from my_locks order by 1; + relname | max_lockmode 
+-----------------+-------------------------- + alterlock | RowShareLock + alterlock2 | ShareUpdateExclusiveLock + alterlock2_pkey | AccessShareLock + alterlock_pkey | AccessShareLock +(4 rows) + +rollback; +create or replace view my_locks as +select case when c.relname like 'pg_toast%' then 'pg_toast' else c.relname end, max(mode::lockmodes) as max_lockmode +from pg_locks l join pg_class c on l.relation = c.oid +where virtualtransaction = ( + select virtualtransaction + from pg_locks + where transactionid = pg_current_xact_id()::xid) +and locktype = 'relation' +and relnamespace != (select oid from pg_namespace where nspname = 'pg_catalog') +and c.relname = 'my_locks' +group by c.relname; +-- raise exception +alter table my_locks set (autovacuum_enabled = false); +ERROR: unrecognized parameter "autovacuum_enabled" +alter view my_locks set (autovacuum_enabled = false); +ERROR: unrecognized parameter "autovacuum_enabled" +alter table my_locks reset (autovacuum_enabled); +alter view my_locks reset (autovacuum_enabled); +begin; +alter view my_locks set (security_barrier=off); +select * from my_locks order by 1; + relname | max_lockmode +----------+--------------------- + my_locks | AccessExclusiveLock +(1 row) + +alter view my_locks reset (security_barrier); +rollback; +-- this test intentionally applies the ALTER TABLE command against a view, but +-- uses a view option so we expect this to succeed. This form of SQL is +-- accepted for historical reasons, as shown in the docs for ALTER VIEW +begin; +alter table my_locks set (security_barrier=off); +select * from my_locks order by 1; + relname | max_lockmode +----------+--------------------- + my_locks | AccessExclusiveLock +(1 row) + +alter table my_locks reset (security_barrier); +rollback; +-- cleanup +drop table alterlock2; +drop table alterlock; +drop view my_locks; +drop type lockmodes; +-- +-- alter function +-- +create function test_strict(text) returns text as + 'select coalesce($1, ''got passed a null'');' + language sql returns null on null input; +select test_strict(NULL); + test_strict +------------- + +(1 row) + +alter function test_strict(text) called on null input; +select test_strict(NULL); + test_strict +------------------- + got passed a null +(1 row) + +create function non_strict(text) returns text as + 'select coalesce($1, ''got passed a null'');' + language sql called on null input; +select non_strict(NULL); + non_strict +------------------- + got passed a null +(1 row) + +alter function non_strict(text) returns null on null input; +select non_strict(NULL); + non_strict +------------ + +(1 row) + +-- +-- alter object set schema +-- +create schema alter1; +create schema alter2; +create table alter1.t1(f1 serial primary key, f2 int check (f2 > 0)); +create view alter1.v1 as select * from alter1.t1; +create function alter1.plus1(int) returns int as 'select $1+1' language sql; +create domain alter1.posint integer check (value > 0); +create type alter1.ctype as (f1 int, f2 text); +create function alter1.same(alter1.ctype, alter1.ctype) returns boolean language sql +as 'select $1.f1 is not distinct from $2.f1 and $1.f2 is not distinct from $2.f2'; +create operator alter1.=(procedure = alter1.same, leftarg = alter1.ctype, rightarg = alter1.ctype); +create operator class alter1.ctype_hash_ops default for type alter1.ctype using hash as + operator 1 alter1.=(alter1.ctype, alter1.ctype); +create conversion alter1.latin1_to_utf8 for 'latin1' to 'utf8' from iso8859_1_to_utf8; +create text search parser alter1.prs(start = prsd_start, 
gettoken = prsd_nexttoken, end = prsd_end, lextypes = prsd_lextype); +create text search configuration alter1.cfg(parser = alter1.prs); +create text search template alter1.tmpl(init = dsimple_init, lexize = dsimple_lexize); +create text search dictionary alter1.dict(template = alter1.tmpl); +insert into alter1.t1(f2) values(11); +insert into alter1.t1(f2) values(12); +alter table alter1.t1 set schema alter1; -- no-op, same schema +alter table alter1.t1 set schema alter2; +alter table alter1.v1 set schema alter2; +alter function alter1.plus1(int) set schema alter2; +alter domain alter1.posint set schema alter2; +alter operator class alter1.ctype_hash_ops using hash set schema alter2; +alter operator family alter1.ctype_hash_ops using hash set schema alter2; +alter operator alter1.=(alter1.ctype, alter1.ctype) set schema alter2; +alter function alter1.same(alter1.ctype, alter1.ctype) set schema alter2; +alter type alter1.ctype set schema alter1; -- no-op, same schema +alter type alter1.ctype set schema alter2; +alter conversion alter1.latin1_to_utf8 set schema alter2; +alter text search parser alter1.prs set schema alter2; +alter text search configuration alter1.cfg set schema alter2; +alter text search template alter1.tmpl set schema alter2; +alter text search dictionary alter1.dict set schema alter2; +-- this should succeed because nothing is left in alter1 +drop schema alter1; +insert into alter2.t1(f2) values(13); +insert into alter2.t1(f2) values(14); +select * from alter2.t1; + f1 | f2 +----+---- + 1 | 11 + 2 | 12 + 3 | 13 + 4 | 14 +(4 rows) + +select * from alter2.v1; + f1 | f2 +----+---- + 1 | 11 + 2 | 12 + 3 | 13 + 4 | 14 +(4 rows) + +select alter2.plus1(41); + plus1 +------- + 42 +(1 row) + +-- clean up +drop schema alter2 cascade; +NOTICE: drop cascades to 13 other objects +DETAIL: drop cascades to table alter2.t1 +drop cascades to view alter2.v1 +drop cascades to function alter2.plus1(integer) +drop cascades to type alter2.posint +drop cascades to type alter2.ctype +drop cascades to function alter2.same(alter2.ctype,alter2.ctype) +drop cascades to operator alter2.=(alter2.ctype,alter2.ctype) +drop cascades to operator family alter2.ctype_hash_ops for access method hash +drop cascades to conversion alter2.latin1_to_utf8 +drop cascades to text search parser alter2.prs +drop cascades to text search configuration alter2.cfg +drop cascades to text search template alter2.tmpl +drop cascades to text search dictionary alter2.dict +-- +-- composite types +-- +CREATE TYPE test_type AS (a int); +\d test_type + Composite type "public.test_type" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + a | integer | | | + +ALTER TYPE nosuchtype ADD ATTRIBUTE b text; -- fails +ERROR: relation "nosuchtype" does not exist +ALTER TYPE test_type ADD ATTRIBUTE b text; +\d test_type + Composite type "public.test_type" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + a | integer | | | + b | text | | | + +ALTER TYPE test_type ADD ATTRIBUTE b text; -- fails +ERROR: column "b" of relation "test_type" already exists +ALTER TYPE test_type ALTER ATTRIBUTE b SET DATA TYPE varchar; +\d test_type + Composite type "public.test_type" + Column | Type | Collation | Nullable | Default +--------+-------------------+-----------+----------+--------- + a | integer | | | + b | character varying | | | + +ALTER TYPE test_type ALTER ATTRIBUTE b SET DATA TYPE integer; +\d test_type + Composite type "public.test_type" + 
Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + a | integer | | | + b | integer | | | + +ALTER TYPE test_type DROP ATTRIBUTE b; +\d test_type + Composite type "public.test_type" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + a | integer | | | + +ALTER TYPE test_type DROP ATTRIBUTE c; -- fails +ERROR: column "c" of relation "test_type" does not exist +ALTER TYPE test_type DROP ATTRIBUTE IF EXISTS c; +NOTICE: column "c" of relation "test_type" does not exist, skipping +ALTER TYPE test_type DROP ATTRIBUTE a, ADD ATTRIBUTE d boolean; +\d test_type + Composite type "public.test_type" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + d | boolean | | | + +ALTER TYPE test_type RENAME ATTRIBUTE a TO aa; +ERROR: column "a" does not exist +ALTER TYPE test_type RENAME ATTRIBUTE d TO dd; +\d test_type + Composite type "public.test_type" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + dd | boolean | | | + +DROP TYPE test_type; +CREATE TYPE test_type1 AS (a int, b text); +CREATE TABLE test_tbl1 (x int, y test_type1); +ALTER TYPE test_type1 ALTER ATTRIBUTE b TYPE varchar; -- fails +ERROR: cannot alter type "test_type1" because column "test_tbl1.y" uses it +CREATE TYPE test_type2 AS (a int, b text); +CREATE TABLE test_tbl2 OF test_type2; +CREATE TABLE test_tbl2_subclass () INHERITS (test_tbl2); +\d test_type2 + Composite type "public.test_type2" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + a | integer | | | + b | text | | | + +\d test_tbl2 + Table "public.test_tbl2" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + a | integer | | | + b | text | | | +Number of child tables: 1 (Use \d+ to list them.) +Typed table of type: test_type2 + +ALTER TYPE test_type2 ADD ATTRIBUTE c text; -- fails +ERROR: cannot alter type "test_type2" because it is the type of a typed table +HINT: Use ALTER ... CASCADE to alter the typed tables too. +ALTER TYPE test_type2 ADD ATTRIBUTE c text CASCADE; +\d test_type2 + Composite type "public.test_type2" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + a | integer | | | + b | text | | | + c | text | | | + +\d test_tbl2 + Table "public.test_tbl2" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + a | integer | | | + b | text | | | + c | text | | | +Number of child tables: 1 (Use \d+ to list them.) +Typed table of type: test_type2 + +ALTER TYPE test_type2 ALTER ATTRIBUTE b TYPE varchar; -- fails +ERROR: cannot alter type "test_type2" because it is the type of a typed table +HINT: Use ALTER ... CASCADE to alter the typed tables too. +ALTER TYPE test_type2 ALTER ATTRIBUTE b TYPE varchar CASCADE; +\d test_type2 + Composite type "public.test_type2" + Column | Type | Collation | Nullable | Default +--------+-------------------+-----------+----------+--------- + a | integer | | | + b | character varying | | | + c | text | | | + +\d test_tbl2 + Table "public.test_tbl2" + Column | Type | Collation | Nullable | Default +--------+-------------------+-----------+----------+--------- + a | integer | | | + b | character varying | | | + c | text | | | +Number of child tables: 1 (Use \d+ to list them.) 
+Typed table of type: test_type2 + +ALTER TYPE test_type2 DROP ATTRIBUTE b; -- fails +ERROR: cannot alter type "test_type2" because it is the type of a typed table +HINT: Use ALTER ... CASCADE to alter the typed tables too. +ALTER TYPE test_type2 DROP ATTRIBUTE b CASCADE; +\d test_type2 + Composite type "public.test_type2" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + a | integer | | | + c | text | | | + +\d test_tbl2 + Table "public.test_tbl2" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + a | integer | | | + c | text | | | +Number of child tables: 1 (Use \d+ to list them.) +Typed table of type: test_type2 + +ALTER TYPE test_type2 RENAME ATTRIBUTE a TO aa; -- fails +ERROR: cannot alter type "test_type2" because it is the type of a typed table +HINT: Use ALTER ... CASCADE to alter the typed tables too. +ALTER TYPE test_type2 RENAME ATTRIBUTE a TO aa CASCADE; +\d test_type2 + Composite type "public.test_type2" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + aa | integer | | | + c | text | | | + +\d test_tbl2 + Table "public.test_tbl2" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + aa | integer | | | + c | text | | | +Number of child tables: 1 (Use \d+ to list them.) +Typed table of type: test_type2 + +\d test_tbl2_subclass + Table "public.test_tbl2_subclass" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + aa | integer | | | + c | text | | | +Inherits: test_tbl2 + +DROP TABLE test_tbl2_subclass; +CREATE TYPE test_typex AS (a int, b text); +CREATE TABLE test_tblx (x int, y test_typex check ((y).a > 0)); +ALTER TYPE test_typex DROP ATTRIBUTE a; -- fails +ERROR: cannot drop column a of composite type test_typex because other objects depend on it +DETAIL: constraint test_tblx_y_check on table test_tblx depends on column a of composite type test_typex +HINT: Use DROP ... CASCADE to drop the dependent objects too. +ALTER TYPE test_typex DROP ATTRIBUTE a CASCADE; +NOTICE: drop cascades to constraint test_tblx_y_check on table test_tblx +\d test_tblx + Table "public.test_tblx" + Column | Type | Collation | Nullable | Default +--------+------------+-----------+----------+--------- + x | integer | | | + y | test_typex | | | + +DROP TABLE test_tblx; +DROP TYPE test_typex; +-- This test isn't that interesting on its own, but the purpose is to leave +-- behind a table to test pg_upgrade with. The table has a composite type +-- column in it, and the composite type has a dropped attribute. 
+CREATE TYPE test_type3 AS (a int);
+CREATE TABLE test_tbl3 (c) AS SELECT '(1)'::test_type3;
+ALTER TYPE test_type3 DROP ATTRIBUTE a, ADD ATTRIBUTE b int;
+CREATE TYPE test_type_empty AS ();
+DROP TYPE test_type_empty;
+--
+-- typed tables: OF / NOT OF
+--
+CREATE TYPE tt_t0 AS (z inet, x int, y numeric(8,2));
+ALTER TYPE tt_t0 DROP ATTRIBUTE z;
+CREATE TABLE tt0 (x int NOT NULL, y numeric(8,2)); -- OK
+CREATE TABLE tt1 (x int, y bigint); -- wrong base type
+CREATE TABLE tt2 (x int, y numeric(9,2)); -- wrong typmod
+CREATE TABLE tt3 (y numeric(8,2), x int); -- wrong column order
+CREATE TABLE tt4 (x int); -- too few columns
+CREATE TABLE tt5 (x int, y numeric(8,2), z int); -- too many columns
+CREATE TABLE tt6 () INHERITS (tt0); -- can't have a parent
+CREATE TABLE tt7 (x int, q text, y numeric(8,2));
+ALTER TABLE tt7 DROP q; -- OK
+ALTER TABLE tt0 OF tt_t0;
+ALTER TABLE tt1 OF tt_t0;
+ERROR: table "tt1" has different type for column "y"
+ALTER TABLE tt2 OF tt_t0;
+ERROR: table "tt2" has different type for column "y"
+ALTER TABLE tt3 OF tt_t0;
+ERROR: table has column "y" where type requires "x"
+ALTER TABLE tt4 OF tt_t0;
+ERROR: table is missing column "y"
+ALTER TABLE tt5 OF tt_t0;
+ERROR: table has extra column "z"
+ALTER TABLE tt6 OF tt_t0;
+ERROR: typed tables cannot inherit
+ALTER TABLE tt7 OF tt_t0;
+CREATE TYPE tt_t1 AS (x int, y numeric(8,2));
+ALTER TABLE tt7 OF tt_t1; -- reassign an already-typed table
+ALTER TABLE tt7 NOT OF;
+\d tt7
+ Table "public.tt7"
+ Column | Type | Collation | Nullable | Default
+--------+--------------+-----------+----------+---------
+ x | integer | | |
+ y | numeric(8,2) | | |
+
+-- make sure we can drop a constraint on the parent but it remains on the child
+CREATE TABLE test_drop_constr_parent (c text CHECK (c IS NOT NULL));
+CREATE TABLE test_drop_constr_child () INHERITS (test_drop_constr_parent);
+ALTER TABLE ONLY test_drop_constr_parent DROP CONSTRAINT "test_drop_constr_parent_c_check";
+-- should fail
+INSERT INTO test_drop_constr_child (c) VALUES (NULL);
+ERROR: new row for relation "test_drop_constr_child" violates check constraint "test_drop_constr_parent_c_check"
+DETAIL: Failing row contains (null).
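+-- Aside (illustrative sketch, not part of the regression output): without
+-- ONLY, the same DROP CONSTRAINT recurses and removes the inherited
+-- constraint from the child as well, so an insert like the failing one
+-- above would succeed. Hypothetical names; DDL only, so no result rows
+-- are expected.
+create table drop_constr_rec_parent (c text check (c is not null));
+create table drop_constr_rec_child () inherits (drop_constr_rec_parent);
+alter table drop_constr_rec_parent drop constraint drop_constr_rec_parent_c_check;
+drop table drop_constr_rec_child;
+drop table drop_constr_rec_parent;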
+DROP TABLE test_drop_constr_parent CASCADE; +NOTICE: drop cascades to table test_drop_constr_child +-- +-- IF EXISTS test +-- +ALTER TABLE IF EXISTS tt8 ADD COLUMN f int; +NOTICE: relation "tt8" does not exist, skipping +ALTER TABLE IF EXISTS tt8 ADD CONSTRAINT xxx PRIMARY KEY(f); +NOTICE: relation "tt8" does not exist, skipping +ALTER TABLE IF EXISTS tt8 ADD CHECK (f BETWEEN 0 AND 10); +NOTICE: relation "tt8" does not exist, skipping +ALTER TABLE IF EXISTS tt8 ALTER COLUMN f SET DEFAULT 0; +NOTICE: relation "tt8" does not exist, skipping +ALTER TABLE IF EXISTS tt8 RENAME COLUMN f TO f1; +NOTICE: relation "tt8" does not exist, skipping +ALTER TABLE IF EXISTS tt8 SET SCHEMA alter2; +NOTICE: relation "tt8" does not exist, skipping +CREATE TABLE tt8(a int); +CREATE SCHEMA alter2; +ALTER TABLE IF EXISTS tt8 ADD COLUMN f int; +ALTER TABLE IF EXISTS tt8 ADD CONSTRAINT xxx PRIMARY KEY(f); +ALTER TABLE IF EXISTS tt8 ADD CHECK (f BETWEEN 0 AND 10); +ALTER TABLE IF EXISTS tt8 ALTER COLUMN f SET DEFAULT 0; +ALTER TABLE IF EXISTS tt8 RENAME COLUMN f TO f1; +ALTER TABLE IF EXISTS tt8 SET SCHEMA alter2; +\d alter2.tt8 + Table "alter2.tt8" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + a | integer | | | + f1 | integer | | not null | 0 +Indexes: + "xxx" PRIMARY KEY, btree (f1) +Check constraints: + "tt8_f_check" CHECK (f1 >= 0 AND f1 <= 10) + +DROP TABLE alter2.tt8; +DROP SCHEMA alter2; +-- +-- Check conflicts between index and CHECK constraint names +-- +CREATE TABLE tt9(c integer); +ALTER TABLE tt9 ADD CHECK(c > 1); +ALTER TABLE tt9 ADD CHECK(c > 2); -- picks nonconflicting name +ALTER TABLE tt9 ADD CONSTRAINT foo CHECK(c > 3); +ALTER TABLE tt9 ADD CONSTRAINT foo CHECK(c > 4); -- fail, dup name +ERROR: constraint "foo" for relation "tt9" already exists +ALTER TABLE tt9 ADD UNIQUE(c); +ALTER TABLE tt9 ADD UNIQUE(c); -- picks nonconflicting name +ALTER TABLE tt9 ADD CONSTRAINT tt9_c_key UNIQUE(c); -- fail, dup name +ERROR: relation "tt9_c_key" already exists +ALTER TABLE tt9 ADD CONSTRAINT foo UNIQUE(c); -- fail, dup name +ERROR: constraint "foo" for relation "tt9" already exists +ALTER TABLE tt9 ADD CONSTRAINT tt9_c_key CHECK(c > 5); -- fail, dup name +ERROR: constraint "tt9_c_key" for relation "tt9" already exists +ALTER TABLE tt9 ADD CONSTRAINT tt9_c_key2 CHECK(c > 6); +ALTER TABLE tt9 ADD UNIQUE(c); -- picks nonconflicting name +\d tt9 + Table "public.tt9" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + c | integer | | | +Indexes: + "tt9_c_key" UNIQUE CONSTRAINT, btree (c) + "tt9_c_key1" UNIQUE CONSTRAINT, btree (c) + "tt9_c_key3" UNIQUE CONSTRAINT, btree (c) +Check constraints: + "foo" CHECK (c > 3) + "tt9_c_check" CHECK (c > 1) + "tt9_c_check1" CHECK (c > 2) + "tt9_c_key2" CHECK (c > 6) + +DROP TABLE tt9; +-- Check that comments on constraints and indexes are not lost at ALTER TABLE. 
+CREATE TABLE comment_test (
+ id int,
+ positive_col int CHECK (positive_col > 0),
+ indexed_col int,
+ CONSTRAINT comment_test_pk PRIMARY KEY (id));
+CREATE INDEX comment_test_index ON comment_test(indexed_col);
+COMMENT ON COLUMN comment_test.id IS 'Column ''id'' on comment_test';
+COMMENT ON INDEX comment_test_index IS 'Simple index on comment_test';
+COMMENT ON CONSTRAINT comment_test_positive_col_check ON comment_test IS 'CHECK constraint on comment_test.positive_col';
+COMMENT ON CONSTRAINT comment_test_pk ON comment_test IS 'PRIMARY KEY constraint of comment_test';
+COMMENT ON INDEX comment_test_pk IS 'Index backing the PRIMARY KEY of comment_test';
+SELECT col_description('comment_test'::regclass, 1) as comment;
+ comment
+-----------------------------
+ Column 'id' on comment_test
+(1 row)
+
+SELECT indexrelid::regclass::text as index, obj_description(indexrelid, 'pg_class') as comment FROM pg_index where indrelid = 'comment_test'::regclass ORDER BY 1, 2;
+ index | comment
+--------------------+-----------------------------------------------
+ comment_test_index | Simple index on comment_test
+ comment_test_pk | Index backing the PRIMARY KEY of comment_test
+(2 rows)
+
+SELECT conname as constraint, obj_description(oid, 'pg_constraint') as comment FROM pg_constraint where conrelid = 'comment_test'::regclass ORDER BY 1, 2;
+ constraint | comment
+---------------------------------+-----------------------------------------------
+ comment_test_pk | PRIMARY KEY constraint of comment_test
+ comment_test_positive_col_check | CHECK constraint on comment_test.positive_col
+(2 rows)
+
+-- Change the datatype of all the columns. ALTER TABLE is optimized to not
+-- rebuild an index if the new data type is binary compatible with the old
+-- one. Check by doing a dummy ALTER TABLE that doesn't change the datatype
+-- first, to test that no-op codepath, and another one that does.
+ALTER TABLE comment_test ALTER COLUMN indexed_col SET DATA TYPE int;
+ALTER TABLE comment_test ALTER COLUMN indexed_col SET DATA TYPE text;
+ALTER TABLE comment_test ALTER COLUMN id SET DATA TYPE int;
+ALTER TABLE comment_test ALTER COLUMN id SET DATA TYPE text;
+ALTER TABLE comment_test ALTER COLUMN positive_col SET DATA TYPE int;
+ALTER TABLE comment_test ALTER COLUMN positive_col SET DATA TYPE bigint;
+-- Check that the comments are intact.
+SELECT col_description('comment_test'::regclass, 1) as comment;
+ comment
+-----------------------------
+ Column 'id' on comment_test
+(1 row)
+
+SELECT indexrelid::regclass::text as index, obj_description(indexrelid, 'pg_class') as comment FROM pg_index where indrelid = 'comment_test'::regclass ORDER BY 1, 2;
+ index | comment
+--------------------+-----------------------------------------------
+ comment_test_index | Simple index on comment_test
+ comment_test_pk | Index backing the PRIMARY KEY of comment_test
+(2 rows)
+
+SELECT conname as constraint, obj_description(oid, 'pg_constraint') as comment FROM pg_constraint where conrelid = 'comment_test'::regclass ORDER BY 1, 2;
+ constraint | comment
+---------------------------------+-----------------------------------------------
+ comment_test_pk | PRIMARY KEY constraint of comment_test
+ comment_test_positive_col_check | CHECK constraint on comment_test.positive_col
+(2 rows)
+
+-- Check compatibility for foreign keys and comments. This is done
+-- separately as rebuilding the column type of the parent leads
+-- to an error and would reduce the test scope.
+CREATE TABLE comment_test_child ( + id text CONSTRAINT comment_test_child_fk REFERENCES comment_test); +CREATE INDEX comment_test_child_fk ON comment_test_child(id); +COMMENT ON COLUMN comment_test_child.id IS 'Column ''id'' on comment_test_child'; +COMMENT ON INDEX comment_test_child_fk IS 'Index backing the FOREIGN KEY of comment_test_child'; +COMMENT ON CONSTRAINT comment_test_child_fk ON comment_test_child IS 'FOREIGN KEY constraint of comment_test_child'; +-- Change column type of parent +ALTER TABLE comment_test ALTER COLUMN id SET DATA TYPE text; +ALTER TABLE comment_test ALTER COLUMN id SET DATA TYPE int USING id::integer; +ERROR: foreign key constraint "comment_test_child_fk" cannot be implemented +DETAIL: Key columns "id" and "id" are of incompatible types: text and integer. +-- Comments should be intact +SELECT col_description('comment_test_child'::regclass, 1) as comment; + comment +----------------------------------- + Column 'id' on comment_test_child +(1 row) + +SELECT indexrelid::regclass::text as index, obj_description(indexrelid, 'pg_class') as comment FROM pg_index where indrelid = 'comment_test_child'::regclass ORDER BY 1, 2; + index | comment +-----------------------+----------------------------------------------------- + comment_test_child_fk | Index backing the FOREIGN KEY of comment_test_child +(1 row) + +SELECT conname as constraint, obj_description(oid, 'pg_constraint') as comment FROM pg_constraint where conrelid = 'comment_test_child'::regclass ORDER BY 1, 2; + constraint | comment +-----------------------+---------------------------------------------- + comment_test_child_fk | FOREIGN KEY constraint of comment_test_child +(1 row) + +-- Check that we map relation oids to filenodes and back correctly. Only +-- display bad mappings so the test output doesn't change all the time. A +-- filenode function call can return NULL for a relation dropped concurrently +-- with the call's surrounding query, so ignore a NULL mapped_oid for +-- relations that no longer exist after all calls finish. +CREATE TEMP TABLE filenode_mapping AS +SELECT + oid, mapped_oid, reltablespace, relfilenode, relname +FROM pg_class, + pg_filenode_relation(reltablespace, pg_relation_filenode(oid)) AS mapped_oid +WHERE relkind IN ('r', 'i', 'S', 't', 'm') AND mapped_oid IS DISTINCT FROM oid; +SELECT m.* FROM filenode_mapping m LEFT JOIN pg_class c ON c.oid = m.oid +WHERE c.oid IS NOT NULL OR m.mapped_oid IS NOT NULL; + oid | mapped_oid | reltablespace | relfilenode | relname +-----+------------+---------------+-------------+--------- +(0 rows) + +-- Checks on creating and manipulation of user defined relations in +-- pg_catalog. +SHOW allow_system_table_mods; + allow_system_table_mods +------------------------- + off +(1 row) + +-- disallowed because of search_path issues with pg_dump +CREATE TABLE pg_catalog.new_system_table(); +ERROR: permission denied to create "pg_catalog.new_system_table" +DETAIL: System catalog modifications are currently disallowed. 
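+-- Aside (illustrative sketch, not part of the regression output): a
+-- superuser could lift this restriction with the allow_system_table_mods
+-- parameter, which is intended for development use only; the supported
+-- route is the SET SCHEMA workaround shown next. DDL/SET only, so no
+-- result rows are expected.
+set allow_system_table_mods = on;
+create table pg_catalog.demo_system_table();
+drop table pg_catalog.demo_system_table;
+reset allow_system_table_mods;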
+-- instead create in public first, move to catalog
+CREATE TABLE new_system_table(id serial primary key, othercol text);
+ALTER TABLE new_system_table SET SCHEMA pg_catalog;
+ALTER TABLE new_system_table SET SCHEMA public;
+ALTER TABLE new_system_table SET SCHEMA pg_catalog;
+-- will be ignored -- already there:
+ALTER TABLE new_system_table SET SCHEMA pg_catalog;
+ALTER TABLE new_system_table RENAME TO old_system_table;
+CREATE INDEX old_system_table__othercol ON old_system_table (othercol);
+INSERT INTO old_system_table(othercol) VALUES ('somedata'), ('otherdata');
+UPDATE old_system_table SET id = -id;
+DELETE FROM old_system_table WHERE othercol = 'somedata';
+TRUNCATE old_system_table;
+ALTER TABLE old_system_table DROP CONSTRAINT new_system_table_pkey;
+ALTER TABLE old_system_table DROP COLUMN othercol;
+DROP TABLE old_system_table;
+-- set logged
+CREATE UNLOGGED TABLE unlogged1(f1 SERIAL PRIMARY KEY, f2 TEXT);
+-- check relpersistence of an unlogged table
+SELECT relname, relkind, relpersistence FROM pg_class WHERE relname ~ '^unlogged1'
+UNION ALL
+SELECT 'toast table', t.relkind, t.relpersistence FROM pg_class r JOIN pg_class t ON t.oid = r.reltoastrelid WHERE r.relname ~ '^unlogged1'
+UNION ALL
+SELECT 'toast index', ri.relkind, ri.relpersistence FROM pg_class r join pg_class t ON t.oid = r.reltoastrelid JOIN pg_index i ON i.indrelid = t.oid JOIN pg_class ri ON ri.oid = i.indexrelid WHERE r.relname ~ '^unlogged1'
+ORDER BY relname;
+ relname | relkind | relpersistence
+------------------+---------+----------------
+ toast index | i | p
+ toast table | t | p
+ unlogged1 | r | p
+ unlogged1_f1_seq | S | p
+ unlogged1_pkey | i | p
+(5 rows)
+
+CREATE UNLOGGED TABLE unlogged2(f1 SERIAL PRIMARY KEY, f2 INTEGER REFERENCES unlogged1); -- foreign key
+CREATE UNLOGGED TABLE unlogged3(f1 SERIAL PRIMARY KEY, f2 INTEGER REFERENCES unlogged3); -- self-referencing foreign key
+ALTER TABLE unlogged3 SET LOGGED; -- skip self-referencing foreign key
+ALTER TABLE unlogged2 SET LOGGED; -- fails because a foreign key to an unlogged table exists
+ERROR: could not change table "unlogged2" to logged because it references unlogged table "unlogged1"
+ALTER TABLE unlogged1 SET LOGGED;
+-- check relpersistence of an unlogged table after changing to permanent
+SELECT relname, relkind, relpersistence FROM pg_class WHERE relname ~ '^unlogged1'
+UNION ALL
+SELECT 'toast table', t.relkind, t.relpersistence FROM pg_class r JOIN pg_class t ON t.oid = r.reltoastrelid WHERE r.relname ~ '^unlogged1'
+UNION ALL
+SELECT 'toast index', ri.relkind, ri.relpersistence FROM pg_class r join pg_class t ON t.oid = r.reltoastrelid JOIN pg_index i ON i.indrelid = t.oid JOIN pg_class ri ON ri.oid = i.indexrelid WHERE r.relname ~ '^unlogged1'
+ORDER BY relname;
+ relname | relkind | relpersistence
+------------------+---------+----------------
+ toast index | i | p
+ toast table | t | p
+ unlogged1 | r | p
+ unlogged1_f1_seq | S | p
+ unlogged1_pkey | i | p
+(5 rows)
+
+ALTER TABLE unlogged1 SET LOGGED; -- silently do nothing
+DROP TABLE unlogged3;
+DROP TABLE unlogged2;
+DROP TABLE unlogged1;
+-- set unlogged
+CREATE TABLE logged1(f1 SERIAL PRIMARY KEY, f2 TEXT);
+-- check relpersistence of a permanent table
+SELECT relname, relkind, relpersistence FROM pg_class WHERE relname ~ '^logged1'
+UNION ALL
+SELECT 'toast table', t.relkind, t.relpersistence FROM pg_class r JOIN pg_class t ON t.oid = r.reltoastrelid WHERE r.relname ~ '^logged1'
+UNION ALL
+SELECT 'toast index', ri.relkind, ri.relpersistence FROM pg_class r join pg_class t ON t.oid = r.reltoastrelid JOIN pg_index i ON i.indrelid = t.oid JOIN pg_class ri ON ri.oid =
i.indexrelid WHERE r.relname ~ '^logged1' +ORDER BY relname; + relname | relkind | relpersistence +----------------+---------+---------------- + logged1 | r | p + logged1_f1_seq | S | p + logged1_pkey | i | p + toast index | i | p + toast table | t | p +(5 rows) + +CREATE TABLE logged2(f1 SERIAL PRIMARY KEY, f2 INTEGER REFERENCES logged1); -- foreign key +CREATE TABLE logged3(f1 SERIAL PRIMARY KEY, f2 INTEGER REFERENCES logged3); -- self-referencing foreign key +ALTER TABLE logged1 SET UNLOGGED; -- fails because a foreign key from a permanent table exists +ERROR: could not change table "logged1" to unlogged because it references logged table "logged2" +ALTER TABLE logged3 SET UNLOGGED; -- skip self-referencing foreign key +ALTER TABLE logged2 SET UNLOGGED; +ALTER TABLE logged1 SET UNLOGGED; +-- check relpersistence of a permanent table after changing to unlogged +SELECT relname, relkind, relpersistence FROM pg_class WHERE relname ~ '^logged1' +UNION ALL +SELECT 'toast table', t.relkind, t.relpersistence FROM pg_class r JOIN pg_class t ON t.oid = r.reltoastrelid WHERE r.relname ~ '^logged1' +UNION ALL +SELECT 'toast index', ri.relkind, ri.relpersistence FROM pg_class r join pg_class t ON t.oid = r.reltoastrelid JOIN pg_index i ON i.indrelid = t.oid JOIN pg_class ri ON ri.oid = i.indexrelid WHERE r.relname ~ '^logged1' +ORDER BY relname; + relname | relkind | relpersistence +----------------+---------+---------------- + logged1 | r | u + logged1_f1_seq | S | p + logged1_pkey | i | u + toast index | i | u + toast table | t | u +(5 rows) + +ALTER TABLE logged1 SET UNLOGGED; -- silently do nothing +DROP TABLE logged3; +DROP TABLE logged2; +DROP TABLE logged1; +-- test ADD COLUMN IF NOT EXISTS +CREATE TABLE test_add_column(c1 integer); +\d test_add_column + Table "public.test_add_column" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + c1 | integer | | | + +ALTER TABLE test_add_column + ADD COLUMN c2 integer; +\d test_add_column + Table "public.test_add_column" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + c1 | integer | | | + c2 | integer | | | + +ALTER TABLE test_add_column + ADD COLUMN c2 integer; -- fail because c2 already exists +ERROR: column "c2" of relation "test_add_column" already exists +ALTER TABLE ONLY test_add_column + ADD COLUMN c2 integer; -- fail because c2 already exists +ERROR: column "c2" of relation "test_add_column" already exists +\d test_add_column + Table "public.test_add_column" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + c1 | integer | | | + c2 | integer | | | + +ALTER TABLE test_add_column + ADD COLUMN IF NOT EXISTS c2 integer; -- skipping because c2 already exists +NOTICE: column "c2" of relation "test_add_column" already exists, skipping +ALTER TABLE ONLY test_add_column + ADD COLUMN IF NOT EXISTS c2 integer; -- skipping because c2 already exists +NOTICE: column "c2" of relation "test_add_column" already exists, skipping +\d test_add_column + Table "public.test_add_column" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + c1 | integer | | | + c2 | integer | | | + +ALTER TABLE test_add_column + ADD COLUMN c2 integer, -- fail because c2 already exists + ADD COLUMN c3 integer primary key; +ERROR: column "c2" of relation "test_add_column" already exists +\d test_add_column + Table "public.test_add_column" + Column | Type | Collation | Nullable | 
Default +--------+---------+-----------+----------+--------- + c1 | integer | | | + c2 | integer | | | + +ALTER TABLE test_add_column + ADD COLUMN IF NOT EXISTS c2 integer, -- skipping because c2 already exists + ADD COLUMN c3 integer primary key; +NOTICE: column "c2" of relation "test_add_column" already exists, skipping +\d test_add_column + Table "public.test_add_column" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + c1 | integer | | | + c2 | integer | | | + c3 | integer | | not null | +Indexes: + "test_add_column_pkey" PRIMARY KEY, btree (c3) + +ALTER TABLE test_add_column + ADD COLUMN IF NOT EXISTS c2 integer, -- skipping because c2 already exists + ADD COLUMN IF NOT EXISTS c3 integer primary key; -- skipping because c3 already exists +NOTICE: column "c2" of relation "test_add_column" already exists, skipping +NOTICE: column "c3" of relation "test_add_column" already exists, skipping +\d test_add_column + Table "public.test_add_column" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + c1 | integer | | | + c2 | integer | | | + c3 | integer | | not null | +Indexes: + "test_add_column_pkey" PRIMARY KEY, btree (c3) + +ALTER TABLE test_add_column + ADD COLUMN IF NOT EXISTS c2 integer, -- skipping because c2 already exists + ADD COLUMN IF NOT EXISTS c3 integer, -- skipping because c3 already exists + ADD COLUMN c4 integer REFERENCES test_add_column; +NOTICE: column "c2" of relation "test_add_column" already exists, skipping +NOTICE: column "c3" of relation "test_add_column" already exists, skipping +\d test_add_column + Table "public.test_add_column" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + c1 | integer | | | + c2 | integer | | | + c3 | integer | | not null | + c4 | integer | | | +Indexes: + "test_add_column_pkey" PRIMARY KEY, btree (c3) +Foreign-key constraints: + "test_add_column_c4_fkey" FOREIGN KEY (c4) REFERENCES test_add_column(c3) +Referenced by: + TABLE "test_add_column" CONSTRAINT "test_add_column_c4_fkey" FOREIGN KEY (c4) REFERENCES test_add_column(c3) + +ALTER TABLE test_add_column + ADD COLUMN IF NOT EXISTS c4 integer REFERENCES test_add_column; +NOTICE: column "c4" of relation "test_add_column" already exists, skipping +\d test_add_column + Table "public.test_add_column" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + c1 | integer | | | + c2 | integer | | | + c3 | integer | | not null | + c4 | integer | | | +Indexes: + "test_add_column_pkey" PRIMARY KEY, btree (c3) +Foreign-key constraints: + "test_add_column_c4_fkey" FOREIGN KEY (c4) REFERENCES test_add_column(c3) +Referenced by: + TABLE "test_add_column" CONSTRAINT "test_add_column_c4_fkey" FOREIGN KEY (c4) REFERENCES test_add_column(c3) + +ALTER TABLE test_add_column + ADD COLUMN IF NOT EXISTS c5 SERIAL CHECK (c5 > 8); +\d test_add_column + Table "public.test_add_column" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------------------------------------------- + c1 | integer | | | + c2 | integer | | | + c3 | integer | | not null | + c4 | integer | | | + c5 | integer | | not null | nextval('test_add_column_c5_seq'::regclass) +Indexes: + "test_add_column_pkey" PRIMARY KEY, btree (c3) +Check constraints: + "test_add_column_c5_check" CHECK (c5 > 8) +Foreign-key constraints: + "test_add_column_c4_fkey" FOREIGN KEY (c4) REFERENCES test_add_column(c3) 
+Referenced by: + TABLE "test_add_column" CONSTRAINT "test_add_column_c4_fkey" FOREIGN KEY (c4) REFERENCES test_add_column(c3) + +ALTER TABLE test_add_column + ADD COLUMN IF NOT EXISTS c5 SERIAL CHECK (c5 > 10); +NOTICE: column "c5" of relation "test_add_column" already exists, skipping +\d test_add_column* + Table "public.test_add_column" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------------------------------------------- + c1 | integer | | | + c2 | integer | | | + c3 | integer | | not null | + c4 | integer | | | + c5 | integer | | not null | nextval('test_add_column_c5_seq'::regclass) +Indexes: + "test_add_column_pkey" PRIMARY KEY, btree (c3) +Check constraints: + "test_add_column_c5_check" CHECK (c5 > 8) +Foreign-key constraints: + "test_add_column_c4_fkey" FOREIGN KEY (c4) REFERENCES test_add_column(c3) +Referenced by: + TABLE "test_add_column" CONSTRAINT "test_add_column_c4_fkey" FOREIGN KEY (c4) REFERENCES test_add_column(c3) + + Sequence "public.test_add_column_c5_seq" + Type | Start | Minimum | Maximum | Increment | Cycles? | Cache +---------+-------+---------+------------+-----------+---------+------- + integer | 1 | 1 | 2147483647 | 1 | no | 1 +Owned by: public.test_add_column.c5 + + Index "public.test_add_column_pkey" + Column | Type | Key? | Definition +--------+---------+------+------------ + c3 | integer | yes | c3 +primary key, btree, for table "public.test_add_column" + +DROP TABLE test_add_column; +\d test_add_column* +-- assorted cases with multiple ALTER TABLE steps +CREATE TABLE ataddindex(f1 INT); +INSERT INTO ataddindex VALUES (42), (43); +CREATE UNIQUE INDEX ataddindexi0 ON ataddindex(f1); +ALTER TABLE ataddindex + ADD PRIMARY KEY USING INDEX ataddindexi0, + ALTER f1 TYPE BIGINT; +\d ataddindex + Table "public.ataddindex" + Column | Type | Collation | Nullable | Default +--------+--------+-----------+----------+--------- + f1 | bigint | | not null | +Indexes: + "ataddindexi0" PRIMARY KEY, btree (f1) + +DROP TABLE ataddindex; +CREATE TABLE ataddindex(f1 VARCHAR(10)); +INSERT INTO ataddindex(f1) VALUES ('foo'), ('a'); +ALTER TABLE ataddindex + ALTER f1 SET DATA TYPE TEXT, + ADD EXCLUDE ((f1 LIKE 'a') WITH =); +\d ataddindex + Table "public.ataddindex" + Column | Type | Collation | Nullable | Default +--------+------+-----------+----------+--------- + f1 | text | | | +Indexes: + "ataddindex_expr_excl" EXCLUDE USING btree ((f1 ~~ 'a'::text) WITH =) + +DROP TABLE ataddindex; +CREATE TABLE ataddindex(id int, ref_id int); +ALTER TABLE ataddindex + ADD PRIMARY KEY (id), + ADD FOREIGN KEY (ref_id) REFERENCES ataddindex; +\d ataddindex + Table "public.ataddindex" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + id | integer | | not null | + ref_id | integer | | | +Indexes: + "ataddindex_pkey" PRIMARY KEY, btree (id) +Foreign-key constraints: + "ataddindex_ref_id_fkey" FOREIGN KEY (ref_id) REFERENCES ataddindex(id) +Referenced by: + TABLE "ataddindex" CONSTRAINT "ataddindex_ref_id_fkey" FOREIGN KEY (ref_id) REFERENCES ataddindex(id) + +DROP TABLE ataddindex; +CREATE TABLE ataddindex(id int, ref_id int); +ALTER TABLE ataddindex + ADD UNIQUE (id), + ADD FOREIGN KEY (ref_id) REFERENCES ataddindex (id); +\d ataddindex + Table "public.ataddindex" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + id | integer | | | + ref_id | integer | | | +Indexes: + "ataddindex_id_key" UNIQUE CONSTRAINT, btree (id) +Foreign-key 
constraints: + "ataddindex_ref_id_fkey" FOREIGN KEY (ref_id) REFERENCES ataddindex(id) +Referenced by: + TABLE "ataddindex" CONSTRAINT "ataddindex_ref_id_fkey" FOREIGN KEY (ref_id) REFERENCES ataddindex(id) + +DROP TABLE ataddindex; +-- unsupported constraint types for partitioned tables +CREATE TABLE partitioned ( + a int, + b int +) PARTITION BY RANGE (a, (a+b+1)); +ALTER TABLE partitioned ADD EXCLUDE USING gist (a WITH &&); +ERROR: exclusion constraints are not supported on partitioned tables +LINE 1: ALTER TABLE partitioned ADD EXCLUDE USING gist (a WITH &&); + ^ +-- cannot drop column that is part of the partition key +ALTER TABLE partitioned DROP COLUMN a; +ERROR: cannot drop column "a" because it is part of the partition key of relation "partitioned" +ALTER TABLE partitioned ALTER COLUMN a TYPE char(5); +ERROR: cannot alter column "a" because it is part of the partition key of relation "partitioned" +ALTER TABLE partitioned DROP COLUMN b; +ERROR: cannot drop column "b" because it is part of the partition key of relation "partitioned" +ALTER TABLE partitioned ALTER COLUMN b TYPE char(5); +ERROR: cannot alter column "b" because it is part of the partition key of relation "partitioned" +-- partitioned table cannot participate in regular inheritance +CREATE TABLE nonpartitioned ( + a int, + b int +); +ALTER TABLE partitioned INHERIT nonpartitioned; +ERROR: cannot change inheritance of partitioned table +ALTER TABLE nonpartitioned INHERIT partitioned; +ERROR: cannot inherit from partitioned table "partitioned" +-- cannot add NO INHERIT constraint to partitioned tables +ALTER TABLE partitioned ADD CONSTRAINT chk_a CHECK (a > 0) NO INHERIT; +ERROR: cannot add NO INHERIT constraint to partitioned table "partitioned" +DROP TABLE partitioned, nonpartitioned; +-- +-- ATTACH PARTITION +-- +-- check that target table is partitioned +CREATE TABLE unparted ( + a int +); +CREATE TABLE fail_part (like unparted); +ALTER TABLE unparted ATTACH PARTITION fail_part FOR VALUES IN ('a'); +ERROR: table "unparted" is not partitioned +DROP TABLE unparted, fail_part; +-- check that partition bound is compatible +CREATE TABLE list_parted ( + a int NOT NULL, + b char(2) COLLATE "C", + CONSTRAINT check_a CHECK (a > 0) +) PARTITION BY LIST (a); +CREATE TABLE fail_part (LIKE list_parted); +ALTER TABLE list_parted ATTACH PARTITION fail_part FOR VALUES FROM (1) TO (10); +ERROR: invalid bound specification for a list partition +LINE 1: ...list_parted ATTACH PARTITION fail_part FOR VALUES FROM (1) T... 
+ ^ +DROP TABLE fail_part; +-- check that the table being attached exists +ALTER TABLE list_parted ATTACH PARTITION nonexistent FOR VALUES IN (1); +ERROR: relation "nonexistent" does not exist +-- check ownership of the source table +CREATE ROLE regress_test_me; +CREATE ROLE regress_test_not_me; +CREATE TABLE not_owned_by_me (LIKE list_parted); +ALTER TABLE not_owned_by_me OWNER TO regress_test_not_me; +SET SESSION AUTHORIZATION regress_test_me; +CREATE TABLE owned_by_me ( + a int +) PARTITION BY LIST (a); +ALTER TABLE owned_by_me ATTACH PARTITION not_owned_by_me FOR VALUES IN (1); +ERROR: must be owner of table not_owned_by_me +RESET SESSION AUTHORIZATION; +DROP TABLE owned_by_me, not_owned_by_me; +DROP ROLE regress_test_not_me; +DROP ROLE regress_test_me; +-- check that the table being attached is not part of regular inheritance +CREATE TABLE parent (LIKE list_parted); +CREATE TABLE child () INHERITS (parent); +ALTER TABLE list_parted ATTACH PARTITION child FOR VALUES IN (1); +ERROR: cannot attach inheritance child as partition +ALTER TABLE list_parted ATTACH PARTITION parent FOR VALUES IN (1); +ERROR: cannot attach inheritance parent as partition +DROP TABLE parent CASCADE; +NOTICE: drop cascades to table child +-- check any TEMP-ness +CREATE TEMP TABLE temp_parted (a int) PARTITION BY LIST (a); +CREATE TABLE perm_part (a int); +ALTER TABLE temp_parted ATTACH PARTITION perm_part FOR VALUES IN (1); +ERROR: cannot attach a permanent relation as partition of temporary relation "temp_parted" +DROP TABLE temp_parted, perm_part; +-- check that the table being attached is not a typed table +CREATE TYPE mytype AS (a int); +CREATE TABLE fail_part OF mytype; +ALTER TABLE list_parted ATTACH PARTITION fail_part FOR VALUES IN (1); +ERROR: cannot attach a typed table as partition +DROP TYPE mytype CASCADE; +NOTICE: drop cascades to table fail_part +-- check that the table being attached has only columns present in the parent +CREATE TABLE fail_part (like list_parted, c int); +ALTER TABLE list_parted ATTACH PARTITION fail_part FOR VALUES IN (1); +ERROR: table "fail_part" contains column "c" not found in parent "list_parted" +DETAIL: The new partition may contain only the columns present in parent. 
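+-- (illustrative aside, not part of the original test script: making the
+-- column set match the parent, and adding the parent's CHECK constraint,
+-- which plain LIKE does not copy, should let the ATTACH above succeed, e.g.)
+--   ALTER TABLE fail_part DROP COLUMN c;
+--   ALTER TABLE fail_part ADD CONSTRAINT check_a CHECK (a > 0);
+--   ALTER TABLE list_parted ATTACH PARTITION fail_part FOR VALUES IN (1);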
+DROP TABLE fail_part; +-- check that the table being attached has every column of the parent +CREATE TABLE fail_part (a int NOT NULL); +ALTER TABLE list_parted ATTACH PARTITION fail_part FOR VALUES IN (1); +ERROR: child table is missing column "b" +DROP TABLE fail_part; +-- check that columns match in type, collation and NOT NULL status +CREATE TABLE fail_part ( + b char(3), + a int NOT NULL +); +ALTER TABLE list_parted ATTACH PARTITION fail_part FOR VALUES IN (1); +ERROR: child table "fail_part" has different type for column "b" +ALTER TABLE fail_part ALTER b TYPE char (2) COLLATE "POSIX"; +ALTER TABLE list_parted ATTACH PARTITION fail_part FOR VALUES IN (1); +ERROR: child table "fail_part" has different collation for column "b" +DROP TABLE fail_part; +-- check that the table being attached has all constraints of the parent +CREATE TABLE fail_part ( + b char(2) COLLATE "C", + a int NOT NULL +); +ALTER TABLE list_parted ATTACH PARTITION fail_part FOR VALUES IN (1); +ERROR: child table is missing constraint "check_a" +-- check that the constraint matches in definition with parent's constraint +ALTER TABLE fail_part ADD CONSTRAINT check_a CHECK (a >= 0); +ALTER TABLE list_parted ATTACH PARTITION fail_part FOR VALUES IN (1); +ERROR: child table "fail_part" has different definition for check constraint "check_a" +DROP TABLE fail_part; +-- check the attributes and constraints after partition is attached +CREATE TABLE part_1 ( + a int NOT NULL, + b char(2) COLLATE "C", + CONSTRAINT check_a CHECK (a > 0) +); +ALTER TABLE list_parted ATTACH PARTITION part_1 FOR VALUES IN (1); +-- attislocal and conislocal are always false for merged attributes and constraints respectively. +SELECT attislocal, attinhcount FROM pg_attribute WHERE attrelid = 'part_1'::regclass AND attnum > 0; + attislocal | attinhcount +------------+------------- + f | 1 + f | 1 +(2 rows) + +SELECT conislocal, coninhcount FROM pg_constraint WHERE conrelid = 'part_1'::regclass AND conname = 'check_a'; + conislocal | coninhcount +------------+------------- + f | 1 +(1 row) + +-- check that the new partition won't overlap with an existing partition +CREATE TABLE fail_part (LIKE part_1 INCLUDING CONSTRAINTS); +ALTER TABLE list_parted ATTACH PARTITION fail_part FOR VALUES IN (1); +ERROR: partition "fail_part" would overlap partition "part_1" +LINE 1: ...LE list_parted ATTACH PARTITION fail_part FOR VALUES IN (1); + ^ +DROP TABLE fail_part; +-- check that an existing table can be attached as a default partition +CREATE TABLE def_part (LIKE list_parted INCLUDING CONSTRAINTS); +ALTER TABLE list_parted ATTACH PARTITION def_part DEFAULT; +-- check attaching default partition fails if a default partition already +-- exists +CREATE TABLE fail_def_part (LIKE part_1 INCLUDING CONSTRAINTS); +ALTER TABLE list_parted ATTACH PARTITION fail_def_part DEFAULT; +ERROR: partition "fail_def_part" conflicts with existing default partition "def_part" +LINE 1: ...ER TABLE list_parted ATTACH PARTITION fail_def_part DEFAULT; + ^ +-- check validation when attaching list partitions +CREATE TABLE list_parted2 ( + a int, + b char +) PARTITION BY LIST (a); +-- check that violating rows are correctly reported +CREATE TABLE part_2 (LIKE list_parted2); +INSERT INTO part_2 VALUES (3, 'a'); +ALTER TABLE list_parted2 ATTACH PARTITION part_2 FOR VALUES IN (2); +ERROR: partition constraint of relation "part_2" is violated by some row +-- should be ok after deleting the bad row +DELETE FROM part_2; +ALTER TABLE list_parted2 ATTACH PARTITION part_2 FOR VALUES IN (2); +-- 
check partition cannot be attached if default has some row for its values +CREATE TABLE list_parted2_def PARTITION OF list_parted2 DEFAULT; +INSERT INTO list_parted2_def VALUES (11, 'z'); +CREATE TABLE part_3 (LIKE list_parted2); +ALTER TABLE list_parted2 ATTACH PARTITION part_3 FOR VALUES IN (11); +ERROR: updated partition constraint for default partition "list_parted2_def" would be violated by some row +-- should be ok after deleting the bad row +DELETE FROM list_parted2_def WHERE a = 11; +ALTER TABLE list_parted2 ATTACH PARTITION part_3 FOR VALUES IN (11); +-- adding constraints that describe the desired partition constraint +-- (or more restrictive) will help skip the validation scan +CREATE TABLE part_3_4 ( + LIKE list_parted2, + CONSTRAINT check_a CHECK (a IN (3)) +); +-- however, if a list partition does not accept nulls, there should be +-- an explicit NOT NULL constraint on the partition key column for the +-- validation scan to be skipped; +ALTER TABLE list_parted2 ATTACH PARTITION part_3_4 FOR VALUES IN (3, 4); +-- adding a NOT NULL constraint will cause the scan to be skipped +ALTER TABLE list_parted2 DETACH PARTITION part_3_4; +ALTER TABLE part_3_4 ALTER a SET NOT NULL; +ALTER TABLE list_parted2 ATTACH PARTITION part_3_4 FOR VALUES IN (3, 4); +-- check if default partition scan skipped +ALTER TABLE list_parted2_def ADD CONSTRAINT check_a CHECK (a IN (5, 6)); +CREATE TABLE part_55_66 PARTITION OF list_parted2 FOR VALUES IN (55, 66); +-- check validation when attaching range partitions +CREATE TABLE range_parted ( + a int, + b int +) PARTITION BY RANGE (a, b); +-- check that violating rows are correctly reported +CREATE TABLE part1 ( + a int NOT NULL CHECK (a = 1), + b int NOT NULL CHECK (b >= 1 AND b <= 10) +); +INSERT INTO part1 VALUES (1, 10); +-- Remember the TO bound is exclusive +ALTER TABLE range_parted ATTACH PARTITION part1 FOR VALUES FROM (1, 1) TO (1, 10); +ERROR: partition constraint of relation "part1" is violated by some row +-- should be ok after deleting the bad row +DELETE FROM part1; +ALTER TABLE range_parted ATTACH PARTITION part1 FOR VALUES FROM (1, 1) TO (1, 10); +-- adding constraints that describe the desired partition constraint +-- (or more restrictive) will help skip the validation scan +CREATE TABLE part2 ( + a int NOT NULL CHECK (a = 1), + b int NOT NULL CHECK (b >= 10 AND b < 18) +); +ALTER TABLE range_parted ATTACH PARTITION part2 FOR VALUES FROM (1, 10) TO (1, 20); +-- Create default partition +CREATE TABLE partr_def1 PARTITION OF range_parted DEFAULT; +-- Only one default partition is allowed, hence, following should give error +CREATE TABLE partr_def2 (LIKE part1 INCLUDING CONSTRAINTS); +ALTER TABLE range_parted ATTACH PARTITION partr_def2 DEFAULT; +ERROR: partition "partr_def2" conflicts with existing default partition "partr_def1" +LINE 1: ...LTER TABLE range_parted ATTACH PARTITION partr_def2 DEFAULT; + ^ +-- Overlapping partitions cannot be attached, hence, following should give error +INSERT INTO partr_def1 VALUES (2, 10); +CREATE TABLE part3 (LIKE range_parted); +ALTER TABLE range_parted ATTACH partition part3 FOR VALUES FROM (2, 10) TO (2, 20); +ERROR: updated partition constraint for default partition "partr_def1" would be violated by some row +-- Attaching partitions should be successful when there are no overlapping rows +ALTER TABLE range_parted ATTACH partition part3 FOR VALUES FROM (3, 10) TO (3, 20); +-- check that leaf partitions are scanned when attaching a partitioned +-- table +CREATE TABLE part_5 ( + LIKE list_parted2 +) 
PARTITION BY LIST (b);
+-- check that violating rows are correctly reported
+CREATE TABLE part_5_a PARTITION OF part_5 FOR VALUES IN ('a');
+INSERT INTO part_5_a (a, b) VALUES (6, 'a');
+ALTER TABLE list_parted2 ATTACH PARTITION part_5 FOR VALUES IN (5);
+ERROR: partition constraint of relation "part_5_a" is violated by some row
+-- delete the faulting row and also add a constraint to skip the scan
+DELETE FROM part_5_a WHERE a NOT IN (3);
+ALTER TABLE part_5 ADD CONSTRAINT check_a CHECK (a IS NOT NULL AND a = 5);
+ALTER TABLE list_parted2 ATTACH PARTITION part_5 FOR VALUES IN (5);
+ALTER TABLE list_parted2 DETACH PARTITION part_5;
+ALTER TABLE part_5 DROP CONSTRAINT check_a;
+-- scan should again be skipped, even though NOT NULL is now a column property
+ALTER TABLE part_5 ADD CONSTRAINT check_a CHECK (a IN (5)), ALTER a SET NOT NULL;
+ALTER TABLE list_parted2 ATTACH PARTITION part_5 FOR VALUES IN (5);
+-- Check the case where attnos of the partitioning columns in the table being
+-- attached differ from the parent's. It should not affect the constraint-
+-- checking logic that allows the scan to be skipped.
+CREATE TABLE part_6 (
+ c int,
+ LIKE list_parted2,
+ CONSTRAINT check_a CHECK (a IS NOT NULL AND a = 6)
+);
+ALTER TABLE part_6 DROP c;
+ALTER TABLE list_parted2 ATTACH PARTITION part_6 FOR VALUES IN (6);
+-- Similar to above, but the table being attached is a partitioned table
+-- whose partition has still different attnos for the root partitioning
+-- columns.
+CREATE TABLE part_7 (
+ LIKE list_parted2,
+ CONSTRAINT check_a CHECK (a IS NOT NULL AND a = 7)
+) PARTITION BY LIST (b);
+CREATE TABLE part_7_a_null (
+ c int,
+ d int,
+ e int,
+ LIKE list_parted2, -- 'a' will have attnum = 4
+ CONSTRAINT check_b CHECK (b IS NULL OR b = 'a'),
+ CONSTRAINT check_a CHECK (a IS NOT NULL AND a = 7)
+);
+ALTER TABLE part_7_a_null DROP c, DROP d, DROP e;
+ALTER TABLE part_7 ATTACH PARTITION part_7_a_null FOR VALUES IN ('a', null);
+ALTER TABLE list_parted2 ATTACH PARTITION part_7 FOR VALUES IN (7);
+-- Same example, but check this time that the constraint correctly detects
+-- violating rows
+ALTER TABLE list_parted2 DETACH PARTITION part_7;
+ALTER TABLE part_7 DROP CONSTRAINT check_a; -- thus, scan won't be skipped
+INSERT INTO part_7 (a, b) VALUES (8, null), (9, 'a');
+SELECT tableoid::regclass, a, b FROM part_7 order by a;
+   tableoid    | a | b
+---------------+---+---
+ part_7_a_null | 8 |
+ part_7_a_null | 9 | a
+(2 rows)
+
+ALTER TABLE list_parted2 ATTACH PARTITION part_7 FOR VALUES IN (7);
+ERROR: partition constraint of relation "part_7_a_null" is violated by some row
+-- check that leaf partitions of default partition are scanned when
+-- attaching a partitioned table.
+ALTER TABLE part_5 DROP CONSTRAINT check_a; +CREATE TABLE part5_def PARTITION OF part_5 DEFAULT PARTITION BY LIST(a); +CREATE TABLE part5_def_p1 PARTITION OF part5_def FOR VALUES IN (5); +INSERT INTO part5_def_p1 VALUES (5, 'y'); +CREATE TABLE part5_p1 (LIKE part_5); +ALTER TABLE part_5 ATTACH PARTITION part5_p1 FOR VALUES IN ('y'); +ERROR: updated partition constraint for default partition "part5_def_p1" would be violated by some row +-- should be ok after deleting the bad row +DELETE FROM part5_def_p1 WHERE b = 'y'; +ALTER TABLE part_5 ATTACH PARTITION part5_p1 FOR VALUES IN ('y'); +-- check that the table being attached is not already a partition +ALTER TABLE list_parted2 ATTACH PARTITION part_2 FOR VALUES IN (2); +ERROR: "part_2" is already a partition +-- check that circular inheritance is not allowed +ALTER TABLE part_5 ATTACH PARTITION list_parted2 FOR VALUES IN ('b'); +ERROR: circular inheritance not allowed +DETAIL: "part_5" is already a child of "list_parted2". +ALTER TABLE list_parted2 ATTACH PARTITION list_parted2 FOR VALUES IN (0); +ERROR: circular inheritance not allowed +DETAIL: "list_parted2" is already a child of "list_parted2". +-- If a partitioned table being created or an existing table being attached +-- as a partition does not have a constraint that would allow validation scan +-- to be skipped, but an individual partition does, then the partition's +-- validation scan is skipped. +CREATE TABLE quuux (a int, b text) PARTITION BY LIST (a); +CREATE TABLE quuux_default PARTITION OF quuux DEFAULT PARTITION BY LIST (b); +CREATE TABLE quuux_default1 PARTITION OF quuux_default ( + CONSTRAINT check_1 CHECK (a IS NOT NULL AND a = 1) +) FOR VALUES IN ('b'); +CREATE TABLE quuux1 (a int, b text); +ALTER TABLE quuux ATTACH PARTITION quuux1 FOR VALUES IN (1); -- validate! +CREATE TABLE quuux2 (a int, b text); +ALTER TABLE quuux ATTACH PARTITION quuux2 FOR VALUES IN (2); -- skip validation +DROP TABLE quuux1, quuux2; +-- should validate for quuux1, but not for quuux2 +CREATE TABLE quuux1 PARTITION OF quuux FOR VALUES IN (1); +CREATE TABLE quuux2 PARTITION OF quuux FOR VALUES IN (2); +DROP TABLE quuux; +-- check validation when attaching hash partitions +-- Use hand-rolled hash functions and operator class to get predictable result +-- on different machines. part_test_int4_ops is defined in insert.sql. +-- check that the new partition won't overlap with an existing partition +CREATE TABLE hash_parted ( + a int, + b int +) PARTITION BY HASH (a part_test_int4_ops); +CREATE TABLE hpart_1 PARTITION OF hash_parted FOR VALUES WITH (MODULUS 4, REMAINDER 0); +CREATE TABLE fail_part (LIKE hpart_1); +ALTER TABLE hash_parted ATTACH PARTITION fail_part FOR VALUES WITH (MODULUS 8, REMAINDER 4); +ERROR: partition "fail_part" would overlap partition "hpart_1" +LINE 1: ...hash_parted ATTACH PARTITION fail_part FOR VALUES WITH (MODU... + ^ +ALTER TABLE hash_parted ATTACH PARTITION fail_part FOR VALUES WITH (MODULUS 8, REMAINDER 0); +ERROR: partition "fail_part" would overlap partition "hpart_1" +LINE 1: ...hash_parted ATTACH PARTITION fail_part FOR VALUES WITH (MODU... 
+                                                                ^
+DROP TABLE fail_part;
+-- check validation when attaching hash partitions
+-- check that violating rows are correctly reported
+CREATE TABLE hpart_2 (LIKE hash_parted);
+INSERT INTO hpart_2 VALUES (3, 0);
+ALTER TABLE hash_parted ATTACH PARTITION hpart_2 FOR VALUES WITH (MODULUS 4, REMAINDER 1);
+ERROR: partition constraint of relation "hpart_2" is violated by some row
+-- should be ok after deleting the bad row
+DELETE FROM hpart_2;
+ALTER TABLE hash_parted ATTACH PARTITION hpart_2 FOR VALUES WITH (MODULUS 4, REMAINDER 1);
+-- check that leaf partitions are scanned when attaching a partitioned
+-- table
+CREATE TABLE hpart_5 (
+ LIKE hash_parted
+) PARTITION BY LIST (b);
+-- check that violating rows are correctly reported
+CREATE TABLE hpart_5_a PARTITION OF hpart_5 FOR VALUES IN ('1', '2', '3');
+INSERT INTO hpart_5_a (a, b) VALUES (7, 1);
+ALTER TABLE hash_parted ATTACH PARTITION hpart_5 FOR VALUES WITH (MODULUS 4, REMAINDER 2);
+ERROR: partition constraint of relation "hpart_5_a" is violated by some row
+-- should be ok after deleting the bad row
+DELETE FROM hpart_5_a;
+ALTER TABLE hash_parted ATTACH PARTITION hpart_5 FOR VALUES WITH (MODULUS 4, REMAINDER 2);
+-- check that the table being attached has a valid modulus and remainder value
+CREATE TABLE fail_part(LIKE hash_parted);
+ALTER TABLE hash_parted ATTACH PARTITION fail_part FOR VALUES WITH (MODULUS 0, REMAINDER 1);
+ERROR: modulus for hash partition must be a positive integer
+ALTER TABLE hash_parted ATTACH PARTITION fail_part FOR VALUES WITH (MODULUS 8, REMAINDER 8);
+ERROR: remainder for hash partition must be less than modulus
+ALTER TABLE hash_parted ATTACH PARTITION fail_part FOR VALUES WITH (MODULUS 3, REMAINDER 2);
+ERROR: every hash partition modulus must be a factor of the next larger modulus
+DETAIL: The new modulus 3 is not a factor of 4, the modulus of existing partition "hpart_1".
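+-- (illustrative aside: every modulus must be a factor of the next larger one,
+-- so with all existing partitions at modulus 4, a new partition could use
+-- modulus 8 in the still-unassigned remainder-3 slot, e.g.)
+--   ALTER TABLE hash_parted ATTACH PARTITION fail_part FOR VALUES WITH (MODULUS 8, REMAINDER 3);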
+DROP TABLE fail_part;
+--
+-- DETACH PARTITION
+--
+-- check that the table is partitioned at all
+CREATE TABLE regular_table (a int);
+ALTER TABLE regular_table DETACH PARTITION any_name;
+ERROR: table "regular_table" is not partitioned
+DROP TABLE regular_table;
+-- check that the partition being detached exists at all
+ALTER TABLE list_parted2 DETACH PARTITION part_4;
+ERROR: relation "part_4" does not exist
+ALTER TABLE hash_parted DETACH PARTITION hpart_4;
+ERROR: relation "hpart_4" does not exist
+-- check that the partition being detached is actually a partition of the parent
+CREATE TABLE not_a_part (a int);
+ALTER TABLE list_parted2 DETACH PARTITION not_a_part;
+ERROR: relation "not_a_part" is not a partition of relation "list_parted2"
+ALTER TABLE list_parted2 DETACH PARTITION part_1;
+ERROR: relation "part_1" is not a partition of relation "list_parted2"
+ALTER TABLE hash_parted DETACH PARTITION not_a_part;
+ERROR: relation "not_a_part" is not a partition of relation "hash_parted"
+DROP TABLE not_a_part;
+-- check that, after being detached, attinhcount/coninhcount is dropped to 0 and
+-- attislocal/conislocal is set to true
+ALTER TABLE list_parted2 DETACH PARTITION part_3_4;
+SELECT attinhcount, attislocal FROM pg_attribute WHERE attrelid = 'part_3_4'::regclass AND attnum > 0;
+ attinhcount | attislocal
+-------------+------------
+           0 | t
+           0 | t
+(2 rows)
+
+SELECT coninhcount, conislocal FROM pg_constraint WHERE conrelid = 'part_3_4'::regclass AND conname = 'check_a';
+ coninhcount | conislocal
+-------------+------------
+           0 | t
+(1 row)
+
+DROP TABLE part_3_4;
+-- check that a detached partition is not dropped on dropping a partitioned table
+CREATE TABLE range_parted2 (
+ a int
+) PARTITION BY RANGE(a);
+CREATE TABLE part_rp PARTITION OF range_parted2 FOR VALUES FROM (0) to (100);
+ALTER TABLE range_parted2 DETACH PARTITION part_rp;
+DROP TABLE range_parted2;
+SELECT * from part_rp;
+ a
+---
+(0 rows)
+
+DROP TABLE part_rp;
+-- concurrent detach
+CREATE TABLE range_parted2 (
+ a int
+) PARTITION BY RANGE(a);
+CREATE TABLE part_rp PARTITION OF range_parted2 FOR VALUES FROM (0) to (100);
+BEGIN;
+-- doesn't work in a transaction block
+ALTER TABLE range_parted2 DETACH PARTITION part_rp CONCURRENTLY;
+ERROR: ALTER TABLE ...
DETACH CONCURRENTLY cannot run inside a transaction block +COMMIT; +CREATE TABLE part_rpd PARTITION OF range_parted2 DEFAULT; +-- doesn't work if there's a default partition +ALTER TABLE range_parted2 DETACH PARTITION part_rp CONCURRENTLY; +ERROR: cannot detach partitions concurrently when a default partition exists +-- doesn't work for the default partition +ALTER TABLE range_parted2 DETACH PARTITION part_rpd CONCURRENTLY; +ERROR: cannot detach partitions concurrently when a default partition exists +DROP TABLE part_rpd; +-- works fine +ALTER TABLE range_parted2 DETACH PARTITION part_rp CONCURRENTLY; +\d+ range_parted2 + Partitioned table "public.range_parted2" + Column | Type | Collation | Nullable | Default | Storage | Stats target | Description +--------+---------+-----------+----------+---------+---------+--------------+------------- + a | integer | | | | plain | | +Partition key: RANGE (a) +Number of partitions: 0 + +-- constraint should be created +\d part_rp + Table "public.part_rp" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + a | integer | | | +Check constraints: + "part_rp_a_check" CHECK (a IS NOT NULL AND a >= 0 AND a < 100) + +CREATE TABLE part_rp100 PARTITION OF range_parted2 (CHECK (a>=123 AND a<133 AND a IS NOT NULL)) FOR VALUES FROM (100) to (200); +ALTER TABLE range_parted2 DETACH PARTITION part_rp100 CONCURRENTLY; +-- redundant constraint should not be created +\d part_rp100 + Table "public.part_rp100" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + a | integer | | | +Check constraints: + "part_rp100_a_check" CHECK (a >= 123 AND a < 133 AND a IS NOT NULL) + +DROP TABLE range_parted2; +-- Check ALTER TABLE commands for partitioned tables and partitions +-- cannot add/drop column to/from *only* the parent +ALTER TABLE ONLY list_parted2 ADD COLUMN c int; +ERROR: column must be added to child tables too +ALTER TABLE ONLY list_parted2 DROP COLUMN b; +ERROR: cannot drop column from only the partitioned table when partitions exist +HINT: Do not specify the ONLY keyword. +-- cannot add a column to partition or drop an inherited one +ALTER TABLE part_2 ADD COLUMN c text; +ERROR: cannot add column to a partition +ALTER TABLE part_2 DROP COLUMN b; +ERROR: cannot drop inherited column "b" +-- Nor rename, alter type +ALTER TABLE part_2 RENAME COLUMN b to c; +ERROR: cannot rename inherited column "b" +ALTER TABLE part_2 ALTER COLUMN b TYPE text; +ERROR: cannot alter inherited column "b" +-- cannot add/drop NOT NULL or check constraints to *only* the parent, when +-- partitions exist +ALTER TABLE ONLY list_parted2 ALTER b SET NOT NULL; +ERROR: constraint must be added to child tables too +DETAIL: Column "b" of relation "part_2" is not already NOT NULL. +HINT: Do not specify the ONLY keyword. +ALTER TABLE ONLY list_parted2 ADD CONSTRAINT check_b CHECK (b <> 'zz'); +ERROR: constraint must be added to child tables too +ALTER TABLE list_parted2 ALTER b SET NOT NULL; +ALTER TABLE ONLY list_parted2 ALTER b DROP NOT NULL; +ERROR: cannot remove constraint from only the partitioned table when partitions exist +HINT: Do not specify the ONLY keyword. +ALTER TABLE list_parted2 ADD CONSTRAINT check_b CHECK (b <> 'zz'); +ALTER TABLE ONLY list_parted2 DROP CONSTRAINT check_b; +ERROR: cannot remove constraint from only the partitioned table when partitions exist +HINT: Do not specify the ONLY keyword. 
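+-- (illustrative aside: without ONLY the command recurses to the partitions,
+-- so the equivalent recursive form would be accepted, e.g.)
+--   ALTER TABLE list_parted2 DROP CONSTRAINT check_b;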
+-- It's alright though, if no partitions are yet created +CREATE TABLE parted_no_parts (a int) PARTITION BY LIST (a); +ALTER TABLE ONLY parted_no_parts ALTER a SET NOT NULL; +ALTER TABLE ONLY parted_no_parts ADD CONSTRAINT check_a CHECK (a > 0); +ALTER TABLE ONLY parted_no_parts ALTER a DROP NOT NULL; +ALTER TABLE ONLY parted_no_parts DROP CONSTRAINT check_a; +DROP TABLE parted_no_parts; +-- cannot drop inherited NOT NULL or check constraints from partition +ALTER TABLE list_parted2 ALTER b SET NOT NULL, ADD CONSTRAINT check_a2 CHECK (a > 0); +ALTER TABLE part_2 ALTER b DROP NOT NULL; +ERROR: column "b" is marked NOT NULL in parent table +ALTER TABLE part_2 DROP CONSTRAINT check_a2; +ERROR: cannot drop inherited constraint "check_a2" of relation "part_2" +-- Doesn't make sense to add NO INHERIT constraints on partitioned tables +ALTER TABLE list_parted2 add constraint check_b2 check (b <> 'zz') NO INHERIT; +ERROR: cannot add NO INHERIT constraint to partitioned table "list_parted2" +-- check that a partition cannot participate in regular inheritance +CREATE TABLE inh_test () INHERITS (part_2); +ERROR: cannot inherit from partition "part_2" +CREATE TABLE inh_test (LIKE part_2); +ALTER TABLE inh_test INHERIT part_2; +ERROR: cannot inherit from a partition +ALTER TABLE part_2 INHERIT inh_test; +ERROR: cannot change inheritance of a partition +-- cannot drop or alter type of partition key columns of lower level +-- partitioned tables; for example, part_5, which is list_parted2's +-- partition, is partitioned on b; +ALTER TABLE list_parted2 DROP COLUMN b; +ERROR: cannot drop column "b" because it is part of the partition key of relation "part_5" +ALTER TABLE list_parted2 ALTER COLUMN b TYPE text; +ERROR: cannot alter column "b" because it is part of the partition key of relation "part_5" +-- dropping non-partition key columns should be allowed on the parent table. 
+ALTER TABLE list_parted DROP COLUMN b; +SELECT * FROM list_parted; + a +--- +(0 rows) + +-- cleanup +DROP TABLE list_parted, list_parted2, range_parted; +DROP TABLE fail_def_part; +DROP TABLE hash_parted; +-- more tests for certain multi-level partitioning scenarios +create table p (a int, b int) partition by range (a, b); +create table p1 (b int, a int not null) partition by range (b); +create table p11 (like p1); +alter table p11 drop a; +alter table p11 add a int; +alter table p11 drop a; +alter table p11 add a int not null; +-- attnum for key attribute 'a' is different in p, p1, and p11 +select attrelid::regclass, attname, attnum +from pg_attribute +where attname = 'a' + and (attrelid = 'p'::regclass + or attrelid = 'p1'::regclass + or attrelid = 'p11'::regclass) +order by attrelid::regclass::text; + attrelid | attname | attnum +----------+---------+-------- + p | a | 1 + p1 | a | 2 + p11 | a | 4 +(3 rows) + +alter table p1 attach partition p11 for values from (2) to (5); +insert into p1 (a, b) values (2, 3); +-- check that partition validation scan correctly detects violating rows +alter table p attach partition p1 for values from (1, 2) to (1, 10); +ERROR: partition constraint of relation "p11" is violated by some row +-- cleanup +drop table p; +drop table p1; +-- validate constraint on partitioned tables should only scan leaf partitions +create table parted_validate_test (a int) partition by list (a); +create table parted_validate_test_1 partition of parted_validate_test for values in (0, 1); +alter table parted_validate_test add constraint parted_validate_test_chka check (a > 0) not valid; +alter table parted_validate_test validate constraint parted_validate_test_chka; +drop table parted_validate_test; +-- test alter column options +CREATE TABLE attmp(i integer); +INSERT INTO attmp VALUES (1); +ALTER TABLE attmp ALTER COLUMN i SET (n_distinct = 1, n_distinct_inherited = 2); +ALTER TABLE attmp ALTER COLUMN i RESET (n_distinct_inherited); +ANALYZE attmp; +DROP TABLE attmp; +DROP USER regress_alter_table_user1; +-- check that violating rows are correctly reported when attaching as the +-- default partition +create table defpart_attach_test (a int) partition by list (a); +create table defpart_attach_test1 partition of defpart_attach_test for values in (1); +create table defpart_attach_test_d (b int, a int); +alter table defpart_attach_test_d drop b; +insert into defpart_attach_test_d values (1), (2); +-- error because its constraint as the default partition would be violated +-- by the row containing 1 +alter table defpart_attach_test attach partition defpart_attach_test_d default; +ERROR: partition constraint of relation "defpart_attach_test_d" is violated by some row +delete from defpart_attach_test_d where a = 1; +alter table defpart_attach_test_d add check (a > 1); +-- should be attached successfully and without needing to be scanned +alter table defpart_attach_test attach partition defpart_attach_test_d default; +-- check that attaching a partition correctly reports any rows in the default +-- partition that should not be there for the new partition to be attached +-- successfully +create table defpart_attach_test_2 (like defpart_attach_test_d); +alter table defpart_attach_test attach partition defpart_attach_test_2 for values in (2); +ERROR: updated partition constraint for default partition "defpart_attach_test_d" would be violated by some row +drop table defpart_attach_test; +-- check combinations of temporary and permanent relations when attaching +-- partitions. 
+create table perm_part_parent (a int) partition by list (a); +create temp table temp_part_parent (a int) partition by list (a); +create table perm_part_child (a int); +create temp table temp_part_child (a int); +alter table temp_part_parent attach partition perm_part_child default; -- error +ERROR: cannot attach a permanent relation as partition of temporary relation "temp_part_parent" +alter table perm_part_parent attach partition temp_part_child default; -- error +ERROR: cannot attach a temporary relation as partition of permanent relation "perm_part_parent" +alter table temp_part_parent attach partition temp_part_child default; -- ok +drop table perm_part_parent cascade; +drop table temp_part_parent cascade; +-- check that attaching partitions to a table while it is being used is +-- prevented +create table tab_part_attach (a int) partition by list (a); +create or replace function func_part_attach() returns trigger + language plpgsql as $$ + begin + execute 'create table tab_part_attach_1 (a int)'; + execute 'alter table tab_part_attach attach partition tab_part_attach_1 for values in (1)'; + return null; + end $$; +create trigger trig_part_attach before insert on tab_part_attach + for each statement execute procedure func_part_attach(); +insert into tab_part_attach values (1); +ERROR: cannot ALTER TABLE "tab_part_attach" because it is being used by active queries in this session +CONTEXT: SQL statement "alter table tab_part_attach attach partition tab_part_attach_1 for values in (1)" +PL/pgSQL function func_part_attach() line 4 at EXECUTE +drop table tab_part_attach; +drop function func_part_attach(); +-- test case where the partitioning operator is a SQL function whose +-- evaluation results in the table's relcache being rebuilt partway through +-- the execution of an ATTACH PARTITION command +create function at_test_sql_partop (int4, int4) returns int language sql +as $$ select case when $1 = $2 then 0 when $1 > $2 then 1 else -1 end; $$; +create operator class at_test_sql_partop for type int4 using btree as + operator 1 < (int4, int4), operator 2 <= (int4, int4), + operator 3 = (int4, int4), operator 4 >= (int4, int4), + operator 5 > (int4, int4), function 1 at_test_sql_partop(int4, int4); +create table at_test_sql_partop (a int) partition by range (a at_test_sql_partop); +create table at_test_sql_partop_1 (a int); +alter table at_test_sql_partop attach partition at_test_sql_partop_1 for values from (0) to (10); +drop table at_test_sql_partop; +drop operator class at_test_sql_partop using btree; +drop function at_test_sql_partop; +/* Test case for bug #16242 */ +-- We create a parent and child where the child has missing +-- non-null attribute values, and arrange to pass them through +-- tuple conversion from the child to the parent tupdesc +create table bar1 (a integer, b integer not null default 1) + partition by range (a); +create table bar2 (a integer); +insert into bar2 values (1); +alter table bar2 add column b integer not null default 1; +-- (at this point bar2 contains tuple with natts=1) +alter table bar1 attach partition bar2 default; +-- this works: +select * from bar1; + a | b +---+--- + 1 | 1 +(1 row) + +-- this exercises tuple conversion: +create function xtrig() + returns trigger language plpgsql +as $$ + declare + r record; + begin + for r in select * from old loop + raise info 'a=%, b=%', r.a, r.b; + end loop; + return NULL; + end; +$$; +create trigger xtrig + after update on bar1 + referencing old table as old + for each statement execute procedure xtrig(); 
+update bar1 set a = a + 1; +INFO: a=1, b=1 +/* End test case for bug #16242 */ +-- Test that ALTER TABLE rewrite preserves a clustered index +-- for normal indexes and indexes on constraints. +create table alttype_cluster (a int); +alter table alttype_cluster add primary key (a); +create index alttype_cluster_ind on alttype_cluster (a); +alter table alttype_cluster cluster on alttype_cluster_ind; +-- Normal index remains clustered. +select indexrelid::regclass, indisclustered from pg_index + where indrelid = 'alttype_cluster'::regclass + order by indexrelid::regclass::text; + indexrelid | indisclustered +----------------------+---------------- + alttype_cluster_ind | t + alttype_cluster_pkey | f +(2 rows) + +alter table alttype_cluster alter a type bigint; +select indexrelid::regclass, indisclustered from pg_index + where indrelid = 'alttype_cluster'::regclass + order by indexrelid::regclass::text; + indexrelid | indisclustered +----------------------+---------------- + alttype_cluster_ind | t + alttype_cluster_pkey | f +(2 rows) + +-- Constraint index remains clustered. +alter table alttype_cluster cluster on alttype_cluster_pkey; +select indexrelid::regclass, indisclustered from pg_index + where indrelid = 'alttype_cluster'::regclass + order by indexrelid::regclass::text; + indexrelid | indisclustered +----------------------+---------------- + alttype_cluster_ind | f + alttype_cluster_pkey | t +(2 rows) + +alter table alttype_cluster alter a type int; +select indexrelid::regclass, indisclustered from pg_index + where indrelid = 'alttype_cluster'::regclass + order by indexrelid::regclass::text; + indexrelid | indisclustered +----------------------+---------------- + alttype_cluster_ind | f + alttype_cluster_pkey | t +(2 rows) + +drop table alttype_cluster; diff --git a/src/test/regress/expected/create_role.out b/src/test/regress/expected/create_role.out index 4e67d727603..5c42c333dcc 100644 --- a/src/test/regress/expected/create_role.out +++ b/src/test/regress/expected/create_role.out @@ -24,7 +24,8 @@ CREATE ROLE regress_noiseword SYSID 12345; NOTICE: SYSID can no longer be specified -- fail, cannot grant membership in superuser role CREATE ROLE regress_nosuch_super IN ROLE regress_role_super; -ERROR: must be superuser to alter superusers +ERROR: permission denied to grant role "regress_role_super" +DETAIL: Only roles with the SUPERUSER attribute may grant roles with the SUPERUSER attribute. 
 -- fail, database owner cannot have members
 CREATE ROLE regress_nosuch_dbowner IN ROLE pg_database_owner;
 ERROR: role "pg_database_owner" cannot have explicit members
@@ -61,29 +62,44 @@ REVOKE ALL PRIVILEGES ON tenant_table FROM PUBLIC;
 -- fail, these objects belong to regress_tenant
 SET SESSION AUTHORIZATION regress_createrole;
 DROP INDEX tenant_idx;
-ERROR: must be owner of index tenant_idx
 ALTER TABLE tenant_table ADD COLUMN t text;
-ERROR: must be owner of table tenant_table
 DROP TABLE tenant_table;
-ERROR: must be owner of table tenant_table
 ALTER VIEW tenant_view OWNER TO regress_role_admin;
-ERROR: must be owner of view tenant_view
+ERROR: must be member of role "regress_role_admin"
 DROP VIEW tenant_view;
-ERROR: must be owner of view tenant_view
 -- fail, cannot take ownership of these objects from regress_tenant
 REASSIGN OWNED BY regress_tenant TO regress_createrole;
-ERROR: permission denied to reassign objects
 -- ok, having CREATEROLE is enough to create roles in privileged roles
 CREATE ROLE regress_read_all_data IN ROLE pg_read_all_data;
+ERROR: permission denied to grant role "pg_read_all_data"
+DETAIL: Only roles with the ADMIN option on role "pg_read_all_data" may grant this role.
 CREATE ROLE regress_write_all_data IN ROLE pg_write_all_data;
+ERROR: permission denied to grant role "pg_write_all_data"
+DETAIL: Only roles with the ADMIN option on role "pg_write_all_data" may grant this role.
 CREATE ROLE regress_monitor IN ROLE pg_monitor;
+ERROR: permission denied to grant role "pg_monitor"
+DETAIL: Only roles with the ADMIN option on role "pg_monitor" may grant this role.
 CREATE ROLE regress_read_all_settings IN ROLE pg_read_all_settings;
+ERROR: permission denied to grant role "pg_read_all_settings"
+DETAIL: Only roles with the ADMIN option on role "pg_read_all_settings" may grant this role.
 CREATE ROLE regress_read_all_stats IN ROLE pg_read_all_stats;
+ERROR: permission denied to grant role "pg_read_all_stats"
+DETAIL: Only roles with the ADMIN option on role "pg_read_all_stats" may grant this role.
 CREATE ROLE regress_stat_scan_tables IN ROLE pg_stat_scan_tables;
+ERROR: permission denied to grant role "pg_stat_scan_tables"
+DETAIL: Only roles with the ADMIN option on role "pg_stat_scan_tables" may grant this role.
 CREATE ROLE regress_read_server_files IN ROLE pg_read_server_files;
+ERROR: permission denied to grant role "pg_read_server_files"
+DETAIL: Only roles with the ADMIN option on role "pg_read_server_files" may grant this role.
 CREATE ROLE regress_write_server_files IN ROLE pg_write_server_files;
+ERROR: permission denied to grant role "pg_write_server_files"
+DETAIL: Only roles with the ADMIN option on role "pg_write_server_files" may grant this role.
 CREATE ROLE regress_execute_server_program IN ROLE pg_execute_server_program;
+ERROR: permission denied to grant role "pg_execute_server_program"
+DETAIL: Only roles with the ADMIN option on role "pg_execute_server_program" may grant this role.
 CREATE ROLE regress_signal_backend IN ROLE pg_signal_backend;
+ERROR: permission denied to grant role "pg_signal_backend"
+DETAIL: Only roles with the ADMIN option on role "pg_signal_backend" may grant this role.
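+-- (illustrative aside: per the DETAIL messages above, these grants would be
+-- possible if a superuser first gave the creating role the ADMIN option, e.g.)
+--   GRANT pg_read_all_data TO regress_createrole WITH ADMIN OPTION;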
-- fail, creation of these roles failed above so they do not now exist SET SESSION AUTHORIZATION regress_role_admin; DROP ROLE regress_nosuch_superuser; @@ -116,20 +132,27 @@ DROP ROLE regress_inroles; DROP ROLE regress_adminroles; DROP ROLE regress_rolecreator; DROP ROLE regress_read_all_data; +ERROR: role "regress_read_all_data" does not exist DROP ROLE regress_write_all_data; +ERROR: role "regress_write_all_data" does not exist DROP ROLE regress_monitor; +ERROR: role "regress_monitor" does not exist DROP ROLE regress_read_all_settings; +ERROR: role "regress_read_all_settings" does not exist DROP ROLE regress_read_all_stats; +ERROR: role "regress_read_all_stats" does not exist DROP ROLE regress_stat_scan_tables; +ERROR: role "regress_stat_scan_tables" does not exist DROP ROLE regress_read_server_files; +ERROR: role "regress_read_server_files" does not exist DROP ROLE regress_write_server_files; +ERROR: role "regress_write_server_files" does not exist DROP ROLE regress_execute_server_program; +ERROR: role "regress_execute_server_program" does not exist DROP ROLE regress_signal_backend; +ERROR: role "regress_signal_backend" does not exist -- fail, role still owns database objects DROP ROLE regress_tenant; -ERROR: role "regress_tenant" cannot be dropped because some objects depend on it -DETAIL: owner of table tenant_table -owner of view tenant_view -- fail, cannot drop ourself nor superusers DROP ROLE regress_role_super; ERROR: must be superuser to drop superusers @@ -138,8 +161,12 @@ ERROR: current user cannot be dropped -- ok RESET SESSION AUTHORIZATION; DROP INDEX tenant_idx; +ERROR: index "tenant_idx" does not exist DROP TABLE tenant_table; +ERROR: table "tenant_table" does not exist DROP VIEW tenant_view; +ERROR: view "tenant_view" does not exist DROP ROLE regress_tenant; +ERROR: role "regress_tenant" does not exist DROP ROLE regress_role_admin; DROP ROLE regress_role_super; diff --git a/src/test/regress/expected/create_table_1.out b/src/test/regress/expected/create_table_1.out new file mode 100644 index 00000000000..4ec5f297a34 --- /dev/null +++ b/src/test/regress/expected/create_table_1.out @@ -0,0 +1,1315 @@ +-- +-- CREATE_TABLE +-- +-- +-- CLASS DEFINITIONS +-- +CREATE TABLE hobbies_r ( + name text, + person text +); +CREATE TABLE equipment_r ( + name text, + hobby text +); +CREATE TABLE onek ( + unique1 int4, + unique2 int4, + two int4, + four int4, + ten int4, + twenty int4, + hundred int4, + thousand int4, + twothousand int4, + fivethous int4, + tenthous int4, + odd int4, + even int4, + stringu1 name, + stringu2 name, + string4 name +); +CREATE TABLE tenk1 ( + unique1 int4, + unique2 int4, + two int4, + four int4, + ten int4, + twenty int4, + hundred int4, + thousand int4, + twothousand int4, + fivethous int4, + tenthous int4, + odd int4, + even int4, + stringu1 name, + stringu2 name, + string4 name +); +CREATE TABLE tenk2 ( + unique1 int4, + unique2 int4, + two int4, + four int4, + ten int4, + twenty int4, + hundred int4, + thousand int4, + twothousand int4, + fivethous int4, + tenthous int4, + odd int4, + even int4, + stringu1 name, + stringu2 name, + string4 name +); +CREATE TABLE person ( + name text, + age int4, + location point +); +CREATE TABLE emp ( + salary int4, + manager name +) INHERITS (person); +CREATE TABLE student ( + gpa float8 +) INHERITS (person); +CREATE TABLE stud_emp ( + percent int4 +) INHERITS (emp, student); +NOTICE: merging multiple inherited definitions of column "name" +NOTICE: merging multiple inherited definitions of column "age" +NOTICE: merging 
multiple inherited definitions of column "location" +CREATE TABLE city ( + name name, + location box, + budget city_budget +); +CREATE TABLE dept ( + dname name, + mgrname text +); +CREATE TABLE slow_emp4000 ( + home_base box +); +CREATE TABLE fast_emp4000 ( + home_base box +); +CREATE TABLE road ( + name text, + thepath path +); +CREATE TABLE ihighway () INHERITS (road); +CREATE TABLE shighway ( + surface text +) INHERITS (road); +CREATE TABLE real_city ( + pop int4, + cname text, + outline path +); +-- +-- test the "star" operators a bit more thoroughly -- this time, +-- throw in lots of NULL fields... +-- +-- a is the type root +-- b and c inherit from a (one-level single inheritance) +-- d inherits from b and c (two-level multiple inheritance) +-- e inherits from c (two-level single inheritance) +-- f inherits from e (three-level single inheritance) +-- +CREATE TABLE a_star ( + class char, + a int4 +); +CREATE TABLE b_star ( + b text +) INHERITS (a_star); +CREATE TABLE c_star ( + c name +) INHERITS (a_star); +CREATE TABLE d_star ( + d float8 +) INHERITS (b_star, c_star); +NOTICE: merging multiple inherited definitions of column "class" +NOTICE: merging multiple inherited definitions of column "a" +CREATE TABLE e_star ( + e int2 +) INHERITS (c_star); +CREATE TABLE f_star ( + f polygon +) INHERITS (e_star); +CREATE TABLE aggtest ( + a int2, + b float4 +); +CREATE TABLE hash_i4_heap ( + seqno int4, + random int4 +); +CREATE TABLE hash_name_heap ( + seqno int4, + random name +); +CREATE TABLE hash_txt_heap ( + seqno int4, + random text +); +CREATE TABLE hash_f8_heap ( + seqno int4, + random float8 +); +-- don't include the hash_ovfl_heap stuff in the distribution +-- the data set is too large for what it's worth +-- +-- CREATE TABLE hash_ovfl_heap ( +-- x int4, +-- y int4 +-- ); +CREATE TABLE bt_i4_heap ( + seqno int4, + random int4 +); +CREATE TABLE bt_name_heap ( + seqno name, + random int4 +); +CREATE TABLE bt_txt_heap ( + seqno text, + random int4 +); +CREATE TABLE bt_f8_heap ( + seqno float8, + random int4 +); +CREATE TABLE array_op_test ( + seqno int4, + i int4[], + t text[] +); +CREATE TABLE array_index_op_test ( + seqno int4, + i int4[], + t text[] +); +CREATE TABLE testjsonb ( + j jsonb +); +CREATE TABLE unknowntab ( + u unknown -- fail +); +ERROR: column "u" has pseudo-type unknown +CREATE TYPE unknown_comptype AS ( + u unknown -- fail +); +ERROR: column "u" has pseudo-type unknown +CREATE TABLE IF NOT EXISTS test_tsvector( + t text, + a tsvector +); +CREATE TABLE IF NOT EXISTS test_tsvector( + t text +); +NOTICE: relation "test_tsvector" already exists, skipping +-- invalid: non-lowercase quoted reloptions identifiers +CREATE TABLE tas_case WITH ("Fillfactor" = 10) AS SELECT 1 a; +ERROR: unrecognized parameter "Fillfactor" +CREATE UNLOGGED TABLE unlogged1 (a int primary key); -- OK +CREATE TEMPORARY TABLE unlogged2 (a int primary key); -- OK +SELECT relname, relkind, relpersistence FROM pg_class WHERE relname ~ '^unlogged\d' ORDER BY relname; + relname | relkind | relpersistence +----------------+---------+---------------- + unlogged1 | r | p + unlogged1_pkey | i | p + unlogged2 | r | t + unlogged2_pkey | i | t +(4 rows) + +REINDEX INDEX unlogged1_pkey; +REINDEX INDEX unlogged2_pkey; +SELECT relname, relkind, relpersistence FROM pg_class WHERE relname ~ '^unlogged\d' ORDER BY relname; + relname | relkind | relpersistence +----------------+---------+---------------- + unlogged1 | r | p + unlogged1_pkey | i | p + unlogged2 | r | t + unlogged2_pkey | i | t +(4 rows) + +DROP TABLE 
unlogged2; +INSERT INTO unlogged1 VALUES (42); +CREATE UNLOGGED TABLE public.unlogged2 (a int primary key); -- also OK +CREATE UNLOGGED TABLE pg_temp.unlogged3 (a int primary key); -- not OK +ERROR: only temporary relations may be created in temporary schemas +LINE 1: CREATE UNLOGGED TABLE pg_temp.unlogged3 (a int primary key); + ^ +CREATE TABLE pg_temp.implicitly_temp (a int primary key); -- OK +CREATE TEMP TABLE explicitly_temp (a int primary key); -- also OK +CREATE TEMP TABLE pg_temp.doubly_temp (a int primary key); -- also OK +CREATE TEMP TABLE public.temp_to_perm (a int primary key); -- not OK +ERROR: cannot create temporary relation in non-temporary schema +LINE 1: CREATE TEMP TABLE public.temp_to_perm (a int primary key); + ^ +DROP TABLE unlogged1, public.unlogged2; +CREATE TABLE as_select1 AS SELECT * FROM pg_class WHERE relkind = 'r'; +CREATE TABLE as_select1 AS SELECT * FROM pg_class WHERE relkind = 'r'; +ERROR: relation "as_select1" already exists +CREATE TABLE IF NOT EXISTS as_select1 AS SELECT * FROM pg_class WHERE relkind = 'r'; +NOTICE: relation "as_select1" already exists, skipping +DROP TABLE as_select1; +PREPARE select1 AS SELECT 1 as a; +CREATE TABLE as_select1 AS EXECUTE select1; +CREATE TABLE as_select1 AS EXECUTE select1; +ERROR: relation "as_select1" already exists +SELECT * FROM as_select1; + a +--- + 1 +(1 row) + +CREATE TABLE IF NOT EXISTS as_select1 AS EXECUTE select1; +NOTICE: relation "as_select1" already exists, skipping +DROP TABLE as_select1; +DEALLOCATE select1; +-- create an extra wide table to test for issues related to that +-- (temporarily hide query, to avoid the long CREATE TABLE stmt) +\set ECHO none +INSERT INTO extra_wide_table(firstc, lastc) VALUES('first col', 'last col'); +SELECT firstc, lastc FROM extra_wide_table; + firstc | lastc +-----------+---------- + first col | last col +(1 row) + +-- check that tables with oids cannot be created anymore +CREATE TABLE withoid() WITH OIDS; +ERROR: syntax error at or near "OIDS" +LINE 1: CREATE TABLE withoid() WITH OIDS; + ^ +CREATE TABLE withoid() WITH (oids); +ERROR: tables declared WITH OIDS are not supported +CREATE TABLE withoid() WITH (oids = true); +ERROR: tables declared WITH OIDS are not supported +-- but explicitly not adding oids is still supported +CREATE TEMP TABLE withoutoid() WITHOUT OIDS; DROP TABLE withoutoid; +CREATE TEMP TABLE withoutoid() WITH (oids = false); DROP TABLE withoutoid; +-- check restriction with default expressions +-- invalid use of column reference in default expressions +CREATE TABLE default_expr_column (id int DEFAULT (id)); +ERROR: cannot use column reference in DEFAULT expression +LINE 1: CREATE TABLE default_expr_column (id int DEFAULT (id)); + ^ +CREATE TABLE default_expr_column (id int DEFAULT (bar.id)); +ERROR: cannot use column reference in DEFAULT expression +LINE 1: CREATE TABLE default_expr_column (id int DEFAULT (bar.id)); + ^ +CREATE TABLE default_expr_agg_column (id int DEFAULT (avg(id))); +ERROR: cannot use column reference in DEFAULT expression +LINE 1: ...TE TABLE default_expr_agg_column (id int DEFAULT (avg(id))); + ^ +-- invalid column definition +CREATE TABLE default_expr_non_column (a int DEFAULT (avg(non_existent))); +ERROR: cannot use column reference in DEFAULT expression +LINE 1: ...TABLE default_expr_non_column (a int DEFAULT (avg(non_existe... 
+ ^ +-- invalid use of aggregate +CREATE TABLE default_expr_agg (a int DEFAULT (avg(1))); +ERROR: aggregate functions are not allowed in DEFAULT expressions +LINE 1: CREATE TABLE default_expr_agg (a int DEFAULT (avg(1))); + ^ +-- invalid use of subquery +CREATE TABLE default_expr_agg (a int DEFAULT (select 1)); +ERROR: cannot use subquery in DEFAULT expression +LINE 1: CREATE TABLE default_expr_agg (a int DEFAULT (select 1)); + ^ +-- invalid use of set-returning function +CREATE TABLE default_expr_agg (a int DEFAULT (generate_series(1,3))); +ERROR: set-returning functions are not allowed in DEFAULT expressions +LINE 1: CREATE TABLE default_expr_agg (a int DEFAULT (generate_serie... + ^ +-- Verify that subtransaction rollback restores rd_createSubid. +BEGIN; +CREATE TABLE remember_create_subid (c int); +SAVEPOINT q; DROP TABLE remember_create_subid; ROLLBACK TO q; +COMMIT; +DROP TABLE remember_create_subid; +-- Verify that subtransaction rollback restores rd_firstRelfilenodeSubid. +CREATE TABLE remember_node_subid (c int); +BEGIN; +ALTER TABLE remember_node_subid ALTER c TYPE bigint; +SAVEPOINT q; DROP TABLE remember_node_subid; ROLLBACK TO q; +COMMIT; +DROP TABLE remember_node_subid; +-- +-- Partitioned tables +-- +-- cannot combine INHERITS and PARTITION BY (although grammar allows) +CREATE TABLE partitioned ( + a int +) INHERITS (some_table) PARTITION BY LIST (a); +ERROR: cannot create partitioned table as inheritance child +-- cannot use more than 1 column as partition key for list partitioned table +CREATE TABLE partitioned ( + a1 int, + a2 int +) PARTITION BY LIST (a1, a2); -- fail +ERROR: cannot use "list" partition strategy with more than one column +-- unsupported constraint type for partitioned tables +CREATE TABLE partitioned ( + a int, + EXCLUDE USING gist (a WITH &&) +) PARTITION BY RANGE (a); +ERROR: exclusion constraints are not supported on partitioned tables +LINE 3: EXCLUDE USING gist (a WITH &&) + ^ +-- prevent using prohibited expressions in the key +CREATE FUNCTION retset (a int) RETURNS SETOF int AS $$ SELECT 1; $$ LANGUAGE SQL IMMUTABLE; +CREATE TABLE partitioned ( + a int +) PARTITION BY RANGE (retset(a)); +ERROR: set-returning functions are not allowed in partition key expressions +DROP FUNCTION retset(int); +CREATE TABLE partitioned ( + a int +) PARTITION BY RANGE ((avg(a))); +ERROR: aggregate functions are not allowed in partition key expressions +CREATE TABLE partitioned ( + a int, + b int +) PARTITION BY RANGE ((avg(a) OVER (PARTITION BY b))); +ERROR: window functions are not allowed in partition key expressions +CREATE TABLE partitioned ( + a int +) PARTITION BY LIST ((a LIKE (SELECT 1))); +ERROR: cannot use subquery in partition key expression +CREATE TABLE partitioned ( + a int +) PARTITION BY RANGE ((42)); +ERROR: cannot use constant expression as partition key +CREATE FUNCTION const_func () RETURNS int AS $$ SELECT 1; $$ LANGUAGE SQL IMMUTABLE; +CREATE TABLE partitioned ( + a int +) PARTITION BY RANGE (const_func()); +ERROR: cannot use constant expression as partition key +DROP FUNCTION const_func(); +-- only accept valid partitioning strategy +CREATE TABLE partitioned ( + a int +) PARTITION BY MAGIC (a); +ERROR: unrecognized partitioning strategy "magic" +-- specified column must be present in the table +CREATE TABLE partitioned ( + a int +) PARTITION BY RANGE (b); +ERROR: column "b" named in partition key does not exist +LINE 3: ) PARTITION BY RANGE (b); + ^ +-- cannot use system columns in partition key +CREATE TABLE partitioned ( + a int +) PARTITION BY 
RANGE (xmin); +ERROR: cannot use system column "xmin" in partition key +LINE 3: ) PARTITION BY RANGE (xmin); + ^ +-- cannot use pseudotypes +CREATE TABLE partitioned ( + a int, + b int +) PARTITION BY RANGE (((a, b))); +ERROR: partition key column 1 has pseudo-type record +CREATE TABLE partitioned ( + a int, + b int +) PARTITION BY RANGE (a, ('unknown')); +ERROR: partition key column 2 has pseudo-type unknown +-- functions in key must be immutable +CREATE FUNCTION immut_func (a int) RETURNS int AS $$ SELECT a + random()::int; $$ LANGUAGE SQL; +CREATE TABLE partitioned ( + a int +) PARTITION BY RANGE (immut_func(a)); +ERROR: functions in partition key expression must be marked IMMUTABLE +DROP FUNCTION immut_func(int); +-- prevent using columns of unsupported types in key (type must have a btree operator class) +CREATE TABLE partitioned ( + a point +) PARTITION BY LIST (a); +ERROR: data type point has no default operator class for access method "btree" +HINT: You must specify a btree operator class or define a default btree operator class for the data type. +CREATE TABLE partitioned ( + a point +) PARTITION BY LIST (a point_ops); +ERROR: operator class "point_ops" does not exist for access method "btree" +CREATE TABLE partitioned ( + a point +) PARTITION BY RANGE (a); +ERROR: data type point has no default operator class for access method "btree" +HINT: You must specify a btree operator class or define a default btree operator class for the data type. +CREATE TABLE partitioned ( + a point +) PARTITION BY RANGE (a point_ops); +ERROR: operator class "point_ops" does not exist for access method "btree" +-- cannot add NO INHERIT constraints to partitioned tables +CREATE TABLE partitioned ( + a int, + CONSTRAINT check_a CHECK (a > 0) NO INHERIT +) PARTITION BY RANGE (a); +ERROR: cannot add NO INHERIT constraint to partitioned table "partitioned" +-- some checks after successful creation of a partitioned table +CREATE FUNCTION plusone(a int) RETURNS INT AS $$ SELECT a+1; $$ LANGUAGE SQL; +CREATE TABLE partitioned ( + a int, + b int, + c text, + d text +) PARTITION BY RANGE (a oid_ops, plusone(b), c collate "default", d collate "C"); +-- check relkind +SELECT relkind FROM pg_class WHERE relname = 'partitioned'; + relkind +--------- + p +(1 row) + +-- prevent a function referenced in partition key from being dropped +DROP FUNCTION plusone(int); +ERROR: cannot drop function plusone(integer) because other objects depend on it +DETAIL: table partitioned depends on function plusone(integer) +HINT: Use DROP ... CASCADE to drop the dependent objects too. 
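(An aside, not part of the regression script.) The dependency error above is driven by the partition key recorded at CREATE TABLE time; pg_get_partkeydef() prints that recorded definition, which is what pins plusone(int):
SELECT pg_get_partkeydef('partitioned'::regclass);
-- expected (matches the \d output below): RANGE (a oid_ops, plusone(b), c, d COLLATE "C")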
+-- partitioned table cannot participate in regular inheritance +CREATE TABLE partitioned2 ( + a int, + b text +) PARTITION BY RANGE ((a+1), substr(b, 1, 5)); +CREATE TABLE fail () INHERITS (partitioned2); +ERROR: cannot inherit from partitioned table "partitioned2" +-- Partition key in describe output +\d partitioned + Partitioned table "public.partitioned" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + a | integer | | | + b | integer | | | + c | text | | | + d | text | | | +Partition key: RANGE (a oid_ops, plusone(b), c, d COLLATE "C") +Number of partitions: 0 + +\d+ partitioned2 + Partitioned table "public.partitioned2" + Column | Type | Collation | Nullable | Default | Storage | Stats target | Description +--------+---------+-----------+----------+---------+----------+--------------+------------- + a | integer | | | | plain | | + b | text | | | | extended | | +Partition key: RANGE (((a + 1)), substr(b, 1, 5)) +Number of partitions: 0 + +INSERT INTO partitioned2 VALUES (1, 'hello'); +ERROR: no partition of relation "partitioned2" found for row +DETAIL: Partition key of the failing row contains ((a + 1), substr(b, 1, 5)) = (2, hello). +CREATE TABLE part2_1 PARTITION OF partitioned2 FOR VALUES FROM (-1, 'aaaaa') TO (100, 'ccccc'); +\d+ part2_1 + Table "public.part2_1" + Column | Type | Collation | Nullable | Default | Storage | Stats target | Description +--------+---------+-----------+----------+---------+----------+--------------+------------- + a | integer | | | | plain | | + b | text | | | | extended | | +Partition of: partitioned2 FOR VALUES FROM ('-1', 'aaaaa') TO (100, 'ccccc') +Partition constraint: (((a + 1) IS NOT NULL) AND (substr(b, 1, 5) IS NOT NULL) AND (((a + 1) > '-1'::integer) OR (((a + 1) = '-1'::integer) AND (substr(b, 1, 5) >= 'aaaaa'::text))) AND (((a + 1) < 100) OR (((a + 1) = 100) AND (substr(b, 1, 5) < 'ccccc'::text)))) + +DROP TABLE partitioned, partitioned2; +-- check reference to partitioned table's rowtype in partition descriptor +create table partitioned (a int, b int) + partition by list ((row(a, b)::partitioned)); +create table partitioned1 + partition of partitioned for values in ('(1,2)'::partitioned); +create table partitioned2 + partition of partitioned for values in ('(2,4)'::partitioned); +explain (costs off) +select * from partitioned where row(a,b)::partitioned = '(1,2)'::partitioned; + QUERY PLAN +----------------------------------------------------------- + Seq Scan on partitioned1 partitioned + Filter: (ROW(a, b)::partitioned = '(1,2)'::partitioned) +(2 rows) + +drop table partitioned; +-- whole-row Var in partition key works too +create table partitioned (a int, b int) + partition by list ((partitioned)); +create table partitioned1 + partition of partitioned for values in ('(1,2)'); +create table partitioned2 + partition of partitioned for values in ('(2,4)'); +explain (costs off) +select * from partitioned where partitioned = '(1,2)'::partitioned; + QUERY PLAN +----------------------------------------------------------------- + Seq Scan on partitioned1 partitioned + Filter: ((partitioned.*)::partitioned = '(1,2)'::partitioned) +(2 rows) + +\d+ partitioned1 + Table "public.partitioned1" + Column | Type | Collation | Nullable | Default | Storage | Stats target | Description +--------+---------+-----------+----------+---------+---------+--------------+------------- + a | integer | | | | plain | | + b | integer | | | | plain | | +Partition of: partitioned FOR VALUES IN ('(1,2)') +Partition 
constraint: (((partitioned1.*)::partitioned IS DISTINCT FROM NULL) AND ((partitioned1.*)::partitioned = '(1,2)'::partitioned)) + +drop table partitioned; +-- check that dependencies of partition columns are handled correctly +create domain intdom1 as int; +create table partitioned ( + a intdom1, + b text +) partition by range (a); +alter table partitioned drop column a; -- fail +ERROR: cannot drop column "a" because it is part of the partition key of relation "partitioned" +drop domain intdom1; -- fail, requires cascade +ERROR: cannot drop type intdom1 because other objects depend on it +DETAIL: table partitioned depends on type intdom1 +HINT: Use DROP ... CASCADE to drop the dependent objects too. +drop domain intdom1 cascade; +NOTICE: drop cascades to table partitioned +table partitioned; -- gone +ERROR: relation "partitioned" does not exist +LINE 1: table partitioned; + ^ +-- likewise for columns used in partition expressions +create domain intdom1 as int; +create table partitioned ( + a intdom1, + b text +) partition by range (plusone(a)); +alter table partitioned drop column a; -- fail +ERROR: cannot drop column "a" because it is part of the partition key of relation "partitioned" +drop domain intdom1; -- fail, requires cascade +ERROR: cannot drop type intdom1 because other objects depend on it +DETAIL: table partitioned depends on type intdom1 +HINT: Use DROP ... CASCADE to drop the dependent objects too. +drop domain intdom1 cascade; +NOTICE: drop cascades to table partitioned +table partitioned; -- gone +ERROR: relation "partitioned" does not exist +LINE 1: table partitioned; + ^ +-- +-- Partitions +-- +-- check partition bound syntax +CREATE TABLE list_parted ( + a int +) PARTITION BY LIST (a); +CREATE TABLE part_p1 PARTITION OF list_parted FOR VALUES IN ('1'); +CREATE TABLE part_p2 PARTITION OF list_parted FOR VALUES IN (2); +CREATE TABLE part_p3 PARTITION OF list_parted FOR VALUES IN ((2+1)); +CREATE TABLE part_null PARTITION OF list_parted FOR VALUES IN (null); +\d+ list_parted + Partitioned table "public.list_parted" + Column | Type | Collation | Nullable | Default | Storage | Stats target | Description +--------+---------+-----------+----------+---------+---------+--------------+------------- + a | integer | | | | plain | | +Partition key: LIST (a) +Partitions: part_null FOR VALUES IN (NULL), + part_p1 FOR VALUES IN (1), + part_p2 FOR VALUES IN (2), + part_p3 FOR VALUES IN (3) + +-- forbidden expressions for partition bound with list partitioned table +CREATE TABLE part_bogus_expr_fail PARTITION OF list_parted FOR VALUES IN (somename); +ERROR: cannot use column reference in partition bound expression +LINE 1: ...expr_fail PARTITION OF list_parted FOR VALUES IN (somename); + ^ +CREATE TABLE part_bogus_expr_fail PARTITION OF list_parted FOR VALUES IN (somename.somename); +ERROR: cannot use column reference in partition bound expression +LINE 1: ...expr_fail PARTITION OF list_parted FOR VALUES IN (somename.s... 
+ ^ +CREATE TABLE part_bogus_expr_fail PARTITION OF list_parted FOR VALUES IN (a); +ERROR: cannot use column reference in partition bound expression +LINE 1: ..._bogus_expr_fail PARTITION OF list_parted FOR VALUES IN (a); + ^ +CREATE TABLE part_bogus_expr_fail PARTITION OF list_parted FOR VALUES IN (sum(a)); +ERROR: cannot use column reference in partition bound expression +LINE 1: ...s_expr_fail PARTITION OF list_parted FOR VALUES IN (sum(a)); + ^ +CREATE TABLE part_bogus_expr_fail PARTITION OF list_parted FOR VALUES IN (sum(somename)); +ERROR: cannot use column reference in partition bound expression +LINE 1: ..._fail PARTITION OF list_parted FOR VALUES IN (sum(somename))... + ^ +CREATE TABLE part_bogus_expr_fail PARTITION OF list_parted FOR VALUES IN (sum(1)); +ERROR: aggregate functions are not allowed in partition bound +LINE 1: ...s_expr_fail PARTITION OF list_parted FOR VALUES IN (sum(1)); + ^ +CREATE TABLE part_bogus_expr_fail PARTITION OF list_parted FOR VALUES IN ((select 1)); +ERROR: cannot use subquery in partition bound +LINE 1: ...expr_fail PARTITION OF list_parted FOR VALUES IN ((select 1)... + ^ +CREATE TABLE part_bogus_expr_fail PARTITION OF list_parted FOR VALUES IN (generate_series(4, 6)); +ERROR: set-returning functions are not allowed in partition bound +LINE 1: ...expr_fail PARTITION OF list_parted FOR VALUES IN (generate_s... + ^ +CREATE TABLE part_bogus_expr_fail PARTITION OF list_parted FOR VALUES IN ((1+1) collate "POSIX"); +ERROR: collations are not supported by type integer +LINE 1: ...ail PARTITION OF list_parted FOR VALUES IN ((1+1) collate "P... + ^ +-- syntax does not allow empty list of values for list partitions +CREATE TABLE fail_part PARTITION OF list_parted FOR VALUES IN (); +ERROR: syntax error at or near ")" +LINE 1: ...E TABLE fail_part PARTITION OF list_parted FOR VALUES IN (); + ^ +-- trying to specify range for list partitioned table +CREATE TABLE fail_part PARTITION OF list_parted FOR VALUES FROM (1) TO (2); +ERROR: invalid bound specification for a list partition +LINE 1: ...BLE fail_part PARTITION OF list_parted FOR VALUES FROM (1) T... + ^ +-- trying to specify modulus and remainder for list partitioned table +CREATE TABLE fail_part PARTITION OF list_parted FOR VALUES WITH (MODULUS 10, REMAINDER 1); +ERROR: invalid bound specification for a list partition +LINE 1: ...BLE fail_part PARTITION OF list_parted FOR VALUES WITH (MODU... 
+ ^ +-- check default partition cannot be created more than once +CREATE TABLE part_default PARTITION OF list_parted DEFAULT; +CREATE TABLE fail_default_part PARTITION OF list_parted DEFAULT; +ERROR: partition "fail_default_part" conflicts with existing default partition "part_default" +LINE 1: ...TE TABLE fail_default_part PARTITION OF list_parted DEFAULT; + ^ +-- specified literal can't be cast to the partition column data type +CREATE TABLE bools ( + a bool +) PARTITION BY LIST (a); +CREATE TABLE bools_true PARTITION OF bools FOR VALUES IN (1); +ERROR: specified value cannot be cast to type boolean for column "a" +LINE 1: ...REATE TABLE bools_true PARTITION OF bools FOR VALUES IN (1); + ^ +DROP TABLE bools; +-- specified literal can be cast, and the cast might not be immutable +CREATE TABLE moneyp ( + a money +) PARTITION BY LIST (a); +CREATE TABLE moneyp_10 PARTITION OF moneyp FOR VALUES IN (10); +CREATE TABLE moneyp_11 PARTITION OF moneyp FOR VALUES IN ('11'); +CREATE TABLE moneyp_12 PARTITION OF moneyp FOR VALUES IN (to_char(12, '99')::int); +DROP TABLE moneyp; +-- cast is immutable +CREATE TABLE bigintp ( + a bigint +) PARTITION BY LIST (a); +CREATE TABLE bigintp_10 PARTITION OF bigintp FOR VALUES IN (10); +-- fails due to overlap: +CREATE TABLE bigintp_10_2 PARTITION OF bigintp FOR VALUES IN ('10'); +ERROR: partition "bigintp_10_2" would overlap partition "bigintp_10" +LINE 1: ...ABLE bigintp_10_2 PARTITION OF bigintp FOR VALUES IN ('10'); + ^ +DROP TABLE bigintp; +CREATE TABLE range_parted ( + a date +) PARTITION BY RANGE (a); +-- forbidden expressions for partition bounds with range partitioned table +CREATE TABLE part_bogus_expr_fail PARTITION OF range_parted + FOR VALUES FROM (somename) TO ('2019-01-01'); +ERROR: cannot use column reference in partition bound expression +LINE 2: FOR VALUES FROM (somename) TO ('2019-01-01'); + ^ +CREATE TABLE part_bogus_expr_fail PARTITION OF range_parted + FOR VALUES FROM (somename.somename) TO ('2019-01-01'); +ERROR: cannot use column reference in partition bound expression +LINE 2: FOR VALUES FROM (somename.somename) TO ('2019-01-01'); + ^ +CREATE TABLE part_bogus_expr_fail PARTITION OF range_parted + FOR VALUES FROM (a) TO ('2019-01-01'); +ERROR: cannot use column reference in partition bound expression +LINE 2: FOR VALUES FROM (a) TO ('2019-01-01'); + ^ +CREATE TABLE part_bogus_expr_fail PARTITION OF range_parted + FOR VALUES FROM (max(a)) TO ('2019-01-01'); +ERROR: cannot use column reference in partition bound expression +LINE 2: FOR VALUES FROM (max(a)) TO ('2019-01-01'); + ^ +CREATE TABLE part_bogus_expr_fail PARTITION OF range_parted + FOR VALUES FROM (max(somename)) TO ('2019-01-01'); +ERROR: cannot use column reference in partition bound expression +LINE 2: FOR VALUES FROM (max(somename)) TO ('2019-01-01'); + ^ +CREATE TABLE part_bogus_expr_fail PARTITION OF range_parted + FOR VALUES FROM (max('2019-02-01'::date)) TO ('2019-01-01'); +ERROR: aggregate functions are not allowed in partition bound +LINE 2: FOR VALUES FROM (max('2019-02-01'::date)) TO ('2019-01-01'... 
+ ^ +CREATE TABLE part_bogus_expr_fail PARTITION OF range_parted + FOR VALUES FROM ((select 1)) TO ('2019-01-01'); +ERROR: cannot use subquery in partition bound +LINE 2: FOR VALUES FROM ((select 1)) TO ('2019-01-01'); + ^ +CREATE TABLE part_bogus_expr_fail PARTITION OF range_parted + FOR VALUES FROM (generate_series(1, 3)) TO ('2019-01-01'); +ERROR: set-returning functions are not allowed in partition bound +LINE 2: FOR VALUES FROM (generate_series(1, 3)) TO ('2019-01-01'); + ^ +-- trying to specify list for range partitioned table +CREATE TABLE fail_part PARTITION OF range_parted FOR VALUES IN ('a'); +ERROR: invalid bound specification for a range partition +LINE 1: ...BLE fail_part PARTITION OF range_parted FOR VALUES IN ('a'); + ^ +-- trying to specify modulus and remainder for range partitioned table +CREATE TABLE fail_part PARTITION OF range_parted FOR VALUES WITH (MODULUS 10, REMAINDER 1); +ERROR: invalid bound specification for a range partition +LINE 1: ...LE fail_part PARTITION OF range_parted FOR VALUES WITH (MODU... + ^ +-- each of the start and end bounds must have the same number of values as the +-- length of the partition key +CREATE TABLE fail_part PARTITION OF range_parted FOR VALUES FROM ('a', 1) TO ('z'); +ERROR: FROM must specify exactly one value per partitioning column +CREATE TABLE fail_part PARTITION OF range_parted FOR VALUES FROM ('a') TO ('z', 1); +ERROR: TO must specify exactly one value per partitioning column +-- cannot specify null values in range bounds +CREATE TABLE fail_part PARTITION OF range_parted FOR VALUES FROM (null) TO (maxvalue); +ERROR: cannot specify NULL in range bound +-- trying to specify modulus and remainder for range partitioned table +CREATE TABLE fail_part PARTITION OF range_parted FOR VALUES WITH (MODULUS 10, REMAINDER 1); +ERROR: invalid bound specification for a range partition +LINE 1: ...LE fail_part PARTITION OF range_parted FOR VALUES WITH (MODU... + ^ +-- check partition bound syntax for the hash partition +CREATE TABLE hash_parted ( + a int +) PARTITION BY HASH (a); +CREATE TABLE hpart_1 PARTITION OF hash_parted FOR VALUES WITH (MODULUS 10, REMAINDER 0); +CREATE TABLE hpart_2 PARTITION OF hash_parted FOR VALUES WITH (MODULUS 50, REMAINDER 1); +CREATE TABLE hpart_3 PARTITION OF hash_parted FOR VALUES WITH (MODULUS 200, REMAINDER 2); +-- modulus 25 is a factor of modulus 50, but 10 is not a factor of 25. +CREATE TABLE fail_part PARTITION OF hash_parted FOR VALUES WITH (MODULUS 25, REMAINDER 3); +ERROR: every hash partition modulus must be a factor of the next larger modulus +DETAIL: The new modulus 25 is not divisible by 10, the modulus of existing partition "hpart_1". +-- the previous modulus 50 is a factor of 150, but this modulus is not a factor of the next modulus, 200. +CREATE TABLE fail_part PARTITION OF hash_parted FOR VALUES WITH (MODULUS 150, REMAINDER 3); +ERROR: every hash partition modulus must be a factor of the next larger modulus +DETAIL: The new modulus 150 is not a factor of 200, the modulus of existing partition "hpart_3". +-- trying to specify range for the hash partitioned table +CREATE TABLE fail_part PARTITION OF hash_parted FOR VALUES FROM ('a', 1) TO ('z'); +ERROR: invalid bound specification for a hash partition +LINE 1: ...BLE fail_part PARTITION OF hash_parted FOR VALUES FROM ('a',...
+ ^ +-- trying to specify list value for the hash partitioned table +CREATE TABLE fail_part PARTITION OF hash_parted FOR VALUES IN (1000); +ERROR: invalid bound specification for a hash partition +LINE 1: ...BLE fail_part PARTITION OF hash_parted FOR VALUES IN (1000); + ^ +-- trying to create default partition for the hash partitioned table +CREATE TABLE fail_default_part PARTITION OF hash_parted DEFAULT; +ERROR: a hash-partitioned table may not have a default partition +-- check if compatible with the specified parent +-- cannot create as partition of a non-partitioned table +CREATE TABLE unparted ( + a int +); +CREATE TABLE fail_part PARTITION OF unparted FOR VALUES IN ('a'); +ERROR: "unparted" is not partitioned +CREATE TABLE fail_part PARTITION OF unparted FOR VALUES WITH (MODULUS 2, REMAINDER 1); +ERROR: "unparted" is not partitioned +DROP TABLE unparted; +-- cannot create a permanent rel as partition of a temp rel +CREATE TEMP TABLE temp_parted ( + a int +) PARTITION BY LIST (a); +CREATE TABLE fail_part PARTITION OF temp_parted FOR VALUES IN ('a'); +ERROR: cannot create a permanent relation as partition of temporary relation "temp_parted" +DROP TABLE temp_parted; +-- check for partition bound overlap and other invalid specifications +CREATE TABLE list_parted2 ( + a varchar +) PARTITION BY LIST (a); +CREATE TABLE part_null_z PARTITION OF list_parted2 FOR VALUES IN (null, 'z'); +CREATE TABLE part_ab PARTITION OF list_parted2 FOR VALUES IN ('a', 'b'); +CREATE TABLE list_parted2_def PARTITION OF list_parted2 DEFAULT; +CREATE TABLE fail_part PARTITION OF list_parted2 FOR VALUES IN (null); +ERROR: partition "fail_part" would overlap partition "part_null_z" +LINE 1: ...LE fail_part PARTITION OF list_parted2 FOR VALUES IN (null); + ^ +CREATE TABLE fail_part PARTITION OF list_parted2 FOR VALUES IN ('b', 'c'); +ERROR: partition "fail_part" would overlap partition "part_ab" +LINE 1: ...ail_part PARTITION OF list_parted2 FOR VALUES IN ('b', 'c'); + ^ +-- check default partition overlap +INSERT INTO list_parted2 VALUES('X'); +CREATE TABLE fail_part PARTITION OF list_parted2 FOR VALUES IN ('W', 'X', 'Y'); +ERROR: updated partition constraint for default partition "list_parted2_def" would be violated by some row +CREATE TABLE range_parted2 ( + a int +) PARTITION BY RANGE (a); +-- trying to create range partition with empty range +CREATE TABLE fail_part PARTITION OF range_parted2 FOR VALUES FROM (1) TO (0); +ERROR: empty range bound specified for partition "fail_part" +LINE 1: ..._part PARTITION OF range_parted2 FOR VALUES FROM (1) TO (0); + ^ +DETAIL: Specified lower bound (1) is greater than or equal to upper bound (0). +-- note that the range '[1, 1)' has no elements +CREATE TABLE fail_part PARTITION OF range_parted2 FOR VALUES FROM (1) TO (1); +ERROR: empty range bound specified for partition "fail_part" +LINE 1: ..._part PARTITION OF range_parted2 FOR VALUES FROM (1) TO (1); + ^ +DETAIL: Specified lower bound (1) is greater than or equal to upper bound (1). +CREATE TABLE part0 PARTITION OF range_parted2 FOR VALUES FROM (minvalue) TO (1); +CREATE TABLE fail_part PARTITION OF range_parted2 FOR VALUES FROM (minvalue) TO (2); +ERROR: partition "fail_part" would overlap partition "part0" +LINE 1: ..._part PARTITION OF range_parted2 FOR VALUES FROM (minvalue) ... 
+ ^ +CREATE TABLE part1 PARTITION OF range_parted2 FOR VALUES FROM (1) TO (10); +CREATE TABLE fail_part PARTITION OF range_parted2 FOR VALUES FROM (-1) TO (1); +ERROR: partition "fail_part" would overlap partition "part0" +LINE 1: ..._part PARTITION OF range_parted2 FOR VALUES FROM (-1) TO (1)... + ^ +CREATE TABLE fail_part PARTITION OF range_parted2 FOR VALUES FROM (9) TO (maxvalue); +ERROR: partition "fail_part" would overlap partition "part1" +LINE 1: ..._part PARTITION OF range_parted2 FOR VALUES FROM (9) TO (max... + ^ +CREATE TABLE part2 PARTITION OF range_parted2 FOR VALUES FROM (20) TO (30); +CREATE TABLE part3 PARTITION OF range_parted2 FOR VALUES FROM (30) TO (40); +CREATE TABLE fail_part PARTITION OF range_parted2 FOR VALUES FROM (10) TO (30); +ERROR: partition "fail_part" would overlap partition "part2" +LINE 1: ...art PARTITION OF range_parted2 FOR VALUES FROM (10) TO (30); + ^ +CREATE TABLE fail_part PARTITION OF range_parted2 FOR VALUES FROM (10) TO (50); +ERROR: partition "fail_part" would overlap partition "part2" +LINE 1: ...art PARTITION OF range_parted2 FOR VALUES FROM (10) TO (50); + ^ +-- Create a default partition for the range partitioned table +CREATE TABLE range2_default PARTITION OF range_parted2 DEFAULT; +-- More than one default partition is not allowed, so this should give an error +CREATE TABLE fail_default_part PARTITION OF range_parted2 DEFAULT; +ERROR: partition "fail_default_part" conflicts with existing default partition "range2_default" +LINE 1: ... TABLE fail_default_part PARTITION OF range_parted2 DEFAULT; + ^ +-- Check that a new partition's range cannot claim rows already in the default partition +INSERT INTO range_parted2 VALUES (85); +CREATE TABLE fail_part PARTITION OF range_parted2 FOR VALUES FROM (80) TO (90); +ERROR: updated partition constraint for default partition "range2_default" would be violated by some row +CREATE TABLE part4 PARTITION OF range_parted2 FOR VALUES FROM (90) TO (100); +-- now check for multi-column range partition key +CREATE TABLE range_parted3 ( + a int, + b int +) PARTITION BY RANGE (a, (b+1)); +CREATE TABLE part00 PARTITION OF range_parted3 FOR VALUES FROM (0, minvalue) TO (0, maxvalue); +CREATE TABLE fail_part PARTITION OF range_parted3 FOR VALUES FROM (0, minvalue) TO (0, 1); +ERROR: partition "fail_part" would overlap partition "part00" +LINE 1: ..._part PARTITION OF range_parted3 FOR VALUES FROM (0, minvalu... + ^ +CREATE TABLE part10 PARTITION OF range_parted3 FOR VALUES FROM (1, minvalue) TO (1, 1); +CREATE TABLE part11 PARTITION OF range_parted3 FOR VALUES FROM (1, 1) TO (1, 10); +CREATE TABLE part12 PARTITION OF range_parted3 FOR VALUES FROM (1, 10) TO (1, maxvalue); +CREATE TABLE fail_part PARTITION OF range_parted3 FOR VALUES FROM (1, 10) TO (1, 20); +ERROR: partition "fail_part" would overlap partition "part12" +LINE 1: ...rt PARTITION OF range_parted3 FOR VALUES FROM (1, 10) TO (1,... + ^ +CREATE TABLE range3_default PARTITION OF range_parted3 DEFAULT; +-- cannot create a partition that says column b is allowed to range +-- from -infinity to +infinity, while there exist partitions that have +-- more specific ranges +CREATE TABLE fail_part PARTITION OF range_parted3 FOR VALUES FROM (1, minvalue) TO (1, maxvalue); +ERROR: partition "fail_part" would overlap partition "part10" +LINE 1: ..._part PARTITION OF range_parted3 FOR VALUES FROM (1, minvalu...
+ ^ +-- check for partition bound overlap and other invalid specifications for the hash partition +CREATE TABLE hash_parted2 ( + a varchar +) PARTITION BY HASH (a); +CREATE TABLE h2part_1 PARTITION OF hash_parted2 FOR VALUES WITH (MODULUS 4, REMAINDER 2); +CREATE TABLE h2part_2 PARTITION OF hash_parted2 FOR VALUES WITH (MODULUS 8, REMAINDER 0); +CREATE TABLE h2part_3 PARTITION OF hash_parted2 FOR VALUES WITH (MODULUS 8, REMAINDER 4); +CREATE TABLE h2part_4 PARTITION OF hash_parted2 FOR VALUES WITH (MODULUS 8, REMAINDER 5); +-- overlaps with h2part_4 +CREATE TABLE fail_part PARTITION OF hash_parted2 FOR VALUES WITH (MODULUS 2, REMAINDER 1); +ERROR: partition "fail_part" would overlap partition "h2part_4" +LINE 1: ...LE fail_part PARTITION OF hash_parted2 FOR VALUES WITH (MODU... + ^ +-- modulus must be greater than zero +CREATE TABLE fail_part PARTITION OF hash_parted2 FOR VALUES WITH (MODULUS 0, REMAINDER 1); +ERROR: modulus for hash partition must be a positive integer +-- remainder must be greater than or equal to zero and less than modulus +CREATE TABLE fail_part PARTITION OF hash_parted2 FOR VALUES WITH (MODULUS 8, REMAINDER 8); +ERROR: remainder for hash partition must be less than modulus +-- check schema propagation from parent +CREATE TABLE parted ( + a text, + b int NOT NULL DEFAULT 0, + CONSTRAINT check_a CHECK (length(a) > 0) +) PARTITION BY LIST (a); +CREATE TABLE part_a PARTITION OF parted FOR VALUES IN ('a'); +-- only inherited attributes (never local ones) +SELECT attname, attislocal, attinhcount FROM pg_attribute + WHERE attrelid = 'part_a'::regclass and attnum > 0 + ORDER BY attnum; + attname | attislocal | attinhcount +---------+------------+------------- + a | f | 1 + b | f | 1 +(2 rows) + +-- able to specify column default, column constraint, and table constraint +-- first check the "column specified more than once" error +CREATE TABLE part_b PARTITION OF parted ( + b NOT NULL, + b DEFAULT 1, + b CHECK (b >= 0), + CONSTRAINT check_a CHECK (length(a) > 0) +) FOR VALUES IN ('b'); +ERROR: column "b" specified more than once +CREATE TABLE part_b PARTITION OF parted ( + b NOT NULL DEFAULT 1, + CONSTRAINT check_a CHECK (length(a) > 0), + CONSTRAINT check_b CHECK (b >= 0) +) FOR VALUES IN ('b'); +NOTICE: merging constraint "check_a" with inherited definition +-- conislocal should be false for any merged constraints, true otherwise +SELECT conislocal, coninhcount FROM pg_constraint WHERE conrelid = 'part_b'::regclass ORDER BY conislocal, coninhcount; + conislocal | coninhcount +------------+------------- + f | 1 + t | 0 +(2 rows) + +-- Once check_b is added to the parent, it should be made non-local for part_b +ALTER TABLE parted ADD CONSTRAINT check_b CHECK (b >= 0); +NOTICE: merging constraint "check_b" with inherited definition +SELECT conislocal, coninhcount FROM pg_constraint WHERE conrelid = 'part_b'::regclass; + conislocal | coninhcount +------------+------------- + f | 1 + f | 1 +(2 rows) + +-- Neither check_a nor check_b is droppable from part_b +ALTER TABLE part_b DROP CONSTRAINT check_a; +ERROR: cannot drop inherited constraint "check_a" of relation "part_b" +ALTER TABLE part_b DROP CONSTRAINT check_b; +ERROR: cannot drop inherited constraint "check_b" of relation "part_b" +-- And dropping them from parted should leave no trace of them on part_b, unlike +-- traditional inheritance where they will be left behind, because they would +-- be local constraints.
+ALTER TABLE parted DROP CONSTRAINT check_a, DROP CONSTRAINT check_b; +SELECT conislocal, coninhcount FROM pg_constraint WHERE conrelid = 'part_b'::regclass; + conislocal | coninhcount +------------+------------- +(0 rows) + +-- specify PARTITION BY for a partition +CREATE TABLE fail_part_col_not_found PARTITION OF parted FOR VALUES IN ('c') PARTITION BY RANGE (c); +ERROR: column "c" named in partition key does not exist +LINE 1: ...TITION OF parted FOR VALUES IN ('c') PARTITION BY RANGE (c); + ^ +CREATE TABLE part_c PARTITION OF parted (b WITH OPTIONS NOT NULL DEFAULT 0) FOR VALUES IN ('c') PARTITION BY RANGE ((b)); +-- create a level-2 partition +CREATE TABLE part_c_1_10 PARTITION OF part_c FOR VALUES FROM (1) TO (10); +-- check that NOT NULL and default value are inherited correctly +create table parted_notnull_inh_test (a int default 1, b int not null default 0) partition by list (a); +create table parted_notnull_inh_test1 partition of parted_notnull_inh_test (a not null, b default 1) for values in (1); +insert into parted_notnull_inh_test (b) values (null); +ERROR: null value in column "b" of relation "parted_notnull_inh_test1" violates not-null constraint +DETAIL: Failing row contains (1, null). +-- note that while b's default is overridden, a's default is preserved +\d parted_notnull_inh_test1 + Table "public.parted_notnull_inh_test1" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + a | integer | | not null | 1 + b | integer | | not null | 1 +Partition of: parted_notnull_inh_test FOR VALUES IN (1) + +drop table parted_notnull_inh_test; +-- check that collations are assigned in partition bound expressions +create table parted_boolean_col (a bool, b text) partition by list(a); +create table parted_boolean_less partition of parted_boolean_col + for values in ('foo' < 'bar'); +create table parted_boolean_greater partition of parted_boolean_col + for values in ('foo' > 'bar'); +drop table parted_boolean_col; +-- check for a conflicting COLLATE clause +create table parted_collate_must_match (a text collate "C", b text collate "C") + partition by range (a); +-- on the partition key +create table parted_collate_must_match1 partition of parted_collate_must_match + (a collate "POSIX") for values from ('a') to ('m'); +-- on another column +create table parted_collate_must_match2 partition of parted_collate_must_match + (b collate "POSIX") for values from ('m') to ('z'); +drop table parted_collate_must_match; +-- check that non-matching collations for partition bound +-- expressions are coerced to the right collation +create table test_part_coll_posix (a text) partition by range (a collate "POSIX"); +-- ok, collation is implicitly coerced +create table test_part_coll partition of test_part_coll_posix for values from ('a' collate "C") to ('g'); +-- ok +create table test_part_coll2 partition of test_part_coll_posix for values from ('g') to ('m'); +-- ok, collation is implicitly coerced +create table test_part_coll_cast partition of test_part_coll_posix for values from (name 'm' collate "C") to ('s'); +-- ok; partition collation silently overrides the default collation of type 'name' +create table test_part_coll_cast2 partition of test_part_coll_posix for values from (name 's') to ('z'); +drop table test_part_coll_posix; +-- Partition bound in describe output +\d+ part_b + Table "public.part_b" + Column | Type | Collation | Nullable | Default | Storage | Stats target | Description
+--------+---------+-----------+----------+---------+----------+--------------+------------- + a | text | | | | extended | | + b | integer | | not null | 1 | plain | | +Partition of: parted FOR VALUES IN ('b') +Partition constraint: ((a IS NOT NULL) AND (a = 'b'::text)) + +-- Both partition bound and partition key in describe output +\d+ part_c + Partitioned table "public.part_c" + Column | Type | Collation | Nullable | Default | Storage | Stats target | Description +--------+---------+-----------+----------+---------+----------+--------------+------------- + a | text | | | | extended | | + b | integer | | not null | 0 | plain | | +Partition of: parted FOR VALUES IN ('c') +Partition constraint: ((a IS NOT NULL) AND (a = 'c'::text)) +Partition key: RANGE (b) +Partitions: part_c_1_10 FOR VALUES FROM (1) TO (10) + +-- a level-2 partition's constraint will include the parent's expressions +\d+ part_c_1_10 + Table "public.part_c_1_10" + Column | Type | Collation | Nullable | Default | Storage | Stats target | Description +--------+---------+-----------+----------+---------+----------+--------------+------------- + a | text | | | | extended | | + b | integer | | not null | 0 | plain | | +Partition of: part_c FOR VALUES FROM (1) TO (10) +Partition constraint: ((a IS NOT NULL) AND (a = 'c'::text) AND (b IS NOT NULL) AND (b >= 1) AND (b < 10)) + +-- Show partition count in the parent's describe output +-- Tempted to include \d+ output listing partitions with bound info but +-- output could vary depending on the order in which partition oids are +-- returned. +\d parted + Partitioned table "public.parted" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + a | text | | | + b | integer | | not null | 0 +Partition key: LIST (a) +Number of partitions: 3 (Use \d+ to list them.) + +\d hash_parted + Partitioned table "public.hash_parted" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + a | integer | | | +Partition key: HASH (a) +Number of partitions: 3 (Use \d+ to list them.) 
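(An aside, not part of the regression script.) The comment above avoids a \d+ partition listing because its ordering can vary; the same information can be read deterministically from the catalogs with pg_inherits and pg_get_expr:
SELECT c.relname, pg_get_expr(c.relpartbound, c.oid) AS bound
FROM pg_inherits i
JOIN pg_class c ON c.oid = i.inhrelid
WHERE i.inhparent = 'parted'::regclass
ORDER BY c.relname;
-- part_a | FOR VALUES IN ('a')
-- part_b | FOR VALUES IN ('b')
-- part_c | FOR VALUES IN ('c')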
+ +-- check that we get the expected partition constraints +CREATE TABLE range_parted4 (a int, b int, c int) PARTITION BY RANGE (abs(a), abs(b), c); +CREATE TABLE unbounded_range_part PARTITION OF range_parted4 FOR VALUES FROM (MINVALUE, MINVALUE, MINVALUE) TO (MAXVALUE, MAXVALUE, MAXVALUE); +\d+ unbounded_range_part + Table "public.unbounded_range_part" + Column | Type | Collation | Nullable | Default | Storage | Stats target | Description +--------+---------+-----------+----------+---------+---------+--------------+------------- + a | integer | | | | plain | | + b | integer | | | | plain | | + c | integer | | | | plain | | +Partition of: range_parted4 FOR VALUES FROM (MINVALUE, MINVALUE, MINVALUE) TO (MAXVALUE, MAXVALUE, MAXVALUE) +Partition constraint: ((abs(a) IS NOT NULL) AND (abs(b) IS NOT NULL) AND (c IS NOT NULL)) + +DROP TABLE unbounded_range_part; +CREATE TABLE range_parted4_1 PARTITION OF range_parted4 FOR VALUES FROM (MINVALUE, MINVALUE, MINVALUE) TO (1, MAXVALUE, MAXVALUE); +\d+ range_parted4_1 + Table "public.range_parted4_1" + Column | Type | Collation | Nullable | Default | Storage | Stats target | Description +--------+---------+-----------+----------+---------+---------+--------------+------------- + a | integer | | | | plain | | + b | integer | | | | plain | | + c | integer | | | | plain | | +Partition of: range_parted4 FOR VALUES FROM (MINVALUE, MINVALUE, MINVALUE) TO (1, MAXVALUE, MAXVALUE) +Partition constraint: ((abs(a) IS NOT NULL) AND (abs(b) IS NOT NULL) AND (c IS NOT NULL) AND (abs(a) <= 1)) + +CREATE TABLE range_parted4_2 PARTITION OF range_parted4 FOR VALUES FROM (3, 4, 5) TO (6, 7, MAXVALUE); +\d+ range_parted4_2 + Table "public.range_parted4_2" + Column | Type | Collation | Nullable | Default | Storage | Stats target | Description +--------+---------+-----------+----------+---------+---------+--------------+------------- + a | integer | | | | plain | | + b | integer | | | | plain | | + c | integer | | | | plain | | +Partition of: range_parted4 FOR VALUES FROM (3, 4, 5) TO (6, 7, MAXVALUE) +Partition constraint: ((abs(a) IS NOT NULL) AND (abs(b) IS NOT NULL) AND (c IS NOT NULL) AND ((abs(a) > 3) OR ((abs(a) = 3) AND (abs(b) > 4)) OR ((abs(a) = 3) AND (abs(b) = 4) AND (c >= 5))) AND ((abs(a) < 6) OR ((abs(a) = 6) AND (abs(b) <= 7)))) + +CREATE TABLE range_parted4_3 PARTITION OF range_parted4 FOR VALUES FROM (6, 8, MINVALUE) TO (9, MAXVALUE, MAXVALUE); +\d+ range_parted4_3 + Table "public.range_parted4_3" + Column | Type | Collation | Nullable | Default | Storage | Stats target | Description +--------+---------+-----------+----------+---------+---------+--------------+------------- + a | integer | | | | plain | | + b | integer | | | | plain | | + c | integer | | | | plain | | +Partition of: range_parted4 FOR VALUES FROM (6, 8, MINVALUE) TO (9, MAXVALUE, MAXVALUE) +Partition constraint: ((abs(a) IS NOT NULL) AND (abs(b) IS NOT NULL) AND (c IS NOT NULL) AND ((abs(a) > 6) OR ((abs(a) = 6) AND (abs(b) >= 8))) AND (abs(a) <= 9)) + +DROP TABLE range_parted4; +-- user-defined operator class in partition key +CREATE FUNCTION my_int4_sort(int4,int4) RETURNS int LANGUAGE sql + AS $$ SELECT CASE WHEN $1 = $2 THEN 0 WHEN $1 > $2 THEN 1 ELSE -1 END; $$; +CREATE OPERATOR CLASS test_int4_ops FOR TYPE int4 USING btree AS + OPERATOR 1 < (int4,int4), OPERATOR 2 <= (int4,int4), + OPERATOR 3 = (int4,int4), OPERATOR 4 >= (int4,int4), + OPERATOR 5 > (int4,int4), FUNCTION 1 my_int4_sort(int4,int4); +CREATE TABLE partkey_t (a int4) PARTITION BY RANGE (a test_int4_ops); +CREATE TABLE 
partkey_t_1 PARTITION OF partkey_t FOR VALUES FROM (0) TO (1000); +INSERT INTO partkey_t VALUES (100); +INSERT INTO partkey_t VALUES (200); +-- cleanup +DROP TABLE parted, list_parted, range_parted, list_parted2, range_parted2, range_parted3; +DROP TABLE partkey_t, hash_parted, hash_parted2; +DROP OPERATOR CLASS test_int4_ops USING btree; +DROP FUNCTION my_int4_sort(int4,int4); +-- comments on partitioned tables' columns +CREATE TABLE parted_col_comment (a int, b text) PARTITION BY LIST (a); +COMMENT ON TABLE parted_col_comment IS 'Am partitioned table'; +COMMENT ON COLUMN parted_col_comment.a IS 'Partition key'; +SELECT obj_description('parted_col_comment'::regclass); + obj_description +---------------------- + Am partitioned table +(1 row) + +\d+ parted_col_comment + Partitioned table "public.parted_col_comment" + Column | Type | Collation | Nullable | Default | Storage | Stats target | Description +--------+---------+-----------+----------+---------+----------+--------------+--------------- + a | integer | | | | plain | | Partition key + b | text | | | | extended | | +Partition key: LIST (a) +Number of partitions: 0 + +DROP TABLE parted_col_comment; +-- list partitioning on array type column +CREATE TABLE arrlp (a int[]) PARTITION BY LIST (a); +CREATE TABLE arrlp12 PARTITION OF arrlp FOR VALUES IN ('{1}', '{2}'); +\d+ arrlp12 + Table "public.arrlp12" + Column | Type | Collation | Nullable | Default | Storage | Stats target | Description +--------+-----------+-----------+----------+---------+----------+--------------+------------- + a | integer[] | | | | extended | | +Partition of: arrlp FOR VALUES IN ('{1}', '{2}') +Partition constraint: ((a IS NOT NULL) AND ((a = '{1}'::integer[]) OR (a = '{2}'::integer[]))) + +DROP TABLE arrlp; +-- partition on boolean column +create table boolspart (a bool) partition by list (a); +create table boolspart_t partition of boolspart for values in (true); +create table boolspart_f partition of boolspart for values in (false); +\d+ boolspart + Partitioned table "public.boolspart" + Column | Type | Collation | Nullable | Default | Storage | Stats target | Description +--------+---------+-----------+----------+---------+---------+--------------+------------- + a | boolean | | | | plain | | +Partition key: LIST (a) +Partitions: boolspart_f FOR VALUES IN (false), + boolspart_t FOR VALUES IN (true) + +drop table boolspart; +-- partitions mixing temporary and permanent relations +create table perm_parted (a int) partition by list (a); +create temporary table temp_parted (a int) partition by list (a); +create table perm_part partition of temp_parted default; -- error +ERROR: cannot create a permanent relation as partition of temporary relation "temp_parted" +create temp table temp_part partition of perm_parted default; -- error +ERROR: cannot create a temporary relation as partition of permanent relation "perm_parted" +create temp table temp_part partition of temp_parted default; -- ok +drop table perm_parted cascade; +drop table temp_parted cascade; +-- check that adding partitions to a table while it is being used is prevented +create table tab_part_create (a int) partition by list (a); +create or replace function func_part_create() returns trigger + language plpgsql as $$ + begin + execute 'create table tab_part_create_1 partition of tab_part_create for values in (1)'; + return null; + end $$; +create trigger trig_part_create before insert on tab_part_create + for each statement execute procedure func_part_create(); +insert into tab_part_create values (1);
+ERROR: cannot CREATE TABLE .. PARTITION OF "tab_part_create" because it is being used by active queries in this session +CONTEXT: SQL statement "create table tab_part_create_1 partition of tab_part_create for values in (1)" +PL/pgSQL function func_part_create() line 3 at EXECUTE +drop table tab_part_create; +drop function func_part_create(); +-- test using a volatile expression as partition bound +create table volatile_partbound_test (partkey timestamp) partition by range (partkey); +create table volatile_partbound_test1 partition of volatile_partbound_test for values from (minvalue) to (current_timestamp); +create table volatile_partbound_test2 partition of volatile_partbound_test for values from (current_timestamp) to (maxvalue); +-- this should go into the partition volatile_partbound_test2 +insert into volatile_partbound_test values (current_timestamp); +select tableoid::regclass from volatile_partbound_test; + tableoid +-------------------------- + volatile_partbound_test2 +(1 row) + +drop table volatile_partbound_test; +-- test the case where a check constraint on the default partition allows +-- scanning it to be skipped when adding a new partition +create table defcheck (a int, b int) partition by list (b); +create table defcheck_def (a int, c int, b int); +alter table defcheck_def drop c; +alter table defcheck attach partition defcheck_def default; +alter table defcheck_def add check (b <= 0 and b is not null); +create table defcheck_1 partition of defcheck for values in (1, null); +-- test that complex default partition constraints are enforced correctly +insert into defcheck_def values (0, 0); +create table defcheck_0 partition of defcheck for values in (0); +ERROR: updated partition constraint for default partition "defcheck_def" would be violated by some row +drop table defcheck; +-- tests of column drop with partitioned tables and indexes using +-- predicates and expressions. +create table part_column_drop ( + useless_1 int, + id int, + useless_2 int, + d int, + b int, + useless_3 int +) partition by range (id); +alter table part_column_drop drop column useless_1; +alter table part_column_drop drop column useless_2; +alter table part_column_drop drop column useless_3; +create index part_column_drop_b_pred on part_column_drop(b) where b = 1; +create index part_column_drop_b_expr on part_column_drop((b = 1)); +create index part_column_drop_d_pred on part_column_drop(d) where d = 2; +create index part_column_drop_d_expr on part_column_drop((d = 2)); +create table part_column_drop_1_10 partition of + part_column_drop for values from (1) to (10); +\d part_column_drop + Partitioned table "public.part_column_drop" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + id | integer | | | + d | integer | | | + b | integer | | | +Partition key: RANGE (id) +Indexes: + "part_column_drop_b_expr" btree ((b = 1)) + "part_column_drop_b_pred" btree (b) WHERE b = 1 + "part_column_drop_d_expr" btree ((d = 2)) + "part_column_drop_d_pred" btree (d) WHERE d = 2 +Number of partitions: 1 (Use \d+ to list them.)
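(An aside, not part of the regression script.) The partition created above automatically receives its own copies of the four indexes, under generated names shown in the next \d; that index-to-index linkage is recorded in pg_inherits and can be inspected directly:
SELECT i.inhrelid::regclass AS partition_index, i.inhparent::regclass AS parent_index
FROM pg_inherits i
WHERE i.inhparent = 'part_column_drop_b_pred'::regclass;
-- part_column_drop_1_10_b_idx | part_column_drop_b_pred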
+ +\d part_column_drop_1_10 + Table "public.part_column_drop_1_10" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + id | integer | | | + d | integer | | | + b | integer | | | +Partition of: part_column_drop FOR VALUES FROM (1) TO (10) +Indexes: + "part_column_drop_1_10_b_idx" btree (b) WHERE b = 1 + "part_column_drop_1_10_d_idx" btree (d) WHERE d = 2 + "part_column_drop_1_10_expr_idx" btree ((b = 1)) + "part_column_drop_1_10_expr_idx1" btree ((d = 2)) + +drop table part_column_drop; diff --git a/src/test/regress/expected/privileges.out b/src/test/regress/expected/privileges.out index 03df567d50f..554aa5c4986 100644 --- a/src/test/regress/expected/privileges.out +++ b/src/test/regress/expected/privileges.out @@ -1679,7 +1679,8 @@ REFRESH MATERIALIZED VIEW sro_mv; ERROR: cannot fire deferred trigger within security-restricted operation CONTEXT: SQL function "mv_action" statement 1 BEGIN; SET CONSTRAINTS ALL IMMEDIATE; REFRESH MATERIALIZED VIEW sro_mv; COMMIT; -ERROR: must have admin option on role "regress_priv_group2" +ERROR: permission denied to grant role "regress_priv_group2" +DETAIL: Only roles with the ADMIN option on role "regress_priv_group2" may grant this role. CONTEXT: SQL function "unwanted_grant" statement 1 SQL statement "SELECT unwanted_grant()" PL/pgSQL function sro_trojan() line 1 at PERFORM @@ -1709,10 +1710,12 @@ CREATE FUNCTION dogrant_ok() RETURNS void LANGUAGE sql SECURITY DEFINER AS GRANT regress_priv_group2 TO regress_priv_user5; -- ok: had ADMIN OPTION SET ROLE regress_priv_group2; GRANT regress_priv_group2 TO regress_priv_user5; -- fails: SET ROLE suspended privilege -ERROR: must have admin option on role "regress_priv_group2" +ERROR: permission denied to grant role "regress_priv_group2" +DETAIL: Only roles with the ADMIN option on role "regress_priv_group2" may grant this role. SET SESSION AUTHORIZATION regress_priv_user1; GRANT regress_priv_group2 TO regress_priv_user5; -- fails: no ADMIN OPTION -ERROR: must have admin option on role "regress_priv_group2" +ERROR: permission denied to grant role "regress_priv_group2" +DETAIL: Only roles with the ADMIN option on role "regress_priv_group2" may grant this role. SELECT dogrant_ok(); -- ok: SECURITY DEFINER conveys ADMIN NOTICE: role "regress_priv_user5" is already a member of role "regress_priv_group2" dogrant_ok @@ -1722,10 +1725,12 @@ NOTICE: role "regress_priv_user5" is already a member of role "regress_priv_gro SET ROLE regress_priv_group2; GRANT regress_priv_group2 TO regress_priv_user5; -- fails: SET ROLE did not help -ERROR: must have admin option on role "regress_priv_group2" +ERROR: permission denied to grant role "regress_priv_group2" +DETAIL: Only roles with the ADMIN option on role "regress_priv_group2" may grant this role. SET SESSION AUTHORIZATION regress_priv_group2; GRANT regress_priv_group2 TO regress_priv_user5; -- fails: no self-admin -ERROR: must have admin option on role "regress_priv_group2" +ERROR: permission denied to grant role "regress_priv_group2" +DETAIL: Only roles with the ADMIN option on role "regress_priv_group2" may grant this role. 
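(Context, not part of the diff.) The ADMIN OPTION these messages refer to is what regress_priv_user1 received earlier, and it is conferred with an ordinary GRANT:
GRANT regress_priv_group2 TO regress_priv_user1 WITH ADMIN OPTION;
-- With the option, the grantee may grant the role onward; without it, or with it
-- suspended by SET ROLE, the "permission denied to grant role" error above fires.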
SET SESSION AUTHORIZATION regress_priv_user4; DROP FUNCTION dogrant_ok(); REVOKE regress_priv_group2 FROM regress_priv_user5; diff --git a/src/test/regress/expected/sequence_1.out b/src/test/regress/expected/sequence_1.out new file mode 100644 index 00000000000..3c1a7a325d8 --- /dev/null +++ b/src/test/regress/expected/sequence_1.out @@ -0,0 +1,841 @@ +-- +-- CREATE SEQUENCE +-- +-- various error cases +CREATE SEQUENCE sequence_testx INCREMENT BY 0; +ERROR: INCREMENT must not be zero +CREATE SEQUENCE sequence_testx INCREMENT BY -1 MINVALUE 20; +ERROR: MINVALUE (20) must be less than MAXVALUE (-1) +CREATE SEQUENCE sequence_testx INCREMENT BY 1 MAXVALUE -20; +ERROR: MINVALUE (1) must be less than MAXVALUE (-20) +CREATE SEQUENCE sequence_testx INCREMENT BY -1 START 10; +ERROR: START value (10) cannot be greater than MAXVALUE (-1) +CREATE SEQUENCE sequence_testx INCREMENT BY 1 START -10; +ERROR: START value (-10) cannot be less than MINVALUE (1) +CREATE SEQUENCE sequence_testx CACHE 0; +ERROR: CACHE (0) must be greater than zero +-- OWNED BY errors +CREATE SEQUENCE sequence_testx OWNED BY nobody; -- nonsense word +ERROR: invalid OWNED BY option +HINT: Specify OWNED BY table.column or OWNED BY NONE. +CREATE SEQUENCE sequence_testx OWNED BY pg_class_oid_index.oid; -- not a table +ERROR: sequence cannot be owned by relation "pg_class_oid_index" +DETAIL: This operation is not supported for indexes. +CREATE SEQUENCE sequence_testx OWNED BY pg_class.relname; -- not same schema +ERROR: sequence must be in same schema as table it is linked to +CREATE TABLE sequence_test_table (a int); +CREATE SEQUENCE sequence_testx OWNED BY sequence_test_table.b; -- wrong column +ERROR: column "b" of relation "sequence_test_table" does not exist +DROP TABLE sequence_test_table; +-- sequence data types +CREATE SEQUENCE sequence_test5 AS integer; +CREATE SEQUENCE sequence_test6 AS smallint; +CREATE SEQUENCE sequence_test7 AS bigint; +CREATE SEQUENCE sequence_test8 AS integer MAXVALUE 100000; +CREATE SEQUENCE sequence_test9 AS integer INCREMENT BY -1; +CREATE SEQUENCE sequence_test10 AS integer MINVALUE -100000 START 1; +CREATE SEQUENCE sequence_test11 AS smallint; +CREATE SEQUENCE sequence_test12 AS smallint INCREMENT -1; +CREATE SEQUENCE sequence_test13 AS smallint MINVALUE -32768; +CREATE SEQUENCE sequence_test14 AS smallint MAXVALUE 32767 INCREMENT -1; +CREATE SEQUENCE sequence_testx AS text; +ERROR: sequence type must be smallint, integer, or bigint +CREATE SEQUENCE sequence_testx AS nosuchtype; +ERROR: type "nosuchtype" does not exist +LINE 1: CREATE SEQUENCE sequence_testx AS nosuchtype; + ^ +CREATE SEQUENCE sequence_testx AS smallint MAXVALUE 100000; +ERROR: MAXVALUE (100000) is out of range for sequence data type smallint +CREATE SEQUENCE sequence_testx AS smallint MINVALUE -100000; +ERROR: MINVALUE (-100000) is out of range for sequence data type smallint +ALTER SEQUENCE sequence_test5 AS smallint; -- success, max will be adjusted +ALTER SEQUENCE sequence_test8 AS smallint; -- fail, max has to be adjusted +ERROR: MAXVALUE (100000) is out of range for sequence data type smallint +ALTER SEQUENCE sequence_test8 AS smallint MAXVALUE 20000; -- ok now +ALTER SEQUENCE sequence_test9 AS smallint; -- success, min will be adjusted +ALTER SEQUENCE sequence_test10 AS smallint; -- fail, min has to be adjusted +ERROR: MINVALUE (-100000) is out of range for sequence data type smallint +ALTER SEQUENCE sequence_test10 AS smallint MINVALUE -20000; -- ok now +ALTER SEQUENCE sequence_test11 AS int; -- max will be adjusted +ALTER 
SEQUENCE sequence_test12 AS int; -- min will be adjusted +ALTER SEQUENCE sequence_test13 AS int; -- min and max will be adjusted +ALTER SEQUENCE sequence_test14 AS int; -- min and max will be adjusted +--- +--- test creation of SERIAL column +--- +CREATE TABLE serialTest1 (f1 text, f2 serial); +INSERT INTO serialTest1 VALUES ('foo'); +INSERT INTO serialTest1 VALUES ('bar'); +INSERT INTO serialTest1 VALUES ('force', 100); +INSERT INTO serialTest1 VALUES ('wrong', NULL); +ERROR: null value in column "f2" of relation "serialtest1" violates not-null constraint +DETAIL: Failing row contains (wrong, null). +SELECT * FROM serialTest1; + f1 | f2 +-------+----- + foo | 1 + bar | 2 + force | 100 +(3 rows) + +SELECT pg_get_serial_sequence('serialTest1', 'f2'); + pg_get_serial_sequence +--------------------------- + public.serialtest1_f2_seq +(1 row) + +-- test smallserial / bigserial +CREATE TABLE serialTest2 (f1 text, f2 serial, f3 smallserial, f4 serial2, + f5 bigserial, f6 serial8); +INSERT INTO serialTest2 (f1) + VALUES ('test_defaults'); +INSERT INTO serialTest2 (f1, f2, f3, f4, f5, f6) + VALUES ('test_max_vals', 2147483647, 32767, 32767, 9223372036854775807, + 9223372036854775807), + ('test_min_vals', -2147483648, -32768, -32768, -9223372036854775808, + -9223372036854775808); +-- All these INSERTs should fail: +INSERT INTO serialTest2 (f1, f3) + VALUES ('bogus', -32769); +ERROR: smallint out of range +INSERT INTO serialTest2 (f1, f4) + VALUES ('bogus', -32769); +ERROR: smallint out of range +INSERT INTO serialTest2 (f1, f3) + VALUES ('bogus', 32768); +ERROR: smallint out of range +INSERT INTO serialTest2 (f1, f4) + VALUES ('bogus', 32768); +ERROR: smallint out of range +INSERT INTO serialTest2 (f1, f5) + VALUES ('bogus', -9223372036854775809); +ERROR: bigint out of range +INSERT INTO serialTest2 (f1, f6) + VALUES ('bogus', -9223372036854775809); +ERROR: bigint out of range +INSERT INTO serialTest2 (f1, f5) + VALUES ('bogus', 9223372036854775808); +ERROR: bigint out of range +INSERT INTO serialTest2 (f1, f6) + VALUES ('bogus', 9223372036854775808); +ERROR: bigint out of range +SELECT * FROM serialTest2 ORDER BY f2 ASC; + f1 | f2 | f3 | f4 | f5 | f6 +---------------+-------------+--------+--------+----------------------+---------------------- + test_min_vals | -2147483648 | -32768 | -32768 | -9223372036854775808 | -9223372036854775808 + test_defaults | 1 | 1 | 1 | 1 | 1 + test_max_vals | 2147483647 | 32767 | 32767 | 9223372036854775807 | 9223372036854775807 +(3 rows) + +SELECT nextval('serialTest2_f2_seq'); + nextval +--------- + 2 +(1 row) + +SELECT nextval('serialTest2_f3_seq'); + nextval +--------- + 2 +(1 row) + +SELECT nextval('serialTest2_f4_seq'); + nextval +--------- + 2 +(1 row) + +SELECT nextval('serialTest2_f5_seq'); + nextval +--------- + 2 +(1 row) + +SELECT nextval('serialTest2_f6_seq'); + nextval +--------- + 2 +(1 row) + +-- basic sequence operations using both text and oid references +CREATE SEQUENCE sequence_test; +CREATE SEQUENCE IF NOT EXISTS sequence_test; +NOTICE: relation "sequence_test" already exists, skipping +SELECT nextval('sequence_test'::text); + nextval +--------- + 1 +(1 row) + +SELECT nextval('sequence_test'::regclass); + nextval +--------- + 2 +(1 row) + +SELECT currval('sequence_test'::text); + currval +--------- + 2 +(1 row) + +SELECT currval('sequence_test'::regclass); + currval +--------- + 2 +(1 row) + +SELECT setval('sequence_test'::text, 32); + setval +-------- + 32 +(1 row) + +SELECT nextval('sequence_test'::regclass); + nextval +--------- + 33 +(1 row) + 
+SELECT setval('sequence_test'::text, 99, false); + setval +-------- + 99 +(1 row) + +SELECT nextval('sequence_test'::regclass); + nextval +--------- + 99 +(1 row) + +SELECT setval('sequence_test'::regclass, 32); + setval +-------- + 32 +(1 row) + +SELECT nextval('sequence_test'::text); + nextval +--------- + 33 +(1 row) + +SELECT setval('sequence_test'::regclass, 99, false); + setval +-------- + 99 +(1 row) + +SELECT nextval('sequence_test'::text); + nextval +--------- + 99 +(1 row) + +DISCARD SEQUENCES; +SELECT currval('sequence_test'::regclass); +ERROR: currval of sequence "sequence_test" is not yet defined in this session +DROP SEQUENCE sequence_test; +-- renaming sequences +CREATE SEQUENCE foo_seq; +ALTER TABLE foo_seq RENAME TO foo_seq_new; +SELECT * FROM foo_seq_new; + last_value | log_cnt | is_called +------------+---------+----------- + 1 | 0 | f +(1 row) + +SELECT nextval('foo_seq_new'); + nextval +--------- + 1 +(1 row) + +SELECT nextval('foo_seq_new'); + nextval +--------- + 2 +(1 row) + +-- log_cnt can be higher if there is a checkpoint just at the right +-- time, so just test for the expected range +SELECT last_value, log_cnt IN (31, 32) AS log_cnt_ok, is_called FROM foo_seq_new; + last_value | log_cnt_ok | is_called +------------+------------+----------- + 2 | f | t +(1 row) + +DROP SEQUENCE foo_seq_new; +-- renaming serial sequences +ALTER TABLE serialtest1_f2_seq RENAME TO serialtest1_f2_foo; +INSERT INTO serialTest1 VALUES ('more'); +SELECT * FROM serialTest1; + f1 | f2 +-------+----- + foo | 1 + bar | 2 + force | 100 + more | 3 +(4 rows) + +-- +-- Check dependencies of serial and ordinary sequences +-- +CREATE TEMP SEQUENCE myseq2; +CREATE TEMP SEQUENCE myseq3; +CREATE TEMP TABLE t1 ( + f1 serial, + f2 int DEFAULT nextval('myseq2'), + f3 int DEFAULT nextval('myseq3'::text) +); +-- Both drops should fail, but with different error messages: +DROP SEQUENCE t1_f1_seq; +ERROR: cannot drop sequence t1_f1_seq because other objects depend on it +DETAIL: default value for column f1 of table t1 depends on sequence t1_f1_seq +HINT: Use DROP ... CASCADE to drop the dependent objects too. +DROP SEQUENCE myseq2; +ERROR: cannot drop sequence myseq2 because other objects depend on it +DETAIL: default value for column f2 of table t1 depends on sequence myseq2 +HINT: Use DROP ... CASCADE to drop the dependent objects too. 
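+-- Illustrative aside: the dependencies that block both drops above are
+-- recorded in pg_depend; assuming t1 and myseq2 still exist, for example:
+--   SELECT deptype FROM pg_depend WHERE refobjid = 'myseq2'::regclass;
+--   -- 'n' (normal): t1.f2's column default depends on the sequence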
+-- This however will work: +DROP SEQUENCE myseq3; +DROP TABLE t1; +-- Fails because no longer existent: +DROP SEQUENCE t1_f1_seq; +ERROR: sequence "t1_f1_seq" does not exist +-- Now OK: +DROP SEQUENCE myseq2; +-- +-- Alter sequence +-- +ALTER SEQUENCE IF EXISTS sequence_test2 RESTART WITH 24 + INCREMENT BY 4 MAXVALUE 36 MINVALUE 5 CYCLE; +NOTICE: relation "sequence_test2" does not exist, skipping +ALTER SEQUENCE serialTest1 CYCLE; -- error, not a sequence +ERROR: "serialtest1" is not a sequence +CREATE SEQUENCE sequence_test2 START WITH 32; +CREATE SEQUENCE sequence_test4 INCREMENT BY -1; +SELECT nextval('sequence_test2'); + nextval +--------- + 32 +(1 row) + +SELECT nextval('sequence_test4'); + nextval +--------- + -1 +(1 row) + +ALTER SEQUENCE sequence_test2 RESTART; +SELECT nextval('sequence_test2'); + nextval +--------- + 32 +(1 row) + +ALTER SEQUENCE sequence_test2 RESTART WITH 0; -- error +ERROR: RESTART value (0) cannot be less than MINVALUE (1) +ALTER SEQUENCE sequence_test4 RESTART WITH 40; -- error +ERROR: RESTART value (40) cannot be greater than MAXVALUE (-1) +-- test CYCLE and NO CYCLE +ALTER SEQUENCE sequence_test2 RESTART WITH 24 + INCREMENT BY 4 MAXVALUE 36 MINVALUE 5 CYCLE; +SELECT nextval('sequence_test2'); + nextval +--------- + 24 +(1 row) + +SELECT nextval('sequence_test2'); + nextval +--------- + 28 +(1 row) + +SELECT nextval('sequence_test2'); + nextval +--------- + 32 +(1 row) + +SELECT nextval('sequence_test2'); + nextval +--------- + 36 +(1 row) + +SELECT nextval('sequence_test2'); -- cycled + nextval +--------- + 5 +(1 row) + +ALTER SEQUENCE sequence_test2 RESTART WITH 24 + NO CYCLE; +SELECT nextval('sequence_test2'); + nextval +--------- + 24 +(1 row) + +SELECT nextval('sequence_test2'); + nextval +--------- + 28 +(1 row) + +SELECT nextval('sequence_test2'); + nextval +--------- + 32 +(1 row) + +SELECT nextval('sequence_test2'); + nextval +--------- + 36 +(1 row) + +SELECT nextval('sequence_test2'); -- error +ERROR: nextval: reached maximum value of sequence "sequence_test2" (36) +ALTER SEQUENCE sequence_test2 RESTART WITH -24 START WITH -24 + INCREMENT BY -4 MINVALUE -36 MAXVALUE -5 CYCLE; +SELECT nextval('sequence_test2'); + nextval +--------- + -24 +(1 row) + +SELECT nextval('sequence_test2'); + nextval +--------- + -28 +(1 row) + +SELECT nextval('sequence_test2'); + nextval +--------- + -32 +(1 row) + +SELECT nextval('sequence_test2'); + nextval +--------- + -36 +(1 row) + +SELECT nextval('sequence_test2'); -- cycled + nextval +--------- + -5 +(1 row) + +ALTER SEQUENCE sequence_test2 RESTART WITH -24 + NO CYCLE; +SELECT nextval('sequence_test2'); + nextval +--------- + -24 +(1 row) + +SELECT nextval('sequence_test2'); + nextval +--------- + -28 +(1 row) + +SELECT nextval('sequence_test2'); + nextval +--------- + -32 +(1 row) + +SELECT nextval('sequence_test2'); + nextval +--------- + -36 +(1 row) + +SELECT nextval('sequence_test2'); -- error +ERROR: nextval: reached minimum value of sequence "sequence_test2" (-36) +-- reset +ALTER SEQUENCE IF EXISTS sequence_test2 RESTART WITH 32 START WITH 32 + INCREMENT BY 4 MAXVALUE 36 MINVALUE 5 CYCLE; +SELECT setval('sequence_test2', -100); -- error +ERROR: setval: value -100 is out of bounds for sequence "sequence_test2" (5..36) +SELECT setval('sequence_test2', 100); -- error +ERROR: setval: value 100 is out of bounds for sequence "sequence_test2" (5..36) +SELECT setval('sequence_test2', 5); + setval +-------- + 5 +(1 row) + +CREATE SEQUENCE sequence_test3; -- not read from, to test is_called +-- Information schema 
+SELECT * FROM information_schema.sequences + WHERE sequence_name ~ ANY(ARRAY['sequence_test', 'serialtest']) + ORDER BY sequence_name ASC; + sequence_catalog | sequence_schema | sequence_name | data_type | numeric_precision | numeric_precision_radix | numeric_scale | start_value | minimum_value | maximum_value | increment | cycle_option +------------------+-----------------+--------------------+-----------+-------------------+-------------------------+---------------+-------------+----------------------+---------------------+-----------+-------------- + regression | public | sequence_test10 | smallint | 16 | 2 | 0 | 1 | -20000 | 32767 | 1 | NO + regression | public | sequence_test11 | integer | 32 | 2 | 0 | 1 | 1 | 2147483647 | 1 | NO + regression | public | sequence_test12 | integer | 32 | 2 | 0 | -1 | -2147483648 | -1 | -1 | NO + regression | public | sequence_test13 | integer | 32 | 2 | 0 | -32768 | -2147483648 | 2147483647 | 1 | NO + regression | public | sequence_test14 | integer | 32 | 2 | 0 | 32767 | -2147483648 | 2147483647 | -1 | NO + regression | public | sequence_test2 | bigint | 64 | 2 | 0 | 32 | 5 | 36 | 4 | YES + regression | public | sequence_test3 | bigint | 64 | 2 | 0 | 1 | 1 | 9223372036854775807 | 1 | NO + regression | public | sequence_test4 | bigint | 64 | 2 | 0 | -1 | -9223372036854775808 | -1 | -1 | NO + regression | public | sequence_test5 | smallint | 16 | 2 | 0 | 1 | 1 | 32767 | 1 | NO + regression | public | sequence_test6 | smallint | 16 | 2 | 0 | 1 | 1 | 32767 | 1 | NO + regression | public | sequence_test7 | bigint | 64 | 2 | 0 | 1 | 1 | 9223372036854775807 | 1 | NO + regression | public | sequence_test8 | smallint | 16 | 2 | 0 | 1 | 1 | 20000 | 1 | NO + regression | public | sequence_test9 | smallint | 16 | 2 | 0 | -1 | -32768 | -1 | -1 | NO + regression | public | serialtest1_f2_foo | integer | 32 | 2 | 0 | 1 | 1 | 2147483647 | 1 | NO + regression | public | serialtest2_f2_seq | integer | 32 | 2 | 0 | 1 | 1 | 2147483647 | 1 | NO + regression | public | serialtest2_f3_seq | smallint | 16 | 2 | 0 | 1 | 1 | 32767 | 1 | NO + regression | public | serialtest2_f4_seq | smallint | 16 | 2 | 0 | 1 | 1 | 32767 | 1 | NO + regression | public | serialtest2_f5_seq | bigint | 64 | 2 | 0 | 1 | 1 | 9223372036854775807 | 1 | NO + regression | public | serialtest2_f6_seq | bigint | 64 | 2 | 0 | 1 | 1 | 9223372036854775807 | 1 | NO +(19 rows) + +SELECT schemaname, sequencename, start_value, min_value, max_value, increment_by, cycle, cache_size, last_value +FROM pg_sequences +WHERE sequencename ~ ANY(ARRAY['sequence_test', 'serialtest']) + ORDER BY sequencename ASC; + schemaname | sequencename | start_value | min_value | max_value | increment_by | cycle | cache_size | last_value +------------+--------------------+-------------+----------------------+---------------------+--------------+-------+------------+------------ + public | sequence_test10 | 1 | -20000 | 32767 | 1 | f | 1 | + public | sequence_test11 | 1 | 1 | 2147483647 | 1 | f | 1 | + public | sequence_test12 | -1 | -2147483648 | -1 | -1 | f | 1 | + public | sequence_test13 | -32768 | -2147483648 | 2147483647 | 1 | f | 1 | + public | sequence_test14 | 32767 | -2147483648 | 2147483647 | -1 | f | 1 | + public | sequence_test2 | 32 | 5 | 36 | 4 | t | 1 | 5 + public | sequence_test3 | 1 | 1 | 9223372036854775807 | 1 | f | 1 | + public | sequence_test4 | -1 | -9223372036854775808 | -1 | -1 | f | 1 | -1 + public | sequence_test5 | 1 | 1 | 32767 | 1 | f | 1 | + public | sequence_test6 | 1 | 1 | 32767 | 1 | f | 1 | + public | 
sequence_test7 | 1 | 1 | 9223372036854775807 | 1 | f | 1 | + public | sequence_test8 | 1 | 1 | 20000 | 1 | f | 1 | + public | sequence_test9 | -1 | -32768 | -1 | -1 | f | 1 | + public | serialtest1_f2_foo | 1 | 1 | 2147483647 | 1 | f | 1 | 3 + public | serialtest2_f2_seq | 1 | 1 | 2147483647 | 1 | f | 1 | 2 + public | serialtest2_f3_seq | 1 | 1 | 32767 | 1 | f | 1 | 2 + public | serialtest2_f4_seq | 1 | 1 | 32767 | 1 | f | 1 | 2 + public | serialtest2_f5_seq | 1 | 1 | 9223372036854775807 | 1 | f | 1 | 2 + public | serialtest2_f6_seq | 1 | 1 | 9223372036854775807 | 1 | f | 1 | 2 +(19 rows) + +SELECT * FROM pg_sequence_parameters('sequence_test4'::regclass); + start_value | minimum_value | maximum_value | increment | cycle_option | cache_size | data_type +-------------+----------------------+---------------+-----------+--------------+------------+----------- + -1 | -9223372036854775808 | -1 | -1 | f | 1 | 20 +(1 row) + +\d sequence_test4 + Sequence "public.sequence_test4" + Type | Start | Minimum | Maximum | Increment | Cycles? | Cache +--------+-------+----------------------+---------+-----------+---------+------- + bigint | -1 | -9223372036854775808 | -1 | -1 | no | 1 + +\d serialtest2_f2_seq + Sequence "public.serialtest2_f2_seq" + Type | Start | Minimum | Maximum | Increment | Cycles? | Cache +---------+-------+---------+------------+-----------+---------+------- + integer | 1 | 1 | 2147483647 | 1 | no | 1 +Owned by: public.serialtest2.f2 + +-- Test comments +COMMENT ON SEQUENCE asdf IS 'won''t work'; +ERROR: relation "asdf" does not exist +COMMENT ON SEQUENCE sequence_test2 IS 'will work'; +COMMENT ON SEQUENCE sequence_test2 IS NULL; +-- Test lastval() +CREATE SEQUENCE seq; +SELECT nextval('seq'); + nextval +--------- + 1 +(1 row) + +SELECT lastval(); + lastval +--------- + 1 +(1 row) + +SELECT setval('seq', 99); + setval +-------- + 99 +(1 row) + +SELECT lastval(); + lastval +--------- + 99 +(1 row) + +DISCARD SEQUENCES; +SELECT lastval(); +ERROR: lastval is not yet defined in this session +CREATE SEQUENCE seq2; +SELECT nextval('seq2'); + nextval +--------- + 1 +(1 row) + +SELECT lastval(); + lastval +--------- + 1 +(1 row) + +DROP SEQUENCE seq2; +-- should fail +SELECT lastval(); +ERROR: lastval is not yet defined in this session +-- unlogged sequences +-- (more tests in src/test/recovery/) +CREATE UNLOGGED SEQUENCE sequence_test_unlogged; +ALTER SEQUENCE sequence_test_unlogged SET LOGGED; +\d sequence_test_unlogged + Sequence "public.sequence_test_unlogged" + Type | Start | Minimum | Maximum | Increment | Cycles? | Cache +--------+-------+---------+---------------------+-----------+---------+------- + bigint | 1 | 1 | 9223372036854775807 | 1 | no | 1 + +ALTER SEQUENCE sequence_test_unlogged SET UNLOGGED; +\d sequence_test_unlogged + Unlogged sequence "public.sequence_test_unlogged" + Type | Start | Minimum | Maximum | Increment | Cycles? 
| Cache +--------+-------+---------+---------------------+-----------+---------+------- + bigint | 1 | 1 | 9223372036854775807 | 1 | no | 1 + +DROP SEQUENCE sequence_test_unlogged; +-- Test sequences in read-only transactions +CREATE TEMPORARY SEQUENCE sequence_test_temp1; +START TRANSACTION READ ONLY; +SELECT nextval('sequence_test_temp1'); -- ok + nextval +--------- + 1 +(1 row) + +SELECT nextval('sequence_test2'); -- error +ERROR: cannot execute nextval() in a read-only transaction +ROLLBACK; +START TRANSACTION READ ONLY; +SELECT setval('sequence_test_temp1', 1); -- ok + setval +-------- + 1 +(1 row) + +SELECT setval('sequence_test2', 1); -- error +ERROR: cannot execute setval() in a read-only transaction +ROLLBACK; +-- privileges tests +CREATE USER regress_seq_user; +-- nextval +BEGIN; +SET LOCAL SESSION AUTHORIZATION regress_seq_user; +CREATE SEQUENCE seq3; +REVOKE ALL ON seq3 FROM regress_seq_user; +GRANT SELECT ON seq3 TO regress_seq_user; +SELECT nextval('seq3'); +ERROR: permission denied for sequence seq3 +ROLLBACK; +BEGIN; +SET LOCAL SESSION AUTHORIZATION regress_seq_user; +CREATE SEQUENCE seq3; +REVOKE ALL ON seq3 FROM regress_seq_user; +GRANT UPDATE ON seq3 TO regress_seq_user; +SELECT nextval('seq3'); + nextval +--------- + 1 +(1 row) + +ROLLBACK; +BEGIN; +SET LOCAL SESSION AUTHORIZATION regress_seq_user; +CREATE SEQUENCE seq3; +REVOKE ALL ON seq3 FROM regress_seq_user; +GRANT USAGE ON seq3 TO regress_seq_user; +SELECT nextval('seq3'); + nextval +--------- + 1 +(1 row) + +ROLLBACK; +-- currval +BEGIN; +SET LOCAL SESSION AUTHORIZATION regress_seq_user; +CREATE SEQUENCE seq3; +SELECT nextval('seq3'); + nextval +--------- + 1 +(1 row) + +REVOKE ALL ON seq3 FROM regress_seq_user; +GRANT SELECT ON seq3 TO regress_seq_user; +SELECT currval('seq3'); + currval +--------- + 1 +(1 row) + +ROLLBACK; +BEGIN; +SET LOCAL SESSION AUTHORIZATION regress_seq_user; +CREATE SEQUENCE seq3; +SELECT nextval('seq3'); + nextval +--------- + 1 +(1 row) + +REVOKE ALL ON seq3 FROM regress_seq_user; +GRANT UPDATE ON seq3 TO regress_seq_user; +SELECT currval('seq3'); +ERROR: permission denied for sequence seq3 +ROLLBACK; +BEGIN; +SET LOCAL SESSION AUTHORIZATION regress_seq_user; +CREATE SEQUENCE seq3; +SELECT nextval('seq3'); + nextval +--------- + 1 +(1 row) + +REVOKE ALL ON seq3 FROM regress_seq_user; +GRANT USAGE ON seq3 TO regress_seq_user; +SELECT currval('seq3'); + currval +--------- + 1 +(1 row) + +ROLLBACK; +-- lastval +BEGIN; +SET LOCAL SESSION AUTHORIZATION regress_seq_user; +CREATE SEQUENCE seq3; +SELECT nextval('seq3'); + nextval +--------- + 1 +(1 row) + +REVOKE ALL ON seq3 FROM regress_seq_user; +GRANT SELECT ON seq3 TO regress_seq_user; +SELECT lastval(); + lastval +--------- + 1 +(1 row) + +ROLLBACK; +BEGIN; +SET LOCAL SESSION AUTHORIZATION regress_seq_user; +CREATE SEQUENCE seq3; +SELECT nextval('seq3'); + nextval +--------- + 1 +(1 row) + +REVOKE ALL ON seq3 FROM regress_seq_user; +GRANT UPDATE ON seq3 TO regress_seq_user; +SELECT lastval(); +ERROR: permission denied for sequence seq3 +ROLLBACK; +BEGIN; +SET LOCAL SESSION AUTHORIZATION regress_seq_user; +CREATE SEQUENCE seq3; +SELECT nextval('seq3'); + nextval +--------- + 1 +(1 row) + +REVOKE ALL ON seq3 FROM regress_seq_user; +GRANT USAGE ON seq3 TO regress_seq_user; +SELECT lastval(); + lastval +--------- + 1 +(1 row) + +ROLLBACK; +-- setval +BEGIN; +SET LOCAL SESSION AUTHORIZATION regress_seq_user; +CREATE SEQUENCE seq3; +REVOKE ALL ON seq3 FROM regress_seq_user; +SAVEPOINT save; +SELECT setval('seq3', 5); +ERROR: permission denied 
for sequence seq3 +ROLLBACK TO save; +GRANT UPDATE ON seq3 TO regress_seq_user; +SELECT setval('seq3', 5); + setval +-------- + 5 +(1 row) + +SELECT nextval('seq3'); + nextval +--------- + 6 +(1 row) + +ROLLBACK; +-- ALTER SEQUENCE +BEGIN; +SET LOCAL SESSION AUTHORIZATION regress_seq_user; +ALTER SEQUENCE sequence_test2 START WITH 1; +ERROR: must be owner of sequence sequence_test2 +ROLLBACK; +-- Sequences should get wiped out as well: +DROP TABLE serialTest1, serialTest2; +-- Make sure sequences are gone: +SELECT * FROM information_schema.sequences WHERE sequence_name IN + ('sequence_test2', 'serialtest2_f2_seq', 'serialtest2_f3_seq', + 'serialtest2_f4_seq', 'serialtest2_f5_seq', 'serialtest2_f6_seq') + ORDER BY sequence_name ASC; + sequence_catalog | sequence_schema | sequence_name | data_type | numeric_precision | numeric_precision_radix | numeric_scale | start_value | minimum_value | maximum_value | increment | cycle_option +------------------+-----------------+----------------+-----------+-------------------+-------------------------+---------------+-------------+---------------+---------------+-----------+-------------- + regression | public | sequence_test2 | bigint | 64 | 2 | 0 | 32 | 5 | 36 | 4 | YES +(1 row) + +DROP USER regress_seq_user; +DROP SEQUENCE seq; +-- cache tests +CREATE SEQUENCE test_seq1 CACHE 10; +SELECT nextval('test_seq1'); + nextval +--------- + 1 +(1 row) + +SELECT nextval('test_seq1'); + nextval +--------- + 2 +(1 row) + +SELECT nextval('test_seq1'); + nextval +--------- + 3 +(1 row) + +DROP SEQUENCE test_seq1; diff --git a/src/test/regress/expected/spgist.out b/src/test/regress/expected/spgist.out index 2e911285600..c371e04a795 100644 --- a/src/test/regress/expected/spgist.out +++ b/src/test/regress/expected/spgist.out @@ -94,3 +94,6 @@ select box(point(i,j)) from generate_series(1,100,5) i, generate_series(1,10,5) j; -- leave this table around, to help in testing dump/restore +-- NEON: In Neon unlogged tables are wiped away on node restart +-- so drop the table to keep Neon tests clean. +drop table spgist_unlogged_tbl; diff --git a/src/test/regress/expected/sysviews.out b/src/test/regress/expected/sysviews.out index 579b861d84f..231283c5636 100644 --- a/src/test/regress/expected/sysviews.out +++ b/src/test/regress/expected/sysviews.out @@ -118,7 +118,9 @@ select name, setting from pg_settings where name like 'enable%'; enable_hashjoin | on enable_incremental_sort | on enable_indexonlyscan | on + enable_indexonlyscan_prefetch | on enable_indexscan | on + enable_indexscan_prefetch | on enable_material | on enable_memoize | on enable_mergejoin | on @@ -129,9 +131,10 @@ select name, setting from pg_settings where name like 'enable%'; enable_partitionwise_aggregate | off enable_partitionwise_join | off enable_seqscan | on + enable_seqscan_prefetch | on enable_sort | on enable_tidscan | on -(20 rows) +(23 rows) -- Test that the pg_timezone_names and pg_timezone_abbrevs views are -- more-or-less working. 
We can't test their contents in any great detail diff --git a/src/test/regress/expected/tablespace_1.out b/src/test/regress/expected/tablespace_1.out new file mode 100644 index 00000000000..f4c7e75060e --- /dev/null +++ b/src/test/regress/expected/tablespace_1.out @@ -0,0 +1,976 @@ +-- relative tablespace locations are not allowed +CREATE TABLESPACE regress_tblspace LOCATION 'relative'; -- fail +ERROR: tablespace location must be an absolute path +-- empty tablespace locations are not usually allowed +CREATE TABLESPACE regress_tblspace LOCATION ''; -- fail +ERROR: tablespace location must be an absolute path +-- as a special developer-only option to allow us to use tablespaces +-- with streaming replication on the same server, an empty location +-- can be allowed as a way to say that the tablespace should be created +-- as a directory in pg_tblspc, rather than being a symlink +SET allow_in_place_tablespaces = true; +-- create a tablespace using WITH clause +CREATE TABLESPACE regress_tblspacewith LOCATION '' WITH (some_nonexistent_parameter = true); -- fail +ERROR: unrecognized parameter "some_nonexistent_parameter" +CREATE TABLESPACE regress_tblspacewith LOCATION '' WITH (random_page_cost = 3.0); -- ok +-- check to see the parameter was used +SELECT spcoptions FROM pg_tablespace WHERE spcname = 'regress_tblspacewith'; + spcoptions +------------------------ + {random_page_cost=3.0} +(1 row) + +-- drop the tablespace so we can re-use the location +DROP TABLESPACE regress_tblspacewith; +-- create a tablespace we can use +CREATE TABLESPACE regress_tblspace LOCATION ''; +-- This returns a relative path as of an effect of allow_in_place_tablespaces, +-- masking the tablespace OID used in the path name. +SELECT regexp_replace(pg_tablespace_location(oid), '(pg_tblspc)/(\d+)', '\1/NNN') + FROM pg_tablespace WHERE spcname = 'regress_tblspace'; + regexp_replace +---------------- + pg_tblspc/NNN +(1 row) + +-- try setting and resetting some properties for the new tablespace +ALTER TABLESPACE regress_tblspace SET (random_page_cost = 1.0, seq_page_cost = 1.1); +ALTER TABLESPACE regress_tblspace SET (some_nonexistent_parameter = true); -- fail +ERROR: unrecognized parameter "some_nonexistent_parameter" +ALTER TABLESPACE regress_tblspace RESET (random_page_cost = 2.0); -- fail +ERROR: RESET must not include values for parameters +ALTER TABLESPACE regress_tblspace RESET (random_page_cost, effective_io_concurrency); -- ok +-- REINDEX (TABLESPACE) +-- catalogs and system tablespaces +-- system catalog, fail +REINDEX (TABLESPACE regress_tblspace) TABLE pg_am; +ERROR: cannot move system relation "pg_am_name_index" +REINDEX (TABLESPACE regress_tblspace) TABLE CONCURRENTLY pg_am; +ERROR: cannot reindex system catalogs concurrently +-- shared catalog, fail +REINDEX (TABLESPACE regress_tblspace) TABLE pg_authid; +ERROR: cannot move system relation "pg_authid_rolname_index" +REINDEX (TABLESPACE regress_tblspace) TABLE CONCURRENTLY pg_authid; +ERROR: cannot reindex system catalogs concurrently +-- toast relations, fail +REINDEX (TABLESPACE regress_tblspace) INDEX pg_toast.pg_toast_1260_index; +ERROR: cannot move system relation "pg_toast_1260_index" +REINDEX (TABLESPACE regress_tblspace) INDEX CONCURRENTLY pg_toast.pg_toast_1260_index; +ERROR: cannot reindex system catalogs concurrently +REINDEX (TABLESPACE regress_tblspace) TABLE pg_toast.pg_toast_1260; +ERROR: cannot move system relation "pg_toast_1260_index" +REINDEX (TABLESPACE regress_tblspace) TABLE CONCURRENTLY pg_toast.pg_toast_1260; +ERROR: cannot reindex 
system catalogs concurrently +-- system catalog, fail +REINDEX (TABLESPACE pg_global) TABLE pg_authid; +ERROR: cannot move system relation "pg_authid_rolname_index" +REINDEX (TABLESPACE pg_global) TABLE CONCURRENTLY pg_authid; +ERROR: cannot reindex system catalogs concurrently +-- table with toast relation +CREATE TABLE regress_tblspace_test_tbl (num1 bigint, num2 double precision, t text); +INSERT INTO regress_tblspace_test_tbl (num1, num2, t) + SELECT round(random()*100), random(), 'text' + FROM generate_series(1, 10) s(i); +CREATE INDEX regress_tblspace_test_tbl_idx ON regress_tblspace_test_tbl (num1); +-- move to global tablespace, fail +REINDEX (TABLESPACE pg_global) INDEX regress_tblspace_test_tbl_idx; +ERROR: only shared relations can be placed in pg_global tablespace +REINDEX (TABLESPACE pg_global) INDEX CONCURRENTLY regress_tblspace_test_tbl_idx; +ERROR: cannot move non-shared relation to tablespace "pg_global" +-- check transactional behavior of REINDEX (TABLESPACE) +BEGIN; +REINDEX (TABLESPACE regress_tblspace) INDEX regress_tblspace_test_tbl_idx; +REINDEX (TABLESPACE regress_tblspace) TABLE regress_tblspace_test_tbl; +ROLLBACK; +-- no relation moved to the new tablespace +SELECT c.relname FROM pg_class c, pg_tablespace s + WHERE c.reltablespace = s.oid AND s.spcname = 'regress_tblspace'; + relname +--------- +(0 rows) + +-- check that all indexes are moved to a new tablespace with different +-- relfilenode. +-- Save first the existing relfilenode for the toast and main relations. +SELECT relfilenode as main_filenode FROM pg_class + WHERE relname = 'regress_tblspace_test_tbl_idx' \gset +SELECT relfilenode as toast_filenode FROM pg_class + WHERE oid = + (SELECT i.indexrelid + FROM pg_class c, + pg_index i + WHERE i.indrelid = c.reltoastrelid AND + c.relname = 'regress_tblspace_test_tbl') \gset +REINDEX (TABLESPACE regress_tblspace) TABLE regress_tblspace_test_tbl; +SELECT c.relname FROM pg_class c, pg_tablespace s + WHERE c.reltablespace = s.oid AND s.spcname = 'regress_tblspace' + ORDER BY c.relname; + relname +------------------------------- + regress_tblspace_test_tbl_idx +(1 row) + +ALTER TABLE regress_tblspace_test_tbl SET TABLESPACE regress_tblspace; +ALTER TABLE regress_tblspace_test_tbl SET TABLESPACE pg_default; +SELECT c.relname FROM pg_class c, pg_tablespace s + WHERE c.reltablespace = s.oid AND s.spcname = 'regress_tblspace' + ORDER BY c.relname; + relname +------------------------------- + regress_tblspace_test_tbl_idx +(1 row) + +-- Move back to the default tablespace. 
+ALTER INDEX regress_tblspace_test_tbl_idx SET TABLESPACE pg_default; +SELECT c.relname FROM pg_class c, pg_tablespace s + WHERE c.reltablespace = s.oid AND s.spcname = 'regress_tblspace' + ORDER BY c.relname; + relname +--------- +(0 rows) + +REINDEX (TABLESPACE regress_tblspace, CONCURRENTLY) TABLE regress_tblspace_test_tbl; +SELECT c.relname FROM pg_class c, pg_tablespace s + WHERE c.reltablespace = s.oid AND s.spcname = 'regress_tblspace' + ORDER BY c.relname; + relname +------------------------------- + regress_tblspace_test_tbl_idx +(1 row) + +SELECT relfilenode = :main_filenode AS main_same FROM pg_class + WHERE relname = 'regress_tblspace_test_tbl_idx'; + main_same +----------- + f +(1 row) + +SELECT relfilenode = :toast_filenode as toast_same FROM pg_class + WHERE oid = + (SELECT i.indexrelid + FROM pg_class c, + pg_index i + WHERE i.indrelid = c.reltoastrelid AND + c.relname = 'regress_tblspace_test_tbl'); + toast_same +------------ + f +(1 row) + +DROP TABLE regress_tblspace_test_tbl; +-- REINDEX (TABLESPACE) with partitions +-- Create a partition tree and check the set of relations reindexed +-- with their new tablespace. +CREATE TABLE tbspace_reindex_part (c1 int, c2 int) PARTITION BY RANGE (c1); +CREATE TABLE tbspace_reindex_part_0 PARTITION OF tbspace_reindex_part + FOR VALUES FROM (0) TO (10) PARTITION BY list (c2); +CREATE TABLE tbspace_reindex_part_0_1 PARTITION OF tbspace_reindex_part_0 + FOR VALUES IN (1); +CREATE TABLE tbspace_reindex_part_0_2 PARTITION OF tbspace_reindex_part_0 + FOR VALUES IN (2); +-- This partitioned table will have no partitions. +CREATE TABLE tbspace_reindex_part_10 PARTITION OF tbspace_reindex_part + FOR VALUES FROM (10) TO (20) PARTITION BY list (c2); +-- Create some partitioned indexes +CREATE INDEX tbspace_reindex_part_index ON ONLY tbspace_reindex_part (c1); +CREATE INDEX tbspace_reindex_part_index_0 ON ONLY tbspace_reindex_part_0 (c1); +ALTER INDEX tbspace_reindex_part_index ATTACH PARTITION tbspace_reindex_part_index_0; +-- This partitioned index will have no partitions. +CREATE INDEX tbspace_reindex_part_index_10 ON ONLY tbspace_reindex_part_10 (c1); +ALTER INDEX tbspace_reindex_part_index ATTACH PARTITION tbspace_reindex_part_index_10; +CREATE INDEX tbspace_reindex_part_index_0_1 ON ONLY tbspace_reindex_part_0_1 (c1); +ALTER INDEX tbspace_reindex_part_index_0 ATTACH PARTITION tbspace_reindex_part_index_0_1; +CREATE INDEX tbspace_reindex_part_index_0_2 ON ONLY tbspace_reindex_part_0_2 (c1); +ALTER INDEX tbspace_reindex_part_index_0 ATTACH PARTITION tbspace_reindex_part_index_0_2; +SELECT relid, parentrelid, level FROM pg_partition_tree('tbspace_reindex_part_index') + ORDER BY relid, level; + relid | parentrelid | level +--------------------------------+------------------------------+------- + tbspace_reindex_part_index | | 0 + tbspace_reindex_part_index_0 | tbspace_reindex_part_index | 1 + tbspace_reindex_part_index_10 | tbspace_reindex_part_index | 1 + tbspace_reindex_part_index_0_1 | tbspace_reindex_part_index_0 | 2 + tbspace_reindex_part_index_0_2 | tbspace_reindex_part_index_0 | 2 +(5 rows) + +-- Track the original tablespace, relfilenode and OID of each index +-- in the tree. +CREATE TEMP TABLE reindex_temp_before AS + SELECT oid, relname, relfilenode, reltablespace + FROM pg_class + WHERE relname ~ 'tbspace_reindex_part_index'; +REINDEX (TABLESPACE regress_tblspace, CONCURRENTLY) TABLE tbspace_reindex_part; +-- REINDEX CONCURRENTLY changes the OID of the old relation, hence a check +-- based on the relation name below. 
+SELECT b.relname, + CASE WHEN a.relfilenode = b.relfilenode THEN 'relfilenode is unchanged' + ELSE 'relfilenode has changed' END AS filenode, + CASE WHEN a.reltablespace = b.reltablespace THEN 'reltablespace is unchanged' + ELSE 'reltablespace has changed' END AS tbspace + FROM reindex_temp_before b JOIN pg_class a ON b.relname = a.relname + ORDER BY 1; + relname | filenode | tbspace +--------------------------------+--------------------------+---------------------------- + tbspace_reindex_part_index | relfilenode is unchanged | reltablespace is unchanged + tbspace_reindex_part_index_0 | relfilenode is unchanged | reltablespace is unchanged + tbspace_reindex_part_index_0_1 | relfilenode has changed | reltablespace has changed + tbspace_reindex_part_index_0_2 | relfilenode has changed | reltablespace has changed + tbspace_reindex_part_index_10 | relfilenode is unchanged | reltablespace is unchanged +(5 rows) + +DROP TABLE tbspace_reindex_part; +-- create a schema we can use +CREATE SCHEMA testschema; +-- try a table +CREATE TABLE testschema.foo (i int) TABLESPACE regress_tblspace; +SELECT relname, spcname FROM pg_catalog.pg_tablespace t, pg_catalog.pg_class c + where c.reltablespace = t.oid AND c.relname = 'foo'; + relname | spcname +---------+------------------ + foo | regress_tblspace +(1 row) + +INSERT INTO testschema.foo VALUES(1); +INSERT INTO testschema.foo VALUES(2); +-- tables from dynamic sources +CREATE TABLE testschema.asselect TABLESPACE regress_tblspace AS SELECT 1; +SELECT relname, spcname FROM pg_catalog.pg_tablespace t, pg_catalog.pg_class c + where c.reltablespace = t.oid AND c.relname = 'asselect'; + relname | spcname +----------+------------------ + asselect | regress_tblspace +(1 row) + +PREPARE selectsource(int) AS SELECT $1; +CREATE TABLE testschema.asexecute TABLESPACE regress_tblspace + AS EXECUTE selectsource(2); +SELECT relname, spcname FROM pg_catalog.pg_tablespace t, pg_catalog.pg_class c + where c.reltablespace = t.oid AND c.relname = 'asexecute'; + relname | spcname +-----------+------------------ + asexecute | regress_tblspace +(1 row) + +-- index +CREATE INDEX foo_idx on testschema.foo(i) TABLESPACE regress_tblspace; +SELECT relname, spcname FROM pg_catalog.pg_tablespace t, pg_catalog.pg_class c + where c.reltablespace = t.oid AND c.relname = 'foo_idx'; + relname | spcname +---------+------------------ + foo_idx | regress_tblspace +(1 row) + +-- check \d output +\d testschema.foo + Table "testschema.foo" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + i | integer | | | +Indexes: + "foo_idx" btree (i), tablespace "regress_tblspace" +Tablespace: "regress_tblspace" + +\d testschema.foo_idx + Index "testschema.foo_idx" + Column | Type | Key? 
| Definition +--------+---------+------+------------ + i | integer | yes | i +btree, for table "testschema.foo" +Tablespace: "regress_tblspace" + +-- +-- partitioned table +-- +CREATE TABLE testschema.part (a int) PARTITION BY LIST (a); +SET default_tablespace TO pg_global; +CREATE TABLE testschema.part_1 PARTITION OF testschema.part FOR VALUES IN (1); +ERROR: only shared relations can be placed in pg_global tablespace +RESET default_tablespace; +CREATE TABLE testschema.part_1 PARTITION OF testschema.part FOR VALUES IN (1); +SET default_tablespace TO regress_tblspace; +CREATE TABLE testschema.part_2 PARTITION OF testschema.part FOR VALUES IN (2); +SET default_tablespace TO pg_global; +CREATE TABLE testschema.part_3 PARTITION OF testschema.part FOR VALUES IN (3); +ERROR: only shared relations can be placed in pg_global tablespace +ALTER TABLE testschema.part SET TABLESPACE regress_tblspace; +CREATE TABLE testschema.part_3 PARTITION OF testschema.part FOR VALUES IN (3); +CREATE TABLE testschema.part_4 PARTITION OF testschema.part FOR VALUES IN (4) + TABLESPACE pg_default; +CREATE TABLE testschema.part_56 PARTITION OF testschema.part FOR VALUES IN (5, 6) + PARTITION BY LIST (a); +ALTER TABLE testschema.part SET TABLESPACE pg_default; +CREATE TABLE testschema.part_78 PARTITION OF testschema.part FOR VALUES IN (7, 8) + PARTITION BY LIST (a); +ERROR: only shared relations can be placed in pg_global tablespace +CREATE TABLE testschema.part_910 PARTITION OF testschema.part FOR VALUES IN (9, 10) + PARTITION BY LIST (a) TABLESPACE regress_tblspace; +RESET default_tablespace; +CREATE TABLE testschema.part_78 PARTITION OF testschema.part FOR VALUES IN (7, 8) + PARTITION BY LIST (a); +SELECT relname, spcname FROM pg_catalog.pg_class c + JOIN pg_catalog.pg_namespace n ON (c.relnamespace = n.oid) + LEFT JOIN pg_catalog.pg_tablespace t ON c.reltablespace = t.oid + where c.relname LIKE 'part%' AND n.nspname = 'testschema' order by relname; + relname | spcname +----------+------------------ + part | + part_1 | + part_2 | regress_tblspace + part_3 | regress_tblspace + part_4 | + part_56 | regress_tblspace + part_78 | + part_910 | regress_tblspace +(8 rows) + +RESET default_tablespace; +DROP TABLE testschema.part; +-- partitioned index +CREATE TABLE testschema.part (a int) PARTITION BY LIST (a); +CREATE TABLE testschema.part1 PARTITION OF testschema.part FOR VALUES IN (1); +CREATE INDEX part_a_idx ON testschema.part (a) TABLESPACE regress_tblspace; +CREATE TABLE testschema.part2 PARTITION OF testschema.part FOR VALUES IN (2); +SELECT relname, spcname FROM pg_catalog.pg_tablespace t, pg_catalog.pg_class c + where c.reltablespace = t.oid AND c.relname LIKE 'part%_idx'; + relname | spcname +-------------+------------------ + part1_a_idx | regress_tblspace + part2_a_idx | regress_tblspace + part_a_idx | regress_tblspace +(3 rows) + +\d testschema.part + Partitioned table "testschema.part" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + a | integer | | | +Partition key: LIST (a) +Indexes: + "part_a_idx" btree (a), tablespace "regress_tblspace" +Number of partitions: 2 (Use \d+ to list them.) 
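+
+-- Illustrative aside (hypothetical partition "part3"): an index partition
+-- created later also inherits the partitioned index's tablespace, just as
+-- part2_a_idx did above:
+--   CREATE TABLE testschema.part3 PARTITION OF testschema.part FOR VALUES IN (3);
+--   -- \d testschema.part3 would then show
+--   --   "part3_a_idx" btree (a), tablespace "regress_tblspace"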
+ +\d+ testschema.part + Partitioned table "testschema.part" + Column | Type | Collation | Nullable | Default | Storage | Stats target | Description +--------+---------+-----------+----------+---------+---------+--------------+------------- + a | integer | | | | plain | | +Partition key: LIST (a) +Indexes: + "part_a_idx" btree (a), tablespace "regress_tblspace" +Partitions: testschema.part1 FOR VALUES IN (1), + testschema.part2 FOR VALUES IN (2) + +\d testschema.part1 + Table "testschema.part1" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + a | integer | | | +Partition of: testschema.part FOR VALUES IN (1) +Indexes: + "part1_a_idx" btree (a), tablespace "regress_tblspace" + +\d+ testschema.part1 + Table "testschema.part1" + Column | Type | Collation | Nullable | Default | Storage | Stats target | Description +--------+---------+-----------+----------+---------+---------+--------------+------------- + a | integer | | | | plain | | +Partition of: testschema.part FOR VALUES IN (1) +Partition constraint: ((a IS NOT NULL) AND (a = 1)) +Indexes: + "part1_a_idx" btree (a), tablespace "regress_tblspace" + +\d testschema.part_a_idx +Partitioned index "testschema.part_a_idx" + Column | Type | Key? | Definition +--------+---------+------+------------ + a | integer | yes | a +btree, for table "testschema.part" +Number of partitions: 2 (Use \d+ to list them.) +Tablespace: "regress_tblspace" + +\d+ testschema.part_a_idx + Partitioned index "testschema.part_a_idx" + Column | Type | Key? | Definition | Storage | Stats target +--------+---------+------+------------+---------+-------------- + a | integer | yes | a | plain | +btree, for table "testschema.part" +Partitions: testschema.part1_a_idx, + testschema.part2_a_idx +Tablespace: "regress_tblspace" + +-- partitioned rels cannot specify the default tablespace. 
These fail: +CREATE TABLE testschema.dflt (a int PRIMARY KEY) PARTITION BY LIST (a) TABLESPACE pg_default; +ERROR: cannot specify default tablespace for partitioned relations +CREATE TABLE testschema.dflt (a int PRIMARY KEY USING INDEX TABLESPACE pg_default) PARTITION BY LIST (a); +ERROR: cannot specify default tablespace for partitioned relations +SET default_tablespace TO 'pg_default'; +CREATE TABLE testschema.dflt (a int PRIMARY KEY) PARTITION BY LIST (a) TABLESPACE regress_tblspace; +ERROR: cannot specify default tablespace for partitioned relations +CREATE TABLE testschema.dflt (a int PRIMARY KEY USING INDEX TABLESPACE regress_tblspace) PARTITION BY LIST (a); +ERROR: cannot specify default tablespace for partitioned relations +-- but these work: +CREATE TABLE testschema.dflt (a int PRIMARY KEY USING INDEX TABLESPACE regress_tblspace) PARTITION BY LIST (a) TABLESPACE regress_tblspace; +SET default_tablespace TO ''; +CREATE TABLE testschema.dflt2 (a int PRIMARY KEY) PARTITION BY LIST (a); +DROP TABLE testschema.dflt, testschema.dflt2; +-- check that default_tablespace doesn't affect ALTER TABLE index rebuilds +CREATE TABLE testschema.test_default_tab(id bigint) TABLESPACE regress_tblspace; +INSERT INTO testschema.test_default_tab VALUES (1); +CREATE INDEX test_index1 on testschema.test_default_tab (id); +CREATE INDEX test_index2 on testschema.test_default_tab (id) TABLESPACE regress_tblspace; +ALTER TABLE testschema.test_default_tab ADD CONSTRAINT test_index3 PRIMARY KEY (id); +ALTER TABLE testschema.test_default_tab ADD CONSTRAINT test_index4 UNIQUE (id) USING INDEX TABLESPACE regress_tblspace; +\d testschema.test_index1 + Index "testschema.test_index1" + Column | Type | Key? | Definition +--------+--------+------+------------ + id | bigint | yes | id +btree, for table "testschema.test_default_tab" + +\d testschema.test_index2 + Index "testschema.test_index2" + Column | Type | Key? | Definition +--------+--------+------+------------ + id | bigint | yes | id +btree, for table "testschema.test_default_tab" +Tablespace: "regress_tblspace" + +\d testschema.test_index3 + Index "testschema.test_index3" + Column | Type | Key? | Definition +--------+--------+------+------------ + id | bigint | yes | id +primary key, btree, for table "testschema.test_default_tab" + +\d testschema.test_index4 + Index "testschema.test_index4" + Column | Type | Key? | Definition +--------+--------+------+------------ + id | bigint | yes | id +unique, btree, for table "testschema.test_default_tab" +Tablespace: "regress_tblspace" + +-- use a custom tablespace for default_tablespace +SET default_tablespace TO regress_tblspace; +-- tablespace should not change if no rewrite +ALTER TABLE testschema.test_default_tab ALTER id TYPE bigint; +\d testschema.test_index1 + Index "testschema.test_index1" + Column | Type | Key? | Definition +--------+--------+------+------------ + id | bigint | yes | id +btree, for table "testschema.test_default_tab" + +\d testschema.test_index2 + Index "testschema.test_index2" + Column | Type | Key? | Definition +--------+--------+------+------------ + id | bigint | yes | id +btree, for table "testschema.test_default_tab" +Tablespace: "regress_tblspace" + +\d testschema.test_index3 + Index "testschema.test_index3" + Column | Type | Key? | Definition +--------+--------+------+------------ + id | bigint | yes | id +primary key, btree, for table "testschema.test_default_tab" + +\d testschema.test_index4 + Index "testschema.test_index4" + Column | Type | Key? 
| Definition +--------+--------+------+------------ + id | bigint | yes | id +unique, btree, for table "testschema.test_default_tab" +Tablespace: "regress_tblspace" + +SELECT * FROM testschema.test_default_tab; + id +---- + 1 +(1 row) + +-- tablespace should not change even if there is an index rewrite +ALTER TABLE testschema.test_default_tab ALTER id TYPE int; +\d testschema.test_index1 + Index "testschema.test_index1" + Column | Type | Key? | Definition +--------+---------+------+------------ + id | integer | yes | id +btree, for table "testschema.test_default_tab" + +\d testschema.test_index2 + Index "testschema.test_index2" + Column | Type | Key? | Definition +--------+---------+------+------------ + id | integer | yes | id +btree, for table "testschema.test_default_tab" +Tablespace: "regress_tblspace" + +\d testschema.test_index3 + Index "testschema.test_index3" + Column | Type | Key? | Definition +--------+---------+------+------------ + id | integer | yes | id +primary key, btree, for table "testschema.test_default_tab" + +\d testschema.test_index4 + Index "testschema.test_index4" + Column | Type | Key? | Definition +--------+---------+------+------------ + id | integer | yes | id +unique, btree, for table "testschema.test_default_tab" +Tablespace: "regress_tblspace" + +SELECT * FROM testschema.test_default_tab; + id +---- + 1 +(1 row) + +-- now use the default tablespace for default_tablespace +SET default_tablespace TO ''; +-- tablespace should not change if no rewrite +ALTER TABLE testschema.test_default_tab ALTER id TYPE int; +\d testschema.test_index1 + Index "testschema.test_index1" + Column | Type | Key? | Definition +--------+---------+------+------------ + id | integer | yes | id +btree, for table "testschema.test_default_tab" + +\d testschema.test_index2 + Index "testschema.test_index2" + Column | Type | Key? | Definition +--------+---------+------+------------ + id | integer | yes | id +btree, for table "testschema.test_default_tab" +Tablespace: "regress_tblspace" + +\d testschema.test_index3 + Index "testschema.test_index3" + Column | Type | Key? | Definition +--------+---------+------+------------ + id | integer | yes | id +primary key, btree, for table "testschema.test_default_tab" + +\d testschema.test_index4 + Index "testschema.test_index4" + Column | Type | Key? | Definition +--------+---------+------+------------ + id | integer | yes | id +unique, btree, for table "testschema.test_default_tab" +Tablespace: "regress_tblspace" + +-- tablespace should not change even if there is an index rewrite +ALTER TABLE testschema.test_default_tab ALTER id TYPE bigint; +\d testschema.test_index1 + Index "testschema.test_index1" + Column | Type | Key? | Definition +--------+--------+------+------------ + id | bigint | yes | id +btree, for table "testschema.test_default_tab" + +\d testschema.test_index2 + Index "testschema.test_index2" + Column | Type | Key? | Definition +--------+--------+------+------------ + id | bigint | yes | id +btree, for table "testschema.test_default_tab" +Tablespace: "regress_tblspace" + +\d testschema.test_index3 + Index "testschema.test_index3" + Column | Type | Key? | Definition +--------+--------+------+------------ + id | bigint | yes | id +primary key, btree, for table "testschema.test_default_tab" + +\d testschema.test_index4 + Index "testschema.test_index4" + Column | Type | Key? 
| Definition +--------+--------+------+------------ + id | bigint | yes | id +unique, btree, for table "testschema.test_default_tab" +Tablespace: "regress_tblspace" + +DROP TABLE testschema.test_default_tab; +-- check that default_tablespace doesn't affect ALTER TABLE index rebuilds +-- (this time with a partitioned table) +CREATE TABLE testschema.test_default_tab_p(id bigint, val bigint) + PARTITION BY LIST (id) TABLESPACE regress_tblspace; +CREATE TABLE testschema.test_default_tab_p1 PARTITION OF testschema.test_default_tab_p + FOR VALUES IN (1); +INSERT INTO testschema.test_default_tab_p VALUES (1); +CREATE INDEX test_index1 on testschema.test_default_tab_p (val); +CREATE INDEX test_index2 on testschema.test_default_tab_p (val) TABLESPACE regress_tblspace; +ALTER TABLE testschema.test_default_tab_p ADD CONSTRAINT test_index3 PRIMARY KEY (id); +ALTER TABLE testschema.test_default_tab_p ADD CONSTRAINT test_index4 UNIQUE (id) USING INDEX TABLESPACE regress_tblspace; +\d testschema.test_index1 +Partitioned index "testschema.test_index1" + Column | Type | Key? | Definition +--------+--------+------+------------ + val | bigint | yes | val +btree, for table "testschema.test_default_tab_p" +Number of partitions: 1 (Use \d+ to list them.) + +\d testschema.test_index2 +Partitioned index "testschema.test_index2" + Column | Type | Key? | Definition +--------+--------+------+------------ + val | bigint | yes | val +btree, for table "testschema.test_default_tab_p" +Number of partitions: 1 (Use \d+ to list them.) +Tablespace: "regress_tblspace" + +\d testschema.test_index3 +Partitioned index "testschema.test_index3" + Column | Type | Key? | Definition +--------+--------+------+------------ + id | bigint | yes | id +primary key, btree, for table "testschema.test_default_tab_p" +Number of partitions: 1 (Use \d+ to list them.) + +\d testschema.test_index4 +Partitioned index "testschema.test_index4" + Column | Type | Key? | Definition +--------+--------+------+------------ + id | bigint | yes | id +unique, btree, for table "testschema.test_default_tab_p" +Number of partitions: 1 (Use \d+ to list them.) +Tablespace: "regress_tblspace" + +-- use a custom tablespace for default_tablespace +SET default_tablespace TO regress_tblspace; +-- tablespace should not change if no rewrite +ALTER TABLE testschema.test_default_tab_p ALTER val TYPE bigint; +\d testschema.test_index1 +Partitioned index "testschema.test_index1" + Column | Type | Key? | Definition +--------+--------+------+------------ + val | bigint | yes | val +btree, for table "testschema.test_default_tab_p" +Number of partitions: 1 (Use \d+ to list them.) + +\d testschema.test_index2 +Partitioned index "testschema.test_index2" + Column | Type | Key? | Definition +--------+--------+------+------------ + val | bigint | yes | val +btree, for table "testschema.test_default_tab_p" +Number of partitions: 1 (Use \d+ to list them.) +Tablespace: "regress_tblspace" + +\d testschema.test_index3 +Partitioned index "testschema.test_index3" + Column | Type | Key? | Definition +--------+--------+------+------------ + id | bigint | yes | id +primary key, btree, for table "testschema.test_default_tab_p" +Number of partitions: 1 (Use \d+ to list them.) + +\d testschema.test_index4 +Partitioned index "testschema.test_index4" + Column | Type | Key? | Definition +--------+--------+------+------------ + id | bigint | yes | id +unique, btree, for table "testschema.test_default_tab_p" +Number of partitions: 1 (Use \d+ to list them.) 
+Tablespace: "regress_tblspace" + +SELECT * FROM testschema.test_default_tab_p; + id | val +----+----- + 1 | +(1 row) + +-- tablespace should not change even if there is an index rewrite +ALTER TABLE testschema.test_default_tab_p ALTER val TYPE int; +\d testschema.test_index1 +Partitioned index "testschema.test_index1" + Column | Type | Key? | Definition +--------+---------+------+------------ + val | integer | yes | val +btree, for table "testschema.test_default_tab_p" +Number of partitions: 1 (Use \d+ to list them.) + +\d testschema.test_index2 +Partitioned index "testschema.test_index2" + Column | Type | Key? | Definition +--------+---------+------+------------ + val | integer | yes | val +btree, for table "testschema.test_default_tab_p" +Number of partitions: 1 (Use \d+ to list them.) +Tablespace: "regress_tblspace" + +\d testschema.test_index3 +Partitioned index "testschema.test_index3" + Column | Type | Key? | Definition +--------+--------+------+------------ + id | bigint | yes | id +primary key, btree, for table "testschema.test_default_tab_p" +Number of partitions: 1 (Use \d+ to list them.) + +\d testschema.test_index4 +Partitioned index "testschema.test_index4" + Column | Type | Key? | Definition +--------+--------+------+------------ + id | bigint | yes | id +unique, btree, for table "testschema.test_default_tab_p" +Number of partitions: 1 (Use \d+ to list them.) +Tablespace: "regress_tblspace" + +SELECT * FROM testschema.test_default_tab_p; + id | val +----+----- + 1 | +(1 row) + +-- now use the default tablespace for default_tablespace +SET default_tablespace TO ''; +-- tablespace should not change if no rewrite +ALTER TABLE testschema.test_default_tab_p ALTER val TYPE int; +\d testschema.test_index1 +Partitioned index "testschema.test_index1" + Column | Type | Key? | Definition +--------+---------+------+------------ + val | integer | yes | val +btree, for table "testschema.test_default_tab_p" +Number of partitions: 1 (Use \d+ to list them.) + +\d testschema.test_index2 +Partitioned index "testschema.test_index2" + Column | Type | Key? | Definition +--------+---------+------+------------ + val | integer | yes | val +btree, for table "testschema.test_default_tab_p" +Number of partitions: 1 (Use \d+ to list them.) +Tablespace: "regress_tblspace" + +\d testschema.test_index3 +Partitioned index "testschema.test_index3" + Column | Type | Key? | Definition +--------+--------+------+------------ + id | bigint | yes | id +primary key, btree, for table "testschema.test_default_tab_p" +Number of partitions: 1 (Use \d+ to list them.) + +\d testschema.test_index4 +Partitioned index "testschema.test_index4" + Column | Type | Key? | Definition +--------+--------+------+------------ + id | bigint | yes | id +unique, btree, for table "testschema.test_default_tab_p" +Number of partitions: 1 (Use \d+ to list them.) +Tablespace: "regress_tblspace" + +-- tablespace should not change even if there is an index rewrite +ALTER TABLE testschema.test_default_tab_p ALTER val TYPE bigint; +\d testschema.test_index1 +Partitioned index "testschema.test_index1" + Column | Type | Key? | Definition +--------+--------+------+------------ + val | bigint | yes | val +btree, for table "testschema.test_default_tab_p" +Number of partitions: 1 (Use \d+ to list them.) + +\d testschema.test_index2 +Partitioned index "testschema.test_index2" + Column | Type | Key? 
| Definition +--------+--------+------+------------ + val | bigint | yes | val +btree, for table "testschema.test_default_tab_p" +Number of partitions: 1 (Use \d+ to list them.) +Tablespace: "regress_tblspace" + +\d testschema.test_index3 +Partitioned index "testschema.test_index3" + Column | Type | Key? | Definition +--------+--------+------+------------ + id | bigint | yes | id +primary key, btree, for table "testschema.test_default_tab_p" +Number of partitions: 1 (Use \d+ to list them.) + +\d testschema.test_index4 +Partitioned index "testschema.test_index4" + Column | Type | Key? | Definition +--------+--------+------+------------ + id | bigint | yes | id +unique, btree, for table "testschema.test_default_tab_p" +Number of partitions: 1 (Use \d+ to list them.) +Tablespace: "regress_tblspace" + +DROP TABLE testschema.test_default_tab_p; +-- check that default_tablespace affects index additions in ALTER TABLE +CREATE TABLE testschema.test_tab(id int) TABLESPACE regress_tblspace; +INSERT INTO testschema.test_tab VALUES (1); +SET default_tablespace TO regress_tblspace; +ALTER TABLE testschema.test_tab ADD CONSTRAINT test_tab_unique UNIQUE (id); +SET default_tablespace TO ''; +ALTER TABLE testschema.test_tab ADD CONSTRAINT test_tab_pkey PRIMARY KEY (id); +\d testschema.test_tab_unique + Index "testschema.test_tab_unique" + Column | Type | Key? | Definition +--------+---------+------+------------ + id | integer | yes | id +unique, btree, for table "testschema.test_tab" +Tablespace: "regress_tblspace" + +\d testschema.test_tab_pkey + Index "testschema.test_tab_pkey" + Column | Type | Key? | Definition +--------+---------+------+------------ + id | integer | yes | id +primary key, btree, for table "testschema.test_tab" + +SELECT * FROM testschema.test_tab; + id +---- + 1 +(1 row) + +DROP TABLE testschema.test_tab; +-- check that default_tablespace is handled correctly by multi-command +-- ALTER TABLE that includes a tablespace-preserving rewrite +CREATE TABLE testschema.test_tab(a int, b int, c int); +SET default_tablespace TO regress_tblspace; +ALTER TABLE testschema.test_tab ADD CONSTRAINT test_tab_unique UNIQUE (a); +CREATE INDEX test_tab_a_idx ON testschema.test_tab (a); +SET default_tablespace TO ''; +CREATE INDEX test_tab_b_idx ON testschema.test_tab (b); +\d testschema.test_tab_unique + Index "testschema.test_tab_unique" + Column | Type | Key? | Definition +--------+---------+------+------------ + a | integer | yes | a +unique, btree, for table "testschema.test_tab" +Tablespace: "regress_tblspace" + +\d testschema.test_tab_a_idx + Index "testschema.test_tab_a_idx" + Column | Type | Key? | Definition +--------+---------+------+------------ + a | integer | yes | a +btree, for table "testschema.test_tab" +Tablespace: "regress_tblspace" + +\d testschema.test_tab_b_idx + Index "testschema.test_tab_b_idx" + Column | Type | Key? | Definition +--------+---------+------+------------ + b | integer | yes | b +btree, for table "testschema.test_tab" + +ALTER TABLE testschema.test_tab ALTER b TYPE bigint, ADD UNIQUE (c); +\d testschema.test_tab_unique + Index "testschema.test_tab_unique" + Column | Type | Key? | Definition +--------+---------+------+------------ + a | integer | yes | a +unique, btree, for table "testschema.test_tab" +Tablespace: "regress_tblspace" + +\d testschema.test_tab_a_idx + Index "testschema.test_tab_a_idx" + Column | Type | Key? 
| Definition +--------+---------+------+------------ + a | integer | yes | a +btree, for table "testschema.test_tab" +Tablespace: "regress_tblspace" + +\d testschema.test_tab_b_idx + Index "testschema.test_tab_b_idx" + Column | Type | Key? | Definition +--------+--------+------+------------ + b | bigint | yes | b +btree, for table "testschema.test_tab" + +DROP TABLE testschema.test_tab; +-- let's try moving a table from one place to another +CREATE TABLE testschema.atable AS VALUES (1), (2); +CREATE UNIQUE INDEX anindex ON testschema.atable(column1); +ALTER TABLE testschema.atable SET TABLESPACE regress_tblspace; +ALTER INDEX testschema.anindex SET TABLESPACE regress_tblspace; +ALTER INDEX testschema.part_a_idx SET TABLESPACE pg_global; +ERROR: only shared relations can be placed in pg_global tablespace +ALTER INDEX testschema.part_a_idx SET TABLESPACE pg_default; +ALTER INDEX testschema.part_a_idx SET TABLESPACE regress_tblspace; +INSERT INTO testschema.atable VALUES(3); -- ok +INSERT INTO testschema.atable VALUES(1); -- fail (checks index) +ERROR: duplicate key value violates unique constraint "anindex" +DETAIL: Key (column1)=(1) already exists. +SELECT COUNT(*) FROM testschema.atable; -- checks heap + count +------- + 3 +(1 row) + +-- let's try moving a materialized view from one place to another +CREATE MATERIALIZED VIEW testschema.amv AS SELECT * FROM testschema.atable; +ALTER MATERIALIZED VIEW testschema.amv SET TABLESPACE regress_tblspace; +REFRESH MATERIALIZED VIEW testschema.amv; +SELECT COUNT(*) FROM testschema.amv; + count +------- + 3 +(1 row) + +-- Will fail with bad path +CREATE TABLESPACE regress_badspace LOCATION '/no/such/location'; +ERROR: directory "/no/such/location" does not exist +-- No such tablespace +CREATE TABLE bar (i int) TABLESPACE regress_nosuchspace; +ERROR: tablespace "regress_nosuchspace" does not exist +-- Fail, in use for some partitioned object +DROP TABLESPACE regress_tblspace; +ERROR: tablespace "regress_tblspace" cannot be dropped because some objects depend on it +DETAIL: tablespace for index testschema.part_a_idx +ALTER INDEX testschema.part_a_idx SET TABLESPACE pg_default; +-- Fail, not empty +DROP TABLESPACE regress_tblspace; +CREATE ROLE regress_tablespace_user1 login; +CREATE ROLE regress_tablespace_user2 login; +GRANT USAGE ON SCHEMA testschema TO regress_tablespace_user2; +ALTER TABLESPACE regress_tblspace OWNER TO regress_tablespace_user1; +ERROR: tablespace "regress_tblspace" does not exist +CREATE TABLE testschema.tablespace_acl (c int); +-- new owner lacks permission to create this index from scratch +CREATE INDEX k ON testschema.tablespace_acl (c) TABLESPACE regress_tblspace; +ERROR: tablespace "regress_tblspace" does not exist +ALTER TABLE testschema.tablespace_acl OWNER TO regress_tablespace_user2; +SET SESSION ROLE regress_tablespace_user2; +CREATE TABLE tablespace_table (i int) TABLESPACE regress_tblspace; -- fail +ERROR: tablespace "regress_tblspace" does not exist +ALTER TABLE testschema.tablespace_acl ALTER c TYPE bigint; +REINDEX (TABLESPACE regress_tblspace) TABLE tablespace_table; -- fail +ERROR: tablespace "regress_tblspace" does not exist +REINDEX (TABLESPACE regress_tblspace, CONCURRENTLY) TABLE tablespace_table; -- fail +ERROR: tablespace "regress_tblspace" does not exist +RESET ROLE; +ALTER TABLESPACE regress_tblspace RENAME TO regress_tblspace_renamed; +ERROR: tablespace "regress_tblspace" does not exist +ALTER TABLE ALL IN TABLESPACE regress_tblspace_renamed SET TABLESPACE pg_default; +ERROR: tablespace 
"regress_tblspace_renamed" does not exist +ALTER INDEX ALL IN TABLESPACE regress_tblspace_renamed SET TABLESPACE pg_default; +ERROR: tablespace "regress_tblspace_renamed" does not exist +ALTER MATERIALIZED VIEW ALL IN TABLESPACE regress_tblspace_renamed SET TABLESPACE pg_default; +ERROR: tablespace "regress_tblspace_renamed" does not exist +-- Should show notice that nothing was done +ALTER TABLE ALL IN TABLESPACE regress_tblspace_renamed SET TABLESPACE pg_default; +ERROR: tablespace "regress_tblspace_renamed" does not exist +ALTER MATERIALIZED VIEW ALL IN TABLESPACE regress_tblspace_renamed SET TABLESPACE pg_default; +ERROR: tablespace "regress_tblspace_renamed" does not exist +-- Should succeed +DROP TABLESPACE regress_tblspace_renamed; +ERROR: tablespace "regress_tblspace_renamed" does not exist +DROP SCHEMA testschema CASCADE; +NOTICE: drop cascades to 7 other objects +DETAIL: drop cascades to table testschema.foo +drop cascades to table testschema.asselect +drop cascades to table testschema.asexecute +drop cascades to table testschema.part +drop cascades to table testschema.atable +drop cascades to materialized view testschema.amv +drop cascades to table testschema.tablespace_acl +DROP ROLE regress_tablespace_user1; +DROP ROLE regress_tablespace_user2; diff --git a/src/test/regress/sql/spgist.sql b/src/test/regress/sql/spgist.sql index 4828ede68c3..9d6394516a2 100644 --- a/src/test/regress/sql/spgist.sql +++ b/src/test/regress/sql/spgist.sql @@ -89,3 +89,6 @@ select box(point(i,j)) from generate_series(1,100,5) i, generate_series(1,10,5) j; -- leave this table around, to help in testing dump/restore +-- NEON: In Neon unlogged tables are wiped away on node restart +-- so drop the table to keep Neon tests clean. +drop table spgist_unlogged_tbl; diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list index 826d5d6e083..c10c4f5e666 100644 --- a/src/tools/pgindent/typedefs.list +++ b/src/tools/pgindent/typedefs.list @@ -2252,6 +2252,8 @@ RelMapFile RelMapping RelOptInfo RelOptKind +RelSizeEntry +RelTag RelToCheck RelToCluster RelabelType @@ -2942,6 +2944,8 @@ WaitPMResult WalCloseMethod WalCompression WalLevel +Safekeeper +WalMessage WalRcvData WalRcvExecResult WalRcvExecStatus @@ -3053,6 +3057,17 @@ XmlTableBuilderData YYLTYPE YYSTYPE YY_BUFFER_STATE +ZenithErrorResponse +ZenithExistsRequest +ZenithExistsResponse +ZenithGetPageRequest +ZenithGetPageResponse +ZenithMessage +ZenithMessageTag +ZenithNblocksRequest +ZenithNblocksResponse +ZenithRequest +ZenithResponse ZSTD_CCtx ZSTD_DCtx ZSTD_inBuffer